From 2a02072873b0b92bfbc36ac8325e0530e16474da Mon Sep 17 00:00:00 2001
From: Justin M Wozniak
Date: Fri, 13 Mar 2020 14:59:13 -0500
Subject: [PATCH 001/601] Improve install-candle.R

---
 workflows/common/R/install-candle.R | 32 ++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/workflows/common/R/install-candle.R b/workflows/common/R/install-candle.R
index 0277c457..16a7c691 100644
--- a/workflows/common/R/install-candle.R
+++ b/workflows/common/R/install-candle.R
@@ -4,19 +4,27 @@
 # Run this via install-candle.sh
 # Installs all R packages needed for Supervisor workflows
 
+# Installation settings:
 r <- getOption("repos")
-r["CRAN"] <- "http://cran.cnr.berkeley.edu/"
+# r["CRAN"] <- "http://cran.cnr.berkeley.edu/"
+r["CRAN"] <- "http://cran.wustl.edu/"
 options(repos = r)
+NCPUS = 8
 
-install.packages("RInside", Ncpus=4)
-
-# Force Plotly 4.5.6 - not latest! Do not want shiny/httpuv, it does not work on Cooley!
+# Force Plotly 4.5.6 - not latest!
 install.packages("https://cran.r-project.org/src/contrib/Archive/plotly/plotly_4.5.6.tar.gz")
-install.packages("smoof", Ncpus=4)
-# mlrMBO may need APT packages libxml2-dev libssl-dev
-install.packages("mlrMBO", Ncpus=4)
-install.packages("rgenoud", Ncpus=4)
-install.packages("DiceKriging", Ncpus=4)
-install.packages("randomForest", Ncpus=4)
-install.packages("jsonlite", Ncpus=4)
-install.packages("parallelMap", Ncpus=4)
+
+PKGS = list("DiceKriging",
+            "jsonlite",
+            # mlrMBO may need APT packages libxml2-dev libssl-dev
+            "mlrMBO",
+            "parallelMap",
+            "randomForest",
+            "rgenoud",
+            "RInside",
+            "smoof"
+           )
+
+for (pkg in PKGS) {
+  install.packages(pkg, Ncpus=NCPUS)
+}

From 73c33291f87d981f74105b89c247b804c2927966 Mon Sep 17 00:00:00 2001
From: Justin M Wozniak
Date: Fri, 13 Mar 2020 15:17:50 -0500
Subject: [PATCH 002/601] Update header

---
 workflows/common/sh/env-theta.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflows/common/sh/env-theta.sh b/workflows/common/sh/env-theta.sh
index c2362f1d..7ae55220 100644
--- a/workflows/common/sh/env-theta.sh
+++ b/workflows/common/sh/env-theta.sh
@@ -1,6 +1,6 @@
-# LANGS Theta
-# Language settings for Theta (Swift, Python, R, Tcl, etc.)
+# ENV THETA
+# Environment settings for Theta (Swift, Python, R, Tcl, etc.)
# TCL=/home/wozniak/Public/sfw/theta/tcl-8.6.1 # export R=/home/wozniak/Public/sfw/theta/R-3.4.0/lib64/R From 99af951d4c67183915b1fc5c222c64b3bc933175 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 15 May 2020 17:15:40 -0500 Subject: [PATCH 003/601] o Fix IMPL for site mcs --- workflows/common/sh/env-mcs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index d150d24e..7e8e0df2 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -15,7 +15,7 @@ PYTHONPATH+=$WORKFLOWS_ROOT/common/python: export PATH=$SWIFT_T/turbine/bin:$SWIFT_T/stc/bin:$PATH echo $PATH -SWIFT_IMPL="app" +SWIFT_IMPL="py" # EMEWS Queues for R EQR=$WORKFLOWS_ROOT/common/ext/EQ-R From 2a42ac366aaf4280a63c8f50b81dff88abfbc6b2 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 1 Jun 2020 11:31:42 -0500 Subject: [PATCH 004/601] Add more logging --- workflows/cp-leaveout/scripts/extract-node-info.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 023c53f4..63f601ce 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -21,7 +21,7 @@ log_list = args.directory + "/log-list.txt" node_pkl = args.directory + "/node-info.pkl" -logging.basicConfig(level=logging.WARN, format="%(message)s") +logging.basicConfig(level=logging.INFO, format="%(message)s") def read_log_filenames(log_list): result = [] @@ -46,10 +46,15 @@ def parse_logs(log_files): nodes = {} logging.warning("Opening %i log files..." % len(log_files)) try: + total = len(log_files) + index = 0 for log_file in log_files: - logging.info("Opening: " + log_file) + progress = "%4i/%4i (%2.f%%)" % \ + (index, total, 100.0*index/total) + logging.info("Opening: %12s %s" % (progress, log_file)) with open(log_file) as fp: parse_log(fp, nodes) + index += 1 except IOError as e: abort(e, os.EX_IOERR, "Could not read: " + log_file) return nodes From c8e6cef65f7dff02c1b55df567c20c3bb8ac9a93 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 1 Jun 2020 11:31:59 -0500 Subject: [PATCH 005/601] Add comments --- workflows/cp-leaveout/scripts/extract-node-info.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.sh b/workflows/cp-leaveout/scripts/extract-node-info.sh index d7b75eb0..44f5e089 100755 --- a/workflows/cp-leaveout/scripts/extract-node-info.sh +++ b/workflows/cp-leaveout/scripts/extract-node-info.sh @@ -3,7 +3,8 @@ set -eu # EXTRACT NODE INFO SH # Extract all data from all logs in given experiment directory -# Provide an experiment directory +# Provide an experiment directory DIR +# Creates $DIR/node-info.pkl THIS=$( readlink --canonicalize $( dirname $0 ) ) From 518f5f1c0d54259c072557ad863a1b2ed5996b1b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 1 Jun 2020 11:32:33 -0500 Subject: [PATCH 006/601] More work on outlier detection --- .../scripts/find-loss-increases.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/workflows/cp-leaveout/scripts/find-loss-increases.py b/workflows/cp-leaveout/scripts/find-loss-increases.py index 08e2ef80..a31bf251 100644 --- a/workflows/cp-leaveout/scripts/find-loss-increases.py +++ b/workflows/cp-leaveout/scripts/find-loss-increases.py @@ -40,6 +40,8 @@ except IOError as e: fail(e, os.EX_IOERR, 'Could not read: ' + node_pkl) 
+print("total nodes: %i" % len(data)) + # Artificial nodes for comparison: node_worst = Node("WORST") node_worst.val_loss = 0 @@ -49,16 +51,23 @@ if args.stage != STAGE_ANY: print("STAGE: %i" % args.stage) +leaves = 0 # stage 5 Nodes + # List of Nodes where val_loss increased: increases = [] # Total Node count: total = 0 for node_id in data.keys(): + print("node: " + node_id) parent_id = node_id[0:-2] # '1.2.3' -> '1.2' if len(parent_id) == 1: # stage=1 continue + if parent_id not in data: + print("parent not found.") + continue current = data[node_id] parent = data[parent_id] + if current.stage == 5: leaves += 1 if not (args.stage == STAGE_ANY or args.stage == current.stage): continue current.val_loss_delta = current.val_loss - parent.val_loss @@ -68,6 +77,8 @@ if current.val_loss < node_best.val_loss: node_best = current total += 1 +print("leaves: %i" % leaves) + if total == 0: fail('No matching Nodes found!') fraction = 100.0 * len(increases) / total @@ -107,9 +118,11 @@ def print_delta(prefix, node): values_increase = [] values_val_loss = [] + for node in increases: values_increase.append(node.get_val_loss_delta()) values_val_loss.append(node.val_loss) + avg_increase = avg(values_increase) avg_val_loss = avg(values_val_loss) print('avg increase: %f' % avg_increase) @@ -118,3 +131,35 @@ def print_delta(prefix, node): file_increase_deltas = "increase-deltas-%s.data" % args.token append(file_increase_deltas, "%i %5.1f" % (args.stage, delta_ratio)) + +outliers_file = "outliers-%s.data" % args.token +print("avg_increase", str(avg_increase)) +print("avg_val_loss", str(avg_val_loss)) + +print("%-2s %-12s %-8s %-8s %-8s %-8s" % \ + ("", "node", "val_loss", "parent", "delta", "ratio")) + +increases.sort(key=Node.get_val_loss_delta, reverse=True) +ratios = [] +index = 1 +for node in increases: + parent = data[node.parent()] + ratio = node.get_val_loss_delta() / parent.val_loss + print("%2i %-12s %0.6f %0.6f %0.6f %0.6f" % + (index, node.id, node.val_loss, parent.val_loss, + node.get_val_loss_delta(), ratio)) + ratios.append(ratio) + index += 1 +ratios.sort() + +with open(outliers_file, "w") as fp: + i = 0 + for ratio in ratios: + fp.write("%4i %0.7f\n" % (i, ratio)) + i += 1 + +# with open(outliers_file, "w") as fp: +# i = 0 +# for ratio in ratios: +# fp.write("%4i %0.7f\n" % (i, ratio)) +# i += 1 From e9b70a7668b39787712120e4d1a7f0336786f685 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 1 Jun 2020 11:32:59 -0500 Subject: [PATCH 007/601] Fix to best val_loss --- workflows/cp-leaveout/scripts/workflow-stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/workflow-stats.py b/workflows/cp-leaveout/scripts/workflow-stats.py index 731d6526..b86b7e83 100644 --- a/workflows/cp-leaveout/scripts/workflow-stats.py +++ b/workflows/cp-leaveout/scripts/workflow-stats.py @@ -115,7 +115,8 @@ def string_percentile(self, percentile): stops .add(node.stage, node.stopped_early) losses.add(node.stage, node.val_loss) times.add(node.stage, node.total_time(data)) - if node.val_loss < best_val_loss.val_loss: best_val_loss = node + if node.stage == 5 and node.val_loss < best_val_loss.val_loss: + best_val_loss = node tm_m = tm_s / 60 tm_h = tm_m / 60 From 02db432d888c2a804542e7407d3e661b3f9284dd Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 1 Jun 2020 11:46:52 -0500 Subject: [PATCH 008/601] Set ignores --- workflows/cp-leaveout/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/cp-leaveout/.gitignore 
b/workflows/cp-leaveout/.gitignore index 61e34c95..f4a85539 100644 --- a/workflows/cp-leaveout/.gitignore +++ b/workflows/cp-leaveout/.gitignore @@ -1,3 +1,4 @@ experiments turbine-output +*.data *.pkl From 0371e91f9565a545f248fc2d9002531be2406711 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 1 Jun 2020 11:47:38 -0500 Subject: [PATCH 009/601] Basic test for dunedin --- workflows/cp-leaveout/test/test-dunedin.sh | 89 ++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100755 workflows/cp-leaveout/test/test-dunedin.sh diff --git a/workflows/cp-leaveout/test/test-dunedin.sh b/workflows/cp-leaveout/test/test-dunedin.sh new file mode 100755 index 00000000..d710da70 --- /dev/null +++ b/workflows/cp-leaveout/test/test-dunedin.sh @@ -0,0 +1,89 @@ +#!/bin/bash +set -eu + +# CP LEAVEOUT TEST DUNEDIN + +usage() +{ + echo "Usage: test SITE EXPID WORKFLOW_ARGS" +} + +if (( ${#} == 0 )) +then + usage + exit 1 +fi + +SITE=$1 +RUN_DIR=$2 +shift 2 +WORKFLOW_ARGS=$* + +export MODEL_NAME=uno # nt3 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# Data files +# PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json + +PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json +DATAFRAME_CSV=/usb1/wozniak/CANDLE-Benchmarks-Data/top21_dataframe_8x8.csv +BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno + +# SCRATCH=/gpfs/alpine/med106/scratch/wozniak +# CANDLE_DATA=$SCRATCH/CANDLE-Data +# PLAN_JSON=$CANDLE_DATA/plangen_cell8-p2_drug8-p2.json +# DATAFRAME_CSV=$CANDLE_DATA/top21_dataframe_8x8.csv +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +for f in $DATAFRAME_CSV $PLAN_JSON +do + if ! [[ -f $f ]] + then + echo "$0: does not exist: $f" + exit 1 + fi +done + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME $WORKFLOW_ARGS \ + --plan_json=$PLAN_JSON \ + --dataframe_csv=$DATAFRAME_CSV \ + --benchmark_data=$BENCHMARK_DATA + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +OUTPUT=turbine-output/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +# Wait for job +queue_wait + +SCRIPT=$( basename $0 .sh ) +check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID +check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From 878e800ae8b9f0222b2f9d733809e0fb25d98039 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 1 Jun 2020 11:48:07 -0500 Subject: [PATCH 010/601] Minor formatting --- workflows/cp-leaveout/scripts/README.adoc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/README.adoc b/workflows/cp-leaveout/scripts/README.adoc index 94580170..3504d5b7 100644 --- a/workflows/cp-leaveout/scripts/README.adoc +++ b/workflows/cp-leaveout/scripts/README.adoc @@ -85,8 +85,11 @@ Compile workflow statistics === Analysis for model.log files -These are not really supported for Summit runs because we are using in-memory Python, -but they could be easily fixed. 
Also, they run against the model.logs and not the Pickle, so they are slow. +These are not really supported for Summit runs +because we are using in-memory Python, +but they could be easily fixed. +Also, they run against the model.logs and not the Pickle, +so they are slow. ==== extract-stats.sh From e429e2fb4d46f04d4a3fac5a6cc49d03a9962e78 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 2 Jun 2020 17:13:49 -0500 Subject: [PATCH 011/601] Update header --- workflows/common/sh/env-mcs.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index 7e8e0df2..23e80218 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -1,7 +1,6 @@ -# LANGS LOCAL -# Language settings for any local machine like Ubuntu -# Assumes WORKFLOWS_ROOT, BENCHMARK_DIR, BENCHMARKS_ROOT are set +# ENV MCS +# Environment settings for ANL/MCS compute nodes export PY=/homes/jain/anaconda3/bin/python/ export R=/home/wozniak/Public/sfw/x86_64/R-3.4.1/lib/R/ From 7e270b963043a6076162bd346ab73cd6158986b3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 3 Jun 2020 10:01:11 -0500 Subject: [PATCH 012/601] Update Swift/T, EQ/R on MCS --- workflows/common/sh/env-mcs.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index 23e80218..de7080b3 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -5,7 +5,8 @@ export PY=/homes/jain/anaconda3/bin/python/ export R=/home/wozniak/Public/sfw/x86_64/R-3.4.1/lib/R/ # Modify to specify the location of SWIFT_T installation -export SWIFT_T=${SWIFT_T:-/homes/jain/install/swift-t/} +# export SWIFT_T=${SWIFT_T:-/homes/jain/install/swift-t/} +export SWIFT_T=$HOME/Public/sfw/x86_64/swift-t/2020-05-29 export LD_LIBRARY_PATH+=:$R/lib:$SWIFT_T/stc/lib:$SWIFT_T/turbine/lib/:$SWIFT_T/lb/lib:$SWIFT_T/cutils/lib # Python @@ -17,7 +18,8 @@ echo $PATH SWIFT_IMPL="py" # EMEWS Queues for R -EQR=$WORKFLOWS_ROOT/common/ext/EQ-R +# EQR=$WORKFLOWS_ROOT/common/ext/EQ-R +EQR=/home/wozniak/Public/sfw/x86_64/EQ-R EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py # Resident task workers and ranks if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ] From 96b4c0641a0952aa3ff0fbef2d6460e6da802f98 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 3 Jun 2020 10:02:55 -0500 Subject: [PATCH 013/601] Fix typo --- workflows/common/sh/env-mcs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index de7080b3..c5f53d52 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -37,6 +37,6 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}${LD_LIBRARY_PATH:+:} export LOCAL=1 export CRAY=0 -# Cf. utils.s +# Cf. 
utils.sh log_path LD_LIBRARY_PATH log_path PYTHONPATH From 13d7610edab479806707aaacfd980590c52e96c6 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 3 Jun 2020 13:26:27 -0500 Subject: [PATCH 014/601] Update header --- workflows/common/sh/sched-mcs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/sched-mcs.sh b/workflows/common/sh/sched-mcs.sh index 45339c66..0f355c8e 100755 --- a/workflows/common/sh/sched-mcs.sh +++ b/workflows/common/sh/sched-mcs.sh @@ -1,5 +1,5 @@ -# SCHED LOCAL +# SCHED MCS # Scheduler settings for Swift/MCS MACHINE="" From 30b8cb7fd2a88f38285c2f909cb15cff77dc3889 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 3 Jun 2020 13:57:30 -0500 Subject: [PATCH 015/601] Fix PYTHONPATH on MCS --- workflows/common/sh/utils.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 9b3f751a..6e7baa54 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -90,9 +90,13 @@ python_envs() RESULT=() if [[ ${PYTHONPATH:-} != "" ]] then - # We do not currently need this- + # We do not currently need this except on MCS: # Swift/T should grab PYTHONPATH automatically - : # RESULT+=( -e PYTHONPATH=$PYTHONPATH ) + if [[ ${SITE} == "mcs" ]] + then + # MCS discards PYTHONPATH in subshells + RESULT+=( -e PYTHONPATH=$PYTHONPATH ) + fi fi if [[ ${PYTHONHOME:-} != "" ]] then From 2f6438b55a39e6197edf5231c1407743687053f3 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 3 Jun 2020 15:28:21 -0500 Subject: [PATCH 016/601] o Fix path to absolute --- workflows/common/sh/env-mcs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index c5f53d52..d103cec8 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -6,7 +6,7 @@ export PY=/homes/jain/anaconda3/bin/python/ export R=/home/wozniak/Public/sfw/x86_64/R-3.4.1/lib/R/ # Modify to specify the location of SWIFT_T installation # export SWIFT_T=${SWIFT_T:-/homes/jain/install/swift-t/} -export SWIFT_T=$HOME/Public/sfw/x86_64/swift-t/2020-05-29 +export SWIFT_T=/homes/wozniak/Public/sfw/x86_64/swift-t/2020-05-29 export LD_LIBRARY_PATH+=:$R/lib:$SWIFT_T/stc/lib:$SWIFT_T/turbine/lib/:$SWIFT_T/lb/lib:$SWIFT_T/cutils/lib # Python From 1a1653485b24fe9190891e243702388c987cb56e Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 5 Jun 2020 16:25:24 -0500 Subject: [PATCH 017/601] o Update swift-t version --- workflows/common/sh/env-mcs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index d103cec8..9dbb7582 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -6,7 +6,7 @@ export PY=/homes/jain/anaconda3/bin/python/ export R=/home/wozniak/Public/sfw/x86_64/R-3.4.1/lib/R/ # Modify to specify the location of SWIFT_T installation # export SWIFT_T=${SWIFT_T:-/homes/jain/install/swift-t/} -export SWIFT_T=/homes/wozniak/Public/sfw/x86_64/swift-t/2020-05-29 +export SWIFT_T=/homes/wozniak/Public/sfw/x86_64/swift-t/2020-06-04 export LD_LIBRARY_PATH+=:$R/lib:$SWIFT_T/stc/lib:$SWIFT_T/turbine/lib/:$SWIFT_T/lb/lib:$SWIFT_T/cutils/lib # Python From 235dc2f4153ca04366e6914b2ee9d460019adb94 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 5 Jun 2020 16:40:44 -0500 Subject: [PATCH 018/601] o add more benchmarks --- workflows/mlrMBO/swift/workflow.sh | 3 +-- 1 file changed, 1 insertion(+), 2 
deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index cf02242a..620cee96 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -15,8 +15,7 @@ then fi BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd ) export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} -BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/P1B1 -# $BENCHMARKS_ROOT/Pilot1/Attn1:$BENCHMARKS_ROOT/Pilot1/TC1: +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Attn1:$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/examples/ADRP # :$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/Pilot3/P3B1:$BENCHMARKS_ROOT/Pilot3/P3B3:$BENCHMARKS_ROOT/Pilot3/P3B4 export BENCHMARK_TIMEOUT export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} From 7502aeca814a55983fefbb7376ec1d0f2c85b393 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Sat, 6 Jun 2020 00:57:27 -0500 Subject: [PATCH 019/601] o fix LD_LIBRARY for mcs --- workflows/common/sh/env-mcs.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index 9dbb7582..17ea0578 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -7,6 +7,7 @@ export R=/home/wozniak/Public/sfw/x86_64/R-3.4.1/lib/R/ # Modify to specify the location of SWIFT_T installation # export SWIFT_T=${SWIFT_T:-/homes/jain/install/swift-t/} export SWIFT_T=/homes/wozniak/Public/sfw/x86_64/swift-t/2020-06-04 +LD_LIBRARY_PATH=/homes/jain/anaconda3/lib/:/nfs2/jain/spack/opt/spack/linux-ubuntu14-x86_64/gcc-4.4.7/cuda-8.0.61-kxwh3jwkxjybyo3n3nnajezfyq3epo5y/lib:/usr/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH+=:$R/lib:$SWIFT_T/stc/lib:$SWIFT_T/turbine/lib/:$SWIFT_T/lb/lib:$SWIFT_T/cutils/lib # Python From 7e8a4229e2fe11981a89ae73779113af7713b5cb Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Sun, 7 Jun 2020 02:23:34 -0500 Subject: [PATCH 020/601] o Fix formating --- workflows/common/sh/env-mcs.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index 17ea0578..84c02f38 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -7,7 +7,8 @@ export R=/home/wozniak/Public/sfw/x86_64/R-3.4.1/lib/R/ # Modify to specify the location of SWIFT_T installation # export SWIFT_T=${SWIFT_T:-/homes/jain/install/swift-t/} export SWIFT_T=/homes/wozniak/Public/sfw/x86_64/swift-t/2020-06-04 -LD_LIBRARY_PATH=/homes/jain/anaconda3/lib/:/nfs2/jain/spack/opt/spack/linux-ubuntu14-x86_64/gcc-4.4.7/cuda-8.0.61-kxwh3jwkxjybyo3n3nnajezfyq3epo5y/lib:/usr/lib:$LD_LIBRARY_PATH + +export LD_LIBRARY_PATH+=/homes/jain/anaconda3/lib/:/nfs2/jain/spack/opt/spack/linux-ubuntu14-x86_64/gcc-4.4.7/cuda-8.0.61-kxwh3jwkxjybyo3n3nnajezfyq3epo5y/lib:/usr/lib export LD_LIBRARY_PATH+=:$R/lib:$SWIFT_T/stc/lib:$SWIFT_T/turbine/lib/:$SWIFT_T/lb/lib:$SWIFT_T/cutils/lib # Python From ce0e8c2c392256212b339a9878e470c6f2b07a98 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 9 Jun 2020 02:36:07 -0500 Subject: [PATCH 021/601] o Fix variables as per small run o python mpi4py is hard not trivial to get work on all machines: test: python -c "from mpi4py import MPI" --- workflows/async-search/test/cfg-prm-1.sh | 10 +++++----- workflows/async-search/test/cfg-sys-1.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/async-search/test/cfg-prm-1.sh 
b/workflows/async-search/test/cfg-prm-1.sh index 3f6b904a..e3e6b85a 100644 --- a/workflows/async-search/test/cfg-prm-1.sh +++ b/workflows/async-search/test/cfg-prm-1.sh @@ -3,11 +3,11 @@ # async-search settings # Note: INIT_SIZE needs to be larger than PROCS-2 for now. -INIT_SIZE=${INIT_SIZE:-500} -MAX_EVALS=${MAX_EVALS:-750} -NUM_BUFFER=${NUM_BUFFER:-250} -MAX_THRESHOLD=${MAX_THRESHOLD:-5} -N_JOBS=${N_JOBS:-48} +INIT_SIZE=${INIT_SIZE:-4} +MAX_EVALS=${MAX_EVALS:-20} +NUM_BUFFER=${NUM_BUFFER:-2} +MAX_THRESHOLD=${MAX_THRESHOLD:-1} +N_JOBS=${N_JOBS:-1} #INIT_SIZE=${INIT_SIZE:-300} #MAX_EVALS=${MAX_EVALS:-550} diff --git a/workflows/async-search/test/cfg-sys-1.sh b/workflows/async-search/test/cfg-sys-1.sh index dc10b22d..ad9a694b 100644 --- a/workflows/async-search/test/cfg-sys-1.sh +++ b/workflows/async-search/test/cfg-sys-1.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-256} +export PROCS=${PROCS:-4} # MPI processes per node # Cori has 32 cores per node, 128GB per node From 792ef752dfffb0e71a35435a62c15e209193f7b6 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 9 Jun 2020 11:27:51 -0500 Subject: [PATCH 022/601] New simple app python test --- scratch/swift-tests/app-py.swift | 6 ++++++ scratch/swift-tests/fake-model.py | 4 ++++ scratch/swift-tests/fake-model.sh | 7 +++++++ 3 files changed, 17 insertions(+) create mode 100644 scratch/swift-tests/app-py.swift create mode 100644 scratch/swift-tests/fake-model.py create mode 100755 scratch/swift-tests/fake-model.sh diff --git a/scratch/swift-tests/app-py.swift b/scratch/swift-tests/app-py.swift new file mode 100644 index 00000000..4b4b21a8 --- /dev/null +++ b/scratch/swift-tests/app-py.swift @@ -0,0 +1,6 @@ +app p() +{ + "./fake-model.sh" "hi" "bye"; +} + +p(); diff --git a/scratch/swift-tests/fake-model.py b/scratch/swift-tests/fake-model.py new file mode 100644 index 00000000..57726f66 --- /dev/null +++ b/scratch/swift-tests/fake-model.py @@ -0,0 +1,4 @@ + +# import something ? 
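Note on the mpi4py check mentioned in the async-search commit message above ("test: python -c "from mpi4py import MPI""): that one-liner only confirms the module imports. A slightly fuller sanity check is sketched below; it is illustrative only, not part of any patch in this series, and the file name check_mpi4py.py is made up. It also exercises the communicator under a parallel launch, which the import alone does not.

    # check_mpi4py.py -- illustrative sanity check; not part of these patches.
    # Hypothetical usage:  mpirun -n 2 python check_mpi4py.py
    # A failure at the import (or at MPI initialization) usually means mpi4py
    # is missing or was built against a different MPI than the one on PATH.
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    print("mpi4py OK: rank %d of %d" % (comm.Get_rank(), comm.Get_size()))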
+ +print("fake-model.py: python works") diff --git a/scratch/swift-tests/fake-model.sh b/scratch/swift-tests/fake-model.sh new file mode 100755 index 00000000..3e49a186 --- /dev/null +++ b/scratch/swift-tests/fake-model.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# source langs-app-X.sh + +echo "fake-model.sh: PWD=$PWD" + +python fake-model.py From 735e5494a4355c4d20d2480ba5a5f31b10ddfcc1 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 9 Jun 2020 15:31:14 -0400 Subject: [PATCH 023/601] new small set of values for GA --- workflows/GA/data/adrp_param_space_ga.json | 52 +++++++++++----------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/workflows/GA/data/adrp_param_space_ga.json b/workflows/GA/data/adrp_param_space_ga.json index 10aa92fd..144b462d 100644 --- a/workflows/GA/data/adrp_param_space_ga.json +++ b/workflows/GA/data/adrp_param_space_ga.json @@ -1,51 +1,49 @@ -[ +i[ { "name": "dense", "type": "categorical", "element_type": "string", "values": [ - "1000", - "1000 1000", - "1000 1000 1000", - "1000 1000 1000 1000", - "1000 1000 1000 1000 1000" + "500 250 125 60 30", + "250 125 60 30", + "400 150 75 30", + "300 175 90 45 20", + "400 200 100 50 25", + "350 170 85 40 20" ] }, - - { - "name": "dense_feature_layers", - "type": "categorical", - "element_type": "string", - "values": ["250 125 60 30", "500 250 125 60 30", "125 60 30"] - }, - { "name": "batch_size", "type": "ordered", "element_type": "int", - "values": [32, 64], + "values": [16, 32, 64], "sigma": 1 }, - { "name": "optimizer", "type": "categorical", "element_type": "string", - "values": ["adam", "sgd", "rmsprop"] - }, - - { - "name": "learning_rate", - "type": "float", - "lower": 0.00001, - "upper": 0.001, - "sigma": 0.0049995 + "values": ["adam", "sgd"] }, { "name": "epochs", "type": "int", - "lower": 2, - "upper": 2, + "lower": 90, + "upper": 90, "sigma": 20 + }, + { + "name": "dropout", + "type": "float", + "lower": 0.05, + "upper": 0.2, + "sigma": 0.045 + }, + { + "name": "activation", + "type": "categorical", + "element_type": "string", + "values": ["elu", "relu", "linear"] } ] + From d0634ce770be9177e31094661b99f5cd47afd9dd Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 9 Jun 2020 16:25:05 -0400 Subject: [PATCH 024/601] o Add files for summit runs --- workflows/GA/test/cfg-prm-summit.sh | 42 +++++++++++++++ workflows/GA/test/cfg-sys-summit.sh | 44 ++++++++++++++++ workflows/GA/test/test-summit.sh | 67 ++++++++++++++++++++++++ workflows/mlrMBO/data/adrp_nightly.R | 8 +-- workflows/mlrMBO/test/cfg-prm-nightly.sh | 2 +- workflows/mlrMBO/test/cfg-prm-summit.sh | 14 +++-- workflows/mlrMBO/test/cfg-sys-nightly.sh | 4 +- workflows/mlrMBO/test/cfg-sys-summit.sh | 12 ++--- 8 files changed, 174 insertions(+), 19 deletions(-) create mode 100644 workflows/GA/test/cfg-prm-summit.sh create mode 100644 workflows/GA/test/cfg-sys-summit.sh create mode 100755 workflows/GA/test/test-summit.sh diff --git a/workflows/GA/test/cfg-prm-summit.sh b/workflows/GA/test/cfg-prm-summit.sh new file mode 100644 index 00000000..62db490d --- /dev/null +++ b/workflows/GA/test/cfg-prm-summit.sh @@ -0,0 +1,42 @@ +# CFG PRM 1 + +# GA settings + +SEED=${SEED:-1} +# Total iterations +NUM_ITERATIONS=${NUM_ITERATIONS:-1} +# Size of GA population (i.e. the number of parameter sets to evaluate) +POPULATION_SIZE=${POPULATION_SIZE:-44} +# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See +# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms +# for more info. 
+GA_STRATEGY=${STRATEGY:-simple} + +INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} + +# TODO: move the following code to a utility library- +# this is a configuration file +# Set the ga parameter space definition file for running +if [ "$MODEL_NAME" = "combo" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_param_space_ga.json} +elif [ "$MODEL_NAME" = "p1b1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json} +elif [ "$MODEL_NAME" = "adrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/adrp_param_space_ga.json} +elif [ "$MODEL_NAME" = "nt3" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json} +elif [ "$MODEL_NAME" = "tc1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json} +# TODO: Uncomment when parameter files are available +# elif [ "$MODEL_NAME" = "p1b3" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p1b2" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p2b1" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_param_space_ga.json} +elif [ "$PARAM_SET_FILE" != "" ]; then + PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} +else + echo "Invalid model-" $MODEL_NAME + exit 1 +fi diff --git a/workflows/GA/test/cfg-sys-summit.sh b/workflows/GA/test/cfg-sys-summit.sh new file mode 100644 index 00000000..1a946208 --- /dev/null +++ b/workflows/GA/test/cfg-sys-summit.sh @@ -0,0 +1,44 @@ +# +# COMBO CFG SYS 1 + +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEMS +# The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs +export PROCS=${PROCS:-46} + +# MPI processes per node +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} + +export WALLTIME=${WALLTIME:-06:00:00} + +#export PROJECT=Candle_ECP + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. +# If set to -1 there is no timeout. +# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g, my_script.py) +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . 
+export IGNORE_ERRORS=0 + +# if the deap python package is not installed with swift-t's embedded python +# it may be ncessary to include its location in the PYTHONPATH +# export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages + +# for running locally, edit as necessary +# export PYTHONHOME=$HOME/anaconda3 +# export PYTHON=python3.6 +# export SWIFT_T=$HOME/sfw/swift-t-4c8f0afd diff --git a/workflows/GA/test/test-summit.sh b/workflows/GA/test/test-summit.sh new file mode 100755 index 00000000..e1fefa9a --- /dev/null +++ b/workflows/GA/test/test-summit.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# GA TEST 1 + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-summit.sh +export CFG_PRM=$THIS/cfg-prm-summit.sh + +# The python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + + +# Wait for job +queue_wait + +echo "TO: $TURBINE_OUTPUT" + +cp $0 $TURBINE_OUTPUT +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" diff --git a/workflows/mlrMBO/data/adrp_nightly.R b/workflows/mlrMBO/data/adrp_nightly.R index 9ffc4f9d..e0cc14f4 100644 --- a/workflows/mlrMBO/data/adrp_nightly.R +++ b/workflows/mlrMBO/data/adrp_nightly.R @@ -1,7 +1,9 @@ param.set <- makeParamSet( - makeIntegerParam("epochs", lower = 2, upper = 2), + makeIntegerParam("epochs", lower = 90, upper = 90), makeNumericParam("dropout", lower = 0.1, upper = 0.2), - makeNumericParam("learning_rate", lower = 0.00001, upper = 0.001) - ## makeDiscreteParam("conv", values = c("32 20 16 32 10 1")) + makeNumericParam("learning_rate", lower = 0.00001, upper = 0.001), + makeDiscreteParam("activation", values = c("elu", "linear", "relu", "sigmoid", "tanh")), + makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop")), + makeDiscreteParam("dense", values = c("500 250 125 60 30", "250 125 60 30", "400 150 75 30","300 175 90 45 20","400 200 100 50 25", "350 170 85 40 20")) ) diff --git a/workflows/mlrMBO/test/cfg-prm-nightly.sh b/workflows/mlrMBO/test/cfg-prm-nightly.sh index 02b79fe0..1aef4fec 100644 --- a/workflows/mlrMBO/test/cfg-prm-nightly.sh +++ b/workflows/mlrMBO/test/cfg-prm-nightly.sh @@ -3,7 +3,7 @@ # mlrMBO settings # Total iterations -PROPOSE_POINTS=${PROPOSE_POINTS:-25} +PROPOSE_POINTS=${PROPOSE_POINTS:-15} MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} MAX_ITERATIONS=${MAX_ITERATIONS:-3} MAX_BUDGET=${MAX_BUDGET:-180} diff --git a/workflows/mlrMBO/test/cfg-prm-summit.sh b/workflows/mlrMBO/test/cfg-prm-summit.sh index 31b7b262..36d086c4 100644 --- 
a/workflows/mlrMBO/test/cfg-prm-summit.sh +++ b/workflows/mlrMBO/test/cfg-prm-summit.sh @@ -1,19 +1,23 @@ -# CFG PRM SUMMIT +# CFG PRM 1 # mlrMBO settings # Total iterations -PROPOSE_POINTS=${PROPOSE_POINTS:-64} +PROPOSE_POINTS=${PROPOSE_POINTS:-44} MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} -MAX_ITERATIONS=${MAX_ITERATIONS:-3} -MAX_BUDGET=${MAX_BUDGET:-280} -DESIGN_SIZE=${DESIGN_SIZE:-64} +MAX_ITERATIONS=${MAX_ITERATIONS:-2} +MAX_BUDGET=${MAX_BUDGET:-180} +DESIGN_SIZE=${DESIGN_SIZE:-44} # TODO: move the following code to a utility library- # this is a configuration file # Set the R data file for running if [ "$MODEL_NAME" = "combo" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_nightly.R} +elif [ "$MODEL_NAME" = "attn" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/attn_nightly.R} +elif [ "$MODEL_NAME" = "adrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/adrp_nightly.R} elif [ "$MODEL_NAME" = "p1b1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_nightly.R} elif [ "$MODEL_NAME" = "nt3" ]; then diff --git a/workflows/mlrMBO/test/cfg-sys-nightly.sh b/workflows/mlrMBO/test/cfg-sys-nightly.sh index 7933779d..f5e3b025 100644 --- a/workflows/mlrMBO/test/cfg-sys-nightly.sh +++ b/workflows/mlrMBO/test/cfg-sys-nightly.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-4} +export PROCS=${PROCS:-17} # MPI processes per node # Cori has 32 cores per node, 128GB per node @@ -14,7 +14,7 @@ export PPN=${PPN:-1} # export QUEUE=${QUEUE:-debug-flat-quad} # export WALLTIME=${WALLTIME:-00:10:00} -export WALLTIME=${WALLTIME:-10} +export WALLTIME=${WALLTIME:-120} #export PROJECT=Candle_ECP diff --git a/workflows/mlrMBO/test/cfg-sys-summit.sh b/workflows/mlrMBO/test/cfg-sys-summit.sh index 59f47ed2..2629eb23 100644 --- a/workflows/mlrMBO/test/cfg-sys-summit.sh +++ b/workflows/mlrMBO/test/cfg-sys-summit.sh @@ -1,20 +1,16 @@ -# MLRMBO CFG SYS SUMMIT + +# MLRMBO CFG SYS 1 # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-66} +export PROCS=${PROCS:-46} # MPI processes per node # Cori has 32 cores per node, 128GB per node export PPN=${PPN:-1} -export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" -export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" -# For Theta: -# export QUEUE=${QUEUE:-debug-flat-quad} -# export WALLTIME=${WALLTIME:-00:10:00} -export WALLTIME=${WALLTIME:-360} +export WALLTIME=${WALLTIME:-06:00:00} #export PROJECT=Candle_ECP From 4d61819ac1bb83bbefec3fde97e741a748ff24a9 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 10 Jun 2020 19:13:49 -0500 Subject: [PATCH 025/601] o Fix site local and o/p for GA and async-search --- workflows/GA/swift/workflow.sh | 46 ++++++++++++++++++++++-- workflows/GA/test/test-1.sh | 12 +++---- workflows/async-search/swift/workflow.sh | 42 +++++++++++++++++++++- workflows/common/sh/env-local.sh | 2 +- workflows/common/sh/langs-app-local.sh | 11 +++--- 5 files changed, 94 insertions(+), 19 deletions(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index ce414d5b..85985587 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -108,9 +108,36 @@ then echo "Turbine will wait for job completion." 
fi +# Use for Summit (LSF needs two %) +if [[ ${SITE:-} == "summit" ]] +then + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +else + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" +fi + +mkdir -pv $TURBINE_OUTPUT/out + +#swift-t -n $PROCS \ +# -o $TURBINE_OUTPUT/workflow.tic \ +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. This will just be for + # warnings or unusual messages + STDOUT="" +fi + # echo's anything following this to standard out -swift-t -n $PROCS \ +swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p -I $EQPY -r $EQPY \ -I $OBJ_DIR \ @@ -131,5 +158,18 @@ swift-t -n $PROCS \ -e SH_TIMEOUT \ -e IGNORE_ERRORS \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} - + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" + exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT + +# Andrew: Needed this so that script to monitor job worked properly (queue_wait... function in utils.sh?) +echo $TURBINE_OUTPUT > turbine-directory.txt + diff --git a/workflows/GA/test/test-1.sh b/workflows/GA/test/test-1.sh index 002b414c..207f0cc2 100755 --- a/workflows/GA/test/test-1.sh +++ b/workflows/GA/test/test-1.sh @@ -50,18 +50,14 @@ fi # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME - -# Wait for job -queue_wait - -echo "TO: $TURBINE_OUTPUT" - -cp $0 $TURBINE_OUTPUT # Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT OUTPUT=$TURBINE_OUTPUT/output.txt WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) SCRIPT=$( basename $0 .sh ) -check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID echo "$SCRIPT: SUCCESS" + diff --git a/workflows/async-search/swift/workflow.sh b/workflows/async-search/swift/workflow.sh index 92187eaa..4f172182 100755 --- a/workflows/async-search/swift/workflow.sh +++ b/workflows/async-search/swift/workflow.sh @@ -131,6 +131,33 @@ then echo "Turbine will wait for job completion." fi +# Use for Summit (LSF needs two %) +if [[ ${SITE:-} == "summit" ]] +then + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +else + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" +fi + +mkdir -pv $TURBINE_OUTPUT/out + +#swift-t -n $PROCS \ +# -o $TURBINE_OUTPUT/workflow.tic \ +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. 
This will just be for + # warnings or unusual messages + STDOUT="" +fi + #export TURBINE_LAUNCH_OPTIONS="-cc none" swift-t -l -n $PROCS \ @@ -155,4 +182,17 @@ swift-t -l -n $PROCS \ -e MPICH_MAX_THREAD_SAFETY=$MPICH_MAX_THREAD_SAFETY \ -e TURBINE_MPI_THREAD=$TURBINE_MPI_THREAD \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" + exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT + +# Andrew: Needed this so that script to monitor job worked properly (queue_wait... function in utils.sh?) +echo $TURBINE_OUTPUT > turbine-directory.txt \ No newline at end of file diff --git a/workflows/common/sh/env-local.sh b/workflows/common/sh/env-local.sh index c75b27d8..7646d78d 100644 --- a/workflows/common/sh/env-local.sh +++ b/workflows/common/sh/env-local.sh @@ -4,7 +4,7 @@ # Assumes WORKFLOWS_ROOT, BENCHMARK_DIR, BENCHMARKS_ROOT are set # Modify to specify the location of SWIFT_T installation export SWIFT_T=${SWIFT_T:-$HOME/install/swift-t/} -export LD_LIBRARY_PATH+=$SWIFT_T/turbine/lib:$SWIFT_T/lb/lib:$SWIFT_T/cutils/lib +export LD_LIBRARY_PATH+=:$SWIFT_T/turbine/lib:$SWIFT_T/lb/lib:$SWIFT_T/cutils/lib:$SWIFT_T/stc/lib: # Python export PYTHONPATH=${PYTHONPATH:-}${PYTHONPATH:+:} diff --git a/workflows/common/sh/langs-app-local.sh b/workflows/common/sh/langs-app-local.sh index c98eaaf3..b4cac56f 100644 --- a/workflows/common/sh/langs-app-local.sh +++ b/workflows/common/sh/langs-app-local.sh @@ -1,18 +1,17 @@ # LANGS APP LOCAL -PYTHONHOME=${PYTHONHOME:-"/usr/"} -export PYTHONHOME -PYTHON=${PYTHON:-python} -export LD_LIBRARY_PATH="$PYTHONHOME/lib":$LD_LIBRARY_PATH +export PYTHONHOME="$HOME/anaconda3" +PYTHON="$PYTHONHOME/bin/" +export LD_LIBRARY_PATH="$PYTHONHOME/lib" export PATH="$PYTHONHOME/bin:$PATH" COMMON_DIR=$EMEWS_PROJECT_ROOT/../common/python -PYTHONPATH+=":$PYTHONHOME/lib/$PYTHON:" +PYTHONPATH+=":$PYTHONHOME/lib/:" PYTHONPATH+=":$COMMON_DIR:" -PYTHONPATH+="$PYTHONHOME/lib/$PYTHON/dist-packages" APP_PYTHONPATH=${APP_PYTHONPATH:-} PYTHONPATH+=":$APP_PYTHONPATH" export PYTHONPATH + From 7185fb29ee6d534a9749a92b427d9437893c90c3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:33:03 -0500 Subject: [PATCH 026/601] Code for node reset on SQL LIKE --- workflows/cp-leaveout/db/reset-node.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/db/reset-node.sh b/workflows/cp-leaveout/db/reset-node.sh index 6271b10b..fe52381f 100755 --- a/workflows/cp-leaveout/db/reset-node.sh +++ b/workflows/cp-leaveout/db/reset-node.sh @@ -13,7 +13,11 @@ DB=$1 NODE=$2 sqlite3 $DB < 5 ); EOF -# update runhist SET status="RESET" where (length(subplan_id) > 5 ); + +# UPDATE runhist SET status="RESET" WHERE (subplan_id LIKE "${NODE}%") ; +# EOF + + From 4acc3b645793fd4f23267e4173c0a7dbff1c8eee Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:35:50 -0500 Subject: [PATCH 027/601] New arguments for TopN_Args object for latest Uno --- workflows/cp-leaveout/py/data_setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 0b2a9201..6055fbce 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -16,7 +16,9 @@ def __init__(self, dataframe_from, node, plan, output): self.plan = plan self.fold = None 
self.incremental = 'True' - self.output = output + self.cell_feature_selection = None + self.drug_feature_selection = None + self.output = output def pre_run(params): import sys, time From f34272c84ec61ee79579218c8bee2081cb7a0653 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:37:13 -0500 Subject: [PATCH 028/601] WS --- workflows/cp-leaveout/py/data_setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 6055fbce..bf5a4498 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -51,10 +51,10 @@ def pre_run(params): return ModelResult.ERROR params["dataframe_from"] = dest.resolve() params["use_exported_data"] = "/mnt/bb/{}/{}".format(username, params["use_exported_data"]) - + # softlink to cache & config file # build node specific training/validation dataset - + args = TopN_Args(params["dataframe_from"], params["node"], params["plan"], From 4e32d3329b9c51be883730b45f345e039a3c5c27 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:37:40 -0500 Subject: [PATCH 029/601] Do not make soft link to cache --- workflows/cp-leaveout/py/data_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index bf5a4498..d8574683 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -62,7 +62,7 @@ def pre_run(params): data = params["benchmark_data"] try: - for filename in [ "cache", "uno_auc_model.txt" ]: + for filename in [ "uno_auc_model.txt" ]: # "cache", if not os.path.islink(filename): os.symlink(f"{data}/{filename}", filename) except Exception as e: From d6f71f33cc1617ebdf1af688f90df501f9120a20 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:37:58 -0500 Subject: [PATCH 030/601] If topN_to_uno.build_dataframe() raises ValueError, that is an ERROR --- workflows/cp-leaveout/py/data_setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index d8574683..0916c536 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -27,8 +27,8 @@ def pre_run(params): # check NVMe disk is available username = os.environ['USER'] - nvme_enabled = Path('/mnt/bb/{}'.format(username)).exists() - + # nvme_enabled = Path('/mnt/bb/{}'.format(username)).exists() + nvme_enabled = False if nvme_enabled: # copy original datafrom to NVMe disk space try: @@ -81,7 +81,7 @@ def pre_run(params): print("data_setup: caught ValueError for node: '%s'" % params["node"]) # new 2019-12-02 traceback.print_exc(file=sys.stdout) - return ModelResult.SKIP + return ModelResult.ERROR except Exception as e: print("data_setup: error in build_dataframe!\n" + str(e)) traceback.print_exc() From d5199c50476a3a0b5d9277148ecaead728681160 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:38:48 -0500 Subject: [PATCH 031/601] More error reporting in plangen --- workflows/cp-leaveout/py/plangen.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index bb440cde..dd5efe25 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -10,6 +10,7 @@ import sys import sqlite3 from sqlite3 import Error as db_Error +import traceback 
import planargs @@ -529,8 +530,14 @@ def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): except db_Error as e: db_exception = True + print('execute_sql_stmt: caught exception') print('execute_sql_stmt:', stmt) print('execute_sql_stmt:', e) + info = sys.exc_info() + s = traceback.format_tb(info[2]) + print('PLANGEN TRACEBACK:\n' + + str(e) + ' ... \n' + ''.join(s)) + sys.stdout.flush() if not trap_exception: raise finally: From 32f71023391b500275ac54215f9ace8bbf2198dc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:45:50 -0500 Subject: [PATCH 032/601] Use Bash syntax --- workflows/cp-leaveout/scripts/check-run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/check-run.sh b/workflows/cp-leaveout/scripts/check-run.sh index 406daf08..bbb92e3a 100755 --- a/workflows/cp-leaveout/scripts/check-run.sh +++ b/workflows/cp-leaveout/scripts/check-run.sh @@ -13,7 +13,7 @@ source $SUPERVISOR/workflows/common/sh/utils.sh SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ DIR - ${*} -if ! [ -d $DIR ] +if ! [[ -d $DIR ]] then echo "Does not exist: $DIR" exit 1 From 325a03fcec811a711115de3dae03e9db56c3f941 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:46:16 -0500 Subject: [PATCH 033/601] Clean up --- workflows/cp-leaveout/scripts/extract-node-info.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.sh b/workflows/cp-leaveout/scripts/extract-node-info.sh index d7b75eb0..33d09089 100755 --- a/workflows/cp-leaveout/scripts/extract-node-info.sh +++ b/workflows/cp-leaveout/scripts/extract-node-info.sh @@ -20,11 +20,6 @@ then fi -# # The stdout from the workflow (read by this script) -# OUTPUT=$DIR/output.txt -# # The output of this script, a plottable file -# SUMMARY=$DIR/summary.txt - # Put all matching file names in this file, one per line # (this could contain thousands of entries, too long for command line): LOG_LIST=$DIR/log-list.txt From f831ff3520b900d481cf1811ac7bacac8d6794c9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:47:02 -0500 Subject: [PATCH 034/601] Support STATS or INFER mode --- .../cp-leaveout/scripts/tar-experiment.sh | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/tar-experiment.sh b/workflows/cp-leaveout/scripts/tar-experiment.sh index 566c9279..161be878 100755 --- a/workflows/cp-leaveout/scripts/tar-experiment.sh +++ b/workflows/cp-leaveout/scripts/tar-experiment.sh @@ -10,11 +10,31 @@ SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) source $SUPERVISOR/workflows/common/sh/utils.sh SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ - DIR - ${*} + -H "Provide a MODE (STATS or INFER)!" \ + DIR MODE - ${*} # Get directory named "experiments" EXPERIMENTS=$( readlink --canonicalize $( dirname $DIR ) ) EXPID=$( basename $DIR ) +if [[ $MODE == "STATS" ]] +then + # For Node.py stats processing + OPTIONS=( --exclude '*.tsv' --exclude '*.h5' ) + Z="z" + EXT="tgz" +elif [[ $MODE == "INFER" ]] +then + # For inferencing runs + echo "find ..." 
+ MATCHES=( -name '*.json' -or -name 'uno*.log' -or -name 'uno*.h5' ) + find $DIR ${MATCHES[@]} > tar.list + OPTIONS=( --files-from=tar.list ) + DIR="" # Unset this- only files in tar.list are included + Z="" + EXT="tar" +fi + set -x -nice tar cfz $EXPERIMENTS/$EXPID.tgz --exclude '*.h5' --exclude '*.tsv' $DIR +nice tar cf$Z $EXPERIMENTS/$EXPID.$EXT ${OPTIONS[@]} $DIR +du -h $EXPERIMENTS/$EXPID.$EXT From bf7db4ac2edf5bac26a8958caf4ed3cfc7a9e2a5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:54:46 -0500 Subject: [PATCH 035/601] Code to back up old runs --- workflows/cp-leaveout/swift/workflow.sh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 2f32739e..5db49b4d 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -110,9 +110,9 @@ then if [[ ! -f $TURBINE_OUTPUT/output.txt ]] then # If output.txt does not exist, assume the moves already happened - echo "The outputs were already moved from $EXPID" + echo "WARNING: The outputs were already moved from $EXPID" else - next $TURBINE_OUTPUT/restarts/%i + next $TURBINE_OUTPUT/restarts/%i # cf. utils.sh:next() PRIOR_RUN=$REPLY echo "Moving old outputs to $PRIOR_RUN" mkdir -pv $PRIOR_RUN @@ -122,6 +122,18 @@ then $TURBINE_OUTPUT/jobid.txt ) mv ${PRIORS[@]} $PRIOR_RUN cp -v $TURBINE_OUTPUT/cplo.db $PRIOR_RUN + echo $TURBINE_OUTPUT/run/*/save + for D in $TURBINE_OUTPUT/run/*/save + do + cd $D + echo D=$D + shopt -s nullglob + for f in *.json *.h5 *.log + do + : # cp -v --backup=numbered $f $f.bak + done + cd - + done fi else if [[ -f $TURBINE_OUTPUT/output.txt ]] From bf7220b0bfb7ddc30e50a98f7f2bf66ec0066261 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:55:52 -0500 Subject: [PATCH 036/601] Default epochs=50 --- workflows/cp-leaveout/swift/workflow.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index b166b301..d2ed3791 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -65,7 +65,7 @@ else { runtype = "plangen.RunType.RUN_ALL"; } -E_s = argv("E", "20"); +E_s = argv("E", "50"); assert(strlen(E_s) > 0, "workflow.swift: you must provide an argument to -E"); int max_epochs = string2int(E_s); // epochs=20 is just under 2h on Summit. string plan_json = argv("plan_json"); From 6c26e1cb708e568839b20b26920d24e639de0b72 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:57:16 -0500 Subject: [PATCH 037/601] Better exception reporting --- workflows/cp-leaveout/swift/workflow.swift | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index d2ed3791..0e4fe6ac 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -167,14 +167,15 @@ sys.stdout.flush() { result = python_db( ---- -import fcntl, sys, traceback +import sys, traceback import plangen try: result = str(plangen.start_subplan('%s', '%s', %s, '%s', %s)) except Exception as e: info = sys.exc_info() s = traceback.format_tb(info[2]) - print(str(e) + ' ... \\n' + ''.join(s)) + print('EXCEPTION in plangen_start()\\n' + + str(e) + ' ... 
\\n' + ''.join(s)) sys.stdout.flush() result = "EXCEPTION" ---- % (db_file, plan_json, plan_id, node, runtype), @@ -192,7 +193,8 @@ try: except Exception as e: info = sys.exc_info() s = traceback.format_tb(info[2]) - sys.stdout.write(str(e) + ' ... \\n' + ''.join(s) + '\\n') + sys.stdout.write('EXCEPTION in plangen_stop()\\n' + + str(e) + ' ... \\n' + ''.join(s) + '\\n') sys.stdout.flush() result = 'EXCEPTION' ---- % (db_file, plan_id, node), @@ -210,7 +212,7 @@ except Exception as e: "config_file": "uno_auc_model.txt", "cache": "cache/top6_auc", "dataframe_from": "%s", -"save_weights": "model.h5", +"save_weights": "save/model.h5", "gpus": "0", "epochs": %i, "es": "True", @@ -224,7 +226,7 @@ except Exception as e: parent = substring(this, 0, n-2); result = json_fragment + ---- , -"initial_weights": "../%s/model.h5" +"initial_weights": "../%s/save/model.h5" ---- % parent; } else From f4380fd5ea923b23c237c8d6da31f62c47b7dfbf Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:58:41 -0500 Subject: [PATCH 038/601] Add more logging --- workflows/cp-leaveout/scripts/Node.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index a6aa0e4c..bb4719ca 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -39,6 +39,7 @@ def __init__(self, id=None): def set_id(self, id): self.id = id self.stage = (len(self.id) - 1 ) // 2 + self.debug("SET ID: " + id) def parent(self): if self.stage == 1: From 1bcc6e839e5114c6f74c115e3f90f4a14b96ba67 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:59:07 -0500 Subject: [PATCH 039/601] Better usage notes --- workflows/cp-leaveout/scripts/README.adoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/README.adoc b/workflows/cp-leaveout/scripts/README.adoc index 94580170..363f5caa 100644 --- a/workflows/cp-leaveout/scripts/README.adoc +++ b/workflows/cp-leaveout/scripts/README.adoc @@ -33,7 +33,7 @@ COMPLETE / TOTAL = 1364 / 1364 : 0 remaining. ==== Generate a Node Pickle (extract-node-info) -This is a Python Pickle containing the Node data. See Node.py . +This makes the Python Pickle containing the Node data. See Node.py . This avoids needing to walk all logs all the time (which takes tens of seconds). ---- @@ -42,7 +42,7 @@ $ scripts/extract-node-info.sh $D ==== Print Node info (print-node-info) -Prints a big table of all Node statistics: +Prints a big table of all Node statistics using the Node Pickle. 
Format: From 35ec407611ab57c10997c0cfd45b7807246deed3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Jul 2020 14:59:50 -0500 Subject: [PATCH 040/601] Support latest log format, including training runs continued from restart --- workflows/cp-leaveout/scripts/Node.py | 18 ++++++++++++++++-- .../cp-leaveout/scripts/extract-node-info.py | 11 ++++++----- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index bb4719ca..edca1349 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -89,6 +89,14 @@ def parse_epochs(self, line): self.epochs_planned = int(tokens[-1].strip()) self.debug("epochs_planned: %i" % self.epochs_planned) + def parse_epoch_status(self, line): + tokens = line.split() + assert len(tokens) == 2, "bad line: " + line + ints = tokens[1].split("/") + assert len(tokens) == 2 + self.epochs_actual = int(ints[0]) + self.debug("epochs_actual: " + str(self.epochs_actual)) + def stop_early(self): self.stopped_early = True self.debug("STOP EARLY") @@ -106,8 +114,10 @@ def parse_date_stop(self, line): self.debug("COMPLETE") def parse_training_done(self, line): - self.epochs_actual += 1 - # Find the location of training_done (td) (to accommodate prefixes) + # The current epoch should already be set + # by parse_epoch_status() + # First, find the location of training_done (td) + # (to accommodate prefixes) tokens = line.split() td = 0 while tokens[td] != Node.training_done: @@ -147,6 +157,10 @@ def total_time(self, nodes): 2019-12-14 09:46:32 MODEL RUNNER DEBUG epochs = 5 +parse_epoch_status() (from Keras) + +Epoch 29/50 + stop_early() Epoch 00004: early stopping diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 023c53f4..84899bfc 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -59,17 +59,18 @@ def parse_log(log_fp, nodes): node_current = None while True: line = log_fp.readline() - if line == "": - break + if line == "": break if "PARAM UPDATE START" in line: node_current = Node() node_current.parse_date_start(line) - if "MODEL RUNNER DEBUG node =" in line: + if "MODEL RUNNER DEBUG node =" in line: tokens = line.split() node_id = tokens[-1].strip() node_current.set_id(node_id) - elif "MODEL RUNNER DEBUG epochs =" in line: + elif "MODEL RUNNER DEBUG epochs =" in line: node_current.parse_epochs(line) + elif line.startswith("Epoch ") and "/" in line: + node_current.parse_epoch_status(line) elif Node.training_done in line: node_current.parse_training_done(line) elif "early stopping" in line: @@ -79,7 +80,7 @@ def parse_log(log_fp, nodes): elif "DONE: run_id" in line: node_current.parse_date_stop(line) if node_current != None and node_current.complete: - # Store a complete Node + # Store a complete Node in global dict nodes nodes[node_current.id] = node_current nodes_found += 1 node_current = None From 8f548fc55e3b3853501efc71712662c2f42f6da0 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 14 Jul 2020 21:18:52 -0400 Subject: [PATCH 041/601] o Fix GA summit run scripts --- workflows/GA/data/adrp_param_space_ga.json | 6 +++--- workflows/GA/swift/workflow.sh | 1 + workflows/GA/test/cfg-prm-summit.sh | 2 +- workflows/GA/test/cfg-sys-summit.sh | 7 +++++-- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/workflows/GA/data/adrp_param_space_ga.json b/workflows/GA/data/adrp_param_space_ga.json 
index 144b462d..2b86d137 100644 --- a/workflows/GA/data/adrp_param_space_ga.json +++ b/workflows/GA/data/adrp_param_space_ga.json @@ -1,4 +1,4 @@ -i[ +[ { "name": "dense", "type": "categorical", @@ -28,8 +28,8 @@ i[ { "name": "epochs", "type": "int", - "lower": 90, - "upper": 90, + "lower": 200, + "upper": 200, "sigma": 20 }, { diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 85985587..f6836bf1 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -156,6 +156,7 @@ swift-t -O 0 -n $PROCS \ -e SITE \ -e BENCHMARK_TIMEOUT \ -e SH_TIMEOUT \ + -e TURBINE_STDOUT \ -e IGNORE_ERRORS \ $WAIT_ARG \ $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ diff --git a/workflows/GA/test/cfg-prm-summit.sh b/workflows/GA/test/cfg-prm-summit.sh index 62db490d..58fa0a1e 100644 --- a/workflows/GA/test/cfg-prm-summit.sh +++ b/workflows/GA/test/cfg-prm-summit.sh @@ -6,7 +6,7 @@ SEED=${SEED:-1} # Total iterations NUM_ITERATIONS=${NUM_ITERATIONS:-1} # Size of GA population (i.e. the number of parameter sets to evaluate) -POPULATION_SIZE=${POPULATION_SIZE:-44} +POPULATION_SIZE=${POPULATION_SIZE:-274} # the GA strategy: one of 'simple' or 'mu_plus_lambda'. See # https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms # for more info. diff --git a/workflows/GA/test/cfg-sys-summit.sh b/workflows/GA/test/cfg-sys-summit.sh index 1a946208..38ee21f8 100644 --- a/workflows/GA/test/cfg-sys-summit.sh +++ b/workflows/GA/test/cfg-sys-summit.sh @@ -4,11 +4,14 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-46} +export PROCS=${PROCS:-276} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} +export PPN=${PPN:-6} + +# for summit use these options +export TURBINE_LAUNCH_OPTIONS="-a 1 -g 1 -c 1" export WALLTIME=${WALLTIME:-06:00:00} From f385c7d880d83beb0b0a4da58099a73467f90d70 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 15 Jul 2020 11:54:06 -0500 Subject: [PATCH 042/601] Handle loss increase check as well as val_loss increase check --- workflows/cp-leaveout/scripts/Node.py | 10 ++- .../scripts/find-loss-increases.py | 65 +++++++++++++++---- 2 files changed, 59 insertions(+), 16 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index edca1349..6da2cc23 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -18,6 +18,9 @@ def __init__(self, id=None): self.stage = None # Number of training steps performed self.steps = 0 + self.loss = None + # Difference wrt parent (lower is better) + self.loss_delta = None self.val_loss = None # Difference wrt parent (lower is better) self.val_loss_delta = None @@ -67,10 +70,10 @@ def str_table(self): special = " INCOMPLETE!" if self.stopped_early: special = " EARLY STOP!" 
- return "%-12s : %i : %2i / %2i : %0.5f : %s - %s : %s" % \ + return "%-12s : %i : %2i / %2i : loss: %0.5f vl: %0.5f : %s - %s : %s" % \ (self.id, self.stage, self.epochs_actual, self.epochs_planned, - self.val_loss, + self.loss, self.val_loss, self.date_start, self.date_stop, special) @@ -126,7 +129,8 @@ def parse_training_done(self, line): self.steps += int(stepii[0]) time_s = tokens[td+2] # e.g., "321s" self.time += int(time_s[0:-1]) - # Always collect val_loss: early stopping could happen: + # Always collect losses: early stopping could happen: + self.loss = float(tokens[td+6]) self.val_loss = float(tokens[td+15]) def get_val_loss_delta(node): diff --git a/workflows/cp-leaveout/scripts/find-loss-increases.py b/workflows/cp-leaveout/scripts/find-loss-increases.py index a31bf251..5884d66d 100644 --- a/workflows/cp-leaveout/scripts/find-loss-increases.py +++ b/workflows/cp-leaveout/scripts/find-loss-increases.py @@ -43,10 +43,46 @@ print("total nodes: %i" % len(data)) # Artificial nodes for comparison: -node_worst = Node("WORST") -node_worst.val_loss = 0 -node_best = Node("BEST") -node_best.val_loss = 1000 +node_loss_worst = Node("WORST") +node_loss_worst.loss = 0 +node_loss_best = Node("BEST") +node_loss_best.loss = 1000 + +# List of Nodes where loss increased: +increases_loss = [] +# Total Node count: +total = 0 +# Stage 5 Nodes +leaves = 0 +for node_id in data.keys(): + # print("node: " + node_id) + parent_id = node_id[0:-2] # '1.2.3' -> '1.2' + if len(parent_id) == 1: # stage=1 + continue + if parent_id not in data: + print("parent not found.") + continue + current = data[node_id] + parent = data[parent_id] + if current.stage == 5: leaves += 1 + if not (args.stage == STAGE_ANY or args.stage == current.stage): + continue + current.loss_delta = current.loss - parent.loss + if current.loss_delta > 0: + increases_loss.append(current) + if current.val_loss > node_loss_worst.loss: node_worst = current + if current.val_loss < node_loss_best.loss: node_best = current + total += 1 + +fraction = 100.0 * len(increases_loss) / total +print('increases_loss/total = %i / %i (%02.f%%)' % \ + (len(increases_loss), total, fraction)) + +# Artificial nodes for comparison: +node_vl_worst = Node("WORST") +node_vl_worst.val_loss = 0 +node_vl_best = Node("BEST") +node_vl_best.val_loss = 1000 if args.stage != STAGE_ANY: print("STAGE: %i" % args.stage) @@ -54,11 +90,11 @@ leaves = 0 # stage 5 Nodes # List of Nodes where val_loss increased: -increases = [] +increases_vl = [] # Total Node count: total = 0 for node_id in data.keys(): - print("node: " + node_id) + # print("node: " + node_id) parent_id = node_id[0:-2] # '1.2.3' -> '1.2' if len(parent_id) == 1: # stage=1 continue @@ -72,24 +108,27 @@ continue current.val_loss_delta = current.val_loss - parent.val_loss if current.val_loss_delta > 0: - increases.append(current) - if current.val_loss > node_worst.val_loss: node_worst = current - if current.val_loss < node_best.val_loss: node_best = current + increases_vl.append(current) + if current.val_loss > node_vl_worst.val_loss: node_worst = current + if current.val_loss < node_vl_best.val_loss: node_best = current total += 1 print("leaves: %i" % leaves) if total == 0: fail('No matching Nodes found!') -fraction = 100.0 * len(increases) / total -print('increases/total = %i / %i (%02.f%%)' % (len(increases), total, fraction)) +fraction = 100.0 * len(increases_vl) / total +print('increases_vl/total = %i / %i (%02.f%%)' % \ + (len(increases_vl), total, fraction)) -file_increases = "increases-%s.data" % args.token 
-append(file_increases, "%i %5.1f" % (args.stage, fraction)) +file_increases_vl = "increases-vl-%s.data" % args.token +append(file_increases_vl, "%i %5.1f" % (args.stage, fraction)) print('worst val_loss: ' + str(node_worst)) print('best val_loss: ' + str(node_best)) +exit() + print('DELTAS:') increases.sort(key=Node.get_val_loss_delta) From a8d1ec617e2c1b14c21792c27c7070386fb567b9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 15 Jul 2020 11:57:56 -0500 Subject: [PATCH 043/601] New DB README --- workflows/cp-leaveout/db/README.adoc | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 workflows/cp-leaveout/db/README.adoc diff --git a/workflows/cp-leaveout/db/README.adoc b/workflows/cp-leaveout/db/README.adoc new file mode 100644 index 00000000..13608c11 --- /dev/null +++ b/workflows/cp-leaveout/db/README.adoc @@ -0,0 +1,38 @@ + +== DB Tools + +Tools for the CP SQLite DB. + +=== print-db + +Dump a DB file to text output + +---- +$ ./print-db.sh workflow-1.db +---- + +=== diff-dbs + +Show difference between two DB files + +---- +$ ./diff-dbs.sh workflow-1.db workflow-2.db +---- + +=== print-stats + +Show short DB stats. + +---- +$ ./print-stats.sh workflow-1.db +COMPLETE / TOTAL = 1364 / 1364 : 0 remaining. +---- + +=== reset-node + +Reset (delete) DB nodes, forcing them to be re-run + +---- +$ db/reset-node.sh experiments/X085/restarts-1/cplo.db 1.2.3.2 +---- + From 5fd9d5fbaa2374b0bf4a89ce3abf5015e0f2c5ce Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 15 Jul 2020 11:59:21 -0500 Subject: [PATCH 044/601] Add loss to format --- workflows/cp-leaveout/scripts/README.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/README.adoc b/workflows/cp-leaveout/scripts/README.adoc index 1e6c9398..eb7f7af6 100644 --- a/workflows/cp-leaveout/scripts/README.adoc +++ b/workflows/cp-leaveout/scripts/README.adoc @@ -46,7 +46,7 @@ Prints a big table of all Node statistics using the Node Pickle. Format: -NODE STAGE EPOCHS-ACTUAL / EPOCHS-MAX VAL-LOSS TIME-START TIME_STOP EARLY-STOP? +NODE STAGE EPOCHS-ACTUAL / EPOCHS-MAX LOSS VAL-LOSS TIME-START TIME_STOP EARLY-STOP? 
---- $ scripts/print-node-info.sh $D From 8f1dcce450f1cd93685432e5f67701ea7769c60c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 17 Jul 2020 09:15:51 -0500 Subject: [PATCH 045/601] New holdout error extractor --- .../scripts/extract-holdout-errors.awk | 19 +++++++++++ .../scripts/extract-holdout-errors.sh | 33 +++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 workflows/cp-leaveout/scripts/extract-holdout-errors.awk create mode 100755 workflows/cp-leaveout/scripts/extract-holdout-errors.sh diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.awk b/workflows/cp-leaveout/scripts/extract-holdout-errors.awk new file mode 100644 index 00000000..5340b16c --- /dev/null +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.awk @@ -0,0 +1,19 @@ + +# EXTRACT HOLDOUT ERRORS AWK +# Finds this data in the python.log and reports a summary of it: +# 2020-07-07 14:38:50 Comparing y_true and y_pred: +# 2020-07-07 14:38:50 mse: 0.0063 +# 2020-07-07 14:38:50 mae: 0.0541 +# 2020-07-07 14:38:50 r2: 0.7352 +# 2020-07-07 14:38:50 corr: 0.8590 + +$3 == "Comparing" { + getline + mse = $3 " " $4 + getline + mae = $3 " " $4 + getline + r2 = $3 " " $4 + printf "%-12s %s %s %s\n", node, mse, mae, r2 + exit +} diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh new file mode 100755 index 00000000..354681d9 --- /dev/null +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -eu + +# EXTRACT HOLDOUT ERRORS SH +# Extract holdout error data from all python.logs +# in given experiment directory +# Provide an experiment directory DIR +# Creates $DIR/holdout-errors.txt +# See extract-holdout-errors.awk + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if [[ ! 
-d $DIR ]] +then + echo "$0: Given experiment directory does not exist: $DIR" + exit 1 +fi + +EXTRACT_HOLDOUT_ERRORS_AWK=$THIS/extract-holdout-errors.awk + +RUNS=$( ls $DIR/run ) +for RUN in $RUNS +do + NODE=$( basename $RUN ) + LOG=$DIR/run/$RUN/save/python.log + awk -f $EXTRACT_HOLDOUT_ERRORS_AWK -v node=$NODE < $LOG +done > $DIR/holdout-errors.txt From 4164970882f358eacbb05fd43e9dfbf4b625f33e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 17 Jul 2020 09:30:56 -0500 Subject: [PATCH 046/601] Simplify --- workflows/cp-leaveout/scripts/extract-holdout-errors.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh index 354681d9..4fd105a0 100755 --- a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh @@ -24,10 +24,9 @@ fi EXTRACT_HOLDOUT_ERRORS_AWK=$THIS/extract-holdout-errors.awk -RUNS=$( ls $DIR/run ) -for RUN in $RUNS +NODES=$( ls $DIR/run ) +for NODE in $NODES do - NODE=$( basename $RUN ) - LOG=$DIR/run/$RUN/save/python.log + LOG=$DIR/run/$NODE/save/python.log awk -f $EXTRACT_HOLDOUT_ERRORS_AWK -v node=$NODE < $LOG -done > $DIR/holdout-errors.txt +done > $DIR/holdout-errors2.txt From e96d32a647dd218f0d5e35ecfcd1caf82ee2dc5f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 17 Jul 2020 09:31:26 -0500 Subject: [PATCH 047/601] Revert debugging change --- workflows/cp-leaveout/scripts/extract-holdout-errors.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh index 4fd105a0..5c42211d 100755 --- a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh @@ -29,4 +29,4 @@ for NODE in $NODES do LOG=$DIR/run/$NODE/save/python.log awk -f $EXTRACT_HOLDOUT_ERRORS_AWK -v node=$NODE < $LOG -done > $DIR/holdout-errors2.txt +done > $DIR/holdout-errors.txt From a4812cbeddb224a1e4e37f39003a0b8219ce26e5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 29 Jul 2020 11:49:28 -0500 Subject: [PATCH 048/601] Merge --- scratch/csv2f64/hdf2f64.py | 1 - workflows/common/sh/env-dunedin.sh | 5 +- workflows/common/sh/utils.sh | 7 +- workflows/cp-leaveout/scripts/Node.py | 39 ++- .../cp-leaveout/scripts/extract-node-info.py | 8 + .../scripts/find-loss-increases.py | 235 +++++++++++++----- workflows/cp-leaveout/swift/workflow.sh | 4 +- workflows/cp-leaveout/swift/workflow.swift | 6 +- workflows/cp-leaveout/test/test-1.sh | 26 +- workflows/cp-leaveout/test/test-512.sh | 27 +- workflows/mlrMBO/test/test-nightly.sh | 2 +- workflows/uq-noise/good-runs.txt | 8 - 12 files changed, 271 insertions(+), 97 deletions(-) delete mode 100644 workflows/uq-noise/good-runs.txt diff --git a/scratch/csv2f64/hdf2f64.py b/scratch/csv2f64/hdf2f64.py index e66feaca..8bd4fd35 100644 --- a/scratch/csv2f64/hdf2f64.py +++ b/scratch/csv2f64/hdf2f64.py @@ -22,6 +22,5 @@ # print(ds.shape) # print(ds.dtype) a8 = a.astype('float64') -# print(a[0,0,0]) a8.tofile(args.output) diff --git a/workflows/common/sh/env-dunedin.sh b/workflows/common/sh/env-dunedin.sh index d2b320a8..66e0d161 100644 --- a/workflows/common/sh/env-dunedin.sh +++ b/workflows/common/sh/env-dunedin.sh @@ -4,7 +4,7 @@ # Assumes WORKFLOWS_ROOT, BENCHMARK_DIR, BENCHMARKS_ROOT are set # Python -PY=/home/wozniak/Public/sfw/anaconda3-tf +PY=/home/wozniak/Public/sfw/anaconda3 
export PYTHONPATH=${PYTHONPATH:-}${PYTHONPATH:+:} PYTHONPATH+=$WORKFLOWS_ROOT/common/python: export PYTHONHOME=$PY @@ -32,7 +32,8 @@ fi # LD_LIBRARY_PATH export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}${LD_LIBRARY_PATH:+:} -#LD_LIBRARY_PATH+=$R_HOME/lib +LD_LIBRARY_PATH+=$R_HOME/lib: +LD_LIBRARY_PATH+=$R_HOME/library/RInside/lib # LD_LIBRARY_PATH+=:/home/wozniak/Public/sfw/anaconda3/lib LD_LIBRARY_PATH=/usb2/wozniak/Public/sfw/R-3.5.3/lib/R/lib:$LD_LIBRARY_PATH show LD_LIBRARY_PATH diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 6e7baa54..f5e86b8a 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -168,7 +168,12 @@ get_expid() then shift # Search for free experiment number - mkdir -pv $EXPERIMENTS + if ! mkdir -pv $EXPERIMENTS + then + echo "get_expid(): could not make experiments directory:" \ + $EXPERIMENTS + return 1 + fi EXPS=( $( ls $EXPERIMENTS ) ) if (( ${#EXPS[@]} != 0 )) then diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 6da2cc23..f8da4629 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -4,7 +4,7 @@ # The training node information as stored in the logs # See the footer of this file for example log text that is parsed here -import math +# import math class Node: @@ -22,7 +22,10 @@ def __init__(self, id=None): # Difference wrt parent (lower is better) self.loss_delta = None self.val_loss = None - # Difference wrt parent (lower is better) + # Validation set size + self.val_data = None + # Differences wrt parent (lower is better) + self.loss_delta = None self.val_loss_delta = None # Epochs prescribed by the workflow self.epochs_planned = None @@ -55,11 +58,12 @@ def __str__(self): special = " INCOMPLETE!" if self.stopped_early: special = " EARLY STOP!" 
- return "Node [%s]: %s (epochs=%i/%s, val_loss=%s)%s" % \ + return "Node [%s]: %s (epochs=%i/%s, loss=%s, val_loss=%s)%s" % \ (Node.maybe_str_integer(self.stage), self.id, self.epochs_actual, Node.maybe_str_integer(self.epochs_planned), + Node.maybe_str_float(self.loss, "%0.6f"), Node.maybe_str_float(self.val_loss, "%0.6f"), special) @@ -99,7 +103,7 @@ def parse_epoch_status(self, line): assert len(tokens) == 2 self.epochs_actual = int(ints[0]) self.debug("epochs_actual: " + str(self.epochs_actual)) - + def stop_early(self): self.stopped_early = True self.debug("STOP EARLY") @@ -133,8 +137,29 @@ def parse_training_done(self, line): self.loss = float(tokens[td+6]) self.val_loss = float(tokens[td+15]) + def parse_val_data(self, fp): + """ + fp is the file pointer to save/python.log + If val data is not found, node.val_data will remain None + """ + marker = "val data = " + marker_length = len(marker) + while True: + line = fp.readline() + if line == "": break + index = line.find("val data =") + if index == -1: continue + tail = line[index+marker_length:] + comma = tail.find(",") + value_string = tail[:comma] + self.val_data = int(value_string) + + def get_loss_delta(node): + if node.loss_delta == None: + raise ValueError("No loss_delta!") + return node.loss_delta + def get_val_loss_delta(node): - ''' For sorting ''' if node.val_loss_delta == None: raise ValueError("No val_loss_delta!") return node.val_loss_delta @@ -165,6 +190,10 @@ def total_time(self, nodes): Epoch 29/50 +parse_val_data() ==> self.val_data + +2020-04-15 13:45:41 CV fold 0: train data = 5265, val data = 1400, test data = 0 + stop_early() Epoch 00004: early stopping diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 8ddd986a..de2af875 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -87,11 +87,19 @@ def parse_log(log_fp, nodes): if node_current != None and node_current.complete: # Store a complete Node in global dict nodes nodes[node_current.id] = node_current + find_val_data(node_current) nodes_found += 1 node_current = None logging.info("Found %i nodes in log." % nodes_found) +def find_val_data(node): + python_log = args.directory + "/run/%s/save/python.log" % node.id + with open(python_log) as fp: + node.parse_val_data(fp) + if node.val_data == None: + logging.fatal("Could not find val data for node: " + node.id) + # List of log file names log_files = read_log_filenames(log_list) # Dict mapping Node id to Node for all complete Nodes diff --git a/workflows/cp-leaveout/scripts/find-loss-increases.py b/workflows/cp-leaveout/scripts/find-loss-increases.py index 5884d66d..73b9d761 100644 --- a/workflows/cp-leaveout/scripts/find-loss-increases.py +++ b/workflows/cp-leaveout/scripts/find-loss-increases.py @@ -43,6 +43,7 @@ print("total nodes: %i" % len(data)) # Artificial nodes for comparison: +# !! 
Updated upstream node_loss_worst = Node("WORST") node_loss_worst.loss = 0 node_loss_best = Node("BEST") @@ -53,7 +54,7 @@ # Total Node count: total = 0 # Stage 5 Nodes -leaves = 0 +leaves = 0 for node_id in data.keys(): # print("node: " + node_id) parent_id = node_id[0:-2] # '1.2.3' -> '1.2' @@ -77,16 +78,29 @@ fraction = 100.0 * len(increases_loss) / total print('increases_loss/total = %i / %i (%02.f%%)' % \ (len(increases_loss), total, fraction)) - + # Artificial nodes for comparison: node_vl_worst = Node("WORST") node_vl_worst.val_loss = 0 node_vl_best = Node("BEST") node_vl_best.val_loss = 1000 +# == +# val_loss: +node_worst_val_loss = Node("WORST VAL_LOSS") +node_worst_val_loss.val_loss = 0 +node_best_val_loss = Node("BEST VAL_LOSS") +node_best_val_loss.val_loss = 1000 +# loss: +node_worst_loss = Node("WORST LOSS") +node_worst_loss.loss = 0 +node_best_loss = Node("BEST LOSS") +node_best_loss.loss = 1000 +# !! Stashed changes if args.stage != STAGE_ANY: print("STAGE: %i" % args.stage) +# !! Updated upstream leaves = 0 # stage 5 Nodes # List of Nodes where val_loss increased: @@ -112,90 +126,181 @@ if current.val_loss > node_vl_worst.val_loss: node_worst = current if current.val_loss < node_vl_best.val_loss: node_best = current total += 1 +# == +def get_increases(): + # List of Nodes where loss increased: + global increases_loss + increases_loss = [] + # List of Nodes where val_loss increased: + global increases_val_loss + increases_val_loss = [] + + global node_worst_loss, node_worst_val_loss + global node_best_loss, node_best_val_loss + + # count of Nodes: + total = 0 + # count of stage 5 Nodes + leaves = 0 + # count of Nodes with missing parent + parents_missing = 0 + for node_id in data.keys(): + # print("node: " + node_id) + parent_id = node_id[0:-2] # '1.2.3' -> '1.2' + if len(parent_id) == 1: # stage=1 + continue + if parent_id not in data: + # print("parent not found.") + parents_missing += 1 + continue + current = data[node_id] + parent = data[parent_id] + if current.stage == 5: leaves += 1 + if not (args.stage == STAGE_ANY or args.stage == current.stage): + continue + current.val_loss_delta = current.val_loss - parent.val_loss + current.loss_delta = current.loss - parent.loss + # Register increases: + if current.val_loss_delta > 0: + increases_val_loss.append(current) + if current.loss_delta > 0: + increases_loss.append(current) + # Update best/worst: + if current.loss > node_worst_loss.loss: + node_worst_loss = current + if current.loss < node_best_loss.loss: + node_best_loss = current + if current.val_loss > node_worst_val_loss.val_loss: + node_worst_val_loss = current + if current.val_loss < node_best_val_loss.val_loss: + node_best_val_loss = current + total += 1 + print("parents_missing: %i" % parents_missing) + return total, leaves + +# total: count of Nodes +# leaves: count of stage 5 Nodes +total, leaves = get_increases() +# !! Stashed changes print("leaves: %i" % leaves) if total == 0: fail('No matching Nodes found!') +# !! 
Updated upstream fraction = 100.0 * len(increases_vl) / total print('increases_vl/total = %i / %i (%02.f%%)' % \ (len(increases_vl), total, fraction)) file_increases_vl = "increases-vl-%s.data" % args.token append(file_increases_vl, "%i %5.1f" % (args.stage, fraction)) +# == +fraction = 100.0 * len(increases_loss) / total +print('increases_loss/total = %i / %i (%02.f%%)' % \ + (len(increases_loss), total, fraction)) +filename = "increases-loss-%s.data" % args.token +append(filename, "%i %5.1f" % (args.stage, fraction)) + +fraction = 100.0 * len(increases_val_loss) / total +print('increases_val_loss/total = %i / %i (%02.f%%)' % \ + (len(increases_val_loss), total, fraction)) +filename = "increases-val_loss-%s.data" % args.token +append(filename, "%i %5.1f" % (args.stage, fraction)) +# !! Stashed changes -print('worst val_loss: ' + str(node_worst)) -print('best val_loss: ' + str(node_best)) +print('worst loss: ' + str(node_worst_loss)) +print('best loss: ' + str(node_best_loss)) +print('worst val_loss: ' + str(node_worst_val_loss)) +print('best val_loss: ' + str(node_best_val_loss)) exit() print('DELTAS:') -increases.sort(key=Node.get_val_loss_delta) -stopped_early = 0 -for i in increases: - # print('%f %-14s %r' % (i.val_loss_delta, i.id, i.stopped_early)) - if i.stopped_early: stopped_early += 1 +increases_loss .sort(key=Node.get_loss_delta) +increases_val_loss.sort(key=Node.get_val_loss_delta) +# stopped_early = 0 +# for i in increases: +# # print('%f %-14s %r' % (i.val_loss_delta, i.id, i.stopped_early)) +# if i.stopped_early: stopped_early += 1 def print_delta(prefix, node): print(prefix, str(node), 'delta: %f' % node.val_loss_delta) -worst = increases[-1] -print_delta('worst: ', worst) - -n_01p = int(round(len(increases) / 100)) # Worst 1 percentile -if n_01p == 0: n_01p = 1 -worst_01p = increases[-n_01p] -print_delta('worst 1%:', worst_01p) - -n_10p = int(round(len(increases) / 10)) # Worst 10 percentile -if n_10p == 0: n_10p = 1 -worst_10p = increases[-n_10p] -print_delta('worst 10%:', worst_10p) - -print('increases that stopped early: %i' % stopped_early) - -values_increase = [] -values_val_loss = [] - -for node in increases: - values_increase.append(node.get_val_loss_delta()) - values_val_loss.append(node.val_loss) - -avg_increase = avg(values_increase) -avg_val_loss = avg(values_val_loss) -print('avg increase: %f' % avg_increase) -delta_ratio = 100.0 * avg_increase / avg_val_loss -print('avg increase fraction: %f' % delta_ratio) - -file_increase_deltas = "increase-deltas-%s.data" % args.token -append(file_increase_deltas, "%i %5.1f" % (args.stage, delta_ratio)) - -outliers_file = "outliers-%s.data" % args.token -print("avg_increase", str(avg_increase)) -print("avg_val_loss", str(avg_val_loss)) - -print("%-2s %-12s %-8s %-8s %-8s %-8s" % \ - ("", "node", "val_loss", "parent", "delta", "ratio")) - -increases.sort(key=Node.get_val_loss_delta, reverse=True) -ratios = [] -index = 1 -for node in increases: - parent = data[node.parent()] - ratio = node.get_val_loss_delta() / parent.val_loss - print("%2i %-12s %0.6f %0.6f %0.6f %0.6f" % - (index, node.id, node.val_loss, parent.val_loss, - node.get_val_loss_delta(), ratio)) - ratios.append(ratio) - index += 1 -ratios.sort() - -with open(outliers_file, "w") as fp: - i = 0 - for ratio in ratios: - fp.write("%4i %0.7f\n" % (i, ratio)) - i += 1 +# worst = increases[-1] +# print_delta('worst: ', worst) + +# n_01p = int(round(len(increases) / 100)) # Worst 1 percentile +# if n_01p == 0: n_01p = 1 +# worst_01p = increases[-n_01p] +# 
print_delta('worst 1%:', worst_01p) + +# n_10p = int(round(len(increases) / 10)) # Worst 10 percentile +# if n_10p == 0: n_10p = 1 +# worst_10p = increases[-n_10p] +# print_delta('worst 10%:', worst_10p) + +# print('increases that stopped early: %i' % stopped_early) + +# values_increase = [] +# values_val_loss = [] + +# for node in increases: +# values_increase.append(node.get_val_loss_delta()) +# values_val_loss.append(node.val_loss) + +# avg_increase = avg(values_increase) +# avg_val_loss = avg(values_val_loss) +# print('avg increase: %f' % avg_increase) +# delta_ratio = 100.0 * avg_increase / avg_val_loss +# print('avg increase fraction: %f' % delta_ratio) + +# file_increase_deltas = "increase-deltas-%s.data" % args.token +# append(file_increase_deltas, "%i %5.1f" % (args.stage, delta_ratio)) + +# outliers_file = "outliers-%s.data" % args.token +# print("avg_increase", str(avg_increase)) +# print("avg_val_loss", str(avg_val_loss)) + +def report_top_loss_deltas(): + print("%-2s %-12s %-8s %-8s %-8s %-8s" % \ + ("", "node", "loss", "parent", "delta", "ratio")) + increases_loss.sort(key=Node.get_loss_delta, reverse=True) + ratios = [] + index = 1 + for node in increases_loss: + parent = data[node.parent()] + ratio = node.get_loss_delta() / parent.loss + print("%2i %-12s %0.6f %0.6f %0.6f %0.6f" % + (index, node.id, node.loss, parent.loss, + node.get_loss_delta(), ratio)) + ratios.append(ratio) + index += 1 + ratios.sort() + +def report_top_val_loss_deltas(increases_val_loss): + print("%-2s %-12s %-8s %-8s %-8s %-8s %-8s" % \ + ("", "node", "val_loss", "parent", "delta", "ratio", "val_data")) + increases_val_loss.sort(key=Node.get_val_loss_delta, reverse=True) + ratios = [] + index = 1 + for node in increases_val_loss: + parent = data[node.parent()] + ratio = node.get_val_loss_delta() / parent.loss + print("%2i %-12s %0.6f %0.6f %0.6f %0.6f %8i" % + (index, node.id, node.val_loss, parent.val_loss, + node.get_val_loss_delta(), ratio, node.val_data)) + ratios.append(ratio) + index += 1 + ratios.sort() + +report_top_val_loss_deltas(increases_val_loss) + +# with open(outliers_file, "w") as fp: +# i = 0 +# for ratio in ratios: +# fp.write("%4i %0.7f\n" % (i, ratio)) +# i += 1 # with open(outliers_file, "w") as fp: # i = 0 diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 2bf7fbee..c2c4fb38 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -195,9 +195,11 @@ else STDOUT="" fi -TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +TURBINE_STDOUT="" # "$TURBINE_OUTPUT/out/out-%%r.txt" mkdir -pv $TURBINE_OUTPUT/out +echo PROCS $PROCS + swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p -I $EQR -r $EQR \ diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 0e4fe6ac..5a351899 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -34,6 +34,10 @@ import python; import string; import sys; + +printf("OK"); + + import candle_utils; report_env(); @@ -79,7 +83,7 @@ string exp_id = getenv("EXPID"); string turbine_output = getenv("TURBINE_OUTPUT"); // END WORKFLOW ARGUMENTS -// For compatibility with obj(): +// // For compatibility with obj(): global const string FRAMEWORK = "keras"; /** RUN STAGE: A recursive function that manages the stage dependencies */ diff --git a/workflows/cp-leaveout/test/test-1.sh b/workflows/cp-leaveout/test/test-1.sh index 4ffd5501..2128a246 100755 --- 
a/workflows/cp-leaveout/test/test-1.sh +++ b/workflows/cp-leaveout/test/test-1.sh @@ -34,11 +34,27 @@ export CFG_PRM=$THIS/cfg-prm-1.sh # Data files # PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json -SCRATCH=/gpfs/alpine/med106/scratch/wozniak -CANDLE_DATA=$SCRATCH/CANDLE-Data -PLAN_JSON=$CANDLE_DATA/plangen_cell8-p2_drug8-p2.json -DATAFRAME_CSV=$CANDLE_DATA/top21_dataframe_8x8.csv -BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno +# DATAFRAME_CSV=/usb1/wozniak/CANDLE-Benchmarks-Data/top21_dataframe_8x8.csv + +PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell1593-p4_drug1779-p1.json +BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno +SCRATCH=/usb1/wozniak/CANDLE-Benchmarks-Data +CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 +DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv + +# Summit data: +# SCRATCH=/gpfs/alpine/med106/scratch/wozniak +# CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 +# PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno + + +# SCRATCH=/gpfs/alpine/med106/scratch/wozniak +# CANDLE_DATA=$SCRATCH/CANDLE-Data +# PLAN_JSON=$CANDLE_DATA/plangen_cell8-p2_drug8-p2.json +# DATAFRAME_CSV=$CANDLE_DATA/top21_dataframe_8x8.csv +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported diff --git a/workflows/cp-leaveout/test/test-512.sh b/workflows/cp-leaveout/test/test-512.sh index eaa88766..e0d760ed 100755 --- a/workflows/cp-leaveout/test/test-512.sh +++ b/workflows/cp-leaveout/test/test-512.sh @@ -34,15 +34,20 @@ export CFG_PRM=$THIS/cfg-prm-1.sh # Data files # PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json -SCRATCH=/gpfs/alpine/med106/scratch/hsyoo -CANDLE_DATA=$SCRATCH/Milestone13 +# SCRATCH=/gpfs/alpine/med106/scratch/hsyoo +# SCRATCH=/gpfs/alpine/med106/scratch/wozniak +SCRATCH=/usb2/wozniak +CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json -DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv -BENCHMARK_DATA=$SCRATCH/Milestone13/Benchmarks/Pilot1/Uno +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv +DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1 +BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1 # What to return from the objective function (Keras model) -# val_loss (default) and val_corr are supported -export OBJ_RETURN="val_loss" +# val_loss (default), loss, and val_corr are supported +# export OBJ_RETURN="val_loss" +export OBJ_RETURN="loss" if [[ $SITE == "theta" ]] then @@ -58,6 +63,13 @@ do fi done +if [[ ! 
-e $BENCHMARK_DATA/cache ]] +then + echo "$0: The cache does not exist: $BENCHMARK_DATA/cache" + echo "$0: Use mkdir to create this directory" + exit 1 +fi + # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ $MODEL_NAME $WORKFLOW_ARGS \ @@ -71,7 +83,8 @@ OUTPUT=turbine-output/output.txt WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) # Wait for job -queue_wait +# queue_wait +exit SCRIPT=$( basename $0 .sh ) check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID diff --git a/workflows/mlrMBO/test/test-nightly.sh b/workflows/mlrMBO/test/test-nightly.sh index 53ee507b..9c96e281 100755 --- a/workflows/mlrMBO/test/test-nightly.sh +++ b/workflows/mlrMBO/test/test-nightly.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -# MLRMBO TEST 1 +# MLRMBO TEST NIGHTLY usage() { diff --git a/workflows/uq-noise/good-runs.txt b/workflows/uq-noise/good-runs.txt deleted file mode 100644 index 128d4020..00000000 --- a/workflows/uq-noise/good-runs.txt +++ /dev/null @@ -1,8 +0,0 @@ -Dunedin: -X053 : epochs=10 : modifying wrong DF -X056 : epochs=10 : good, small run -Theta: -* : wrong DF -X012 : epochs=7 : good data but flat plot -X013 : epochs=10 -X021 : epochs=10 : DONE From 4c1ca996836abfc6d2b8dcd810803041fa487f3f Mon Sep 17 00:00:00 2001 From: Justin M Wozniak Date: Wed, 29 Jul 2020 12:12:27 -0500 Subject: [PATCH 049/601] Do not need EQ/R here --- workflows/cp-leaveout/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index c2c4fb38..b5589229 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -202,7 +202,7 @@ echo PROCS $PROCS swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ - -p -I $EQR -r $EQR \ + -p \ -I $OBJ_DIR \ -i $OBJ_MODULE \ -I $EMEWS_PROJECT_ROOT/swift \ From 181f8350f777b076f48896b907012789120ea80e Mon Sep 17 00:00:00 2001 From: Justin M Wozniak Date: Wed, 29 Jul 2020 12:13:03 -0500 Subject: [PATCH 050/601] Allow for unset variable --- workflows/cp-leaveout/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index b5589229..b5a5a418 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -207,7 +207,7 @@ swift-t -O 0 -n $PROCS \ -i $OBJ_MODULE \ -I $EMEWS_PROJECT_ROOT/swift \ -i $EPOCH_MODE_MODULE \ - -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ -e BENCHMARKS_ROOT \ -e EMEWS_PROJECT_ROOT \ -e APP_PYTHONPATH=$APP_PYTHONPATH \ From e5df7821caf854360ded226a40591645aeae37c6 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 29 Jul 2020 12:43:53 -0500 Subject: [PATCH 051/601] Adding get-last-experiment.zsh --- workflows/common/sh/get-last-experiment.zsh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 workflows/common/sh/get-last-experiment.zsh diff --git a/workflows/common/sh/get-last-experiment.zsh b/workflows/common/sh/get-last-experiment.zsh new file mode 100644 index 00000000..6b3a3036 --- /dev/null +++ b/workflows/common/sh/get-last-experiment.zsh @@ -0,0 +1,19 @@ + +# GET LAST EXPERIMENT +# A couple handy interactive functions + +D() +# Find the latest experiment directory, assign to environment variable D +{ + D=( experiments/*(om[1]) ) ; d D + local _D + _D=$D + unset D + export D=$_D +} + +E() +# Inspect the outputs in $D +{ + e $D/output.txt $D/out/out-*.txt +} From 
3546fe8d9ff77905b30635447d311585b2eb9586 Mon Sep 17 00:00:00 2001 From: Justin M Wozniak Date: Wed, 29 Jul 2020 12:45:20 -0500 Subject: [PATCH 052/601] Drop sweep.swift --- workflows/cp-leaveout/swift/sweep.swift | 61 ------------------------- 1 file changed, 61 deletions(-) delete mode 100644 workflows/cp-leaveout/swift/sweep.swift diff --git a/workflows/cp-leaveout/swift/sweep.swift b/workflows/cp-leaveout/swift/sweep.swift deleted file mode 100644 index 837241b7..00000000 --- a/workflows/cp-leaveout/swift/sweep.swift +++ /dev/null @@ -1,61 +0,0 @@ - -/* - CP LEAVEOUT SWIFT - Main workflow -*/ - -import assert; -import files; -import io; -import python; -import unix; -import sys; -import string; -import location; -import math; - -string FRAMEWORK = "keras"; - -string xcorr_root = getenv("XCORR_ROOT"); -string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); -string emews_root = getenv("EMEWS_PROJECT_ROOT"); -string turbine_output = getenv("TURBINE_OUTPUT"); - -printf("TURBINE_OUTPUT: " + turbine_output); - -string db_file = argv("db_file"); -string cache_dir = argv("cache_dir"); -// string xcorr_data_dir = argv("xcorr_data_dir"); -string gpus = argv("gpus", ""); - -// string restart_number = argv("restart_number", "1"); -string site = argv("site"); - -int N = 4; // The divisor of the leave out rows/columns - -int X[] = [0:0]; -int Y[] = [0:N]; - -string results[][]; - -app (file o) fake_uno(int leaveout_cell_line, int leaveout_drug) -{ - (emews_root/"swift/fake-uno.sh") leaveout_cell_line leaveout_drug o ; -} - -app (file o) fake_nt3(int leaveout_punch_x, int leaveout_punch_y) -{ - (emews_root/"swift/fake-nt3.sh") leaveout_punch_x leaveout_punch_y o ; -} - -foreach punch_x in X -{ - foreach punch_y in Y - { - file f = fake_nt3(punch_x, punch_y); - results[punch_x][punch_y] = read(f); - } -} - -// The test*.sh scripts check for "RESULTS:" -printf("RESULTS: %i", size(results)); From 16169656e95f8e592879bc9f836e45978e39bb5a Mon Sep 17 00:00:00 2001 From: Justin M Wozniak Date: Wed, 29 Jul 2020 12:46:26 -0500 Subject: [PATCH 053/601] Move plangen.swift stuff into modules --- workflows/cp-leaveout/swift/plangen_0.swift | 25 +++++ .../swift/{plangen.swift => plangen_1.swift} | 6 ++ workflows/cp-leaveout/swift/plangen_2.swift | 94 +++++++++++++++++++ workflows/cp-leaveout/swift/workflow.swift | 91 +----------------- 4 files changed, 129 insertions(+), 87 deletions(-) create mode 100644 workflows/cp-leaveout/swift/plangen_0.swift rename workflows/cp-leaveout/swift/{plangen.swift => plangen_1.swift} (96%) create mode 100644 workflows/cp-leaveout/swift/plangen_2.swift diff --git a/workflows/cp-leaveout/swift/plangen_0.swift b/workflows/cp-leaveout/swift/plangen_0.swift new file mode 100644 index 00000000..3d0dc530 --- /dev/null +++ b/workflows/cp-leaveout/swift/plangen_0.swift @@ -0,0 +1,25 @@ + +/* + PLANGEN 0 SWIFT + Disables plangen. 
Used by ResNet 50 problem +*/ + +(string result) plangen_check() +{ + result = "OK"; +} + +(string result) plangen_prep(string db_file, string plan_json, string runtype) +{ + result = "42"; +} + +(string result) plangen_start(string node, string plan_id) +{ + result = "0"; +} + +(string result) plangen_stop(string node, string plan_id) +{ + result = "OK"; +} diff --git a/workflows/cp-leaveout/swift/plangen.swift b/workflows/cp-leaveout/swift/plangen_1.swift similarity index 96% rename from workflows/cp-leaveout/swift/plangen.swift rename to workflows/cp-leaveout/swift/plangen_1.swift index cdd06452..fe1f8913 100644 --- a/workflows/cp-leaveout/swift/plangen.swift +++ b/workflows/cp-leaveout/swift/plangen_1.swift @@ -1,3 +1,9 @@ + +/* + PLANGEN 1 SWIFT + An early attempt at plangen with FS locks - did not work. +*/ + import python; pragma worktypedef DB; diff --git a/workflows/cp-leaveout/swift/plangen_2.swift b/workflows/cp-leaveout/swift/plangen_2.swift new file mode 100644 index 00000000..1d1a0455 --- /dev/null +++ b/workflows/cp-leaveout/swift/plangen_2.swift @@ -0,0 +1,94 @@ + +/* + PLANGEN 2 SWIFT + Currently working version for Challenge Problem Uno +*/ + +// This DB configuration and python_db() function will put all +// calls to python_db() on rank DB corresponding to +// environment variable TURBINE_DB_WORKERS: + +pragma worktypedef DB; + +@dispatch=DB +(string output) python_db(string code, string expr="repr(0)") +"turbine" "0.1.0" + [ "set <> [ turbine::python 1 1 <> <> ]" ]; + +// Simply use python_db() to log the DB rank: +python_db( +---- +import os, sys +print("This rank is the DB rank: %s" % os.getenv("ADLB_RANK_SELF")) +sys.stdout.flush() +---- +); + +(string check) plangen_check() { + // Simple test that we can import plangen + check = python_db(---- +try: + import plangen + result = 'OK' +except Exception as e: + result = str(e) + ----, + "result"); +} + +(string result) plangen_prep(string db_file, string plan_json, string runtype) +{ +// Initialize the DB +result = python_persist( +---- +import sys, traceback +import plangen +try: + result = str(plangen.plan_prep('%s', '%s', %s)) +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + print(str(e) + ' ... \\n' + ''.join(s)) + sys.stdout.flush() + result = 'EXCEPTION' +---- % (db_file, plan_json, runtype), +"result"); +} + +(string result) plangen_start(string node, string plan_id) +{ + result = python_db( +---- +import sys, traceback +import plangen +try: + result = str(plangen.start_subplan('%s', '%s', %s, '%s', %s)) +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + print('EXCEPTION in plangen_start()\\n' + + str(e) + ' ... \\n' + ''.join(s)) + sys.stdout.flush() + result = "EXCEPTION" +---- % (db_file, plan_json, plan_id, node, runtype), + "result"); +} + +(string result) plangen_stop(string node, string plan_id) +{ + result = python_db( +---- +import plangen +import fcntl, sys, traceback +try: + result = str(plangen.stop_subplan('%s', '%s', '%s', {})) +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + sys.stdout.write('EXCEPTION in plangen_stop()\\n' + + str(e) + ' ... 
\\n' + ''.join(s) + '\\n') + sys.stdout.flush() + result = 'EXCEPTION' +---- % (db_file, plan_id, node), + "result"); +} diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 5a351899..f193cc58 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -34,11 +34,8 @@ import python; import string; import sys; - -printf("OK"); - - import candle_utils; +import plangen_0; report_env(); @@ -147,64 +144,6 @@ run_stage(int N, int S, string this, int stage, void block, } } -// This DB configuration and python_db() function will put all -// calls to python_db() on rank DB corresponding to -// environment variable TURBINE_DB_WORKERS: - -pragma worktypedef DB; - -@dispatch=DB -(string output) python_db(string code, string expr="repr(0)") -"turbine" "0.1.0" - [ "set <> [ turbine::python 1 1 <> <> ]" ]; - -// Simply use python_db() to log the DB rank: -python_db( ----- -import os, sys -print("This rank is the DB rank: %s" % os.getenv("ADLB_RANK_SELF")) -sys.stdout.flush() ----- -); - -(string result) plangen_start(string node, string plan_id) -{ - result = python_db( ----- -import sys, traceback -import plangen -try: - result = str(plangen.start_subplan('%s', '%s', %s, '%s', %s)) -except Exception as e: - info = sys.exc_info() - s = traceback.format_tb(info[2]) - print('EXCEPTION in plangen_start()\\n' + - str(e) + ' ... \\n' + ''.join(s)) - sys.stdout.flush() - result = "EXCEPTION" ----- % (db_file, plan_json, plan_id, node, runtype), - "result"); -} - -(string result) plangen_stop(string node, string plan_id) -{ - result = python_db( ----- -import plangen -import fcntl, sys, traceback -try: - result = str(plangen.stop_subplan('%s', '%s', '%s', {})) -except Exception as e: - info = sys.exc_info() - s = traceback.format_tb(info[2]) - sys.stdout.write('EXCEPTION in plangen_stop()\\n' + - str(e) + ' ... \\n' + ''.join(s) + '\\n') - sys.stdout.flush() - result = 'EXCEPTION' ----- % (db_file, plan_id, node), - "result"); -} - /** MAKE JSON FRAGMENT: Construct the JSON parameter fragment for the model */ (string result) make_json_fragment(string this, int stage) { @@ -242,32 +181,10 @@ except Exception as e: printf("CP LEAVEOUT WORKFLOW: START: N=%i S=%i", N, S); // First: simple test that we can import plangen -check = python_persist(---- -try: - import plangen - result = 'OK' -except Exception as e: - result = str(e) -----, -"result"); -printf("python result: import plangen: '%s'", check) => - assert(check == "OK", "could not import plangen, check PYTHONPATH!"); +check = plangen_check(); +assert(check == "OK", "could not import plangen, check PYTHONPATH!"); -// Initialize the DB -plan_id = python_persist( ----- -import sys, traceback -import plangen -try: - result = str(plangen.plan_prep('%s', '%s', %s)) -except Exception as e: - info = sys.exc_info() - s = traceback.format_tb(info[2]) - print(str(e) + ' ... 
\\n' + ''.join(s)) - sys.stdout.flush() - result = 'EXCEPTION' ----- % (db_file, plan_json, runtype), -"result"); +plan_id = plangen_prep(db_file, plan_json, "NOTHING"); printf("DB plan_id: %s", plan_id); assert(plan_id != "EXCEPTION", "Plan prep failed!"); From c5ec6acb6b71472d2314bd7f757d8ce844f37170 Mon Sep 17 00:00:00 2001 From: Justin M Wozniak Date: Wed, 29 Jul 2020 12:47:23 -0500 Subject: [PATCH 054/601] New case test/test-rn-1.sh --- workflows/cp-leaveout/swift/workflow.sh | 12 ++-- workflows/cp-leaveout/test/test-rn-1.sh | 76 +++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 5 deletions(-) create mode 100755 workflows/cp-leaveout/test/test-rn-1.sh diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index b5a5a418..f81a6512 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -30,22 +30,24 @@ source $WORKFLOWS_ROOT/common/sh/utils.sh usage() { - echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME EPOCH_MODE" + echo " EPOCH_MODE is one of the compute_epochs_*.swift modules." } -if (( ${#} < 5 )) +if (( ${#} < 6 )) then usage exit 1 fi +set -x if ! { get_site $1 # Sets SITE get_expid $2 # Sets EXPID get_cfg_sys $3 get_cfg_prm $4 MODEL_NAME=$5 - EPOCH_MODE=${6:-log} # Default to log mode + EPOCH_MODE=$6 } then usage @@ -57,6 +59,8 @@ WORKFLOW_ARGS=$* echo "WORKFLOW.SH: Running model: $MODEL_NAME for EXPID: $EXPID" +set +x + source_site env $SITE source_site sched $SITE @@ -198,8 +202,6 @@ fi TURBINE_STDOUT="" # "$TURBINE_OUTPUT/out/out-%%r.txt" mkdir -pv $TURBINE_OUTPUT/out -echo PROCS $PROCS - swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p \ diff --git a/workflows/cp-leaveout/test/test-rn-1.sh b/workflows/cp-leaveout/test/test-rn-1.sh new file mode 100755 index 00000000..9fa3daf7 --- /dev/null +++ b/workflows/cp-leaveout/test/test-rn-1.sh @@ -0,0 +1,76 @@ +#!/bin/bash +set -eu + +# CP LEAVEOUT ResNet 1 + +usage() +{ + echo "Usage: test SITE EXPID EPOCH_MODE WORKFLOW_ARGS" + echo " EPOCH_MODE is one of the compute_epochs_*.swift modules." +} + +if (( ${#} < 3 )) +then + usage + exit 1 +fi + +SITE=$1 +RUN_DIR=$2 +EPOCH_MODE=$3 +shift 3 +WORKFLOW_ARGS=$* + +export MODEL_PYTHON_DIR=$HOME/proj/ai-apps +export MODEL_NAME=resnet50 + +# Self-configure +THIS=$( readlink --canonicalize $( dirname $0 ) ) +EMEWS_PROJECT_ROOT=$( readlink --canonicalize $THIS/.. ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( readlink --canonicalize $EMEWS_PROJECT_ROOT/.. 
) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default), loss, and val_corr are supported +# export OBJ_RETURN="val_loss" +export OBJ_RETURN="loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +PLAN_JSON="" +DATAFRAME_CSV="" +BENCHMARK_DATA="" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME $EPOCH_MODE $WORKFLOW_ARGS \ + --plan_json=$PLAN_JSON \ + --dataframe_csv=$DATAFRAME_CSV \ + --benchmark_data=$BENCHMARK_DATA + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +OUTPUT=turbine-output/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +# Wait for job +# queue_wait +exit + +SCRIPT=$( basename $0 .sh ) +check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID +check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From a84cb554f27a105738f1fa30697fdcf6ce151634 Mon Sep 17 00:00:00 2001 From: Justin M Wozniak Date: Wed, 29 Jul 2020 12:47:47 -0500 Subject: [PATCH 055/601] Example OpenMPI setting --- workflows/common/sh/env-default.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/common/sh/env-default.sh b/workflows/common/sh/env-default.sh index 08f585bb..9b575c81 100644 --- a/workflows/common/sh/env-default.sh +++ b/workflows/common/sh/env-default.sh @@ -15,3 +15,5 @@ then export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) fi +# This can be used for an OpenMPI hosts file +# export TURBINE_LAUNCH_OPTIONS="--hostfile $HOME/hosts.txt" From e4a4e2468174dcbebe96e9a908cb08acdb40a117 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 17 Aug 2020 11:15:40 -0500 Subject: [PATCH 056/601] Handle missing python.logs --- .../scripts/extract-holdout-errors.sh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh index 5c42211d..0bf9d5c7 100755 --- a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh @@ -24,9 +24,21 @@ fi EXTRACT_HOLDOUT_ERRORS_AWK=$THIS/extract-holdout-errors.awk -NODES=$( ls $DIR/run ) -for NODE in $NODES +# Missing python.logs (usually due to no data): +MISSING=0 +NODES=( $( ls $DIR/run ) ) +# set -x +echo "NODES: ${#NODES[@]}" +# echo ${NODES[@]} +for NODE in ${NODES[@]} do LOG=$DIR/run/$NODE/save/python.log - awk -f $EXTRACT_HOLDOUT_ERRORS_AWK -v node=$NODE < $LOG + if [[ -r $LOG ]] + then + awk -f $EXTRACT_HOLDOUT_ERRORS_AWK -v node=$NODE < $LOG + else + MISSING=$(( MISSING + 1 )) + fi done > $DIR/holdout-errors.txt + +echo "Missing python.logs: $MISSING" From 5ce3c55a29d18bf948c6e23872665142a00ef436 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 17 Aug 2020 11:15:54 -0500 Subject: [PATCH 057/601] New describe-node.py, extract-holdout-errors.test --- .../cp-leaveout/scripts/describe-node.py | 31 +++++++++++++++++++ .../scripts/extract-holdout-errors.test | 10 ++++++ 2 files changed, 41 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/describe-node.py create mode 100644 workflows/cp-leaveout/scripts/extract-holdout-errors.test diff --git a/workflows/cp-leaveout/scripts/describe-node.py b/workflows/cp-leaveout/scripts/describe-node.py new file mode 100755 index 00000000..f79ea7a2 --- /dev/null +++ 
b/workflows/cp-leaveout/scripts/describe-node.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python + +# DESCRIBE NODE PY +# + +import argparse, json + +parser = argparse.ArgumentParser() +parser.add_argument('plan', type=str, help='Plan data file') +parser.add_argument('node', type=str, help='The node e.g. "1.2.3"') +args = parser.parse_args() + +try: + with open(args.plan) as fp: + J = json.load(fp) +except Exception as e: + print("could not read JSON in file: %s\n" % args.plan + str(e)) + exit(1) + +for node in J.keys(): + if len(node) == 13: + # print(node) + # print(len(J[node]["train"])) + # print(J[node]["train"]) + for item in J[node]["train"]: + # print(item) + # print(item["cell"]) + print(len(item["cell"])) + print("") + # exit() + # print(str(J[args.node]["train"])) diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.test b/workflows/cp-leaveout/scripts/extract-holdout-errors.test new file mode 100644 index 00000000..ceae4f3e --- /dev/null +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.test @@ -0,0 +1,10 @@ +2020-07-07 14:34:19 [Epoch: 48] loss: 0.004852, lr: 0.000012, mae: 0.048262, r2: -0.008181, val_loss: 0.008754, val_mae: 0.064672, val_r2: -0.532295 +2020-07-07 14:34:20 Epoch 49: lr=1.25e-05 +2020-07-07 14:38:27 [Epoch: 49] loss: 0.004851, lr: 0.000012, mae: 0.048300, r2: -0.012895, val_loss: 0.008730, val_mae: 0.064673, val_r2: -0.535607 +2020-07-07 14:38:50 Comparing y_true and y_pred: +2020-07-07 14:38:50 mse: 0.0063 +2020-07-07 14:38:50 mae: 0.0541 +2020-07-07 14:38:50 r2: 0.7352 +2020-07-07 14:38:50 corr: 0.8590 +2020-07-07 14:40:24 Cache parameter file does not exist: cache/top6_auc.params.json + From 98ac4615dc710590b59d459665febabcea59c42e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 17 Aug 2020 11:31:05 -0500 Subject: [PATCH 058/601] Original holdout error plotting scripts from Brettin (Slack cp-leaveout 2020-07-24) --- .../scripts/distill-holdout-errors.pl | 24 +++++++++++++++++++ .../scripts/plot-holdout-errors.py | 6 +++++ 2 files changed, 30 insertions(+) create mode 100644 workflows/cp-leaveout/scripts/distill-holdout-errors.pl create mode 100644 workflows/cp-leaveout/scripts/plot-holdout-errors.py diff --git a/workflows/cp-leaveout/scripts/distill-holdout-errors.pl b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl new file mode 100644 index 00000000..cc5db9e3 --- /dev/null +++ b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl @@ -0,0 +1,24 @@ +$class = uc(shift @ARGV); +if($class eq "MSE") {$idx=1} +elsif($class eq "MAE") {$idx=2} +elsif($class eq "R2") {$idx=3} +else {die "invalid arg, usage: $0 MSE|MAE|R2"} +while(<>){ + chomp; + s/mse://; + s/mae://; + s/r2://; + @a=split/\s+/; + $a[0]=~s/\s+//g; + $h{$a[0]}=$a[$idx]; +} +foreach $s (sort keys %h) { + if ( $s=~/1\.(\d)\.(\d)\.(\d)\.(\d)\.(\d)/ ) { + #print "1.$1", "\n"; + print $h{"1.$1"}, "\t"; + print $h{"1.$1.$2"}, "\t"; + print $h{"1.$1.$2.$3"}, "\t"; + print $h{"1.$1.$2.$3.$4"}, "\t"; + print $h{"1.$1.$2.$3.$4.$5"}, "\t"; + print "$class\n"; + } diff --git a/workflows/cp-leaveout/scripts/plot-holdout-errors.py b/workflows/cp-leaveout/scripts/plot-holdout-errors.py new file mode 100644 index 00000000..a460cfe7 --- /dev/null +++ b/workflows/cp-leaveout/scripts/plot-holdout-errors.py @@ -0,0 +1,6 @@ +import pandas +import matplotlib.pyplot as plt +from pandas.tools.plotting import parallel_coordinates + +cpdata=pandas.read_csv('holdout-errors.parallel_plot.tsv',sep='\t', header=None, names=['Stage1','Stage2','Stage3','Stage4', 'Stage5','CLASS']) 
+parallel_coordinates(cpdata, class_column="CLASS", colormap=plt.get_cmap("Set2")) From 1c1530283556b46c7c88d641e201c38ea9cfce20 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 17 Aug 2020 11:46:55 -0500 Subject: [PATCH 059/601] Add output data example --- .../cp-leaveout/scripts/extract-holdout-errors.awk | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.awk b/workflows/cp-leaveout/scripts/extract-holdout-errors.awk index 5340b16c..5d6c6f83 100644 --- a/workflows/cp-leaveout/scripts/extract-holdout-errors.awk +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.awk @@ -1,12 +1,22 @@ # EXTRACT HOLDOUT ERRORS AWK -# Finds this data in the python.log and reports a summary of it: +# Finds error data in the python.log and reports a summary of it: + +# Input: # 2020-07-07 14:38:50 Comparing y_true and y_pred: # 2020-07-07 14:38:50 mse: 0.0063 # 2020-07-07 14:38:50 mae: 0.0541 # 2020-07-07 14:38:50 r2: 0.7352 # 2020-07-07 14:38:50 corr: 0.8590 +# Output: +# 1.1 mse: 0.0063 mae: 0.0538 r2: 0.7322 +# 1.1.1 mse: 0.0053 mae: 0.0492 r2: 0.7745 +# 1.1.1.1 mse: 0.0050 mae: 0.0480 r2: 0.7864 +# 1.1.1.1.1 mse: 0.0050 mae: 0.0473 r2: 0.7900 +# 1.1.1.1.1.1 mse: 0.0049 mae: 0.0469 r2: 0.7930 +# 1.1.1.1.1.2 mse: 0.0049 mae: 0.0470 r2: 0.7930 + $3 == "Comparing" { getline mse = $3 " " $4 From 2aff047999b1f3c00969627d091abde644c76ccb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 17 Aug 2020 11:47:17 -0500 Subject: [PATCH 060/601] Add comment --- workflows/cp-leaveout/scripts/extract-holdout-errors.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh index 0bf9d5c7..1878ad6a 100755 --- a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh @@ -6,7 +6,7 @@ set -eu # in given experiment directory # Provide an experiment directory DIR # Creates $DIR/holdout-errors.txt -# See extract-holdout-errors.awk +# See extract-holdout-errors.awk for file formats THIS=$( readlink --canonicalize $( dirname $0 ) ) From b2831962e3ce50187fefb706e2de3d82913837cf Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 17 Aug 2020 11:47:37 -0500 Subject: [PATCH 061/601] Add header; fix typo from paste --- workflows/cp-leaveout/scripts/distill-holdout-errors.pl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/workflows/cp-leaveout/scripts/distill-holdout-errors.pl b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl index cc5db9e3..f4a474ef 100644 --- a/workflows/cp-leaveout/scripts/distill-holdout-errors.pl +++ b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl @@ -1,3 +1,10 @@ + +# DISTILL HOLDOUT ERRORS PL +# Original holdout error plotting scripts from Brettin +# Slack #cp-leaveout 2020-07-24 +# Input: holdout-errors.txt from extract-holdout-errors +# Output: Plottable file for plot-holdout-errors.py + $class = uc(shift @ARGV); if($class eq "MSE") {$idx=1} elsif($class eq "MAE") {$idx=2} @@ -22,3 +29,4 @@ print $h{"1.$1.$2.$3.$4.$5"}, "\t"; print "$class\n"; } +} From 9cd54af44c07f92c084000c9e3c012b837bad925 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 17 Aug 2020 11:48:49 -0500 Subject: [PATCH 062/601] Add header, argparse (not yet tested) --- .../cp-leaveout/scripts/plot-holdout-errors.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git 
a/workflows/cp-leaveout/scripts/plot-holdout-errors.py b/workflows/cp-leaveout/scripts/plot-holdout-errors.py index a460cfe7..f49981d0 100644 --- a/workflows/cp-leaveout/scripts/plot-holdout-errors.py +++ b/workflows/cp-leaveout/scripts/plot-holdout-errors.py @@ -1,6 +1,16 @@ + +# PLOT HOLDOUT ERRORS PY +# Plots holdout error data from distill-holdout-errors.pl + import pandas import matplotlib.pyplot as plt from pandas.tools.plotting import parallel_coordinates - -cpdata=pandas.read_csv('holdout-errors.parallel_plot.tsv',sep='\t', header=None, names=['Stage1','Stage2','Stage3','Stage4', 'Stage5','CLASS']) + +import argparse +parser = argparse.ArgumentParser(description='Make holdout errors plot') +parser.add_argument('input_file', help='The input errors TSV file') +args = parser.parse_args() + +names = ['Stage1','Stage2','Stage3','Stage4', 'Stage5', 'CLASS'] +cpdata=pandas.read_csv(args.input_file, sep='\t', header=None, names=names) parallel_coordinates(cpdata, class_column="CLASS", colormap=plt.get_cmap("Set2")) From 91b3fb1495375b13bd750052fb85c32afe939d09 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 18 Aug 2020 16:46:37 -0500 Subject: [PATCH 063/601] Update plotter for stage 6 --- .../scripts/distill-holdout-errors.pl | 53 +++++++++++++------ .../scripts/plot-holdout-errors.py | 29 ++++++++-- 2 files changed, 61 insertions(+), 21 deletions(-) diff --git a/workflows/cp-leaveout/scripts/distill-holdout-errors.pl b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl index f4a474ef..7c656120 100644 --- a/workflows/cp-leaveout/scripts/distill-holdout-errors.pl +++ b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl @@ -2,31 +2,52 @@ # DISTILL HOLDOUT ERRORS PL # Original holdout error plotting scripts from Brettin # Slack #cp-leaveout 2020-07-24 +# Uses stdin/stdout # Input: holdout-errors.txt from extract-holdout-errors -# Output: Plottable file for plot-holdout-errors.py +# Output: Plottable TSV file for plot-holdout-errors.py + +$stages = uc(shift @ARGV); +$class = uc(shift @ARGV); +# Select error type for this run +# (index is the column in the data after removing text tokens): +if ($class eq "MSE") {$idx=1} +elsif ($class eq "MAE") {$idx=2} +elsif ($class eq "R2" ) {$idx=3} +else {die "usage: $0 MSE|MAE|R2"} -$class = uc(shift @ARGV); -if($class eq "MSE") {$idx=1} -elsif($class eq "MAE") {$idx=2} -elsif($class eq "R2") {$idx=3} -else {die "invalid arg, usage: $0 MSE|MAE|R2"} while(<>){ chomp; + # Remove readability tokens: s/mse://; s/mae://; s/r2://; + # Split on WS: @a=split/\s+/; - $a[0]=~s/\s+//g; + # h: The big Perl hash of all the data + # Maps node ID to the selected error type value: $h{$a[0]}=$a[$idx]; } -foreach $s (sort keys %h) { - if ( $s=~/1\.(\d)\.(\d)\.(\d)\.(\d)\.(\d)/ ) { - #print "1.$1", "\n"; - print $h{"1.$1"}, "\t"; - print $h{"1.$1.$2"}, "\t"; - print $h{"1.$1.$2.$3"}, "\t"; - print $h{"1.$1.$2.$3.$4"}, "\t"; - print $h{"1.$1.$2.$3.$4.$5"}, "\t"; - print "$class\n"; + +# Suppresses a warning about the ~~ operator below: +use experimental 'smartmatch'; + +# Plot one line for each "leaf" node - a node ID with no children +foreach $id (sort keys %h) { + # Loop if there any children of this node in the hash + if (/$id\./ ~~ %h) { next; } + + # Construct a line for the output TSV via prepend: + @line = (); + while (1) { + unshift(@line, "$h{$id}\t"); + # Get the parent id for this id (drop 2 trailing chars): + $id = substr $id, 0, -2; + if (length $id < 2) { last; } + } + # Fill in any missing stages (pandas can handle a blank value): + while 
(scalar @line < $stages) { + push(@line, "\t"); } + push(@line, $class); + print(@line, "\n"); } diff --git a/workflows/cp-leaveout/scripts/plot-holdout-errors.py b/workflows/cp-leaveout/scripts/plot-holdout-errors.py index f49981d0..ceec043f 100644 --- a/workflows/cp-leaveout/scripts/plot-holdout-errors.py +++ b/workflows/cp-leaveout/scripts/plot-holdout-errors.py @@ -4,13 +4,32 @@ import pandas import matplotlib.pyplot as plt -from pandas.tools.plotting import parallel_coordinates +# This was removed from Pandas 1.6: +# Cf. https://stackoverflow.com/questions/54473018/where-is-pandas-tools +# from pandas.tools.plotting import parallel_coordinates +from pandas.plotting import parallel_coordinates import argparse parser = argparse.ArgumentParser(description='Make holdout errors plot') -parser.add_argument('input_file', help='The input errors TSV file') +parser.add_argument('stages', type=int, help='Number of stages') +parser.add_argument('file_input', help='The input errors TSV file') +parser.add_argument('file_output', help='The output PNG file') + args = parser.parse_args() -names = ['Stage1','Stage2','Stage3','Stage4', 'Stage5', 'CLASS'] -cpdata=pandas.read_csv(args.input_file, sep='\t', header=None, names=names) -parallel_coordinates(cpdata, class_column="CLASS", colormap=plt.get_cmap("Set2")) +# names = [ 'Stage1','Stage2','Stage3','Stage4', 'Stage5', 'CLASS'] + +names = [] +for i in range(1, args.stages+1): + names.append("Stage"+str(i)) +names.append('CLASS') + +print(str(names)) + +cpdata=pandas.read_csv(args.file_input, sep='\t', header=None, names=names) +p = parallel_coordinates(cpdata, class_column="CLASS", + colormap=plt.get_cmap("Set2")) + +# fig = p.gcf() +fig = p.get_figure() +fig.savefig(args.file_output) From 1b78504a50487579b4b886ab265aae021c27546f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Aug 2020 15:40:28 -0500 Subject: [PATCH 064/601] Update to new Swift/T --- workflows/common/sh/env-summit.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index 59814139..e390d16e 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -23,7 +23,8 @@ MED106=/gpfs/alpine/world-shared/med106 # SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-02 # Python (med106/sw/condaenv-200408) and R: # SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 -SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH From 5f1a9923a4b0af8d312c216802aa5f097219d933 Mon Sep 17 00:00:00 2001 From: rajeeja Date: Mon, 24 Aug 2020 23:50:44 -0500 Subject: [PATCH 065/601] add new noise_label workflow for nt3 study --- workflows/uq-noise/swift/workflow-noise.sh | 196 ++++++++++++++++++ workflows/uq-noise/swift/workflow-noise.swift | 53 +++++ workflows/uq-noise/swift/workflow.sh | 2 + workflows/uq-noise/test/test-noise.sh | 65 ++++++ 4 files changed, 316 insertions(+) create mode 100755 workflows/uq-noise/swift/workflow-noise.sh create mode 100644 workflows/uq-noise/swift/workflow-noise.swift create mode 100755 workflows/uq-noise/test/test-noise.sh diff --git a/workflows/uq-noise/swift/workflow-noise.sh b/workflows/uq-noise/swift/workflow-noise.sh new file mode 100755 index 00000000..1ebab699 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-noise.sh @@ -0,0 +1,196 @@ +#! 
/usr/bin/env bash +set -eu + +# UQ NOISE WORKFLOW +# Main entry point for UQ-NOISE workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3 +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +XCORR_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../xcorr ; /bin/pwd) +export XCORR_ROOT=${XCORR_ROOT:-$XCORR_DEFAULT} + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! { + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +echo "Running "$MODEL_NAME "workflow" + +source_site env $SITE +source_site sched $SITE + +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs + +export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT + +export TURBINE_JOBNAME="JOB:${EXPID}" + +if [ -z ${GPU_STRING+x} ]; +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +mkdir -pv $TURBINE_OUTPUT + +DB_FILE=$TURBINE_OUTPUT/uq-noise.db +if [[ ! -f DB_FILE ]] +then + if [[ ${UQ_NOISE_ID:-} == "" ]] + then + if [[ ${EXPID:0:1} == "X" ]] + then + UQ_NOISE_ID=${EXPID:1} + else + UQ_NOISE_ID=$EXPID + fi + fi + # $EMEWS_PROJECT_ROOT/db/db-cplo-init $DB_FILE $UQ_NOISE_ID +fi + +CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT + -exp_id=$EXPID + -site=$SITE + -db_file=$DB_FILE + $GPU_ARG + -cache_dir=$CACHE_DIR + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run +mkdir -pv $TURBINE_OUTPUT/data +mkdir -pv $CACHE_DIR +mkdir -pv $XCORR_DATA_DIR +mkdir -pv $TURBINE_OUTPUT/hpo_log + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh + +# log_path PYTHONPATH + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow-noise.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion." +fi + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. 
This will just be for + # warnings or unusual messages + # use for summit (slurm needs two %) + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + + #export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + mkdir -pv $TURBINE_OUTPUT/out + STDOUT="" +fi + +#echo ${CMD_LINE_ARGS[@]} + +cd $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-noise.swift $TURBINE_OUTPUT + +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +fi +TURBINE_RESIDENT_WORK_WORKERS=1 + +swift-t -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e TURBINE_STDOUT \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e XCORR_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} |& \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" + exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT + diff --git a/workflows/uq-noise/swift/workflow-noise.swift b/workflows/uq-noise/swift/workflow-noise.swift new file mode 100644 index 00000000..703af616 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-noise.swift @@ -0,0 +1,53 @@ +/* + UQ NOISE SWIFT + Main workflow +*/ + +import assert; +import files; +import io; +import python; +import unix; +import sys; +import string; +import location; +import math; + +string FRAMEWORK = "keras"; + +string xcorr_root = getenv("XCORR_ROOT"); +string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); +string emews_root = getenv("EMEWS_PROJECT_ROOT"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +string exp_id = argv("exp_id"); +int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +string model_name = getenv("MODEL_NAME"); + +printf("UQ NOISE WORKFLOW.SWIFT"); +printf("TURBINE_OUTPUT: " + turbine_output); + +float noise_step = 10.0; // Difference between noises +int num_trials = 1; + +float num_label_noise= 10; // Number of noise levels to try + +float label_noise_array[] = [0:num_label_noise]; +int trials[] = [0:num_trials-1]; + +foreach level, i in label_noise_array +{ + foreach trial, k in trials + { + label_noise = level * noise_step/100; + run_id = "%0.2f-%01i" % (label_noise, k); + params = ("{ \"label_noise\" : %f , " + + " \"epochs\" : 100 } ") % + (label_noise); + printf("running: %s", params); + result = obj(params, run_id); + printf("result %s : label_noise %0.2f : %s", + run_id, label_noise, result); + } +} + diff --git a/workflows/uq-noise/swift/workflow.sh b/workflows/uq-noise/swift/workflow.sh index 7c82bdaa..799cd2a2 100755 --- a/workflows/uq-noise/swift/workflow.sh +++ b/workflows/uq-noise/swift/workflow.sh @@ -108,6 +108,7 @@ mkdir -pv $TURBINE_OUTPUT/hpo_log # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_IMPL="py" OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh @@ -153,6 +154,7 @@ if [[ ${SITE} == "summit" ]] then export 
TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" fi +TURBINE_RESIDENT_WORK_WORKERS=1 swift-t -n $PROCS \ ${MACHINE:-} \ diff --git a/workflows/uq-noise/test/test-noise.sh b/workflows/uq-noise/test/test-noise.sh new file mode 100755 index 00000000..98ce08dd --- /dev/null +++ b/workflows/uq-noise/test/test-noise.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eu + +# UQ NOISE TEST 1 + +usage() +{ + echo "Usage: test SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 2 )) +then + RUN_DIR=$2 +elif (( ${#} == 1 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=nt3 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-small.sh +# export CFG_SYS=$THIS/cfg-sys-big.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow-noise.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + +# Wait for job +TURBINE_OUTPUT=$( readlink turbine-output ) +queue_wait + +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From f5fe967afbc6e97b504367d14d6093f0714fca92 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 25 Aug 2020 14:50:39 -0500 Subject: [PATCH 066/601] Add epoch-count.sh --- workflows/cp-leaveout/scripts/epoch-count.sh | 36 ++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/epoch-count.sh diff --git a/workflows/cp-leaveout/scripts/epoch-count.sh b/workflows/cp-leaveout/scripts/epoch-count.sh new file mode 100755 index 00000000..a883f360 --- /dev/null +++ b/workflows/cp-leaveout/scripts/epoch-count.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -eu + +# EPOCH COUNT SH +# Report run progress in number of completed epochs + +THIS=$( readlink --canonicalize $( dirname $0 ) ) +CPLO=$( readlink --canonicalize $THIS/.. ) +SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +EXPID=$( basename $DIR ) +JOBID=$( cat $DIR/jobid.txt ) +show EXPID JOBID + +LOGS=( $( find $DIR -name python.log ) ) +echo "epoch-count.sh: found ${#LOGS[@]} logs ..." 
+for LOG in ${LOGS[@]} +do + echo -n "$LOG :: " + # Pull out the last "Epoch:" line, print only the number: + # sed -n '/Epoch:/h;${g;s/.*Epoch: \([0-9]*\).*/\1/;p}' $LOG + grep "Epoch:" $LOG + # experiments/X362/run/1.3/save/python.log +done # | sort -r -n -k 2 | column -t + From f31ce97e1cecaf1346dc96013da79229cc7d2999 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 25 Aug 2020 16:07:36 -0500 Subject: [PATCH 067/601] New epoch-time scripts --- workflows/cp-leaveout/scripts/epoch-time.py | 69 +++++++++++++++++++++ workflows/cp-leaveout/scripts/epoch-time.sh | 42 +++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 workflows/cp-leaveout/scripts/epoch-time.py create mode 100755 workflows/cp-leaveout/scripts/epoch-time.sh diff --git a/workflows/cp-leaveout/scripts/epoch-time.py b/workflows/cp-leaveout/scripts/epoch-time.py new file mode 100644 index 00000000..b4ac0747 --- /dev/null +++ b/workflows/cp-leaveout/scripts/epoch-time.py @@ -0,0 +1,69 @@ + +# EPOCH TIME PY +# See epoch-time.sh + +import datetime, sys, time + +from Node import Node +from utils import fail + +# Main data structure: +# map from stage number to list of epoch times in seconds +stages = {} +for stage in range(1,6+1): + stages[stage] = [] + +# Files processed: +progress = 0 +total = 0 + +node_current = "NONE" +stage_current = -1 +start_current = None + +while True: + + line = sys.stdin.readline() + + if len(line) == 0: break # EOF + if len(line) == 1: continue # Blank line + tokens = line.split() + + if tokens[0] == "epoch-time:": + if tokens[1] == "node": + node_current = tokens[2] + stage_current = int(len(node_current) / 2) + start_current = None + # print("node: " + node_current) + # print("stage: " + str(stage_current)) + progress += 1 + elif tokens[1] == "total": + total = int(tokens[2]) + else: + assert(False) + continue + + if tokens[2] == "UNO" and tokens[3] == "START": + # This is a Keras restart: Reset the timer + start_current = None + + if tokens[2] == "Epoch": + ts = tokens[0] + " " + tokens[1] + dt = datetime.datetime.strptime(ts, "%Y-%m-%d %H:%M:%S") + if start_current == None: + start_current = dt + continue + start = start_current.timestamp() + stop = dt .timestamp() + duration = stop - start + # print("epoch complete: " + str(duration)) + start_current = dt + stages[stage_current].append(duration) + +for stage in range(1,6+1): + n = len(stages[stage]) + if n == 0: + avg = -1 + else: + avg = sum(stages[stage]) / n + print("stage %i count: %6i avg: %8.2f" % (stage, n, avg)) diff --git a/workflows/cp-leaveout/scripts/epoch-time.sh b/workflows/cp-leaveout/scripts/epoch-time.sh new file mode 100755 index 00000000..bf0a44bd --- /dev/null +++ b/workflows/cp-leaveout/scripts/epoch-time.sh @@ -0,0 +1,42 @@ +#!/bin/bash +set -eu + +# EPOCH TIME SH +# Report average time per epoch by stage + +THIS=$( readlink --canonicalize $( dirname $0 ) ) +CPLO=$( readlink --canonicalize $THIS/.. ) +SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +EXPID=$( basename $DIR ) +JOBID=$( cat $DIR/jobid.txt ) +show EXPID JOBID + +NODES=( $( ls $DIR/run | head -10000 ) ) # +echo "epoch-time.sh: found ${#NODES[@]} nodes ..." 
+{ + echo "epoch-time: total ${#NODES[@]}" + for NODE in ${NODES[@]} + do + echo "epoch-time: node $NODE" + PYTHON_LOG=$DIR/run/$NODE/save/python.log + if [[ ! -e $PYTHON_LOG ]] + then + continue + fi + cat $PYTHON_LOG + done +} | python $THIS/epoch-time.py From b86c603f411ada0411b053be45baf2273430c9bd Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 25 Aug 2020 16:46:54 -0500 Subject: [PATCH 068/601] Simplify --- workflows/cp-leaveout/scripts/epoch-time.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/workflows/cp-leaveout/scripts/epoch-time.sh b/workflows/cp-leaveout/scripts/epoch-time.sh index bf0a44bd..73ad058a 100755 --- a/workflows/cp-leaveout/scripts/epoch-time.sh +++ b/workflows/cp-leaveout/scripts/epoch-time.sh @@ -33,10 +33,9 @@ echo "epoch-time.sh: found ${#NODES[@]} nodes ..." do echo "epoch-time: node $NODE" PYTHON_LOG=$DIR/run/$NODE/save/python.log - if [[ ! -e $PYTHON_LOG ]] + if [[ -e $PYTHON_LOG ]] then - continue + cat $PYTHON_LOG fi - cat $PYTHON_LOG done } | python $THIS/epoch-time.py From 633af617900ee5e4cfec69064873d27e1a28b951 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Aug 2020 11:56:40 -0500 Subject: [PATCH 069/601] Fix comment --- workflows/cp-leaveout/scripts/distill-holdout-errors.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/distill-holdout-errors.pl b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl index 7c656120..7a99be54 100644 --- a/workflows/cp-leaveout/scripts/distill-holdout-errors.pl +++ b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl @@ -33,7 +33,7 @@ # Plot one line for each "leaf" node - a node ID with no children foreach $id (sort keys %h) { - # Loop if there any children of this node in the hash + # Loop if there are any children of this node in the hash if (/$id\./ ~~ %h) { next; } # Construct a line for the output TSV via prepend: From f5784d6b8af77e95a0261b712b75a028dafd79d0 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Aug 2020 11:56:50 -0500 Subject: [PATCH 070/601] Better loop --- workflows/cp-leaveout/scripts/distill-holdout-errors.pl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/workflows/cp-leaveout/scripts/distill-holdout-errors.pl b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl index 7a99be54..3a71e02a 100644 --- a/workflows/cp-leaveout/scripts/distill-holdout-errors.pl +++ b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl @@ -37,12 +37,11 @@ if (/$id\./ ~~ %h) { next; } # Construct a line for the output TSV via prepend: + # Gets the parent ids for each id (drops 2 trailing chars) + # until the id is too short @line = (); - while (1) { + for ( ; length $id > 2 ; $id = substr $id, 0, -2) { unshift(@line, "$h{$id}\t"); - # Get the parent id for this id (drop 2 trailing chars): - $id = substr $id, 0, -2; - if (length $id < 2) { last; } } # Fill in any missing stages (pandas can handle a blank value): while (scalar @line < $stages) { From 6939f34cc6c3103d6becbc1f596446f6bdbe211a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 28 Aug 2020 13:55:52 -0500 Subject: [PATCH 071/601] Initial data-size analysis scripts --- workflows/cp-leaveout/scripts/data-size.py | 63 ++++++++++++++++++++++ workflows/cp-leaveout/scripts/data-size.sh | 15 ++++++ 2 files changed, 78 insertions(+) create mode 100644 workflows/cp-leaveout/scripts/data-size.py create mode 100755 workflows/cp-leaveout/scripts/data-size.sh diff --git a/workflows/cp-leaveout/scripts/data-size.py 
b/workflows/cp-leaveout/scripts/data-size.py new file mode 100644 index 00000000..a4378c0a --- /dev/null +++ b/workflows/cp-leaveout/scripts/data-size.py @@ -0,0 +1,63 @@ + +# DATA SIZE PY +# Get the training data size from the file + +import argparse, logging, os, sys +import pandas as pd + +from utils import fail + +parser = argparse.ArgumentParser(description="Extract the data size") +parser.add_argument("input", + help="The training file") +args = parser.parse_args() + +print("data-size.py: opening '%s' ..." % args.input) + +_, ext = os.path.splitext(args.input) +if ext == ".h5" or ext == ".hdf5": + store = pd.HDFStore(args.input, "r") + # df = store.get("df") + df_y_train = store.get("y_train") + print("train " + str(df_y_train.shape)) + df_y_val = store.get("y_val") + print("val " + str(df_y_val.shape)) + df_x_train_0 = store.get("x_train_0") + print("x0 " + str(df_x_train_0.shape)) + df_x_train_1 = store.get("x_train_1") + print("x1 " + str(df_x_train_1.shape)) + + print(df_x_train_0.index) + + clms = df_x_train_0.columns + print(clms) + for clm in clms: + print(df_x_train_0.at[2,clm]) + # print(df_x_train_1.columns) + + store.close() + +elif ext == ".feather": + print("read feather " + str(args.input)) + df = pd.read_feather(args.input).fillna(0) + print(df.shape) + # total size: (529940, 6215) + +# store = pd.HDFStore(args.input, "r", complevel=9, complib="blosc:snappy") +# print(str(store)) + +# print(store.get("y_val")) + + +# f = h5py.File(args.file, "r") + +# # print(f.name) + +# K = list(f.keys()) +# print(K) +# for g in K: +# print(g) +# if type(f[g]) == h5py._hl.group.Group: +# D = f[g].keys() +# print(list(D)) +# print("") diff --git a/workflows/cp-leaveout/scripts/data-size.sh b/workflows/cp-leaveout/scripts/data-size.sh new file mode 100755 index 00000000..8724af93 --- /dev/null +++ b/workflows/cp-leaveout/scripts/data-size.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -eu + +# DATA SIZE SH +# See data-size.py + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 $THIS/data-size.py $* From 4d019a59bd7a0b999b6672c21c1cd0fdecca816a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 28 Aug 2020 13:56:12 -0500 Subject: [PATCH 072/601] Handle missing data from topN --- workflows/cp-leaveout/py/data_setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 0916c536..70abd568 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -77,6 +77,9 @@ def pre_run(params): duration = stop - start print("data_setup: build_dataframe() OK : " + "%0.1f seconds." 
% duration) + except topN_to_uno.topN_NoDataException: + print("data_setup: caught topN_NoDataException: SKIP") + return ModelResult.SKIP except ValueError: print("data_setup: caught ValueError for node: '%s'" % params["node"]) # new 2019-12-02 From 405f036b0e2194a4ba3d88c85e8239184ed1ff6a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 28 Aug 2020 13:56:28 -0500 Subject: [PATCH 073/601] More space for stage 6 output --- workflows/cp-leaveout/scripts/extract-holdout-errors.awk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.awk b/workflows/cp-leaveout/scripts/extract-holdout-errors.awk index 5d6c6f83..24254de2 100644 --- a/workflows/cp-leaveout/scripts/extract-holdout-errors.awk +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.awk @@ -24,6 +24,6 @@ $3 == "Comparing" { mae = $3 " " $4 getline r2 = $3 " " $4 - printf "%-12s %s %s %s\n", node, mse, mae, r2 + printf "%-14s %s %s %s\n", node, mse, mae, r2 exit } From 222a4575244c104512618686b458f050a6f02f02 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Sep 2020 14:54:56 -0500 Subject: [PATCH 074/601] Update Swift/T --- workflows/common/sh/env-summit.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index e390d16e..aa3c5cec 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -24,7 +24,8 @@ MED106=/gpfs/alpine/world-shared/med106 # Python (med106/sw/condaenv-200408) and R: # SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 # SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 -SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH From 4c4aba44eba008bdcb1575674c5c725a6215dc23 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 2 Sep 2020 17:31:42 -0400 Subject: [PATCH 075/601] o Add noise workflows for Gaussian noise and abstention workflow for NT3 --- .../common/python/model_abstention_runner.py | 326 ++++++++++++++++++ workflows/common/sh/model_abstention.sh | 108 ++++++ workflows/common/sh/sched-summit.sh | 3 +- .../common/swift/obj_abstention_py.swift | 46 +++ .../uq-noise/swift/workflow-abstention.sh | 196 +++++++++++ .../uq-noise/swift/workflow-abstention.swift | 63 ++++ workflows/uq-noise/swift/workflow-gnoise.sh | 196 +++++++++++ .../uq-noise/swift/workflow-gnoise.swift | 63 ++++ workflows/uq-noise/swift/workflow-noise.swift | 18 +- workflows/uq-noise/swift/workflow.sh | 3 + workflows/uq-noise/test/abstention.sh | 65 ++++ workflows/uq-noise/test/cfg-sys-1.sh | 4 +- workflows/uq-noise/test/gnoise.sh | 65 ++++ workflows/uq-noise/test/test-noise.sh | 2 +- 14 files changed, 1150 insertions(+), 8 deletions(-) create mode 100644 workflows/common/python/model_abstention_runner.py create mode 100644 workflows/common/sh/model_abstention.sh create mode 100644 workflows/common/swift/obj_abstention_py.swift create mode 100755 workflows/uq-noise/swift/workflow-abstention.sh create mode 100644 workflows/uq-noise/swift/workflow-abstention.swift create mode 100755 workflows/uq-noise/swift/workflow-gnoise.sh create mode 100644 workflows/uq-noise/swift/workflow-gnoise.swift create mode 100755 workflows/uq-noise/test/abstention.sh create mode 100755 workflows/uq-noise/test/gnoise.sh diff --git a/workflows/common/python/model_abstention_runner.py 
b/workflows/common/python/model_abstention_runner.py new file mode 100644 index 00000000..b85a532b --- /dev/null +++ b/workflows/common/python/model_abstention_runner.py @@ -0,0 +1,326 @@ + +# MODEL RUNNER PY + +# See __main__ section for usage + +import sys +import json +import os +import time +import numpy as np +import importlib +import runner_utils +from runner_utils import ModelResult +import log_tools +import math + +logger = None + +print("MODEL RUNNER...") + +# Andrew: Adding the following line (switching the order of the following two lines) in order to append an arbitrary model's dependencies to the path *before* the benchmarks in order to accidentally use a benchmark dependency +# append ${MODEL_PYTHON_DIR} to $PATH if variable is set +python_dir = os.getenv("MODEL_PYTHON_DIR") +if python_dir: + sys.path.append(python_dir) +# append ${BENCHMARKS_ROOT}/common to $PATH if variable is set +benchmarks_root = os.getenv("BENCHMARKS_ROOT") +if benchmarks_root: + sys.path.append(benchmarks_root+"/common") + +# import candle_lrn_crv + +print("sys.path:") +for i in range(0, len(sys.path)-1): + print("%2i: %s" % (i, sys.path[i])) +print("") + +def import_pkg(framework, model_name): + # The model_name is the short form of the Benchmark: e.g., 'nt3' + # The module_name is the name of the Python module: e.g., 'nt3_baseline_keras2' + print("model_name: ", model_name) + module_name = os.getenv("MODEL_PYTHON_SCRIPT") + if framework == 'keras': + if module_name == None or module_name == "": + module_name = "{}_abstention_keras2".format(model_name) + print ("module_name:", module_name) + pkg = importlib.import_module(module_name) + + # For Summit: + from tensorflow.keras import backend as K + # For other systems: + # from keras import backend as K + if K.backend() == 'tensorflow' and 'NUM_INTER_THREADS' in os.environ: + import tensorflow as tf + inter_threads = int(os.environ['NUM_INTER_THREADS']) + intra_threads = int(os.environ['NUM_INTRA_THREADS']) + print("Configuring tensorflow with {} inter threads and " + + "{} intra threads" + .format(inter_threads, intra_threads)) + cfg = tf.ConfigProto(inter_op_parallelism_threads=inter_threads, + intra_op_parallelism_threads=intra_threads) + sess = tf.Session(graph=tf.get_default_graph(), config=cfg) + K.set_session(sess) + elif framework == 'pytorch': + import torch + if module_name == None or module_name == "": + module_name = "{}_baseline_pytorch".format(model_name) + print ("module_name:", module_name) + pkg = importlib.import_module(module_name) + else: + raise ValueError("Framework must either be `keras' or `pytorch' " + + "got `{}'!".format(framework)) + + return pkg + + +def log(msg): + global logger + logger.debug(msg) + +def timestamp(): + from datetime import datetime + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + +def setup_perf(params): + return { 'top': setup_perf_top(params), + 'nvidia': setup_perf_nvidia(params) } + + +def setup_perf_top(params): + if 'perf_top' not in params: + return None + if params['perf_top'] == '0': + return None + try: + delay = int(params['perf_top']) + except: + msg = 'setup_perf_top(): params[perf_top] not an int: got: "%s"' % \ + params['perf_top'] + print(msg) + raise Exception(msg) + import subprocess + with open('perf-top.log', 'a') as fp_out: + fp_out.write('model_runner: start: %s\n\n' % timestamp()) + P = subprocess.Popen(['top', '-b', '-d', params['perf_top']], + stdout=fp_out, + stderr=subprocess.STDOUT) + return P + +def setup_perf_nvidia(params): + if 'perf_nvidia' not in params: + return 
None + if params['perf_nvidia'] == '0': + return None + try: + delay = int(params['perf_nvidia']) + except: + msg = 'setup_perf_nvidia(): params[perf_nvidia] not an int: ' + \ + 'got: "%s"' % params['perf_nvidia'] + print(msg) + raise Exception(msg) + import subprocess + with open('perf-nvidia.log', 'a') as fp_out: + fp_out.write('model_runner: start: %s\n\n' % timestamp()) + P = subprocess.Popen(['nvidia-smi', '--loop='+params['perf_top']], + stdout=fp_out, + stderr=subprocess.STDOUT) + return P + + +def stop_perf(Ps): + for s in ['top', 'nvidia']: + if Ps[s] is not None: + Ps[s].terminate() + + +def run(hyper_parameter_map, obj_return): + start = time.time() + global logger + logger = log_tools.get_logger(logger, 'MODEL RUNNER') + + log("START:") + sys.stdout.flush() + + directory = hyper_parameter_map['instance_directory'] + os.chdir(directory) + + with open(directory + '/rank.txt', 'w') as fp: + fp.write(str(os.getenv('ADLB_RANK_SELF')) + '\n') + + framework = hyper_parameter_map['framework'] + model_name = hyper_parameter_map['model_name'] + pkg = import_pkg(framework, model_name) + + runner_utils.format_params(hyper_parameter_map) + + params_arg = {} + if 'config_file' in hyper_parameter_map: + config_file = hyper_parameter_map['config_file'] + logger.info('specified config_file: "%s"' % config_file) + params_arg = { 'default_model': config_file } + + # params is a python dictionary + params = setup_params(pkg, hyper_parameter_map, params_arg) + + Ps = setup_perf(params) + + # Run the model! + history = pkg.run(params) + + if framework == 'keras': + runner_utils.keras_clear_session(framework) + + # Default result if there is no val_loss (as in infer.py) + result = 0 + history_result = {} + if history != None: + result, history_result = get_results(history, obj_return) + + stop_perf(Ps) + + finish = time.time() + duration = finish - start + log(" DONE: run_id %s in %0.2f seconds." 
% + (hyper_parameter_map["run_id"], duration)) + return (result, history_result) + + +def get_obj_return(): + obj_return = os.getenv("OBJ_RETURN") + valid_obj_returns = [ "loss", "val_loss", "val_corr", "val_acc" ] + if obj_return == None: + raise Exception("No OBJ_RETURN was in the environment!") + if obj_return not in valid_obj_returns: + raise Exception("Invalid value for OBJ_RETURN: use: " + + str(valid_obj_returns)) + return obj_return + +def load_pre_post(hyper_parameter_map, key): + module = None + if key in hyper_parameter_map: + module_name = hyper_parameter_map[key] + module = importlib.import_module(module_name) + return module + +def run_pre(hyper_parameter_map): + module = load_pre_post(hyper_parameter_map, 'pre_module') + result = ModelResult.SUCCESS + if module != None: + logger.debug("PRE RUN START") + result = module.pre_run(hyper_parameter_map) + logger.debug("PRE RUN STOP") + return result + +def run_post(hyper_parameter_map, output_map): + module = load_pre_post(hyper_parameter_map, 'post_module') + if module != None: + logger.debug("POST RUN START") + module.post_run(hyper_parameter_map, output_map) + logger.debug("POST RUN STOP") + +def run_model(hyper_parameter_map): + instance_directory = hyper_parameter_map['instance_directory'] + os.chdir(instance_directory) + global logger + logger = log_tools.get_logger(logger, "MODEL RUNNER") + obj_return = get_obj_return() + result = run_pre(hyper_parameter_map) + if result == ModelResult.ERROR: + print("run_pre() returned ERROR!") + exit(1) + elif result == ModelResult.SKIP: + log("run_pre() returned SKIP ...") + sys.stdout.flush() + return ("SKIP", "HISTORY_EMPTY") + else: + assert(result == ModelResult.SUCCESS) # proceed... + + result, history = run(hyper_parameter_map, obj_return) + runner_utils.write_output(result, instance_directory) + runner_utils.write_output(json.dumps(history, cls=runner_utils.FromNPEncoder), + instance_directory, 'history.txt') + + run_post(hyper_parameter_map, {}) + log("RUN STOP") + return (result, history) + +def setup_params(pkg, hyper_parameter_map, params_arg): + params = pkg.initialize_parameters(**params_arg) + log("PARAM UPDATE START") + for k,v in hyper_parameter_map.items(): + if k == "dense" or k == "dense_feature_layers": + if(type(v) != list): + v = v.split(" ") + v = [int(i) for i in v] + if k == "cell_features": + cp_str = v + v = list() + v.append(cp_str) + log(str(k) + " = " + str(v)) + params[k] = v + log("PARAM UPDATE STOP") + + log("WRITE_PARAMS START") + runner_utils.write_params(params, hyper_parameter_map) + log("WRITE_PARAMS STOP") + return params + + +def get_results(history, obj_return): + """ + Return the history entry that the user requested. 
+ history: The Keras history object + """ + values = history.history[obj_return] + # Default: the last value in the history + result = values[-1] + + known_params = [ "loss", "val_loss", "val_corr", "val_dice_coef" ] + if obj_return not in known_params: + raise ValueError("Unsupported objective function: " + + "use obj_param to specify one of " + + str(known_params)) + + # Fix NaNs: + if math.isnan(result): + if obj_return == "val_corr" or obj_return == "val_dice_coef": + # Return the negative result + result = -result + else: + # Just return a large number + result = 999999999 + + print("result: " + obj_return + ": " + str(result)) + history_result = history.history.copy() + return result, history_result + +# Usage: see how sys.argv is unpacked below: +if __name__ == '__main__': + logger = log_tools.get_logger(logger, "MODEL_RUNNER") + log("RUN START") + + ( _, # The Python program name (unused) + param_string, + instance_directory, + framework, + runid, + benchmark_timeout ) = sys.argv + + hyper_parameter_map = runner_utils.init(param_string, + instance_directory, + framework, + out_dir_key='save') + hyper_parameter_map['model_name'] = os.getenv("MODEL_NAME") + if hyper_parameter_map['model_name'] == None: + raise Exception("No MODEL_NAME was in the environment!") + hyper_parameter_map['experiment_id'] = os.getenv("EXPID") + hyper_parameter_map['run_id'] = runid + hyper_parameter_map['timeout'] = float(benchmark_timeout) + + # tensorflow.__init__ calls _os.path.basename(_sys.argv[0]) + # so we need to create a synthetic argv. + # if (not hasattr(sys, 'argv')) or (len(sys.argv) == 0): + # sys.argv = ['nt3_tc1'] + sys.argv = ['null'] + run_model(hyper_parameter_map) diff --git a/workflows/common/sh/model_abstention.sh b/workflows/common/sh/model_abstention.sh new file mode 100644 index 00000000..73b025bc --- /dev/null +++ b/workflows/common/sh/model_abstention.sh @@ -0,0 +1,108 @@ +#!/bin/bash +set -eu + +# MODEL.SH + +# Shell wrapper around Keras model + +usage() +{ + echo "Usage: model.sh FRAMEWORK PARAMS RUNID" + echo "The environment should have:" + echo " EMEWS_PROJECT_ROOT|WORKFLOWS_ROOT TURBINE_OUTPUT" + echo " SITE OBJ_RETURN BENCHMARK_TIMEOUT" + echo " and MODEL_NAME EXPID for model_runner.py" + echo "If SH_TIMEOUT is provided, we run under the shell command timeout" +} + +if (( ${#} != 3 )) +then + usage + exit 1 +fi + +FRAMEWORK=$1 # Usually "keras" +# JSON string of parameters +PARAMS="$2" +RUNID=$3 + +# Each model run, runs in its own "instance" directory +# Set instance_directory to that and cd into it. +INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID + +SH_TIMEOUT=${SH_TIMEOUT:-} +TIMEOUT_CMD="" +if [[ -n "$SH_TIMEOUT" ]] && [[ $SH_TIMEOUT != "-1" ]] +then + TIMEOUT_CMD="timeout $SH_TIMEOUT" +fi + +# All stdout/stderr after this point goes into model.log ! +mkdir -p $INSTANCE_DIRECTORY +LOG_FILE=$INSTANCE_DIRECTORY/model.log +exec >> $LOG_FILE +exec 2>&1 +cd $INSTANCE_DIRECTORY + +echo "MODEL.SH START:" +echo "MODEL_NAME: $MODEL_NAME" +echo "RUNID: $RUNID" + +# Source langs-app-{SITE} from workflow/common/sh/ (cf. utils.sh) +if [[ ${WORKFLOWS_ROOT:-} == "" ]] +then + WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
; /bin/pwd ) +fi +source $WORKFLOWS_ROOT/common/sh/utils.sh +source_site langs-app $SITE + +echo +echo PARAMS: +echo $PARAMS | print_json + +echo +echo "MODEL.SH: USING PYTHON:" +which python +echo + +arg_array=( "$WORKFLOWS_ROOT/common/python/model_abstention_runner.py" + "$PARAMS" + "$INSTANCE_DIRECTORY" + "$FRAMEWORK" + "$RUNID" + "$BENCHMARK_TIMEOUT") +MODEL_CMD="python3 -u ${arg_array[@]}" +# echo MODEL_CMD: $MODEL_CMD +if $TIMEOUT_CMD python3 -u "${arg_array[@]}" +then + : # Assume success so we can keep a failed exit code +else + # $? is the exit status of the most recently executed command + # (i.e the line in the 'if' condition) + CODE=$? + echo # spacer + if [ $CODE == 124 ] + then + echo "MODEL.SH: Timeout error in $MODEL_CMD" + # This will trigger a NaN (the result file does not exist) + exit 0 + else + echo "MODEL.SH: Error (CODE=$CODE) in $MODEL_CMD" + echo "MODEL.SH: TIMESTAMP:" $( date "+%Y-%m-%d %H:%M:%S" ) + if (( ${IGNORE_ERRORS:-0} )) + then + echo "MODEL.SH: IGNORING ERROR." + # This will trigger a NaN (the result file does not exist) + exit 0 + fi + echo "MODEL.SH: ABORTING WORKFLOW (exit 1)" + exit 1 # Unknown error in Python: abort the workflow + fi +fi + +echo "MODEL.SH END: SUCCESS" +exit 0 # Success + +# Local Variables: +# sh-basic-offset: 2 +# End: diff --git a/workflows/common/sh/sched-summit.sh b/workflows/common/sh/sched-summit.sh index 9462913d..514ceab2 100644 --- a/workflows/common/sh/sched-summit.sh +++ b/workflows/common/sh/sched-summit.sh @@ -5,6 +5,7 @@ MACHINE="-m lsf" # Default PROJECT for CANDLE -export PROJECT=${PROJECT:-MED106} +#export QUEUE=${QUEUE:-batch-hm} +export PROJECT=${PROJECT:-MED110} # export TURBINE_OUTPUT_SOFTLINK=/dev/null diff --git a/workflows/common/swift/obj_abstention_py.swift b/workflows/common/swift/obj_abstention_py.swift new file mode 100644 index 00000000..fe65c2a2 --- /dev/null +++ b/workflows/common/swift/obj_abstention_py.swift @@ -0,0 +1,46 @@ + +/** + OBJ PY SWIFT +*/ + +string code_template = +---- +try: + import sys, traceback, json, os + import model_abstention_runner + import tensorflow + from tensorflow import keras + + obj_result = '-100' + outdir = '%s' + + if not os.path.exists(outdir): + os.makedirs(outdir) + + hyper_parameter_map = json.loads("""%s""") + hyper_parameter_map['framework'] = 'keras' + hyper_parameter_map['save'] = '{}/output'.format(outdir) + hyper_parameter_map['instance_directory'] = outdir + hyper_parameter_map['model_name'] = '%s' + hyper_parameter_map['experiment_id'] = '%s' + hyper_parameter_map['run_id'] = '%s' + hyper_parameter_map['timeout'] = %d + + obj_result, history = model_abstention_runner.run_model(hyper_parameter_map) + +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + sys.stdout.write('EXCEPTION: \\n' + repr(e) + ' ... \\n' + ''.join(s)) + sys.stdout.write('\\n') + sys.stdout.flush() + obj_result = 'EXCEPTION' +----; + +(string obj_result) obj(string params, string iter_indiv_id) { + string outdir = "%s/run/%s" % (turbine_output, iter_indiv_id); + string code = code_template % (outdir, params, model_name, + exp_id, iter_indiv_id, benchmark_timeout); + obj_result = python_persist(code, "str(obj_result)"); + printf("obj_py:obj(): obj_result: '%s'", obj_result); +} diff --git a/workflows/uq-noise/swift/workflow-abstention.sh b/workflows/uq-noise/swift/workflow-abstention.sh new file mode 100755 index 00000000..663e8c46 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-abstention.sh @@ -0,0 +1,196 @@ +#! 
/usr/bin/env bash +set -eu + +# UQ NOISE WORKFLOW +# Main entry point for UQ-NOISE workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3 +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +XCORR_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../xcorr ; /bin/pwd) +export XCORR_ROOT=${XCORR_ROOT:-$XCORR_DEFAULT} + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! { + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +echo "Running "$MODEL_NAME "workflow" + +source_site env $SITE +source_site sched $SITE + +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs + +export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT + +export TURBINE_JOBNAME="JOB:${EXPID}" + +if [ -z ${GPU_STRING+x} ]; +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +mkdir -pv $TURBINE_OUTPUT + +DB_FILE=$TURBINE_OUTPUT/uq-noise.db +if [[ ! -f DB_FILE ]] +then + if [[ ${UQ_NOISE_ID:-} == "" ]] + then + if [[ ${EXPID:0:1} == "X" ]] + then + UQ_NOISE_ID=${EXPID:1} + else + UQ_NOISE_ID=$EXPID + fi + fi + # $EMEWS_PROJECT_ROOT/db/db-cplo-init $DB_FILE $UQ_NOISE_ID +fi + +CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT + -exp_id=$EXPID + -site=$SITE + -db_file=$DB_FILE + $GPU_ARG + -cache_dir=$CACHE_DIR + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run +mkdir -pv $TURBINE_OUTPUT/data +mkdir -pv $CACHE_DIR +mkdir -pv $XCORR_DATA_DIR +mkdir -pv $TURBINE_OUTPUT/hpo_log + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-obj_abstention_$SWIFT_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model_abstention.sh + +# log_path PYTHONPATH + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow-abstention.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion." +fi + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. 
This will just be for + # warnings or unusual messages + # use for summit (slurm needs two %) + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + + #export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + mkdir -pv $TURBINE_OUTPUT/out + STDOUT="" +fi + +#echo ${CMD_LINE_ARGS[@]} + +cd $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-noise.swift $TURBINE_OUTPUT + +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +fi +TURBINE_RESIDENT_WORK_WORKERS=1 + +swift-t -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e TURBINE_STDOUT \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e XCORR_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} |& \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" + exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT + diff --git a/workflows/uq-noise/swift/workflow-abstention.swift b/workflows/uq-noise/swift/workflow-abstention.swift new file mode 100644 index 00000000..2be7e831 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-abstention.swift @@ -0,0 +1,63 @@ +/* + UQ NOISE SWIFT + Main workflow +*/ + +import assert; +import files; +import io; +import python; +import unix; +import sys; +import string; +import location; +import math; +import json; + +string FRAMEWORK = "keras"; + +string xcorr_root = getenv("XCORR_ROOT"); +string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); +string emews_root = getenv("EMEWS_PROJECT_ROOT"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +string exp_id = argv("exp_id"); +int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +string model_name = getenv("MODEL_NAME"); + +printf("UQ NOISE WORKFLOW.SWIFT"); +printf("TURBINE_OUTPUT: " + turbine_output); + +float std_dev_step = 0.05; // Difference between noises +int num_trials = 1; + +float num_std_dev_noise= 20; // Number of noise levels to try + +float std_dev_array[] = [0:num_std_dev_noise]; +int trials[] = [0:num_trials-1]; + +int feature_col = 11180; +float feature_threshold = 0.01; +string add_noise = "false"; +string noise_correlated = "true"; + +foreach level, i in std_dev_array +{ + foreach trial, k in trials + { + std_dev = level * std_dev_step; + run_id = "%0.2f-%01i" % (std_dev, k); + params = ("{ \"label_noise\" : %f , " + + " \"max_abs\" : %f, " + + " \"noise_correlated\" : %s, " + + " \"feature_col\" : %i, " + + " \"feature_threshold\" : %f, " + + " \"epochs\" : 100 } ") % + (std_dev, std_dev, noise_correlated, feature_col, feature_threshold); + printf("running: %s", params); + result = obj(params, run_id); + printf("result %s : std_dev %0.2f : %s", + run_id, std_dev, result); + } +} + diff --git a/workflows/uq-noise/swift/workflow-gnoise.sh b/workflows/uq-noise/swift/workflow-gnoise.sh new file mode 100755 index 00000000..b09be15a --- /dev/null +++ b/workflows/uq-noise/swift/workflow-gnoise.sh @@ -0,0 +1,196 @@ +#! 
/usr/bin/env bash +set -eu + +# UQ NOISE WORKFLOW +# Main entry point for UQ-NOISE workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3 +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +XCORR_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../xcorr ; /bin/pwd) +export XCORR_ROOT=${XCORR_ROOT:-$XCORR_DEFAULT} + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! { + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +echo "Running "$MODEL_NAME "workflow" + +source_site env $SITE +source_site sched $SITE + +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs + +export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT + +export TURBINE_JOBNAME="JOB:${EXPID}" + +if [ -z ${GPU_STRING+x} ]; +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +mkdir -pv $TURBINE_OUTPUT + +DB_FILE=$TURBINE_OUTPUT/uq-noise.db +if [[ ! -f DB_FILE ]] +then + if [[ ${UQ_NOISE_ID:-} == "" ]] + then + if [[ ${EXPID:0:1} == "X" ]] + then + UQ_NOISE_ID=${EXPID:1} + else + UQ_NOISE_ID=$EXPID + fi + fi + # $EMEWS_PROJECT_ROOT/db/db-cplo-init $DB_FILE $UQ_NOISE_ID +fi + +CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT + -exp_id=$EXPID + -site=$SITE + -db_file=$DB_FILE + $GPU_ARG + -cache_dir=$CACHE_DIR + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run +mkdir -pv $TURBINE_OUTPUT/data +mkdir -pv $CACHE_DIR +mkdir -pv $XCORR_DATA_DIR +mkdir -pv $TURBINE_OUTPUT/hpo_log + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh + +# log_path PYTHONPATH + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow-gnoise.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion." +fi + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. 
This will just be for + # warnings or unusual messages + # use for summit (slurm needs two %) + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + + #export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + mkdir -pv $TURBINE_OUTPUT/out + STDOUT="" +fi + +#echo ${CMD_LINE_ARGS[@]} + +cd $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-noise.swift $TURBINE_OUTPUT + +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +fi +TURBINE_RESIDENT_WORK_WORKERS=1 + +swift-t -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e TURBINE_STDOUT \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e XCORR_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} |& \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" + exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT + diff --git a/workflows/uq-noise/swift/workflow-gnoise.swift b/workflows/uq-noise/swift/workflow-gnoise.swift new file mode 100644 index 00000000..b89cd849 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-gnoise.swift @@ -0,0 +1,63 @@ +/* + UQ NOISE SWIFT + Main workflow +*/ + +import assert; +import files; +import io; +import python; +import unix; +import sys; +import string; +import location; +import math; +import json; + +string FRAMEWORK = "keras"; + +string xcorr_root = getenv("XCORR_ROOT"); +string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); +string emews_root = getenv("EMEWS_PROJECT_ROOT"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +string exp_id = argv("exp_id"); +int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +string model_name = getenv("MODEL_NAME"); + +printf("UQ NOISE WORKFLOW.SWIFT"); +printf("TURBINE_OUTPUT: " + turbine_output); + +float std_dev_step = 0.025; // Difference between noises +int num_trials = 1; + +float num_std_dev_noise= 20; // Number of noise levels to try + +float std_dev_array[] = [0:num_std_dev_noise]; +int trials[] = [0:num_trials-1]; + +int feature_col = 50; +float feature_threshold = 0.02; +string add_noise = "false"; +string noise_correlated = "false"; + +foreach level, i in std_dev_array +{ + foreach trial, k in trials + { + std_dev = level * std_dev_step; + run_id = "%0.3f-%01i" % (std_dev, k); + params = ("{ \"std_dev\" : %f , " + + " \"gaussian_noise\" : %s, " + + " \"noise_correlated\" : %s, " + + " \"feature_threshold\" : %f, " + + " \"feature_col\" : %i, " + + " \"epochs\" : 200 } ") % + (std_dev, add_noise, noise_correlated, feature_threshold, feature_col); + printf("running: %s", params); + result = obj(params, run_id); + printf("result %s : std_dev %0.2f : %s", + run_id, std_dev, result); + } +} + diff --git a/workflows/uq-noise/swift/workflow-noise.swift b/workflows/uq-noise/swift/workflow-noise.swift index 703af616..4c0ea381 100644 --- a/workflows/uq-noise/swift/workflow-noise.swift +++ b/workflows/uq-noise/swift/workflow-noise.swift @@ -12,6 +12,7 @@ import sys; import string; 
import location; import math; +import json; string FRAMEWORK = "keras"; @@ -27,14 +28,19 @@ string model_name = getenv("MODEL_NAME"); printf("UQ NOISE WORKFLOW.SWIFT"); printf("TURBINE_OUTPUT: " + turbine_output); -float noise_step = 10.0; // Difference between noises +float noise_step = 5.0; // Difference between noises int num_trials = 1; -float num_label_noise= 10; // Number of noise levels to try +float num_label_noise= 20; // Number of noise levels to try float label_noise_array[] = [0:num_label_noise]; int trials[] = [0:num_trials-1]; +int feature_col = 11180; +float feature_threshold = 0.01; +string add_noise = "true"; +string noise_correlated = "true"; + foreach level, i in label_noise_array { foreach trial, k in trials @@ -42,8 +48,12 @@ foreach level, i in label_noise_array label_noise = level * noise_step/100; run_id = "%0.2f-%01i" % (label_noise, k); params = ("{ \"label_noise\" : %f , " + - " \"epochs\" : 100 } ") % - (label_noise); + " \"add_noise\" : %s, " + + " \"noise_correlated\" : %s, " + + " \"feature_threshold\" : %f, " + + " \"feature_col\" : %i, " + + " \"epochs\" : 200 } ") % + (label_noise, add_noise, noise_correlated, feature_threshold, feature_col); printf("running: %s", params); result = obj(params, run_id); printf("result %s : label_noise %0.2f : %s", diff --git a/workflows/uq-noise/swift/workflow.sh b/workflows/uq-noise/swift/workflow.sh index 799cd2a2..b8f3464f 100755 --- a/workflows/uq-noise/swift/workflow.sh +++ b/workflows/uq-noise/swift/workflow.sh @@ -154,6 +154,9 @@ if [[ ${SITE} == "summit" ]] then export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" fi + + +export TURBINE_DIRECTIVE="#BSUB -q batch-hm" TURBINE_RESIDENT_WORK_WORKERS=1 swift-t -n $PROCS \ diff --git a/workflows/uq-noise/test/abstention.sh b/workflows/uq-noise/test/abstention.sh new file mode 100755 index 00000000..45259f54 --- /dev/null +++ b/workflows/uq-noise/test/abstention.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eu + +# UQ NOISE TEST 1 + +usage() +{ + echo "Usage: test SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 2 )) +then + RUN_DIR=$2 +elif (( ${#} == 1 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=nt3 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +# export CFG_SYS=$THIS/cfg-sys-big.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow-abstention.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + +# Wait for job +TURBINE_OUTPUT=$( readlink turbine-output ) +queue_wait + +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/uq-noise/test/cfg-sys-1.sh b/workflows/uq-noise/test/cfg-sys-1.sh index d1c515a5..9713b82a 100644 --- a/workflows/uq-noise/test/cfg-sys-1.sh +++ b/workflows/uq-noise/test/cfg-sys-1.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-2} +export PROCS=${PROCS:-25} # export PROCS=${PROCS:-128} # MPI processes per node @@ -12,7 +12,7 @@ export PROCS=${PROCS:-2} export PPN=${PPN:-1} # For Theta: -export QUEUE=${QUEUE:-debug-cache-quad} +#export QUEUE=${QUEUE:-batch-hm} # export QUEUE=R.candle export WALLTIME=${WALLTIME:-02:00:00} diff --git a/workflows/uq-noise/test/gnoise.sh b/workflows/uq-noise/test/gnoise.sh new file mode 100755 index 00000000..88fe90c7 --- /dev/null +++ b/workflows/uq-noise/test/gnoise.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eu + +# UQ NOISE TEST 1 + +usage() +{ + echo "Usage: test SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 2 )) +then + RUN_DIR=$2 +elif (( ${#} == 1 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=nt3 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +# export CFG_SYS=$THIS/cfg-sys-big.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow-gnoise.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + +# Wait for job +TURBINE_OUTPUT=$( readlink turbine-output ) +queue_wait + +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/uq-noise/test/test-noise.sh b/workflows/uq-noise/test/test-noise.sh index 98ce08dd..6513526f 100755 --- a/workflows/uq-noise/test/test-noise.sh +++ b/workflows/uq-noise/test/test-noise.sh @@ -32,7 +32,7 @@ WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) source $WORKFLOWS_ROOT/common/sh/utils.sh # Select configurations -export CFG_SYS=$THIS/cfg-sys-small.sh +export CFG_SYS=$THIS/cfg-sys-1.sh # export CFG_SYS=$THIS/cfg-sys-big.sh export CFG_PRM=$THIS/cfg-prm-1.sh From d0b4f953491767d9ef4dbd437c955cea1ac0f3ce Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Wed, 2 Sep 2020 20:31:50 -0400 Subject: [PATCH 076/601] Fixed R syntax and typo in workflows/common/R/install-candle.R and updated env-biowulf.sh --- workflows/common/R/install-candle.R | 4 +-- workflows/common/sh/env-biowulf.sh | 53 ++++++++++++++++------------- 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/workflows/common/R/install-candle.R b/workflows/common/R/install-candle.R index 51ade33b..8d16b094 100644 --- a/workflows/common/R/install-candle.R +++ b/workflows/common/R/install-candle.R @@ -16,11 +16,11 @@ options(repos = r) # Force Plotly 4.5.6 - not latest! Do not want shiny/httpuv, it does not work on Cooley! install.packages("https://cran.r-project.org/src/contrib/Archive/plotly/plotly_4.5.6.tar.gz") -PKGS=( +PKGS <- list( "smoof", "rgenoud", "DiceKriging", - "randomForest" + "randomForest", "jsonlite", "parallelMap", "RInside", diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index b9885481..9ebc4c74 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -1,11 +1,12 @@ #!/bin/bash -# Assume candle module is loaded as usual +# Assume the candle module is loaded as usual # Load the environments for each MPI implementation -if [ $USE_OPENMPI -eq 1 ]; then +if [ "x$USE_OPENMPI" == "x1" ]; then # probably always use this on Biowulf as it's the best supported #module load gcc/7.3.0 openmpi/3.1.2/cuda-9.0/gcc-7.3.0-pmi2 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 # Note I had to stop using openmpi/3.1.2/cuda-9.0/gcc-7.3.0-pmi2 because at least as of 6/19/19 Biowulf seemed to stop supporting it (it was available only as a "hidden" module) - module load gcc/7.3.0 openmpi/3.1.3/cuda-9.2/gcc-7.3.0-pmi2 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 + #module load gcc/7.3.0 openmpi/3.1.3/cuda-9.2/gcc-7.3.0-pmi2 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 + module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/12.0.1 pcre2/10.21 GSL/2.6_gcc-9.2.0 # new stack on 8/14/20 - note, per my emails with Biowulf, they disabled development in PMI2 OpenMPI environments; further added pcre2/10.21 on 9/2/20 as otherwise installing Supervisor's R packages wouldn't work as R could not start at all; further added GSL/2.6_gcc-9.2.0 on 9/2/20 as otherwise the ggplot2 installation for Supervisor failed export OMPI_MCA_mpi_warn_on_fork=0 else module load tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 @@ -18,39 +19,43 @@ else export CPATH=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/include:$CPATH fi -# Load R/3.5.0 paths manually since we can't load the module on the Biowulf submit nodes -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/GSL/gcc-7.2.0/2.4/lib:/usr/local/geos/3.6.2/lib:/usr/local/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64 -export PATH=$PATH:/usr/local/GSL/gcc-7.2.0/2.4/bin:/usr/local/apps/R/3.5/3.5.0_build2/bin -export R_LIBS_SITE=/usr/local/apps/R/3.5/site-library_build2 -export R_LIBS_USER=~/R/%v/library -export R_LIBS=$CANDLE/R/libs +# Load R/4.0.0 paths manually since we can't load the module on the Biowulf submit nodes (part of new stack on 8/13/20) +#export 
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/GSL/gcc-7.2.0/2.4/lib:/usr/local/geos/3.6.2/lib:/usr/local/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64 +#export PATH=$PATH:/usr/local/GSL/gcc-7.2.0/2.4/bin:/usr/local/apps/R/3.5/3.5.0_build2/bin +#export R_LIBS_SITE=/usr/local/apps/R/3.5/site-library_build2 +#export R_LIBS_USER=~/R/%v/library +export PATH="$PATH:/usr/local/apps/R/4.0/4.0.0/bin" +export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" +export R_LIBS_USER="$R_LIBS_USER:~/R/%v/library" +export R_LIBS_SITE="/usr/local/apps/R/4.0/site-library_4.0.0" +export R_LIBS="$CANDLE/R/libs" # Swift/T setup -export SWIFT_T_INSTALL=$CANDLE/swift-t-install -# NOTE: Below is 1 of 2 lines needed to run swift-t out-of-the-box -export PATH=$PATH:$SWIFT_T_INSTALL/stc/bin -export PATH=$PATH:$SWIFT_T_INSTALL/turbine/bin -export PYTHONPATH=$PYTHONPATH:$SWIFT_T_INSTALL/turbine/py -export TURBINE_HOME=$SWIFT_T_INSTALL/turbine -export TURBINE_LOG=1 -export ADLB_DEBUG_RANKS=1 -export ADLB_DEBUG_HOSTMAP=1 +export SWIFT_T_INSTALL="$CANDLE/swift-t-install" +export PATH="$PATH:$SWIFT_T_INSTALL/stc/bin" # this is likely 1 of 2 lines needed to run swift-t out-of-the-box +export PATH="$PATH:$SWIFT_T_INSTALL/turbine/bin" +export PYTHONPATH="$PYTHONPATH:$SWIFT_T_INSTALL/turbine/py" +export TURBINE_HOME="$SWIFT_T_INSTALL/turbine" +export TURBINE_LOG="1" +export ADLB_DEBUG_RANKS="1" +export ADLB_DEBUG_HOSTMAP="1" export SWIFT_IMPL="app" # Resident task workers and ranks if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ]; then - # Resident task workers and ranks - export TURBINE_RESIDENT_WORK_WORKERS=1 + export TURBINE_RESIDENT_WORK_WORKERS="1" export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) fi # NOTE: Below is 2 of 2 lines needed to run swift-t out-of-the-box (no longer needed!!) 
#export LD_PRELOAD=/usr/local/slurm/lib/libslurm.so:$LD_PRELOAD # this is the only way aside from recompiling Swift/T I believe to get past an error regarding /usr/local/slurm/lib/slurm/auth_munge.so, e.g., "/usr/local/Tcl_Tk/8.6.8/gcc_7.2.0/bin/tclsh8.6: symbol lookup error: /usr/local/slurm/lib/slurm/auth_munge.so: undefined symbol: slurm_debug" # Set up EMEWS Queues -export EQR=$CANDLE/Supervisor/workflows/common/ext/EQ-R # I don’t know where else to find this directory that needs to be available, e.g., in workflow.sh -export EQPy=$CANDLE/Supervisor/workflows/common/ext/EQ-Py +export EQR="$CANDLE/Supervisor/workflows/common/ext/EQ-R" # I don’t know where else to find this directory that needs to be available, e.g., in workflow.sh +export EQPy="$CANDLE/Supervisor/workflows/common/ext/EQ-Py" # Other additions -export PYTHONPATH=$PYTHONPATH:$CANDLE/Supervisor/workflows/common/python +export PYTHONPATH="$PYTHONPATH:$CANDLE/Supervisor/workflows/common/python" # Log settings to output -which python swift-t +command -v python || echo "WARNING: Program 'python' not found" +command -v swift-t || echo "WARNING: Program 'swift-t' not found" From 16726bc6c6cbe25040c3862eb7cbec7fbfc961ab Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 3 Sep 2020 15:44:14 -0400 Subject: [PATCH 077/601] o Fix TURBINE_LAUNCH_OPTIONS for summit --- workflows/upf/swift/workflow.sh | 5 +++++ workflows/uq-noise/swift/workflow-abstention.sh | 2 +- workflows/uq-noise/swift/workflow-abstention.swift | 4 ++-- workflows/uq-noise/swift/workflow-gnoise.sh | 2 +- workflows/uq-noise/swift/workflow-gnoise.swift | 4 +++- workflows/uq-noise/swift/workflow-noise.sh | 2 +- workflows/uq-noise/test/cfg-sys-1.sh | 4 ++-- 7 files changed, 15 insertions(+), 8 deletions(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index bdfc0f9e..af26762c 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -88,6 +88,11 @@ module list cp -v $UPF $TURBINE_OUTPUT +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" +fi + TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" swift-t -n $PROCS \ diff --git a/workflows/uq-noise/swift/workflow-abstention.sh b/workflows/uq-noise/swift/workflow-abstention.sh index 663e8c46..8dc0c779 100755 --- a/workflows/uq-noise/swift/workflow-abstention.sh +++ b/workflows/uq-noise/swift/workflow-abstention.sh @@ -152,7 +152,7 @@ cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-noise.swift $TURBIN if [[ ${SITE} == "summit" ]] then - export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" fi TURBINE_RESIDENT_WORK_WORKERS=1 diff --git a/workflows/uq-noise/swift/workflow-abstention.swift b/workflows/uq-noise/swift/workflow-abstention.swift index 2be7e831..3fdda975 100644 --- a/workflows/uq-noise/swift/workflow-abstention.swift +++ b/workflows/uq-noise/swift/workflow-abstention.swift @@ -29,14 +29,14 @@ printf("UQ NOISE WORKFLOW.SWIFT"); printf("TURBINE_OUTPUT: " + turbine_output); float std_dev_step = 0.05; // Difference between noises -int num_trials = 1; +int num_trials = 2; float num_std_dev_noise= 20; // Number of noise levels to try float std_dev_array[] = [0:num_std_dev_noise]; int trials[] = [0:num_trials-1]; -int feature_col = 11180; +int feature_col = 50; float feature_threshold = 0.01; string add_noise = "false"; string noise_correlated = "true"; diff --git a/workflows/uq-noise/swift/workflow-gnoise.sh b/workflows/uq-noise/swift/workflow-gnoise.sh index 
b09be15a..3369d556 100755 --- a/workflows/uq-noise/swift/workflow-gnoise.sh +++ b/workflows/uq-noise/swift/workflow-gnoise.sh @@ -152,7 +152,7 @@ cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-noise.swift $TURBIN if [[ ${SITE} == "summit" ]] then - export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" fi TURBINE_RESIDENT_WORK_WORKERS=1 diff --git a/workflows/uq-noise/swift/workflow-gnoise.swift b/workflows/uq-noise/swift/workflow-gnoise.swift index b89cd849..1f37a269 100644 --- a/workflows/uq-noise/swift/workflow-gnoise.swift +++ b/workflows/uq-noise/swift/workflow-gnoise.swift @@ -39,6 +39,7 @@ int trials[] = [0:num_trials-1]; int feature_col = 50; float feature_threshold = 0.02; string add_noise = "false"; +string gaussian_noise = "true"; string noise_correlated = "false"; foreach level, i in std_dev_array @@ -48,12 +49,13 @@ foreach level, i in std_dev_array std_dev = level * std_dev_step; run_id = "%0.3f-%01i" % (std_dev, k); params = ("{ \"std_dev\" : %f , " + + " \"add_noise\" : %s, " + " \"gaussian_noise\" : %s, " + " \"noise_correlated\" : %s, " + " \"feature_threshold\" : %f, " + " \"feature_col\" : %i, " + " \"epochs\" : 200 } ") % - (std_dev, add_noise, noise_correlated, feature_threshold, feature_col); + (std_dev, add_noise, gaussian_noise, noise_correlated, feature_threshold, feature_col); printf("running: %s", params); result = obj(params, run_id); printf("result %s : std_dev %0.2f : %s", diff --git a/workflows/uq-noise/swift/workflow-noise.sh b/workflows/uq-noise/swift/workflow-noise.sh index 1ebab699..9efb8247 100755 --- a/workflows/uq-noise/swift/workflow-noise.sh +++ b/workflows/uq-noise/swift/workflow-noise.sh @@ -152,7 +152,7 @@ cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-noise.swift $TURBIN if [[ ${SITE} == "summit" ]] then - export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" fi TURBINE_RESIDENT_WORK_WORKERS=1 diff --git a/workflows/uq-noise/test/cfg-sys-1.sh b/workflows/uq-noise/test/cfg-sys-1.sh index 9713b82a..732bd148 100644 --- a/workflows/uq-noise/test/cfg-sys-1.sh +++ b/workflows/uq-noise/test/cfg-sys-1.sh @@ -4,12 +4,12 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-25} +export PROCS=${PROCS:-12} # export PROCS=${PROCS:-128} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} +export PPN=${PPN:-6} # For Theta: #export QUEUE=${QUEUE:-batch-hm} From f64cc576711e24b0bf37c1fa7ca64bff8158f9b8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 3 Sep 2020 15:06:18 -0500 Subject: [PATCH 078/601] export TURBINE_STDOUT - something changed with this --- workflows/cp-leaveout/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index f81a6512..664c460b 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -199,7 +199,7 @@ else STDOUT="" fi -TURBINE_STDOUT="" # "$TURBINE_OUTPUT/out/out-%%r.txt" +export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" mkdir -pv $TURBINE_OUTPUT/out swift-t -O 0 -n $PROCS \ From d398595bfe79d0563bf4305efc500ec95cfc07af Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 11 Sep 2020 00:31:49 -0400 Subject: [PATCH 079/601] o Workflow: abstention for Gaussian noise --- 
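Usage sketch (site name illustrative): the new Gaussian-noise abstention workflow is submitted through the test wrapper added below, e.g.

    ./test/gauss-abs.sh summit -a

where -a lets the wrapper pick the experiment directory automatically and cfg-sys-1.sh / cfg-prm-1.sh supply the system and parameter configurations. workflow-gauss-abs.swift then sweeps std_dev noise levels for the nt3 model.
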
.../uq-noise/swift/workflow-gauss-abs.sh | 196 ++++++++++++++++++ .../uq-noise/swift/workflow-gauss-abs.swift | 73 +++++++ workflows/uq-noise/test/cfg-sys-1.sh | 4 +- workflows/uq-noise/test/gauss-abs.sh | 65 ++++++ 4 files changed, 336 insertions(+), 2 deletions(-) create mode 100755 workflows/uq-noise/swift/workflow-gauss-abs.sh create mode 100644 workflows/uq-noise/swift/workflow-gauss-abs.swift create mode 100755 workflows/uq-noise/test/gauss-abs.sh diff --git a/workflows/uq-noise/swift/workflow-gauss-abs.sh b/workflows/uq-noise/swift/workflow-gauss-abs.sh new file mode 100755 index 00000000..3061c3d5 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-gauss-abs.sh @@ -0,0 +1,196 @@ +#! /usr/bin/env bash +set -eu + +# UQ NOISE WORKFLOW +# Main entry point for UQ-NOISE workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3 +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +XCORR_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../xcorr ; /bin/pwd) +export XCORR_ROOT=${XCORR_ROOT:-$XCORR_DEFAULT} + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! { + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +echo "Running "$MODEL_NAME "workflow" + +source_site env $SITE +source_site sched $SITE + +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs + +export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT + +export TURBINE_JOBNAME="JOB:${EXPID}" + +if [ -z ${GPU_STRING+x} ]; +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +mkdir -pv $TURBINE_OUTPUT + +DB_FILE=$TURBINE_OUTPUT/uq-noise.db +if [[ ! 
-f DB_FILE ]] +then + if [[ ${UQ_NOISE_ID:-} == "" ]] + then + if [[ ${EXPID:0:1} == "X" ]] + then + UQ_NOISE_ID=${EXPID:1} + else + UQ_NOISE_ID=$EXPID + fi + fi + # $EMEWS_PROJECT_ROOT/db/db-cplo-init $DB_FILE $UQ_NOISE_ID +fi + +CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT + -exp_id=$EXPID + -site=$SITE + -db_file=$DB_FILE + $GPU_ARG + -cache_dir=$CACHE_DIR + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run +mkdir -pv $TURBINE_OUTPUT/data +mkdir -pv $CACHE_DIR +mkdir -pv $XCORR_DATA_DIR +mkdir -pv $TURBINE_OUTPUT/hpo_log + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-obj_abstention_$SWIFT_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model_abstention.sh + +# log_path PYTHONPATH + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow-gauss-abs.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion." +fi + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. This will just be for + # warnings or unusual messages + # use for summit (slurm needs two %) + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + + #export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + mkdir -pv $TURBINE_OUTPUT/out + STDOUT="" +fi + +#echo ${CMD_LINE_ARGS[@]} + +cd $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-gauss-abs.swift $TURBINE_OUTPUT + +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" +fi +TURBINE_RESIDENT_WORK_WORKERS=1 + +swift-t -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e TURBINE_STDOUT \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e XCORR_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} |& \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" 
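+  # Note: ${PIPESTATUS[0]} (rather than $?) holds the swift-t exit status,
+  # because the launch above is piped through tee, which would otherwise mask failures.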
+ exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT + diff --git a/workflows/uq-noise/swift/workflow-gauss-abs.swift b/workflows/uq-noise/swift/workflow-gauss-abs.swift new file mode 100644 index 00000000..fd87b698 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-gauss-abs.swift @@ -0,0 +1,73 @@ +/* + UQ NOISE SWIFT + Main workflow +*/ + +import assert; +import files; +import io; +import python; +import unix; +import sys; +import string; +import location; +import math; +import json; + +string FRAMEWORK = "keras"; + +string xcorr_root = getenv("XCORR_ROOT"); +string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); +string emews_root = getenv("EMEWS_PROJECT_ROOT"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +string exp_id = argv("exp_id"); +int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +string model_name = getenv("MODEL_NAME"); + +printf("UQ NOISE WORKFLOW.SWIFT"); +printf("TURBINE_OUTPUT: " + turbine_output); + +float std_dev_step = 0.025; // Difference between noises +int num_trials = 1; + +float num_std_dev_noise= 20; // Number of noise levels to try + +float std_dev_array[] = [0:num_std_dev_noise]; +int trials[] = [0:num_trials-1]; + +int feature_col = 50; +float feature_threshold = 0.01; +string add_noise = "false"; +string noise_correlated = "false"; +string gaussian_noise = "true"; + + +float abs_vals[] = [0.01964286183, 0.01785714711, 0.01785714711, 0.02500000596, 0.02500000596, 0.03035715009, 0.03392857526, 0.03392857526, 0.05892858122, 0.05714286438, 0.08928572493, 0.1000000047, 0.1053571467, 0.1821428537, 0.1732142823, 0.2124999974, 0.2339285719, 0.1982142861, 0.3696428559, 0.2250000026, 0.2999999991]; + +foreach level, i in std_dev_array +{ + foreach trial, k in trials + { + std_dev = level * std_dev_step; + run_id = "%0.2f-%01i" % (std_dev, k); + + max_abs = abs_vals[i]; + + params = ("{ \"label_noise\" : %f , " + + " \"max_abs\" : %f, " + + " \"std_dev\" : %f, " + + " \"gaussian_noise\" : %s, " + + " \"noise_correlated\" : %s, " + + " \"feature_col\" : %i, " + + " \"feature_threshold\" : %f, " + + " \"epochs\" : 100 } ") % + (std_dev, max_abs, std_dev, gaussian_noise, noise_correlated, feature_col, feature_threshold); + printf("running: %s", params); + result = obj(params, run_id); + printf("result %s : std_dev %0.2f : %s", + run_id, std_dev, result); + } +} + + diff --git a/workflows/uq-noise/test/cfg-sys-1.sh b/workflows/uq-noise/test/cfg-sys-1.sh index 732bd148..03183555 100644 --- a/workflows/uq-noise/test/cfg-sys-1.sh +++ b/workflows/uq-noise/test/cfg-sys-1.sh @@ -4,12 +4,12 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-12} +export PROCS=${PROCS:-30} # export PROCS=${PROCS:-128} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-6} +export PPN=${PPN:-1} # For Theta: #export QUEUE=${QUEUE:-batch-hm} diff --git a/workflows/uq-noise/test/gauss-abs.sh b/workflows/uq-noise/test/gauss-abs.sh new file mode 100755 index 00000000..2938256b --- /dev/null +++ b/workflows/uq-noise/test/gauss-abs.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eu + +# UQ NOISE TEST 1 + +usage() +{ + echo "Usage: test SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 2 )) +then + RUN_DIR=$2 +elif (( ${#} == 1 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=nt3 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname 
$0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +# export CFG_SYS=$THIS/cfg-sys-big.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow-gauss-abs.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + +# Wait for job +TURBINE_OUTPUT=$( readlink turbine-output ) +queue_wait + +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From 9228deda5933ade69610190339ab1d55d0cb62a5 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Sat, 12 Sep 2020 21:44:49 -0400 Subject: [PATCH 080/601] About to update env-biowulf.sh to use the new tcl that I just built --- workflows/common/sh/env-biowulf.sh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index 9ebc4c74..44c8881f 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -6,7 +6,10 @@ if [ "x$USE_OPENMPI" == "x1" ]; then # probably always use this on Biowulf as it's the best supported #module load gcc/7.3.0 openmpi/3.1.2/cuda-9.0/gcc-7.3.0-pmi2 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 # Note I had to stop using openmpi/3.1.2/cuda-9.0/gcc-7.3.0-pmi2 because at least as of 6/19/19 Biowulf seemed to stop supporting it (it was available only as a "hidden" module) #module load gcc/7.3.0 openmpi/3.1.3/cuda-9.2/gcc-7.3.0-pmi2 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 - module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/12.0.1 pcre2/10.21 GSL/2.6_gcc-9.2.0 # new stack on 8/14/20 - note, per my emails with Biowulf, they disabled development in PMI2 OpenMPI environments; further added pcre2/10.21 on 9/2/20 as otherwise installing Supervisor's R packages wouldn't work as R could not start at all; further added GSL/2.6_gcc-9.2.0 on 9/2/20 as otherwise the ggplot2 installation for Supervisor failed + + #module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/12.0.1 pcre2/10.21 GSL/2.6_gcc-9.2.0 # new stack on 8/14/20 - note, per my emails with Biowulf, they disabled development in PMI2 OpenMPI environments; further added pcre2/10.21 on 9/2/20 as otherwise installing Supervisor's R packages wouldn't work as R could not start at all; further added GSL/2.6_gcc-9.2.0 on 9/2/20 as otherwise the ggplot2 installation for Supervisor failed; this seems to work up to a "Source option 6 is no longer supported. 
Use 7 or later", seemingly Java-related error + module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_211 pcre2/10.21 GSL/2.6_gcc-9.2.0 # going from java/12.0.1 --> java/1.8.0_211 seems to fix the Java-related error above + export OMPI_MCA_mpi_warn_on_fork=0 else module load tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 @@ -19,6 +22,15 @@ else export CPATH=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/include:$CPATH fi +# Set variables for CANDLE dependencies (mostly, Swift/T dependencies) +export CANDLE_DEP_MPI="/usr/local/OpenMPI/4.0.4/CUDA-10.2/gcc-9.2.0" +export CANDLE_DEP_TCL="/usr/local/Tcl_Tk/8.6.8/gcc_7.2.0" +export CANDLE_DEP_PY="/usr/local/Anaconda/envs/py3.7" +export CANDLE_DEP_R="/usr/local/apps/R/4.0/4.0.0/lib64/R" +export CANDLE_DEP_R_SITE="/usr/local/apps/R/4.0/site-library_4.0.0" +export CANDLE_DEP_ANT="/usr/local/apps/ant/1.10.3" +export CANDLE_LAUNCHER_OPTION="--with-launcher=/usr/local/slurm/bin/srun" + # Load R/4.0.0 paths manually since we can't load the module on the Biowulf submit nodes (part of new stack on 8/13/20) #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/GSL/gcc-7.2.0/2.4/lib:/usr/local/geos/3.6.2/lib:/usr/local/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64 #export PATH=$PATH:/usr/local/GSL/gcc-7.2.0/2.4/bin:/usr/local/apps/R/3.5/3.5.0_build2/bin @@ -28,7 +40,7 @@ export PATH="$PATH:/usr/local/apps/R/4.0/4.0.0/bin" export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" export R_LIBS_USER="$R_LIBS_USER:~/R/%v/library" -export R_LIBS_SITE="/usr/local/apps/R/4.0/site-library_4.0.0" +export R_LIBS_SITE="$CANDLE_DEP_R_SITE" export R_LIBS="$CANDLE/R/libs" # Swift/T setup From 7fba8e4d6213701457acdb7b8c4f69400a4c3e8a Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Sat, 12 Sep 2020 23:53:45 -0400 Subject: [PATCH 081/601] Got Swift/T working with updated stack due to recompiling tcl with similar stack --- workflows/common/R/install-candle.R | 14 +++++++++++++- workflows/common/sh/env-biowulf.sh | 12 ++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/workflows/common/R/install-candle.R b/workflows/common/R/install-candle.R index 8d16b094..8e22607d 100644 --- a/workflows/common/R/install-candle.R +++ b/workflows/common/R/install-candle.R @@ -15,13 +15,25 @@ options(repos = r) # Force Plotly 4.5.6 - not latest! Do not want shiny/httpuv, it does not work on Cooley! 
install.packages("https://cran.r-project.org/src/contrib/Archive/plotly/plotly_4.5.6.tar.gz") +install.packages("https://cran.r-project.org/src/contrib/Archive/jsonlite/jsonlite_1.7.0.tar.gz") # ALW adding this on 9/12/20 (and removing jsonlite from PKGS list below) because sometime in the first two weeks of Sept 2020 the default jsonlite version became 1.7.1 and this seems to throw an error that looks to me like a bug that should be fixed with time; e.g., while everything worked in early Sept 2020 (probably 9/2/20), now on 9/12/20 I get this error: +# * DONE (jsonlite) +# 1): succeeded '/usr/local/apps/R/4.0/4.0.0/lib64/R/bin/R CMD INSTALL -l '/gpfs/gsfs9/users/BIDS-HPC/public/software/distributions/candle/dev_2/builds/R/libs' '/lscratch/64803361/Rtmpnd5yDC/downloaded_packages/jsonlite_1.7.1.tar.gz'' +# The downloaded source packages are in +# ‘/lscratch/64803361/Rtmpnd5yDC/downloaded_packages’ +# [1] "" +# LOAD: jsonlite +# Error in value[[3L]](cond) : +# Package ‘jsonlite’ version 1.7.0 cannot be unloaded: +# Error in unloadNamespace(package) : namespace ‘jsonlite’ is imported by ‘plotly’ so cannot be unloaded +# Calls: library ... tryCatch -> tryCatchList -> tryCatchOne -> +# Execution halted +# ****NOTE**** that I tried installing both plotly and jsonlite the normal way (in the PKGS below instead of a specific version above) and I got the same error PKGS <- list( "smoof", "rgenoud", "DiceKriging", "randomForest", - "jsonlite", "parallelMap", "RInside", "mlrMBO" diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index 44c8881f..a9505616 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -8,7 +8,8 @@ if [ "x$USE_OPENMPI" == "x1" ]; then # probably always use this on Biowulf as it #module load gcc/7.3.0 openmpi/3.1.3/cuda-9.2/gcc-7.3.0-pmi2 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 #module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/12.0.1 pcre2/10.21 GSL/2.6_gcc-9.2.0 # new stack on 8/14/20 - note, per my emails with Biowulf, they disabled development in PMI2 OpenMPI environments; further added pcre2/10.21 on 9/2/20 as otherwise installing Supervisor's R packages wouldn't work as R could not start at all; further added GSL/2.6_gcc-9.2.0 on 9/2/20 as otherwise the ggplot2 installation for Supervisor failed; this seems to work up to a "Source option 6 is no longer supported. 
Use 7 or later", seemingly Java-related error - module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_211 pcre2/10.21 GSL/2.6_gcc-9.2.0 # going from java/12.0.1 --> java/1.8.0_211 seems to fix the Java-related error above + #module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_211 pcre2/10.21 GSL/2.6_gcc-9.2.0 # going from java/12.0.1 --> java/1.8.0_211 seems to fix the Java-related error above + module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 ant/1.10.3 java/1.8.0_211 pcre2/10.21 GSL/2.6_gcc-9.2.0 # removing tcl_tk in order to use the one we just built on 9/12/20 export OMPI_MCA_mpi_warn_on_fork=0 else @@ -22,9 +23,16 @@ else export CPATH=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/include:$CPATH fi + +export PATH="/data/BIDS-HPC/public/software/builds/tcl/bin:$PATH" +export LD_LIBRARY_PATH="/data/BIDS-HPC/public/software/builds/tcl/lib:$LD_LIBRARY_PATH" +export MANPATH="/data/BIDS-HPC/public/software/builds/tcl/man:$MANPATH" + + # Set variables for CANDLE dependencies (mostly, Swift/T dependencies) export CANDLE_DEP_MPI="/usr/local/OpenMPI/4.0.4/CUDA-10.2/gcc-9.2.0" -export CANDLE_DEP_TCL="/usr/local/Tcl_Tk/8.6.8/gcc_7.2.0" +#export CANDLE_DEP_TCL="/usr/local/Tcl_Tk/8.6.8/gcc_7.2.0" +export CANDLE_DEP_TCL="/data/BIDS-HPC/public/software/builds/tcl" export CANDLE_DEP_PY="/usr/local/Anaconda/envs/py3.7" export CANDLE_DEP_R="/usr/local/apps/R/4.0/4.0.0/lib64/R" export CANDLE_DEP_R_SITE="/usr/local/apps/R/4.0/site-library_4.0.0" From 1401a02474a2d75f607a44723c4085ae7793b8bf Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 16 Sep 2020 11:01:26 -0500 Subject: [PATCH 082/601] Report skipped nodes --- workflows/cp-leaveout/db/print-stats.sh | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/workflows/cp-leaveout/db/print-stats.sh b/workflows/cp-leaveout/db/print-stats.sh index 0d804a88..6b9eb3a9 100755 --- a/workflows/cp-leaveout/db/print-stats.sh +++ b/workflows/cp-leaveout/db/print-stats.sh @@ -10,13 +10,19 @@ fi DB=$1 -COMPLETE=$( +COMPLETE=$( sqlite3 $DB < Date: Wed, 16 Sep 2020 11:02:32 -0500 Subject: [PATCH 083/601] Rename abort() -> fail() --- workflows/cp-leaveout/scripts/extract-node-info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index de2af875..130f66b7 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -38,7 +38,7 @@ def read_log_filenames(log_list): line = line.strip() result.append(line) except IOError as e: - abort(e, os.EX_IOERR, "Could not read: " + log_list) + fail(e, os.EX_IOERR, "Could not read: " + log_list) return result def parse_logs(log_files): @@ -56,7 +56,7 @@ def parse_logs(log_files): parse_log(fp, nodes) index += 1 except IOError as e: - abort(e, os.EX_IOERR, "Could not read: " + log_file) + fail(e, os.EX_IOERR, "Could not read: " + log_file) return nodes def parse_log(log_fp, nodes): From 663e1b32b0bdd20eba3a7675ca4cd3eca5e36dd3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 16 Sep 2020 11:02:58 -0500 Subject: [PATCH 084/601] Handle missing python.log --- workflows/cp-leaveout/scripts/extract-node-info.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 130f66b7..e8b60bec 100644 --- 
a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -95,6 +95,8 @@ def parse_log(log_fp, nodes): def find_val_data(node): python_log = args.directory + "/run/%s/save/python.log" % node.id + if not os.path.exists(python_log): + return with open(python_log) as fp: node.parse_val_data(fp) if node.val_data == None: From 92817dcb7f1da2852c836d18fe3d2f5dd3917148 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 16 Sep 2020 11:03:22 -0500 Subject: [PATCH 085/601] Use modern syntax --- workflows/cp-leaveout/scripts/extract-stats.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-stats.sh b/workflows/cp-leaveout/scripts/extract-stats.sh index 707ff384..11ba980e 100755 --- a/workflows/cp-leaveout/scripts/extract-stats.sh +++ b/workflows/cp-leaveout/scripts/extract-stats.sh @@ -38,7 +38,7 @@ FORMAT="%-6s %-10s %-8s %-8s %-8s %-8s" sed "/Current time/ {s/Current time \.\.\.\.\(.*\)/\1/ ; h}; \$!d; x" \ $DIR/run/$RUN/model.log ) ) - if [ ${#STATS[@]} -gt 0 ] + if (( ${#STATS[@]} > 0 )) then printf " $FORMAT\n" $STAGE $RUN ${STATS[@]} fi From 4fc3cfce3a58dc5a113d2b6c0f0d8d133c7b45a8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 16 Sep 2020 11:05:24 -0500 Subject: [PATCH 086/601] New baseline-error-list.sh --- .../scripts/baseline-error-list.sh | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/baseline-error-list.sh diff --git a/workflows/cp-leaveout/scripts/baseline-error-list.sh b/workflows/cp-leaveout/scripts/baseline-error-list.sh new file mode 100755 index 00000000..eb5392bb --- /dev/null +++ b/workflows/cp-leaveout/scripts/baseline-error-list.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# BASELINE ERROR LIST SH +# WIP: Script to extract python.logs from a given DIR and STAGE + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + -H "and OUTPUT filename" \ + DIR STAGE OUTPUT - ${*} + +if [[ ! -d $DIR ]] +then + echo "$0: Given experiment directory does not exist: $DIR" + exit 1 +fi + +for F in experiments/X385/run/?.?.?.?.?.?/save/python.log +do + echo $( basename $( dirname $( dirname $F ) ) ) +done > $OUTPUT From c75c21ef6303b7a176346b6eb8208f23b5b933e4 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 16 Sep 2020 11:09:14 -0500 Subject: [PATCH 087/601] New baseline error workflow --- workflows/cp-leaveout/swift/baseline-error.sh | 200 ++++++++++++++++++ .../cp-leaveout/swift/baseline-error.swift | 66 ++++++ .../swift/compute_epochs_none.swift | 7 + workflows/cp-leaveout/test/test-bl-1.sh | 95 +++++++++ 4 files changed, 368 insertions(+) create mode 100755 workflows/cp-leaveout/swift/baseline-error.sh create mode 100644 workflows/cp-leaveout/swift/baseline-error.swift create mode 100644 workflows/cp-leaveout/swift/compute_epochs_none.swift create mode 100755 workflows/cp-leaveout/test/test-bl-1.sh diff --git a/workflows/cp-leaveout/swift/baseline-error.sh b/workflows/cp-leaveout/swift/baseline-error.sh new file mode 100755 index 00000000..66023e12 --- /dev/null +++ b/workflows/cp-leaveout/swift/baseline-error.sh @@ -0,0 +1,200 @@ +#! 
/usr/bin/env bash +set -eu + +# BASELINE ERROR SH +# Main entry point for baseline-error workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/Uno +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +PYTHONPATH=${PYTHONPATH:-}:$BENCHMARK_DIR + +SCRIPT_NAME=$(basename $0) + +export FRAMEWORK="keras" + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "baseline-error.sh:" \ + "usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME " +} + +if (( ${#} < 5 )) +then + usage + exit 1 +fi + +set -x +if ! { + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +shift 5 +WORKFLOW_ARGS=$* + +echo "WORKFLOW.SH: Running model: $MODEL_NAME for EXPID: $EXPID" + +set +x + +source_site env $SITE +source_site sched $SITE + +PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools, model_runner +APP_PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup +APP_PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools +APP_PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common # For Benchmarks + +export TURBINE_JOBNAME="JOB:${EXPID}" + +if [[ ${GPU_STRING:-} == "" ]] +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +CMD_LINE_ARGS=( --benchmark_timeout=$BENCHMARK_TIMEOUT + --site=$SITE + $GPU_ARG + $WORKFLOW_ARGS + ) + +if [[ $WORKFLOW_ARGS = "-r"* ]] +then + echo "Restart requested ..." + if [[ ! -d $TURBINE_OUTPUT ]] + then + echo "No prior run found! (tried $TURBINE_OUTPUT/output.txt)" + exit 1 + fi + if [[ ! -f $TURBINE_OUTPUT/output.txt ]] + then + # If output.txt does not exist, assume the moves already happened + echo "WARNING: The outputs were already moved from $EXPID" + else + next $TURBINE_OUTPUT/restarts/%i # cf. utils.sh:next() + PRIOR_RUN=$REPLY + echo "Moving old outputs to $PRIOR_RUN" + mkdir -pv $PRIOR_RUN + PRIORS=( $TURBINE_OUTPUT/output.txt + $TURBINE_OUTPUT/out + $TURBINE_OUTPUT/turbine* + $TURBINE_OUTPUT/jobid.txt ) + mv ${PRIORS[@]} $PRIOR_RUN + fi +else # Not a restart + if [[ -f $TURBINE_OUTPUT/output.txt ]] + then + echo "TURBINE_OUTPUT already exists- you must specify restart!" + echo "TURBINE_OUTPUT=$TURBINE_OUTPUT" + exit 1 + fi +fi + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -p $TURBINE_OUTPUT/run + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-baseline-error.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion." 
+fi + +# which python swift-t java + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. This will just be for + # warnings or unusual messages + STDOUT="" +fi + +TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +mkdir -pv $TURBINE_OUTPUT/out + +swift-t -O 0 -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -I $EMEWS_PROJECT_ROOT/swift \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e TURBINE_STDOUT=$TURBINE_STDOUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + -e TURBINE_DB_WORKERS=1 \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} | \ + tee $STDOUT + +# -j /usr/bin/java # Give this to Swift/T if needed for Java +# -e PYTHONUNBUFFERED=1 # May be needed if error output is being lost + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" + exit 1 +fi + +echo "WORKFLOW OK." +echo "EXIT CODE: 0" | tee -a $STDOUT diff --git a/workflows/cp-leaveout/swift/baseline-error.swift b/workflows/cp-leaveout/swift/baseline-error.swift new file mode 100644 index 00000000..96b5bf51 --- /dev/null +++ b/workflows/cp-leaveout/swift/baseline-error.swift @@ -0,0 +1,66 @@ + +/** + BASELINE ERROR SWIFT + Runs the given nodes in new output directory based on + the pre-processed data in another "reference" directory +*/ + +import assert; +import files; +import io; +import python; +import sys; + +import candle_utils; +report_env(); + +// == Command-line Arguments Begin == +// The big feather file or CSV +string dataframe_csv = argv("dataframe_csv"); +// Actual CP workflow output directory to use for data sources: +string reference = argv("reference"); +// List of node IDs, one per line +file file_nodes = input(argv("nodes")); +int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); +// == Command-line Arguments End == + +// == Environment Settings Begin == +string model_name = getenv("MODEL_NAME"); +string exp_id = getenv("EXPID"); +string turbine_output = getenv("TURBINE_OUTPUT"); +// == Environment Settings End == + +// Read file of node IDs +string lines[] = file_lines(file_nodes); + +// Resultant output values: +string results[]; + +// Basic parameters for all runs as JSON. +// Keys node and use_exported_data must be filled in later. 
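+// For illustration only (node ID and paths below are hypothetical), a
+// filled-in instance for node "1.2.3" would look like:
+//   { "config_file": "uno_auc_model.txt", "cache": "cache/top6_auc",
+//     "dataframe_from": "/path/to/top_21.feather",
+//     "save_weights": "save/model.h5", "gpus": "0", "epochs": 50,
+//     "es": "True", "node": "1.2.3",
+//     "use_exported_data": "<reference>/run/1.2.3/topN.uno.h5" }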
+string params_basic = +---- +{ +"config_file": "uno_auc_model.txt", +"cache": "cache/top6_auc", +"dataframe_from": "%s", +"save_weights": "save/model.h5", +"gpus": "0", +"epochs": 50, +"es": "True", +"node": "%s", +"use_exported_data": "%s" +} +----; + +// Evaluate each parameter set +foreach node, i in lines +{ + printf("node: %s", node); + // Fill in missing hyperparameters: + string training_data = "%s/run/%s/topN.uno.h5" % (reference, node); + string params = params_basic % (dataframe_csv, node, training_data); + // NOTE: obj() is in the obj_*.swift supplied by workflow.sh + results[i] = obj(params, node); + assert(results[i] != "EXCEPTION", "exception in obj()!"); +} diff --git a/workflows/cp-leaveout/swift/compute_epochs_none.swift b/workflows/cp-leaveout/swift/compute_epochs_none.swift new file mode 100644 index 00000000..907b500d --- /dev/null +++ b/workflows/cp-leaveout/swift/compute_epochs_none.swift @@ -0,0 +1,7 @@ + +/** + COMPUTE EPOCH NONE SWIFT + + This is a dummy module for workflows that do not actually + compute epochs. It has no code. +*/ diff --git a/workflows/cp-leaveout/test/test-bl-1.sh b/workflows/cp-leaveout/test/test-bl-1.sh new file mode 100755 index 00000000..626aa1c7 --- /dev/null +++ b/workflows/cp-leaveout/test/test-bl-1.sh @@ -0,0 +1,95 @@ +#!/bin/bash +set -eu + +# CP LEAVEOUT BASELINE TEST 1 + +usage() +{ + echo "Usage: test SITE EXPID WORKFLOW_ARGS" +} + +if (( ${#} == 0 )) +then + usage + exit 1 +fi + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +SITE=$1 +RUN_DIR=$2 +shift 2 +WORKFLOW_ARGS=$* + +export MODEL_NAME=uno # nt3 + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# Data files +# PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json +# DATAFRAME_CSV=/usb1/wozniak/CANDLE-Benchmarks-Data/top21_dataframe_8x8.csv + +# Data files +# SUMMIT: +# PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json +# SCRATCH=/gpfs/alpine/med106/scratch/hsyoo +SCRATCH=/gpfs/alpine/med106/scratch/wozniak +# SCRATCH=/usb2/wozniak +CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 +PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv +DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather +BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno +# BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +for f in $DATAFRAME_CSV +do + if [[ ! 
-f $f ]] + then + abort "$0: does not exist: $f" + fi +done + +export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" + +# Submit job +export WORKFLOW_SWIFT=baseline-error.swift +# set -x +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME none $WORKFLOW_ARGS \ + --dataframe_csv=$DATAFRAME_CSV \ + --benchmark_data=$BENCHMARK_DATA \ + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +OUTPUT=turbine-output/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +# Wait for job +queue_wait + +SCRIPT=$( basename $0 .sh ) +check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID +check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From 525ee446f87753d2308e258328c78ef3d480baf8 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Wed, 16 Sep 2020 22:24:41 -0400 Subject: [PATCH 088/601] Streamlined env-biowulf.sh --- workflows/common/sh/env-biowulf.sh | 73 ++++++++++++++---------------- 1 file changed, 33 insertions(+), 40 deletions(-) diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index a9505616..cd72a30a 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -1,37 +1,30 @@ #!/bin/bash -# Assume the candle module is loaded as usual +# Prerequisite: Assume the candle module is loaded as usual -# Load the environments for each MPI implementation -if [ "x$USE_OPENMPI" == "x1" ]; then # probably always use this on Biowulf as it's the best supported - #module load gcc/7.3.0 openmpi/3.1.2/cuda-9.0/gcc-7.3.0-pmi2 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 # Note I had to stop using openmpi/3.1.2/cuda-9.0/gcc-7.3.0-pmi2 because at least as of 6/19/19 Biowulf seemed to stop supporting it (it was available only as a "hidden" module) - #module load gcc/7.3.0 openmpi/3.1.3/cuda-9.2/gcc-7.3.0-pmi2 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 - - #module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/12.0.1 pcre2/10.21 GSL/2.6_gcc-9.2.0 # new stack on 8/14/20 - note, per my emails with Biowulf, they disabled development in PMI2 OpenMPI environments; further added pcre2/10.21 on 9/2/20 as otherwise installing Supervisor's R packages wouldn't work as R could not start at all; further added GSL/2.6_gcc-9.2.0 on 9/2/20 as otherwise the ggplot2 installation for Supervisor failed; this seems to work up to a "Source option 6 is no longer supported. 
Use 7 or later", seemingly Java-related error - #module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_211 pcre2/10.21 GSL/2.6_gcc-9.2.0 # going from java/12.0.1 --> java/1.8.0_211 seems to fix the Java-related error above - module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 ant/1.10.3 java/1.8.0_211 pcre2/10.21 GSL/2.6_gcc-9.2.0 # removing tcl_tk in order to use the one we just built on 9/12/20 - - export OMPI_MCA_mpi_warn_on_fork=0 -else - module load tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 - module remove openmpi/3.0.2/gcc-7.3.0 - module load gcc/7.2.0 - export LD_LIBRARY_PATH=/usr/local/slurm/lib:$LD_LIBRARY_PATH - export PATH=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/bin:$PATH - export LD_LIBRARY_PATH=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/lib:$LD_LIBRARY_PATH - export LIBDIR=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/lib:$LIBDIR - export CPATH=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/include:$CPATH -fi +#### Load the stack ################################################################################################################ +# Load the lmod environment modules +module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 ant/1.10.3 java/1.8.0_211 pcre2/10.21 GSL/2.6_gcc-9.2.0 +# Load the Tcl we built on 9/12/20 export PATH="/data/BIDS-HPC/public/software/builds/tcl/bin:$PATH" export LD_LIBRARY_PATH="/data/BIDS-HPC/public/software/builds/tcl/lib:$LD_LIBRARY_PATH" export MANPATH="/data/BIDS-HPC/public/software/builds/tcl/man:$MANPATH" +# Load R/4.0.0 paths manually since we can't load the module on the Biowulf submit nodes (part of new stack on 8/13/20) +export PATH="$PATH:/usr/local/apps/R/4.0/4.0.0/bin" +export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" +export R_LIBS_USER="$R_LIBS_USER:~/R/%v/library" +export R_LIBS_SITE="$CANDLE_DEP_R_SITE" +export R_LIBS="$CANDLE/R/libs" + + +#### Set variables for CANDLE dependencies (mostly, Swift/T dependencies) ########################################################## -# Set variables for CANDLE dependencies (mostly, Swift/T dependencies) +# This is for building CANDLE/Swift/T but it doesn't hurt to set these always export CANDLE_DEP_MPI="/usr/local/OpenMPI/4.0.4/CUDA-10.2/gcc-9.2.0" -#export CANDLE_DEP_TCL="/usr/local/Tcl_Tk/8.6.8/gcc_7.2.0" export CANDLE_DEP_TCL="/data/BIDS-HPC/public/software/builds/tcl" export CANDLE_DEP_PY="/usr/local/Anaconda/envs/py3.7" export CANDLE_DEP_R="/usr/local/apps/R/4.0/4.0.0/lib64/R" @@ -39,21 +32,12 @@ export CANDLE_DEP_R_SITE="/usr/local/apps/R/4.0/site-library_4.0.0" export CANDLE_DEP_ANT="/usr/local/apps/ant/1.10.3" export CANDLE_LAUNCHER_OPTION="--with-launcher=/usr/local/slurm/bin/srun" -# Load R/4.0.0 paths manually since we can't load the module on the Biowulf submit nodes (part of new stack on 8/13/20) -#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/GSL/gcc-7.2.0/2.4/lib:/usr/local/geos/3.6.2/lib:/usr/local/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64 -#export PATH=$PATH:/usr/local/GSL/gcc-7.2.0/2.4/bin:/usr/local/apps/R/3.5/3.5.0_build2/bin -#export R_LIBS_SITE=/usr/local/apps/R/3.5/site-library_build2 -#export R_LIBS_USER=~/R/%v/library -export PATH="$PATH:/usr/local/apps/R/4.0/4.0.0/bin" -export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" 
-export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" -export R_LIBS_USER="$R_LIBS_USER:~/R/%v/library" -export R_LIBS_SITE="$CANDLE_DEP_R_SITE" -export R_LIBS="$CANDLE/R/libs" -# Swift/T setup +#### Swift/T/MPI setup ############################################################################################################# + +# Basic Swift/T settings export SWIFT_T_INSTALL="$CANDLE/swift-t-install" -export PATH="$PATH:$SWIFT_T_INSTALL/stc/bin" # this is likely 1 of 2 lines needed to run swift-t out-of-the-box +export PATH="$PATH:$SWIFT_T_INSTALL/stc/bin" export PATH="$PATH:$SWIFT_T_INSTALL/turbine/bin" export PYTHONPATH="$PYTHONPATH:$SWIFT_T_INSTALL/turbine/py" export TURBINE_HOME="$SWIFT_T_INSTALL/turbine" @@ -61,19 +45,28 @@ export TURBINE_LOG="1" export ADLB_DEBUG_RANKS="1" export ADLB_DEBUG_HOSTMAP="1" export SWIFT_IMPL="app" + # Resident task workers and ranks if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ]; then export TURBINE_RESIDENT_WORK_WORKERS="1" export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) fi -# NOTE: Below is 2 of 2 lines needed to run swift-t out-of-the-box (no longer needed!!) -#export LD_PRELOAD=/usr/local/slurm/lib/libslurm.so:$LD_PRELOAD # this is the only way aside from recompiling Swift/T I believe to get past an error regarding /usr/local/slurm/lib/slurm/auth_munge.so, e.g., "/usr/local/Tcl_Tk/8.6.8/gcc_7.2.0/bin/tclsh8.6: symbol lookup error: /usr/local/slurm/lib/slurm/auth_munge.so: undefined symbol: slurm_debug" # Set up EMEWS Queues -export EQR="$CANDLE/Supervisor/workflows/common/ext/EQ-R" # I don’t know where else to find this directory that needs to be available, e.g., in workflow.sh +export EQR="$CANDLE/Supervisor/workflows/common/ext/EQ-R" export EQPy="$CANDLE/Supervisor/workflows/common/ext/EQ-Py" -# Other additions +# This is how Tim Miller told me to run interactive and batch MPI jobs on Biowulf GPU nodes recently (Aug/Sep 2020) +if [ "x$SLURM_JOB_PARTITION" == "xinteractive" ]; then + export TURBINE_LAUNCH_OPTIONS="--mpi=pmix --mem=0" +else + export TURBINE_LAUNCH_OPTIONS="--mpi=pmix" +fi + + +#### Miscellaneous settings/output ################################################################################################# + +# Add the Supervisor workflows scripts to the Python path export PYTHONPATH="$PYTHONPATH:$CANDLE/Supervisor/workflows/common/python" # Log settings to output From f97b9e4dbd0852d854d0e25e63410ab724485f60 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Thu, 17 Sep 2020 01:11:08 -0400 Subject: [PATCH 089/601] Finished streamlining and generalizing setup scripts and about to start testing them --- workflows/common/sh/env-biowulf.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index cd72a30a..74b0c984 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -2,8 +2,8 @@ # Prerequisite: Assume the candle module is loaded as usual -#### Load the stack ################################################################################################################ +#### Load the stack ################################################################################################################ # Load the lmod environment modules module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 ant/1.10.3 java/1.8.0_211 pcre2/10.21 GSL/2.6_gcc-9.2.0 @@ -19,10 +19,10 @@ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/intel/compilers_and_librarie 
export R_LIBS_USER="$R_LIBS_USER:~/R/%v/library" export R_LIBS_SITE="$CANDLE_DEP_R_SITE" export R_LIBS="$CANDLE/R/libs" +#################################################################################################################################### #### Set variables for CANDLE dependencies (mostly, Swift/T dependencies) ########################################################## - # This is for building CANDLE/Swift/T but it doesn't hurt to set these always export CANDLE_DEP_MPI="/usr/local/OpenMPI/4.0.4/CUDA-10.2/gcc-9.2.0" export CANDLE_DEP_TCL="/data/BIDS-HPC/public/software/builds/tcl" @@ -31,10 +31,10 @@ export CANDLE_DEP_R="/usr/local/apps/R/4.0/4.0.0/lib64/R" export CANDLE_DEP_R_SITE="/usr/local/apps/R/4.0/site-library_4.0.0" export CANDLE_DEP_ANT="/usr/local/apps/ant/1.10.3" export CANDLE_LAUNCHER_OPTION="--with-launcher=/usr/local/slurm/bin/srun" +#################################################################################################################################### #### Swift/T/MPI setup ############################################################################################################# - # Basic Swift/T settings export SWIFT_T_INSTALL="$CANDLE/swift-t-install" export PATH="$PATH:$SWIFT_T_INSTALL/stc/bin" @@ -62,13 +62,14 @@ if [ "x$SLURM_JOB_PARTITION" == "xinteractive" ]; then else export TURBINE_LAUNCH_OPTIONS="--mpi=pmix" fi +#################################################################################################################################### #### Miscellaneous settings/output ################################################################################################# - # Add the Supervisor workflows scripts to the Python path export PYTHONPATH="$PYTHONPATH:$CANDLE/Supervisor/workflows/common/python" # Log settings to output command -v python || echo "WARNING: Program 'python' not found" command -v swift-t || echo "WARNING: Program 'swift-t' not found" +#################################################################################################################################### From 3cad0b2c80f587577e07997da1eb0c46a4957135 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Thu, 17 Sep 2020 02:11:52 -0400 Subject: [PATCH 090/601] Turned TURBINE_LAUNCH_OPTIONS back from an array into a string --- workflows/common/sh/env-biowulf.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index 74b0c984..5cafe6c6 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -58,8 +58,10 @@ export EQPy="$CANDLE/Supervisor/workflows/common/ext/EQ-Py" # This is how Tim Miller told me to run interactive and batch MPI jobs on Biowulf GPU nodes recently (Aug/Sep 2020) if [ "x$SLURM_JOB_PARTITION" == "xinteractive" ]; then + #export TURBINE_LAUNCH_OPTIONS=("--mpi=pmix" "--mem=0") export TURBINE_LAUNCH_OPTIONS="--mpi=pmix --mem=0" else + #export TURBINE_LAUNCH_OPTIONS=("--mpi=pmix") export TURBINE_LAUNCH_OPTIONS="--mpi=pmix" fi #################################################################################################################################### From accd3db576094ca73c4567266243659ff7e232b1 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Tue, 29 Sep 2020 12:22:13 -0400 Subject: [PATCH 091/601] Updated env-summit.sh as minimally as possible --- workflows/common/sh/env-summit.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index 
aa3c5cec..35d48182 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -48,10 +48,12 @@ R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R LD_LIBRARY_PATH+=:$R/lib PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 +LD_LIBRARY_PATH+=:/lib64 # we need this path to be before the $PY/lib one, which is added below, or else for compiling using mpicc we get the error "/usr/bin/uuidgen: /gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib/libuuid.so.1: no version information available (required by /usr/bin/uuidgen)" LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY -export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH +# ALW 9/28/20: This path is already added, albeit to the end rather than the beginning, in the LD_LIBRARY_PATH+=:$PY/lib line above +#export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH # EMEWS Queues for R EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R From f8ee70bf06f989593a6825193c0ef881af23257f Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Wed, 30 Sep 2020 17:20:13 -0400 Subject: [PATCH 092/601] Updated install-candle.R to not install plotly manually due to dependency errors that way and to install jsonlite automatically as was the original case --- workflows/common/R/install-candle.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/common/R/install-candle.R b/workflows/common/R/install-candle.R index 8e22607d..c8b8b520 100644 --- a/workflows/common/R/install-candle.R +++ b/workflows/common/R/install-candle.R @@ -14,8 +14,8 @@ r["CRAN"] <- "http://cran.wustl.edu/" options(repos = r) # Force Plotly 4.5.6 - not latest! Do not want shiny/httpuv, it does not work on Cooley! 
-install.packages("https://cran.r-project.org/src/contrib/Archive/plotly/plotly_4.5.6.tar.gz") -install.packages("https://cran.r-project.org/src/contrib/Archive/jsonlite/jsonlite_1.7.0.tar.gz") # ALW adding this on 9/12/20 (and removing jsonlite from PKGS list below) because sometime in the first two weeks of Sept 2020 the default jsonlite version became 1.7.1 and this seems to throw an error that looks to me like a bug that should be fixed with time; e.g., while everything worked in early Sept 2020 (probably 9/2/20), now on 9/12/20 I get this error: +#install.packages("https://cran.r-project.org/src/contrib/Archive/plotly/plotly_4.5.6.tar.gz") # This dies with a dependency error but plotly is installed anyway as a dependency of the following packages, so I'm putting it back into the PKGS list (ALW, 9/29/20) +#install.packages("https://cran.r-project.org/src/contrib/Archive/jsonlite/jsonlite_1.7.0.tar.gz") # ALW adding this on 9/12/20 (and removing jsonlite from PKGS list below) because sometime in the first two weeks of Sept 2020 the default jsonlite version became 1.7.1 and this seems to throw an error that looks to me like a bug that should be fixed with time; e.g., while everything worked in early Sept 2020 (probably 9/2/20), now on 9/12/20 I get this error: # * DONE (jsonlite) # 1): succeeded '/usr/local/apps/R/4.0/4.0.0/lib64/R/bin/R CMD INSTALL -l '/gpfs/gsfs9/users/BIDS-HPC/public/software/distributions/candle/dev_2/builds/R/libs' '/lscratch/64803361/Rtmpnd5yDC/downloaded_packages/jsonlite_1.7.1.tar.gz'' # The downloaded source packages are in @@ -30,6 +30,8 @@ install.packages("https://cran.r-project.org/src/contrib/Archive/jsonlite/jsonli # ****NOTE**** that I tried installing both plotly and jsonlite the normal way (in the PKGS below instead of a specific version above) and I got the same error PKGS <- list( + "plotly", + "jsonlite", "smoof", "rgenoud", "DiceKriging", From d37cae990162181d8877c7951f6ba154b2c481a4 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Mon, 5 Oct 2020 20:51:51 -0400 Subject: [PATCH 093/601] Made general changes to get CANDLE working on Biowulf --- workflows/common/sh/env-biowulf.sh | 46 +++++++++++++++++++----------- workflows/common/sh/env-summit.sh | 5 +++- workflows/upf/swift/workflow.sh | 2 +- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index 5cafe6c6..12c6b2f5 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -3,6 +3,18 @@ # Prerequisite: Assume the candle module is loaded as usual +#### Set variables for CANDLE dependencies (mostly, Swift/T dependencies) ########################################################## +# This is for building CANDLE/Swift/T but it doesn't hurt to set these always +export CANDLE_DEP_MPI="/usr/local/OpenMPI/4.0.4/CUDA-10.2/gcc-9.2.0" +export CANDLE_DEP_TCL="/data/BIDS-HPC/public/software/builds/tcl" +export CANDLE_DEP_PY="/usr/local/Anaconda/envs/py3.7" +export CANDLE_DEP_R="/usr/local/apps/R/4.0/4.0.0/lib64/R" +export CANDLE_DEP_R_SITE="/usr/local/apps/R/4.0/site-library_4.0.0" +export CANDLE_DEP_ANT="/usr/local/apps/ant/1.10.3" +export CANDLE_LAUNCHER_OPTION="--with-launcher=/usr/local/slurm/bin/srun" +#################################################################################################################################### + + #### Load the stack ################################################################################################################ # Load the lmod 
environment modules module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 ant/1.10.3 java/1.8.0_211 pcre2/10.21 GSL/2.6_gcc-9.2.0 @@ -16,24 +28,18 @@ export MANPATH="/data/BIDS-HPC/public/software/builds/tcl/man:$MANPATH" export PATH="$PATH:/usr/local/apps/R/4.0/4.0.0/bin" export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" -export R_LIBS_USER="$R_LIBS_USER:~/R/%v/library" +#export R_LIBS_USER="$R_LIBS_USER:~/R/%v/library" +# What we want: if $R_LIBS_USER is set then R_LIBS_USER="$R_LIBS_USER:~/R/%v/library", otherwise R_LIBS_USER="~/R/%v/library" +if [ -z ${R_LIBS_USER+x} ]; then + R_LIBS_USER="~/R/%v/library" +else + R_LIBS_USER="$R_LIBS_USER:~/R/%v/library" +fi export R_LIBS_SITE="$CANDLE_DEP_R_SITE" export R_LIBS="$CANDLE/R/libs" #################################################################################################################################### -#### Set variables for CANDLE dependencies (mostly, Swift/T dependencies) ########################################################## -# This is for building CANDLE/Swift/T but it doesn't hurt to set these always -export CANDLE_DEP_MPI="/usr/local/OpenMPI/4.0.4/CUDA-10.2/gcc-9.2.0" -export CANDLE_DEP_TCL="/data/BIDS-HPC/public/software/builds/tcl" -export CANDLE_DEP_PY="/usr/local/Anaconda/envs/py3.7" -export CANDLE_DEP_R="/usr/local/apps/R/4.0/4.0.0/lib64/R" -export CANDLE_DEP_R_SITE="/usr/local/apps/R/4.0/site-library_4.0.0" -export CANDLE_DEP_ANT="/usr/local/apps/ant/1.10.3" -export CANDLE_LAUNCHER_OPTION="--with-launcher=/usr/local/slurm/bin/srun" -#################################################################################################################################### - - #### Swift/T/MPI setup ############################################################################################################# # Basic Swift/T settings export SWIFT_T_INSTALL="$CANDLE/swift-t-install" @@ -47,7 +53,7 @@ export ADLB_DEBUG_HOSTMAP="1" export SWIFT_IMPL="app" # Resident task workers and ranks -if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ]; then +if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ]; then # if $TURBINE_RESIDENT_WORK_WORKERS is unset... 
export TURBINE_RESIDENT_WORK_WORKERS="1" export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) fi @@ -57,13 +63,19 @@ export EQR="$CANDLE/Supervisor/workflows/common/ext/EQ-R" export EQPy="$CANDLE/Supervisor/workflows/common/ext/EQ-Py" # This is how Tim Miller told me to run interactive and batch MPI jobs on Biowulf GPU nodes recently (Aug/Sep 2020) -if [ "x$SLURM_JOB_PARTITION" == "xinteractive" ]; then +#if [ "x$SLURM_JOB_PARTITION" == "xinteractive" ]; then +if [ "x${SLURM_JOB_PARTITION:-batch}" == "xinteractive" ]; then #export TURBINE_LAUNCH_OPTIONS=("--mpi=pmix" "--mem=0") - export TURBINE_LAUNCH_OPTIONS="--mpi=pmix --mem=0" + export TURBINE_LAUNCH_OPTIONS+=" --mpi=pmix --mem=0" else #export TURBINE_LAUNCH_OPTIONS=("--mpi=pmix") - export TURBINE_LAUNCH_OPTIONS="--mpi=pmix" + export TURBINE_LAUNCH_OPTIONS+=" --mpi=pmix" fi + + +export TURBINE_MPI_THREAD=0 + + #################################################################################################################################### diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index 35d48182..67c480aa 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -53,7 +53,10 @@ LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY # ALW 9/28/20: This path is already added, albeit to the end rather than the beginning, in the LD_LIBRARY_PATH+=:$PY/lib line above -#export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH +#export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$ + +# ALW 10/1/20: Adding this per Justin and my experiments and discussion on 9/30/20 and 10/1/20 +export LD_LIBRARY_PATH="/sw/summit/gcc/6.4.0/lib64:$LD_LIBRARY_PATH" # EMEWS Queues for R EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index af26762c..e0a63e1a 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -110,7 +110,7 @@ swift-t -n $PROCS \ -e MODEL_NAME \ -e OBJ_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ - -e TURBINE_MPI_THREAD=1 \ + -e TURBINE_MPI_THREAD=${TURBINE_MPI_THREAD:-1} \ $( python_envs ) \ -e TURBINE_STDOUT=$TURBINE_STDOUT \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ From 96bd143593ef7a7fa2fb2356dd2d58c4cbc8d914 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 13 Oct 2020 11:24:00 -0500 Subject: [PATCH 094/601] Loss comparator scripts --- .../cp-leaveout/scripts/compare-losses.py | 34 +++++++++++++++++++ .../cp-leaveout/scripts/compare-losses.sh | 30 ++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 workflows/cp-leaveout/scripts/compare-losses.py create mode 100755 workflows/cp-leaveout/scripts/compare-losses.sh diff --git a/workflows/cp-leaveout/scripts/compare-losses.py b/workflows/cp-leaveout/scripts/compare-losses.py new file mode 100644 index 00000000..37e6fd52 --- /dev/null +++ b/workflows/cp-leaveout/scripts/compare-losses.py @@ -0,0 +1,34 @@ + +# COMPARE LOSSES PY + +# Input: Provide two experiment directories +# Output: Stream of NODE_ID LOSS1 LOSS2 + +import argparse, pickle + +parser = argparse.ArgumentParser(description='Parse all log files') +parser.add_argument('directory1', + help='The 1st experiment directory (EXPID)') +parser.add_argument('directory2', + help='The 2nd experiment directory (EXPID)') + +args = parser.parse_args() + +# logging.basicConfig(level=logging.DEBUG, format="%(message)s") +# logger = logging.getLogger("extract_node_info") + +node_pkl_1 = 
args.directory1 + "/node-info.pkl" +node_pkl_2 = args.directory2 + "/node-info.pkl" + +with open(node_pkl_1, "rb") as fp: + nodes_1 = pickle.load(fp) +with open(node_pkl_2, "rb") as fp: + nodes_2 = pickle.load(fp) +# print("%i %i" % (len(nodes_1), len(nodes_2))) + +count = 1 +for node_id in nodes_2: + loss_1 = nodes_1[node_id].val_loss + loss_2 = nodes_2[node_id].val_loss + print("%2i %s %8.7f %8.7f" % (count, node_id, loss_1, loss_2)) + count += 1 diff --git a/workflows/cp-leaveout/scripts/compare-losses.sh b/workflows/cp-leaveout/scripts/compare-losses.sh new file mode 100755 index 00000000..7636c09b --- /dev/null +++ b/workflows/cp-leaveout/scripts/compare-losses.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -eu + +# COMPARE LOSSES SH +# Compare losses from $DIR1/node-info.pkl and $DIR2/node-info.pkl + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide 2 experiment DIRs (e.g., .../experiments/X042)!" \ + DIR1 DIR2 - ${*} + +for DIR in $DIR1 $DIR2 +do + if [[ ! -d $DIR ]] + then + echo "$0: Given experiment directory does not exist: $DIR" + exit 1 + fi +done + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/compare-losses.py $DIR1 $DIR2 > compared-losses.txt +awk '{print $3, $4}' < compared-losses.txt > compared-losses.data +sort -n compared-losses.data | nl --number-width=2 \ + > compared-losses-sorted.data From 1938ce35ee15c5470f28bd2dbdb7519f22a61c15 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 14 Oct 2020 14:59:05 -0500 Subject: [PATCH 095/601] Better logging features in Node.py --- workflows/cp-leaveout/scripts/Node.py | 52 ++++++++++++++++++--------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index f8da4629..a4d60215 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -3,6 +3,8 @@ # The training node information as stored in the logs # See the footer of this file for example log text that is parsed here +# This class must remain simple enough to pickle +# thus it cannot contain its own logger (Python 3.6 issue 30520) # import math @@ -11,7 +13,7 @@ class Node: # TensorFlow is done when you see this training_done = "[==============================]" - def __init__(self, id=None): + def __init__(self, id=None, logger=None): # The ID is e.g.: "1.2.3" self.id = id # Use string length of id to deduce stage: @@ -39,19 +41,23 @@ def __init__(self, id=None): self.stopped_early = False # Did training complete for this node? self.complete = False - self.verbose = False - self.debug("START: " + str(self)) + # Can disable logging here: + self.verbose = True + self.debug(logger, "START: " + str(self)) - def set_id(self, id): + def set_id(self, id, logger=None): self.id = id self.stage = (len(self.id) - 1 ) // 2 - self.debug("SET ID: " + id) + self.debug(logger, "SET ID: " + id) def parent(self): if self.stage == 1: return None return self.id[0:-2] + def __repr__(self): + return self.__str__() + def __str__(self): special = "" if not self.complete: @@ -91,36 +97,39 @@ def maybe_str_float(f, spec): return "?" 
return spec % f - def parse_epochs(self, line): + def parse_epochs(self, line, logger=None): tokens = line.split() self.epochs_planned = int(tokens[-1].strip()) - self.debug("epochs_planned: %i" % self.epochs_planned) + self.debug(logger, "epochs_planned: %i" % self.epochs_planned) - def parse_epoch_status(self, line): + def parse_epoch_status(self, line, logger=None): tokens = line.split() assert len(tokens) == 2, "bad line: " + line ints = tokens[1].split("/") assert len(tokens) == 2 self.epochs_actual = int(ints[0]) - self.debug("epochs_actual: " + str(self.epochs_actual)) + self.trace(logger, "epochs_actual: " + str(self.epochs_actual)) - def stop_early(self): + def stop_early(self, logger=None): self.stopped_early = True - self.debug("STOP EARLY") + self.debug(logger, "STOP EARLY") def parse_date_start(self, line): tokens = line.split() self.date_start = tokens[0] + " " + tokens[1] - def parse_date_stop(self, line): + def parse_date_stop(self, line, logger=None): tokens = line.split() self.date_stop = tokens[0] + " " + tokens[1] + if self.epochs_planned == None: + self.debug(logger, "STOP : epochs_planned=None") + return if self.epochs_actual == self.epochs_planned or \ self.stopped_early: self.complete = True - self.debug("COMPLETE") + self.debug(logger, "COMPLETE") - def parse_training_done(self, line): + def parse_training_done(self, line, logger=None): # The current epoch should already be set # by parse_epoch_status() # First, find the location of training_done (td) @@ -164,10 +173,19 @@ def get_val_loss_delta(node): raise ValueError("No val_loss_delta!") return node.val_loss_delta - def debug(self, message): - if not self.verbose: + def debug(self, logger, message): + # assert(logger != None) # Use this to find missing loggers + if logger == None or not self.verbose: + return + logger.debug("NODE: [%s] %s" % (self.id, message)) + + def trace(self, logger, message): + # assert(logger != None) # Use this to find missing loggers + if logger == None or not self.verbose: return - print("NODE: " + message) + import logging + logger.log(level=logging.DEBUG-5, + msg=("NODE: [%s] %s" % (self.id, message))) def total_time(self, nodes): parent = self.parent() From c4f05e368621a38f7ac35cc6d49ffeaf3edef7c5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 15 Oct 2020 14:34:42 -0500 Subject: [PATCH 096/601] Dedup some entries (fixed bad merge?) 
--- workflows/cp-leaveout/scripts/Node.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index a4d60215..7ff8a662 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -21,14 +21,12 @@ def __init__(self, id=None, logger=None): # Number of training steps performed self.steps = 0 self.loss = None - # Difference wrt parent (lower is better) - self.loss_delta = None self.val_loss = None - # Validation set size - self.val_data = None # Differences wrt parent (lower is better) self.loss_delta = None self.val_loss_delta = None + # Validation set size + self.val_data = None # Epochs prescribed by the workflow self.epochs_planned = None # Epochs actually run (consider early stopping) From 4328b2023ed43c8584d02dd48cf0d92a0bba4a14 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 15 Oct 2020 14:34:58 -0500 Subject: [PATCH 097/601] Document Pickle structure --- workflows/cp-leaveout/scripts/README.adoc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/cp-leaveout/scripts/README.adoc b/workflows/cp-leaveout/scripts/README.adoc index eb7f7af6..ba96708e 100644 --- a/workflows/cp-leaveout/scripts/README.adoc +++ b/workflows/cp-leaveout/scripts/README.adoc @@ -40,6 +40,8 @@ This avoids needing to walk all logs all the time (which takes tens of seconds). $ scripts/extract-node-info.sh $D ---- +The data structure in the Pickle is a simple dictionary mapping node ID strings e.g. "1.2.3.4" to object of type Node. + ==== Print Node info (print-node-info) Prints a big table of all Node statistics using the Node Pickle. From 41ef89831b44283c29933bdb409d43f12a8d82e1 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 15 Oct 2020 14:35:23 -0500 Subject: [PATCH 098/601] Improve logging --- .../cp-leaveout/scripts/extract-node-info.py | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index e8b60bec..9947e47e 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -8,6 +8,7 @@ # See Node.py for the data structure import argparse, logging, os, pickle, sys +import pprint from utils import fail from Node import Node @@ -21,7 +22,8 @@ log_list = args.directory + "/log-list.txt" node_pkl = args.directory + "/node-info.pkl" -logging.basicConfig(level=logging.INFO, format="%(message)s") +logging.basicConfig(level=logging.DEBUG, format="%(message)s") +logger = logging.getLogger("extract_node_info") def read_log_filenames(log_list): result = [] @@ -44,14 +46,14 @@ def read_log_filenames(log_list): def parse_logs(log_files): # Dict mapping Node id to Node for all complete Nodes nodes = {} - logging.warning("Opening %i log files..." % len(log_files)) + logger.warning("Opening %i log files..." 
% len(log_files)) try: total = len(log_files) index = 0 for log_file in log_files: progress = "%4i/%4i (%2.f%%)" % \ (index, total, 100.0*index/total) - logging.info("Opening: %12s %s" % (progress, log_file)) + logger.info("Opening: %12s %s" % (progress, log_file)) with open(log_file) as fp: parse_log(fp, nodes) index += 1 @@ -64,34 +66,41 @@ def parse_log(log_fp, nodes): node_current = None while True: line = log_fp.readline() + # print(line) if line == "": break if "PARAM UPDATE START" in line: - node_current = Node() + trace("New Node ...") + node_current = Node(logger=logger) node_current.parse_date_start(line) - if "MODEL RUNNER DEBUG node =" in line: + if "MODEL RUNNER DEBUG node =" in line: tokens = line.split() node_id = tokens[-1].strip() - node_current.set_id(node_id) - elif "MODEL RUNNER DEBUG epochs =" in line: - node_current.parse_epochs(line) + node_current.set_id(node_id, logger) + elif "MODEL RUNNER DEBUG epochs =" in line: + node_current.parse_epochs(line, logger) elif line.startswith("Epoch ") and "/" in line: - node_current.parse_epoch_status(line) + node_current.parse_epoch_status(line, logger) elif Node.training_done in line: - node_current.parse_training_done(line) + node_current.parse_training_done(line, logger) elif "early stopping" in line: if node_current != None: # TensorFlow may report early stopping even if at max epochs: node_current.stop_early() elif "DONE: run_id" in line: - node_current.parse_date_stop(line) + logger.debug("RUN DONE.") + node_current.parse_date_stop(line, logger) if node_current != None and node_current.complete: # Store a complete Node in global dict nodes + logger.debug("NODE DONE.") nodes[node_current.id] = node_current find_val_data(node_current) nodes_found += 1 node_current = None - logging.info("Found %i nodes in log." % nodes_found) + logger.info("Found %i nodes in log." % nodes_found) + +def trace(message): + logger.log(level=logging.DEBUG-5, msg=message) def find_val_data(node): python_log = args.directory + "/run/%s/save/python.log" % node.id @@ -100,16 +109,16 @@ def find_val_data(node): with open(python_log) as fp: node.parse_val_data(fp) if node.val_data == None: - logging.fatal("Could not find val data for node: " + node.id) + logger.fatal("Could not find val data for node: " + node.id) # List of log file names log_files = read_log_filenames(log_list) # Dict mapping Node id to Node for all complete Nodes nodes = parse_logs(log_files) -logging.warning("Found %i nodes in total." % len(nodes)) +logger.warning("Found %i nodes in total." % len(nodes)) with open(node_pkl, "wb") as fp: pickle.dump(nodes, fp) -logging.warning("Wrote %s ." % node_pkl) +logger.warning("Wrote pickle: %s ." 
% node_pkl) From 147a29eede25b27c645ef3fd81652b0f0a53c9f7 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 10:56:29 -0500 Subject: [PATCH 099/601] Debug at INFO by default --- workflows/cp-leaveout/scripts/extract-node-info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 9947e47e..1c010fda 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -22,7 +22,7 @@ log_list = args.directory + "/log-list.txt" node_pkl = args.directory + "/node-info.pkl" -logging.basicConfig(level=logging.DEBUG, format="%(message)s") +logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger("extract_node_info") def read_log_filenames(log_list): From c4d749ccb072940bf95aeba3b77a37b1845e67dd Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 10:56:48 -0500 Subject: [PATCH 100/601] Bigger limit (catch all by default) --- workflows/cp-leaveout/scripts/extract-node-info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 1c010fda..ab35e5ae 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -28,7 +28,7 @@ def read_log_filenames(log_list): result = [] count = 0 - limit = 2000 # Reduce this for debugging + limit = 5000 # Reduce this for debugging try: with open(log_list) as fp: for line in fp.readlines(): From 291478e58cdd3cd06811389e8e714d729a37ea93 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 10:57:13 -0500 Subject: [PATCH 101/601] Indicate data definition --- workflows/cp-leaveout/scripts/extract-node-info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index ab35e5ae..fad8f9d2 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -44,7 +44,7 @@ def read_log_filenames(log_list): return result def parse_logs(log_files): - # Dict mapping Node id to Node for all complete Nodes + # Dict mapping Node id to Node for all complete Nodes: nodes = {} logger.warning("Opening %i log files..." % len(log_files)) try: From 0132c981e833dfe335fc9570f70357e586c5aa0c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 10:57:44 -0500 Subject: [PATCH 102/601] Disambiguate from python.log files --- workflows/cp-leaveout/scripts/extract-node-info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index fad8f9d2..7b692621 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -46,7 +46,7 @@ def read_log_filenames(log_list): def parse_logs(log_files): # Dict mapping Node id to Node for all complete Nodes: nodes = {} - logger.warning("Opening %i log files..." % len(log_files)) + logger.warning("Opening %i out.txt files..." 
% len(log_files)) try: total = len(log_files) index = 0 From c3223454d2226ad88936b0b347ddc540b16d94f1 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 10:58:28 -0500 Subject: [PATCH 103/601] Update log parser --- .../cp-leaveout/scripts/extract-node-info.py | 55 ++++++++++++------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 7b692621..bcb1bcf6 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -68,16 +68,26 @@ def parse_log(log_fp, nodes): line = log_fp.readline() # print(line) if line == "": break - if "PARAM UPDATE START" in line: - trace("New Node ...") - node_current = Node(logger=logger) - node_current.parse_date_start(line) - if "MODEL RUNNER DEBUG node =" in line: - tokens = line.split() - node_id = tokens[-1].strip() - node_current.set_id(node_id, logger) - elif "MODEL RUNNER DEBUG epochs =" in line: - node_current.parse_epochs(line, logger) + if "DONE: run_id" in line: + # This is also a MODEL RUNNER line, + # but could be DEBUG or INFO + # (should be INFO in future) + logger.debug("RUN DONE.") + node_current.parse_date_stop(line, logger) + elif "MODEL RUNNER" in line: + # print(line.strip()) + if "DEBUG" in line: + if "PARAM UPDATE START" in line: + trace("New Node ...") + node_current = Node(logger=logger) + node_current.parse_date_start(line) + elif " node =" in line: + print(line) + tokens = line.split() + node_id = tokens[-1].strip() + node_current.set_id(node_id, logger) + elif " epochs =" in line: + node_current.parse_epochs(line, logger) elif line.startswith("Epoch ") and "/" in line: node_current.parse_epoch_status(line, logger) elif Node.training_done in line: @@ -86,14 +96,12 @@ def parse_log(log_fp, nodes): if node_current != None: # TensorFlow may report early stopping even if at max epochs: node_current.stop_early() - elif "DONE: run_id" in line: - logger.debug("RUN DONE.") - node_current.parse_date_stop(line, logger) if node_current != None and node_current.complete: # Store a complete Node in global dict nodes - logger.debug("NODE DONE.") + # logger.debug("NODE DONE.") nodes[node_current.id] = node_current - find_val_data(node_current) + # find_val_data(node_current) # old format? 
+ find_error_data(node_current) nodes_found += 1 node_current = None @@ -102,14 +110,23 @@ def parse_log(log_fp, nodes): def trace(message): logger.log(level=logging.DEBUG-5, msg=message) -def find_val_data(node): +# def find_val_data(node): +# python_log = args.directory + "/run/%s/save/python.log" % node.id +# if not os.path.exists(python_log): +# return +# with open(python_log) as fp: +# node.parse_val_data(fp) +# if node.val_data == None: +# logger.fatal("Could not find val data for node: " + node.id) + +def find_error_data(node): python_log = args.directory + "/run/%s/save/python.log" % node.id if not os.path.exists(python_log): return with open(python_log) as fp: - node.parse_val_data(fp) - if node.val_data == None: - logger.fatal("Could not find val data for node: " + node.id) + node.parse_error_data(fp) + if node.mse == None: + logger.fatal("Could not find error data for node: " + node.id) # List of log file names log_files = read_log_filenames(log_list) From 928c171a427882c6867c17301ef1d6184f2a3955 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 10:58:51 -0500 Subject: [PATCH 104/601] Handle more error stats --- workflows/cp-leaveout/scripts/Node.py | 63 +++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 7ff8a662..11777819 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -20,8 +20,13 @@ def __init__(self, id=None, logger=None): self.stage = None # Number of training steps performed self.steps = 0 - self.loss = None + # Various error metrics: + self.loss = None self.val_loss = None + self.mse = None + self.mae = None + self.r2 = None + self.corr = None # Differences wrt parent (lower is better) self.loss_delta = None self.val_loss_delta = None @@ -78,13 +83,25 @@ def str_table(self): special = " INCOMPLETE!" if self.stopped_early: special = " EARLY STOP!" - return "%-12s : %i : %2i / %2i : loss: %0.5f vl: %0.5f : %s - %s : %s" % \ + return "%-12s : %i : %2i / %2i : %s - %s : %s : %s" % \ (self.id, self.stage, self.epochs_actual, self.epochs_planned, - self.loss, self.val_loss, self.date_start, self.date_stop, + self.str_errors(), special) + def str_errors(self): + ''' Return errors as big string ''' + fmt = "%0.6f" + s = ("loss: %s vl: %s mse: %s mae: %s r2: %s corr: %s") % \ + (Node.maybe_str_float(self.loss, fmt), + Node.maybe_str_float(self.val_loss, fmt), + Node.maybe_str_float(self.mse, fmt), + Node.maybe_str_float(self.mae, fmt), + Node.maybe_str_float(self.r2, fmt), + Node.maybe_str_float(self.corr, fmt)) + return s + def maybe_str_integer(i): if i is None: return "?" @@ -98,7 +115,7 @@ def maybe_str_float(f, spec): def parse_epochs(self, line, logger=None): tokens = line.split() self.epochs_planned = int(tokens[-1].strip()) - self.debug(logger, "epochs_planned: %i" % self.epochs_planned) + self.trace(logger, "epochs_planned: %i" % self.epochs_planned) def parse_epoch_status(self, line, logger=None): tokens = line.split() @@ -161,6 +178,33 @@ def parse_val_data(self, fp): value_string = tail[:comma] self.val_data = int(value_string) + def parse_error_data(self, fp): + """ + fp is the file pointer to save/python.log + If lines are not found, node.mse, etc., will remain None + """ + marker = "Comparing y_true " + # The marker is just after the date: + # We search this way for speed. 
+ date_len = len("YYYY-MM-DD HH:MM:SS ") # trailing space + while True: + line = fp.readline() + if line == "": break + if line.startswith(marker, date_len): + line = fp.readline() + tokens = check_token(line, 2, "mse:") + self.mse = float(tokens[3]) + line = fp.readline() + tokens = check_token(line, 2, "mae:") + self.mae = float(tokens[3]) + line = fp.readline() + tokens = check_token(line, 2, "r2:") + self.r2 = float(tokens[3]) + line = fp.readline() + tokens = check_token(line, 2, "corr:") + self.corr = float(tokens[3]) + # Loop! We want the last such values in the file + def get_loss_delta(node): if node.loss_delta == None: raise ValueError("No loss_delta!") @@ -191,6 +235,17 @@ def total_time(self, nodes): return self.time return self.time + nodes[parent].total_time(nodes) +def check_token(line, index, token): + tokens = line.split() + if tokens[index] != token: + raise Exception(("could not find token: '%s'\n" + + "in line: '%s'") % (token, line)) + return tokens + +def check(condition, message): + if not condition: + raise Exception(message) + ''' EXAMPLES: From 7523deddaadbffd931e0c953224580080053a521 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 10:59:24 -0500 Subject: [PATCH 105/601] Add comments --- workflows/cp-leaveout/scripts/print-node-info.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/workflows/cp-leaveout/scripts/print-node-info.py b/workflows/cp-leaveout/scripts/print-node-info.py index 63092d9a..e29ca4e5 100644 --- a/workflows/cp-leaveout/scripts/print-node-info.py +++ b/workflows/cp-leaveout/scripts/print-node-info.py @@ -20,8 +20,10 @@ except IOError as e: fail(e, os.EX_IOERR, "Could not read: " + node_pkl) +# Raw data printing: +# print(len(data)) # print(data) -for item in data.values(): - print(item.str_table()) -# print(len(data)) +# Print the node info! 
+for node in data.values(): + print(node.str_table()) From 0964b4bfe50a35ac98639246d5c11876491efedc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 11:00:24 -0500 Subject: [PATCH 106/601] Add comment --- workflows/cp-leaveout/scripts/print-node-info.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/cp-leaveout/scripts/print-node-info.sh b/workflows/cp-leaveout/scripts/print-node-info.sh index a9f89bf2..db36d50c 100755 --- a/workflows/cp-leaveout/scripts/print-node-info.sh +++ b/workflows/cp-leaveout/scripts/print-node-info.sh @@ -5,6 +5,7 @@ set -eu # Input: Provide an experiment directory # Output: Node information printed to screen (pipe this into less) +# See Node.str_table() for the output format THIS=$( readlink --canonicalize $( dirname $0 ) ) From 365d55824038a17fada54ba9ffdee8f6d549c3fc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 11:03:10 -0500 Subject: [PATCH 107/601] New compare-errors script --- .../cp-leaveout/scripts/compare-errors.py | 67 +++++++++++++++++++ .../cp-leaveout/scripts/compare-errors.sh | 15 +++++ 2 files changed, 82 insertions(+) create mode 100644 workflows/cp-leaveout/scripts/compare-errors.py create mode 100755 workflows/cp-leaveout/scripts/compare-errors.sh diff --git a/workflows/cp-leaveout/scripts/compare-errors.py b/workflows/cp-leaveout/scripts/compare-errors.py new file mode 100644 index 00000000..cd1aa392 --- /dev/null +++ b/workflows/cp-leaveout/scripts/compare-errors.py @@ -0,0 +1,67 @@ + +# COMPARE ERRORS PY + +# Input: Provide two experiment DIRECTORIES and OUTPUT file +# Output: NODE_ID EPOCHS1 ERROR1 EPOCHS2 ERROR2 + +# Could easily be updated to pull out only one error stat +# (see commented code) + +import argparse, pickle + +parser = argparse.ArgumentParser(description="Parse all log files") +parser.add_argument("directory1", + help="The 1st experiment directory (EXPID)") +parser.add_argument("directory2", + help="The 2nd experiment directory (EXPID)") +# parser.add_argument("error", +# help="The error type to compare") +parser.add_argument("output", + help="The output file") + +args = parser.parse_args() + +# logging.basicConfig(level=logging.DEBUG, format="%(message)s") +# logger = logging.getLogger("extract_node_info") + +node_pkl_1 = args.directory1 + "/node-info.pkl" +node_pkl_2 = args.directory2 + "/node-info.pkl" + +# known_errors = ["mse", "mae", "r2", "corr"] +# if args.error not in known_errors: +# print("given error '%s' not in known errors: %s" % +# (args.error, known_errors)) +# exit(1) + +with open(node_pkl_1, "rb") as fp: + nodes_1 = pickle.load(fp) +with open(node_pkl_2, "rb") as fp: + nodes_2 = pickle.load(fp) +# print("%i %i" % (len(nodes_1), len(nodes_2))) + +def get_errors(node): + return "%f %f %f %f" % (node.mse, node.mae, node.r2, node.corr) + +# for node_id in nodes_1: +# print(node_id) +# exit(1) + +missing = 0 +count = 0 +with open(args.output, "w") as fp: + for node_id in nodes_2: + if node_id not in nodes_1: + print("missing: " + node_id) + missing += 1 + continue + count += 1 + epochs_1 = nodes_1[node_id].epochs_actual + errors_1 = get_errors(nodes_1[node_id]) + epochs_2 = nodes_2[node_id].epochs_actual + errors_2 = get_errors(nodes_2[node_id]) + fp.write("%2i %s %3i %s %3i %s\n" % (count, node_id, + epochs_1, errors_1, + epochs_2, errors_2)) + +print("compared: %2i" % count) +print("missing: %2i" % missing) diff --git a/workflows/cp-leaveout/scripts/compare-errors.sh b/workflows/cp-leaveout/scripts/compare-errors.sh new file mode 100755 index 
00000000..985ba3e1 --- /dev/null +++ b/workflows/cp-leaveout/scripts/compare-errors.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -eu + +# COMPARE ERRORS SH +# Compare errors from $DIR1/node-info.pkl and $DIR2/node-info.pkl +# See compare-errors.py + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/compare-errors.py $* From d80b7359d174eb642c0439a9d4054f116fd3b965 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 11:07:27 -0500 Subject: [PATCH 108/601] Improve comments, add logging TODOs --- workflows/common/python/model_runner.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index a38d06ae..209beb38 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -18,18 +18,16 @@ print("MODEL RUNNER...") -# Andrew: Adding the following line (switching the order of the following two lines) in order to append an arbitrary model's dependencies to the path *before* the benchmarks in order to accidentally use a benchmark dependency -# append ${MODEL_PYTHON_DIR} to $PATH if variable is set +# Set PYTHONPATH: +# Let MODEL_PYTHON_DIR override default Benchmarks model locations python_dir = os.getenv("MODEL_PYTHON_DIR") if python_dir: sys.path.append(python_dir) -# append ${BENCHMARKS_ROOT}/common to $PATH if variable is set benchmarks_root = os.getenv("BENCHMARKS_ROOT") if benchmarks_root: sys.path.append(benchmarks_root+"/common") -# import candle_lrn_crv - +# Report PYTHONPATH for debugging print("sys.path:") for i in range(0, len(sys.path)-1): print("%2i: %s" % (i, sys.path[i])) @@ -74,6 +72,7 @@ def import_pkg(framework, model_name): return pkg +# TODO: Separate INFO and DEBUG messages def log(msg): global logger logger.debug(msg) @@ -160,13 +159,15 @@ def run(hyper_parameter_map, obj_return): logger.info('specified config_file: "%s"' % config_file) params_arg = { 'default_model': config_file } - # params is a python dictionary + # params is a Python dictionary params = setup_params(pkg, hyper_parameter_map, params_arg) Ps = setup_perf(params) # Run the model! + log("PKG RUN START") history = pkg.run(params) + log("PKG RUN STOP") if framework == 'keras': runner_utils.keras_clear_session(framework) @@ -181,6 +182,7 @@ def run(hyper_parameter_map, obj_return): finish = time.time() duration = finish - start + # TODO: This should be on INFO log(" DONE: run_id %s in %0.2f seconds." 
% (hyper_parameter_map["run_id"], duration)) return (result, history_result) From a9adecf35640be6489e70c154519af8191961efb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 11:08:05 -0500 Subject: [PATCH 109/601] New node-times scripts --- workflows/cp-leaveout/scripts/node-times.py | 52 +++++++++++++++++++++ workflows/cp-leaveout/scripts/node-times.sh | 12 +++++ 2 files changed, 64 insertions(+) create mode 100644 workflows/cp-leaveout/scripts/node-times.py create mode 100755 workflows/cp-leaveout/scripts/node-times.sh diff --git a/workflows/cp-leaveout/scripts/node-times.py b/workflows/cp-leaveout/scripts/node-times.py new file mode 100644 index 00000000..d7833de2 --- /dev/null +++ b/workflows/cp-leaveout/scripts/node-times.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python + +# NODE TIMES PY +# + +import argparse, json, pickle + +import Node + +parser = argparse.ArgumentParser() +parser.add_argument('dir', type=str, + help='The directory with the node-info.pkl') +args = parser.parse_args() + +node_pkl = args.dir + "/" + "node-info.pkl" + +try: + with open(node_pkl, 'rb') as fp: + D = pickle.load(fp) +except Exception as e: + print("could not read PKL file: %s\n" % node_pkl + str(e)) + exit(1) + +# Each a (time, value) record +# value=1 means job start ; value=0 means job stop +events = [] + +import datetime + +for node_id in D.keys(): + node = D[node_id] + fmt = "%Y-%m-%d %H:%M:%S" + start = datetime.datetime.strptime(node.date_start, fmt).timestamp() + stop = datetime.datetime.strptime(node.date_stop, fmt).timestamp() + events.append((start, 1)) + events.append((stop, -1)) + +events.sort() + +node_times_data = args.dir + "/node-times.data" +load = 0 + +def scale(t): + offset = 1594305000 + return (t - offset)/3600 + +with open(node_times_data, "w") as fp: + if len(events) > 0: + for event in events: + fp.write("%12.1f %i\n" % (scale(event[0]), load)) + load = load + event[1] + fp.write("%12.1f %i\n" % (scale(event[0]), load)) diff --git a/workflows/cp-leaveout/scripts/node-times.sh b/workflows/cp-leaveout/scripts/node-times.sh new file mode 100755 index 00000000..5d0732f7 --- /dev/null +++ b/workflows/cp-leaveout/scripts/node-times.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +# NODE TIMES SH + +THIS=$( readlink --canonicalize $( dirname $0 ) ) +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. 
) + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/node-times.py $* From c9a99ae4bd7f29778d6a2b24fc4200774849c4c6 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 11:08:22 -0500 Subject: [PATCH 110/601] Script to prevent OLCF auto-deletion --- workflows/cp-leaveout/scripts/touch-all.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/touch-all.sh diff --git a/workflows/cp-leaveout/scripts/touch-all.sh b/workflows/cp-leaveout/scripts/touch-all.sh new file mode 100755 index 00000000..2b60e592 --- /dev/null +++ b/workflows/cp-leaveout/scripts/touch-all.sh @@ -0,0 +1,16 @@ +#!/bin/sh +set -eu + +# TOUCH ALL SH +# Touch all files in given experiment directories +# to prevent auto-deletion +# Finds dot files too + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +{ + for DIR in $* + do + nice find $DIR + done +} | $THIS/count-lines.awk | xargs -n 16 touch From e6ef6b278179f9808d0aab1e1222cd858b39e659 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Oct 2020 11:08:54 -0500 Subject: [PATCH 111/601] Timing script --- workflows/cp-leaveout/scripts/count-lines.awk | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/count-lines.awk diff --git a/workflows/cp-leaveout/scripts/count-lines.awk b/workflows/cp-leaveout/scripts/count-lines.awk new file mode 100755 index 00000000..cfd5e05f --- /dev/null +++ b/workflows/cp-leaveout/scripts/count-lines.awk @@ -0,0 +1,25 @@ +#!/usr/bin/awk -f + +# COUNT LINES AWK +# Like cat, but counts lines and time + +BEGIN { + t0 = systime() + count = 0 +} + +{ + print $0 + count++ +} + +END { + t1 = systime() + duration = t1 - t0 + if (duration == 0) + rate = "infinity" + else + rate = count/duration + print "count:", count, "in", duration, "seconds. 
rate:", rate \ + > "/dev/stderr" +} From 78a6eea2f5f66ef84f8476473a32ce5bc3aab6ca Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 22 Oct 2020 11:58:59 -0500 Subject: [PATCH 112/601] Update Swift/T for new opence010env --- workflows/common/sh/env-summit.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index 67c480aa..4a2b6a59 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -25,7 +25,8 @@ MED106=/gpfs/alpine/world-shared/med106 # SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 # SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 # SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 -SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-10-22 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH From f22d0f2bf2dfb58515969f8c74b841f97a254901 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 22 Oct 2020 12:55:36 -0500 Subject: [PATCH 113/601] New get_epochs_cumul() for cumulative epochs including parents --- workflows/cp-leaveout/scripts/Node.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 11777819..709b67e2 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -36,6 +36,8 @@ def __init__(self, id=None, logger=None): self.epochs_planned = None # Epochs actually run (consider early stopping) self.epochs_actual = 0 + # Epochs cumulative: include parents' epochs (CP weight-sharing) + self.epochs_cumul = None self.date_start = None self.date_stop = None # Training time in seconds @@ -235,6 +237,18 @@ def total_time(self, nodes): return self.time return self.time + nodes[parent].total_time(nodes) + def get_epochs_cumul(self, nodes): + ''' Epochs cumulative including parents' time ''' + if self.epochs_cumul != None: + return self.epochs_cumul + # Initialize: + self.epochs_cumul = self.epochs_actual + parent = self.parent() + if parent != None and parent in nodes: + # Add parents: + self.epochs_cumul += nodes[parent].get_epochs_cumul(nodes) + return self.epochs_cumul + def check_token(line, index, token): tokens = line.split() if tokens[index] != token: From ddb8d27fd987d371356ed5fa6f07a9e47ef33aed Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 22 Oct 2020 12:56:22 -0500 Subject: [PATCH 114/601] Rename in parallel with get_epochs_cumul() --- workflows/cp-leaveout/scripts/Node.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 709b67e2..6fc45432 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -231,11 +231,12 @@ def trace(self, logger, message): logger.log(level=logging.DEBUG-5, msg=("NODE: [%s] %s" % (self.id, message))) - def total_time(self, nodes): + def get_time_cumul(self, nodes): + ''' Time cumulative including parents' time ''' parent = self.parent() if parent == None: return self.time - return self.time + nodes[parent].total_time(nodes) + return self.time + nodes[parent].get_time_cumul(nodes) def get_epochs_cumul(self, nodes): ''' Epochs cumulative including parents' time ''' From 4c52ec98211b511b67b05e9f60c14af3eb1c765f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 22 Oct 2020 12:56:51 -0500 Subject: 
[PATCH 115/601] Use get_epochs_cumul() --- workflows/cp-leaveout/scripts/compare-errors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/compare-errors.py b/workflows/cp-leaveout/scripts/compare-errors.py index cd1aa392..4cd50ac7 100644 --- a/workflows/cp-leaveout/scripts/compare-errors.py +++ b/workflows/cp-leaveout/scripts/compare-errors.py @@ -55,9 +55,9 @@ def get_errors(node): missing += 1 continue count += 1 - epochs_1 = nodes_1[node_id].epochs_actual + epochs_1 = nodes_1[node_id].get_epochs_cumul(nodes_1) errors_1 = get_errors(nodes_1[node_id]) - epochs_2 = nodes_2[node_id].epochs_actual + epochs_2 = nodes_2[node_id].get_epochs_cumul(nodes_2) errors_2 = get_errors(nodes_2[node_id]) fp.write("%2i %s %3i %s %3i %s\n" % (count, node_id, epochs_1, errors_1, From 1cff9288a7801d7e02c42211af8d32ad4b6f4bd7 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 22 Oct 2020 12:57:55 -0500 Subject: [PATCH 116/601] Fix comment --- workflows/cp-leaveout/scripts/Node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 6fc45432..1360e460 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -239,7 +239,7 @@ def get_time_cumul(self, nodes): return self.time + nodes[parent].get_time_cumul(nodes) def get_epochs_cumul(self, nodes): - ''' Epochs cumulative including parents' time ''' + ''' Epochs cumulative including parents' epochs ''' if self.epochs_cumul != None: return self.epochs_cumul # Initialize: From e8b24f8bbc4c02ec038dfa6891e895afb6fcc396 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 29 Oct 2020 10:51:21 -0500 Subject: [PATCH 117/601] New summit files --- workflows/common/sh/env-summit-i.sh | 77 +++++++++++++++++++++++ workflows/common/sh/env-summit-tf1.sh | 74 ++++++++++++++++++++++ workflows/common/sh/env-summit-tf2.sh | 83 +++++++++++++++++++++++++ workflows/common/sh/sched-summit-i.sh | 7 +++ workflows/common/sh/sched-summit-tf1.sh | 19 ++++++ workflows/common/sh/sched-summit-tf2.sh | 20 ++++++ 6 files changed, 280 insertions(+) create mode 100644 workflows/common/sh/env-summit-i.sh create mode 100644 workflows/common/sh/env-summit-tf1.sh create mode 100644 workflows/common/sh/env-summit-tf2.sh create mode 100644 workflows/common/sh/sched-summit-i.sh create mode 100644 workflows/common/sh/sched-summit-tf1.sh create mode 100644 workflows/common/sh/sched-summit-tf2.sh diff --git a/workflows/common/sh/env-summit-i.sh b/workflows/common/sh/env-summit-i.sh new file mode 100644 index 00000000..fb3d61c1 --- /dev/null +++ b/workflows/common/sh/env-summit-i.sh @@ -0,0 +1,77 @@ + +# ENV Summit Interactive +# Environment settings for Summit (Swift, Python, R, Tcl, etc.) 
+# GCC 6.4.0, TensorFlow 1, condaenv-200408, R 3.6.1 + +# SWIFT_IMPL=echo +SWIFT_IMPL=py + +# Let modules initialize LD_LIBRARY_PATH before changing it: +set +eu # modules create errors outside our control +module load spectrum-mpi/10.3.1.2-20200121 +module unload darshan-runtime +# module load ibm-wml-ce/1.6.2-3 +module list +set -eu + +# From Wozniak +MED106=/gpfs/alpine/world-shared/med106 +# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-10-18 # Python (ibm-wml), no R +# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-11-06 # Python (ibm-wml) and R +# Python (ibm-wml-ce/1.7.0-1) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-03-31-c +# Python (ibm-wml-ce/1.6.2-3) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-02 +# Python (med106/sw/condaenv-200408) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 + +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# log_path PATH + +# IBM_WML_CE=/autofs/nccs-svm1_sw/summit/ibm-wml-ce/anaconda-base/envs/ibm-wml-ce-1.6.2-3 + +# export LD_LIBRARY_PATH +# LD_LIBRARY_PATH=$IBM_WML_CE/lib:$LD_LIBRARY_PATH + +# Inject Python to PATH using PRELAUNCH: +# This would be better, but is broken for ZSH users: +# module load ibm-wml-ce/1.6.2-3 +# Must use PATH directly: +# export TURBINE_PRELAUNCH="PATH=$IBM_WML_CE/bin:\$PATH" + +R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R +LD_LIBRARY_PATH+=:$R/lib + +PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 +# LD_LIBRARY_PATH+=:$PY/lib +export PYTHONHOME=$PY + +PATH=$PY/bin:$PATH + +export LD_LIBRARY_PATH=$PY/lib:$LD_LIBRARY_PATH + +# EMEWS Queues for R +EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R +EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Resident task worker count and rank list +# If this is already set, we respect the user settings +# If this is unset, we set it to 1 +# and run the algorithm on the 2nd highest rank +# This value is only read in HPO workflows +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi + diff --git a/workflows/common/sh/env-summit-tf1.sh b/workflows/common/sh/env-summit-tf1.sh new file mode 100644 index 00000000..89c07381 --- /dev/null +++ b/workflows/common/sh/env-summit-tf1.sh @@ -0,0 +1,74 @@ + +# ENV Summit TF1 +# Environment settings for Summit (Swift, Python, R, Tcl, etc.) 
+# GCC 6.4.0, TensorFlow 1, condaenv-200408, R 3.6.1 + +# SWIFT_IMPL=echo +SWIFT_IMPL=py + +# Let modules initialize LD_LIBRARY_PATH before changing it: +set +eu # modules create errors outside our control +module load spectrum-mpi/10.3.1.2-20200121 +module unload darshan-runtime +# module load ibm-wml-ce/1.6.2-3 +module list +set -eu + +# From Wozniak +MED106=/gpfs/alpine/world-shared/med106 +# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-10-18 # Python (ibm-wml), no R +# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-11-06 # Python (ibm-wml) and R +# Python (ibm-wml-ce/1.7.0-1) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-03-31-c +# Python (ibm-wml-ce/1.6.2-3) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-02 +# Python (med106/sw/condaenv-200408) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 + +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# log_path PATH + +# IBM_WML_CE=/autofs/nccs-svm1_sw/summit/ibm-wml-ce/anaconda-base/envs/ibm-wml-ce-1.6.2-3 + +# export LD_LIBRARY_PATH +# LD_LIBRARY_PATH=$IBM_WML_CE/lib:$LD_LIBRARY_PATH + +# Inject Python to PATH using PRELAUNCH: +# This would be better, but is broken for ZSH users: +# module load ibm-wml-ce/1.6.2-3 +# Must use PATH directly: +# export TURBINE_PRELAUNCH="PATH=$IBM_WML_CE/bin:\$PATH" + +R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R +LD_LIBRARY_PATH+=:$R/lib + +PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 +LD_LIBRARY_PATH+=:$PY/lib +export PYTHONHOME=$PY + +export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH + +# EMEWS Queues for R +EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R +EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Resident task worker count and rank list +# If this is already set, we respect the user settings +# If this is unset, we set it to 1 +# and run the algorithm on the 2nd highest rank +# This value is only read in HPO workflows +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi diff --git a/workflows/common/sh/env-summit-tf2.sh b/workflows/common/sh/env-summit-tf2.sh new file mode 100644 index 00000000..b2056bee --- /dev/null +++ b/workflows/common/sh/env-summit-tf2.sh @@ -0,0 +1,83 @@ + +# ENV Summit +# Environment settings for Summit (Swift, Python, R, Tcl, etc.) 
+# GCC 7.4.0, TensorFlow 2, opence010env, R 3.6.1 + +# SWIFT_IMPL=echo +SWIFT_IMPL=py + +# Let modules initialize LD_LIBRARY_PATH before changing it: +set +eu # modules create errors outside our control +module load spectrum-mpi/10.3.1.2-20200121 +module unload darshan-runtime +# module load ibm-wml-ce/1.6.2-3 +module list +set -eu + +# From Wozniak +MED106=/gpfs/alpine/world-shared/med106 +# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-10-18 # Python (ibm-wml), no R +# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-11-06 # Python (ibm-wml) and R +# Python (ibm-wml-ce/1.7.0-1) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-03-31-c +# Python (ibm-wml-ce/1.6.2-3) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-02 +# Python (med106/sw/condaenv-200408) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-10-22 + +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# log_path PATH + +# IBM_WML_CE=/autofs/nccs-svm1_sw/summit/ibm-wml-ce/anaconda-base/envs/ibm-wml-ce-1.6.2-3 + +# export LD_LIBRARY_PATH +# LD_LIBRARY_PATH=$IBM_WML_CE/lib:$LD_LIBRARY_PATH + +# Inject Python to PATH using PRELAUNCH: +# This would be better, but is broken for ZSH users: +# module load ibm-wml-ce/1.6.2-3 +# Must use PATH directly: +# export TURBINE_PRELAUNCH="PATH=$IBM_WML_CE/bin:\$PATH" + +R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R +LD_LIBRARY_PATH+=:$R/lib + +# PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 +PY=$MED106/sw2/opence010env +LD_LIBRARY_PATH+=:$PY/lib +LD_LIBRARY_PATH+=:/lib64 # we need this path to be before the $PY/lib one, which is added below, or else for compiling using mpicc we get the error "/usr/bin/uuidgen: /gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib/libuuid.so.1: no version information available (required by /usr/bin/uuidgen)" +export PYTHONHOME=$PY + +PATH=$PY/bin:$PATH + +# ALW 9/28/20: This path is already added, albeit to the end rather than the beginning, in the LD_LIBRARY_PATH+=:$PY/lib line above +#export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$ + +# ALW 10/1/20: Adding this per Justin and my experiments and discussion on 9/30/20 and 10/1/20 +export LD_LIBRARY_PATH="/sw/summit/gcc/7.4.0/lib64:$LD_LIBRARY_PATH:/sw/summit/gcc/6.4.0/lib64" + +# EMEWS Queues for R +EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R +EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Resident task worker count and rank list +# If this is already set, we respect the user settings +# If this is unset, we set it to 1 +# and run the algorithm on the 2nd highest rank +# This value is only read in HPO workflows +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi diff --git a/workflows/common/sh/sched-summit-i.sh b/workflows/common/sh/sched-summit-i.sh new file mode 100644 index 00000000..2ecbc13f --- /dev/null +++ b/workflows/common/sh/sched-summit-i.sh @@ -0,0 +1,7 @@ + +# SCHED Summit Interactive +# Scheduler settings for Swift/Summit + +MACHINE="" + +# export TURBINE_OUTPUT_SOFTLINK=/dev/null diff --git a/workflows/common/sh/sched-summit-tf1.sh b/workflows/common/sh/sched-summit-tf1.sh new file mode 100644 index 
00000000..726bae2f --- /dev/null +++ b/workflows/common/sh/sched-summit-tf1.sh @@ -0,0 +1,19 @@ + +# SCHED Summit TF1 +# Scheduler settings for Swift/Summit + +if (( ${INTERACTIVE:-0} )) +then + # Interactive settings + MACHINE="" + export TURBINE_LAUNCHER=jsrun +else + # Use LSF: + MACHINE="-m lsf" +fi + +# Default PROJECT for CANDLE +#export QUEUE=${QUEUE:-batch-hm} +export PROJECT=${PROJECT:-MED106} + +# export TURBINE_OUTPUT_SOFTLINK=/dev/null diff --git a/workflows/common/sh/sched-summit-tf2.sh b/workflows/common/sh/sched-summit-tf2.sh new file mode 100644 index 00000000..fe965bff --- /dev/null +++ b/workflows/common/sh/sched-summit-tf2.sh @@ -0,0 +1,20 @@ + +# SCHED Summit TF2 +# Scheduler settings for Swift/Summit + +if (( ${INTERACTIVE:-0} )) +then + # Interactive settings + MACHINE="" + export TURBINE_LAUNCHER=jsrun +else + # Use LSF: + MACHINE="-m lsf" +fi + +# Default PROJECT for CANDLE +#export QUEUE=${QUEUE:-batch-hm} +export PROJECT=${PROJECT:-MED106} + +# export TURBINE_OUTPUT_SOFTLINK=/dev/null + From 52ce49425077a3c54d84cacde819a29c42209a4a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 29 Oct 2020 10:54:34 -0500 Subject: [PATCH 118/601] Update summit-login --- workflows/common/sh/env-summit-login.sh | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/workflows/common/sh/env-summit-login.sh b/workflows/common/sh/env-summit-login.sh index df8033cc..0c58119c 100644 --- a/workflows/common/sh/env-summit-login.sh +++ b/workflows/common/sh/env-summit-login.sh @@ -2,15 +2,18 @@ # ENV SUMMIT LOGIN # Environment settings for Summit login node (Swift, Python, R, Tcl, etc.) +SWIFT_IMPL=echo # SWIFT_IMPL=app -SWIFT_IMPL=py +# SWIFT_IMPL=py # Load basic LD_LIBRARY_PATH before changing it: -module load gcc/7.4.0 -module load ibm-wml +# module load gcc/7.4.0 +module load gcc/6.4.0 +# module load ibm-wml module unload darshan-runtime module unload spectrum-mpi -module load gcc/7.4.0 +# module load gcc/7.4.0 +module load gcc/6.4.0 module list @@ -18,10 +21,12 @@ log_path PATH # From Wozniak MED106=/gpfs/alpine/world-shared/med106 -SWIFT=$MED106/sw/login/gcc-7.4.0/swift-t/2019-10-22 # Python (ibm-wml), no R - +# SWIFT=$MED106/sw/login/gcc-7.4.0/swift-t/2019-10-22 # Python (ibm-wml), no R +SWIFT=$MED106/wozniak/sw/login/gcc-6.4.0/swift-t/2020-10-22 # (opence010env) +# MPICH=$MED106/sw/login/gcc-7.4.0/mpich-3.2.1/bin +MPICH=$MED106/sw/login/gcc-6.4.0/mpich-3.2.1 PATH=$SWIFT/stc/bin:$PATH -PATH=$MED106/sw/login/gcc-7.4.0/mpich-3.2.1/bin:$PATH +PATH=$MPICH/bin:$PATH # log_path PATH @@ -45,7 +50,7 @@ LOCAL=0 CRAY=1 # Resident task workers and ranks -if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ] +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] then # Resident task workers and ranks export TURBINE_RESIDENT_WORK_WORKERS=1 From e763c8183a71f55a655cf7fbf9e514b8c582c648 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 29 Oct 2020 10:55:45 -0500 Subject: [PATCH 119/601] Mark these as deprecated --- workflows/common/sh/env-summit.sh | 13 ++++--------- workflows/common/sh/sched-summit.sh | 5 +++-- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index 4a2b6a59..21bd231b 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -1,5 +1,6 @@ -# ENV Summit +# ENV Summit - DEPRECATED - (Wozniak, 2020-10-29) +# Use summit-tf1 or summit-tf2 # Environment settings for Summit (Swift, Python, R, Tcl, etc.) 
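# (A hedged sketch, not part of the original patch; the sourcing helper path
#  below is an assumption.  These env-*.sh / sched-*.sh files appear to be
#  selected by the $SITE name, so deprecating plain "summit" just means
#  callers pick the new variants explicitly, roughly:
#
#    SITE=summit-tf1   # or summit-tf2, summit-i
#    source $WORKFLOWS_ROOT/common/sh/env-$SITE.sh
#    source $WORKFLOWS_ROOT/common/sh/sched-$SITE.sh
#  )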
# SWIFT_IMPL=echo @@ -25,8 +26,7 @@ MED106=/gpfs/alpine/world-shared/med106 # SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 # SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 # SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 -SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-10-22 +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH @@ -49,15 +49,10 @@ R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R LD_LIBRARY_PATH+=:$R/lib PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 -LD_LIBRARY_PATH+=:/lib64 # we need this path to be before the $PY/lib one, which is added below, or else for compiling using mpicc we get the error "/usr/bin/uuidgen: /gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib/libuuid.so.1: no version information available (required by /usr/bin/uuidgen)" LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY -# ALW 9/28/20: This path is already added, albeit to the end rather than the beginning, in the LD_LIBRARY_PATH+=:$PY/lib line above -#export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$ - -# ALW 10/1/20: Adding this per Justin and my experiments and discussion on 9/30/20 and 10/1/20 -export LD_LIBRARY_PATH="/sw/summit/gcc/6.4.0/lib64:$LD_LIBRARY_PATH" +export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH # EMEWS Queues for R EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R diff --git a/workflows/common/sh/sched-summit.sh b/workflows/common/sh/sched-summit.sh index 514ceab2..f4e831e2 100644 --- a/workflows/common/sh/sched-summit.sh +++ b/workflows/common/sh/sched-summit.sh @@ -1,11 +1,12 @@ -# SCHED Summit +# SCHED Summit - DEPRECATED - (Wozniak, 2020-10-29) +# Use summit-tf1 or summit-tf2 # Scheduler settings for Swift/Summit MACHINE="-m lsf" # Default PROJECT for CANDLE #export QUEUE=${QUEUE:-batch-hm} -export PROJECT=${PROJECT:-MED110} +export PROJECT=${PROJECT:-MED106} # export TURBINE_OUTPUT_SOFTLINK=/dev/null From 5d798eeebbcb8fab46cc67954d378e763041883e Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Thu, 12 Nov 2020 00:59:36 -0500 Subject: [PATCH 120/601] Put back in a few changes to env-summit-tf1.sh which used to be env-summit.sh: added /lib64 back into LD_LIBRARY_PATH prior to the addition of $PY/lib; removed redundant and differently placed $PY/lib to the beginning of LD_LIBRARY_PATH below; and added back in the gcc library paths to LD_LIBRARY_PATH to the beginning in order to make Swift/T work --- workflows/common/sh/env-summit-tf1.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/env-summit-tf1.sh b/workflows/common/sh/env-summit-tf1.sh index 89c07381..f0305a59 100644 --- a/workflows/common/sh/env-summit-tf1.sh +++ b/workflows/common/sh/env-summit-tf1.sh @@ -49,10 +49,15 @@ R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R LD_LIBRARY_PATH+=:$R/lib PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 +LD_LIBRARY_PATH+=:/lib64 # we need this path to be before the $PY/lib one, which is added right below, or else for compiling using mpicc we get the error "/usr/bin/uuidgen: /gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib/libuuid.so.1: no version information available (required by /usr/bin/uuidgen)" LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY -export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH +# ALW 11/12/20: Again, 
this path is already added, albeit to the end rather than the beginning, in the LD_LIBRARY_PATH+=:$PY/lib line above +#export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH + +# ALW 11/12/20: Again, adding this per Justin and my experiments and discussion on 9/30/20 and 10/1/20 +export LD_LIBRARY_PATH="/sw/summit/gcc/6.4.0/lib64:$LD_LIBRARY_PATH" # EMEWS Queues for R EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R From 04c2910cea4b1a9a9c1400710a1355b96d0adda3 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Fri, 13 Nov 2020 22:54:02 -0500 Subject: [PATCH 121/601] Adding cfg-sys-summit-tf1.sh --- workflows/common/sh/cfg-sys-summit-tf1.sh | 59 +++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 workflows/common/sh/cfg-sys-summit-tf1.sh diff --git a/workflows/common/sh/cfg-sys-summit-tf1.sh b/workflows/common/sh/cfg-sys-summit-tf1.sh new file mode 100644 index 00000000..529772c6 --- /dev/null +++ b/workflows/common/sh/cfg-sys-summit-tf1.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# UPF CFG SYS 1 + +# The number of MPI processes +# Note that 1 process is reserved for Swift/T +# For example, if PROCS=4 that gives you 3 workers, +# i.e., 3 concurrent Keras runs. +export PROCS=${PROCS:-2} + +# MPI processes per node. This should not exceed PROCS. +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} + +#export QUEUE=${QUEUE:-batch} + +# Cori: (cf. sched-cori) +# export QUEUE=${QUEUE:-debug} +# Cori queues: debug, regular +# export QUEUE=regular +# export QUEUE=debug +# CANDLE on Cori: +# export PROJECT=m2924 + +# Theta: (cf. sched-theta) +# export QUEUE=${QUEUE:-debug-cache-quad} +#export QUEUE=${QUEUE:-debug-flat-quad} +# export PROJECT=${PROJECT:-ecp-testbed-01} +# export PROJECT=Candle_ECP +#export PROJECT=CSC249ADOA01 + +# Summit: +export QUEUE=${QUEUE:-batch} + +export PROJECT=${PROJECT:-med106} + +export WALLTIME=${WALLTIME:-0:30} + +# export MAIL_ENABLED=1 +# export MAIL_ADDRESS=wozniak@mcs.anl.gov + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. -1 is no timeout. +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} # probably not needed but this variable is baked into rest of code, e.g., workflow.sh + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g, my_script.py) +# BENCHMARK_DIR=/path/to/ +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . +export IGNORE_ERRORS=0 From af8b2473a5a1bf808b2791d0f007397c46df79c4 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Sat, 14 Nov 2020 00:13:24 -0500 Subject: [PATCH 122/601] I reverted env-summit-tf1.sh back to what Justin had it at at the end of October I believe... 
see setup-biowulf.md for more details, but long story short while applying my fixes to that file fixed the uuidgen error in at least the setup process, it seemed to cause import-tensorflow-related issues in model_runner.py when actually trying to run CANDLE proper --- workflows/common/sh/env-summit-tf1.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/workflows/common/sh/env-summit-tf1.sh b/workflows/common/sh/env-summit-tf1.sh index f0305a59..89c07381 100644 --- a/workflows/common/sh/env-summit-tf1.sh +++ b/workflows/common/sh/env-summit-tf1.sh @@ -49,15 +49,10 @@ R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R LD_LIBRARY_PATH+=:$R/lib PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 -LD_LIBRARY_PATH+=:/lib64 # we need this path to be before the $PY/lib one, which is added right below, or else for compiling using mpicc we get the error "/usr/bin/uuidgen: /gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib/libuuid.so.1: no version information available (required by /usr/bin/uuidgen)" LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY -# ALW 11/12/20: Again, this path is already added, albeit to the end rather than the beginning, in the LD_LIBRARY_PATH+=:$PY/lib line above -#export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH - -# ALW 11/12/20: Again, adding this per Justin and my experiments and discussion on 9/30/20 and 10/1/20 -export LD_LIBRARY_PATH="/sw/summit/gcc/6.4.0/lib64:$LD_LIBRARY_PATH" +export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH # EMEWS Queues for R EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R From 169f43b87f5774f28b845700af791bf44025e484 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Mon, 16 Nov 2020 01:59:46 -0500 Subject: [PATCH 123/601] Mostly added changes to allow for Justin's nomenclature of adding -things to $SITE variables, e.g., summit-tf1 --- workflows/common/sh/utils.sh | 14 ++++++++------ workflows/mlrMBO/swift/workflow.sh | 4 +++- workflows/upf/swift/workflow.sh | 4 +++- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index f5e86b8a..a59378a9 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -357,22 +357,24 @@ queue_wait_site() SITE=$1 JOBID=$2 - if [[ $SITE == "cori" ]] + site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs to have hyphens in them as Justin implemented for Summit on 2020-10-29, e.g., summit-tf1 + + if [[ $site2 == "cori" ]] then queue_wait_slurm $JOBID - elif [[ $SITE == "theta" ]] + elif [[ $site2 == "theta" ]] then queue_wait_cobalt $JOBID - elif [[ $SITE == "titan" ]] + elif [[ $site2 == "titan" ]] then queue_wait_pbs $JOBID - elif [[ $SITE == "summit" ]] + elif [[ $site2 == "summit" ]] then queue_wait_lsf $JOBID - elif [[ $SITE == "pascal" ]] + elif [[ $site2 == "pascal" ]] then queue_wait_slurm $JOBID - elif [[ $SITE == "biowulf" ]] + elif [[ $site2 == "biowulf" ]] then queue_wait_slurm $JOBID else diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 620cee96..1f94138e 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -130,8 +130,10 @@ then echo "Turbine will wait for job completion." 
fi +site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs to have hyphens in them as Justin implemented for Summit on 2020-10-29, e.g., summit-tf1 + # Use for Summit (LSF needs two %) -if [[ ${SITE:-} == "summit" ]] +if [[ ${site2:-} == "summit" ]] then export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" else diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index e0a63e1a..c030d968 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -88,7 +88,9 @@ module list cp -v $UPF $TURBINE_OUTPUT -if [[ ${SITE} == "summit" ]] +site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs to have hyphens in them as Justin implemented for Summit on 2020-10-29, e.g., summit-tf1 + +if [ ${site2} == "summit" -a "x$CANDLE_RUN_WORKFLOW" != "x1" ] # ALW 2020-11-15: If we're running the candle wrapper scripts in which case if this file were being called then $CANDLE_RUN_WORKFLOW=1, don't set $TURBINE_LAUNCH_OPTIONS as this variable and the settings in the declaration below are handled by the wrapper scripts then export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" fi From 05e3a2f9cb88ae2d129fadf18d97ef6c09c5b99a Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Mon, 23 Nov 2020 18:45:12 -0500 Subject: [PATCH 124/601] Cleaned up env-biowulf.sh --- workflows/common/sh/env-biowulf.sh | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index 12c6b2f5..97fd9964 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -1,5 +1,6 @@ #!/bin/bash +# Note: It probably would make most sense to source site-specific_settings.sh here and then to use below the variables set in that file # Prerequisite: Assume the candle module is loaded as usual @@ -28,12 +29,10 @@ export MANPATH="/data/BIDS-HPC/public/software/builds/tcl/man:$MANPATH" export PATH="$PATH:/usr/local/apps/R/4.0/4.0.0/bin" export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" -#export R_LIBS_USER="$R_LIBS_USER:~/R/%v/library" -# What we want: if $R_LIBS_USER is set then R_LIBS_USER="$R_LIBS_USER:~/R/%v/library", otherwise R_LIBS_USER="~/R/%v/library" if [ -z ${R_LIBS_USER+x} ]; then - R_LIBS_USER="~/R/%v/library" + R_LIBS_USER="$HOME/R/%v/library" else - R_LIBS_USER="$R_LIBS_USER:~/R/%v/library" + R_LIBS_USER="$R_LIBS_USER:$HOME/R/%v/library" fi export R_LIBS_SITE="$CANDLE_DEP_R_SITE" export R_LIBS="$CANDLE/R/libs" @@ -63,19 +62,14 @@ export EQR="$CANDLE/Supervisor/workflows/common/ext/EQ-R" export EQPy="$CANDLE/Supervisor/workflows/common/ext/EQ-Py" # This is how Tim Miller told me to run interactive and batch MPI jobs on Biowulf GPU nodes recently (Aug/Sep 2020) -#if [ "x$SLURM_JOB_PARTITION" == "xinteractive" ]; then if [ "x${SLURM_JOB_PARTITION:-batch}" == "xinteractive" ]; then - #export TURBINE_LAUNCH_OPTIONS=("--mpi=pmix" "--mem=0") export TURBINE_LAUNCH_OPTIONS+=" --mpi=pmix --mem=0" else - #export TURBINE_LAUNCH_OPTIONS=("--mpi=pmix") export TURBINE_LAUNCH_OPTIONS+=" --mpi=pmix" fi - -export TURBINE_MPI_THREAD=0 - - +# This prevents PMIx errors I believe +export TURBINE_MPI_THREAD=0 # only currently used in Supervisor/workflows/upf/swift/workflow.sh 
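# (Aside on the shell idioms used just above -- plain bash semantics, nothing
#  Biowulf-specific is being assumed:
#    "x${SLURM_JOB_PARTITION:-batch}" == "xinteractive"
#        ${VAR:-batch} substitutes "batch" when VAR is unset or empty, and the
#        leading "x" keeps the comparison well-formed even with empty operands;
#    TURBINE_LAUNCH_OPTIONS+=" --mpi=pmix"
#        appends to any launch options already exported by the caller instead
#        of overwriting them.)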
#################################################################################################################################### From 21eb0b4ed26b23f355f67f5a352923531cea681f Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Mon, 23 Nov 2020 23:25:56 -0500 Subject: [PATCH 125/601] Added test comment line to env-biowulf.sh --- workflows/common/sh/env-biowulf.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index 97fd9964..2281f460 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -2,6 +2,7 @@ # Note: It probably would make most sense to source site-specific_settings.sh here and then to use below the variables set in that file # Prerequisite: Assume the candle module is loaded as usual +# Test comment line #### Set variables for CANDLE dependencies (mostly, Swift/T dependencies) ########################################################## From 10c9a6c23395416e7ee6e838a28a797f539ca7ad Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Mon, 23 Nov 2020 23:54:38 -0500 Subject: [PATCH 126/601] Deleted test comment line in env-biowulf.sh --- workflows/common/sh/env-biowulf.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index 2281f460..97fd9964 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -2,7 +2,6 @@ # Note: It probably would make most sense to source site-specific_settings.sh here and then to use below the variables set in that file # Prerequisite: Assume the candle module is loaded as usual -# Test comment line #### Set variables for CANDLE dependencies (mostly, Swift/T dependencies) ########################################################## From d1dbda6b97708bda85715fffb4d4170d18feb4fb Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Tue, 24 Nov 2020 00:07:26 -0500 Subject: [PATCH 127/601] Adding a second test comment line, this time using a new branch --- workflows/common/sh/env-biowulf.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index 97fd9964..c53c697f 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -2,6 +2,7 @@ # Note: It probably would make most sense to source site-specific_settings.sh here and then to use below the variables set in that file # Prerequisite: Assume the candle module is loaded as usual +# This is a second test comment line #### Set variables for CANDLE dependencies (mostly, Swift/T dependencies) ########################################################## From 726479bff917c24dbb9f8938c5fd5ca983b07242 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Tue, 1 Dec 2020 11:27:08 -0500 Subject: [PATCH 128/601] Updated langs-app-biowulf.sh with correct Python module variable and forced Biowulf to use two percent signs in mlrMBO workflow.sh just like Summit --- workflows/common/sh/langs-app-biowulf.sh | 6 +++--- workflows/mlrMBO/swift/workflow.sh | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/common/sh/langs-app-biowulf.sh b/workflows/common/sh/langs-app-biowulf.sh index 869b60af..c844f1c6 100644 --- a/workflows/common/sh/langs-app-biowulf.sh +++ b/workflows/common/sh/langs-app-biowulf.sh @@ -1,9 +1,9 @@ -# LANGS APP Singularity on Biowulf -# Language settings for singularity app functions (Python, R, etc.) 
+# LANGS APP Biowulf +# Language settings for app functions (Python, R, etc.) # Load the environment in which CANDLE was built -module load $DEFAULT_PYTHON_MODULE +module load "$CANDLE_DEFAULT_PYTHON_MODULE" #module load openmpi/3.1.2/cuda-9.0/gcc-7.3.0-pmi2 cuDNN/7.1/CUDA-9.0 CUDA/9.0 #source /data/$USER/conda/etc/profile.d/conda.sh diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 1f94138e..6fa45be8 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -132,8 +132,8 @@ fi site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs to have hyphens in them as Justin implemented for Summit on 2020-10-29, e.g., summit-tf1 -# Use for Summit (LSF needs two %) -if [[ ${site2:-} == "summit" ]] +# Use for Summit (LSF needs two %)... actually, it may not be LSF as Biowulf (which uses SLURM) seems to need this too now +if [ ${site2:-} == "summit" ] || [ ${site2:-} == "biowulf" ] then export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" else From aa12088ea2e3db35d4626f00d667565a5860d077 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 2 Dec 2020 11:13:28 -0600 Subject: [PATCH 129/601] thread setting should be handled in benchmark code level --- .../common/python/model_abstention_runner.py | 16 ---------------- workflows/common/python/model_runner.py | 16 ---------------- 2 files changed, 32 deletions(-) diff --git a/workflows/common/python/model_abstention_runner.py b/workflows/common/python/model_abstention_runner.py index b85a532b..5747f40c 100644 --- a/workflows/common/python/model_abstention_runner.py +++ b/workflows/common/python/model_abstention_runner.py @@ -45,22 +45,6 @@ def import_pkg(framework, model_name): module_name = "{}_abstention_keras2".format(model_name) print ("module_name:", module_name) pkg = importlib.import_module(module_name) - - # For Summit: - from tensorflow.keras import backend as K - # For other systems: - # from keras import backend as K - if K.backend() == 'tensorflow' and 'NUM_INTER_THREADS' in os.environ: - import tensorflow as tf - inter_threads = int(os.environ['NUM_INTER_THREADS']) - intra_threads = int(os.environ['NUM_INTRA_THREADS']) - print("Configuring tensorflow with {} inter threads and " + - "{} intra threads" - .format(inter_threads, intra_threads)) - cfg = tf.ConfigProto(inter_op_parallelism_threads=inter_threads, - intra_op_parallelism_threads=intra_threads) - sess = tf.Session(graph=tf.get_default_graph(), config=cfg) - K.set_session(sess) elif framework == 'pytorch': import torch if module_name == None or module_name == "": diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 209beb38..050d12d6 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -43,22 +43,6 @@ def import_pkg(framework, model_name): module_name = "{}_baseline_keras2".format(model_name) print ("module_name:", module_name) pkg = importlib.import_module(module_name) - - # For Summit: - from tensorflow.keras import backend as K - # For other systems: - # from keras import backend as K - if K.backend() == 'tensorflow' and 'NUM_INTER_THREADS' in os.environ: - import tensorflow as tf - inter_threads = int(os.environ['NUM_INTER_THREADS']) - intra_threads = int(os.environ['NUM_INTRA_THREADS']) - print("Configuring tensorflow with {} inter threads and " + - "{} intra threads" - .format(inter_threads, intra_threads)) - cfg = tf.ConfigProto(inter_op_parallelism_threads=inter_threads, - 
intra_op_parallelism_threads=intra_threads) - sess = tf.Session(graph=tf.get_default_graph(), config=cfg) - K.set_session(sess) elif framework == 'pytorch': import torch if module_name == None or module_name == "": From 130c79988dc520b9be6c6bc6980165a59fc8fab9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 7 Dec 2020 11:14:08 -0600 Subject: [PATCH 130/601] Add actual sed command --- workflows/cp-leaveout/scripts/epoch-count.sh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/workflows/cp-leaveout/scripts/epoch-count.sh b/workflows/cp-leaveout/scripts/epoch-count.sh index a883f360..3f4979c1 100755 --- a/workflows/cp-leaveout/scripts/epoch-count.sh +++ b/workflows/cp-leaveout/scripts/epoch-count.sh @@ -29,8 +29,5 @@ for LOG in ${LOGS[@]} do echo -n "$LOG :: " # Pull out the last "Epoch:" line, print only the number: - # sed -n '/Epoch:/h;${g;s/.*Epoch: \([0-9]*\).*/\1/;p}' $LOG - grep "Epoch:" $LOG - # experiments/X362/run/1.3/save/python.log -done # | sort -r -n -k 2 | column -t - + sed -n '/Epoch:/h;${g;s/.*Epoch: \([0-9]*\).*/\1/;p}' $LOG +done | nl # | sort -r -n -k 2 | column -t From 768574b7bee2010f9bd2e35b139c2d72b61d9cf7 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 7 Dec 2020 11:15:03 -0600 Subject: [PATCH 131/601] Update data location on Summit --- workflows/cp-leaveout/test/test-bl-1.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/test/test-bl-1.sh b/workflows/cp-leaveout/test/test-bl-1.sh index 626aa1c7..cda3cc48 100755 --- a/workflows/cp-leaveout/test/test-bl-1.sh +++ b/workflows/cp-leaveout/test/test-bl-1.sh @@ -42,10 +42,10 @@ export CFG_PRM=$THIS/cfg-prm-1.sh # SCRATCH=/gpfs/alpine/med106/scratch/hsyoo SCRATCH=/gpfs/alpine/med106/scratch/wozniak # SCRATCH=/usb2/wozniak -CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 +CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv -DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather +DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labeled.hdf5 BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno # BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno From fb036dd34e13c77b55ff43702c1ba92411065373 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 7 Dec 2020 11:15:26 -0600 Subject: [PATCH 132/601] WS --- workflows/cp-leaveout/test/test-bl-1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/test/test-bl-1.sh b/workflows/cp-leaveout/test/test-bl-1.sh index cda3cc48..4e60bc53 100755 --- a/workflows/cp-leaveout/test/test-bl-1.sh +++ b/workflows/cp-leaveout/test/test-bl-1.sh @@ -58,7 +58,7 @@ then export WAIT=1 fi -for f in $DATAFRAME_CSV +for f in $DATAFRAME_CSV do if [[ ! 
-f $f ]] then From 1cb6bbd5541af5625dc46de0bfda6f13f1a9c7b4 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 7 Dec 2020 11:16:41 -0600 Subject: [PATCH 133/601] Note about interactive workflows --- workflows/upf/test/cfg-sys-1.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/workflows/upf/test/cfg-sys-1.sh b/workflows/upf/test/cfg-sys-1.sh index c8180e15..f5a5bcbe 100644 --- a/workflows/upf/test/cfg-sys-1.sh +++ b/workflows/upf/test/cfg-sys-1.sh @@ -1,6 +1,9 @@ # UPF CFG SYS 1 +# Use 1 for interactive workflows +# export INTERACTIVE=1 + # The number of MPI processes # Note that 1 process is reserved for Swift/T # For example, if PROCS=4 that gives you 3 workers, @@ -23,7 +26,7 @@ export PPN=${PPN:-1} # Theta: (cf. sched-theta) # export QUEUE=${QUEUE:-debug-cache-quad} -export QUEUE=${QUEUE:-debug-flat-quad} +# export QUEUE=${QUEUE:-debug-flat-quad} # export PROJECT=${PROJECT:-ecp-testbed-01} # export PROJECT=Candle_ECP export PROJECT=CSC249ADOA01 @@ -31,6 +34,7 @@ export PROJECT=CSC249ADOA01 # Summit: export QUEUE=${QUEUE:-batch} export PROJECT=med106 +# export TURBINE_LAUNCH_OPTIONS="-a1 -g6 -c7" export WALLTIME=${WALLTIME:-0:30} From 3b81450dbcd79fee5ee96281592a724e028c2ee9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 7 Dec 2020 11:20:12 -0600 Subject: [PATCH 134/601] New epoch-status script to report epoch progress status --- workflows/cp-leaveout/scripts/epoch-status.sh | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/epoch-status.sh diff --git a/workflows/cp-leaveout/scripts/epoch-status.sh b/workflows/cp-leaveout/scripts/epoch-status.sh new file mode 100755 index 00000000..838ddb0e --- /dev/null +++ b/workflows/cp-leaveout/scripts/epoch-status.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -eu + +# EPOCH STATUS SH +# Report epoch progress status for all python.logs + +THIS=$( readlink --canonicalize $( dirname $0 ) ) +CPLO=$( readlink --canonicalize $THIS/.. ) +SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +EXPID=$( basename $DIR ) +JOBID=$( cat $DIR/jobid.txt ) +show EXPID JOBID + +LOGS=( $( find $DIR -name python.log ) ) +echo "epoch-count.sh: found ${#LOGS[@]} logs ..." + +COMPLETED=0 +for LOG in ${LOGS[@]} +do + if grep -q "EPOCHS COMPLETED" $LOG + then + (( COMPLETED = COMPLETED+1 )) + else + echo + echo $LOG + tail $LOG + fi +done +echo "COMPLETED: $COMPLETED" From 40a4a11ec2b4569be66ab6d1b5a3a35288de0b61 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 7 Dec 2020 11:27:09 -0600 Subject: [PATCH 135/601] Update header --- workflows/common/sh/env-summit-tf2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-summit-tf2.sh b/workflows/common/sh/env-summit-tf2.sh index b2056bee..7daa191d 100644 --- a/workflows/common/sh/env-summit-tf2.sh +++ b/workflows/common/sh/env-summit-tf2.sh @@ -1,5 +1,5 @@ -# ENV Summit +# ENV Summit TF2 # Environment settings for Summit (Swift, Python, R, Tcl, etc.) 
# GCC 7.4.0, TensorFlow 2, opence010env, R 3.6.1 From 4803a8f88f86d904f48231b752eb4b2188b2f305 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 7 Dec 2020 13:53:22 -0600 Subject: [PATCH 136/601] Use readlink --- workflows/upf/swift/workflow.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index c030d968..7c55a740 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -4,10 +4,10 @@ set -eu # UPF WORKFLOW SH # Autodetect this workflow directory -export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) -export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT=$( readlink --canonicalize $( dirname $0 )/.. ) +export WORKFLOWS_ROOT=$( readlink --canonicalize $EMEWS_PROJECT_ROOT/.. ) +export BENCHMARKS_ROOT=$( readlink --canonicalize $EMEWS_PROJECT_ROOT/../../../Benchmarks.tf2 ) -export BENCHMARKS_ROOT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot3/P3B1:$BENCHMARKS_ROOT/Pilot3/P3B3:$BENCHMARKS_ROOT/Pilot3/P3B4:$BENCHMARKS_ROOT/Pilot3/P3B5 export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} SCRIPT_NAME=$(basename $0) From a2d21ba61ec9e1810c5890e36c30c57cd1ac1784 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 7 Dec 2020 13:53:46 -0600 Subject: [PATCH 137/601] Reflow comment --- workflows/upf/swift/workflow.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 7c55a740..7026a7df 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -90,7 +90,11 @@ cp -v $UPF $TURBINE_OUTPUT site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs to have hyphens in them as Justin implemented for Summit on 2020-10-29, e.g., summit-tf1 -if [ ${site2} == "summit" -a "x$CANDLE_RUN_WORKFLOW" != "x1" ] # ALW 2020-11-15: If we're running the candle wrapper scripts in which case if this file were being called then $CANDLE_RUN_WORKFLOW=1, don't set $TURBINE_LAUNCH_OPTIONS as this variable and the settings in the declaration below are handled by the wrapper scripts +# ALW 2020-11-15: If we're running the candle wrapper scripts in which +# case if this file were being called then $CANDLE_RUN_WORKFLOW=1, +# don't set $TURBINE_LAUNCH_OPTIONS as this variable and the settings +# in the declaration below are handled by the wrapper scripts +if [[ ${site2} == "summit" -a "x$CANDLE_RUN_WORKFLOW" != "x1" ]] then export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" fi From 30e8e80d97196d2c34f6cf8ffc89944234133d98 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 7 Dec 2020 13:56:55 -0600 Subject: [PATCH 138/601] Use modern syntax --- workflows/upf/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 7026a7df..897d5a67 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -94,7 +94,7 @@ site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs # case if this file were being called then $CANDLE_RUN_WORKFLOW=1, # don't set $TURBINE_LAUNCH_OPTIONS as this variable and the settings # in the declaration below are handled by the wrapper scripts -if [[ ${site2} == "summit" -a 
"x$CANDLE_RUN_WORKFLOW" != "x1" ]] +if [[ ${site2} == "summit" && ${CANDLE_RUN_WORKFLOW:-0} != 1 ]] then export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" fi From aa31e16de71dc2c970c3612c66f6b7b2ec56bb51 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 10 Dec 2020 11:10:57 -0600 Subject: [PATCH 139/601] New quick interactive test --- scratch/summit/README.adoc | 10 ++++++++-- scratch/summit/workflow-interactive.sh | 21 +++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) create mode 100755 scratch/summit/workflow-interactive.sh diff --git a/scratch/summit/README.adoc b/scratch/summit/README.adoc index be05a9ea..90236df4 100644 --- a/scratch/summit/README.adoc +++ b/scratch/summit/README.adoc @@ -4,11 +4,17 @@ This is a stand-alone Swift/T test for Summit. Use ---- -./workflow.sh hello.swift +$ ./workflow.sh hello.swift ---- or ---- -./workflow.sh pyr.swift +$ ./workflow.sh pyr.swift +---- + +For an interactive test, get on a batch node and run: + +---- +$ ./workflow-interactive.sh hello.swift ---- diff --git a/scratch/summit/workflow-interactive.sh b/scratch/summit/workflow-interactive.sh new file mode 100755 index 00000000..e5070a32 --- /dev/null +++ b/scratch/summit/workflow-interactive.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -eu + +# WORKFLOW INTERACTIVE SH + +if [[ ${#} != 1 ]] +then + echo "Specify a Swift script!" + exit 1 +fi +SCRIPT=$1 + +MED106=/gpfs/alpine/world-shared/med106/ +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-10-22 +PATH=$SWIFT/stc/bin:$PATH + +# This is for an interactive run: +export TURBINE_LAUNCHER=jsrun + +set -x +swift-t -n $PROCS $SCRIPT From f11000e545b18a5b005d6c75c88a7570f3b4a4c9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 10 Dec 2020 11:11:58 -0600 Subject: [PATCH 140/601] Fix typo --- scratch/summit/workflow-interactive.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scratch/summit/workflow-interactive.sh b/scratch/summit/workflow-interactive.sh index e5070a32..e6048d55 100755 --- a/scratch/summit/workflow-interactive.sh +++ b/scratch/summit/workflow-interactive.sh @@ -10,7 +10,7 @@ then fi SCRIPT=$1 -MED106=/gpfs/alpine/world-shared/med106/ +MED106=/gpfs/alpine/world-shared/med106 SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-10-22 PATH=$SWIFT/stc/bin:$PATH From 9fd068451c56f2e30aa45f6d7821dc3ff0997edb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 10 Dec 2020 11:30:07 -0600 Subject: [PATCH 141/601] Add TURBINE_LAUNCH_OPTIONS to test --- scratch/summit/workflow-interactive.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scratch/summit/workflow-interactive.sh b/scratch/summit/workflow-interactive.sh index e6048d55..7cca3158 100755 --- a/scratch/summit/workflow-interactive.sh +++ b/scratch/summit/workflow-interactive.sh @@ -16,6 +16,7 @@ PATH=$SWIFT/stc/bin:$PATH # This is for an interactive run: export TURBINE_LAUNCHER=jsrun +export TURBINE_LAUNCH_OPTIONS="-r 4" set -x swift-t -n $PROCS $SCRIPT From af09cbcaa43dcb516f98bb83fbb02132ffca0688 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 11 Dec 2020 11:18:50 -0600 Subject: [PATCH 142/601] Set sys.argv if needed - fixes #85 --- workflows/common/python/model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 050d12d6..6fdd63dd 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -206,6 +206,10 @@ def run_post(hyper_parameter_map, output_map): logger.debug("POST RUN STOP") def 
run_model(hyper_parameter_map): + # In-memory Python runs may not create sys.argv + if 'argv' not in dir(sys): + # This is needed for CANDLE Benchmarks finalize_parameters(): + sys.argv = ['null'] instance_directory = hyper_parameter_map['instance_directory'] os.chdir(instance_directory) global logger From 7de496cb2ee2b35d76cc57773ff8182d79ebe34b Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 16 Dec 2020 12:43:58 -0500 Subject: [PATCH 143/601] fix cuda driver issue --- workflows/common/sh/env-summit-tf2.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/common/sh/env-summit-tf2.sh b/workflows/common/sh/env-summit-tf2.sh index 7daa191d..45a50e82 100644 --- a/workflows/common/sh/env-summit-tf2.sh +++ b/workflows/common/sh/env-summit-tf2.sh @@ -11,6 +11,7 @@ set +eu # modules create errors outside our control module load spectrum-mpi/10.3.1.2-20200121 module unload darshan-runtime # module load ibm-wml-ce/1.6.2-3 +module load cuda/10.2.89 module list set -eu From 96c9a4c7a19caf25353e115f16e0193f1eed2184 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 6 Jan 2021 13:37:27 -0600 Subject: [PATCH 144/601] Improve header usage info --- workflows/cp-leaveout/scripts/compare-errors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/cp-leaveout/scripts/compare-errors.py b/workflows/cp-leaveout/scripts/compare-errors.py index 4cd50ac7..50bb5d11 100644 --- a/workflows/cp-leaveout/scripts/compare-errors.py +++ b/workflows/cp-leaveout/scripts/compare-errors.py @@ -3,6 +3,7 @@ # Input: Provide two experiment DIRECTORIES and OUTPUT file # Output: NODE_ID EPOCHS1 ERROR1 EPOCHS2 ERROR2 +# where an ERROR is MSE MAE R2 CORR # Could easily be updated to pull out only one error stat # (see commented code) From 3e16468ba9063fbc1120ee2a9527b848a0921415 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 6 Jan 2021 13:37:46 -0600 Subject: [PATCH 145/601] Report output file --- workflows/cp-leaveout/scripts/compare-errors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/cp-leaveout/scripts/compare-errors.py b/workflows/cp-leaveout/scripts/compare-errors.py index 50bb5d11..85072ae7 100644 --- a/workflows/cp-leaveout/scripts/compare-errors.py +++ b/workflows/cp-leaveout/scripts/compare-errors.py @@ -66,3 +66,4 @@ def get_errors(node): print("compared: %2i" % count) print("missing: %2i" % missing) +print("wrote: %s" % args.output) From 2a88c650dc1e0f0f48e65f1613b6a740ddf676cb Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Thu, 14 Jan 2021 18:07:26 -0500 Subject: [PATCH 146/601] Removing the line creating the file turbine-directory.txt in mlrMBO/swift/workflow.sh, as I may not care about the job monitoring anymore and it clouds up the working directory --- workflows/mlrMBO/swift/workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 6fa45be8..dc5ace07 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -195,4 +195,5 @@ fi # echo "EXIT CODE: 0" | tee -a $STDOUT # Andrew: Needed this so that script to monitor job worked properly (queue_wait... function in utils.sh?) 
-echo $TURBINE_OUTPUT > turbine-directory.txt +# ALW 1/14/21: Removing this line again as I may not care about the job monitoring anymore and it clouds up the working directory +#echo $TURBINE_OUTPUT > turbine-directory.txt From 24f6c7a142a2496ddd66f1b6fac37ed7bf1ea775 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Sat, 16 Jan 2021 01:01:37 -0500 Subject: [PATCH 147/601] Fix spacing in utils.sh and move block to copy over CANDLE input file to TURBINE_OUTPUT directory to more effective location --- workflows/common/sh/utils.sh | 58 ++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index a59378a9..d3a9f54b 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -206,13 +206,6 @@ get_expid() mv metadata.json $TURBINE_OUTPUT fi - # Andrew: Copy the CANDLE input file to the current experiments directory for reference - if [ -n "${CANDLE_INPUT_FILE-}" ]; then - if [ -f "$CANDLE_INPUT_FILE" ]; then - cp "$CANDLE_INPUT_FILE" "$TURBINE_OUTPUT" - fi - fi - } next() @@ -360,29 +353,29 @@ queue_wait_site() site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs to have hyphens in them as Justin implemented for Summit on 2020-10-29, e.g., summit-tf1 if [[ $site2 == "cori" ]] - then - queue_wait_slurm $JOBID - elif [[ $site2 == "theta" ]] - then - queue_wait_cobalt $JOBID - elif [[ $site2 == "titan" ]] - then - queue_wait_pbs $JOBID - elif [[ $site2 == "summit" ]] - then - queue_wait_lsf $JOBID - elif [[ $site2 == "pascal" ]] - then - queue_wait_slurm $JOBID - elif [[ $site2 == "biowulf" ]] - then - queue_wait_slurm $JOBID - else - echo "queue_wait(): unknown site: $SITE" - return 1 - fi + then + queue_wait_slurm $JOBID + elif [[ $site2 == "theta" ]] + then + queue_wait_cobalt $JOBID + elif [[ $site2 == "titan" ]] + then + queue_wait_pbs $JOBID + elif [[ $site2 == "summit" ]] + then + queue_wait_lsf $JOBID + elif [[ $site2 == "pascal" ]] + then + queue_wait_slurm $JOBID + elif [[ $site2 == "biowulf" ]] + then + queue_wait_slurm $JOBID + else + echo "queue_wait(): unknown site: $SITE" + return 1 + fi - echo "Job completed: $JOBID" + echo "Job completed: $JOBID" } queue_wait_slurm() @@ -575,6 +568,13 @@ log_script() { echo "" >> $LOG_NAME echo "## SCRIPT ###" >> $LOG_NAME cat $EMEWS_PROJECT_ROOT/swift/$SCRIPT_NAME >> $LOG_NAME + + # Andrew: Copy the CANDLE input file to the current experiments directory for reference + if [ -n "${CANDLE_INPUT_FILE-}" ]; then + if [ -f "$CANDLE_INPUT_FILE" ]; then + cp "$CANDLE_INPUT_FILE" "$TURBINE_OUTPUT" + fi + fi } check_directory_exists() { From db96b9cf36ff2c078c46306839705cf18f9fe7b6 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Thu, 21 Jan 2021 17:51:45 -0500 Subject: [PATCH 148/601] Fixed mlrMBO permissions issue on Biowulf by adding '-o /workflow.tic' to the swift-t call in workflow.sh, which seemed to have been commented out --- workflows/mlrMBO/swift/workflow.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index dc5ace07..574d9647 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -159,7 +159,9 @@ else STDOUT="" fi +# ALW 2021-01-21: Please don't comment out the "-o $TURBINE_OUTPUT/workflow.tic" option below; otherwise, we get permissions issues on Biowulf. Thanks! 
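# (Likely mechanism, inferred rather than stated in the patch: without -o,
#  swift-t drops the compiled workflow.tic beside the .swift source, which on
#  Biowulf lives in the shared read-only install; -o sends the compile output
#  to the per-run, writable $TURBINE_OUTPUT directory instead.)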
swift-t -O 0 -n $PROCS \ + -o $TURBINE_OUTPUT/workflow.tic \ ${MACHINE:-} \ -p -I $EQR -r $EQR \ -I $OBJ_DIR \ From 776dbabe34be41faef5668112972bc3145af1743 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 26 Jan 2021 14:33:26 -0600 Subject: [PATCH 149/601] New data resizer --- scratch/resizer/resize.py | 79 ++++++++++++++++++++++++++++++++++++ scratch/resizer/sample-1.csv | 3 ++ 2 files changed, 82 insertions(+) create mode 100644 scratch/resizer/resize.py create mode 100644 scratch/resizer/sample-1.csv diff --git a/scratch/resizer/resize.py b/scratch/resizer/resize.py new file mode 100644 index 00000000..ce75a988 --- /dev/null +++ b/scratch/resizer/resize.py @@ -0,0 +1,79 @@ + +# RESIZE PY + +description = "Resize and/or add noise to CSV data." + +def parse_args(): + import argparse + parser = argparse.ArgumentParser(description=description) + parser.add_argument("--resize", action="store", default=1.0, + help=""" + Output size scale compared to input size as float. + Examples: + 1.0=same size, + 0.5=half size, + 2.0=double size.""") + parser.add_argument("--noise", action="store", default=0.0, + help="""" + Noise injection as float. + Examples: + 0.0=no noise + 0.1=noise +/- 10%""") + parser.add_argument("input", action="store", + help="The input CSV.") + parser.add_argument("output", action="store", + help="The output CSV.") + args = parser.parse_args() + argvars = vars(args) + # print(str(argvars)) + return argvars + +def write_data(args, fp, data_out): + from random import random + wholes = int(float(args["resize"])) + noise = float(args["noise"]) + rows, cols = data_out.shape + for i in range(0, wholes): + for row in range(0, rows): + for col in range(0, cols-1): + value = data_out[row, col] + if noise != 0.0: + value = value * (1 - noise) + value * (noise * 2) * random() + fp.write("%f," % value) + col += 1 + value = data_out[row, col] + if noise != 0.0: + value = value * (1 - noise) + value * (noise * 2) * random() + fp.write("%f" % value) + fp.write("\n") + fraction = float(args["resize"]) - wholes + for row in range(0, int(fraction * rows)): + for col in range(0, cols-1): + value = data_out[row, col] + if noise != 0.0: + value = value * (1 - noise) + value * (noise * 2) * random() + fp.write("%f," % value) + col += 1 + value = data_out[row, col] + if noise != 0.0: + value = value * (1 - noise) + value * (noise * 2) * random() + fp.write("%f" % value) + fp.write("\n") + +import sys +import numpy as np + +args = parse_args() + +data_in = np.loadtxt(args["input"], delimiter=",") +data_out = np.copy(data_in) + +if args["output"] == "/dev/stdout" or args["output"] == "-": + fp = sys.stdout +else: + fp = open(args["output"], "w") + +write_data(args, fp, data_out) + +if fp is not sys.stdout: + fp.close() diff --git a/scratch/resizer/sample-1.csv b/scratch/resizer/sample-1.csv new file mode 100644 index 00000000..d3494f6d --- /dev/null +++ b/scratch/resizer/sample-1.csv @@ -0,0 +1,3 @@ +1,2,3 +4,5,6 +7,8,9 From eb28a0408dbec5b107d544e1cb8761e6a775ac1f Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Mon, 10 May 2021 15:46:23 -0400 Subject: [PATCH 150/601] Add workflows/common/sh/cfg-sys-summit-tf2.sh for use with the central installation of CANDLE --- workflows/common/sh/cfg-sys-summit-tf2.sh | 59 +++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 workflows/common/sh/cfg-sys-summit-tf2.sh diff --git a/workflows/common/sh/cfg-sys-summit-tf2.sh b/workflows/common/sh/cfg-sys-summit-tf2.sh new file mode 100644 index 00000000..529772c6 --- /dev/null +++ 
b/workflows/common/sh/cfg-sys-summit-tf2.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# UPF CFG SYS 1 + +# The number of MPI processes +# Note that 1 process is reserved for Swift/T +# For example, if PROCS=4 that gives you 3 workers, +# i.e., 3 concurrent Keras runs. +export PROCS=${PROCS:-2} + +# MPI processes per node. This should not exceed PROCS. +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} + +#export QUEUE=${QUEUE:-batch} + +# Cori: (cf. sched-cori) +# export QUEUE=${QUEUE:-debug} +# Cori queues: debug, regular +# export QUEUE=regular +# export QUEUE=debug +# CANDLE on Cori: +# export PROJECT=m2924 + +# Theta: (cf. sched-theta) +# export QUEUE=${QUEUE:-debug-cache-quad} +#export QUEUE=${QUEUE:-debug-flat-quad} +# export PROJECT=${PROJECT:-ecp-testbed-01} +# export PROJECT=Candle_ECP +#export PROJECT=CSC249ADOA01 + +# Summit: +export QUEUE=${QUEUE:-batch} + +export PROJECT=${PROJECT:-med106} + +export WALLTIME=${WALLTIME:-0:30} + +# export MAIL_ENABLED=1 +# export MAIL_ADDRESS=wozniak@mcs.anl.gov + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. -1 is no timeout. +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} # probably not needed but this variable is baked into rest of code, e.g., workflow.sh + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g, my_script.py) +# BENCHMARK_DIR=/path/to/ +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . 
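
# As a sketch of the SH_TIMEOUT mechanism described above (the actual wiring
# lives in model.sh; the command below is illustrative only and assumes GNU
# coreutils timeout, with a hypothetical model script name):
#
#   SH_TIMEOUT=60
#   timeout $SH_TIMEOUT python my_model.py   # exits with status 124 if the limit is hit
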
+export IGNORE_ERRORS=0 From 93219d2b380f0d20b98370a6e5dd3a9a016bdc96 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 13 May 2021 11:17:55 -0500 Subject: [PATCH 151/601] Initial support for #86 --- workflows/common/python/model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 6fdd63dd..2c487525 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -138,6 +138,10 @@ def run(hyper_parameter_map, obj_return): runner_utils.format_params(hyper_parameter_map) params_arg = {} + if 'CANDLE_DEFAULT_MODEL_FILE' in os.environ: + config_file = os.getenv('CANDLE_DEFAULT_MODEL_FILE') + logger.info('CANDLE_DEFAULT_MODEL_FILE: "%s"' % config_file) + params_arg = { 'default_model': config_file } if 'config_file' in hyper_parameter_map: config_file = hyper_parameter_map['config_file'] logger.info('specified config_file: "%s"' % config_file) From 24044eb19de9ea6f12bb5fc783ad364f287af8aa Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 18 May 2021 14:50:34 -0500 Subject: [PATCH 152/601] flake8 fixes --- workflows/common/python/model_runner.py | 51 ++++++++++++++++--------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 2c487525..25a86724 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -33,21 +33,23 @@ print("%2i: %s" % (i, sys.path[i])) print("") + def import_pkg(framework, model_name): # The model_name is the short form of the Benchmark: e.g., 'nt3' - # The module_name is the name of the Python module: e.g., 'nt3_baseline_keras2' + # The module_name is the name of the Python module: + # e.g., 'nt3_baseline_keras2' print("model_name: ", model_name) module_name = os.getenv("MODEL_PYTHON_SCRIPT") if framework == 'keras': - if module_name == None or module_name == "": + if module_name is None or module_name == "": module_name = "{}_baseline_keras2".format(model_name) - print ("module_name:", module_name) + print("module_name: " + module_name) pkg = importlib.import_module(module_name) elif framework == 'pytorch': - import torch - if module_name == None or module_name == "": + import torch # noqa: F401 + if module_name is None or module_name == "": module_name = "{}_baseline_pytorch".format(model_name) - print ("module_name:", module_name) + print("module_name: " + module_name) pkg = importlib.import_module(module_name) else: raise ValueError("Framework must either be `keras' or `pytorch' " + @@ -61,10 +63,12 @@ def log(msg): global logger logger.debug(msg) + def timestamp(): from datetime import datetime return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + def setup_perf(params): return { 'top': setup_perf_top(params), 'nvidia': setup_perf_nvidia(params) } @@ -77,7 +81,7 @@ def setup_perf_top(params): return None try: delay = int(params['perf_top']) - except: + except Exception: msg = 'setup_perf_top(): params[perf_top] not an int: got: "%s"' % \ params['perf_top'] print(msg) @@ -85,11 +89,12 @@ def setup_perf_top(params): import subprocess with open('perf-top.log', 'a') as fp_out: fp_out.write('model_runner: start: %s\n\n' % timestamp()) - P = subprocess.Popen(['top', '-b', '-d', params['perf_top']], + P = subprocess.Popen(['top', '-b', '-d', delay], stdout=fp_out, stderr=subprocess.STDOUT) return P + def setup_perf_nvidia(params): if 'perf_nvidia' not in params: return None @@ -97,7 +102,7 
@@ def setup_perf_nvidia(params): return None try: delay = int(params['perf_nvidia']) - except: + except Exception: msg = 'setup_perf_nvidia(): params[perf_nvidia] not an int: ' + \ 'got: "%s"' % params['perf_nvidia'] print(msg) @@ -105,7 +110,7 @@ def setup_perf_nvidia(params): import subprocess with open('perf-nvidia.log', 'a') as fp_out: fp_out.write('model_runner: start: %s\n\n' % timestamp()) - P = subprocess.Popen(['nvidia-smi', '--loop='+params['perf_top']], + P = subprocess.Popen(['nvidia-smi', '--loop=%i' % delay], stdout=fp_out, stderr=subprocess.STDOUT) return P @@ -163,8 +168,11 @@ def run(hyper_parameter_map, obj_return): # Default result if there is no val_loss (as in infer.py) result = 0 history_result = {} - if history != None: - result, history_result = get_results(history, obj_return) + if history is not None: + if history == "EPOCHS_COMPLETED_ALREADY": + result, history_result = "EPOCHS_COMPLETED_ALREADY", None + else: + result, history_result = get_results(history, obj_return) stop_perf(Ps) @@ -219,6 +227,8 @@ def run_model(hyper_parameter_map): global logger logger = log_tools.get_logger(logger, "MODEL RUNNER") obj_return = get_obj_return() + directory = hyper_parameter_map['instance_directory'] + os.chdir(directory) result = run_pre(hyper_parameter_map) if result == ModelResult.ERROR: print("run_pre() returned ERROR!") @@ -231,9 +241,10 @@ def run_model(hyper_parameter_map): assert(result == ModelResult.SUCCESS) # proceed... result, history = run(hyper_parameter_map, obj_return) - runner_utils.write_output(result, instance_directory) - runner_utils.write_output(json.dumps(history, cls=runner_utils.FromNPEncoder), - instance_directory, 'history.txt') + runner_utils.write_output(result, directory) + runner_utils.write_output(json.dumps(history, + cls=runner_utils.FromNPEncoder), + directory, 'history.txt') run_post(hyper_parameter_map, {}) log("RUN STOP") @@ -266,16 +277,17 @@ def get_results(history, obj_return): Return the history entry that the user requested. history: The Keras history object """ - values = history.history[obj_return] - # Default: the last value in the history - result = values[-1] - known_params = [ "loss", "val_loss", "val_corr", "val_dice_coef" ] if obj_return not in known_params: raise ValueError("Unsupported objective function: " + "use obj_param to specify one of " + str(known_params)) + if obj_return in history.history: + values = history.history[obj_return] + # Default: the last value in the history + result = values[-1] + # Fix NaNs: if math.isnan(result): if obj_return == "val_corr" or obj_return == "val_dice_coef": @@ -294,6 +306,7 @@ def get_results(history, obj_return): logger = log_tools.get_logger(logger, "MODEL_RUNNER") log("RUN START") + import sys ( _, # The Python program name (unused) param_string, instance_directory, From 7cb75f9ea9cf7e3ac90af097d6843f607462005d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 18 May 2021 14:51:54 -0500 Subject: [PATCH 153/601] More verbose output --- workflows/cp-leaveout/py/data_setup.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 70abd568..20514c4a 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -44,9 +44,11 @@ def pre_run(params): "copied to NVM in %0.1f seconds (%0.1f MB/s)." % (duration, rate)) else: - print("File copy skipped. Original dataframe already exists in NVM.") + print("File copy skipped. 
" + + "Original dataframe already exists in NVM.") except Exception as e: - print("Error occurred in copying original dataframe\n" + str(e)) + print("Error occurred in copying original dataframe\n" + + str(e)) traceback.print_exc() return ModelResult.ERROR params["dataframe_from"] = dest.resolve() @@ -64,9 +66,16 @@ def pre_run(params): try: for filename in [ "uno_auc_model.txt" ]: # "cache", if not os.path.islink(filename): - os.symlink(f"{data}/{filename}", filename) + src = f"{data}/{filename}" + print("data_setup: src: (%s)" % src) + print("data_setup: dest: (%s)" % filename) + os.symlink(src, filename) except Exception as e: - print("data_setup: error making symlink: %s\n" % filename + str(e)) + print("data_setup: error making symlink:") + print("data_setup: pwd: " + os.getcwd()) + print("data_setup: src: (%s)" % src) + print("data_setup: dest: (%s)" % filename) + print(str(e)) return ModelResult.ERROR try: From 7fc0ee821e93dc24243dec0a5303ed20fe7411eb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 18 May 2021 14:54:12 -0500 Subject: [PATCH 154/601] Add comments --- workflows/cp-leaveout/scripts/Node.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 1360e460..7d014b18 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -251,6 +251,7 @@ def get_epochs_cumul(self, nodes): return self.epochs_cumul def check_token(line, index, token): + ''' Assert that token is in line at given index ''' tokens = line.split() if tokens[index] != token: raise Exception(("could not find token: '%s'\n" + @@ -258,6 +259,7 @@ def check_token(line, index, token): return tokens def check(condition, message): + ''' Check condition or raise Exception with given message ''' if not condition: raise Exception(message) From 6b15bc9cc7c440ddff52a8c5a15d56d984ef7af7 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 18 May 2021 14:55:00 -0500 Subject: [PATCH 155/601] Fix quoting --- workflows/cp-leaveout/swift/plangen_2.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/swift/plangen_2.swift b/workflows/cp-leaveout/swift/plangen_2.swift index 1d1a0455..06e2faef 100644 --- a/workflows/cp-leaveout/swift/plangen_2.swift +++ b/workflows/cp-leaveout/swift/plangen_2.swift @@ -44,7 +44,7 @@ result = python_persist( import sys, traceback import plangen try: - result = str(plangen.plan_prep('%s', '%s', %s)) + result = str(plangen.plan_prep('%s', '%s', '%s')) except Exception as e: info = sys.exc_info() s = traceback.format_tb(info[2]) @@ -62,7 +62,7 @@ except Exception as e: import sys, traceback import plangen try: - result = str(plangen.start_subplan('%s', '%s', %s, '%s', %s)) + result = str(plangen.start_subplan('%s', '%s', %s, '%s', '%s')) except Exception as e: info = sys.exc_info() s = traceback.format_tb(info[2]) From dbd5bf4806a2e94cc91cb4b8368a0894806135b6 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 18 May 2021 14:56:48 -0500 Subject: [PATCH 156/601] Add error check --- workflows/cp-leaveout/swift/workflow.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 664c460b..d265c4af 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -163,6 +163,11 @@ export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh EPOCH_MODE_MODULE="compute_epochs_$EPOCH_MODE" +if [[ ! 
-f swift/$EPOCH_MODE_MODULE.swift ]] +then + abort "workflow.sh: No such EPOCH_MODE: swift/$EPOCH_MODE_MODULE.swift" +fi + WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow.swift} echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" From 725912ea2b977a99db86c72895dd4ae85d8014b9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 18 May 2021 14:57:14 -0500 Subject: [PATCH 157/601] Turn DB back on --- workflows/cp-leaveout/swift/workflow.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index f193cc58..cf21572e 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -35,7 +35,7 @@ import string; import sys; import candle_utils; -import plangen_0; +import plangen_2; report_env(); From 5fdb614fbd209efb616b66f703e60d947929337b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 18 May 2021 14:57:29 -0500 Subject: [PATCH 158/601] Better output --- workflows/cp-leaveout/swift/workflow.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index cf21572e..d4da18d2 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -80,6 +80,8 @@ string exp_id = getenv("EXPID"); string turbine_output = getenv("TURBINE_OUTPUT"); // END WORKFLOW ARGUMENTS +printf("benchmark_data: " + benchmark_data); + // // For compatibility with obj(): global const string FRAMEWORK = "keras"; @@ -89,7 +91,7 @@ run_stage(int N, int S, string this, int stage, void block, string plan_id, string db_file, string runtype) { - printf("stage: %i this: %s", stage, this); + // printf("stage: %i this: %s", stage, this); // Run the model void parent = run_single(this, stage, block, plan_id); From 8ecb6ae18dca6e7a26e46d98c8fa47a96bdbec06 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 18 May 2021 14:58:19 -0500 Subject: [PATCH 159/601] Update data locations --- workflows/cp-leaveout/test/test-512.sh | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/workflows/cp-leaveout/test/test-512.sh b/workflows/cp-leaveout/test/test-512.sh index e0d760ed..fda47f14 100755 --- a/workflows/cp-leaveout/test/test-512.sh +++ b/workflows/cp-leaveout/test/test-512.sh @@ -32,17 +32,27 @@ source $WORKFLOWS_ROOT/common/sh/utils.sh export CFG_SYS=$THIS/cfg-sys-512.sh export CFG_PRM=$THIS/cfg-prm-1.sh +# # Data files +# # PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json +# SCRATCH=/gpfs/alpine/med106/scratch/hsyoo +# CANDLE_DATA=$SCRATCH/Milestone13 +# PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv +# BENCHMARK_DATA=$SCRATCH/Milestone13/Benchmarks/Pilot1/Uno + # Data files # PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json # SCRATCH=/gpfs/alpine/med106/scratch/hsyoo -# SCRATCH=/gpfs/alpine/med106/scratch/wozniak -SCRATCH=/usb2/wozniak -CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 +SCRATCH=/gpfs/alpine/med106/scratch/wozniak +# SCRATCH=/usb2/wozniak +# CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 +CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather -# 
BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1 -BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1 +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno +# BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno +BENCHMARK_DATA=$CANDLE_DATA # What to return from the objective function (Keras model) # val_loss (default), loss, and val_corr are supported From 85479d8a5ee97e25d3cede456131a03c7cb0a265 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 15 Jun 2021 14:17:37 -0500 Subject: [PATCH 160/601] Support early stopping level --- workflows/cp-leaveout/swift/workflow.swift | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index d4da18d2..2ad503f9 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -9,6 +9,7 @@ Flags: -N : Number of nodes per stage (see default in code) -S : Number of stages (see default in code) + -P : Early stopping patience (see default in code) -r : Use RunType.RESTART, default is RunType.RUN_ALL RUN_ALL means this is a fresh run with no prior results @@ -69,6 +70,9 @@ else E_s = argv("E", "50"); assert(strlen(E_s) > 0, "workflow.swift: you must provide an argument to -E"); int max_epochs = string2int(E_s); // epochs=20 is just under 2h on Summit. +P_s = argv("P", "10"); +assert(strlen(P_s) > 0, "workflow.swift: you must provide an argument to -P"); +int early_stopping = string2int(P_s); string plan_json = argv("plan_json"); string dataframe_csv = argv("dataframe_csv"); string db_file = argv("db_file"); @@ -161,10 +165,11 @@ run_stage(int N, int S, string this, int stage, void block, "gpus": "0", "epochs": %i, "es": "True", +"early_stopping": %i, "use_exported_data": "topN.uno.h5", "benchmark_data": "%s" ---- % -(plan_json, dataframe_csv, epochs, benchmark_data); +(plan_json, dataframe_csv, epochs, early_stopping, benchmark_data); if (stage > 1) { n = strlen(this); From c0a1ba7d84b671d281029fdd88d7c65a9cff030f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 28 Jul 2021 15:40:30 -0500 Subject: [PATCH 161/601] Adding sh/env-summit-tf-2.4.1.sh --- workflows/common/sh/env-summit-tf-2.4.1.sh | 44 ++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 workflows/common/sh/env-summit-tf-2.4.1.sh diff --git a/workflows/common/sh/env-summit-tf-2.4.1.sh b/workflows/common/sh/env-summit-tf-2.4.1.sh new file mode 100644 index 00000000..9c0c24b2 --- /dev/null +++ b/workflows/common/sh/env-summit-tf-2.4.1.sh @@ -0,0 +1,44 @@ + +# ENV Summit - TF 2.4.1 +# Environment settings for Summit (Swift, Python, R, Tcl, etc.) 
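
# Usage sketch (not part of this file): workflow scripts pull in a site file
# like this one through the source_site helper from workflows/common/sh/utils.sh,
# so a run against this stack would presumably look something like:
#
#   source $WORKFLOWS_ROOT/common/sh/utils.sh
#   SITE=summit-tf-2.4.1
#   source_site env   $SITE
#   source_site sched $SITE
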
+ +# SWIFT_IMPL=echo +SWIFT_IMPL=py + +# Let modules initialize LD_LIBRARY_PATH before changing it: +set +eu # modules create errors outside our control +module load spectrum-mpi/10.3.1.2-20200121 +module unload darshan-runtime +module load gcc/7.4.0 +module list +set -eu + +# Base project directory +MED106=/gpfs/alpine/world-shared/med106 + +# Swift/T location +SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2021-07-28 +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# R settings +R=$MED106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R +LD_LIBRARY_PATH+=:$R/lib +# EMEWS Queues for R +EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Resident task worker count and rank list +# If this is already set, we respect the user settings +# If this is unset, we set it to 1 +# and run the algorithm on the 2nd highest rank +# This value is only read in HPO workflows +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi From 69d4122e82481708f004561bc5c00c17c9d20d6f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 28 Jul 2021 15:41:06 -0500 Subject: [PATCH 162/601] Update settings for test --- scratch/summit/workflow.sh | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/scratch/summit/workflow.sh b/scratch/summit/workflow.sh index 42aa99f9..058f3e76 100755 --- a/scratch/summit/workflow.sh +++ b/scratch/summit/workflow.sh @@ -8,21 +8,17 @@ then fi SCRIPT=$1 -SWIFT= - -module load spectrum-mpi/10.3.1.2-20200121 - -G=/sw/summit/gcc/6.4.0/lib64 -R="" -LD_LIBRARY_PATH=$G:$R:$LD_LIBRARY_PATH +THIS=$( readlink --canonicalize $( dirname $0 ) ) +SV=$( readlink --canonicalize $THIS/../.. 
) +source $SV/workflows/common/sh/env-summit-tf-2.4.1.sh +# Basic Swift/T environment settings: export PROJECT=MED106 -# export QUEUE=debug export PPN=2 -PROCS=4 +PROCS=2 -SWIFT=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/swift-t/2020-03-31-c/stc/bin/swift-t +which swift-t set -x -$SWIFT -m lsf -n $PROCS \ +swift-t -m lsf -n $PROCS \ $SCRIPT From 2d3d1fe54beb1365636babf6651f2ecd740d724c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 28 Jul 2021 16:10:21 -0500 Subject: [PATCH 163/601] Adding sched-summit-tf-2.4.1.sh --- workflows/common/sh/sched-summit-tf-2.4.1.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 workflows/common/sh/sched-summit-tf-2.4.1.sh diff --git a/workflows/common/sh/sched-summit-tf-2.4.1.sh b/workflows/common/sh/sched-summit-tf-2.4.1.sh new file mode 100644 index 00000000..ba482cec --- /dev/null +++ b/workflows/common/sh/sched-summit-tf-2.4.1.sh @@ -0,0 +1,19 @@ + +# SCHED Summit TF 2.4.1 +# Scheduler settings for Swift/Summit + +if (( ${INTERACTIVE:-0} )) +then + # Interactive settings + MACHINE="" + export TURBINE_LAUNCHER=jsrun +else + # Use LSF: + MACHINE="-m lsf" +fi + +# Default PROJECT for CANDLE +#export QUEUE=${QUEUE:-batch-hm} +export PROJECT=${PROJECT:-MED106} + +# export TURBINE_OUTPUT_SOFTLINK=/dev/null From 79ac45cbf60bbca0687812644f13d8f566e9ebe7 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 28 Jul 2021 17:36:03 -0500 Subject: [PATCH 164/601] Fix settings for new Python and test --- scratch/summit/py-tf.swift | 17 +++++++++++++++++ scratch/summit/workflow.sh | 2 +- workflows/common/sh/env-summit-tf-2.4.1.sh | 6 ++++++ workflows/common/sh/utils.sh | 2 +- 4 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 scratch/summit/py-tf.swift diff --git a/scratch/summit/py-tf.swift b/scratch/summit/py-tf.swift new file mode 100644 index 00000000..e41a689c --- /dev/null +++ b/scratch/summit/py-tf.swift @@ -0,0 +1,17 @@ + +import io; +import python; + +result_python = python(""" +import sys, traceback +try: + sys.argv = [ 'python' ] + import tensorflow as tf +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + print(str(e) + ' ... 
\\n' + ''.join(s)) + sys.stdout.flush() +""", + "repr(40+2)"); +printf("result_python: %s", result_python); diff --git a/scratch/summit/workflow.sh b/scratch/summit/workflow.sh index 058f3e76..f9f7a666 100755 --- a/scratch/summit/workflow.sh +++ b/scratch/summit/workflow.sh @@ -20,5 +20,5 @@ PROCS=2 which swift-t set -x -swift-t -m lsf -n $PROCS \ +swift-t -p -m lsf -n $PROCS -e PYTHONHOME \ $SCRIPT diff --git a/workflows/common/sh/env-summit-tf-2.4.1.sh b/workflows/common/sh/env-summit-tf-2.4.1.sh index 9c0c24b2..a30e7b14 100644 --- a/workflows/common/sh/env-summit-tf-2.4.1.sh +++ b/workflows/common/sh/env-summit-tf-2.4.1.sh @@ -28,6 +28,12 @@ LD_LIBRARY_PATH+=:$R/lib # EMEWS Queues for R EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R +# Python settings +PY=$MED106/sw/open-ce-1.1.3-py37 +LD_LIBRARY_PATH+=:$PY/lib +export PYTHONHOME=$PY +PATH=$PY/bin:$PATH + # For test output processing: LOCAL=0 CRAY=1 diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index d3a9f54b..1e22ac99 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -361,7 +361,7 @@ queue_wait_site() elif [[ $site2 == "titan" ]] then queue_wait_pbs $JOBID - elif [[ $site2 == "summit" ]] + elif [[ $site2 =~ summit* ]] then queue_wait_lsf $JOBID elif [[ $site2 == "pascal" ]] From a1234bd07e44e5098a4e49d2e599590d5f3de680 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 29 Jul 2021 13:18:04 -0500 Subject: [PATCH 165/601] Set sys.argv before importing tensorflow - needed for TF 2.4.1 --- workflows/common/swift/obj_py.swift | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/workflows/common/swift/obj_py.swift b/workflows/common/swift/obj_py.swift index b058d924..3d70734a 100644 --- a/workflows/common/swift/obj_py.swift +++ b/workflows/common/swift/obj_py.swift @@ -6,10 +6,15 @@ string code_template = ---- try: - import sys, traceback, json, os + import json + import os + import sys + import traceback import model_runner - import tensorflow - from tensorflow import keras + + sys.argv = [ 'python' ] + import tensorflow + from tensorflow import keras obj_result = '-100' outdir = '%s' From b1467c05ed7d6684bce39ed915b0ab14fd997486 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 1 Sep 2021 11:52:04 -0500 Subject: [PATCH 166/601] Working settings for new OS on Summit --- workflows/common/sh/env-summit-tf2.sh | 51 ++++----------------------- 1 file changed, 7 insertions(+), 44 deletions(-) diff --git a/workflows/common/sh/env-summit-tf2.sh b/workflows/common/sh/env-summit-tf2.sh index 45a50e82..8e61eccf 100644 --- a/workflows/common/sh/env-summit-tf2.sh +++ b/workflows/common/sh/env-summit-tf2.sh @@ -1,68 +1,31 @@ # ENV Summit TF2 # Environment settings for Summit (Swift, Python, R, Tcl, etc.) 
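
The obj_py.swift change above sets sys.argv before importing TensorFlow because
TF 2.4.1 reads it at import time and an embedded interpreter may not define it.
A standalone sketch of the same pattern in plain Python (illustrative only, not
part of any patch here):

    import sys
    if not getattr(sys, "argv", None):
        sys.argv = ["python"]
    import tensorflow as tf
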
-# GCC 7.4.0, TensorFlow 2, opence010env, R 3.6.1 +# GCC 8.3.1, TensorFlow 2.4.1, opence 1.2.0-py38-0, R 3.6.1 -# SWIFT_IMPL=echo SWIFT_IMPL=py # Let modules initialize LD_LIBRARY_PATH before changing it: set +eu # modules create errors outside our control -module load spectrum-mpi/10.3.1.2-20200121 +module load spectrum-mpi/10.4.0.3-20210112 module unload darshan-runtime -# module load ibm-wml-ce/1.6.2-3 -module load cuda/10.2.89 +module load open-ce/1.2.0-py38-0 module list set -eu # From Wozniak MED106=/gpfs/alpine/world-shared/med106 -# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-10-18 # Python (ibm-wml), no R -# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-11-06 # Python (ibm-wml) and R -# Python (ibm-wml-ce/1.7.0-1) and R: -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-03-31-c -# Python (ibm-wml-ce/1.6.2-3) and R: -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-02 -# Python (med106/sw/condaenv-200408) and R: -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 -SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-10-22 +ROOT=$MED106/sw/gcc-8.3.1 +SWIFT=$ROOT/swift-t/2021-08-27 -export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH PATH=$SWIFT/turbine/bin:$PATH -# log_path PATH - -# IBM_WML_CE=/autofs/nccs-svm1_sw/summit/ibm-wml-ce/anaconda-base/envs/ibm-wml-ce-1.6.2-3 - -# export LD_LIBRARY_PATH -# LD_LIBRARY_PATH=$IBM_WML_CE/lib:$LD_LIBRARY_PATH - -# Inject Python to PATH using PRELAUNCH: -# This would be better, but is broken for ZSH users: -# module load ibm-wml-ce/1.6.2-3 -# Must use PATH directly: -# export TURBINE_PRELAUNCH="PATH=$IBM_WML_CE/bin:\$PATH" - R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R LD_LIBRARY_PATH+=:$R/lib -# PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 -PY=$MED106/sw2/opence010env -LD_LIBRARY_PATH+=:$PY/lib -LD_LIBRARY_PATH+=:/lib64 # we need this path to be before the $PY/lib one, which is added below, or else for compiling using mpicc we get the error "/usr/bin/uuidgen: /gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib/libuuid.so.1: no version information available (required by /usr/bin/uuidgen)" -export PYTHONHOME=$PY - -PATH=$PY/bin:$PATH - -# ALW 9/28/20: This path is already added, albeit to the end rather than the beginning, in the LD_LIBRARY_PATH+=:$PY/lib line above -#export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$ - -# ALW 10/1/20: Adding this per Justin and my experiments and discussion on 9/30/20 and 10/1/20 -export LD_LIBRARY_PATH="/sw/summit/gcc/7.4.0/lib64:$LD_LIBRARY_PATH:/sw/summit/gcc/6.4.0/lib64" +PYTHON=$( which python3 ) +export PYTHONHOME=$( dirname $( dirname $PYTHON ) ) # EMEWS Queues for R EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R From 84a725ea1ab57888750c19593328c6e4c1f279b7 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 1 Sep 2021 16:30:27 -0500 Subject: [PATCH 167/601] Remove WS and bad characters --- workflows/common/R/install-candle.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/common/R/install-candle.R b/workflows/common/R/install-candle.R index c8b8b520..4f0e2409 100644 --- a/workflows/common/R/install-candle.R +++ b/workflows/common/R/install-candle.R @@ -19,12 +19,12 @@ options(repos = r) # * DONE (jsonlite) # 1): succeeded '/usr/local/apps/R/4.0/4.0.0/lib64/R/bin/R CMD INSTALL -l 
'/gpfs/gsfs9/users/BIDS-HPC/public/software/distributions/candle/dev_2/builds/R/libs' '/lscratch/64803361/Rtmpnd5yDC/downloaded_packages/jsonlite_1.7.1.tar.gz'' # The downloaded source packages are in -# ‘/lscratch/64803361/Rtmpnd5yDC/downloaded_packages’ +# /lscratch/64803361/Rtmpnd5yDC/downloaded_packages # [1] "" -# LOAD: jsonlite -# Error in value[[3L]](cond) : -# Package ‘jsonlite’ version 1.7.0 cannot be unloaded: -# Error in unloadNamespace(package) : namespace ‘jsonlite’ is imported by ‘plotly’ so cannot be unloaded +# LOAD: jsonlite +# Error in value[[3L]](cond) : +# Package 'jsonlite' version 1.7.0 cannot be unloaded: +# Error in unloadNamespace(package) : namespace jsonlite is imported by plotly so cannot be unloaded # Calls: library ... tryCatch -> tryCatchList -> tryCatchOne -> # Execution halted # ****NOTE**** that I tried installing both plotly and jsonlite the normal way (in the PKGS below instead of a specific version above) and I got the same error From ae921e5a86c730620baafbdbbf948a20d26d240d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 1 Oct 2021 12:08:37 -0500 Subject: [PATCH 168/601] New Swift/T settings after OS upgrade --- workflows/common/sh/env-summit.sh | 36 ++++++------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index 21bd231b..c98194b5 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -1,7 +1,5 @@ -# ENV Summit - DEPRECATED - (Wozniak, 2020-10-29) -# Use summit-tf1 or summit-tf2 -# Environment settings for Summit (Swift, Python, R, Tcl, etc.) +# ENV Summit # SWIFT_IMPL=echo SWIFT_IMPL=py @@ -16,43 +14,23 @@ set -eu # From Wozniak MED106=/gpfs/alpine/world-shared/med106 -# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-10-18 # Python (ibm-wml), no R -# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-11-06 # Python (ibm-wml) and R -# Python (ibm-wml-ce/1.7.0-1) and R: -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-03-31-c -# Python (ibm-wml-ce/1.6.2-3) and R: -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-02 -# Python (med106/sw/condaenv-200408) and R: -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 -SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 +ROOT=$MED106/sw/gcc-8.3.1 +SWIFT=$ROOT/swift-t/2021-08-27 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH PATH=$SWIFT/turbine/bin:$PATH -# log_path PATH - -# IBM_WML_CE=/autofs/nccs-svm1_sw/summit/ibm-wml-ce/anaconda-base/envs/ibm-wml-ce-1.6.2-3 - -# export LD_LIBRARY_PATH -# LD_LIBRARY_PATH=$IBM_WML_CE/lib:$LD_LIBRARY_PATH - -# Inject Python to PATH using PRELAUNCH: -# This would be better, but is broken for ZSH users: -# module load ibm-wml-ce/1.6.2-3 -# Must use PATH directly: -# export TURBINE_PRELAUNCH="PATH=$IBM_WML_CE/bin:\$PATH" - R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R LD_LIBRARY_PATH+=:$R/lib -PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 +# PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 +PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.2.0-py38-0 LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY -export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH +# /gpfs/alpine/world-shared/med106/sw/condaenv-200408 +export LD_LIBRARY_PATH=$PY/lib:$LD_LIBRARY_PATH # EMEWS Queues for R EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R From 
cc7581d0c5ab1b198f6c7daf25db2d8874e22955 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 1 Oct 2021 12:10:09 -0500 Subject: [PATCH 169/601] Deprecate env-summits from before OS upgrade --- workflows/common/sh/env-summit-tf1.sh | 1 + workflows/common/sh/env-summit-tf2.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/workflows/common/sh/env-summit-tf1.sh b/workflows/common/sh/env-summit-tf1.sh index 89c07381..8427dcac 100644 --- a/workflows/common/sh/env-summit-tf1.sh +++ b/workflows/common/sh/env-summit-tf1.sh @@ -1,4 +1,5 @@ +# DEPRECATED 2021-10-01: Use env-summit # ENV Summit TF1 # Environment settings for Summit (Swift, Python, R, Tcl, etc.) # GCC 6.4.0, TensorFlow 1, condaenv-200408, R 3.6.1 diff --git a/workflows/common/sh/env-summit-tf2.sh b/workflows/common/sh/env-summit-tf2.sh index 8e61eccf..d5607c17 100644 --- a/workflows/common/sh/env-summit-tf2.sh +++ b/workflows/common/sh/env-summit-tf2.sh @@ -1,4 +1,5 @@ +# DEPRECATED 2021-10-01: Use env-summit # ENV Summit TF2 # Environment settings for Summit (Swift, Python, R, Tcl, etc.) # GCC 8.3.1, TensorFlow 2.4.1, opence 1.2.0-py38-0, R 3.6.1 From df5867bf7b42d24e4e05953da07f900df9ca4f91 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 1 Oct 2021 12:11:08 -0500 Subject: [PATCH 170/601] Better form for debugging --- workflows/common/swift/obj_py.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/common/swift/obj_py.swift b/workflows/common/swift/obj_py.swift index 3d70734a..ff591635 100644 --- a/workflows/common/swift/obj_py.swift +++ b/workflows/common/swift/obj_py.swift @@ -22,7 +22,8 @@ try: if not os.path.exists(outdir): os.makedirs(outdir) - hyper_parameter_map = json.loads("""%s""") + J = """%s""" + hyper_parameter_map = json.loads(J) hyper_parameter_map['framework'] = 'keras' hyper_parameter_map['save'] = '{}/output'.format(outdir) hyper_parameter_map['instance_directory'] = outdir From 0e4e7c1918ab832a2a0d94af23c25c361c30f0e4 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 1 Oct 2021 12:11:23 -0500 Subject: [PATCH 171/601] Better exception report message --- workflows/common/swift/obj_py.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/swift/obj_py.swift b/workflows/common/swift/obj_py.swift index ff591635..6933e57e 100644 --- a/workflows/common/swift/obj_py.swift +++ b/workflows/common/swift/obj_py.swift @@ -37,7 +37,7 @@ try: except Exception as e: info = sys.exc_info() s = traceback.format_tb(info[2]) - sys.stdout.write('EXCEPTION: \\n' + repr(e) + ' ... \\n' + ''.join(s)) + sys.stdout.write('EXCEPTION in obj() code: \\n' + repr(e) + ' ... 
\\n' + ''.join(s)) sys.stdout.write('\\n') sys.stdout.flush() obj_result = 'EXCEPTION' From 4ed921f99bc0d7bfaaec97fa0ff2b814357ca554 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 4 Oct 2021 14:54:48 -0500 Subject: [PATCH 172/601] Initial Swift/T test for Spock --- scratch/spock/README.adoc | 10 ++++++++++ scratch/spock/hello.swift | 2 ++ scratch/spock/test.sh | 30 ++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) create mode 100644 scratch/spock/README.adoc create mode 100644 scratch/spock/hello.swift create mode 100644 scratch/spock/test.sh diff --git a/scratch/spock/README.adoc b/scratch/spock/README.adoc new file mode 100644 index 00000000..7127d7ad --- /dev/null +++ b/scratch/spock/README.adoc @@ -0,0 +1,10 @@ + +Run with + +---- +$ ./test.sh hello.swift +---- + +Wait for job to complete, then check turbine_output/output.txt + +Logs are in turbine_output/ diff --git a/scratch/spock/hello.swift b/scratch/spock/hello.swift new file mode 100644 index 00000000..79ecd0d2 --- /dev/null +++ b/scratch/spock/hello.swift @@ -0,0 +1,2 @@ +import io; +printf("HELLO"); diff --git a/scratch/spock/test.sh b/scratch/spock/test.sh new file mode 100644 index 00000000..3e12dae3 --- /dev/null +++ b/scratch/spock/test.sh @@ -0,0 +1,30 @@ +#!/bin/bash -l +set -eu + +if (( ${#} != 1 )) +then + echo "Provide the workflow!" + exit 1 +fi + +WORKFLOW=$1 + +MED106=/gpfs/alpine/world-shared/med106 +ROOT=$MED106/sw/spock/gcc-10.3.0 +SWIFT=$ROOT/swift-t/2021-10-04 + +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +which swift-t + +export PROJECT=MED106 # CSC249ADCD01 +export QUEUE=ecp +export WALLTIME=00:05:00 +export PROCS=2 +export PPN=2 + +export TURBINE_LAUNCHER=srun + +set -x +swift-t -m slurm -n $PROCS $WORKFLOW From e49b285d86a652b0f95256a8382c47c875756c80 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 4 Oct 2021 14:56:42 -0500 Subject: [PATCH 173/601] Formatting --- scratch/spock/README.adoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scratch/spock/README.adoc b/scratch/spock/README.adoc index 7127d7ad..c38de0b1 100644 --- a/scratch/spock/README.adoc +++ b/scratch/spock/README.adoc @@ -5,6 +5,6 @@ Run with $ ./test.sh hello.swift ---- -Wait for job to complete, then check turbine_output/output.txt +Wait for job to complete, then check `turbine_output/output.txt` -Logs are in turbine_output/ +Logs are in `turbine_output/` From ed94cf50e039c5919398d1c4f294426e1ac07e9b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 4 Oct 2021 14:57:53 -0500 Subject: [PATCH 174/601] Clean up --- scratch/spock/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scratch/spock/test.sh b/scratch/spock/test.sh index 3e12dae3..06851fd4 100644 --- a/scratch/spock/test.sh +++ b/scratch/spock/test.sh @@ -18,7 +18,7 @@ PATH=$SWIFT/turbine/bin:$PATH which swift-t -export PROJECT=MED106 # CSC249ADCD01 +export PROJECT=MED106 export QUEUE=ecp export WALLTIME=00:05:00 export PROCS=2 From e9e1195ebf7abb05f632eaab922a47ba26f7d95a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 4 Oct 2021 15:18:18 -0500 Subject: [PATCH 175/601] Test installation w/ Python --- scratch/spock/py0.swift | 7 +++++++ scratch/spock/test.sh | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 scratch/spock/py0.swift mode change 100644 => 100755 scratch/spock/test.sh diff --git a/scratch/spock/py0.swift b/scratch/spock/py0.swift new file mode 100644 index 00000000..4b262431 --- /dev/null +++ b/scratch/spock/py0.swift @@ -0,0 +1,7 @@ + 
+import io; +import python; + +i = python("print(\"python works\")", + "repr(2+2)"); +printf("result of 2+2='%s'", i); diff --git a/scratch/spock/test.sh b/scratch/spock/test.sh old mode 100644 new mode 100755 index 06851fd4..09d2b13d --- a/scratch/spock/test.sh +++ b/scratch/spock/test.sh @@ -11,7 +11,7 @@ WORKFLOW=$1 MED106=/gpfs/alpine/world-shared/med106 ROOT=$MED106/sw/spock/gcc-10.3.0 -SWIFT=$ROOT/swift-t/2021-10-04 +SWIFT=$ROOT/swift-t/2021-10-04_B PATH=$SWIFT/stc/bin:$PATH PATH=$SWIFT/turbine/bin:$PATH From ec3b0ad2870f4ab34f5d5ee5837561e218c60d33 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 5 Oct 2021 14:42:49 -0500 Subject: [PATCH 176/601] Test Swift/T with new TF --- scratch/spock/py-tf.swift | 18 ++++++++++++++++++ scratch/spock/test.sh | 6 ++++-- 2 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 scratch/spock/py-tf.swift diff --git a/scratch/spock/py-tf.swift b/scratch/spock/py-tf.swift new file mode 100644 index 00000000..d48ef6a4 --- /dev/null +++ b/scratch/spock/py-tf.swift @@ -0,0 +1,18 @@ + +import io; +import python; + +r = python(---- +import sys, traceback +try: + sys.argv = ['python'] + import tensorflow as tf +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + sys.stdout.write('EXCEPTION in Python code: \\n' + repr(e) + ' ... \\n' + ''.join(s)) + sys.stdout.write('\\n') + sys.stdout.flush() +----, + "repr(tf.__version__)"); // +printf("TensorFlow version: %s", r); diff --git a/scratch/spock/test.sh b/scratch/spock/test.sh index 09d2b13d..aa591d17 100755 --- a/scratch/spock/test.sh +++ b/scratch/spock/test.sh @@ -11,11 +11,13 @@ WORKFLOW=$1 MED106=/gpfs/alpine/world-shared/med106 ROOT=$MED106/sw/spock/gcc-10.3.0 -SWIFT=$ROOT/swift-t/2021-10-04_B +SWIFT=$ROOT/swift-t/2021-10-05 PATH=$SWIFT/stc/bin:$PATH PATH=$SWIFT/turbine/bin:$PATH +PY=/gpfs/alpine/med106/world-shared/hsyoo/spock_tf2_py37_rocm42 + which swift-t export PROJECT=MED106 @@ -27,4 +29,4 @@ export PPN=2 export TURBINE_LAUNCHER=srun set -x -swift-t -m slurm -n $PROCS $WORKFLOW +swift-t -m slurm -n $PROCS -e PYTHONHOME=$PY $WORKFLOW From 083192372f068024410bb77061201486f183a7b0 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 5 Oct 2021 14:57:06 -0500 Subject: [PATCH 177/601] This file is no longer deprecated --- workflows/common/sh/sched-summit.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/common/sh/sched-summit.sh b/workflows/common/sh/sched-summit.sh index f4e831e2..fad0ac15 100644 --- a/workflows/common/sh/sched-summit.sh +++ b/workflows/common/sh/sched-summit.sh @@ -1,6 +1,4 @@ -# SCHED Summit - DEPRECATED - (Wozniak, 2020-10-29) -# Use summit-tf1 or summit-tf2 # Scheduler settings for Swift/Summit MACHINE="-m lsf" From 7271093809f5e3aa736a56f59f78bd835ab7224d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 5 Oct 2021 15:36:07 -0500 Subject: [PATCH 178/601] Move Summit jsrun flags here --- workflows/common/sh/sched-summit.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/common/sh/sched-summit.sh b/workflows/common/sh/sched-summit.sh index fad0ac15..9001713c 100644 --- a/workflows/common/sh/sched-summit.sh +++ b/workflows/common/sh/sched-summit.sh @@ -8,3 +8,5 @@ MACHINE="-m lsf" export PROJECT=${PROJECT:-MED106} # export TURBINE_OUTPUT_SOFTLINK=/dev/null + +export TURBINE_LAUNCH_OPTIONS="-a1 -g6 -c7" From 14078a13074ee42e6cb92a56375f9a0a5a0698ff Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 5 Oct 2021 15:36:32 -0500 Subject: [PATCH 179/601] Fixes to UPF workflow start script --- 
workflows/upf/swift/workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 897d5a67..60ef60c0 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -6,7 +6,7 @@ set -eu # Autodetect this workflow directory export EMEWS_PROJECT_ROOT=$( readlink --canonicalize $( dirname $0 )/.. ) export WORKFLOWS_ROOT=$( readlink --canonicalize $EMEWS_PROJECT_ROOT/.. ) -export BENCHMARKS_ROOT=$( readlink --canonicalize $EMEWS_PROJECT_ROOT/../../../Benchmarks.tf2 ) +export BENCHMARKS_ROOT=$( readlink --canonicalize $EMEWS_PROJECT_ROOT/../../../Benchmarks ) BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot3/P3B1:$BENCHMARKS_ROOT/Pilot3/P3B3:$BENCHMARKS_ROOT/Pilot3/P3B4:$BENCHMARKS_ROOT/Pilot3/P3B5 export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} @@ -42,6 +42,7 @@ fi # Set PYTHONPATH for BENCHMARK related stuff PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common PYTHONPATH+=:$WORKFLOWS_ROOT/common/python +export PYTHONPATH source_site env $SITE source_site sched $SITE From 8fcc59288b05f4aee048020d77e15663fbae358f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 5 Oct 2021 15:37:01 -0500 Subject: [PATCH 180/601] Comment out settings in UPF cfg-sys-1 --- workflows/upf/test/cfg-sys-1.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/upf/test/cfg-sys-1.sh b/workflows/upf/test/cfg-sys-1.sh index f5a5bcbe..058d05f5 100644 --- a/workflows/upf/test/cfg-sys-1.sh +++ b/workflows/upf/test/cfg-sys-1.sh @@ -29,14 +29,14 @@ export PPN=${PPN:-1} # export QUEUE=${QUEUE:-debug-flat-quad} # export PROJECT=${PROJECT:-ecp-testbed-01} # export PROJECT=Candle_ECP -export PROJECT=CSC249ADOA01 +# export PROJECT=CSC249ADOA01 # Summit: -export QUEUE=${QUEUE:-batch} -export PROJECT=med106 +# export QUEUE=${QUEUE:-batch} +# export PROJECT=med106 # export TURBINE_LAUNCH_OPTIONS="-a1 -g6 -c7" -export WALLTIME=${WALLTIME:-0:30} +# export WALLTIME=${WALLTIME:-0:30} # export MAIL_ENABLED=1 # export MAIL_ADDRESS=wozniak@mcs.anl.gov From f946befe53733f2f39b4da0fcabad825c062b487 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 5 Oct 2021 15:37:17 -0500 Subject: [PATCH 181/601] Remove Summit-specific settings --- workflows/upf/test/upf-1.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/upf/test/upf-1.sh b/workflows/upf/test/upf-1.sh index fefbe00d..c288525a 100755 --- a/workflows/upf/test/upf-1.sh +++ b/workflows/upf/test/upf-1.sh @@ -21,6 +21,4 @@ export EMEWS_PROJECT_ROOT export OBJ_RETURN="val_loss" CFG_SYS=$THIS/cfg-sys-1.sh -export TURBINE_LAUNCH_OPTIONS="-a1 -g6 -c7" - $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt From ab24b894a20bbf22a5964f61cf6a927d39dd5939 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 5 Oct 2021 15:38:49 -0500 Subject: [PATCH 182/601] env and sched settings for Spock --- workflows/common/sh/env-spock.sh | 25 +++++++++++++++++++++++++ workflows/common/sh/sched-spock.sh | 10 ++++++++++ 2 files changed, 35 insertions(+) create mode 100644 workflows/common/sh/env-spock.sh create mode 100644 workflows/common/sh/sched-spock.sh diff --git a/workflows/common/sh/env-spock.sh b/workflows/common/sh/env-spock.sh new file mode 100644 index 00000000..7e39d2c4 --- /dev/null +++ b/workflows/common/sh/env-spock.sh @@ -0,0 +1,25 @@ + +# ENV Spock + +# SWIFT_IMPL=echo +SWIFT_IMPL=py + +# 
CANDLE software installation root: +MED106=/gpfs/alpine/world-shared/med106 +ROOT=$MED106/sw/spock/gcc-10.3.0 + +# Add Swift/T to PATH +SWIFT=$ROOT/swift-t/2021-10-05 +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# Set up Python: +PY=/gpfs/alpine/med106/world-shared/hsyoo/spock_tf2_py37_rocm42 +export PYTHONHOME=$PY + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Dummy setting: EQ/R is not installed on Spock yet +EQR=not-installed diff --git a/workflows/common/sh/sched-spock.sh b/workflows/common/sh/sched-spock.sh new file mode 100644 index 00000000..32e659e2 --- /dev/null +++ b/workflows/common/sh/sched-spock.sh @@ -0,0 +1,10 @@ + +# SCHED Spock + +# Tell Swift/T to use SLURM: +MACHINE="-m slurm" +export TURBINE_LAUNCHER=srun + +# Default CANDLE account settings for Spock: +export PROJECT=${PROJECT:-MED106} +export QUEUE=${QUEUE:-ecp} From 25650921d4d607ea3ad0817fa80aab6045ca6841 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:49:50 -0500 Subject: [PATCH 183/601] Update to new Swift/T, new Conda --- workflows/common/sh/env-summit.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index c98194b5..18758c5c 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -15,7 +15,7 @@ set -eu # From Wozniak MED106=/gpfs/alpine/world-shared/med106 ROOT=$MED106/sw/gcc-8.3.1 -SWIFT=$ROOT/swift-t/2021-08-27 +SWIFT=$ROOT/swift-t/2021-10-06 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH @@ -25,9 +25,11 @@ R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R LD_LIBRARY_PATH+=:$R/lib # PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 -PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.2.0-py38-0 +# PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.2.0-py38-0 +PY=/gpfs/alpine/world-shared/med106/sw/conda/2021-10-06/envs/CANDLE-2021-10-06 LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY +PATH=$PY/bin:$PATH # /gpfs/alpine/world-shared/med106/sw/condaenv-200408 export LD_LIBRARY_PATH=$PY/lib:$LD_LIBRARY_PATH From 257d30cf4c4d8f14702662af13e53c8a866ed978 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:51:06 -0500 Subject: [PATCH 184/601] Build dataframe only if it doesn't exist --- workflows/cp-leaveout/py/data_setup.py | 36 ++++++++++++++++++-------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 20514c4a..30e757f7 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -2,7 +2,7 @@ # DATA SETUP PY import json -import os +import os, sys from pathlib import Path import traceback @@ -26,7 +26,7 @@ def pre_run(params): sys.stdout.flush() # check NVMe disk is available - username = os.environ['USER'] + # username = os.environ['USER'] # nvme_enabled = Path('/mnt/bb/{}'.format(username)).exists() nvme_enabled = False if nvme_enabled: @@ -79,28 +79,42 @@ def pre_run(params): return ModelResult.ERROR try: - print("data_setup: build_dataframe() ...") - start = time.time() - topN_to_uno.build_dataframe(args) - stop = time.time() - duration = stop - start - print("data_setup: build_dataframe() OK : " + - "%0.1f seconds." % duration) + print("data_setup: build_dataframe(output=%s) ..." 
% + args.output) + sys.stdout.flush() + if not os.path.exists(args.output): + sys.stdout.flush() + start = time.time() + topN_to_uno.build_dataframe(args) + stop = time.time() + duration = stop - start + print("data_setup: build_dataframe() OK : " + + "%0.1f seconds." % duration) + sys.stdout.flush() + else: + print("data_setup: dataframe exists: %s" % + os.path.realpath(args.output)) except topN_to_uno.topN_NoDataException: print("data_setup: caught topN_NoDataException: SKIP") + sys.stdout.flush() return ModelResult.SKIP except ValueError: print("data_setup: caught ValueError for node: '%s'" % - params["node"]) # new 2019-12-02 + params["node"]) # new 2019-12-02 + sys.stdout.flush() traceback.print_exc(file=sys.stdout) return ModelResult.ERROR except Exception as e: print("data_setup: error in build_dataframe!\n" + str(e)) + sys.stdout.flush() traceback.print_exc() + sys.stdout.flush() return ModelResult.ERROR print("data_setup.pre_run() done.") + sys.stdout.flush() return ModelResult.SUCCESS def post_run(params, output_dict): - print("post_run") + print("data_setup(): post_run") + sys.stdout.flush() return ModelResult.SUCCESS From 108f433187a08669b1557b1e471784713b5741ca Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:56:45 -0500 Subject: [PATCH 185/601] WS --- workflows/cp-leaveout/py/plangen.py | 58 ++++++++++++++++------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index dd5efe25..cd13aa28 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -108,7 +108,7 @@ def validate_args(args): if verbose: print("Writing plan information to %s" % os.path.abspath(args.out_dir)) - # expand, validate and load input feature-set content lists + # expand, validate and load input feature-set content lists fs_content = [] args.fs_lines = [] file_error = False @@ -241,7 +241,7 @@ def partition(self, base, size=None, count=0, name=None): omit_size = base_len - size increment = min(size, omit_size) - # omit consecutive blocks of feature-name entries + # omit consecutive blocks of feature-name entries for i in range(count): org = i * increment if org >= base_len: @@ -384,7 +384,7 @@ class RunStat(Enum): # subplan execution status ) _select_row_from_planstat = """ - SELECT rowid, + SELECT rowid, plan_name, create_date, feature_sets, partitions, nbr_subplans FROM planstat WHERE plan_name='{}' @@ -481,15 +481,23 @@ class RunStat(Enum): # subplan execution status DELETE FROM runhist where plan_id = {} """ + +def log(msg): + if DEBUG_SQL: + with open("plangen_db.log", "a") as fp: + fp.write(msg + "\n") + fp.flush() + + #------------------------------------------------------------------------------ # "Plan management" Database functions # -# db_connect - establish database connection returning conn handle -# execute_sql_stmt - execute a SQL statement with optional error trap +# db_connect - establish database connection returning conn handle +# execute_sql_stmt - execute a SQL statement with optional error trap # plan_prep - prepare for the execution of a multi-step "plan" -# start_subplan - start a subplan, (ex. '1.4.8'), write RunhistRow -# stop_subplan - stop a subplan, update RunhistRow -# get_subplan_runhist - return a RunhistRow for a given subplan +# start_subplan - start a subplan, (ex. 
'1.4.8'), write RunhistRow +# stop_subplan - stop a subplan, update RunhistRow +# get_subplan_runhist - return a RunhistRow for a given subplan # plan_remove - remove all database records for the named plan #------------------------------------------------------------------------------ @@ -556,7 +564,7 @@ def db_connect(db_path): """Connect to the plan management database. Establish a connection to the sqlite3 database contained in the named file. - A plan management database is created and populated at db_path if the file + A plan management database is created and populated at db_path if the file does not exist. Args @@ -577,7 +585,7 @@ def db_connect(db_path): print('db_connect', error) raise - # create plan management tables on initial database allocation + # create plan management tables on initial database allocation if conn and not prev_allocated: complete = execute_sql_stmt(conn, _planstat_ddl) complete &= execute_sql_stmt(conn, _runhist_ddl) @@ -675,7 +683,7 @@ def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): partitions = get_plan_fs_parts(plan_dict) nbr_subplans = get_plan_nbr_subplans(plan_dict) - # determine if a plan of the given name has already been registered + # determine if a plan of the given name has already been registered conn = db_connect(db_path) plan_key = _get_planstat_key(plan_path) stmt = _select_row_from_planstat.format(plan_key) @@ -689,7 +697,7 @@ def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): plan_rec = PlanstatRow._make(row) # column-name addressable rowid = plan_rec.rowid # the unique rowid will be the uniquifier returned - # compare run_type to initial expectations + # compare run_type to initial expectations error = False if run_type == RunType.RUN_ALL and rowid > 0: @@ -720,7 +728,7 @@ def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): status = execute_sql_stmt(conn, stmt, cursor=csr) rowid = csr.lastrowid - # cleanup resources and return uniquifier or error indicator + # cleanup resources and return uniquifier or error indicator csr.close() conn.commit() @@ -766,7 +774,7 @@ def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=No if runhist_rec.status == RunStat.COMPLETE.name: skip = True - # construct/reinit a new runhist record + # construct/reinit a new runhist record if not skip: currtime = datetime.now() start_time = currtime.isoformat(timespec=ISO_TIMESTAMP) @@ -853,7 +861,7 @@ def stop_subplan(db_path, plan_id=None, subplan_id=None, comp_info_dict={}): comp_dict['val_r2'], comp_dict['lr'], other_info, - # key spec + # key spec plan_id, subplan_id ) @@ -949,7 +957,7 @@ def _delete_runhistory(conn, plan_id): #------------------------------------------------------------------------------ -# Plan navigation, content retrieval +# Plan navigation, content retrieval #------------------------------------------------------------------------------ def load_plan(filepath): @@ -1126,7 +1134,7 @@ def get_subplan_features(plan_dict, subplan_id, parent_features=False): train_set = content['train'][0] fs_names = [name for name in train_set.keys()] - # categorize the results + # categorize the results result = {} result[0] = fs_names result['train'] = {} @@ -1152,7 +1160,7 @@ def get_subplan_features(plan_dict, subplan_id, parent_features=False): return result, result[0], result['train'], result['val'] #------------------------------------------------------------------------------ -# Plan construction +# Plan construction #------------------------------------------------------------------------------ 
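
The plan-management entry points listed in the comment block earlier
(plan_prep, start_subplan, stop_subplan, ...) compose roughly as follows; a
minimal sketch with illustrative file names and subplan id, and no error
handling:

    import plangen
    plan_id = plangen.plan_prep("plan.db",
                                "plangen_cell8-p2_drug8-p2.json",
                                plangen.RunType.RUN_ALL)
    rc = plangen.start_subplan("plan.db",
                               "plangen_cell8-p2_drug8-p2.json",
                               plan_id=plan_id,
                               subplan_id="1.4.8",
                               run_type=plangen.RunType.RUN_ALL)
    if rc == 0:
        # ... run the training for subplan 1.4.8, then record completion
        # via plangen.stop_subplan(...) ...
        pass
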
def build_dictionary_from_lists(seq_list, names): @@ -1203,7 +1211,7 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ all_parts = [] #flat_partitions = [] # preserve, used for file-based approach - #files = [] # preserve, used for file-based approach + #files = [] # preserve, used for file-based approach #sequence = 0 # preserve, used for file-based approach xxx = False @@ -1234,13 +1242,13 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ else: train.append(section) - # generate next depth/level (successor) plans + # generate next depth/level (successor) plans curr_plan_id = '{}.{}'.format(parent_plan_id, step + 1) args.plan_dict[curr_plan_id] = {'val': val, 'train': train} data_name = '{}.{}'.format(data_pfx, step + 1) plan_name = '{}.{}'.format(plan_pfx, step + 1) - # depth-first, shorthand representation of tree showing first feature names + # depth-first, shorthand representation of tree showing first feature names if args.debug: indent = ' ' * (depth * 4) print(indent, curr_plan_id) @@ -1345,7 +1353,7 @@ def write_dict_to_json(dictionary, fname): """ #---------------------------------------------------------------------------------- -# mainline +# mainline #---------------------------------------------------------------------------------- def main(): @@ -1364,10 +1372,10 @@ def main(): maxdepth = args.maxdepth ) - # feature_set_content = [cell_names, drug_names] + # feature_set_content = [cell_names, drug_names] # feature_set_content = [synthetic_cell_names, synthetic_drug_names] - # remove by-1 dimensions, they do not need to be represented in the plan explicitly + # remove by-1 dimensions, they do not need to be represented in the plan explicitly while True: try: ndx = args.fs_parts.index(1) @@ -1378,7 +1386,7 @@ def main(): except ValueError: break - # Plan generation + # Plan generation data_fname_pfx = os.path.join(args.out_dir, 'DATA.1') plan_fname_pfx = os.path.join(args.out_dir, 'PLAN.1') From a3d40fb1ce9eff2469ba3d65c8f2b3d26b4099a9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:57:09 -0500 Subject: [PATCH 186/601] Improve logging --- workflows/cp-leaveout/py/plangen.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index cd13aa28..16c91d4d 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -529,11 +529,8 @@ def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): else: lclcsr = conn.cursor() try: - if DEBUG_SQL: - with open("plangen_db.log", "a") as fp: - fp.write("STMT: " + stmt + "\n") - db_exception = False + log("STMT: " + stmt) lclcsr.execute(stmt) except db_Error as e: @@ -759,20 +756,35 @@ def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=No already exists for the plan/subplan and is marked COMPLETE. 
""" + print("plangen: start_subplan: subplan_id=%s" % subplan_id) + sys.stdout.flush() conn = db_connect(db_path) csr = conn.cursor() skip = False + print("plangen: start_subplan: run_type: '%s'" % str(run_type)) + print("plangen: start_subplan: run_type type: %s" % str(type(run_type))) + print("plangen: start_subplan: base: '%s'" % str(RunType.RESTART)) + sys.stdout.flush() + # skip previously completed work if RESTART if run_type == RunType.RESTART: + log("plangen: start_subplan: checking restart: %i" % plan_id) + sys.stdout.flush() stmt = _select_row_from_runhist.format(plan_id, subplan_id) execute_sql_stmt(conn, stmt, cursor=csr) row = csr.fetchone() if row: + log("plangen: start_subplan: found row.") runhist_rec = RunhistRow._make(row) + log("plangen: start_subplan: found '%s'" % runhist_rec.status) if runhist_rec.status == RunStat.COMPLETE.name: skip = True + log("plangen: start_subplan: skip %r" % skip) + else: + print("plangen: start_subplan: not checking restart") + sys.stdout.flush() # construct/reinit a new runhist record if not skip: @@ -793,8 +805,10 @@ def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=No conn.close() if skip: + print("plangen: start_subplan: subplan_id=%s: SKIP" % subplan_id) return -1 else: + print("plangen: start_subplan: subplan_id=%s: RUN" % subplan_id) return 0 From a6faf8e55bbf8985c0f28530936b7b7edbfd2356 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:57:22 -0500 Subject: [PATCH 187/601] Fix return signature --- workflows/cp-leaveout/py/plangen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index 16c91d4d..5bdbb9e2 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -1142,7 +1142,7 @@ def get_subplan_features(plan_dict, subplan_id, parent_features=False): # acquire feature_set names populated in the plan content, _ = get_subplan(plan_dict, subplan_id) if not content: - return None, None + return None, None, None, None # peek inside the training set to capture active feature-set names train_set = content['train'][0] From 614587739c5ad25f476c921a26262bf0915f5ce3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:57:37 -0500 Subject: [PATCH 188/601] Fix logger --- workflows/cp-leaveout/scripts/compare-errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/compare-errors.py b/workflows/cp-leaveout/scripts/compare-errors.py index 85072ae7..29ac45a0 100644 --- a/workflows/cp-leaveout/scripts/compare-errors.py +++ b/workflows/cp-leaveout/scripts/compare-errors.py @@ -23,7 +23,7 @@ args = parser.parse_args() # logging.basicConfig(level=logging.DEBUG, format="%(message)s") -# logger = logging.getLogger("extract_node_info") +# logger = logging.getLogger("compare_errors") node_pkl_1 = args.directory1 + "/node-info.pkl" node_pkl_2 = args.directory2 + "/node-info.pkl" From a9ff215f0cb28e309cbd2e129682b25ab2546b3b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:57:44 -0500 Subject: [PATCH 189/601] Fix use of size getter --- workflows/cp-leaveout/scripts/data-size.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/data-size.py b/workflows/cp-leaveout/scripts/data-size.py index a4378c0a..553c4016 100644 --- a/workflows/cp-leaveout/scripts/data-size.py +++ b/workflows/cp-leaveout/scripts/data-size.py @@ -40,7 +40,7 @@ elif ext == ".feather": 
print("read feather " + str(args.input)) df = pd.read_feather(args.input).fillna(0) - print(df.shape) + print(df.dtypes) # total size: (529940, 6215) # store = pd.HDFStore(args.input, "r", complevel=9, complib="blosc:snappy") From 3af407914e95e6b90241d80fa6c23f8006e946c0 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:58:11 -0500 Subject: [PATCH 190/601] flake8 fixes --- workflows/cp-leaveout/scripts/epoch-time.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/workflows/cp-leaveout/scripts/epoch-time.py b/workflows/cp-leaveout/scripts/epoch-time.py index b4ac0747..401abf08 100644 --- a/workflows/cp-leaveout/scripts/epoch-time.py +++ b/workflows/cp-leaveout/scripts/epoch-time.py @@ -2,15 +2,13 @@ # EPOCH TIME PY # See epoch-time.sh -import datetime, sys, time +import datetime, sys -from Node import Node -from utils import fail # Main data structure: # map from stage number to list of epoch times in seconds stages = {} -for stage in range(1,6+1): +for stage in range(1, 6+1): stages[stage] = [] # Files processed: @@ -25,8 +23,8 @@ line = sys.stdin.readline() - if len(line) == 0: break # EOF - if len(line) == 1: continue # Blank line + if len(line) == 0: break # EOF + if len(line) == 1: continue # Blank line tokens = line.split() if tokens[0] == "epoch-time:": @@ -50,7 +48,7 @@ if tokens[2] == "Epoch": ts = tokens[0] + " " + tokens[1] dt = datetime.datetime.strptime(ts, "%Y-%m-%d %H:%M:%S") - if start_current == None: + if start_current is None: start_current = dt continue start = start_current.timestamp() @@ -60,7 +58,7 @@ start_current = dt stages[stage_current].append(duration) -for stage in range(1,6+1): +for stage in range(1, 6+1): n = len(stages[stage]) if n == 0: avg = -1 From cd4e80feb0f5ac4cbb8cf7e06ec9c7f4aaaa5399 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:58:35 -0500 Subject: [PATCH 191/601] Improvements to baseline error workflow --- .../cp-leaveout/swift/baseline-error.swift | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/workflows/cp-leaveout/swift/baseline-error.swift b/workflows/cp-leaveout/swift/baseline-error.swift index 96b5bf51..409acd2b 100644 --- a/workflows/cp-leaveout/swift/baseline-error.swift +++ b/workflows/cp-leaveout/swift/baseline-error.swift @@ -9,6 +9,7 @@ import assert; import files; import io; import python; +import string; import sys; import candle_utils; @@ -21,6 +22,8 @@ string dataframe_csv = argv("dataframe_csv"); string reference = argv("reference"); // List of node IDs, one per line file file_nodes = input(argv("nodes")); +// Mapping from node ID to epochs, one per line +// file file_epochs = input(argv("epochs")); int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); // == Command-line Arguments End == @@ -30,23 +33,37 @@ string exp_id = getenv("EXPID"); string turbine_output = getenv("TURBINE_OUTPUT"); // == Environment Settings End == -// Read file of node IDs -string lines[] = file_lines(file_nodes); +// For compatibility with obj(): +global const string FRAMEWORK = "keras"; + +// Read file of node IDs: +string nodes_lines[] = file_lines(file_nodes); + +// Read file of epochs: +// string epochs_lines[] = file_lines(file_epochs); + +// // Mapping from node ID to epochs: +// string map_epochs[string]; +// foreach line in epochs_lines +// { +// tokens = split(line); +// map_epochs[tokens[0]] = tokens[1]; +// } // Resultant output values: string results[]; -// Basic parameters for all runs as JSON. 
-// Keys node and use_exported_data must be filled in later. -string params_basic = +// Templated parameters for all runs as JSON. +// Some keys must be filled in later. +string params_template = ---- -{ +{ "config_file": "uno_auc_model.txt", "cache": "cache/top6_auc", "dataframe_from": "%s", "save_weights": "save/model.h5", "gpus": "0", -"epochs": 50, +"epochs": %i, "es": "True", "node": "%s", "use_exported_data": "%s" @@ -54,12 +71,14 @@ string params_basic = ----; // Evaluate each parameter set -foreach node, i in lines +foreach node, i in nodes_lines { printf("node: %s", node); // Fill in missing hyperparameters: string training_data = "%s/run/%s/topN.uno.h5" % (reference, node); - string params = params_basic % (dataframe_csv, node, training_data); + // int epochs = string2int(map_epochs[node]); + int epochs = 250; + string params = params_template % (dataframe_csv, epochs, node, training_data); // NOTE: obj() is in the obj_*.swift supplied by workflow.sh results[i] = obj(params, node); assert(results[i] != "EXCEPTION", "exception in obj()!"); From 53873d5d46af46d1caf8745cebecfc7811f0b2b2 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:58:57 -0500 Subject: [PATCH 192/601] Note and fix in plangen test --- workflows/cp-leaveout/swift/plangen_2.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/plangen_2.swift b/workflows/cp-leaveout/swift/plangen_2.swift index 06e2faef..0649252d 100644 --- a/workflows/cp-leaveout/swift/plangen_2.swift +++ b/workflows/cp-leaveout/swift/plangen_2.swift @@ -8,6 +8,8 @@ // calls to python_db() on rank DB corresponding to // environment variable TURBINE_DB_WORKERS: +// Use plangen from Supervisor! + pragma worktypedef DB; @dispatch=DB @@ -62,7 +64,7 @@ except Exception as e: import sys, traceback import plangen try: - result = str(plangen.start_subplan('%s', '%s', %s, '%s', '%s')) + result = str(plangen.start_subplan('%s', '%s', %s, '%s', %s)) except Exception as e: info = sys.exc_info() s = traceback.format_tb(info[2]) From 5a11f03c92899137ac20f44ef459825df6ab2f12 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 17:59:33 -0500 Subject: [PATCH 193/601] Turn off set -x --- workflows/cp-leaveout/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index d265c4af..98626b0f 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -40,7 +40,7 @@ then exit 1 fi -set -x +# set -x if ! { get_site $1 # Sets SITE get_expid $2 # Sets EXPID From 4800fa254ab94e86b273f7b3a699ba1a5cf8dbbb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 18:00:07 -0500 Subject: [PATCH 194/601] Note to self --- workflows/cp-leaveout/swift/workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 98626b0f..14daf37e 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -64,7 +64,8 @@ set +x source_site env $SITE source_site sched $SITE -PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup +# Note: insist on plangen from Supervisor! 
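# (Prepending rather than appending means this plangen is found before any
#  other copy that may already be on PYTHONPATH.)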
+PYTHONPATH=$EMEWS_PROJECT_ROOT/py:$PYTHONPATH # For plangen, data_setup PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools, model_runner APP_PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup APP_PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools From 7f404e3cd762a242bea92eb06a4bb5480bec8688 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 18:01:24 -0500 Subject: [PATCH 195/601] Use fixed-width integer --- workflows/cp-leaveout/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 14daf37e..4d6e8d28 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -117,7 +117,7 @@ then # If output.txt does not exist, assume the moves already happened echo "WARNING: The outputs were already moved from $EXPID" else - next $TURBINE_OUTPUT/restarts/%i # cf. utils.sh:next() + next "$TURBINE_OUTPUT/restarts/%02i" # cf. utils.sh:next() PRIOR_RUN=$REPLY echo "Moving old outputs to $PRIOR_RUN" mkdir -pv $PRIOR_RUN From 2a1aa1655065714a2cce2a871d04ddbbd752fdf8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 18:04:19 -0500 Subject: [PATCH 196/601] Now moving just logs, not DB or TF run data --- workflows/cp-leaveout/swift/workflow.sh | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 4d6e8d28..ce58e5d2 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -125,20 +125,7 @@ then $TURBINE_OUTPUT/out $TURBINE_OUTPUT/turbine* $TURBINE_OUTPUT/jobid.txt ) - mv ${PRIORS[@]} $PRIOR_RUN - cp -v $TURBINE_OUTPUT/cplo.db $PRIOR_RUN - echo $TURBINE_OUTPUT/run/*/save - for D in $TURBINE_OUTPUT/run/*/save - do - cd $D - echo D=$D - shopt -s nullglob - for f in *.json *.h5 *.log - do - : # cp -v --backup=numbered $f $f.bak - done - cd - - done + mv ${PRIORS[@]} $PRIOR_RUN fi else # Not a restart if [[ -f $TURBINE_OUTPUT/output.txt ]] From b87081f35bafecd46502af9e234c5d92a6bdf812 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 18:57:48 -0500 Subject: [PATCH 197/601] Merge --- workflows/cp-leaveout/py/plangen.py | 3 ++- workflows/cp-leaveout/scripts/epoch-count.sh | 2 +- workflows/cp-leaveout/swift/workflow.sh | 11 +++++++++-- workflows/cp-leaveout/test/cfg-sys-1.sh | 12 ++++++++---- workflows/cp-leaveout/test/cfg-sys-512.sh | 8 ++++---- workflows/cp-leaveout/test/test-bl-1.sh | 8 ++++++-- workflows/mlrMBO/swift/workflow.sh | 3 +++ workflows/mlrMBO/test/cfg-sys-nightly.sh | 6 +++--- workflows/mlrMBO/test/test-nightly.sh | 1 - workflows/upf/swift/workflow.sh | 5 +++-- workflows/upf/test/cfg-sys-1.sh | 3 ++- 11 files changed, 41 insertions(+), 21 deletions(-) diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index 5bdbb9e2..d99fbb45 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -22,7 +22,8 @@ ISO_TIMESTAMP = "seconds" # timestamp to ISO string ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp -DEBUG_SQL = False +DEBUG_SQL = True + def isempty(path): """Determine whether the given directory is empty.""" diff --git a/workflows/cp-leaveout/scripts/epoch-count.sh b/workflows/cp-leaveout/scripts/epoch-count.sh index 3f4979c1..478f7f06 100755 --- a/workflows/cp-leaveout/scripts/epoch-count.sh +++ 
b/workflows/cp-leaveout/scripts/epoch-count.sh @@ -30,4 +30,4 @@ do echo -n "$LOG :: " # Pull out the last "Epoch:" line, print only the number: sed -n '/Epoch:/h;${g;s/.*Epoch: \([0-9]*\).*/\1/;p}' $LOG -done | nl # | sort -r -n -k 2 | column -t +done | nl | sort -r -n -k 4 | column -t diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index ce58e5d2..5d582a01 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -192,9 +192,11 @@ else STDOUT="" fi +# TURBINE_STDOUT="" export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" mkdir -pv $TURBINE_OUTPUT/out +# set -x swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p \ @@ -221,11 +223,16 @@ swift-t -O 0 -n $PROCS \ -e IGNORE_ERRORS \ -e TURBINE_DB_WORKERS=1 \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} | \ - tee $STDOUT + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} + # | \ + # tee $STDOUT + +# # -j /usr/bin/java # Give this to Swift/T if needed for Java # -e PYTHONUNBUFFERED=1 # May be needed if error output is being lost +# -e PYTHONVERBOSE=1 # Debugs module load confusion + if (( ${PIPESTATUS[0]} )) then diff --git a/workflows/cp-leaveout/test/cfg-sys-1.sh b/workflows/cp-leaveout/test/cfg-sys-1.sh index 02f7bb0f..7df9707e 100644 --- a/workflows/cp-leaveout/test/cfg-sys-1.sh +++ b/workflows/cp-leaveout/test/cfg-sys-1.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-5} +export PROCS=${PROCS:-2} # # Number of processes to use for resident tasks, # # i.e., the number of mlrMBO instances to run @@ -15,15 +15,19 @@ export PROCS=${PROCS:-5} export PPN=${PPN:-1} # For Theta: -export QUEUE=${QUEUE:-debug-flat-quad} +# export QUEUE=${QUEUE:-debug-flat-quad} # export QUEUE=R.candle -export WALLTIME=${WALLTIME:-02:00:00} +# export QUEUE=debug + +export WALLTIME=${WALLTIME:-00:05:00} +# export WALLTIME=00:05:00 # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" -#export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +# export TURBINE_LAUNCH_OPTIONS="-g3 -c21 -a2" # For PPN=2 # Does not work export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" #export PROJECT=Candle_ECP diff --git a/workflows/cp-leaveout/test/cfg-sys-512.sh b/workflows/cp-leaveout/test/cfg-sys-512.sh index 2c77c29e..d99071e3 100644 --- a/workflows/cp-leaveout/test/cfg-sys-512.sh +++ b/workflows/cp-leaveout/test/cfg-sys-512.sh @@ -6,22 +6,22 @@ # and 1 process is reserved for the DB client. # The default of 4 gives you 2 workers, # i.e., 2 concurrent Keras runs. 
-export PROCS=${PROCS:-4} +export PROCS=${PROCS:-2} # MPI processes per node # Cori has 32 cores per node, 128GB per node export PPN=${PPN:-1} # For Theta: -export QUEUE=${QUEUE:-debug-flat-quad} +# export QUEUE=${QUEUE:-debug-flat-quad} # export QUEUE=R.candle -export WALLTIME=${WALLTIME:-12:00:00} +export WALLTIME=${WALLTIME:-2:00:00} # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" -#export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" #export PROJECT=Candle_ECP diff --git a/workflows/cp-leaveout/test/test-bl-1.sh b/workflows/cp-leaveout/test/test-bl-1.sh index 4e60bc53..bfe0d4ed 100755 --- a/workflows/cp-leaveout/test/test-bl-1.sh +++ b/workflows/cp-leaveout/test/test-bl-1.sh @@ -46,8 +46,9 @@ CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labeled.hdf5 -BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno # BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno +BENCHMARK_DATA=$CANDLE_DATA # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported @@ -66,7 +67,10 @@ do fi done -export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +if [[ $SITE == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +fi # Submit job export WORKFLOW_SWIFT=baseline-error.swift diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 574d9647..2e901964 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -159,7 +159,10 @@ else STDOUT="" fi +echo WF LLP $LD_LIBRARY_PATH + # ALW 2021-01-21: Please don't comment out the "-o $TURBINE_OUTPUT/workflow.tic" option below; otherwise, we get permissions issues on Biowulf. Thanks! 
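# (set -x makes the shell echo each command before running it, so the
#  swift-t invocation below is logged verbatim; remove it to quiet the output.)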
+set -x swift-t -O 0 -n $PROCS \ -o $TURBINE_OUTPUT/workflow.tic \ ${MACHINE:-} \ diff --git a/workflows/mlrMBO/test/cfg-sys-nightly.sh b/workflows/mlrMBO/test/cfg-sys-nightly.sh index f5e3b025..a62ffd8e 100644 --- a/workflows/mlrMBO/test/cfg-sys-nightly.sh +++ b/workflows/mlrMBO/test/cfg-sys-nightly.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-17} +export PROCS=${PROCS:-3} # MPI processes per node # Cori has 32 cores per node, 128GB per node @@ -13,8 +13,8 @@ export PPN=${PPN:-1} # For Theta: # export QUEUE=${QUEUE:-debug-flat-quad} -# export WALLTIME=${WALLTIME:-00:10:00} -export WALLTIME=${WALLTIME:-120} +export WALLTIME=${WALLTIME:-00:05:00} +# export WALLTIME=${WALLTIME:-120} #export PROJECT=Candle_ECP diff --git a/workflows/mlrMBO/test/test-nightly.sh b/workflows/mlrMBO/test/test-nightly.sh index 9c96e281..f6b7b069 100755 --- a/workflows/mlrMBO/test/test-nightly.sh +++ b/workflows/mlrMBO/test/test-nightly.sh @@ -50,7 +50,6 @@ fi # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME - # Check job output TURBINE_OUTPUT=$( readlink turbine-output ) echo $TURBINE_OUTPUT diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 60ef60c0..68fca8d9 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -97,10 +97,11 @@ site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs # in the declaration below are handled by the wrapper scripts if [[ ${site2} == "summit" && ${CANDLE_RUN_WORKFLOW:-0} != 1 ]] then - export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" + export TURBINE_LAUNCH_OPTIONS="-a1 -g1 -c7" fi -TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" +# TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" +TURBINE_STDOUT= swift-t -n $PROCS \ -o $TURBINE_OUTPUT/workflow.tic \ diff --git a/workflows/upf/test/cfg-sys-1.sh b/workflows/upf/test/cfg-sys-1.sh index 058d05f5..8a436a88 100644 --- a/workflows/upf/test/cfg-sys-1.sh +++ b/workflows/upf/test/cfg-sys-1.sh @@ -12,7 +12,7 @@ export PROCS=${PROCS:-2} # MPI processes per node. This should not exceed PROCS. # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} +export PPN=${PPN:-2} #export QUEUE=${QUEUE:-batch} @@ -37,6 +37,7 @@ export PPN=${PPN:-1} # export TURBINE_LAUNCH_OPTIONS="-a1 -g6 -c7" # export WALLTIME=${WALLTIME:-0:30} +echo WALLTIME: $WALLTIME # export MAIL_ENABLED=1 # export MAIL_ADDRESS=wozniak@mcs.anl.gov From a117fec140bd5c008c5c6c34bfbdd4bb95255f49 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Oct 2021 19:00:03 -0500 Subject: [PATCH 198/601] Merge --- workflows/cp-leaveout/py/plangen.py | 4 ++-- workflows/cp-leaveout/scripts/Node.py | 1 + workflows/cp-leaveout/swift/plangen_2.swift | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index d99fbb45..260634ec 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -736,7 +736,7 @@ def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): return rowid -def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=None): +def start _subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=None): """Schedule the execution of a subplan. 
This function writes a RunhistRow record to the runhist table indicating that @@ -760,7 +760,7 @@ def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=No print("plangen: start_subplan: subplan_id=%s" % subplan_id) sys.stdout.flush() conn = db_connect(db_path) - csr = conn.cursor() + csr = conn.cu rsor() skip = False print("plangen: start_subplan: run_type: '%s'" % str(run_type)) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 7d014b18..8150723b 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -196,6 +196,7 @@ def parse_error_data(self, fp): line = fp.readline() tokens = check_token(line, 2, "mse:") self.mse = float(tokens[3]) + print("mse: " + str(self.mse)) line = fp.readline() tokens = check_token(line, 2, "mae:") self.mae = float(tokens[3]) diff --git a/workflows/cp-leaveout/swift/plangen_2.swift b/workflows/cp-leaveout/swift/plangen_2.swift index 0649252d..cb925439 100644 --- a/workflows/cp-leaveout/swift/plangen_2.swift +++ b/workflows/cp-leaveout/swift/plangen_2.swift @@ -64,7 +64,7 @@ except Exception as e: import sys, traceback import plangen try: - result = str(plangen.start_subplan('%s', '%s', %s, '%s', %s)) + result = str(plangen.start_subplan('%s', '%s', %s, '%s', '%s')) except Exception as e: info = sys.exc_info() s = traceback.format_tb(info[2]) From 047792ad3d37165f27cfcecaa3a7fe465e2c06ed Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 12 Oct 2021 09:56:06 -0500 Subject: [PATCH 199/601] Merge --- workflows/GA/swift/workflow.sh | 5 +- workflows/GA/test/cfg-prm-1.sh | 2 - workflows/async-search/swift/workflow.sh | 3 - workflows/async-search/test/test-1.sh | 16 ++-- workflows/async-search/test/test-5K.sh | 15 ++- workflows/common/python/log_tools.py | 27 +++++- workflows/common/python/model_runner.py | 109 ++++++++++++---------- workflows/common/sh/env-summit.sh | 20 ++-- workflows/common/swift/obj_app.swift | 2 +- workflows/cp-leaveout/test/cfg-sys-1.sh | 5 +- workflows/cp-leaveout/test/cfg-sys-512.sh | 11 ++- workflows/cp-leaveout/test/test-1.sh | 3 +- workflows/cp-leaveout/test/test-512.sh | 2 + workflows/upf/swift/workflow.sh | 7 +- workflows/upf/test/upf-1.txt | 2 +- 15 files changed, 133 insertions(+), 96 deletions(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index f6836bf1..a39256da 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -14,7 +14,7 @@ then exit 1 fi export BENCHMARKS_ROOT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) -BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/TC1:$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/examples/ADRP +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/TC1:$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot2/P2B1 export BENCHMARK_TIMEOUT export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} @@ -62,9 +62,6 @@ source_site sched $SITE EQPY=${EQPY:-$WORKFLOWS_ROOT/common/ext/EQ-Py} PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$EQPY - -PYTHONPATH+=:$WORKFLOWS_ROOT/common/python/ - export TURBINE_JOBNAME="JOB:${EXPID}" CMD_LINE_ARGS=( -ga_params=$PARAM_SET_FILE -seed=$SEED diff --git a/workflows/GA/test/cfg-prm-1.sh b/workflows/GA/test/cfg-prm-1.sh index 0e5ccbbe..520afc7c 100644 --- a/workflows/GA/test/cfg-prm-1.sh +++ b/workflows/GA/test/cfg-prm-1.sh @@ -21,8 +21,6 @@ if [ 
"$MODEL_NAME" = "combo" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_param_space_ga.json} elif [ "$MODEL_NAME" = "p1b1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json} -elif [ "$MODEL_NAME" = "adrp" ]; then - PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/adrp_param_space_ga.json} elif [ "$MODEL_NAME" = "nt3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json} elif [ "$MODEL_NAME" = "tc1" ]; then diff --git a/workflows/async-search/swift/workflow.sh b/workflows/async-search/swift/workflow.sh index 4f172182..b84fbcac 100755 --- a/workflows/async-search/swift/workflow.sh +++ b/workflows/async-search/swift/workflow.sh @@ -118,9 +118,6 @@ cp $WORKFLOWS_ROOT/async-search/python/$PY_PACKAGE.py $CFG_SYS $CFG_PRM $TURBINE # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} -echo "OBJ_MODULE: $OBJ_MODULE" -echo "OBJ_DIR: $OBJ_DIR" - # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/async-search/test/test-1.sh b/workflows/async-search/test/test-1.sh index 0c487327..3544bbbd 100755 --- a/workflows/async-search/test/test-1.sh +++ b/workflows/async-search/test/test-1.sh @@ -48,7 +48,7 @@ export OBJ_RETURN="val_loss" # Set OBJ_DIR export OBJ_DIR=$EMEWS_PROJECT_ROOT/obj_folder -export OBJ_MODULE=obj_app +# export OBJ_MODULE= if [[ $SITE == "theta" ]] then @@ -58,21 +58,19 @@ fi # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM -# Check job output -TURBINE_OUTPUT=$( readlink turbine-output ) -OUTPUT=turbine-output/output.txt -WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) - # Wait for job queue_wait +cp $0 $TURBINE_OUTPUT +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + SCRIPT=$( basename $0 .sh ) -check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID -check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID +check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID echo "$SCRIPT: SUCCESS" - # Local Variables: # c-basic-offset: 2; # End: diff --git a/workflows/async-search/test/test-5K.sh b/workflows/async-search/test/test-5K.sh index 222caae9..0f6e0300 100755 --- a/workflows/async-search/test/test-5K.sh +++ b/workflows/async-search/test/test-5K.sh @@ -48,7 +48,7 @@ export OBJ_RETURN="val_loss" # Set OBJ_DIR export OBJ_DIR=$EMEWS_PROJECT_ROOT/obj_folder -export OBJ_MODULE=obj_app +# export OBJ_MODULE= if [[ $SITE == "theta" ]] then @@ -58,17 +58,16 @@ fi # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM -# Check job output -TURBINE_OUTPUT=$( readlink turbine-output ) -OUTPUT=turbine-output/output.txt -WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) - # Wait for job queue_wait +cp $0 $TURBINE_OUTPUT +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + SCRIPT=$( basename $0 .sh ) -check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID -check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID +check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID echo "$SCRIPT: SUCCESS" diff --git a/workflows/common/python/log_tools.py b/workflows/common/python/log_tools.py index 557fb5c0..d60cf489 100644 --- a/workflows/common/python/log_tools.py +++ b/workflows/common/python/log_tools.py @@ -5,6 +5,8 @@ import sys +logger = None + def get_logger(logger, name, 
stream=sys.stdout): """ Set up logging """ if logger is not None: @@ -13,8 +15,31 @@ def get_logger(logger, name, stream=sys.stdout): logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) h = logging.StreamHandler(stream=stream) - fmtr = logging.Formatter('%(asctime)s %(name)s %(levelname)-6s %(message)s', + fmtr = logging.Formatter('%(asctime)s %(name)s %(levelname)-5s %(message)s', datefmt='%Y-%m-%d %H:%M:%S') h.setFormatter(fmtr) logger.addHandler(h) return logger + + + +# def log(msg): +# global logger +# logger.info(msg) + + +# def log_info(msg): +# global logger +# logger = get_logger(logger) +# logger.info(msg) + + +# def debug(msg): +# global logger +# logger = get_logger(logger) +# logger.debug(msg) + + +def timestamp(): + from datetime import datetime + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 25a86724..a3d657b1 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -3,20 +3,21 @@ # See __main__ section for usage -import sys import json +import math import os +import sys import time import numpy as np import importlib import runner_utils from runner_utils import ModelResult -import log_tools -import math +from log_tools import * + logger = None -print("MODEL RUNNER...") +print('MODEL RUNNER...') # Set PYTHONPATH: # Let MODEL_PYTHON_DIR override default Benchmarks model locations @@ -25,13 +26,14 @@ sys.path.append(python_dir) benchmarks_root = os.getenv("BENCHMARKS_ROOT") if benchmarks_root: - sys.path.append(benchmarks_root+"/common") + sys.path.append(benchmarks_root+'/common') + # Report PYTHONPATH for debugging print("sys.path:") for i in range(0, len(sys.path)-1): - print("%2i: %s" % (i, sys.path[i])) -print("") + print('%2i: %s' % (i, sys.path[i])) +print('') def import_pkg(framework, model_name): @@ -52,8 +54,8 @@ def import_pkg(framework, model_name): print("module_name: " + module_name) pkg = importlib.import_module(module_name) else: - raise ValueError("Framework must either be `keras' or `pytorch' " + - "got `{}'!".format(framework)) + raise ValueError('Framework must either be "keras" or "pytorch" ' + + 'got: "{}"'.format(framework)) return pkg @@ -125,9 +127,9 @@ def stop_perf(Ps): def run(hyper_parameter_map, obj_return): start = time.time() global logger - logger = log_tools.get_logger(logger, 'MODEL RUNNER') + logger = get_logger(logger, 'MODEL RUNNER') - log("START:") + logger.info('run(): START:') sys.stdout.flush() directory = hyper_parameter_map['instance_directory'] @@ -178,19 +180,18 @@ def run(hyper_parameter_map, obj_return): finish = time.time() duration = finish - start - # TODO: This should be on INFO - log(" DONE: run_id %s in %0.2f seconds." % - (hyper_parameter_map["run_id"], duration)) + logger.info('DONE: run_id %s in %0.2f seconds.' 
% + (hyper_parameter_map['run_id'], duration)) return (result, history_result) def get_obj_return(): - obj_return = os.getenv("OBJ_RETURN") - valid_obj_returns = [ "loss", "val_loss", "val_corr", "val_acc" ] + obj_return = os.getenv('OBJ_RETURN') + valid_obj_returns = [ 'loss', 'val_loss', 'val_corr', 'val_acc' ] if obj_return == None: - raise Exception("No OBJ_RETURN was in the environment!") + raise Exception('No OBJ_RETURN was in the environment!') if obj_return not in valid_obj_returns: - raise Exception("Invalid value for OBJ_RETURN: use: " + + raise Exception('Invalid value for OBJ_RETURN: use: ' + str(valid_obj_returns)) return obj_return @@ -205,17 +206,17 @@ def run_pre(hyper_parameter_map): module = load_pre_post(hyper_parameter_map, 'pre_module') result = ModelResult.SUCCESS if module != None: - logger.debug("PRE RUN START") + logger.debug('PRE RUN START') result = module.pre_run(hyper_parameter_map) - logger.debug("PRE RUN STOP") + logger.debug('PRE RUN STOP') return result def run_post(hyper_parameter_map, output_map): module = load_pre_post(hyper_parameter_map, 'post_module') if module != None: - logger.debug("POST RUN START") + logger.debug('POST RUN START') module.post_run(hyper_parameter_map, output_map) - logger.debug("POST RUN STOP") + logger.debug('POST RUN STOP') def run_model(hyper_parameter_map): # In-memory Python runs may not create sys.argv @@ -225,18 +226,18 @@ def run_model(hyper_parameter_map): instance_directory = hyper_parameter_map['instance_directory'] os.chdir(instance_directory) global logger - logger = log_tools.get_logger(logger, "MODEL RUNNER") + logger = get_logger(logger, 'MODEL RUNNER') obj_return = get_obj_return() directory = hyper_parameter_map['instance_directory'] os.chdir(directory) result = run_pre(hyper_parameter_map) if result == ModelResult.ERROR: - print("run_pre() returned ERROR!") + print('run_pre() returned ERROR!') exit(1) elif result == ModelResult.SKIP: - log("run_pre() returned SKIP ...") + logger.info('run_pre() returned SKIP ...') sys.stdout.flush() - return ("SKIP", "HISTORY_EMPTY") + return ('SKIP', 'HISTORY_EMPTY') else: assert(result == ModelResult.SUCCESS) # proceed... @@ -245,30 +246,29 @@ def run_model(hyper_parameter_map): runner_utils.write_output(json.dumps(history, cls=runner_utils.FromNPEncoder), directory, 'history.txt') - run_post(hyper_parameter_map, {}) - log("RUN STOP") + logger.info('RUN STOP') return (result, history) def setup_params(pkg, hyper_parameter_map, params_arg): params = pkg.initialize_parameters(**params_arg) - log("PARAM UPDATE START") + logger.debug('PARAM UPDATE START') for k,v in hyper_parameter_map.items(): - if k == "dense" or k == "dense_feature_layers": + if k == 'dense' or k == 'dense_feature_layers': if(type(v) != list): - v = v.split(" ") + v = v.split(' ') v = [int(i) for i in v] - if k == "cell_features": + if k == 'cell_features': cp_str = v v = list() v.append(cp_str) - log(str(k) + " = " + str(v)) + logger.debug(str(k) + ' = ' + str(v)) params[k] = v - log("PARAM UPDATE STOP") + logger.debug('PARAM UPDATE STOP') - log("WRITE_PARAMS START") + logger.debug('WRITE_PARAMS START') runner_utils.write_params(params, hyper_parameter_map) - log("WRITE_PARAMS STOP") + logger.debug('WRITE_PARAMS STOP') return params @@ -277,34 +277,47 @@ def get_results(history, obj_return): Return the history entry that the user requested. 
history: The Keras history object """ - known_params = [ "loss", "val_loss", "val_corr", "val_dice_coef" ] + + logger.debug('get_results(): "%s"' % obj_return) + + known_params = [ 'loss', 'val_loss', 'val_corr', 'val_dice_coef' ] + if obj_return not in known_params: - raise ValueError("Unsupported objective function: " + - "use obj_param to specify one of " + + raise ValueError('Unsupported objective function return ' + + 'key: "' + obj_return + '" - ' + + 'use obj_param to specify one of ' + str(known_params)) if obj_return in history.history: + # Good value values = history.history[obj_return] - # Default: the last value in the history - result = values[-1] + # Default: the last value in the history + result = values[-1] + else: + logger.warning('get_results(): objective function return key ' + + 'not found: ' + + 'key: "' + obj_return + '" - ' + + 'history: ' + str(history.history.keys())) + logger.warning('get_results(): returning NaN') + result = math.nan # Fix NaNs: if math.isnan(result): - if obj_return == "val_corr" or obj_return == "val_dice_coef": + if obj_return == 'val_corr' or obj_return == 'val_dice_coef': # Return the negative result result = -result else: # Just return a large number result = 999999999 - print("result: " + obj_return + ": " + str(result)) + print('result: ' + obj_return + ': ' + str(result)) history_result = history.history.copy() return result, history_result # Usage: see how sys.argv is unpacked below: if __name__ == '__main__': - logger = log_tools.get_logger(logger, "MODEL_RUNNER") - log("RUN START") + logger = get_logger(logger, 'MODEL_RUNNER') + logger.info('main: RUN START') import sys ( _, # The Python program name (unused) @@ -318,10 +331,10 @@ def get_results(history, obj_return): instance_directory, framework, out_dir_key='save') - hyper_parameter_map['model_name'] = os.getenv("MODEL_NAME") + hyper_parameter_map['model_name'] = os.getenv('MODEL_NAME') if hyper_parameter_map['model_name'] == None: - raise Exception("No MODEL_NAME was in the environment!") - hyper_parameter_map['experiment_id'] = os.getenv("EXPID") + raise Exception('No MODEL_NAME was in the environment!') + hyper_parameter_map['experiment_id'] = os.getenv('EXPID') hyper_parameter_map['run_id'] = runid hyper_parameter_map['timeout'] = float(benchmark_timeout) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index 18758c5c..82922e53 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -42,13 +42,13 @@ EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py LOCAL=0 CRAY=1 -# Resident task worker count and rank list -# If this is already set, we respect the user settings -# If this is unset, we set it to 1 -# and run the algorithm on the 2nd highest rank -# This value is only read in HPO workflows -if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] -then - export TURBINE_RESIDENT_WORK_WORKERS=1 - export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) -fi +# # Resident task worker count and rank list +# # If this is already set, we respect the user settings +# # If this is unset, we set it to 1 +# # and run the algorithm on the 2nd highest rank +# # This value is only read in HPO workflows +# if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +# then +# export TURBINE_RESIDENT_WORK_WORKERS=1 +# export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +# fi diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index 0fcc49fe..fce1cf9a 100644 --- a/workflows/common/swift/obj_app.swift +++ 
b/workflows/common/swift/obj_app.swift @@ -66,7 +66,7 @@ app (void o) run_model (string model_sh, string params, file line = input(result_file); obj_result = trim(read(line)); } else { - printf("File not found: %s", result_file, " - benchmark might have stopped without completing/returning history variable."); + printf("File not found: %s", result_file); // return with a large value obj_result = "1e7"; } diff --git a/workflows/cp-leaveout/test/cfg-sys-1.sh b/workflows/cp-leaveout/test/cfg-sys-1.sh index 7df9707e..21014013 100644 --- a/workflows/cp-leaveout/test/cfg-sys-1.sh +++ b/workflows/cp-leaveout/test/cfg-sys-1.sh @@ -18,10 +18,7 @@ export PPN=${PPN:-1} # export QUEUE=${QUEUE:-debug-flat-quad} # export QUEUE=R.candle -# export QUEUE=debug - -export WALLTIME=${WALLTIME:-00:05:00} -# export WALLTIME=00:05:00 +export WALLTIME=${WALLTIME:-01:00:00} # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} diff --git a/workflows/cp-leaveout/test/cfg-sys-512.sh b/workflows/cp-leaveout/test/cfg-sys-512.sh index d99071e3..bafc77e0 100644 --- a/workflows/cp-leaveout/test/cfg-sys-512.sh +++ b/workflows/cp-leaveout/test/cfg-sys-512.sh @@ -6,7 +6,13 @@ # and 1 process is reserved for the DB client. # The default of 4 gives you 2 workers, # i.e., 2 concurrent Keras runs. -export PROCS=${PROCS:-2} +# Bin Min Nodes Max Nodes Max Walltime (Hours) Aging Boost (Days) +# 1 2,765 4,608 24.0 15 +# 2 922 2,764 24.0 10 +# 3 92 921 12.0 0 +# 4 46 91 6.0 0 +# 5 1 45 2.0 +export PROCS=${PROCS:-6} # MPI processes per node # Cori has 32 cores per node, 128GB per node @@ -16,7 +22,8 @@ export PPN=${PPN:-1} # export QUEUE=${QUEUE:-debug-flat-quad} # export QUEUE=R.candle -export WALLTIME=${WALLTIME:-2:00:00} +# Summit: Limited to 2h if small job +export WALLTIME=${WALLTIME:-02:00:00} # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} diff --git a/workflows/cp-leaveout/test/test-1.sh b/workflows/cp-leaveout/test/test-1.sh index 2128a246..fe92d7d5 100755 --- a/workflows/cp-leaveout/test/test-1.sh +++ b/workflows/cp-leaveout/test/test-1.sh @@ -87,7 +87,8 @@ OUTPUT=turbine-output/output.txt WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) # Wait for job -queue_wait +# queue_wait +exit SCRIPT=$( basename $0 .sh ) check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID diff --git a/workflows/cp-leaveout/test/test-512.sh b/workflows/cp-leaveout/test/test-512.sh index fda47f14..1b9ef3b4 100755 --- a/workflows/cp-leaveout/test/test-512.sh +++ b/workflows/cp-leaveout/test/test-512.sh @@ -53,6 +53,8 @@ DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather # BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno # BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno BENCHMARK_DATA=$CANDLE_DATA +# PROJ_SHARED=/gpfs/alpine/med106/proj-shared/wozniak +# BENCHMARK_DATA=$PROJ_SHARED/proj/Benchmarks/Pilot1/Uno # What to return from the objective function (Keras model) # val_loss (default), loss, and val_corr are supported diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 68fca8d9..e4d684ab 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -103,13 +103,17 @@ fi # TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" TURBINE_STDOUT= +echo OMP_NUM_THREADS ${OMP_NUM_THREADS:-UNSET} +export OMP_NUM_THREADS=1 + +log_path LD_LIBRARY_PATH + swift-t -n $PROCS \ -o $TURBINE_OUTPUT/workflow.tic \ ${MACHINE:-} \ -p -I $EQR -r $EQR \ -I $WORKFLOWS_ROOT/common/swift \ -i obj_$SWIFT_IMPL \ - -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ -e 
BENCHMARKS_ROOT \ -e EMEWS_PROJECT_ROOT \ -e MODEL_SH \ @@ -121,7 +125,6 @@ swift-t -n $PROCS \ -e TURBINE_MPI_THREAD=${TURBINE_MPI_THREAD:-1} \ $( python_envs ) \ -e TURBINE_STDOUT=$TURBINE_STDOUT \ - -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ -e PYTHONUNBUFFERED=1 \ $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} diff --git a/workflows/upf/test/upf-1.txt b/workflows/upf/test/upf-1.txt index 36c4667d..6059a9e0 100644 --- a/workflows/upf/test/upf-1.txt +++ b/workflows/upf/test/upf-1.txt @@ -1 +1 @@ -{"id": "test0", "epochs": 3} +{"id": "test0", "epochs": 1} From 88b0e1231f8ba4e351dcb28d07e8a7d62c75af03 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 15 Oct 2021 15:10:42 -0500 Subject: [PATCH 200/601] Provide a default for TURBINE_LAUNCH_OPTIONS --- workflows/common/sh/sched-summit.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/sched-summit.sh b/workflows/common/sh/sched-summit.sh index 9001713c..e7359509 100644 --- a/workflows/common/sh/sched-summit.sh +++ b/workflows/common/sh/sched-summit.sh @@ -9,4 +9,5 @@ export PROJECT=${PROJECT:-MED106} # export TURBINE_OUTPUT_SOFTLINK=/dev/null -export TURBINE_LAUNCH_OPTIONS="-a1 -g6 -c7" +JSRUN_DEFAULT="-a1 -g6 -c7" +export TURBINE_LAUNCH_OPTIONS=${TURBINE_LAUNCH_OPTIONS:-${JSRUN_DEFAULT}} From 2edb25b7da1c840d7ccbfae0079daab27e9bdb58 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 15 Oct 2021 15:11:10 -0500 Subject: [PATCH 201/601] Fix typos --- workflows/cp-leaveout/py/plangen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index 260634ec..d99fbb45 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -736,7 +736,7 @@ def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): return rowid -def start _subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=None): +def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=None): """Schedule the execution of a subplan. 
This function writes a RunhistRow record to the runhist table indicating that @@ -760,7 +760,7 @@ def start _subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=N print("plangen: start_subplan: subplan_id=%s" % subplan_id) sys.stdout.flush() conn = db_connect(db_path) - csr = conn.cu rsor() + csr = conn.cursor() skip = False print("plangen: start_subplan: run_type: '%s'" % str(run_type)) From 2069dcbb3d2df9781de833719ac0767a558316eb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 15 Oct 2021 15:12:22 -0500 Subject: [PATCH 202/601] Add jsrun setting for PPN=4 --- workflows/cp-leaveout/test/cfg-sys-512.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/test/cfg-sys-512.sh b/workflows/cp-leaveout/test/cfg-sys-512.sh index bafc77e0..4b40df60 100644 --- a/workflows/cp-leaveout/test/cfg-sys-512.sh +++ b/workflows/cp-leaveout/test/cfg-sys-512.sh @@ -28,7 +28,13 @@ export WALLTIME=${WALLTIME:-02:00:00} # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" -export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +if (( PPN == 1 )) +then + export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +else + # For PPN=4 debugging: + export TURBINE_LAUNCH_OPTIONS="-g1 -c7 -a1" +fi export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" #export PROJECT=Candle_ECP From 838a5a61f7ff65ae47b8b79f447007c9be4496b8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 18 Oct 2021 11:28:21 -0500 Subject: [PATCH 203/601] Add shrink-output scripts --- .../cp-leaveout/scripts/shrink-output.py | 67 +++++++++++++++++++ .../cp-leaveout/scripts/shrink-output.sh | 27 ++++++++ 2 files changed, 94 insertions(+) create mode 100644 workflows/cp-leaveout/scripts/shrink-output.py create mode 100755 workflows/cp-leaveout/scripts/shrink-output.sh diff --git a/workflows/cp-leaveout/scripts/shrink-output.py b/workflows/cp-leaveout/scripts/shrink-output.py new file mode 100644 index 00000000..8ae792ae --- /dev/null +++ b/workflows/cp-leaveout/scripts/shrink-output.py @@ -0,0 +1,67 @@ + +# SHRINK OUTPUT PY +# Receives list of filenames on stdin +# Converts filenames from out-*.txt to summary-*.txt +# Removes non-printing characters (backspace) +# Reduces the number of training lines in output +# Removes redundant batch size information +# Fixes newline before "Current time" report + +import re, sys +from collections import deque + + +# Only 1/shrink_factor training lines are copied +shrink_factor = 100 +# Number of additional consecutive lines at beginning and end of +# training that are retained +hold_space = 5 + + +def shrink(fp_in, fp_out): + # Queue to hold extra lines that may be printed at end of run + Q = deque() + index = 0 + starts = 0 # Initial hold_space ETAs are immediately printed + for line in fp_in: + if len(line) == 1: continue # Blank line + line = line.replace("\b", "") + if "batch:" in line or "Current" in line: + line = re.sub("- batch: .* 32.0000 -", "", line) + line = line.replace("Current", "\nCurrent") + if starts < hold_space: + fp_out.write(line) + starts += 1 + continue + Q.append(line) + if len(Q) > hold_space: + index += 1 + line = Q.popleft() + if index % shrink_factor == 0: + fp_out.write(line) + else: + starts = 0 + while len(Q) > 0: + fp_out.write(Q.popleft()) + fp_out.write(line) + # Done: flush Q: + while len(Q) > 0: + fp_out.write(Q.popleft()) + + +while True: + + line = sys.stdin.readline() + + if len(line) == 0: break # EOF + if len(line) == 1: 
continue # Blank line + + file_in = line.strip() + print("reading: " + file_in) + file_out = re.sub("/out-", "/summary-", file_in) + + with open(file_in, "r") as fp_in: + with open(file_out, "w") as fp_out: + shrink(fp_in, fp_out) + +print("shrink-output.py: OK") diff --git a/workflows/cp-leaveout/scripts/shrink-output.sh b/workflows/cp-leaveout/scripts/shrink-output.sh new file mode 100755 index 00000000..2e92adf5 --- /dev/null +++ b/workflows/cp-leaveout/scripts/shrink-output.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# SHRINK OUTPUT SH +# Accepts a whole workflow output directory +# Clean up and shrink TensorFlow output +# See shrink-output.py for details + +THIS=$( readlink --canonicalize $( dirname $0 ) ) +CPLO=$( readlink --canonicalize $THIS/.. ) +SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +OUTS=() + +find $DIR/out -name "out-*" | python $THIS/shrink-output.py From 710863be3dfd7bc58281c56999515bf980f71e85 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 18 Oct 2021 11:38:01 -0500 Subject: [PATCH 204/601] Write partial training file to temp file, then rename --- workflows/cp-leaveout/py/data_setup.py | 77 +++++++++++++++----------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 30e757f7..02c71344 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -1,8 +1,7 @@ # DATA SETUP PY -import json -import os, sys +import os, sys, time from pathlib import Path import traceback @@ -20,40 +19,46 @@ def __init__(self, dataframe_from, node, plan, output): self.drug_feature_selection = None self.output = output +def setup_nvm(params): + # username = os.environ['USER'] # No longer works on Summit 2021-10-13 + username = params["user"] + nvme_enabled = Path('/mnt/bb/{}'.format(username)).exists() + # nvme_enabled = True + print("NVMe: %r" % nvme_enabled) + if not nvme_enabled: + return + # copy original datafrom to NVMe disk space + try: + src = Path(params["dataframe_from"]) + dest = Path("/mnt/bb/{}/{}".format(username, src.name)) + if not dest.exists(): + start = time.time() + count = dest.write_bytes(src.read_bytes()) + stop = time.time() + duration = stop - start + rate = count / duration / (1024*1024) + print("File copy completed. Original dataframe " + + "copied to NVM in %0.1f seconds (%0.1f MB/s)." % + (duration, rate)) + else: + print("File copy skipped. " + + "Original dataframe already exists in NVM.") + except Exception as e: + print("Error occurred in copying original dataframe\n" + + str(e)) + traceback.print_exc() + return ModelResult.ERROR + params["dataframe_from"] = dest.resolve() + # Do not do this: it changes the location of the training data + # params["use_exported_data"] = "/mnt/bb/{}/{}".format(username, params["use_exported_data"]) + return params + + def pre_run(params): import sys, time print("data_setup.pre_run(): node: '%s' ..." 
% params["node"]) sys.stdout.flush() - # check NVMe disk is available - # username = os.environ['USER'] - # nvme_enabled = Path('/mnt/bb/{}'.format(username)).exists() - nvme_enabled = False - if nvme_enabled: - # copy original datafrom to NVMe disk space - try: - src = Path(params["dataframe_from"]) - dest = Path("/mnt/bb/{}/{}".format(username, src.name)) - if not dest.exists(): - start = time.time() - count = dest.write_bytes(src.read_bytes()) - stop = time.time() - duration = stop - start - rate = count / duration / (1024*1024) - print("File copy completed. Original dataframe " + - "copied to NVM in %0.1f seconds (%0.1f MB/s)." % - (duration, rate)) - else: - print("File copy skipped. " + - "Original dataframe already exists in NVM.") - except Exception as e: - print("Error occurred in copying original dataframe\n" + - str(e)) - traceback.print_exc() - return ModelResult.ERROR - params["dataframe_from"] = dest.resolve() - params["use_exported_data"] = "/mnt/bb/{}/{}".format(username, params["use_exported_data"]) - # softlink to cache & config file # build node specific training/validation dataset @@ -64,7 +69,7 @@ def pre_run(params): data = params["benchmark_data"] try: - for filename in [ "uno_auc_model.txt" ]: # "cache", + for filename in [ "uno_auc_model.txt" ]: # "cache", if not os.path.islink(filename): src = f"{data}/{filename}" print("data_setup: src: (%s)" % src) @@ -83,7 +88,9 @@ def pre_run(params): args.output) sys.stdout.flush() if not os.path.exists(args.output): - sys.stdout.flush() + params = setup_nvm(params) + out_orig = args.output + args.output = out_orig + ".part" start = time.time() topN_to_uno.build_dataframe(args) stop = time.time() @@ -91,6 +98,10 @@ def pre_run(params): print("data_setup: build_dataframe() OK : " + "%0.1f seconds." 
% duration) sys.stdout.flush() + os.rename(args.output, out_orig) + print("data_setup: rename() OK") + sys.stdout.flush() + args.output = out_orig else: print("data_setup: dataframe exists: %s" % os.path.realpath(args.output)) From 11c06b26b55b84ff9c33dbd240ae5b4e464bf218 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 20 Oct 2021 15:15:21 -0500 Subject: [PATCH 205/601] Add user to params instead of relying on environment --- workflows/cp-leaveout/swift/workflow.sh | 2 ++ workflows/cp-leaveout/swift/workflow.swift | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 5d582a01..ff0aeb45 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -100,6 +100,7 @@ fi CMD_LINE_ARGS=( --benchmark_timeout=$BENCHMARK_TIMEOUT --site=$SITE --db_file=$DB_FILE + --user=$USER $GPU_ARG $WORKFLOW_ARGS ) @@ -229,6 +230,7 @@ swift-t -O 0 -n $PROCS \ # +# -e USER # Needed on Summit to find NVME # -j /usr/bin/java # Give this to Swift/T if needed for Java # -e PYTHONUNBUFFERED=1 # May be needed if error output is being lost # -e PYTHONVERBOSE=1 # Debugs module load confusion diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 2ad503f9..70ce048a 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -76,6 +76,7 @@ int early_stopping = string2int(P_s); string plan_json = argv("plan_json"); string dataframe_csv = argv("dataframe_csv"); string db_file = argv("db_file"); +string user = argv("user", "NONE"); // for Summit NVME string benchmark_data = argv("benchmark_data"); int epoch_mode = string2int(argv("epoch_mode", "1")); int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); @@ -160,6 +161,7 @@ run_stage(int N, int S, string this, int stage, void block, "plan": "%s", "config_file": "uno_auc_model.txt", "cache": "cache/top6_auc", +"user": "%s", "dataframe_from": "%s", "save_weights": "save/model.h5", "gpus": "0", @@ -169,7 +171,7 @@ run_stage(int N, int S, string this, int stage, void block, "use_exported_data": "topN.uno.h5", "benchmark_data": "%s" ---- % -(plan_json, dataframe_csv, epochs, early_stopping, benchmark_data); +(plan_json, user, dataframe_csv, epochs, early_stopping, benchmark_data); if (stage > 1) { n = strlen(this); From 27caa94e2e0f8fb3a2becec384b5ab88808e961c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 15 Nov 2021 11:02:10 -0600 Subject: [PATCH 206/601] Merge --- scratch/py-eval/Makefile | 28 +++++++-- scratch/py-eval/py-eval.c | 28 +++++++++ workflows/common/python/model_runner.py | 1 + workflows/common/sh/model.sh | 66 ++++++++++++---------- workflows/cp-leaveout/scripts/data-size.py | 20 +++++++ 5 files changed, 108 insertions(+), 35 deletions(-) diff --git a/scratch/py-eval/Makefile b/scratch/py-eval/Makefile index a0830b06..4f4add31 100644 --- a/scratch/py-eval/Makefile +++ b/scratch/py-eval/Makefile @@ -20,23 +20,39 @@ MPI_ENABLED = 0 # Cori # Module tensorflow/intel-head -PYTHON_HOME = /usr/common/software/tensorflow/intel-tensorflow/head -PYTHON_VERSION_MAJOR = 2 -PYTHON_VERSION_MINOR = 7 +# PYTHON_HOME = /usr/common/software/tensorflow/intel-tensorflow/head +# PYTHON_VERSION_MAJOR = 2 +# PYTHON_VERSION_MINOR = 7 +# PYTHON_VERSION_SUFFIX = +# CC = gcc # module load gcc + +CC = gcc + +# Dunedin 3.8 Spack +PYTHON_HOME = /usr +PYTHON_VERSION_MAJOR = 3 +PYTHON_VERSION_MINOR = 8 PYTHON_VERSION_SUFFIX = 
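# (With the block above, PYTHON_VERSION below expands to 3.8 and INCLUDES
#  resolves to -I /usr/include/python3.8.)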
-CC = gcc # module load gcc -# Dunedin +# Dunedin 3.7.3 TF +# PYTHON_HOME = ${HOME}/Public/sfw/Python-3.7.3-tf +# PYTHON_VERSION_MAJOR = 3 +# PYTHON_VERSION_MINOR = 7 +# PYTHON_VERSION_SUFFIX = m + +# Dunedin 2.7 # PYTHON_HOME = /usr # PYTHON_VERSION_MAJOR = 2 # PYTHON_VERSION_MINOR = 7 # PYTHON_VERSION_SUFFIX = + # End Python settings PYTHON_VERSION = $(PYTHON_VERSION_MAJOR).$(PYTHON_VERSION_MINOR)$(PYTHON_VERSION_SUFFIX) INCLUDES = -I $(PYTHON_HOME)/include/python$(PYTHON_VERSION) -LIBS = -L $(PYTHON_HOME)/lib -lpython$(PYTHON_VERSION) +# LIBS = -L $(PYTHON_HOME)/lib +LIBS += -lpython$(PYTHON_VERSION) -ldl RPATHS = -Wl,-rpath -Wl,$(PYTHON_HOME)/lib DEFINES = -DPYTHON_VERSION_MAJOR=$(PYTHON_VERSION_MAJOR) \ diff --git a/scratch/py-eval/py-eval.c b/scratch/py-eval/py-eval.c index 5289ec1b..edeec357 100644 --- a/scratch/py-eval/py-eval.c +++ b/scratch/py-eval/py-eval.c @@ -5,6 +5,8 @@ #include +#include + #include #include @@ -57,6 +59,18 @@ python_init() { if (initialized) return true; verbose("python: initializing..."); + + + char str_python_lib[32]; +#ifdef _WIN32 + sprintf(str_python_lib, "lib%s.dll", PYTHON_NAME); +#elif defined __unix__ + sprintf(str_python_lib, "lib%s.so", "python3.8"); +#elif defined __APPLE__ + sprintf(str_python_lib, "lib%s.dylib", PYTHON_NAME); +#endif + dlopen(str_python_lib, RTLD_NOW | RTLD_GLOBAL); + Py_InitializeEx(1); main_module = PyImport_AddModule("__main__"); if (main_module == NULL) return handle_python_exception(); @@ -65,6 +79,20 @@ python_init() local_dict = PyDict_New(); if (local_dict == NULL) return handle_python_exception(); initialized = true; + + // long val = 43; + char* val = "MY VALUE!"; + // if (PyDict_SetItemString(main_dict, "myvar", PyLong_FromLong(val))) { + if (PyDict_SetItemString(main_dict, "myvar", val)) { + assert(false); + } + + char* result; + PyObject* po = PyDict_GetItemString(main_dict, "myvar"); + int pc = PyArg_Parse(po, "s", &result); + if (pc != 1) return handle_python_non_string(po); + printf("result: %s\n", result); + return true; } diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index a3d657b1..c0e52d08 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -314,6 +314,7 @@ def get_results(history, obj_return): history_result = history.history.copy() return result, history_result + # Usage: see how sys.argv is unpacked below: if __name__ == '__main__': logger = get_logger(logger, 'MODEL_RUNNER') diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index cd82bba6..0b66455d 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -5,6 +5,12 @@ set -eu # Shell wrapper around Keras model +# Note: Under Swift/T, the initial output from here will go +# to the main Swift/T stdout and be mixed with output from +# other models. +# Thus, we redirect to a separate model.log file for each model run +# and normally we do not produce output until after the redirection. + usage() { echo "Usage: model.sh FRAMEWORK PARAMS RUNID" @@ -30,13 +36,6 @@ RUNID=$3 # Set instance_directory to that and cd into it. INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID -SH_TIMEOUT=${SH_TIMEOUT:-} -TIMEOUT_CMD="" -if [[ -n "$SH_TIMEOUT" ]] && [[ $SH_TIMEOUT != "-1" ]] -then - TIMEOUT_CMD="timeout $SH_TIMEOUT" -fi - # All stdout/stderr after this point goes into model.log ! 
mkdir -p $INSTANCE_DIRECTORY LOG_FILE=$INSTANCE_DIRECTORY/model.log @@ -44,9 +43,20 @@ exec >> $LOG_FILE exec 2>&1 cd $INSTANCE_DIRECTORY -echo "MODEL.SH START:" -echo "MODEL_NAME: $MODEL_NAME" -echo "RUNID: $RUNID" +TIMEOUT_CMD="" +if [[ ${SH_TIMEOUT:-} != "" ]] && [[ $SH_TIMEOUT != "-1" ]] +then + TIMEOUT_CMD="timeout $SH_TIMEOUT" +fi + +log() +{ + echo $( date "+%Y-%m-%d %H:%M:%S" ) "MODEL.SH:" $* +} + +log "START" +log "MODEL_NAME: $MODEL_NAME" +log "RUNID: $RUNID" # Source langs-app-{SITE} from workflow/common/sh/ (cf. utils.sh) if [[ ${WORKFLOWS_ROOT:-} == "" ]] @@ -57,23 +67,22 @@ source $WORKFLOWS_ROOT/common/sh/utils.sh source_site langs-app $SITE echo -echo PARAMS: +log "PARAMS:" echo $PARAMS | print_json echo -echo "MODEL.SH: USING PYTHON:" -which python +log "USING PYTHON:" $( which python ) echo -arg_array=( "$WORKFLOWS_ROOT/common/python/model_runner.py" - "$PARAMS" - "$INSTANCE_DIRECTORY" - "$FRAMEWORK" - "$RUNID" - "$BENCHMARK_TIMEOUT") -MODEL_CMD="python3 -u ${arg_array[@]}" -# echo MODEL_CMD: $MODEL_CMD -if $TIMEOUT_CMD python3 -u "${arg_array[@]}" +PY_CMD=( "$WORKFLOWS_ROOT/common/python/model_runner.py" + "$PARAMS" + "$INSTANCE_DIRECTORY" + "$FRAMEWORK" + "$RUNID" + "$BENCHMARK_TIMEOUT" ) +MODEL_CMD="python3 -u ${PY_CMD[@]}" +log "MODEL_CMD: ${MODEL_CMD[@]}" +if $TIMEOUT_CMD ${MODEL_CMD[@]} then : # Assume success so we can keep a failed exit code else @@ -81,26 +90,25 @@ else # (i.e the line in the 'if' condition) CODE=$? echo # spacer - if [ $CODE == 124 ] + if (( $CODE == 124 )) then - echo "MODEL.SH: Timeout error in $MODEL_CMD" + log "TIMEOUT ERROR! (timeout=$SH_TIMEOUT)" # This will trigger a NaN (the result file does not exist) exit 0 else - echo "MODEL.SH: Error (CODE=$CODE) in $MODEL_CMD" - echo "MODEL.SH: TIMESTAMP:" $( date "+%Y-%m-%d %H:%M:%S" ) + log "MODEL ERROR! (CODE=$CODE)" if (( ${IGNORE_ERRORS:-0} )) then - echo "MODEL.SH: IGNORING ERROR." + log "IGNORING ERROR." 
# This will trigger a NaN (the result file does not exist) exit 0 fi - echo "MODEL.SH: ABORTING WORKFLOW (exit 1)" + log "ABORTING WORKFLOW (exit 1)" exit 1 # Unknown error in Python: abort the workflow fi fi -echo "MODEL.SH END: SUCCESS" +log "END: SUCCESS" exit 0 # Success # Local Variables: diff --git a/workflows/cp-leaveout/scripts/data-size.py b/workflows/cp-leaveout/scripts/data-size.py index 553c4016..066ba003 100644 --- a/workflows/cp-leaveout/scripts/data-size.py +++ b/workflows/cp-leaveout/scripts/data-size.py @@ -40,7 +40,27 @@ elif ext == ".feather": print("read feather " + str(args.input)) df = pd.read_feather(args.input).fillna(0) + print(df.shape) print(df.dtypes) + print(str(df["CELL"])) + C = {} + for s in df["CELL"]: + C[s] = "" + D = {} + for s in df["DRUG"]: + D[s] = "" + print("df.columns: " + str(df.columns)) + print("df.index: " + str(df.index)) + print("len(df): " + str(len(df))) + print("len(C): " + str(len(C))) + print("len(D): " + str(len(D))) + print("len(AUC): " + str(len(df["AUC"]))) + + # print(str(df["CELL"][0:9])) + # print(str(type(df["CELL"][0]))) + +print("data-size: OK.") + # total size: (529940, 6215) # store = pd.HDFStore(args.input, "r", complevel=9, complib="blosc:snappy") From 962f22b63c4bc6f6c9c4b1c0cb5125fcddbc44cb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 15 Nov 2021 11:05:29 -0600 Subject: [PATCH 207/601] Minor fixes for local execution --- workflows/upf/swift/workflow.sh | 2 +- workflows/upf/test/cfg-sys-1.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index e4d684ab..b13e6b1d 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -85,7 +85,7 @@ mkdir -pv $TURBINE_OUTPUT/run which mpicc which swift-t -module list +# module list cp -v $UPF $TURBINE_OUTPUT diff --git a/workflows/upf/test/cfg-sys-1.sh b/workflows/upf/test/cfg-sys-1.sh index 8a436a88..821d44a6 100644 --- a/workflows/upf/test/cfg-sys-1.sh +++ b/workflows/upf/test/cfg-sys-1.sh @@ -37,7 +37,7 @@ export PPN=${PPN:-2} # export TURBINE_LAUNCH_OPTIONS="-a1 -g6 -c7" # export WALLTIME=${WALLTIME:-0:30} -echo WALLTIME: $WALLTIME +# echo WALLTIME: $WALLTIME # export MAIL_ENABLED=1 # export MAIL_ADDRESS=wozniak@mcs.anl.gov From 9ec4045bb500c34ed98660b6506878af24f9b109 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 15 Nov 2021 11:05:39 -0600 Subject: [PATCH 208/601] Add more lines --- workflows/upf/test/upf-1.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/upf/test/upf-1.txt b/workflows/upf/test/upf-1.txt index 6059a9e0..fb4a4730 100644 --- a/workflows/upf/test/upf-1.txt +++ b/workflows/upf/test/upf-1.txt @@ -1 +1,3 @@ -{"id": "test0", "epochs": 1} +{"id": "test1", "epochs": 1} +{"id": "test2", "epochs": 2} +{"id": "test3", "epochs": 3} From 028f2a4533885ba9f5dbc1453c8f465051b03196 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 15 Nov 2021 12:52:11 -0600 Subject: [PATCH 209/601] Better exception reporter --- workflows/cp-leaveout/py/data_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 02c71344..a4929af4 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -118,7 +118,7 @@ def pre_run(params): except Exception as e: print("data_setup: error in build_dataframe!\n" + str(e)) sys.stdout.flush() - traceback.print_exc() + 
traceback.print_exc(file=sys.stdout) sys.stdout.flush() return ModelResult.ERROR print("data_setup.pre_run() done.") From 670fe79416212c00ea8c63983457c35ff2393d00 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 15 Nov 2021 13:11:34 -0600 Subject: [PATCH 210/601] Improve shrink-output --- .../cp-leaveout/scripts/shrink-output.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/shrink-output.py b/workflows/cp-leaveout/scripts/shrink-output.py index 8ae792ae..a255c3d2 100644 --- a/workflows/cp-leaveout/scripts/shrink-output.py +++ b/workflows/cp-leaveout/scripts/shrink-output.py @@ -7,7 +7,7 @@ # Removes redundant batch size information # Fixes newline before "Current time" report -import re, sys +import os, re, sys from collections import deque @@ -49,6 +49,9 @@ def shrink(fp_in, fp_out): fp_out.write(Q.popleft()) +files_total = 0 +files_shrunk = 0 + while True: line = sys.stdin.readline() @@ -56,12 +59,24 @@ def shrink(fp_in, fp_out): if len(line) == 0: break # EOF if len(line) == 1: continue # Blank line + files_total += 1 + file_in = line.strip() - print("reading: " + file_in) file_out = re.sub("/out-", "/summary-", file_in) + # Do not process files that have not changed since the last run + # of this script: + if os.path.exists(file_out) and \ + os.path.getmtime(file_in) < os.path.getmtime(file_out): + print("skipping: " + file_in) + continue + + print("shrinking: " + file_in) with open(file_in, "r") as fp_in: with open(file_out, "w") as fp_out: shrink(fp_in, fp_out) + files_shrunk += 1 +print("shrink-output.py: shrank %i / %i files." % + (files_shrunk, files_total)) print("shrink-output.py: OK") From c17c63730af0b7c01eff3a34dddabea781a9792c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 15 Nov 2021 13:46:52 -0600 Subject: [PATCH 211/601] Latest for Spock --- workflows/common/sh/env-spock.sh | 5 +++-- workflows/common/sh/utils.sh | 4 ++-- workflows/cp-leaveout/swift/workflow.sh | 3 +-- workflows/cp-leaveout/test/cfg-sys-512.sh | 14 +++++++------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/workflows/common/sh/env-spock.sh b/workflows/common/sh/env-spock.sh index 7e39d2c4..d9d247c8 100644 --- a/workflows/common/sh/env-spock.sh +++ b/workflows/common/sh/env-spock.sh @@ -6,10 +6,11 @@ SWIFT_IMPL=py # CANDLE software installation root: MED106=/gpfs/alpine/world-shared/med106 -ROOT=$MED106/sw/spock/gcc-10.3.0 +# ROOT=$MED106/sw/spock/gcc-10.3.0 +ROOT=$MED106/sw/spock/gcc-11.2.0 # Add Swift/T to PATH -SWIFT=$ROOT/swift-t/2021-10-05 +SWIFT=$ROOT/swift-t/2021-11-14 PATH=$SWIFT/stc/bin:$PATH PATH=$SWIFT/turbine/bin:$PATH diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 1e22ac99..68cc9e4b 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -90,9 +90,9 @@ python_envs() RESULT=() if [[ ${PYTHONPATH:-} != "" ]] then - # We do not currently need this except on MCS: + # We do not currently need this except on MCS and Spock: # Swift/T should grab PYTHONPATH automatically - if [[ ${SITE} == "mcs" ]] + if [[ ${SITE} == "mcs" ]] || [[ ${SITE} == "spock" ]] then # MCS discards PYTHONPATH in subshells RESULT+=( -e PYTHONPATH=$PYTHONPATH ) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index ff0aeb45..245c160f 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -40,7 +40,6 @@ then exit 1 fi -# set -x if ! 
{ get_site $1 # Sets SITE get_expid $2 # Sets EXPID @@ -197,7 +196,7 @@ fi export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" mkdir -pv $TURBINE_OUTPUT/out -# set -x +set -x swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p \ diff --git a/workflows/cp-leaveout/test/cfg-sys-512.sh b/workflows/cp-leaveout/test/cfg-sys-512.sh index 4b40df60..b03cdf0f 100644 --- a/workflows/cp-leaveout/test/cfg-sys-512.sh +++ b/workflows/cp-leaveout/test/cfg-sys-512.sh @@ -18,16 +18,12 @@ export PROCS=${PROCS:-6} # Cori has 32 cores per node, 128GB per node export PPN=${PPN:-1} -# For Theta: -# export QUEUE=${QUEUE:-debug-flat-quad} -# export QUEUE=R.candle - -# Summit: Limited to 2h if small job -export WALLTIME=${WALLTIME:-02:00:00} +export WALLTIME=${WALLTIME:-12:00:00} # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" + if (( PPN == 1 )) then export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" @@ -35,7 +31,11 @@ else # For PPN=4 debugging: export TURBINE_LAUNCH_OPTIONS="-g1 -c7 -a1" fi -export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" + +if [[ $SITE == "summit" ]] +then + export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" +fi #export PROJECT=Candle_ECP From 4321836f2393e691628ef291c6acfecc0f2e43a0 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 15 Nov 2021 13:59:17 -0600 Subject: [PATCH 212/601] Move Summit launch stuff --- workflows/common/sh/sched-summit.sh | 11 ++++++++++- workflows/cp-leaveout/test/cfg-sys-512.sh | 13 ------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/workflows/common/sh/sched-summit.sh b/workflows/common/sh/sched-summit.sh index e7359509..064176f4 100644 --- a/workflows/common/sh/sched-summit.sh +++ b/workflows/common/sh/sched-summit.sh @@ -10,4 +10,13 @@ export PROJECT=${PROJECT:-MED106} # export TURBINE_OUTPUT_SOFTLINK=/dev/null JSRUN_DEFAULT="-a1 -g6 -c7" -export TURBINE_LAUNCH_OPTIONS=${TURBINE_LAUNCH_OPTIONS:-${JSRUN_DEFAULT}} + +if (( PPN == 1 )) +then + export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +else + # For PPN=4 debugging: + export TURBINE_LAUNCH_OPTIONS="-g1 -c7 -a1" +fi + +export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" diff --git a/workflows/cp-leaveout/test/cfg-sys-512.sh b/workflows/cp-leaveout/test/cfg-sys-512.sh index b03cdf0f..447383d1 100644 --- a/workflows/cp-leaveout/test/cfg-sys-512.sh +++ b/workflows/cp-leaveout/test/cfg-sys-512.sh @@ -24,19 +24,6 @@ export WALLTIME=${WALLTIME:-12:00:00} # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" -if (( PPN == 1 )) -then - export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" -else - # For PPN=4 debugging: - export TURBINE_LAUNCH_OPTIONS="-g1 -c7 -a1" -fi - -if [[ $SITE == "summit" ]] -then - export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" -fi - #export PROJECT=Candle_ECP # Benchmark run timeout: benchmark run will timeout From 5e7eaf52bb769d96d9882bd7c32c9a076080bd78 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 Nov 2021 10:50:04 -0600 Subject: [PATCH 213/601] Better output shrinker --- .../cp-leaveout/scripts/shrink-output.mk | 14 +++++ .../cp-leaveout/scripts/shrink-output.py | 59 ++++++++----------- .../cp-leaveout/scripts/shrink-output.sh | 10 +++- 3 files changed, 47 insertions(+), 36 deletions(-) create mode 100644 workflows/cp-leaveout/scripts/shrink-output.mk diff --git a/workflows/cp-leaveout/scripts/shrink-output.mk b/workflows/cp-leaveout/scripts/shrink-output.mk 
new file mode 100644 index 00000000..65607e07 --- /dev/null +++ b/workflows/cp-leaveout/scripts/shrink-output.mk @@ -0,0 +1,14 @@ + +.DELETE_ON_ERROR: + +OUTS = $(wildcard out-*.txt) + +SUMMARIES = $(subst out-,summary-,$(OUTS)) + +all: $(SUMMARIES) + +/tmp/${USER}/tr-%.txt: out-%.txt + @ tr "\r" "\n" < $(<) > $(@) + +summary-%.txt: /tmp/${USER}/tr-%.txt + @ python $(THIS)/shrink-output.py $(<) $(@) diff --git a/workflows/cp-leaveout/scripts/shrink-output.py b/workflows/cp-leaveout/scripts/shrink-output.py index a255c3d2..c3a2a1f0 100644 --- a/workflows/cp-leaveout/scripts/shrink-output.py +++ b/workflows/cp-leaveout/scripts/shrink-output.py @@ -1,7 +1,8 @@ # SHRINK OUTPUT PY # Receives list of filenames on stdin -# Converts filenames from out-*.txt to summary-*.txt +# Converts filenames from tr-*.txt to summary-*.txt +# The tr file should have used tr to change carriage return to newline # Removes non-printing characters (backspace) # Reduces the number of training lines in output # Removes redundant batch size information @@ -15,7 +16,7 @@ shrink_factor = 100 # Number of additional consecutive lines at beginning and end of # training that are retained -hold_space = 5 +hold_space = 3 def shrink(fp_in, fp_out): @@ -23,6 +24,7 @@ def shrink(fp_in, fp_out): Q = deque() index = 0 starts = 0 # Initial hold_space ETAs are immediately printed + line_previous = "" for line in fp_in: if len(line) == 1: continue # Blank line line = line.replace("\b", "") @@ -34,49 +36,40 @@ def shrink(fp_in, fp_out): starts += 1 continue Q.append(line) + index += 1 if len(Q) > hold_space: - index += 1 line = Q.popleft() - if index % shrink_factor == 0: - fp_out.write(line) + if index % shrink_factor == 0: + fp_out.write(line) else: starts = 0 while len(Q) > 0: fp_out.write(Q.popleft()) + if line == line_previous: + continue fp_out.write(line) + line_previous = line # Done: flush Q: while len(Q) > 0: fp_out.write(Q.popleft()) -files_total = 0 -files_shrunk = 0 - -while True: - - line = sys.stdin.readline() - - if len(line) == 0: break # EOF - if len(line) == 1: continue # Blank line - - files_total += 1 - - file_in = line.strip() - file_out = re.sub("/out-", "/summary-", file_in) +file_in = sys.argv[1] +file_out = sys.argv[2] - # Do not process files that have not changed since the last run - # of this script: - if os.path.exists(file_out) and \ - os.path.getmtime(file_in) < os.path.getmtime(file_out): - print("skipping: " + file_in) - continue +# Do not process files that have not changed since the last run +# of this script: +if os.path.exists(file_out) and \ + os.path.getmtime(file_in) < os.path.getmtime(file_out): + print("skipping: " + file_in) + exit() - print("shrinking: " + file_in) - with open(file_in, "r") as fp_in: - with open(file_out, "w") as fp_out: - shrink(fp_in, fp_out) - files_shrunk += 1 +print("shrinking: " + file_in) +with open(file_in, "r") as fp_in: + with open(file_out, "w") as fp_out: + shrink(fp_in, fp_out) +# files_shrunk += 1 -print("shrink-output.py: shrank %i / %i files." % - (files_shrunk, files_total)) -print("shrink-output.py: OK") +# print("shrink-output.py: shrank %i / %i files." 
% +# (files_shrunk, files_total)) +# print("shrink-output.py: OK") diff --git a/workflows/cp-leaveout/scripts/shrink-output.sh b/workflows/cp-leaveout/scripts/shrink-output.sh index 2e92adf5..ac395c57 100755 --- a/workflows/cp-leaveout/scripts/shrink-output.sh +++ b/workflows/cp-leaveout/scripts/shrink-output.sh @@ -8,14 +8,15 @@ THIS=$( readlink --canonicalize $( dirname $0 ) ) CPLO=$( readlink --canonicalize $THIS/.. ) SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) +export THIS source $SUPERVISOR/workflows/common/sh/utils.sh -export PYTHONPATH+=:$SUPERVISOR/workflows/common/python - SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ DIR - ${*} +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + if ! [[ -d $DIR ]] then echo "Does not exist: $DIR" @@ -24,4 +25,7 @@ fi OUTS=() -find $DIR/out -name "out-*" | python $THIS/shrink-output.py +mkdir -pv /tmp/$USER + +cd $D/out +nice -n 19 make -j 8 -f $THIS/shrink-output.mk From bbec06eed2bbfaddace28104f0f7e892f719f3e7 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 Nov 2021 13:48:48 -0600 Subject: [PATCH 214/601] plangen: Better RESTART detection and logging --- workflows/cp-leaveout/py/plangen.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index d99fbb45..43554b06 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -764,28 +764,28 @@ def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=No skip = False print("plangen: start_subplan: run_type: '%s'" % str(run_type)) - print("plangen: start_subplan: run_type type: %s" % str(type(run_type))) + # print("plangen: start_subplan: run_type type: %s" % str(type(run_type))) print("plangen: start_subplan: base: '%s'" % str(RunType.RESTART)) sys.stdout.flush() # skip previously completed work if RESTART - if run_type == RunType.RESTART: - log("plangen: start_subplan: checking restart: %i" % plan_id) + if 'RESTART' in str(run_type): + print("plangen: start_subplan: checking restart: %i" % plan_id) sys.stdout.flush() stmt = _select_row_from_runhist.format(plan_id, subplan_id) execute_sql_stmt(conn, stmt, cursor=csr) row = csr.fetchone() if row: - log("plangen: start_subplan: found row.") + print("plangen: start_subplan: found row.") runhist_rec = RunhistRow._make(row) - log("plangen: start_subplan: found '%s'" % runhist_rec.status) + print("plangen: start_subplan: found '%s'" % runhist_rec.status) if runhist_rec.status == RunStat.COMPLETE.name: skip = True - log("plangen: start_subplan: skip %r" % skip) + print("plangen: start_subplan: skip %r" % skip) else: print("plangen: start_subplan: not checking restart") - sys.stdout.flush() + sys.stdout.flush() # construct/reinit a new runhist record if not skip: From fd8f75cdfbcdd2df2968aaccd9113c63d3565b5f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 Nov 2021 13:49:06 -0600 Subject: [PATCH 215/601] Bug fixes --- workflows/cp-leaveout/scripts/shrink-output.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/cp-leaveout/scripts/shrink-output.sh b/workflows/cp-leaveout/scripts/shrink-output.sh index ac395c57..c34503cf 100755 --- a/workflows/cp-leaveout/scripts/shrink-output.sh +++ b/workflows/cp-leaveout/scripts/shrink-output.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -eu # SHRINK OUTPUT SH # Accepts a whole workflow output directory @@ -23,9 +24,8 @@ then exit 1 fi -OUTS=() - +# This is used inside the 
Makefile below: mkdir -pv /tmp/$USER -cd $D/out +cd $DIR/out nice -n 19 make -j 8 -f $THIS/shrink-output.mk From a86adaba2cc6a4406b1c11f682a2e3c3464d44cf Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 Nov 2021 13:49:35 -0600 Subject: [PATCH 216/601] Log runtype in workflow --- workflows/cp-leaveout/swift/workflow.swift | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 70ce048a..5e198f17 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -85,6 +85,7 @@ string exp_id = getenv("EXPID"); string turbine_output = getenv("TURBINE_OUTPUT"); // END WORKFLOW ARGUMENTS +printf("plangen: runtype:" + runtype); printf("benchmark_data: " + benchmark_data); // // For compatibility with obj(): From 5e1a1a77141f8d7171d06d560e22c2313e287f60 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 17:34:29 -0600 Subject: [PATCH 217/601] flake8 fixes --- workflows/cp-leaveout/scripts/extract-node-info.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index bcb1bcf6..2e444cd9 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -7,8 +7,7 @@ # Use print-node-info to print the node info # See Node.py for the data structure -import argparse, logging, os, pickle, sys -import pprint +import argparse, logging, os, pickle from utils import fail from Node import Node @@ -25,10 +24,11 @@ logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger("extract_node_info") + def read_log_filenames(log_list): result = [] count = 0 - limit = 5000 # Reduce this for debugging + limit = 5000 # Reduce this for debugging try: with open(log_list) as fp: for line in fp.readlines(): @@ -43,6 +43,7 @@ def read_log_filenames(log_list): fail(e, os.EX_IOERR, "Could not read: " + log_list) return result + def parse_logs(log_files): # Dict mapping Node id to Node for all complete Nodes: nodes = {} @@ -61,6 +62,7 @@ def parse_logs(log_files): fail(e, os.EX_IOERR, "Could not read: " + log_file) return nodes + def parse_log(log_fp, nodes): nodes_found = 0 node_current = None @@ -107,9 +109,11 @@ def parse_log(log_fp, nodes): logger.info("Found %i nodes in log." 
% nodes_found) + def trace(message): logger.log(level=logging.DEBUG-5, msg=message) + # def find_val_data(node): # python_log = args.directory + "/run/%s/save/python.log" % node.id # if not os.path.exists(python_log): @@ -119,15 +123,17 @@ def trace(message): # if node.val_data == None: # logger.fatal("Could not find val data for node: " + node.id) + def find_error_data(node): python_log = args.directory + "/run/%s/save/python.log" % node.id if not os.path.exists(python_log): return with open(python_log) as fp: node.parse_error_data(fp) - if node.mse == None: + if node.mse is None: logger.fatal("Could not find error data for node: " + node.id) + # List of log file names log_files = read_log_filenames(log_list) # Dict mapping Node id to Node for all complete Nodes From 04caa7f97f238a675d4afd3f82662a023beea6c4 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 17:59:51 -0600 Subject: [PATCH 218/601] flake8 fixes --- workflows/cp-leaveout/scripts/Node.py | 28 +++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 8150723b..a5793eb2 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -8,6 +8,7 @@ # import math + class Node: # TensorFlow is done when you see this @@ -138,7 +139,7 @@ def parse_date_start(self, line): def parse_date_stop(self, line, logger=None): tokens = line.split() self.date_stop = tokens[0] + " " + tokens[1] - if self.epochs_planned == None: + if self.epochs_planned is None: self.debug(logger, "STOP : epochs_planned=None") return if self.epochs_actual == self.epochs_planned or \ @@ -157,7 +158,7 @@ def parse_training_done(self, line, logger=None): td = td + 1 stepii = tokens[td-1].split("/") self.steps += int(stepii[0]) - time_s = tokens[td+2] # e.g., "321s" + time_s = tokens[td+2] # e.g., "321s" self.time += int(time_s[0:-1]) # Always collect losses: early stopping could happen: self.loss = float(tokens[td+6]) @@ -188,7 +189,7 @@ def parse_error_data(self, fp): marker = "Comparing y_true " # The marker is just after the date: # We search this way for speed. - date_len = len("YYYY-MM-DD HH:MM:SS ") # trailing space + date_len = len("YYYY-MM-DD HH:MM:SS ") # trailing space while True: line = fp.readline() if line == "": break @@ -209,24 +210,24 @@ def parse_error_data(self, fp): # Loop! 
We want the last such values in the file def get_loss_delta(node): - if node.loss_delta == None: + if node.loss_delta is None: raise ValueError("No loss_delta!") return node.loss_delta def get_val_loss_delta(node): - if node.val_loss_delta == None: + if node.val_loss_delta is None: raise ValueError("No val_loss_delta!") return node.val_loss_delta def debug(self, logger, message): # assert(logger != None) # Use this to find missing loggers - if logger == None or not self.verbose: + if logger is None or not self.verbose: return logger.debug("NODE: [%s] %s" % (self.id, message)) def trace(self, logger, message): # assert(logger != None) # Use this to find missing loggers - if logger == None or not self.verbose: + if logger is None or not self.verbose: return import logging logger.log(level=logging.DEBUG-5, @@ -235,35 +236,38 @@ def trace(self, logger, message): def get_time_cumul(self, nodes): ''' Time cumulative including parents' time ''' parent = self.parent() - if parent == None: + if parent is None: return self.time return self.time + nodes[parent].get_time_cumul(nodes) def get_epochs_cumul(self, nodes): ''' Epochs cumulative including parents' epochs ''' - if self.epochs_cumul != None: + if self.epochs_cumul is not None: return self.epochs_cumul # Initialize: self.epochs_cumul = self.epochs_actual parent = self.parent() - if parent != None and parent in nodes: + if parent is not None and parent in nodes: # Add parents: self.epochs_cumul += nodes[parent].get_epochs_cumul(nodes) return self.epochs_cumul + def check_token(line, index, token): ''' Assert that token is in line at given index ''' tokens = line.split() if tokens[index] != token: - raise Exception(("could not find token: '%s'\n" + - "in line: '%s'") % (token, line)) + raise Exception(("could not find token: '%s'\n" + + "in line: '%s'") % (token, line)) return tokens + def check(condition, message): ''' Check condition or raise Exception with given message ''' if not condition: raise Exception(message) + ''' EXAMPLES: From ca1ce9131d67b92d06f1d8762399d7933f91c8c3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 18:01:58 -0600 Subject: [PATCH 219/601] flake8 fixes --- workflows/cp-leaveout/scripts/extract-node-info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 2e444cd9..ad891dce 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -93,12 +93,12 @@ def parse_log(log_fp, nodes): elif line.startswith("Epoch ") and "/" in line: node_current.parse_epoch_status(line, logger) elif Node.training_done in line: - node_current.parse_training_done(line, logger) + node_current.parse_training_done(line, logger) elif "early stopping" in line: - if node_current != None: + if node_current is not None: # TensorFlow may report early stopping even if at max epochs: node_current.stop_early() - if node_current != None and node_current.complete: + if node_current is not None and node_current.complete: # Store a complete Node in global dict nodes # logger.debug("NODE DONE.") nodes[node_current.id] = node_current From 0ac93e75f9570c7c77d18305afe985bc98672be8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 18:17:48 -0600 Subject: [PATCH 220/601] Better error reporting --- workflows/cp-leaveout/scripts/Node.py | 31 +++++++++++++++++---------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git 
a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index a5793eb2..6e7b4670 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -115,6 +115,11 @@ def maybe_str_float(f, spec): return "?" return spec % f + def bad_line(self, line): + print("") + print("BAD LINE: " + line) + print("") + def parse_epochs(self, line, logger=None): tokens = line.split() self.epochs_planned = int(tokens[-1].strip()) @@ -152,17 +157,21 @@ def parse_training_done(self, line, logger=None): # by parse_epoch_status() # First, find the location of training_done (td) # (to accommodate prefixes) - tokens = line.split() - td = 0 - while tokens[td] != Node.training_done: - td = td + 1 - stepii = tokens[td-1].split("/") - self.steps += int(stepii[0]) - time_s = tokens[td+2] # e.g., "321s" - self.time += int(time_s[0:-1]) - # Always collect losses: early stopping could happen: - self.loss = float(tokens[td+6]) - self.val_loss = float(tokens[td+15]) + try: + tokens = line.split() + td = 0 + while tokens[td] != Node.training_done: + td = td + 1 + stepii = tokens[td-1].split("/") + self.steps += int(stepii[0]) + time_s = tokens[td+2] # e.g., "321s" + self.time += int(time_s[0:-1]) + # Always collect losses: early stopping could happen: + self.loss = float(tokens[td+5]) + self.val_loss = float(tokens[td+14]) + except Exception as e: + self.bad_line(line) + raise(e) def parse_val_data(self, fp): """ From ff4565c53d6dc61efb9effa5d6e5acc9683b49a6 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 18:18:09 -0600 Subject: [PATCH 221/601] Shrink output before processing --- workflows/cp-leaveout/scripts/extract-node-info.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.sh b/workflows/cp-leaveout/scripts/extract-node-info.sh index e941fc10..08b62706 100755 --- a/workflows/cp-leaveout/scripts/extract-node-info.sh +++ b/workflows/cp-leaveout/scripts/extract-node-info.sh @@ -27,13 +27,17 @@ LOG_LIST=$DIR/log-list.txt RESTARTS=( $DIR/restarts/* ) +for RESTART in ${RESTARTS[@]} +do + $THIS/shrink-output.sh $RESTART +done shopt -s nullglob # Ignore empty globs { for RESTART in ${RESTARTS[@]} do - echo $RESTART/out/out-*.txt + echo $RESTART/out/summary-*.txt done - echo $DIR/out/out-*.txt + echo $DIR/out/summary-*.txt } | fmt -w 1 > $LOG_LIST export PYTHONPATH+=:$SUPERVISOR/workflows/common/python From 50f948f74b44e074d6b20d736025cbe2ac86b548 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 18:18:36 -0600 Subject: [PATCH 222/601] Better message --- workflows/cp-leaveout/scripts/extract-node-info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index ad891dce..5a2a5032 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -47,7 +47,7 @@ def read_log_filenames(log_list): def parse_logs(log_files): # Dict mapping Node id to Node for all complete Nodes: nodes = {} - logger.warning("Opening %i out.txt files..." % len(log_files)) + logger.warning("Opening %i files..." 
% len(log_files)) try: total = len(log_files) index = 0 From c921f2a70d7c2375d4a9c44eac062d3f9d0843a3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 18:53:41 -0600 Subject: [PATCH 223/601] Update line parser for TF 2.5.0 --- workflows/cp-leaveout/scripts/extract-node-info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 5a2a5032..0210c9da 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -92,7 +92,7 @@ def parse_log(log_fp, nodes): node_current.parse_epochs(line, logger) elif line.startswith("Epoch ") and "/" in line: node_current.parse_epoch_status(line, logger) - elif Node.training_done in line: + elif Node.training_done in line and "ETA:" not in line: node_current.parse_training_done(line, logger) elif "early stopping" in line: if node_current is not None: From 756e59041e085b353013773cf91166e6bf432e08 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 18:58:50 -0600 Subject: [PATCH 224/601] flake8 fixes --- workflows/common/python/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/workflows/common/python/utils.py b/workflows/common/python/utils.py index f8a8b3ce..3259b51c 100644 --- a/workflows/common/python/utils.py +++ b/workflows/common/python/utils.py @@ -1,29 +1,36 @@ # UTILS PY +import os + + def fail(*args): if len(args) == 1: fail1(args[0]) elif len(args) == 3: fail3(*args) + def fail1(message): """ Fail with message, return exit code 1 """ print(message) exit(1) + def fail3(e, code, message): """ Fail with message due to Exception e , return exit code """ print(message) print(str(e)) exit(code) + def avg(values): total = 0.0 for v in values: total += v return total / len(values) + def append(filename, text): try: with open(filename, 'a') as fp: From d3128ba4f66287841870f1b559b3abbd68b866bc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 19:43:36 -0600 Subject: [PATCH 225/601] Adding scripts/avg-stage.py scripts/avg-stage.sh --- workflows/cp-leaveout/scripts/avg-stage.py | 54 ++++++++++++++++++++++ workflows/cp-leaveout/scripts/avg-stage.sh | 26 +++++++++++ 2 files changed, 80 insertions(+) create mode 100644 workflows/cp-leaveout/scripts/avg-stage.py create mode 100755 workflows/cp-leaveout/scripts/avg-stage.sh diff --git a/workflows/cp-leaveout/scripts/avg-stage.py b/workflows/cp-leaveout/scripts/avg-stage.py new file mode 100644 index 00000000..910e5fe3 --- /dev/null +++ b/workflows/cp-leaveout/scripts/avg-stage.py @@ -0,0 +1,54 @@ + +# AVG STAGE PY + +import argparse, os, pickle, statistics + +from utils import fail + +STAGE_ANY = 0 + +parser = argparse.ArgumentParser(description="Finds loss increases.") +parser.add_argument("directory", + help="The experiment directory (EXPID)") +parser.add_argument("--filename", "-f", + default="node-info", + help="Change the node pkl file name") +args = parser.parse_args() + + +node_pkl = args.directory + "/" + args.filename + ".pkl" + +try: + with open(node_pkl, "rb") as fp: + # This is a dict ("node_id" -> Node) + data = pickle.load(fp) +except IOError as e: + fail(e, os.EX_IOERR, "Could not read: " + node_pkl) + +print("total nodes: %i" % len(data)) + +# Total Node count: +total = 0 +# stages = { 1:[], 2:[], 3:[], 4:[], 5:[] } +# epochs = { 1:[], 2:[], 3:[], 4:[], 5:[] } +times = { 1:[], 2:[], 3:[], 4:[], 5:[] } +vlosses = { 1:[], 2:[], 3:[], 4:[], 5:[] } + 
+for node_id in data.keys(): + node = data[node_id] + # stages[node.stage].append(node.time) + # epochs[node.stage].append(node.epochs_actual) + times[node.stage].append(node.time/node.epochs_actual) + vlosses[node.stage].append(node.val_loss) + +with open(args.directory + "/times.data", "w") as fp: + for stage in times.keys(): + count = len(times[stage]) + timer = statistics.mean(times[stage]) + fp.write("%i %0.2f # count=%i\n" % (stage, timer, count)) + +with open(args.directory + "/vlosses.data", "w") as fp: + for stage in times.keys(): + count = len(times[stage]) + vloss = statistics.mean(vlosses[stage]) + fp.write("%i %0.6f # count=%i\n" % (stage, vloss, count)) diff --git a/workflows/cp-leaveout/scripts/avg-stage.sh b/workflows/cp-leaveout/scripts/avg-stage.sh new file mode 100755 index 00000000..913d0353 --- /dev/null +++ b/workflows/cp-leaveout/scripts/avg-stage.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +# AVG STAGE SH + +# Input: Provide an experiment directory +# Output: Per-stage averages printed to plottable files + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if [[ ! -d $DIR ]] +then + echo "$0: Given experiment directory does not exist: $DIR" + exit 1 +fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/avg-stage.py ${*} From 2090519b2b4227f87ff98cf6576c19e4005765b9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 21:09:00 -0600 Subject: [PATCH 226/601] Better Node parsing stuff --- workflows/cp-leaveout/scripts/Node.py | 23 +++++++++++++++- .../cp-leaveout/scripts/extract-node-info.py | 26 ++++++++++++++++--- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 6e7b4670..3f1a7e74 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -43,6 +43,10 @@ def __init__(self, id=None, logger=None): self.date_stop = None # Training time in seconds self.time = 0 + # Training run restarts- each log file makes a new segment + self.segment = 0 + # Time for given segment from "Current time" + self.segments = {} # Did EarlyStopping stop this node? self.stopped_early = False # Did training complete for this node? @@ -56,6 +60,9 @@ def set_id(self, id, logger=None): self.stage = (len(self.id) - 1 ) // 2 self.debug(logger, "SET ID: " + id) + def new_segment(self): + self.segment += 1 + def parent(self): if self.stage == 1: return None @@ -86,7 +93,7 @@ def str_table(self): special = " INCOMPLETE!" if self.stopped_early: special = " EARLY STOP!" 
- return "%-12s : %i : %2i / %2i : %s - %s : %s : %s" % \ + return "%-13s : %i : %2i / %2i : %s - %s : %s : %s" % \ (self.id, self.stage, self.epochs_actual, self.epochs_planned, self.date_start, self.date_stop, @@ -133,6 +140,14 @@ def parse_epoch_status(self, line, logger=None): self.epochs_actual = int(ints[0]) self.trace(logger, "epochs_actual: " + str(self.epochs_actual)) + def parse_current_time(self, line, logger=None): + tokens = line.split() + assert len(tokens) == 3, "bad line: " + line + # Chop off leading dots: ....123.123 + t = tokens[2][4:] + self.segments[self.segment] = float(t) + # print("%-13s %i %r" % (self.id, self.segment, self.segments)) + def stop_early(self, logger=None): self.stopped_early = True self.debug(logger, "STOP EARLY") @@ -249,6 +264,12 @@ def get_time_cumul(self, nodes): return self.time return self.time + nodes[parent].get_time_cumul(nodes) + def get_segments(self): + total = 0 + for s, t in self.segments.items(): + total += t + return total + def get_epochs_cumul(self, nodes): ''' Epochs cumulative including parents' epochs ''' if self.epochs_cumul is not None: diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 0210c9da..c8dfba3f 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -74,7 +74,10 @@ def parse_log(log_fp, nodes): # This is also a MODEL RUNNER line, # but could be DEBUG or INFO # (should be INFO in future) - logger.debug("RUN DONE.") + if node_current is None: + # Restarted node with no epochs remaining: + continue + logger.info("RUN DONE.") node_current.parse_date_stop(line, logger) elif "MODEL RUNNER" in line: # print(line.strip()) @@ -84,14 +87,30 @@ def parse_log(log_fp, nodes): node_current = Node(logger=logger) node_current.parse_date_start(line) elif " node =" in line: - print(line) + logger.info("start: " + line) tokens = line.split() node_id = tokens[-1].strip() - node_current.set_id(node_id, logger) + if node_id not in nodes: + if node_id == "1.2.3.4": + print("NEW NODE") + node_current.set_id(node_id, logger) + nodes[node_id] = node_current + else: + if node_id == "1.2.3.4": + print("REFIND") + node_current = nodes[node_id] + node_current.new_segment() elif " epochs =" in line: + if node_current is None: + # Restarted node with no epochs remaining: + continue + logger.info(line) + logger.info("found epochs =") node_current.parse_epochs(line, logger) elif line.startswith("Epoch ") and "/" in line: node_current.parse_epoch_status(line, logger) + elif line.startswith("Current "): + node_current.parse_current_time(line, logger) elif Node.training_done in line and "ETA:" not in line: node_current.parse_training_done(line, logger) elif "early stopping" in line: @@ -101,7 +120,6 @@ def parse_log(log_fp, nodes): if node_current is not None and node_current.complete: # Store a complete Node in global dict nodes # logger.debug("NODE DONE.") - nodes[node_current.id] = node_current # find_val_data(node_current) # old format? 
find_error_data(node_current) nodes_found += 1 From 171533c792e9aa01d12168cefb32ee95d76b2665 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 21:09:32 -0600 Subject: [PATCH 227/601] Use segment times --- workflows/cp-leaveout/scripts/avg-stage.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/avg-stage.py b/workflows/cp-leaveout/scripts/avg-stage.py index 910e5fe3..16c9d7ed 100644 --- a/workflows/cp-leaveout/scripts/avg-stage.py +++ b/workflows/cp-leaveout/scripts/avg-stage.py @@ -36,14 +36,21 @@ for node_id in data.keys(): node = data[node_id] + if not node.complete: + continue # stages[node.stage].append(node.time) # epochs[node.stage].append(node.epochs_actual) - times[node.stage].append(node.time/node.epochs_actual) + times[node.stage].append(node.get_segments()/node.epochs_actual) vlosses[node.stage].append(node.val_loss) + if node.stage == 3: + print("%s %0.2f %i" % (node.id, + node.get_segments(), + node.epochs_actual)) with open(args.directory + "/times.data", "w") as fp: for stage in times.keys(): count = len(times[stage]) + # print("stage: %i (%i) %r" % (stage, count, times[stage])) timer = statistics.mean(times[stage]) fp.write("%i %0.2f # count=%i\n" % (stage, timer, count)) From 98ef241ea2f3de38cbd0ae54fabf4daea4b4ba64 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 Nov 2021 21:16:09 -0600 Subject: [PATCH 228/601] Rename data file --- workflows/cp-leaveout/scripts/avg-stage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/avg-stage.py b/workflows/cp-leaveout/scripts/avg-stage.py index 16c9d7ed..1ad076b8 100644 --- a/workflows/cp-leaveout/scripts/avg-stage.py +++ b/workflows/cp-leaveout/scripts/avg-stage.py @@ -54,7 +54,7 @@ timer = statistics.mean(times[stage]) fp.write("%i %0.2f # count=%i\n" % (stage, timer, count)) -with open(args.directory + "/vlosses.data", "w") as fp: +with open(args.directory + "/vloss.data", "w") as fp: for stage in times.keys(): count = len(times[stage]) vloss = statistics.mean(vlosses[stage]) From 54a14ba639f0c8be127fca34e1949745825352ce Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 28 Nov 2021 10:43:24 -0600 Subject: [PATCH 229/601] New plot cfgs --- workflows/cp-leaveout/scripts/stage-times.cfg | 13 +++++++++++++ workflows/cp-leaveout/scripts/stage-vloss.cfg | 13 +++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 workflows/cp-leaveout/scripts/stage-times.cfg create mode 100644 workflows/cp-leaveout/scripts/stage-vloss.cfg diff --git a/workflows/cp-leaveout/scripts/stage-times.cfg b/workflows/cp-leaveout/scripts/stage-times.cfg new file mode 100644 index 00000000..5d30b734 --- /dev/null +++ b/workflows/cp-leaveout/scripts/stage-times.cfg @@ -0,0 +1,13 @@ +xlabel = stage +ylabel = time (seconds) + +width = 800 +height = 600 + +label.times-X743.data = Summit E=10 +label.times-X744.data = Summit E=50 +label.times-X750.data = Spock E=10 +label.times-X746.data = Spock E=50 + + +# legend.enabled = false diff --git a/workflows/cp-leaveout/scripts/stage-vloss.cfg b/workflows/cp-leaveout/scripts/stage-vloss.cfg new file mode 100644 index 00000000..dadeddb3 --- /dev/null +++ b/workflows/cp-leaveout/scripts/stage-vloss.cfg @@ -0,0 +1,13 @@ +xlabel = stage +ylabel = validation loss + +width = 800 +height = 600 + +label.vloss-X743.data = Summit E=10 +label.vloss-X744.data = Summit E=50 +label.vloss-X750.data = Spock E=10 +label.vloss-X746.data = Spock E=50 + + +# legend.enabled = false From 
62af22c0bbc6586cb14127717b7d2b8daf8cba42 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 28 Nov 2021 10:51:01 -0600 Subject: [PATCH 230/601] Capture more I/O times --- workflows/cp-leaveout/scripts/Node.py | 17 +++++++++++ .../cp-leaveout/scripts/extract-node-info.py | 29 +++++++++++++++---- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 3f1a7e74..b25afd94 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -41,12 +41,18 @@ def __init__(self, id=None, logger=None): self.epochs_cumul = None self.date_start = None self.date_stop = None + # Time to build dataframe + self.build_df = None + # Time to load initial weights + self.load_initial = None # Training time in seconds self.time = 0 # Training run restarts- each log file makes a new segment self.segment = 0 # Time for given segment from "Current time" self.segments = {} + # Bandwidths for checkpoint write by segment + self.ckpt_writes = {} # Did EarlyStopping stop this node? self.stopped_early = False # Did training complete for this node? @@ -132,6 +138,11 @@ def parse_epochs(self, line, logger=None): self.epochs_planned = int(tokens[-1].strip()) self.trace(logger, "epochs_planned: %i" % self.epochs_planned) + def parse_load_initial(self, line, logger=None): + tokens = line.split() + self.load_initial = float(tokens[4]) + print("load_initial: " + str(self.load_initial)) + def parse_epoch_status(self, line, logger=None): tokens = line.split() assert len(tokens) == 2, "bad line: " + line @@ -148,6 +159,12 @@ def parse_current_time(self, line, logger=None): self.segments[self.segment] = float(t) # print("%-13s %i %r" % (self.id, self.segment, self.segments)) + def parse_model_write(self, line, logger=None): + tokens = line.split() + t = float(tokens[7][1:]) + self.ckpt_writes[self.segment] = t + self.trace(logger, "model_write: %0.3f" % t) + def stop_early(self, logger=None): self.stopped_early = True self.debug(logger, "STOP EARLY") diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index c8dfba3f..db723d82 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -25,6 +25,7 @@ logger = logging.getLogger("extract_node_info") + def read_log_filenames(log_list): result = [] count = 0 @@ -66,6 +67,10 @@ def parse_logs(log_files): def parse_log(log_fp, nodes): nodes_found = 0 node_current = None + # Temporary way to capture build DF time, which happens before + # node_current is defined. Fixing log format to address this. 
2021-11-28 + build_df = None + while True: line = log_fp.readline() # print(line) @@ -91,28 +96,31 @@ def parse_log(log_fp, nodes): tokens = line.split() node_id = tokens[-1].strip() if node_id not in nodes: - if node_id == "1.2.3.4": - print("NEW NODE") node_current.set_id(node_id, logger) nodes[node_id] = node_current + if build_df is not None: + node_current.build_df = build_df + build_df = None else: - if node_id == "1.2.3.4": - print("REFIND") node_current = nodes[node_id] node_current.new_segment() elif " epochs =" in line: if node_current is None: # Restarted node with no epochs remaining: continue - logger.info(line) - logger.info("found epochs =") node_current.parse_epochs(line, logger) + elif line.startswith("data_setup: build_dataframe() OK"): + build_df = parse_build_df(line, logger) + elif line.startswith("Loaded from initial_weights"): + node_current.parse_load_initial(line, logger) elif line.startswith("Epoch ") and "/" in line: node_current.parse_epoch_status(line, logger) elif line.startswith("Current "): node_current.parse_current_time(line, logger) elif Node.training_done in line and "ETA:" not in line: node_current.parse_training_done(line, logger) + elif line.startswith("model wrote:"): + node_current.parse_model_write(line, logger) elif "early stopping" in line: if node_current is not None: # TensorFlow may report early stopping even if at max epochs: @@ -128,6 +136,15 @@ def parse_log(log_fp, nodes): logger.info("Found %i nodes in log." % nodes_found) +def parse_build_df(line, logger=None): + tokens = line.split() + assert len(tokens) == 6 + global build_df + build_df = float(tokens[4]) + logger.info("build_df: %0.2f" % build_df) + return build_df + + def trace(message): logger.log(level=logging.DEBUG-5, msg=message) From 88920fab8a094c3cb545b7d62bde54d756b8087e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 6 Jan 2022 11:13:36 -0600 Subject: [PATCH 231/601] Specify epochs --- workflows/cp-leaveout/swift/baseline-error.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/baseline-error.swift b/workflows/cp-leaveout/swift/baseline-error.swift index 409acd2b..bd766e81 100644 --- a/workflows/cp-leaveout/swift/baseline-error.swift +++ b/workflows/cp-leaveout/swift/baseline-error.swift @@ -25,6 +25,7 @@ file file_nodes = input(argv("nodes")); // Mapping from node ID to epochs, one per line // file file_epochs = input(argv("epochs")); int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); +int epochs_all = string2int(argv("E", "50")); // == Command-line Arguments End == // == Environment Settings Begin == @@ -77,7 +78,7 @@ foreach node, i in nodes_lines // Fill in missing hyperparameters: string training_data = "%s/run/%s/topN.uno.h5" % (reference, node); // int epochs = string2int(map_epochs[node]); - int epochs = 250; + int epochs = epochs_all; string params = params_template % (dataframe_csv, epochs, node, training_data); // NOTE: obj() is in the obj_*.swift supplied by workflow.sh results[i] = obj(params, node); From fde43000c7ee14621ebc820edd958d8271b0cfda Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 6 Jan 2022 11:24:15 -0600 Subject: [PATCH 232/601] WS --- workflows/common/sh/utils.sh | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 68cc9e4b..b44e3be4 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -353,29 +353,29 @@ 
queue_wait_site() site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs to have hyphens in them as Justin implemented for Summit on 2020-10-29, e.g., summit-tf1 if [[ $site2 == "cori" ]] - then - queue_wait_slurm $JOBID - elif [[ $site2 == "theta" ]] - then - queue_wait_cobalt $JOBID - elif [[ $site2 == "titan" ]] - then - queue_wait_pbs $JOBID - elif [[ $site2 =~ summit* ]] - then - queue_wait_lsf $JOBID - elif [[ $site2 == "pascal" ]] - then - queue_wait_slurm $JOBID - elif [[ $site2 == "biowulf" ]] - then - queue_wait_slurm $JOBID - else - echo "queue_wait(): unknown site: $SITE" - return 1 - fi + then + queue_wait_slurm $JOBID + elif [[ $site2 == "theta" ]] + then + queue_wait_cobalt $JOBID + elif [[ $site2 == "titan" ]] + then + queue_wait_pbs $JOBID + elif [[ $site2 =~ summit* ]] + then + queue_wait_lsf $JOBID + elif [[ $site2 == "pascal" ]] + then + queue_wait_slurm $JOBID + elif [[ $site2 == "biowulf" ]] + then + queue_wait_slurm $JOBID + else + echo "queue_wait(): unknown site: $SITE" + return 1 + fi - echo "Job completed: $JOBID" + echo "Job completed: $JOBID" } queue_wait_slurm() From 58d4f218c2ed0e0735dd607ed7341a5f114e678e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 6 Jan 2022 11:25:02 -0600 Subject: [PATCH 233/601] Drop Titan --- workflows/common/sh/utils.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index b44e3be4..8c969896 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -358,9 +358,6 @@ queue_wait_site() elif [[ $site2 == "theta" ]] then queue_wait_cobalt $JOBID - elif [[ $site2 == "titan" ]] - then - queue_wait_pbs $JOBID elif [[ $site2 =~ summit* ]] then queue_wait_lsf $JOBID From fead58be68289855270fd885003b0cdcc7a7b561 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 6 Jan 2022 13:17:14 -0600 Subject: [PATCH 234/601] Better output shrinker --- .../scripts/shrink-output-single.sh | 16 +++++++ .../cp-leaveout/scripts/shrink-output.mk | 7 +-- .../cp-leaveout/scripts/shrink-output.py | 43 ++++++++++++++----- .../cp-leaveout/scripts/shrink-output.sh | 5 ++- 4 files changed, 54 insertions(+), 17 deletions(-) create mode 100755 workflows/cp-leaveout/scripts/shrink-output-single.sh diff --git a/workflows/cp-leaveout/scripts/shrink-output-single.sh b/workflows/cp-leaveout/scripts/shrink-output-single.sh new file mode 100755 index 00000000..489a59ed --- /dev/null +++ b/workflows/cp-leaveout/scripts/shrink-output-single.sh @@ -0,0 +1,16 @@ +#!/bin/sh +set -eu + +# SHRINK OUTPUT SINGLE SH +# Called by shrink-output.mk + +INPUT=$1 +OUTPUT=$2 + +D=/tmp/${USER}/shrink +T=${INPUT/out/tr} + +tr "\r" "\n" < $INPUT > $T +python $THIS/shrink-output.py $T $OUTPUT + +rm $T diff --git a/workflows/cp-leaveout/scripts/shrink-output.mk b/workflows/cp-leaveout/scripts/shrink-output.mk index 65607e07..c62878a7 100644 --- a/workflows/cp-leaveout/scripts/shrink-output.mk +++ b/workflows/cp-leaveout/scripts/shrink-output.mk @@ -7,8 +7,5 @@ SUMMARIES = $(subst out-,summary-,$(OUTS)) all: $(SUMMARIES) -/tmp/${USER}/tr-%.txt: out-%.txt - @ tr "\r" "\n" < $(<) > $(@) - -summary-%.txt: /tmp/${USER}/tr-%.txt - @ python $(THIS)/shrink-output.py $(<) $(@) +summary-%.txt: out-%.txt + @ ${THIS}/shrink-output-single.sh $(<) $(@) diff --git a/workflows/cp-leaveout/scripts/shrink-output.py b/workflows/cp-leaveout/scripts/shrink-output.py index c3a2a1f0..880f12be 100644 --- a/workflows/cp-leaveout/scripts/shrink-output.py +++ 
b/workflows/cp-leaveout/scripts/shrink-output.py @@ -1,19 +1,19 @@ # SHRINK OUTPUT PY -# Receives list of filenames on stdin -# Converts filenames from tr-*.txt to summary-*.txt -# The tr file should have used tr to change carriage return to newline +# argv: 2 filenames : tr-*.txt and summary-*.txt +# Called by shrink-output-single.sh +# The tr-*.txt file should have used tr to change CR to NL # Removes non-printing characters (backspace) # Reduces the number of training lines in output # Removes redundant batch size information # Fixes newline before "Current time" report -import os, re, sys +import os, re, stat, sys, time from collections import deque # Only 1/shrink_factor training lines are copied -shrink_factor = 100 +shrink_factor = 200 # Number of additional consecutive lines at beginning and end of # training that are retained hold_space = 3 @@ -54,6 +54,18 @@ def shrink(fp_in, fp_out): fp_out.write(Q.popleft()) +# From https://www.codegrepper.com/code-examples/python/python+get+human+readable+file+size +def hsize(size, decimal_places=2): + if size < 1024: + return "%4i B" % size + size /= 1024 + for unit in ["KB","MB","GB","TB"]: + if size < 1024: + break + size /= 1024 + return f"{size:.{decimal_places}f} {unit}" + + file_in = sys.argv[1] file_out = sys.argv[2] @@ -64,12 +76,23 @@ def shrink(fp_in, fp_out): print("skipping: " + file_in) exit() -print("shrinking: " + file_in) +t0 = time.time() +s0 = os.stat(file_in) +z0 = s0[stat.ST_SIZE] +h0 = hsize(z0) +print("shrink: %11s %s" % + (h0, file_in)) + with open(file_in, "r") as fp_in: with open(file_out, "w") as fp_out: shrink(fp_in, fp_out) -# files_shrunk += 1 -# print("shrink-output.py: shrank %i / %i files." % -# (files_shrunk, files_total)) -# print("shrink-output.py: OK") +s1 = os.stat(file_out) +t1 = time.time() +z1 = s1[stat.ST_SIZE] + +t = t1 - t0 +rate = hsize(z0/t) + +print("shrank: %0.2fs %11s/s %11s -> %11s %s" % + (t, rate, hsize(z0), hsize(z1), file_in)) diff --git a/workflows/cp-leaveout/scripts/shrink-output.sh b/workflows/cp-leaveout/scripts/shrink-output.sh index c34503cf..0492d513 100755 --- a/workflows/cp-leaveout/scripts/shrink-output.sh +++ b/workflows/cp-leaveout/scripts/shrink-output.sh @@ -5,6 +5,7 @@ set -eu # Accepts a whole workflow output directory # Clean up and shrink TensorFlow output # See shrink-output.py for details +# Parallelizable via make THIS=$( readlink --canonicalize $( dirname $0 ) ) CPLO=$( readlink --canonicalize $THIS/.. 
)
@@ -25,7 +26,7 @@ then
 fi
 
 # This is used inside the Makefile below:
-mkdir -pv /tmp/$USER
+mkdir -pv /tmp/$USER/shrink
 
 cd $DIR/out
-nice -n 19 make -j 8 -f $THIS/shrink-output.mk
+nice -n 19 make -j 1 -f $THIS/shrink-output.mk

From f0d72b4b946cc2818ee3185558d33cc0cf0af346 Mon Sep 17 00:00:00 2001
From: Justin Wozniak
Date: Wed, 26 Jan 2022 12:39:56 -0600
Subject: [PATCH 235/601] WS

---
 workflows/cp-leaveout/scripts/extract-node-info.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py
index db723d82..d04d3cf0 100644
--- a/workflows/cp-leaveout/scripts/extract-node-info.py
+++ b/workflows/cp-leaveout/scripts/extract-node-info.py
@@ -25,7 +25,6 @@
 
 logger = logging.getLogger("extract_node_info")
 
-
 def read_log_filenames(log_list):
     result = []
     count = 0

From 0f13bf962cbba054c041cc9b04e6a394398fd668 Mon Sep 17 00:00:00 2001
From: Justin Wozniak
Date: Fri, 28 Jan 2022 13:40:47 -0600
Subject: [PATCH 236/601] Ignore case with no restart directories; shrink latest data too

---
 workflows/cp-leaveout/scripts/extract-node-info.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/workflows/cp-leaveout/scripts/extract-node-info.sh b/workflows/cp-leaveout/scripts/extract-node-info.sh
index 08b62706..d291a47e 100755
--- a/workflows/cp-leaveout/scripts/extract-node-info.sh
+++ b/workflows/cp-leaveout/scripts/extract-node-info.sh
@@ -25,13 +25,15 @@ fi
 # (this could contain thousands of entries, too long for command line):
 LOG_LIST=$DIR/log-list.txt
 
+shopt -s nullglob # Ignore empty globs
 RESTARTS=( $DIR/restarts/* )
 for RESTART in ${RESTARTS[@]}
 do
   $THIS/shrink-output.sh $RESTART
 done
 
-shopt -s nullglob # Ignore empty globs
+$THIS/shrink-output.sh $DIR
+
 {
   for RESTART in ${RESTARTS[@]}
   do

From b99fb6bf4ea1b2c7351e1e9f3f51670f1827b3f7 Mon Sep 17 00:00:00 2001
From: Justin Wozniak
Date: Fri, 28 Jan 2022 13:41:09 -0600
Subject: [PATCH 237/601] Report early stops

---
 workflows/cp-leaveout/scripts/print-node-info.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/workflows/cp-leaveout/scripts/print-node-info.py b/workflows/cp-leaveout/scripts/print-node-info.py
index e29ca4e5..2884fac4 100644
--- a/workflows/cp-leaveout/scripts/print-node-info.py
+++ b/workflows/cp-leaveout/scripts/print-node-info.py
@@ -25,5 +25,11 @@
 # print(data)
 
 # Print the node info!
+count = 0
+earlies = 0
 for node in data.values():
     print(node.str_table())
+    count += 1
+    if node.stopped_early: earlies += 1
+
+print("print-node-info: %i/%i runs stopped early." % (earlies, count))

From 0b977119dd05205575372b56b45e95a0002744ff Mon Sep 17 00:00:00 2001
From: Justin Wozniak
Date: Fri, 28 Jan 2022 13:51:43 -0600
Subject: [PATCH 238/601] flake8 fix

---
 workflows/common/python/model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py
index c0e52d08..93f6ddc4 100644
--- a/workflows/common/python/model_runner.py
+++ b/workflows/common/python/model_runner.py
@@ -239,7 +239,7 @@ def run_model(hyper_parameter_map):
             sys.stdout.flush()
             return ('SKIP', 'HISTORY_EMPTY')
     else:
-        assert(result == ModelResult.SUCCESS) # proceed...
+        assert(result == ModelResult.SUCCESS)  # proceed...
result, history = run(hyper_parameter_map, obj_return) runner_utils.write_output(result, directory) From 2e8b00e2c5aedca432ea979ac28590eac3b25473 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 28 Jan 2022 13:51:53 -0600 Subject: [PATCH 239/601] Use builtin --- workflows/common/python/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/workflows/common/python/utils.py b/workflows/common/python/utils.py index 3259b51c..2db1e08d 100644 --- a/workflows/common/python/utils.py +++ b/workflows/common/python/utils.py @@ -25,9 +25,7 @@ def fail3(e, code, message): def avg(values): - total = 0.0 - for v in values: - total += v + total = sum(values) return total / len(values) From 4fd82dae4250930afc11d2a26ac8580d3f62475d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Mar 2022 14:05:47 -0600 Subject: [PATCH 240/601] Report early stops --- workflows/cp-leaveout/scripts/epoch-count.sh | 24 ++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/workflows/cp-leaveout/scripts/epoch-count.sh b/workflows/cp-leaveout/scripts/epoch-count.sh index 478f7f06..e94db027 100755 --- a/workflows/cp-leaveout/scripts/epoch-count.sh +++ b/workflows/cp-leaveout/scripts/epoch-count.sh @@ -23,11 +23,27 @@ EXPID=$( basename $DIR ) JOBID=$( cat $DIR/jobid.txt ) show EXPID JOBID -LOGS=( $( find $DIR -name python.log ) ) -echo "epoch-count.sh: found ${#LOGS[@]} logs ..." +# Must use TMPFILE to avoid subshell for shell variables +mkdir -pv /tmp/$USER +TMPFILE=/tmp/$USER/epoch-count-XXX.tmp + +EARLIES=0 +LOGS=( $( find $DIR -name python.log | head -10 ) ) +TOTAL=${#LOGS[@]} +echo "epoch-count.sh: found $TOTAL logs ..." for LOG in ${LOGS[@]} do echo -n "$LOG :: " # Pull out the last "Epoch:" line, print only the number: - sed -n '/Epoch:/h;${g;s/.*Epoch: \([0-9]*\).*/\1/;p}' $LOG -done | nl | sort -r -n -k 4 | column -t + EPOCH=$( sed -n '/Epoch:/h;${g;s/.*Epoch: \([0-9]*\).*/\1/;p}' $LOG ) + if grep -q "stopping: early" $LOG + then + EARLY="EARLY" + (( EARLIES += 1 )) + else + EARLY="" + fi + echo $EPOCH $EARLY +done > $TMPFILE +cat $TMPFILE | nl | sort -r -n -k 4 | column -t +echo "earlies: $EARLIES / $TOTAL" From 515f74fecb9f687c4b1862927dfc96c45e4995e0 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Mar 2022 14:13:24 -0600 Subject: [PATCH 241/601] Better logging. Also fixes complete node bug. 
--- workflows/cp-leaveout/scripts/extract-node-info.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index d04d3cf0..ef9e505f 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -87,7 +87,7 @@ def parse_log(log_fp, nodes): # print(line.strip()) if "DEBUG" in line: if "PARAM UPDATE START" in line: - trace("New Node ...") + logger.debug("New Node ...") node_current = Node(logger=logger) node_current.parse_date_start(line) elif " node =" in line: @@ -101,8 +101,10 @@ def parse_log(log_fp, nodes): node_current.build_df = build_df build_df = None else: + logger.debug("lookup: " + node_id) node_current = nodes[node_id] node_current.new_segment() + node_current.complete = False elif " epochs =" in line: if node_current is None: # Restarted node with no epochs remaining: @@ -126,7 +128,7 @@ def parse_log(log_fp, nodes): node_current.stop_early() if node_current is not None and node_current.complete: # Store a complete Node in global dict nodes - # logger.debug("NODE DONE.") + logger.info("node done.") # find_val_data(node_current) # old format? find_error_data(node_current) nodes_found += 1 From 5226317ce4182ef0a35a9c00f797e0bf98b0de8b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Mar 2022 14:16:14 -0600 Subject: [PATCH 242/601] Merge --- workflows/common/python/model_runner.py | 41 +++++++++++++------ workflows/common/sh/utils.sh | 3 ++ workflows/common/swift/obj_py.swift | 3 +- workflows/cp-leaveout/db/print-stats.sh | 7 ++++ workflows/cp-leaveout/py/data_setup.py | 12 ++++-- .../cp-leaveout/scripts/extract-node-info.py | 2 +- .../scripts/shrink-output-single.sh | 1 - workflows/cp-leaveout/swift/workflow.sh | 6 ++- workflows/cp-leaveout/swift/workflow.swift | 14 +++++-- workflows/cp-leaveout/test/test-bl-1.sh | 2 +- 10 files changed, 67 insertions(+), 24 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 93f6ddc4..f0764f47 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -14,7 +14,6 @@ from runner_utils import ModelResult from log_tools import * - logger = None print('MODEL RUNNER...') @@ -159,29 +158,44 @@ def run(hyper_parameter_map, obj_return): Ps = setup_perf(params) + history = None + exception = False + + from tensorflow.errors import InvalidArgumentError + # Run the model! log("PKG RUN START") - history = pkg.run(params) + + try: + history = pkg.run(params) + except Exception as e: + logger.warn("RUN EXCEPTION: " + str(e)) + print("RUN EXCEPTION: " + str(e)) + # logger.warn("Caught InvalidArgumentError") + exception = True log("PKG RUN STOP") if framework == 'keras': runner_utils.keras_clear_session(framework) + stop_perf(Ps) + finish = time.time() + duration = finish - start + # Default result if there is no val_loss (as in infer.py) result = 0 history_result = {} - if history is not None: - if history == "EPOCHS_COMPLETED_ALREADY": - result, history_result = "EPOCHS_COMPLETED_ALREADY", None - else: - result, history_result = get_results(history, obj_return) - - stop_perf(Ps) + if not exception: + logger.info('DONE: run_id %s in %0.2f seconds.' 
% + (hyper_parameter_map['run_id'], duration)) + if history is not None: + if history == "EPOCHS_COMPLETED_ALREADY": + result, history_result = "EPOCHS_COMPLETED_ALREADY", None + else: + result, history_result = get_results(history, obj_return) + else: + result, history_result = "RUN_EXCEPTION", None - finish = time.time() - duration = finish - start - logger.info('DONE: run_id %s in %0.2f seconds.' % - (hyper_parameter_map['run_id'], duration)) return (result, history_result) @@ -228,6 +242,7 @@ def run_model(hyper_parameter_map): global logger logger = get_logger(logger, 'MODEL RUNNER') obj_return = get_obj_return() + logger.info("run_model: node: " + hyper_parameter_map['node']) directory = hyper_parameter_map['instance_directory'] os.chdir(directory) result = run_pre(hyper_parameter_map) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 8c969896..93146917 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -361,6 +361,9 @@ queue_wait_site() elif [[ $site2 =~ summit* ]] then queue_wait_lsf $JOBID + elif [[ $site2 == "spock" ]] + then + queue_wait_slurm $JOBID elif [[ $site2 == "pascal" ]] then queue_wait_slurm $JOBID diff --git a/workflows/common/swift/obj_py.swift b/workflows/common/swift/obj_py.swift index 6933e57e..d4a53c62 100644 --- a/workflows/common/swift/obj_py.swift +++ b/workflows/common/swift/obj_py.swift @@ -37,7 +37,8 @@ try: except Exception as e: info = sys.exc_info() s = traceback.format_tb(info[2]) - sys.stdout.write('EXCEPTION in obj() code: \\n' + repr(e) + ' ... \\n' + ''.join(s)) + sys.stdout.write('\\n\\nEXCEPTION in obj() code: \\n' + + repr(e) + ' ... \\n' + ''.join(s)) sys.stdout.write('\\n') sys.stdout.flush() obj_result = 'EXCEPTION' diff --git a/workflows/cp-leaveout/db/print-stats.sh b/workflows/cp-leaveout/db/print-stats.sh index 6b9eb3a9..c470afce 100755 --- a/workflows/cp-leaveout/db/print-stats.sh +++ b/workflows/cp-leaveout/db/print-stats.sh @@ -1,4 +1,5 @@ #!/bin/sh +set -eu # PRINT STATS SH @@ -10,6 +11,12 @@ fi DB=$1 +if ! which sqlite3 > /dev/null +then + echo "print-stats.sh: Add sqlite3 to PATH!" 
+ exit 1 +fi + COMPLETE=$( sqlite3 $DB < $T diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 245c160f..3517a60c 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -196,7 +196,10 @@ fi export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" mkdir -pv $TURBINE_OUTPUT/out -set -x +export TURBINE_LAUNCH_OPTIONS="-n $PROCS -c 4 --gpus-per-task=1 --gpu-bind=closest" + +which java + swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p \ @@ -228,6 +231,7 @@ swift-t -O 0 -n $PROCS \ # tee $STDOUT # +# -e HIP_VISIBLE_DEVICES="0,1" \ # -e USER # Needed on Summit to find NVME # -j /usr/bin/java # Give this to Swift/T if needed for Java diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 5e198f17..3d0b1da9 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -132,11 +132,19 @@ run_stage(int N, int S, string this, int stage, void block, if (result1 == "0") { // Run the model - obj_result = obj(json, node) - // Update the DB to complete the model run - => result2 = plangen_stop(node, plan_id); + obj_result = obj(json, node); printf("run_single(): completed: node: '%s' result: '%s'", node, obj_result); + // Update the DB to complete the model run + string result2; + if (obj_result != "RUN_EXCEPTION") + { + result2 = plangen_stop(node, plan_id); + } + else + { + result2 = "RETRY"; + } assert(obj_result != "EXCEPTION" && obj_result != "", "Exception in obj()!"); assert(result2 != "EXCEPTION", "Exception in plangen_stop()!"); diff --git a/workflows/cp-leaveout/test/test-bl-1.sh b/workflows/cp-leaveout/test/test-bl-1.sh index bfe0d4ed..2422bece 100755 --- a/workflows/cp-leaveout/test/test-bl-1.sh +++ b/workflows/cp-leaveout/test/test-bl-1.sh @@ -29,7 +29,7 @@ WORKFLOW_ARGS=$* export MODEL_NAME=uno # nt3 # Select configurations -export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_SYS=$THIS/cfg-sys-512.sh export CFG_PRM=$THIS/cfg-prm-1.sh # Data files From 6804fd948e132869f5de4b4b1e1157ec7847eff6 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Mar 2022 14:18:18 -0600 Subject: [PATCH 243/601] New clean-ckpts stuff --- .../cp-leaveout/scripts/clean-ckpts-run.sh | 51 +++++++++++++++++++ workflows/cp-leaveout/scripts/clean-ckpts.sh | 27 ++++++++++ 2 files changed, 78 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/clean-ckpts-run.sh create mode 100755 workflows/cp-leaveout/scripts/clean-ckpts.sh diff --git a/workflows/cp-leaveout/scripts/clean-ckpts-run.sh b/workflows/cp-leaveout/scripts/clean-ckpts-run.sh new file mode 100755 index 00000000..68085bc7 --- /dev/null +++ b/workflows/cp-leaveout/scripts/clean-ckpts-run.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -eu + +# CLEAN CKPTS RUN SH + +# See ./clean-ckpts.sh + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an run DIR (e.g., .../experiments/X042/run/1.2.3)!" \ + DIR - ${*} + +if [[ ! -d $DIR ]] +then + echo "$0: Given run directory does not exist: $DIR" + exit 1 +fi + +if ! [[ -d $DIR/save/ckpts/epochs ]] +then + exit +fi + +cd $DIR/save/ckpts/epochs + +MODELS=( $( ls ) ) + +echo ${MODELS[@]} + +N=${#MODELS[@]} +echo $N + +# Do not clean the last 3 models +for (( i=0 ; i<$N-3 ; i++ )) +do + MODEL=${MODELS[$i]} + # Use 10# to force MODEL as base-10 + # (Bash treats e.g. 
MODEL=010 as octal) + if (( 10#$MODEL % 5 == 0 )) + then + continue + fi + if ! [[ -f $MODEL/model.h5 ]] + then + continue + fi + rm -v $MODEL/model.h5 +done diff --git a/workflows/cp-leaveout/scripts/clean-ckpts.sh b/workflows/cp-leaveout/scripts/clean-ckpts.sh new file mode 100755 index 00000000..4721f4a7 --- /dev/null +++ b/workflows/cp-leaveout/scripts/clean-ckpts.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +# CLEAN CKPTS SH + +# Clean up old checkpoints + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if [[ ! -d $DIR ]] +then + echo "$0: Given experiment directory does not exist: $DIR" + exit 1 +fi + +RUNS=( $( echo $DIR/run/* ) ) + +for RUN in ${RUNS[@]} +do + set -x + $THIS/clean-ckpts-run.sh $RUN +done From a04a1fb6ab6c3b9c2494c5ffabdb9d750dd57552 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Mar 2022 14:18:51 -0600 Subject: [PATCH 244/601] New plots for I/O times --- .../cp-leaveout/scripts/plot-io-times.sh | 53 +++++++++++++++++++ .../cp-leaveout/scripts/plot_io_times.py | 53 +++++++++++++++++++ .../cp-leaveout/scripts/stage-builds.cfg | 13 +++++ workflows/cp-leaveout/scripts/stage-loads.cfg | 13 +++++ .../cp-leaveout/scripts/stage-writes.cfg | 13 +++++ 5 files changed, 145 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/plot-io-times.sh create mode 100644 workflows/cp-leaveout/scripts/plot_io_times.py create mode 100644 workflows/cp-leaveout/scripts/stage-builds.cfg create mode 100644 workflows/cp-leaveout/scripts/stage-loads.cfg create mode 100644 workflows/cp-leaveout/scripts/stage-writes.cfg diff --git a/workflows/cp-leaveout/scripts/plot-io-times.sh b/workflows/cp-leaveout/scripts/plot-io-times.sh new file mode 100755 index 00000000..8d1dbb7d --- /dev/null +++ b/workflows/cp-leaveout/scripts/plot-io-times.sh @@ -0,0 +1,53 @@ +#!/bin/bash +set -eu + +# PLOT IO TIMES SH + +# Input: Provide an experiment directory DIR +# Output: Plots in PWD for data + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +# SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ +# DIR - ${*} + +# if [[ ! 
-d $DIR ]] +# then +# echo "$0: Given experiment directory does not exist: $DIR" +# exit 1 +# fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +DIRS=( ${*} ) +BUILDS="" +LOADS="" +WRITES="" +for DIR in ${DIRS[@]} +do + # python $THIS/plot_io_times.py $DIR + + X=$( basename $DIR ) + D=builds-$X.data + cp -uv $DIR/builds.data $D + BUILDS+="$D " + + X=$( basename $DIR ) + D=loads-$X.data + cp -uv $DIR/loads.data $D + LOADS+="$D " + + X=$( basename $DIR ) + D=writes-$X.data + cp -uv $DIR/writes.data $D + WRITES+="$D " + +done + +set -x +jwplot builds.eps $THIS/stage-builds.cfg $BUILDS +jwplot writes.eps $THIS/stage-writes.cfg $WRITES +jwplot loads.eps $THIS/stage-loads.cfg $LOADS diff --git a/workflows/cp-leaveout/scripts/plot_io_times.py b/workflows/cp-leaveout/scripts/plot_io_times.py new file mode 100644 index 00000000..34fd90ef --- /dev/null +++ b/workflows/cp-leaveout/scripts/plot_io_times.py @@ -0,0 +1,53 @@ + +# PLOT IO TIMES PY + +import argparse, os, pickle, statistics + + +from utils import fail + +parser = argparse.ArgumentParser(description='Plot I/O stats') +parser.add_argument('directory', + help='The experiment directory (EXPID)') + +args = parser.parse_args() + +node_pkl = args.directory + "/node-info.pkl" + +try: + with open(node_pkl, 'rb') as fp: + data = pickle.load(fp) +except IOError as e: + fail(e, os.EX_IOERR, "Could not read: " + node_pkl) + +builds = { 1:[], 2:[], 3:[], 4:[], 5:[] } +loads = { 1:[], 2:[], 3:[], 4:[], 5:[] } +writes = { 1:[], 2:[], 3:[], 4:[], 5:[] } + +# Print the node info! +for node in data.values(): + if node.stage == 6: + continue + if node.build_df is not None: + builds[node.stage].append(node.build_df) + if node.load_initial is not None: + loads[node.stage].append(node.load_initial) + if node.ckpt_writes is not None: + writes[node.stage] += list(node.ckpt_writes.values()) + +with open(args.directory + "/builds.data", "w") as fp: + for stage in builds.keys(): + fp.write("%i " % stage) + fp.write("%0.3f" % statistics.mean(builds[stage])) + fp.write(" # count = %i\n" % len(builds[stage])) + +with open(args.directory + "/loads.data", "w") as fp: + for stage in loads.keys(): + if stage == 1: continue # stage 1 does not do a load + fp.write("%i " % stage) + fp.write("%0.3f\n" % statistics.mean(loads[stage])) + +with open(args.directory + "/writes.data", "w") as fp: + for stage in writes.keys(): + fp.write("%i " % stage) + fp.write("%0.3f\n" % statistics.mean(writes[stage])) diff --git a/workflows/cp-leaveout/scripts/stage-builds.cfg b/workflows/cp-leaveout/scripts/stage-builds.cfg new file mode 100644 index 00000000..344a3c50 --- /dev/null +++ b/workflows/cp-leaveout/scripts/stage-builds.cfg @@ -0,0 +1,13 @@ +xlabel = stage +ylabel = dataframe build time (seconds) + +width = 400 +height = 400 + +label.builds-X743.data = Summit E=10 +# label.builds-X744.data = Summit E=50 +label.builds-X750.data = Spock E=10 +# label.builds-X746.data = Spock E=50 + + +# legend.enabled = false diff --git a/workflows/cp-leaveout/scripts/stage-loads.cfg b/workflows/cp-leaveout/scripts/stage-loads.cfg new file mode 100644 index 00000000..80013aec --- /dev/null +++ b/workflows/cp-leaveout/scripts/stage-loads.cfg @@ -0,0 +1,13 @@ +xlabel = stage +ylabel = load weights time (seconds) + +width = 400 +height = 400 + +label.loads-X743.data = Summit E=10 +# label.builds-X744.data = Summit E=50 +label.loads-X750.data = Spock E=10 +# label.builds-X746.data = Spock E=50 + + +# legend.enabled = false diff --git a/workflows/cp-leaveout/scripts/stage-writes.cfg 
b/workflows/cp-leaveout/scripts/stage-writes.cfg new file mode 100644 index 00000000..90853d4f --- /dev/null +++ b/workflows/cp-leaveout/scripts/stage-writes.cfg @@ -0,0 +1,13 @@ +xlabel = stage +ylabel = checkpoint write rate (MB/s) + +width = 400 +height = 400 + +label.writes-X743.data = Summit E=10 +# label.builds-X744.data = Summit E=50 +label.writes-X750.data = Spock E=10 +# label.builds-X746.data = Spock E=50 + + +# legend.enabled = false From 3dd72f2a44bf6f3fdd51f0b0348b62da2f47e131 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Mar 2022 14:29:04 -0600 Subject: [PATCH 245/601] Handle early stopping --- workflows/common/python/model_runner.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index f0764f47..8e5495ca 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -182,6 +182,8 @@ def run(hyper_parameter_map, obj_return): finish = time.time() duration = finish - start + epochs = hyper_parameter_map['epochs'] + # Default result if there is no val_loss (as in infer.py) result = 0 history_result = {} @@ -192,7 +194,8 @@ def run(hyper_parameter_map, obj_return): if history == "EPOCHS_COMPLETED_ALREADY": result, history_result = "EPOCHS_COMPLETED_ALREADY", None else: - result, history_result = get_results(history, obj_return) + result, history_result = get_results(history, obj_return, + epochs) else: result, history_result = "RUN_EXCEPTION", None @@ -245,6 +248,9 @@ def run_model(hyper_parameter_map): logger.info("run_model: node: " + hyper_parameter_map['node']) directory = hyper_parameter_map['instance_directory'] os.chdir(directory) + if os.path.exists('stop.marker'): + logger.info('stop.marker exists!') + return ('SKIP', 'STOP_MARKER') result = run_pre(hyper_parameter_map) if result == ModelResult.ERROR: print('run_pre() returned ERROR!') @@ -287,9 +293,10 @@ def setup_params(pkg, hyper_parameter_map, params_arg): return params -def get_results(history, obj_return): +def get_results(history, obj_return, epochs_expected): """ Return the history entry that the user requested. + Also checks for early stopping and if so marks the directory. 
history: The Keras history object """ @@ -306,6 +313,12 @@ def get_results(history, obj_return): if obj_return in history.history: # Good value values = history.history[obj_return] + if len(values) < epochs_expected: + msg = 'early stopping: %i/%i' % \ + (len(values), epochs_expected) + logger.info('get_results(): ' + msg) + with open('stop.marker', 'w') as fp: + fp.write(msg + '\n') # Default: the last value in the history result = values[-1] else: From 71f23bd97e5fbfcdd3182fdbb88310fac63c62e0 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Mar 2022 14:29:17 -0600 Subject: [PATCH 246/601] Improve plot --- workflows/cp-leaveout/scripts/stage-times.cfg | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/stage-times.cfg b/workflows/cp-leaveout/scripts/stage-times.cfg index 5d30b734..162fc79c 100644 --- a/workflows/cp-leaveout/scripts/stage-times.cfg +++ b/workflows/cp-leaveout/scripts/stage-times.cfg @@ -1,13 +1,14 @@ xlabel = stage ylabel = time (seconds) -width = 800 -height = 600 +width = 600 +height = 400 label.times-X743.data = Summit E=10 label.times-X744.data = Summit E=50 label.times-X750.data = Spock E=10 label.times-X746.data = Spock E=50 +axis.type.x = integer # legend.enabled = false From 64827cb9ef647c7c8fb1739dbe46538f30571c5a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Mar 2022 14:44:10 -0600 Subject: [PATCH 247/601] Drop debugging limit --- workflows/cp-leaveout/scripts/epoch-count.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/epoch-count.sh b/workflows/cp-leaveout/scripts/epoch-count.sh index e94db027..655099ed 100755 --- a/workflows/cp-leaveout/scripts/epoch-count.sh +++ b/workflows/cp-leaveout/scripts/epoch-count.sh @@ -28,7 +28,7 @@ mkdir -pv /tmp/$USER TMPFILE=/tmp/$USER/epoch-count-XXX.tmp EARLIES=0 -LOGS=( $( find $DIR -name python.log | head -10 ) ) +LOGS=( $( find $DIR -name python.log ) ) TOTAL=${#LOGS[@]} echo "epoch-count.sh: found $TOTAL logs ..." 
for LOG in ${LOGS[@]} From 0d17ea230fec768a1a02ab2afb6ddf441e53f044 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 7 Apr 2022 13:40:40 -0500 Subject: [PATCH 248/601] Support patience --- workflows/cp-leaveout/swift/baseline-error.swift | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/baseline-error.swift b/workflows/cp-leaveout/swift/baseline-error.swift index bd766e81..4f3c4a12 100644 --- a/workflows/cp-leaveout/swift/baseline-error.swift +++ b/workflows/cp-leaveout/swift/baseline-error.swift @@ -26,6 +26,7 @@ file file_nodes = input(argv("nodes")); // file file_epochs = input(argv("epochs")); int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); int epochs_all = string2int(argv("E", "50")); +int patience = string2int(argv("P", "50")); // == Command-line Arguments End == // == Environment Settings Begin == @@ -66,6 +67,7 @@ string params_template = "gpus": "0", "epochs": %i, "es": "True", +"patience": %i, "node": "%s", "use_exported_data": "%s" } @@ -79,7 +81,8 @@ foreach node, i in nodes_lines string training_data = "%s/run/%s/topN.uno.h5" % (reference, node); // int epochs = string2int(map_epochs[node]); int epochs = epochs_all; - string params = params_template % (dataframe_csv, epochs, node, training_data); + string params = params_template % (dataframe_csv, epochs, patience, + node, training_data); // NOTE: obj() is in the obj_*.swift supplied by workflow.sh results[i] = obj(params, node); assert(results[i] != "EXCEPTION", "exception in obj()!"); From 398a61a512e37bba3b1c4da271a00fe73e073d94 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 7 Apr 2022 13:41:13 -0500 Subject: [PATCH 249/601] New plotting and management scripts --- workflows/cp-leaveout/scripts/plot-avgs.sh | 40 ++++++++++++++++++ .../cp-leaveout/scripts/report-leaves.sh | 25 +++++++++++ .../cp-leaveout/scripts/report-stopping.sh | 25 +++++++++++ .../cp-leaveout/scripts/report_leaves.py | 37 +++++++++++++++++ .../cp-leaveout/scripts/report_stopping.py | 41 +++++++++++++++++++ workflows/cp-leaveout/scripts/touch-exps.zsh | 39 ++++++++++++++++++ 6 files changed, 207 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/plot-avgs.sh create mode 100755 workflows/cp-leaveout/scripts/report-leaves.sh create mode 100755 workflows/cp-leaveout/scripts/report-stopping.sh create mode 100644 workflows/cp-leaveout/scripts/report_leaves.py create mode 100644 workflows/cp-leaveout/scripts/report_stopping.py create mode 100644 workflows/cp-leaveout/scripts/touch-exps.zsh diff --git a/workflows/cp-leaveout/scripts/plot-avgs.sh b/workflows/cp-leaveout/scripts/plot-avgs.sh new file mode 100755 index 00000000..586e2612 --- /dev/null +++ b/workflows/cp-leaveout/scripts/plot-avgs.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -eu + +# PLOT AVGS SH + +# Input: Provide an experiment directory DIR +# Output: Plots in PWD for data from times.data & vloss.data + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +# SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ +# DIR - ${*} + +# if [[ ! 
-d $DIR ]] +# then +# echo "$0: Given experiment directory does not exist: $DIR" +# exit 1 +# fi + +DIRS=( ${*} ) +TIMES="" +VLOSS="" +for DIR in ${DIRS[@]} +do + X=$( basename $DIR ) + + D=times-$X.data + cp -uv $DIR/times.data $D + TIMES+="$D " + + # D=vloss-$X.data + # cp $DIR/vloss.data $D + # VLOSS+="$D " +done + +jwplot stage-times.eps $THIS/stage-times.cfg $TIMES +# jwplot stage-vloss.eps $THIS/stage-vloss.cfg $VLOSS diff --git a/workflows/cp-leaveout/scripts/report-leaves.sh b/workflows/cp-leaveout/scripts/report-leaves.sh new file mode 100755 index 00000000..3bff44cf --- /dev/null +++ b/workflows/cp-leaveout/scripts/report-leaves.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# REPORT LEAVES SH +# Report nodes with no children + +THIS=$( realpath $( dirname $0 ) ) +CPLO=$( realpath $THIS/.. ) +SUPERVISOR=$( realpath $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/report_leaves.py $DIR diff --git a/workflows/cp-leaveout/scripts/report-stopping.sh b/workflows/cp-leaveout/scripts/report-stopping.sh new file mode 100755 index 00000000..ce1b46b5 --- /dev/null +++ b/workflows/cp-leaveout/scripts/report-stopping.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# REPORT STOPPING SH +# Report early stopping by epoch + +THIS=$( realpath $( dirname $0 ) ) +CPLO=$( realpath $THIS/.. ) +SUPERVISOR=$( realpath $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/report_stopping.py $DIR diff --git a/workflows/cp-leaveout/scripts/report_leaves.py b/workflows/cp-leaveout/scripts/report_leaves.py new file mode 100644 index 00000000..08d7d51a --- /dev/null +++ b/workflows/cp-leaveout/scripts/report_leaves.py @@ -0,0 +1,37 @@ + +# REPORT LEAVES PY + +import argparse, os, pickle, sys + +from Node import Node +from utils import fail + +parser = argparse.ArgumentParser(description= + 'Report nodes with no children.') +parser.add_argument('directory', + help='The experiment directory (EXPID)') + +args = parser.parse_args() + +node_pkl = args.directory + "/node-info.pkl" + +try: + with open(node_pkl, 'rb') as fp: + data = pickle.load(fp) +except IOError as e: + fail(e, os.EX_IOERR, "Could not read: " + node_pkl) + +nodes = data.keys() +leaves = data.copy() + +for node in nodes: + parent = node[0:-2] + if parent in leaves: + print("drop: " + parent) + del leaves[parent] + +results = list(leaves.keys()) +results.sort() + +for leaf in results: + print(leaf) diff --git a/workflows/cp-leaveout/scripts/report_stopping.py b/workflows/cp-leaveout/scripts/report_stopping.py new file mode 100644 index 00000000..3d427ec9 --- /dev/null +++ b/workflows/cp-leaveout/scripts/report_stopping.py @@ -0,0 +1,41 @@ + +# REPORT STOPPING PY + +import argparse, os, pickle, sys + +from Node import Node +from utils import fail, avg + +parser = argparse.ArgumentParser(description= + 'Report nodes with no children.') +parser.add_argument('directory', + help='The experiment directory (EXPID)') + +args = parser.parse_args() + +node_pkl = args.directory + "/node-info.pkl" + +try: + with open(node_pkl, 'rb') as fp: + data = 
pickle.load(fp) +except IOError as e: + fail(e, os.EX_IOERR, "Could not read: " + node_pkl) + +stages = { 1:[], 2:[], 3:[], 4:[], 5:[], 6:[] } + +for key in data: + # print(key) + node = data[key] + # print(data[node]) + print("%-14s %i %i" % (key, node.stage, node.epochs_actual)) + stages[node.stage].append(node.epochs_actual) + # exit() + +for i in range(1, 7): + L = stages[i] + a = avg(L) + print("%i: %0.3f" % (i, a)) + + + # a = st + # 1.3.2.4.2.4.1 diff --git a/workflows/cp-leaveout/scripts/touch-exps.zsh b/workflows/cp-leaveout/scripts/touch-exps.zsh new file mode 100644 index 00000000..e9506262 --- /dev/null +++ b/workflows/cp-leaveout/scripts/touch-exps.zsh @@ -0,0 +1,39 @@ +#!/bin/zsh +set -eu + +which python + +A=( 750 + 746 + 757 + 771 + 743 + 744 + 759 + 763 + ) + +{ + sw0 + print "START: " $( date-nice ) + print + + for X in $A + do + ds experiments/X$X + last-access experiments/X$X + touch-all experiments/X$X + print + done + + last-access ~/S/proj + touch-all ~/S/proj + print + + last-access /gpfs/alpine/med106/scratch/wozniak/CANDLE-Data + touch-all /gpfs/alpine/med106/scratch/wozniak/CANDLE-Data + + print + print "STOP: " $( date-nice ) + sw1 +} |& teeb touch-exps.out From 94b67a41a7436da2feff94eb516d0674db2f813a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 7 Apr 2022 13:42:23 -0500 Subject: [PATCH 250/601] Bring in some unused settings --- workflows/cp-leaveout/swift/workflow.sh | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 3517a60c..62432c65 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -196,10 +196,7 @@ fi export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" mkdir -pv $TURBINE_OUTPUT/out -export TURBINE_LAUNCH_OPTIONS="-n $PROCS -c 4 --gpus-per-task=1 --gpu-bind=closest" - -which java - +# set -x swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p \ @@ -245,5 +242,16 @@ then exit 1 fi +# # Check job output +# TURBINE_OUTPUT=$( readlink turbine-output ) +# OUTPUT=turbine-output/output.txt +# WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +# # Wait for job +# queue_wait + +# SCRIPT=$( basename $0 .sh ) +# check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID + echo "WORKFLOW OK." 
echo "EXIT CODE: 0" | tee -a $STDOUT From 0835ffe2d60244d410a54751da1d86b15403d961 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 11 Apr 2022 14:18:34 -0500 Subject: [PATCH 251/601] New Swift/T for Summit --- workflows/common/sh/env-summit.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index 82922e53..71eb410c 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -14,15 +14,15 @@ set -eu # From Wozniak MED106=/gpfs/alpine/world-shared/med106 -ROOT=$MED106/sw/gcc-8.3.1 -SWIFT=$ROOT/swift-t/2021-10-06 +ROOT=$MED106/sw/gcc-9.1.0 +SWIFT=$ROOT/swift-t/2022-04-06 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH PATH=$SWIFT/turbine/bin:$PATH -R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R -LD_LIBRARY_PATH+=:$R/lib +# R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R +# LD_LIBRARY_PATH+=:$R/lib # PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 # PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.2.0-py38-0 From 121e806ab9bfd56b938af5e7d419dd9498e9f13d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Apr 2022 13:10:11 -0500 Subject: [PATCH 252/601] Update installation for GCC/7.5.0 --- workflows/common/sh/env-summit.sh | 32 ++++++++++--------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index 71eb410c..a1dc08da 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -6,27 +6,25 @@ SWIFT_IMPL=py # Let modules initialize LD_LIBRARY_PATH before changing it: set +eu # modules create errors outside our control -module load spectrum-mpi/10.3.1.2-20200121 +module load spectrum-mpi module unload darshan-runtime -# module load ibm-wml-ce/1.6.2-3 module list set -eu # From Wozniak MED106=/gpfs/alpine/world-shared/med106 -ROOT=$MED106/sw/gcc-9.1.0 -SWIFT=$ROOT/swift-t/2022-04-06 +ROOT=$MED106/sw/summit/gcc-7.5.0 +SWIFT=$ROOT/swift-t/2022-04-12 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH PATH=$SWIFT/turbine/bin:$PATH -# R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R -# LD_LIBRARY_PATH+=:$R/lib +R=$ROOT/R/4.1.3/lib64/R +LD_LIBRARY_PATH+=:$R/lib -# PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 -# PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.2.0-py38-0 -PY=/gpfs/alpine/world-shared/med106/sw/conda/2021-10-06/envs/CANDLE-2021-10-06 +# PY=/gpfs/alpine/world-shared/med106/sw/conda/2021-10-06/envs/CANDLE-2021-10-06 +PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0 LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY PATH=$PY/bin:$PATH @@ -35,20 +33,10 @@ PATH=$PY/bin:$PATH export LD_LIBRARY_PATH=$PY/lib:$LD_LIBRARY_PATH # EMEWS Queues for R -EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R -EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py +EQR=$ROOT/EQ-R + +# EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py # For test output processing: LOCAL=0 CRAY=1 - -# # Resident task worker count and rank list -# # If this is already set, we respect the user settings -# # If this is unset, we set it to 1 -# # and run the algorithm on the 2nd highest rank -# # This value is only read in HPO workflows -# if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] -# then -# export TURBINE_RESIDENT_WORK_WORKERS=1 -# export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) -# fi From ac6ccb83469d6568977bd6623df5db8f87f2bf40 Mon Sep 17 00:00:00 2001 From: Justin Wozniak 
Date: Wed, 13 Apr 2022 13:11:15 -0500 Subject: [PATCH 253/601] Better short-run defaults --- workflows/mlrMBO/test/cfg-sys-summit.sh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/workflows/mlrMBO/test/cfg-sys-summit.sh b/workflows/mlrMBO/test/cfg-sys-summit.sh index 2629eb23..bc1ea24e 100644 --- a/workflows/mlrMBO/test/cfg-sys-summit.sh +++ b/workflows/mlrMBO/test/cfg-sys-summit.sh @@ -4,13 +4,14 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-46} +export PROCS=${PROCS:-6} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} +export PPN=${PPN:-6} -export WALLTIME=${WALLTIME:-06:00:00} +# export WALLTIME=${WALLTIME:-06:00:00} +export WALLTIME=00:10:00 #export PROJECT=Candle_ECP @@ -33,3 +34,14 @@ export SH_TIMEOUT=${SH_TIMEOUT:-} # Ignore errors: If 1, unknown errors will be reported to model.log # but will not bring down the Swift workflow. See model.sh . export IGNORE_ERRORS=0 + +# Resident task worker count and rank list +# If this is already set, we respect the user settings +# If this is unset, we set it to 1 +# and run the algorithm on the 2nd highest rank +# This value is only read in HPO workflows +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi From f1c85a434f256cff16aa872e9924991f27dbca19 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Apr 2022 13:12:05 -0500 Subject: [PATCH 254/601] Turn off log message about 'node' --- workflows/common/python/model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 8e5495ca..6ff6a37d 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -245,7 +245,7 @@ def run_model(hyper_parameter_map): global logger logger = get_logger(logger, 'MODEL RUNNER') obj_return = get_obj_return() - logger.info("run_model: node: " + hyper_parameter_map['node']) + # logger.info("run_model: node: " + hyper_parameter_map['node']) directory = hyper_parameter_map['instance_directory'] os.chdir(directory) if os.path.exists('stop.marker'): From f1b68d7257fc1d6e0bd00995903aa46585c34b92 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Apr 2022 13:12:47 -0500 Subject: [PATCH 255/601] WS --- workflows/mlrMBO/swift/workflow.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/swift/workflow.swift b/workflows/mlrMBO/swift/workflow.swift index 1f9936d6..89db5929 100644 --- a/workflows/mlrMBO/swift/workflow.swift +++ b/workflows/mlrMBO/swift/workflow.swift @@ -16,7 +16,7 @@ import assert; import python; /* Helper for reporting environment variables common/swift/candle_utils.swift * import candle_utils; -* +* * report_env(); */ From 0cd84980ade45b315391a2159fd161d3830bf656 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Apr 2022 13:13:36 -0500 Subject: [PATCH 256/601] Fix string format specifiers --- workflows/mlrMBO/swift/workflow.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/swift/workflow.swift b/workflows/mlrMBO/swift/workflow.swift index 89db5929..43d264d3 100644 --- a/workflows/mlrMBO/swift/workflow.swift +++ b/workflows/mlrMBO/swift/workflow.swift @@ -89,7 +89,7 @@ string FRAMEWORK = "keras"; foreach 
param, j in param_array { results[j] = obj(param, - "%00i_%000i_%0000i" % (restart_number,i,j)); + "%02i_%03i_%04i" % (restart_number,i,j)); } string result = join(results, ";"); // printf(result); From df01d0fe75cf6a66ca4b4d250232726a8e106211 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Apr 2022 13:14:06 -0500 Subject: [PATCH 257/601] WS --- workflows/common/R/install-candle.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/common/R/install-candle.sh b/workflows/common/R/install-candle.sh index 27f70414..5e55ae11 100755 --- a/workflows/common/R/install-candle.sh +++ b/workflows/common/R/install-candle.sh @@ -5,8 +5,10 @@ set -eu # Installs all R packages needed for Supervisor workflows -# pass CONFIRM=0 via command line for by passing options, default is CONFIRM=1 +# pass CONFIRM=0 via command line for by passing options, +# default is CONFIRM=1 : ${CONFIRM:=1} + while getopts ":y" OPTION do case $OPTION in From c54e3b0c509b9bc30f3828001c2b588069ffc10e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Apr 2022 13:14:11 -0500 Subject: [PATCH 258/601] Capture output to log --- workflows/common/R/install-candle.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/R/install-candle.sh b/workflows/common/R/install-candle.sh index 5e55ae11..1f6d8e19 100755 --- a/workflows/common/R/install-candle.sh +++ b/workflows/common/R/install-candle.sh @@ -36,4 +36,4 @@ then fi THIS=$( dirname $0 ) -nice R -f $THIS/install-candle.R +nice R -f $THIS/install-candle.R |& tee install-candle.log From 992d775ba170d1c4091cd56af0c949b08f2daecb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Apr 2022 13:14:34 -0500 Subject: [PATCH 259/601] Slight package reordering and notes --- workflows/common/R/install-candle.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflows/common/R/install-candle.R b/workflows/common/R/install-candle.R index 4f0e2409..426cbf83 100644 --- a/workflows/common/R/install-candle.R +++ b/workflows/common/R/install-candle.R @@ -30,14 +30,15 @@ options(repos = r) # ****NOTE**** that I tried installing both plotly and jsonlite the normal way (in the PKGS below instead of a specific version above) and I got the same error PKGS <- list( + "RInside", "plotly", "jsonlite", - "smoof", "rgenoud", "DiceKriging", + # not available for R 3.6.1 : needed for mlrMBO HPO "randomForest", "parallelMap", - "RInside", + # requires smoof requires misc3d requires --with-tcltk : "mlrMBO" ) From 97f5f1ce87c5dc4a6babd5eb0b2ab33672ce9a21 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Apr 2022 14:28:44 -0500 Subject: [PATCH 260/601] Update doc --- workflows/common/sh/README.adoc | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/workflows/common/sh/README.adoc b/workflows/common/sh/README.adoc index f39d3b1f..00670314 100644 --- a/workflows/common/sh/README.adoc +++ b/workflows/common/sh/README.adoc @@ -5,23 +5,22 @@ This is the location for common shell scripting tools for the CANDLE Supervisor Workflows. -New developments with https://github.com/ECP-CANDLE/Supervisor/issues/20[Issue #20] are marke with *(#20)*. == Entry points -Each CANDLE workflow is invoked with a shell script, typically called +site_workflow.sh+, where the +site+ is +theta+, +titan+, etc. In this document we will call this the workflow_sh script. 
+Each CANDLE workflow is invoked with a shell script, typically called +test_*.sh site ...+, where the +site+ is +theta+, +summit+, etc. In this document we will call this the workflow_sh script. === Purpose The purpose of these scripts is: -. Determining the run directory, which is the +TURBINE_OUTPUT+ directory used by Swift/T. *(#20)* This is now set by +link:utils.sh:get_expid()+. - -. Set key environment variables for Python, R, etc. These include +PYTHONPATH+, +LD_LIBRARY_PATH+, etc. *(#20)* These are now set for each site in +langs-site.sh+. +. Determining the run directory, which is the +TURBINE_OUTPUT+ directory used by Swift/T. . Set key environment variables and Swift/T options for the system scheduler. These include +PROCS+, +WALLTIME+, +QUEUE+, +PROJECT+, etc. *(#20)* These are now set for each site in +sched-site.sh+. -. Loading modules. *(#20)* These are now set for each site in +modules-site.sh+. +. Loading modules. *(#20)* These are now set for each site in +env-site.sh+. + +. Set key environment variables for Python, R, etc. These include +PYTHONPATH+, +LD_LIBRARY_PATH+, etc. These are now set for each site in +env-site.sh+. . Determining which Swift functions to load. Swift/T may or may not be configured to use its efficient in-memory Python interpreter for the Benchmarks (because of challenges compiling against the site-provided Python plus deep learning stack). A shell variable called +SWIFT_IMPL+ denotes the Swift function implementation for the benchmarks. The value is one of: + @@ -39,17 +38,19 @@ The https://github.com/ECP-CANDLE/Supervisor/tree/master/workflows#objective-fun . Record what happened. This involves writing additional logs into the TURBINE_OUTPUT directory, particularly to capture settings and provenance that Swift/T cannot. -=== Future ideas +== Site list + +=== Summit + +`MED106=/gpfs/alpine/world-shared/med106` -Deduplication (https://github.com/ECP-CANDLE/Supervisor/issues/20[#20]). The current scripts flex the previously developed EMEWS templates, which make it easy to rapidly develop many workflows; additionally, they are highly linear, readable scripts. However, they duplicate a great deal of code, making Supervisor development more difficult. +==== `env-summit.sh` -. Source reusable settings. -.. Write key environment variables once for each system. E.g., https://github.com/ECP-CANDLE/Supervisor/blob/master/workflows/common/sh/langs-cori.sh[langs-cori.sh] -.. Same for scheduler settings, module settings. -. Create a new, site-generic workflow_sh for each workflow. This script will take the +site+ as an argument and source the appropriate settings files. -. Put application parameters in a separate file. This can be Bash-formatted for compatibility now, but may become something else. -. Install EQ/Py, EQ/R from Git and put in a common location, maintained by Wozniak. -. Test scripts. Each workflow directory will have a +test/+ subdirectory. This will contain enough bootstrapping code so that it can run and do something small without user configuration. This will build on the daily testing scripts that Brettin has started. 
+* GCC: 7.5.0 +* ROOT: `$MED106/sw/summit/gcc-7.5.0` +* Swift/T: 2022-04-12 +* Python: `/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0` +* R: `$ROOT/R/4.1.3` == Other shell tools From 5529151bf5c5bef69aa7d69204d34449edce405a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Apr 2022 14:30:50 -0500 Subject: [PATCH 261/601] Use markup --- workflows/common/sh/README.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/README.adoc b/workflows/common/sh/README.adoc index 00670314..9acae0cd 100644 --- a/workflows/common/sh/README.adoc +++ b/workflows/common/sh/README.adoc @@ -48,7 +48,7 @@ The https://github.com/ECP-CANDLE/Supervisor/tree/master/workflows#objective-fun * GCC: 7.5.0 * ROOT: `$MED106/sw/summit/gcc-7.5.0` -* Swift/T: 2022-04-12 +* Swift/T: `2022-04-12` * Python: `/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0` * R: `$ROOT/R/4.1.3` From 04a368e5062a776e8a6c6682580e63cab5c315d2 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 15 Apr 2022 11:52:15 -0400 Subject: [PATCH 262/601] o Update langs-app for summit --- workflows/common/sh/langs-app-summit.sh | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/workflows/common/sh/langs-app-summit.sh b/workflows/common/sh/langs-app-summit.sh index 0f056ea8..dff04fd3 100644 --- a/workflows/common/sh/langs-app-summit.sh +++ b/workflows/common/sh/langs-app-summit.sh @@ -1,17 +1,13 @@ # LANGS APP SUMMIT SH -# WIP 2019-02-28 - APP_PYTHONPATH=${APP_PYTHONPATH:-$PYTHONPATH} # Clear anything set by the system or Swift/T environment unset PYTHONPATH unset LD_LIBRARY_PATH -# ROOT=/ccs/proj/med106/gounley1/summit -ROOT=/ccs/proj/med106/hsyoo/summit -export PY=$ROOT/conda36 -export LD_LIBRARY_PATH=/sw/summit/cuda/10.1.168/lib64:/sw/summit/gcc/4.8.5/lib64:$PY/lib -export PYTHONHOME=$ROOT/conda36 +export PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37/ +export LD_LIBRARY_PATH=$PY/lib:$LD_LIBRARY_PATH +export PYTHONHOME=$PY export PATH=$PYTHONHOME/bin:$PATH -export PYTHONPATH=$PYTHONHOME/lib/python3.6:$PYTHONHOME/lib/python3.6/site-packages:$APP_PYTHONPATH +export PYTHONPATH=$PYTHONHOME/lib/python3.9:$PYTHONHOME/lib/python3.9/site-packages:$APP_PYTHONPATH From 0e22e6292d0dcd3770a60598c8bbed1102baa86f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 15 Apr 2022 12:35:09 -0500 Subject: [PATCH 263/601] Fix unset variables in langs-app-summit --- workflows/common/sh/langs-app-summit.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/workflows/common/sh/langs-app-summit.sh b/workflows/common/sh/langs-app-summit.sh index dff04fd3..7401008a 100644 --- a/workflows/common/sh/langs-app-summit.sh +++ b/workflows/common/sh/langs-app-summit.sh @@ -1,13 +1,12 @@ -# LANGS APP SUMMIT SH -APP_PYTHONPATH=${APP_PYTHONPATH:-$PYTHONPATH} +# LANGS APP SUMMIT SH -# Clear anything set by the system or Swift/T environment -unset PYTHONPATH -unset LD_LIBRARY_PATH +# Allow for user PYTHONPATH additions: +APP_PYTHONPATH=${APP_PYTHONPATH:-} -export PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37/ -export LD_LIBRARY_PATH=$PY/lib:$LD_LIBRARY_PATH +# Overwrite anything else set by the system or Swift/T environment: +export PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37 +export LD_LIBRARY_PATH=$PY/lib export PYTHONHOME=$PY export PATH=$PYTHONHOME/bin:$PATH export PYTHONPATH=$PYTHONHOME/lib/python3.9:$PYTHONHOME/lib/python3.9/site-packages:$APP_PYTHONPATH From 8924a0336060cd9d2997b63ec182dbbf0e9b19c9 Mon Sep 17 00:00:00 
2001 From: Justin Wozniak Date: Fri, 15 Apr 2022 12:35:40 -0500 Subject: [PATCH 264/601] Add comments to model.sh --- workflows/common/sh/model.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 0b66455d..e43df732 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -74,14 +74,19 @@ echo log "USING PYTHON:" $( which python ) echo +# The Python command line arguments: PY_CMD=( "$WORKFLOWS_ROOT/common/python/model_runner.py" "$PARAMS" "$INSTANCE_DIRECTORY" "$FRAMEWORK" "$RUNID" "$BENCHMARK_TIMEOUT" ) + +# The desired model command: MODEL_CMD="python3 -u ${PY_CMD[@]}" log "MODEL_CMD: ${MODEL_CMD[@]}" + +# Run Python! if $TIMEOUT_CMD ${MODEL_CMD[@]} then : # Assume success so we can keep a failed exit code From 424c5d766be0381ad4bfc606c45a223cb78a4984 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 15 Apr 2022 12:58:48 -0500 Subject: [PATCH 265/601] Set APP_PYTHONPATH --- workflows/mlrMBO/swift/workflow.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 2e901964..1e9dac6a 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -59,6 +59,8 @@ echo "Running "$MODEL_NAME "workflow" # Set PYTHONPATH for BENCHMARK related stuff PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common +# Set PYTHONPATH for BENCHMARK related stuff in obj_app mode +export APP_PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common source_site env $SITE source_site sched $SITE @@ -172,6 +174,7 @@ swift-t -O 0 -n $PROCS \ -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e APP_PYTHONPATH -e BENCHMARKS_ROOT \ -e EMEWS_PROJECT_ROOT \ $( python_envs ) \ From 5ae67259dc2932b18c87542c6af8073fdf271b2c Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 15 Apr 2022 14:12:51 -0400 Subject: [PATCH 266/601] o Add missing slash --- workflows/common/sh/env-summit.sh | 5 +++-- workflows/mlrMBO/swift/workflow.sh | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index a1dc08da..85e6d2a7 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -2,7 +2,7 @@ # ENV Summit # SWIFT_IMPL=echo -SWIFT_IMPL=py +SWIFT_IMPL=app # Let modules initialize LD_LIBRARY_PATH before changing it: set +eu # modules create errors outside our control @@ -24,7 +24,8 @@ R=$ROOT/R/4.1.3/lib64/R LD_LIBRARY_PATH+=:$R/lib # PY=/gpfs/alpine/world-shared/med106/sw/conda/2021-10-06/envs/CANDLE-2021-10-06 -PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0 +#PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0 +PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37/ LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY PATH=$PY/bin:$PATH diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 1e9dac6a..355ff8c9 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -174,7 +174,7 @@ swift-t -O 0 -n $PROCS \ -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ - -e APP_PYTHONPATH + -e APP_PYTHONPATH \ -e BENCHMARKS_ROOT \ -e EMEWS_PROJECT_ROOT \ $( python_envs ) \ From dd6d4e62334ea86210842d6bb530510609c633ae Mon Sep 17 00:00:00 2001 From: Rajeev 
Jain Date: Sat, 16 Apr 2022 01:13:55 -0400 Subject: [PATCH 267/601] o Working summit test o Add xform-smiles --- workflows/mlrMBO/swift/workflow.sh | 2 +- workflows/mlrMBO/test/cfg-prm-summit.sh | 6 +++--- workflows/mlrMBO/test/cfg-sys-summit.sh | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 355ff8c9..e0ff804d 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -15,7 +15,7 @@ then fi BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd ) export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} -BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Attn1:$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/examples/ADRP +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Attn1:$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/examples/ADRP:$BENCHMARKS_ROOT/examples/xform-smiles # :$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/Pilot3/P3B1:$BENCHMARKS_ROOT/Pilot3/P3B3:$BENCHMARKS_ROOT/Pilot3/P3B4 export BENCHMARK_TIMEOUT export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} diff --git a/workflows/mlrMBO/test/cfg-prm-summit.sh b/workflows/mlrMBO/test/cfg-prm-summit.sh index 36d086c4..a7047295 100644 --- a/workflows/mlrMBO/test/cfg-prm-summit.sh +++ b/workflows/mlrMBO/test/cfg-prm-summit.sh @@ -3,11 +3,11 @@ # mlrMBO settings # Total iterations -PROPOSE_POINTS=${PROPOSE_POINTS:-44} -MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} +PROPOSE_POINTS=${PROPOSE_POINTS:-6} +MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-6} MAX_ITERATIONS=${MAX_ITERATIONS:-2} MAX_BUDGET=${MAX_BUDGET:-180} -DESIGN_SIZE=${DESIGN_SIZE:-44} +DESIGN_SIZE=${DESIGN_SIZE:-6} # TODO: move the following code to a utility library- # this is a configuration file diff --git a/workflows/mlrMBO/test/cfg-sys-summit.sh b/workflows/mlrMBO/test/cfg-sys-summit.sh index bc1ea24e..ac701ac8 100644 --- a/workflows/mlrMBO/test/cfg-sys-summit.sh +++ b/workflows/mlrMBO/test/cfg-sys-summit.sh @@ -11,7 +11,7 @@ export PROCS=${PROCS:-6} export PPN=${PPN:-6} # export WALLTIME=${WALLTIME:-06:00:00} -export WALLTIME=00:10:00 +export WALLTIME=00:40:00 #export PROJECT=Candle_ECP From c6719ca67f7bf14875165c9b19bf85681217202f Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 19 Apr 2022 16:01:03 -0400 Subject: [PATCH 268/601] o First draft for sct mlrMBO HPO --- workflows/mlrMBO/data/sct_nightly.R | 20 ++++++++++++++++++++ workflows/mlrMBO/test/cfg-prm-summit.sh | 4 +++- workflows/mlrMBO/test/cfg-sys-summit.sh | 2 +- 3 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 workflows/mlrMBO/data/sct_nightly.R diff --git a/workflows/mlrMBO/data/sct_nightly.R b/workflows/mlrMBO/data/sct_nightly.R new file mode 100644 index 00000000..93239e08 --- /dev/null +++ b/workflows/mlrMBO/data/sct_nightly.R @@ -0,0 +1,20 @@ + +# NT3 Hyperparameter Search - Test 1 +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + makeDiscreteParam("batch_size", values = c(16, 32)), + makeIntegerParam("epochs", lower = 1, upper = 1), +# makeDiscreteParam("activation", values = c("softmax", "elu", "softplus", "softsign", "relu", "tanh", 
"sigmoid", "hard_sigmoid", "linear")), +# makeDiscreteParam("dense", values = c("500 100 50", "1000 500 100 50", "2000 1000 500 100 50", "2000 1000 1000 500 100 50", "2000 1000 1000 1000 500 100 50")), + makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), + makeNumericParam("dropout", lower = 0, upper = 0.9), + makeNumericParam("learning_rate", lower = 0.00001, upper = 0.1) +# makeDiscreteParam("conv", values = c("50 50 50 50 50 1", "25 25 25 25 25 1", "64 32 16 32 64 1", "100 100 100 100 100 1", "32 20 16 32 10 1")) + ## DEBUG PARAMETERS: DON'T USE THESE IN PRODUCTION RUN + ## makeDiscreteParam("conv", values = c("32 20 16 32 10 1")) +) diff --git a/workflows/mlrMBO/test/cfg-prm-summit.sh b/workflows/mlrMBO/test/cfg-prm-summit.sh index a7047295..fd58b0d0 100644 --- a/workflows/mlrMBO/test/cfg-prm-summit.sh +++ b/workflows/mlrMBO/test/cfg-prm-summit.sh @@ -4,7 +4,7 @@ # Total iterations PROPOSE_POINTS=${PROPOSE_POINTS:-6} -MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-6} +MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} MAX_ITERATIONS=${MAX_ITERATIONS:-2} MAX_BUDGET=${MAX_BUDGET:-180} DESIGN_SIZE=${DESIGN_SIZE:-6} @@ -16,6 +16,8 @@ if [ "$MODEL_NAME" = "combo" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_nightly.R} elif [ "$MODEL_NAME" = "attn" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/attn_nightly.R} +elif [ "$MODEL_NAME" = "sct" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/sct_nightly.R} elif [ "$MODEL_NAME" = "adrp" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/adrp_nightly.R} elif [ "$MODEL_NAME" = "p1b1" ]; then diff --git a/workflows/mlrMBO/test/cfg-sys-summit.sh b/workflows/mlrMBO/test/cfg-sys-summit.sh index ac701ac8..b313f3ad 100644 --- a/workflows/mlrMBO/test/cfg-sys-summit.sh +++ b/workflows/mlrMBO/test/cfg-sys-summit.sh @@ -11,7 +11,7 @@ export PROCS=${PROCS:-6} export PPN=${PPN:-6} # export WALLTIME=${WALLTIME:-06:00:00} -export WALLTIME=00:40:00 +export WALLTIME=02:00:00 #export PROJECT=Candle_ECP From 7537b03221cb458187d48e21527ab9a450e96e5d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 20 Apr 2022 14:19:32 -0500 Subject: [PATCH 269/601] Check for error in input name translation --- workflows/cp-leaveout/scripts/shrink-output-single.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/workflows/cp-leaveout/scripts/shrink-output-single.sh b/workflows/cp-leaveout/scripts/shrink-output-single.sh index 9b41265f..a9a55d84 100755 --- a/workflows/cp-leaveout/scripts/shrink-output-single.sh +++ b/workflows/cp-leaveout/scripts/shrink-output-single.sh @@ -9,6 +9,12 @@ OUTPUT=$2 T=${INPUT/out/tr} +if [ $INPUT == $T ] +then + echo "shrink-output-single.sh: ERROR: INPUT is wrong." 
+ exit 1 +fi + tr "\r" "\n" < $INPUT > $T python $THIS/shrink-output.py $T $OUTPUT From aed559f9a5d84dcf1f4dba212f10fb45d013ffc5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 20 Apr 2022 14:32:09 -0500 Subject: [PATCH 270/601] Move shrink scripts --- .../scripts/shrink-output-single.sh | 21 ---- .../cp-leaveout/scripts/shrink-output.mk | 11 --- .../cp-leaveout/scripts/shrink-output.py | 98 ------------------- .../cp-leaveout/scripts/shrink-output.sh | 32 ------ workflows/upf/test/upf-1.sh | 2 +- 5 files changed, 1 insertion(+), 163 deletions(-) delete mode 100755 workflows/cp-leaveout/scripts/shrink-output-single.sh delete mode 100644 workflows/cp-leaveout/scripts/shrink-output.mk delete mode 100644 workflows/cp-leaveout/scripts/shrink-output.py delete mode 100755 workflows/cp-leaveout/scripts/shrink-output.sh diff --git a/workflows/cp-leaveout/scripts/shrink-output-single.sh b/workflows/cp-leaveout/scripts/shrink-output-single.sh deleted file mode 100755 index a9a55d84..00000000 --- a/workflows/cp-leaveout/scripts/shrink-output-single.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh -set -eu - -# SHRINK OUTPUT SINGLE SH -# Called by shrink-output.mk - -INPUT=$1 -OUTPUT=$2 - -T=${INPUT/out/tr} - -if [ $INPUT == $T ] -then - echo "shrink-output-single.sh: ERROR: INPUT is wrong." - exit 1 -fi - -tr "\r" "\n" < $INPUT > $T -python $THIS/shrink-output.py $T $OUTPUT - -rm $T diff --git a/workflows/cp-leaveout/scripts/shrink-output.mk b/workflows/cp-leaveout/scripts/shrink-output.mk deleted file mode 100644 index c62878a7..00000000 --- a/workflows/cp-leaveout/scripts/shrink-output.mk +++ /dev/null @@ -1,11 +0,0 @@ - -.DELETE_ON_ERROR: - -OUTS = $(wildcard out-*.txt) - -SUMMARIES = $(subst out-,summary-,$(OUTS)) - -all: $(SUMMARIES) - -summary-%.txt: out-%.txt - @ ${THIS}/shrink-output-single.sh $(<) $(@) diff --git a/workflows/cp-leaveout/scripts/shrink-output.py b/workflows/cp-leaveout/scripts/shrink-output.py deleted file mode 100644 index 880f12be..00000000 --- a/workflows/cp-leaveout/scripts/shrink-output.py +++ /dev/null @@ -1,98 +0,0 @@ - -# SHRINK OUTPUT PY -# argv: 2 filenames : tr-*.txt and summary-*.txt -# Called by shrink-output-single.sh -# The tr-*.txt file should have used tr to change CR to NL -# Removes non-printing characters (backspace) -# Reduces the number of training lines in output -# Removes redundant batch size information -# Fixes newline before "Current time" report - -import os, re, stat, sys, time -from collections import deque - - -# Only 1/shrink_factor training lines are copied -shrink_factor = 200 -# Number of additional consecutive lines at beginning and end of -# training that are retained -hold_space = 3 - - -def shrink(fp_in, fp_out): - # Queue to hold extra lines that may be printed at end of run - Q = deque() - index = 0 - starts = 0 # Initial hold_space ETAs are immediately printed - line_previous = "" - for line in fp_in: - if len(line) == 1: continue # Blank line - line = line.replace("\b", "") - if "batch:" in line or "Current" in line: - line = re.sub("- batch: .* 32.0000 -", "", line) - line = line.replace("Current", "\nCurrent") - if starts < hold_space: - fp_out.write(line) - starts += 1 - continue - Q.append(line) - index += 1 - if len(Q) > hold_space: - line = Q.popleft() - if index % shrink_factor == 0: - fp_out.write(line) - else: - starts = 0 - while len(Q) > 0: - fp_out.write(Q.popleft()) - if line == line_previous: - continue - fp_out.write(line) - line_previous = line - # Done: flush Q: - while len(Q) > 0: - fp_out.write(Q.popleft()) - 
- -# From https://www.codegrepper.com/code-examples/python/python+get+human+readable+file+size -def hsize(size, decimal_places=2): - if size < 1024: - return "%4i B" % size - size /= 1024 - for unit in ["KB","MB","GB","TB"]: - if size < 1024: - break - size /= 1024 - return f"{size:.{decimal_places}f} {unit}" - - -file_in = sys.argv[1] -file_out = sys.argv[2] - -# Do not process files that have not changed since the last run -# of this script: -if os.path.exists(file_out) and \ - os.path.getmtime(file_in) < os.path.getmtime(file_out): - print("skipping: " + file_in) - exit() - -t0 = time.time() -s0 = os.stat(file_in) -z0 = s0[stat.ST_SIZE] -h0 = hsize(z0) -print("shrink: %11s %s" % - (h0, file_in)) - -with open(file_in, "r") as fp_in: - with open(file_out, "w") as fp_out: - shrink(fp_in, fp_out) - -s1 = os.stat(file_out) -t1 = time.time() -z1 = s1[stat.ST_SIZE] - -t = t1 - t0 -rate = hsize(z0/t) - -print("shrank: %0.2fs %11s/s %11s -> %11s %s" % - (t, rate, hsize(z0), hsize(z1), file_in)) diff --git a/workflows/cp-leaveout/scripts/shrink-output.sh b/workflows/cp-leaveout/scripts/shrink-output.sh deleted file mode 100755 index 0492d513..00000000 --- a/workflows/cp-leaveout/scripts/shrink-output.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -set -eu - -# SHRINK OUTPUT SH -# Accepts a whole workflow output directory -# Clean up and shrink TensorFlow output -# See shrink-output.py for details -# Parallelizable via make - -THIS=$( readlink --canonicalize $( dirname $0 ) ) -CPLO=$( readlink --canonicalize $THIS/.. ) -SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) -export THIS - -source $SUPERVISOR/workflows/common/sh/utils.sh - -SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ - DIR - ${*} - -export PYTHONPATH+=:$SUPERVISOR/workflows/common/python - -if ! [[ -d $DIR ]] -then - echo "Does not exist: $DIR" - exit 1 -fi - -# This is used inside the Makefile below: -mkdir -pv /tmp/$USER/shrink - -cd $DIR/out -nice -n 19 make -j 1 -f $THIS/shrink-output.mk diff --git a/workflows/upf/test/upf-1.sh b/workflows/upf/test/upf-1.sh index c288525a..baf0a417 100755 --- a/workflows/upf/test/upf-1.sh +++ b/workflows/upf/test/upf-1.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash set -eu # TEST UPF 1 From 1eba6c4b4769db00558e7f3140f95fc8fcd01f52 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 20 Apr 2022 14:33:35 -0500 Subject: [PATCH 271/601] Initial version for tests --- scripts/shrink-log-single.sh | 29 +++++++++++ scripts/shrink-log.mk | 11 ++++ scripts/shrink-log.py | 98 ++++++++++++++++++++++++++++++++++++ scripts/shrink-logs.sh | 32 ++++++++++++ 4 files changed, 170 insertions(+) create mode 100644 scripts/shrink-log-single.sh create mode 100644 scripts/shrink-log.mk create mode 100644 scripts/shrink-log.py create mode 100755 scripts/shrink-logs.sh diff --git a/scripts/shrink-log-single.sh b/scripts/shrink-log-single.sh new file mode 100644 index 00000000..6f32a848 --- /dev/null +++ b/scripts/shrink-log-single.sh @@ -0,0 +1,29 @@ +#!/bin/sh +set -eu + +# SHRINK LOG SINGLE SH +# Called by shrink-logs.mk + +INPUT=$1 +OUTPUT=$2 + +NAME=$( basename --suffix=.log $INPUT ) + +T=${INPUT/$NAME/tr} + +if [ $INPUT == $T ] +then + echo "shrink-log-single.sh: ERROR: INPUT is wrong." 
+ exit 1 +fi + + +if [ "${THIS:-}" == "" ] +then + THIS=$( readlink --canonicalize $( dirname $0 ) ) +fi + +tr "\r" "\n" < $INPUT > $T +python $THIS/shrink-log.py $T $OUTPUT + +rm $T diff --git a/scripts/shrink-log.mk b/scripts/shrink-log.mk new file mode 100644 index 00000000..ccb40fc0 --- /dev/null +++ b/scripts/shrink-log.mk @@ -0,0 +1,11 @@ + +.DELETE_ON_ERROR: + +OUTS = $(wildcard out-*.log) + +SUMMARIES = $(subst out-,summary-,$(OUTS)) + +all: $(SUMMARIES) + +summary-%.log: out-%.log + @ ${THIS}/shrink-log-single.sh $(<) $(@) diff --git a/scripts/shrink-log.py b/scripts/shrink-log.py new file mode 100644 index 00000000..12f9d5ff --- /dev/null +++ b/scripts/shrink-log.py @@ -0,0 +1,98 @@ + +# SHRINK LOG PY +# argv: 2 filenames : tr-*.log and summary-*.log +# Called by shrink-log-single.sh +# The tr-*.log file should have used tr to change CR to NL +# Removes non-printing characters (backspace) +# Reduces the number of training lines in output +# Removes redundant batch size information +# Fixes newline before "Current time" report + +import os, re, stat, sys, time +from collections import deque + + +# Only 1/shrink_factor training lines are copied +shrink_factor = 200 +# Number of additional consecutive lines at beginning and end of +# training that are retained +hold_space = 3 + + +def shrink(fp_in, fp_out): + # Queue to hold extra lines that may be printed at end of run + Q = deque() + index = 0 + starts = 0 # Initial hold_space ETAs are immediately printed + line_previous = "" + for line in fp_in: + if len(line) == 1: continue # Blank line + line = line.replace("\b", "") + if "batch:" in line or "Current" in line: + line = re.sub("- batch: .* 32.0000 -", "", line) + line = line.replace("Current", "\nCurrent") + if starts < hold_space: + fp_out.write(line) + starts += 1 + continue + Q.append(line) + index += 1 + if len(Q) > hold_space: + line = Q.popleft() + if index % shrink_factor == 0: + fp_out.write(line) + else: + starts = 0 + while len(Q) > 0: + fp_out.write(Q.popleft()) + if line == line_previous: + continue + fp_out.write(line) + line_previous = line + # Done: flush Q: + while len(Q) > 0: + fp_out.write(Q.popleft()) + + +# From https://www.codegrepper.com/code-examples/python/python+get+human+readable+file+size +def hsize(size, decimal_places=2): + if size < 1024: + return "%4i B" % size + size /= 1024 + for unit in ["KB","MB","GB","TB"]: + if size < 1024: + break + size /= 1024 + return f"{size:.{decimal_places}f} {unit}" + + +file_in = sys.argv[1] +file_out = sys.argv[2] + +# Do not process files that have not changed since the last run +# of this script: +if os.path.exists(file_out) and \ + os.path.getmtime(file_in) < os.path.getmtime(file_out): + print("skipping: " + file_in) + exit() + +t0 = time.time() +s0 = os.stat(file_in) +z0 = s0[stat.ST_SIZE] +h0 = hsize(z0) +print("shrink: %11s %s" % + (h0, file_in)) + +with open(file_in, "r") as fp_in: + with open(file_out, "w") as fp_out: + shrink(fp_in, fp_out) + +s1 = os.stat(file_out) +t1 = time.time() +z1 = s1[stat.ST_SIZE] + +t = t1 - t0 +rate = hsize(z0/t) + +print("shrank: %0.2fs %11s/s %11s -> %11s %s" % + (t, rate, hsize(z0), hsize(z1), file_in)) diff --git a/scripts/shrink-logs.sh b/scripts/shrink-logs.sh new file mode 100755 index 00000000..54a32635 --- /dev/null +++ b/scripts/shrink-logs.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -eu + +# SHRINK LOGS SH +# Accepts a whole workflow output directory +# Clean up and shrink TensorFlow output logs +# See shrink-log.py for details +# Parallelizable via make + +THIS=$( readlink 
--canonicalize $( dirname $0 ) ) +CPLO=$( readlink --canonicalize $THIS/.. ) +SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) +export THIS + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an output DIR (e.g., .../experiments/X042/out)!" \ + DIR - ${*} + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +# This is used inside the Makefile below: +mkdir -pv /tmp/$USER/shrink + +cd $DIR +nice -n 19 make -j 8 -f $THIS/shrink-log.mk From 621a9e2891e0b14a734705847291c4cc9a5a8a9f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 20 Apr 2022 14:35:23 -0500 Subject: [PATCH 272/601] Bug fix --- scripts/shrink-logs.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/shrink-logs.sh b/scripts/shrink-logs.sh index 54a32635..7a1f387f 100755 --- a/scripts/shrink-logs.sh +++ b/scripts/shrink-logs.sh @@ -7,9 +7,8 @@ set -eu # See shrink-log.py for details # Parallelizable via make -THIS=$( readlink --canonicalize $( dirname $0 ) ) -CPLO=$( readlink --canonicalize $THIS/.. ) -SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) +THIS=$( realpath $( dirname $0 ) ) +SUPERVISOR=$( realpath $THIS/.. ) export THIS source $SUPERVISOR/workflows/common/sh/utils.sh From 474b0d8b19b33cbeda27dfcda33e1497da593b09 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 20 Apr 2022 14:41:46 -0500 Subject: [PATCH 273/601] WS --- scripts/shrink-log-single.sh | 1 - 1 file changed, 1 deletion(-) mode change 100644 => 100755 scripts/shrink-log-single.sh diff --git a/scripts/shrink-log-single.sh b/scripts/shrink-log-single.sh old mode 100644 new mode 100755 index 6f32a848..9d1284c1 --- a/scripts/shrink-log-single.sh +++ b/scripts/shrink-log-single.sh @@ -17,7 +17,6 @@ then exit 1 fi - if [ "${THIS:-}" == "" ] then THIS=$( readlink --canonicalize $( dirname $0 ) ) From 8306a77b44de1680e3396c13be7f6b175d12e413 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 20 Apr 2022 14:41:52 -0500 Subject: [PATCH 274/601] Go back to sequential make --- scripts/shrink-logs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shrink-logs.sh b/scripts/shrink-logs.sh index 7a1f387f..2d9a8207 100755 --- a/scripts/shrink-logs.sh +++ b/scripts/shrink-logs.sh @@ -28,4 +28,4 @@ fi mkdir -pv /tmp/$USER/shrink cd $DIR -nice -n 19 make -j 8 -f $THIS/shrink-log.mk +nice -n 19 make -j 1 -f $THIS/shrink-log.mk From 4c12562c1d5d8ebe8c735ef1fb4c8660f66d17f9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 6 May 2022 16:19:54 -0500 Subject: [PATCH 275/601] New CPLO analysis scripts --- workflows/cp-leaveout/scripts/README.adoc | 8 +++ workflows/cp-leaveout/scripts/leaf-stats.sh | 57 +++++++++++++++++++ .../cp-leaveout/scripts/list-node-singles.py | 33 +++++++++++ 3 files changed, 98 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/leaf-stats.sh create mode 100755 workflows/cp-leaveout/scripts/list-node-singles.py diff --git a/workflows/cp-leaveout/scripts/README.adoc b/workflows/cp-leaveout/scripts/README.adoc index ba96708e..bf827762 100644 --- a/workflows/cp-leaveout/scripts/README.adoc +++ b/workflows/cp-leaveout/scripts/README.adoc @@ -127,6 +127,14 @@ Average GPU utilization List the Nodes from the JSON file. +==== List Node Singles + +List the Nodes from the JSON file with a single cell line. + +==== Leaf Stats + +Report key stats from the python.log for the given nodes. 
+ ==== tar experiment Make backup tars for experiment data diff --git a/workflows/cp-leaveout/scripts/leaf-stats.sh b/workflows/cp-leaveout/scripts/leaf-stats.sh new file mode 100755 index 00000000..a38852cf --- /dev/null +++ b/workflows/cp-leaveout/scripts/leaf-stats.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set -eu + +# LEAF STATS +# Report stats for given nodes +# LIST: A file containing a simple per-line list of nodes, +# e.g., "1.1.2\n2.3.1\n4.1.1.1\n" + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an run DIR (e.g., .../experiments/X042/run/1.2.3)!" \ + -H "Provide a node list (from list-node-singles)" \ + DIR LIST - ${*} + +if [[ ! -d $DIR ]] +then + echo "$0: Given run directory does not exist: $DIR" + exit 1 +fi + +# Read the node list into a Bash array +NODES=() +while read NODE CELL +do + NODES+=( $NODE ) + CELLS+=( $CELL ) +done < $LIST + +{ + echo "CELL NODE POINTS EPOCHS MAE R2 VAL_LOSS EARLY" + for (( i=1 ; i < ${#NODES[@]} ; i++ )) + do + NODE=${NODES[$i]} + CELL=${CELLS[$i]} + LOG=$D/run/$NODE/save/python.log + # Pull out validation points: + POINTS=$( grep "Data points per epoch:" < $LOG | cut -d ' ' -f 12 ) + echo -n "$CELL $NODE ${POINTS:0:-1} " + # grep "loss:" $LOG | tail -1 + # Grab the last Epoch line in the log, + # extract the desired stats, + # remove commas, delete trailing newline + grep "loss:" $LOG | tail -1 | \ + awk '{ printf( "%i %f %+f %f ", strtonum($4), strtonum($8), strtonum($10), strtonum($12)); }' | \ + tr ',' ' ' | tr --delete '\n' + if grep -q "stopping: early" $LOG + then + EARLY=1 + else + EARLY=0 + fi + echo $EARLY + done +} | column -t diff --git a/workflows/cp-leaveout/scripts/list-node-singles.py b/workflows/cp-leaveout/scripts/list-node-singles.py new file mode 100755 index 00000000..5c5bf367 --- /dev/null +++ b/workflows/cp-leaveout/scripts/list-node-singles.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python + +# LIST NODE SINGLES PY +# Extract the nodes from the JSON file with a single cell line +# report the node and cell line + +import argparse, json + +parser = argparse.ArgumentParser() +parser.add_argument('plan', type=str, help='Plan data file') +args = parser.parse_args() + +try: + with open(args.plan) as fp: + J = json.load(fp) +except Exception as e: + print("could not read JSON in file: %s\n" % args.plan + str(e)) + exit(1) + +count = 0 + +for k in J.keys(): + entry = J[k] + if "val" not in entry: + # Root entry + continue + val = entry["val"] # A list + cells = val[0]["cell"] + if len(cells) == 1: + print(k + " " + cells[0]) + count += 1 + +# print(f"count: {count}") From 8ae086afc80e2aedb29a6533332c8cab906967e2 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 10 May 2022 13:53:17 -0500 Subject: [PATCH 276/601] New leaf-stats scripts --- workflows/common/python/utils.py | 74 ++++++++++++++ workflows/cp-leaveout/scripts/leaf-stats.py | 103 ++++++++++++++++++++ workflows/cp-leaveout/scripts/leaf-stats.sh | 37 +------ 3 files changed, 180 insertions(+), 34 deletions(-) create mode 100644 workflows/cp-leaveout/scripts/leaf-stats.py diff --git a/workflows/common/python/utils.py b/workflows/common/python/utils.py index 2db1e08d..c21b3981 100644 --- a/workflows/common/python/utils.py +++ b/workflows/common/python/utils.py @@ -36,3 +36,77 @@ def append(filename, text): fp.write('\n') except Exception as e: fail(e, os.EX_IOERR, 'Could not append to: ' + filename) + +import re + +class Matcher: + + """ Abstract 
class for use with Grepper """ + + def __init__(self, regexp): + self.regexp = regexp + self.pattern = re.compile(self.regexp) + + def match(self, line): + m = self.pattern.match(line) + if m is None: + return None + self.run(line) + + def run(self, line): + """ User code should override this """ + pass + + def reset(self): + """ User code should override this """ + pass + + +class Grepper: + + def __init__(self, matchers): + """ matchers: List of Matchers """ + self.matchers = matchers + + def grep(self, filename): + with open(filename, "r") as fp: + while True: + line = fp.readline() + if len(line) == 0: break + for matcher in self.matchers: + matcher.match(line) + + def reset(self): + for matcher in self.matchers: + matcher.reset() + + +def columnPrint(D, aligns): + """ D: a dict mapping a header string to a list of string data """ + """ aligns: a string "llrlr" for left or right alignment by column """ + headers = D.keys() + # Format specs for headers + fmth = "" + # Format specs for data + fmtd = "" + maxlist = 0 + index = 0 # To track aligns + for header in headers: + maxstr = len(header) + if len(D[header]) > maxlist: + maxlist = len(D[header]) + for item in D[header]: + if len(item) > maxstr: + maxstr = len(item) + # Header is always left-aligned + fmth += "%%-%is " % maxstr + sign = "-" if aligns[index] == "l" else "" + fmtd += "%%%s%is " % (sign, maxstr) + index += 1 + # Start printing + print(fmth % tuple(headers)) + for i in range(0, maxlist-1): + L = [] + for header in headers: + L.append(D[header][i]) + print(fmtd % tuple(L)) diff --git a/workflows/cp-leaveout/scripts/leaf-stats.py b/workflows/cp-leaveout/scripts/leaf-stats.py new file mode 100644 index 00000000..ac26d3f9 --- /dev/null +++ b/workflows/cp-leaveout/scripts/leaf-stats.py @@ -0,0 +1,103 @@ +# LEAF STATS PY + +import argparse, os, sys + +import utils + +parser = argparse.ArgumentParser(description='Print leaf stats') +parser.add_argument('directory', + help='The experiment directory (EXPID)') +parser.add_argument('list', + help='The list of nodes to process') + +args = parser.parse_args() + +# Map from node "1.1.1.1.2.3" to cell line "CCLE.KMS11" +nodes = {} + +with open(args.list, 'r') as fp: + while True: + line = fp.readline() + if len(line) == 0: break + tokens = line.split() + node = tokens[0] + cell = tokens[1] + nodes[node] = cell + +from collections import OrderedDict + +headers = [ "CELL", "NODE", "POINTS", "EPOCHS", "MAE", "R2", "VAL_LOSS", "EARLY" ] +columns = OrderedDict() +for header in headers: + columns[header] = [] + + +class MatcherPoints(utils.Matcher): + + def __init__(self): + super(MatcherPoints, self).__init__(".*Data points per epoch.*") + self.reset() + + def run(self, line): + tokens = line.split() + # Remove trailing comma: + self.points = tokens[11][0:-1] + + def reset(self): + self.points = 0 + + +class MatcherStats(utils.Matcher): + + def __init__(self): + super(MatcherStats, self).__init__(".*loss:.*") + self.reset() + + def run(self, line): + tokens = line.split() + # Remove trailing bracket or comma: + self.epochs = tokens[ 3][0:-1] + self.mae = tokens[ 7][0:-1] + self.r2 = tokens[ 9][0:-1] + self.val_loss = tokens[11][0:-1] + + def reset(self): + self.epochs = 0 + self.mae = 0 + self.r2 = 0 + self.val_loss = 0 + + +class MatcherEarly(utils.Matcher): + + def __init__(self): + super(MatcherEarly, self).__init__(".*stopping: early.*") + self.reset() + + def run(self, line): + self.early = "1" + + def reset(self): + self.early = "0" + + +matcherPoints = MatcherPoints() +matcherStats = 
MatcherStats() +matcherEarly = MatcherEarly() +grepper = utils.Grepper([matcherPoints, matcherStats, matcherEarly]) + +for node in nodes: + cell = nodes[node] + log = f"{args.directory}/run/{node}/save/python.log" + grepper.grep(log) + columns["CELL"] .append(cell) + columns["NODE"] .append(node) + columns["POINTS"] .append(matcherPoints.points) + columns["EPOCHS"] .append(matcherStats.epochs) + columns["MAE"] .append(matcherStats.mae) + columns["R2"] .append(matcherStats.r2) + columns["VAL_LOSS"].append(matcherStats.val_loss) + columns["EARLY"] .append(matcherEarly.early) + grepper.reset() + +utils.columnPrint(columns, "llrrrrrr") diff --git a/workflows/cp-leaveout/scripts/leaf-stats.sh b/workflows/cp-leaveout/scripts/leaf-stats.sh index a38852cf..3ef394a9 100755 --- a/workflows/cp-leaveout/scripts/leaf-stats.sh +++ b/workflows/cp-leaveout/scripts/leaf-stats.sh @@ -6,7 +6,7 @@ set -eu # LIST: A file containing a simple per-line list of nodes, # e.g., "1.1.2\n2.3.1\n4.1.1.1\n" -THIS=$( readlink --canonicalize $( dirname $0 ) ) +THIS=$( realpath $( dirname $0 ) ) SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) source $SUPERVISOR/workflows/common/sh/utils.sh @@ -21,37 +21,6 @@ then exit 1 fi -# Read the node list into a Bash array -NODES=() -while read NODE CELL -do - NODES+=( $NODE ) - CELLS+=( $CELL ) -done < $LIST +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python -{ - echo "CELL NODE POINTS EPOCHS MAE R2 VAL_LOSS EARLY" - for (( i=1 ; i < ${#NODES[@]} ; i++ )) - do - NODE=${NODES[$i]} - CELL=${CELLS[$i]} - LOG=$D/run/$NODE/save/python.log - # Pull out validation points: - POINTS=$( grep "Data points per epoch:" < $LOG | cut -d ' ' -f 12 ) - echo -n "$CELL $NODE ${POINTS:0:-1} " - # grep "loss:" $LOG | tail -1 - # Grab the last Epoch line in the log, - # extract the desired stats, - # remove commas, delete trailing newline - grep "loss:" $LOG | tail -1 | \ - awk '{ printf( "%i %f %+f %f ", strtonum($4), strtonum($8), strtonum($10), strtonum($12)); }' | \ - tr ',' ' ' | tr --delete '\n' - if grep -q "stopping: early" $LOG - then - EARLY=1 - else - EARLY=0 - fi - echo $EARLY - done -} | column -t +python $THIS/leaf-stats.py $DIR $LIST From d5461681172802a87f4b3cf672a14c4cd62b1b0f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 10 May 2022 14:51:51 -0500 Subject: [PATCH 277/601] More output --- workflows/cp-leaveout/scripts/touch-exps.zsh | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 workflows/cp-leaveout/scripts/touch-exps.zsh diff --git a/workflows/cp-leaveout/scripts/touch-exps.zsh b/workflows/cp-leaveout/scripts/touch-exps.zsh old mode 100644 new mode 100755 index e9506262..306fbf5f --- a/workflows/cp-leaveout/scripts/touch-exps.zsh +++ b/workflows/cp-leaveout/scripts/touch-exps.zsh @@ -20,6 +20,7 @@ A=( 750 for X in $A do + print experiments/X$X ds experiments/X$X last-access experiments/X$X touch-all experiments/X$X From 7ffa271e34febbe02c8dd1a356deec6113be2d9f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 10 May 2022 14:52:09 -0500 Subject: [PATCH 278/601] Check input --- workflows/common/python/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflows/common/python/utils.py b/workflows/common/python/utils.py index c21b3981..ac18f20c 100644 --- a/workflows/common/python/utils.py +++ b/workflows/common/python/utils.py @@ -85,6 +85,10 @@ def columnPrint(D, aligns): """ D: a dict mapping a header string to a list of string data """ """ aligns: a string "llrlr" for left or right alignment by column """ headers = D.keys() 
+ assert len(aligns) == len(headers), \ + "Length of aligns (%i) does not match headers (%i)!" % \ + (len(aligns), len(headers)) + # Format specs for headers fmth = "" # Format specs for data From 02ca6df686340ce0147c128e9ac8ce56d07ad5a3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 10 May 2022 14:55:09 -0500 Subject: [PATCH 279/601] Pull out holdout errors --- workflows/cp-leaveout/scripts/leaf-stats.py | 54 +++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/workflows/cp-leaveout/scripts/leaf-stats.py b/workflows/cp-leaveout/scripts/leaf-stats.py index ac26d3f9..72b350fd 100644 --- a/workflows/cp-leaveout/scripts/leaf-stats.py +++ b/workflows/cp-leaveout/scripts/leaf-stats.py @@ -26,7 +26,8 @@ from collections import OrderedDict -headers = [ "CELL", "NODE", "POINTS", "EPOCHS", "MAE", "R2", "VAL_LOSS", "EARLY" ] +headers = [ "CELL", "NODE", "POINTS", "EPOCHS", "MAE", "R2", "VAL_LOSS", + "EARLY", "HO_MSE", "HO_MAE", "HO_R2" ] columns = OrderedDict() for header in headers: columns[header] = [] @@ -81,10 +82,54 @@ def reset(self): self.early = "0" +class MatcherHoldoutMSE(utils.Matcher): + + def __init__(self): + super(MatcherHoldoutMSE, self).__init__(".* mse:.*") + self.reset() + + def run(self, line): + tokens = line.split() + self.ho_mse = tokens[3] + + def reset(self): + self.ho_mse = "0" + +class MatcherHoldoutMAE(utils.Matcher): + + def __init__(self): + super(MatcherHoldoutMAE, self).__init__(".* mae:.*") + self.reset() + + def run(self, line): + tokens = line.split() + self.ho_mae = tokens[3] + + def reset(self): + self.ho_mae = "0" + +class MatcherHoldoutR2(utils.Matcher): + + def __init__(self): + super(MatcherHoldoutR2, self).__init__(".* r2:.*") + self.reset() + + def run(self, line): + tokens = line.split() + self.ho_r2 = tokens[3] + + def reset(self): + self.ho_r2 = "0" + + matcherPoints = MatcherPoints() matcherStats = MatcherStats() matcherEarly = MatcherEarly() -grepper = utils.Grepper([matcherPoints, matcherStats, matcherEarly]) +matcherHO_MSE = MatcherHoldoutMSE() +matcherHO_MAE = MatcherHoldoutMAE() +matcherHO_R2 = MatcherHoldoutR2() +grepper = utils.Grepper([matcherPoints, matcherStats, matcherEarly, + matcherHO_MSE, matcherHO_MAE, matcherHO_R2]) for node in nodes: cell = nodes[node] @@ -98,6 +143,9 @@ def reset(self): columns["R2"] .append(matcherStats.r2) columns["VAL_LOSS"].append(matcherStats.val_loss) columns["EARLY"] .append(matcherEarly.early) + columns["HO_MSE"] .append(matcherHO_MSE.ho_mse) + columns["HO_MAE"] .append(matcherHO_MAE.ho_mae) + columns["HO_R2"] .append(matcherHO_R2 .ho_r2) grepper.reset() -utils.columnPrint(columns, "llrrrrrr") +utils.columnPrint(columns, "llrrrrrrrrr") From 4870dcd908f29ecdd9f30540ade4e2ebe8b33d2c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 10 May 2022 16:21:20 -0500 Subject: [PATCH 280/601] Use Pandas --- workflows/cp-leaveout/scripts/leaf-stats.py | 35 +++++++++++---------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/workflows/cp-leaveout/scripts/leaf-stats.py b/workflows/cp-leaveout/scripts/leaf-stats.py index 72b350fd..8f1fcd5e 100644 --- a/workflows/cp-leaveout/scripts/leaf-stats.py +++ b/workflows/cp-leaveout/scripts/leaf-stats.py @@ -2,6 +2,8 @@ import argparse, os, sys +import pandas as pd + import utils parser = argparse.ArgumentParser(description='Print leaf stats') @@ -26,12 +28,10 @@ from collections import OrderedDict -headers = [ "CELL", "NODE", "POINTS", "EPOCHS", "MAE", "R2", "VAL_LOSS", +columns = [ "CELL", "NODE", "POINTS", "EPOCHS", 
"MAE", "R2", "VAL_LOSS", "EARLY", "HO_MSE", "HO_MAE", "HO_R2" ] -columns = OrderedDict() -for header in headers: - columns[header] = [] +df = pd.DataFrame(columns=columns) class MatcherPoints(utils.Matcher): @@ -135,17 +135,20 @@ def reset(self): cell = nodes[node] log = f"{args.directory}/run/{node}/save/python.log" grepper.grep(log) - columns["CELL"] .append(cell) - columns["NODE"] .append(node) - columns["POINTS"] .append(matcherPoints.points) - columns["EPOCHS"] .append(matcherStats.epochs) - columns["MAE"] .append(matcherStats.mae) - columns["R2"] .append(matcherStats.r2) - columns["VAL_LOSS"].append(matcherStats.val_loss) - columns["EARLY"] .append(matcherEarly.early) - columns["HO_MSE"] .append(matcherHO_MSE.ho_mse) - columns["HO_MAE"] .append(matcherHO_MAE.ho_mae) - columns["HO_R2"] .append(matcherHO_R2 .ho_r2) + newrow = pd.DataFrame({ + "CELL" : [cell], + "NODE" : [node], + "POINTS" : [matcherPoints.points], + "EPOCHS" : [matcherStats.epochs], + "MAE" : [matcherStats.mae], + "R2" : [matcherStats.r2], + "VAL_LOSS" : [matcherStats.val_loss], + "EARLY" : [matcherEarly.early], + "HO_MSE" : [matcherHO_MSE.ho_mse], + "HO_MAE" : [matcherHO_MAE.ho_mae], + "HO_R2" : [matcherHO_R2 .ho_r2] + }) + df = pd.concat([df, newrow], ignore_index=True) grepper.reset() -utils.columnPrint(columns, "llrrrrrrrrr") +print(df.to_string()) From 93976fa666464cc201240301730e2575c35f9919 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 10 May 2022 19:04:08 -0500 Subject: [PATCH 281/601] Use tabulate --- workflows/cp-leaveout/scripts/leaf-stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/leaf-stats.py b/workflows/cp-leaveout/scripts/leaf-stats.py index 8f1fcd5e..bd5d4f83 100644 --- a/workflows/cp-leaveout/scripts/leaf-stats.py +++ b/workflows/cp-leaveout/scripts/leaf-stats.py @@ -151,4 +151,5 @@ def reset(self): df = pd.concat([df, newrow], ignore_index=True) grepper.reset() -print(df.to_string()) +from tabulate import tabulate +print(tabulate(df, headers='keys', tablefmt='plain')) From cf5f64cb0a9e7cd77ee72149cffa1622c25eae95 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 18 May 2022 15:24:34 -0400 Subject: [PATCH 282/601] o Fix hyper_parameter_map epochs error --- workflows/common/python/model_runner.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 6ff6a37d..9c1658e4 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -181,8 +181,12 @@ def run(hyper_parameter_map, obj_return): stop_perf(Ps) finish = time.time() duration = finish - start - - epochs = hyper_parameter_map['epochs'] + + # check for epochs if not present set to 1, used for checking early stopping in function get_results + if 'epochs' in hyper_parameter_map: + epochs = hyper_parameter_map['epochs'] + else: + epochs = 1 # Default result if there is no val_loss (as in infer.py) result = 0 From 7dda5d75ae11cf7aeb4b5a53a7bbdbab99670d0b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 23 May 2022 14:20:45 -0500 Subject: [PATCH 283/601] Clean up --- workflows/upf/swift/workflow.swift | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/upf/swift/workflow.swift b/workflows/upf/swift/workflow.swift index 98afcb1a..3c1a4b88 100644 --- a/workflows/upf/swift/workflow.swift +++ b/workflows/upf/swift/workflow.swift @@ -39,8 +39,6 @@ foreach params,i in upf_lines { printf("params: %s", params); id = 
json_get(params, "id"); - // NOTE: obj() is in the obj_*.swift supplied by workflow.sh - // id = "id_%02i"%i; results[i] = obj(params, id); assert(results[i] != "EXCEPTION", "exception in obj()!"); } From 6cc705aa4fb7f33d6e12db9e19dac81793926f04 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 23 May 2022 14:20:55 -0500 Subject: [PATCH 284/601] Add findable prefix --- workflows/upf/swift/workflow.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/upf/swift/workflow.swift b/workflows/upf/swift/workflow.swift index 3c1a4b88..687a18ab 100644 --- a/workflows/upf/swift/workflow.swift +++ b/workflows/upf/swift/workflow.swift @@ -46,4 +46,4 @@ foreach params,i in upf_lines // Join all result values into one big semicolon-delimited string string result = join(results, ";"); // and print it -printf(result); +printf("WORKFLOW RESULT: " + result); From 6d275b49c74a5698309bdbdcb703d07bf04e3c21 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 23 May 2022 14:22:02 -0500 Subject: [PATCH 285/601] Demo scripts --- workflows/upf/test/cfg-sys-demo-1.sh | 23 +++++++++++++++++++++++ workflows/upf/test/demo-1.json | 3 +++ workflows/upf/test/demo-sweep-1.sh | 24 ++++++++++++++++++++++++ 3 files changed, 50 insertions(+) create mode 100644 workflows/upf/test/cfg-sys-demo-1.sh create mode 100644 workflows/upf/test/demo-1.json create mode 100755 workflows/upf/test/demo-sweep-1.sh diff --git a/workflows/upf/test/cfg-sys-demo-1.sh b/workflows/upf/test/cfg-sys-demo-1.sh new file mode 100644 index 00000000..e4788c04 --- /dev/null +++ b/workflows/upf/test/cfg-sys-demo-1.sh @@ -0,0 +1,23 @@ + +# CFG SYS DEMO 1 + + +# The number of MPI processes +# Note that 1 process is reserved for Swift/T +# For example, if PROCS=4 that gives you 3 workers, +# i.e., 3 concurrent Keras runs. +export PROCS=${PROCS:-6} + +# MPI processes per node. This should not exceed PROCS. +export PPN=${PPN:-6} + +# Summit: +export QUEUE=${QUEUE:-batch} +export PROJECT=med106 +export TURBINE_LAUNCH_OPTIONS="-a1 -g1 -c7" + +# export WALLTIME=${WALLTIME:-0:30} + +# Benchmark run timeout: benchmark run will timeouT +# after the specified number of seconds. -1 is no timeout. +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} diff --git a/workflows/upf/test/demo-1.json b/workflows/upf/test/demo-1.json new file mode 100644 index 00000000..fa511783 --- /dev/null +++ b/workflows/upf/test/demo-1.json @@ -0,0 +1,3 @@ +{"id": "test1", "epochs": 3, "dense": "200 15"} +{"id": "test2", "epochs": 3, "dense": "200 20"} +{"id": "test3", "epochs": 3, "dense": "200 25"} diff --git a/workflows/upf/test/demo-sweep-1.sh b/workflows/upf/test/demo-sweep-1.sh new file mode 100755 index 00000000..5eab8d82 --- /dev/null +++ b/workflows/upf/test/demo-sweep-1.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -eu + +# DEMO SWEEP 1 + +if (( ${#} != 2 )) +then + echo "usage: test BENCHMARK_NAME SITE" + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. 
; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +export OBJ_RETURN="val_loss" +CFG_SYS=$THIS/cfg-sys-1.sh + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/demo-1.json From 0bef49a596fa27b883e28cabc7e4917eb93ccd4a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 1 Jun 2022 11:17:04 +0200 Subject: [PATCH 286/601] flake8 fixes --- workflows/cp-leaveout/scripts/leaf-stats.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/workflows/cp-leaveout/scripts/leaf-stats.py b/workflows/cp-leaveout/scripts/leaf-stats.py index bd5d4f83..7bcc11de 100644 --- a/workflows/cp-leaveout/scripts/leaf-stats.py +++ b/workflows/cp-leaveout/scripts/leaf-stats.py @@ -1,6 +1,7 @@ + # LEAF STATS PY -import argparse, os, sys +import argparse import pandas as pd @@ -26,13 +27,12 @@ cell = tokens[1] nodes[node] = cell -from collections import OrderedDict - columns = [ "CELL", "NODE", "POINTS", "EPOCHS", "MAE", "R2", "VAL_LOSS", "EARLY", "HO_MSE", "HO_MAE", "HO_R2" ] df = pd.DataFrame(columns=columns) + class MatcherPoints(utils.Matcher): def __init__(self): @@ -95,6 +95,7 @@ def run(self, line): def reset(self): self.ho_mse = "0" + class MatcherHoldoutMAE(utils.Matcher): def __init__(self): @@ -108,6 +109,7 @@ def run(self, line): def reset(self): self.ho_mae = "0" + class MatcherHoldoutR2(utils.Matcher): def __init__(self): From dfcc9a3ec289bfb8a412c24438ea9ad92c6c20cc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 29 Jul 2022 14:17:55 -0500 Subject: [PATCH 287/601] WS --- workflows/common/python/model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 9c1658e4..21561bdb 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -181,7 +181,7 @@ def run(hyper_parameter_map, obj_return): stop_perf(Ps) finish = time.time() duration = finish - start - + # check for epochs if not present set to 1, used for checking early stopping in function get_results if 'epochs' in hyper_parameter_map: epochs = hyper_parameter_map['epochs'] From 7aa11519d866e283fc72465a2bc74236eb686c4d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 29 Jul 2022 14:19:15 -0500 Subject: [PATCH 288/601] flake8 fixes --- workflows/common/python/model_runner.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 21561bdb..20646873 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -8,7 +8,6 @@ import os import sys import time -import numpy as np import importlib import runner_utils from runner_utils import ModelResult @@ -161,8 +160,6 @@ def run(hyper_parameter_map, obj_return): history = None exception = False - from tensorflow.errors import InvalidArgumentError - # Run the model! 
log("PKG RUN START") @@ -209,13 +206,14 @@ def run(hyper_parameter_map, obj_return): def get_obj_return(): obj_return = os.getenv('OBJ_RETURN') valid_obj_returns = [ 'loss', 'val_loss', 'val_corr', 'val_acc' ] - if obj_return == None: + if obj_return is None: raise Exception('No OBJ_RETURN was in the environment!') if obj_return not in valid_obj_returns: raise Exception('Invalid value for OBJ_RETURN: use: ' + str(valid_obj_returns)) return obj_return + def load_pre_post(hyper_parameter_map, key): module = None if key in hyper_parameter_map: @@ -223,22 +221,25 @@ def load_pre_post(hyper_parameter_map, key): module = importlib.import_module(module_name) return module + def run_pre(hyper_parameter_map): module = load_pre_post(hyper_parameter_map, 'pre_module') result = ModelResult.SUCCESS - if module != None: + if module is not None: logger.debug('PRE RUN START') result = module.pre_run(hyper_parameter_map) logger.debug('PRE RUN STOP') return result + def run_post(hyper_parameter_map, output_map): module = load_pre_post(hyper_parameter_map, 'post_module') - if module != None: + if module is not None: logger.debug('POST RUN START') module.post_run(hyper_parameter_map, output_map) logger.debug('POST RUN STOP') + def run_model(hyper_parameter_map): # In-memory Python runs may not create sys.argv if 'argv' not in dir(sys): @@ -275,6 +276,7 @@ def run_model(hyper_parameter_map): logger.info('RUN STOP') return (result, history) + def setup_params(pkg, hyper_parameter_map, params_arg): params = pkg.initialize_parameters(**params_arg) logger.debug('PARAM UPDATE START') @@ -353,7 +355,7 @@ def get_results(history, obj_return, epochs_expected): logger.info('main: RUN START') import sys - ( _, # The Python program name (unused) + ( _, # The Python program name (unused) param_string, instance_directory, framework, @@ -365,7 +367,7 @@ def get_results(history, obj_return, epochs_expected): framework, out_dir_key='save') hyper_parameter_map['model_name'] = os.getenv('MODEL_NAME') - if hyper_parameter_map['model_name'] == None: + if hyper_parameter_map['model_name'] is None: raise Exception('No MODEL_NAME was in the environment!') hyper_parameter_map['experiment_id'] = os.getenv('EXPID') hyper_parameter_map['run_id'] = runid From 8031f57b3eb222c98f5702d2e033b05f38931eab Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Oct 2022 11:30:03 -0700 Subject: [PATCH 289/601] Update notes based on experience on Lambda --- workflows/common/R/README.adoc | 20 ++++++++++++++++++++ workflows/common/R/install-candle.R | 21 ++++----------------- 2 files changed, 24 insertions(+), 17 deletions(-) create mode 100644 workflows/common/R/README.adoc diff --git a/workflows/common/R/README.adoc b/workflows/common/R/README.adoc new file mode 100644 index 00000000..809bc44f --- /dev/null +++ b/workflows/common/R/README.adoc @@ -0,0 +1,20 @@ + +Run install-candle.sh to set up R for CANDLE HPO via mlrMBO. + +Unstructured historical notes follow. + +# Force Plotly 4.5.6 - not latest! Do not want shiny/httpuv, it does not work on Cooley! 
+#install.packages("https://cran.r-project.org/src/contrib/Archive/plotly/plotly_4.5.6.tar.gz") # This dies with a dependency error but plotly is installed anyway as a dependency of the following packages, so I'm putting it back into the PKGS list (ALW, 9/29/20) +#install.packages("https://cran.r-project.org/src/contrib/Archive/jsonlite/jsonlite_1.7.0.tar.gz") # ALW adding this on 9/12/20 (and removing jsonlite from PKGS list below) because sometime in the first two weeks of Sept 2020 the default jsonlite version became 1.7.1 and this seems to throw an error that looks to me like a bug that should be fixed with time; e.g., while everything worked in early Sept 2020 (probably 9/2/20), now on 9/12/20 I get this error: +# * DONE (jsonlite) +# 1): succeeded '/usr/local/apps/R/4.0/4.0.0/lib64/R/bin/R CMD INSTALL -l '/gpfs/gsfs9/users/BIDS-HPC/public/software/distributions/candle/dev_2/builds/R/libs' '/lscratch/64803361/Rtmpnd5yDC/downloaded_packages/jsonlite_1.7.1.tar.gz'' +# The downloaded source packages are in +# /lscratch/64803361/Rtmpnd5yDC/downloaded_packages +# [1] "" +# LOAD: jsonlite +# Error in value[[3L]](cond) : +# Package 'jsonlite' version 1.7.0 cannot be unloaded: +# Error in unloadNamespace(package) : namespace jsonlite is imported by plotly so cannot be unloaded +# Calls: library ... tryCatch -> tryCatchList -> tryCatchOne -> +# Execution halted +# ****NOTE**** that I tried installing both plotly and jsonlite the normal way (in the PKGS below instead of a specific version above) and I got the same error diff --git a/workflows/common/R/install-candle.R b/workflows/common/R/install-candle.R index 426cbf83..1f27e3d7 100644 --- a/workflows/common/R/install-candle.R +++ b/workflows/common/R/install-candle.R @@ -9,33 +9,20 @@ NCPUS = 16 r <- getOption("repos") +# Change this mirror as needed: # r["CRAN"] <- "http://cran.cnr.berkeley.edu/" r["CRAN"] <- "http://cran.wustl.edu/" options(repos = r) -# Force Plotly 4.5.6 - not latest! Do not want shiny/httpuv, it does not work on Cooley! -#install.packages("https://cran.r-project.org/src/contrib/Archive/plotly/plotly_4.5.6.tar.gz") # This dies with a dependency error but plotly is installed anyway as a dependency of the following packages, so I'm putting it back into the PKGS list (ALW, 9/29/20) -#install.packages("https://cran.r-project.org/src/contrib/Archive/jsonlite/jsonlite_1.7.0.tar.gz") # ALW adding this on 9/12/20 (and removing jsonlite from PKGS list below) because sometime in the first two weeks of Sept 2020 the default jsonlite version became 1.7.1 and this seems to throw an error that looks to me like a bug that should be fixed with time; e.g., while everything worked in early Sept 2020 (probably 9/2/20), now on 9/12/20 I get this error: -# * DONE (jsonlite) -# 1): succeeded '/usr/local/apps/R/4.0/4.0.0/lib64/R/bin/R CMD INSTALL -l '/gpfs/gsfs9/users/BIDS-HPC/public/software/distributions/candle/dev_2/builds/R/libs' '/lscratch/64803361/Rtmpnd5yDC/downloaded_packages/jsonlite_1.7.1.tar.gz'' -# The downloaded source packages are in -# /lscratch/64803361/Rtmpnd5yDC/downloaded_packages -# [1] "" -# LOAD: jsonlite -# Error in value[[3L]](cond) : -# Package 'jsonlite' version 1.7.0 cannot be unloaded: -# Error in unloadNamespace(package) : namespace jsonlite is imported by plotly so cannot be unloaded -# Calls: library ... 
tryCatch -> tryCatchList -> tryCatchOne -> -# Execution halted -# ****NOTE**** that I tried installing both plotly and jsonlite the normal way (in the PKGS below instead of a specific version above) and I got the same error - +# Do plotly early in the list: It requires OpenSSL and Curl headers +# which may not be available. PKGS <- list( "RInside", "plotly", "jsonlite", "rgenoud", "DiceKriging", - # not available for R 3.6.1 : needed for mlrMBO HPO + # not available for R 3.6.1 : needed for mlrMBO HPO: "randomForest", "parallelMap", # requires smoof requires misc3d requires --with-tcltk : From ebd59dec2f3042e828c352543a62ea34893d604c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Oct 2022 11:30:14 -0700 Subject: [PATCH 290/601] This uses bash syntax --- workflows/common/R/install-candle.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/R/install-candle.sh b/workflows/common/R/install-candle.sh index 1f6d8e19..8c30626b 100755 --- a/workflows/common/R/install-candle.sh +++ b/workflows/common/R/install-candle.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash set -eu # INSTALL CANDLE R From d8236d4e6256f85b3f77c524d565a0b21ca67d38 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Oct 2022 13:59:32 -0500 Subject: [PATCH 291/601] Initial env file for Lambda- seems to work --- workflows/common/sh/env-lambda.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 workflows/common/sh/env-lambda.sh diff --git a/workflows/common/sh/env-lambda.sh b/workflows/common/sh/env-lambda.sh new file mode 100644 index 00000000..bc21f858 --- /dev/null +++ b/workflows/common/sh/env-lambda.sh @@ -0,0 +1,27 @@ + +# ENV Lambda +# Environment settings for Lambda (Swift, Python, R, Tcl, etc.) + +# Everything is installed in here: +SFW=/home/woz/Public/sfw + +SWIFT=$SFW/swift-t/2022-10-19 +PY=$SFW/Anaconda +EQR=$SFW/EQ-R +R=$SFW/R-4.1.0 + +PATH=$SWIFT/stc/bin:$PATH +PATH=$PY/bin:$PATH + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:$R/lib/R/lib + +# How to run CANDLE models: +SWIFT_IMPL="app" + +# Log settings to output +echo "Programs:" +which python swift-t | nl +# Cf. utils.sh +show PYTHONHOME +log_path LD_LIBRARY_PATH +log_path PYTHONPATH From af0ea9fa3a3c93171cc7a83592b4cc70f8240d1a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Oct 2022 13:59:45 -0500 Subject: [PATCH 292/601] Fix header --- workflows/common/sh/env-theta.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/common/sh/env-theta.sh b/workflows/common/sh/env-theta.sh index c2362f1d..dffcc4f0 100644 --- a/workflows/common/sh/env-theta.sh +++ b/workflows/common/sh/env-theta.sh @@ -1,6 +1,6 @@ -# LANGS Theta -# Language settings for Theta (Swift, Python, R, Tcl, etc.) +# ENV Theta +# Environment settings for Theta (Swift, Python, R, Tcl, etc.) 
# TCL=/home/wozniak/Public/sfw/theta/tcl-8.6.1 # export R=/home/wozniak/Public/sfw/theta/R-3.4.0/lib64/R From 296f9a87d24269c10eed2642fafc8863e8d7e3b9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Oct 2022 13:59:59 -0500 Subject: [PATCH 293/601] Clean up --- workflows/mlrMBO/swift/workflow.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index e0ff804d..7ba80b15 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -161,8 +161,6 @@ else STDOUT="" fi -echo WF LLP $LD_LIBRARY_PATH - # ALW 2021-01-21: Please don't comment out the "-o $TURBINE_OUTPUT/workflow.tic" option below; otherwise, we get permissions issues on Biowulf. Thanks! set -x swift-t -O 0 -n $PROCS \ From ad6b06a22c03ebf25f5f8dd4386642580018325a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Oct 2022 14:00:05 -0500 Subject: [PATCH 294/601] Handle unset LD_LIBRARY_PATH --- workflows/mlrMBO/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 7ba80b15..0197485c 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -169,7 +169,7 @@ swift-t -O 0 -n $PROCS \ -p -I $EQR -r $EQR \ -I $OBJ_DIR \ -i $OBJ_MODULE \ - -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ -e APP_PYTHONPATH \ From c52e3cff6573b5a5eaed9cc860e2283575e6c9a3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Oct 2022 14:00:13 -0500 Subject: [PATCH 295/601] Set resident worker for mlrMBO --- workflows/mlrMBO/test/cfg-sys-nightly.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/workflows/mlrMBO/test/cfg-sys-nightly.sh b/workflows/mlrMBO/test/cfg-sys-nightly.sh index a62ffd8e..ad2d8352 100644 --- a/workflows/mlrMBO/test/cfg-sys-nightly.sh +++ b/workflows/mlrMBO/test/cfg-sys-nightly.sh @@ -37,3 +37,10 @@ export SH_TIMEOUT=${SH_TIMEOUT:-} # Ignore errors: If 1, unknown errors will be reported to model.log # but will not bring down the Swift workflow. See model.sh . export IGNORE_ERRORS=0 + +# Resident task worker rank for mlrMBO algorithm +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi From 9aa2792d190249cbcc899ff4e6b593c1578ee855 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Sun, 23 Oct 2022 14:56:07 -0700 Subject: [PATCH 296/601] o Fix turbine setting for test-1 o uses homes instead of home for env-lambda --- workflows/common/sh/env-lambda.sh | 2 +- workflows/mlrMBO/test/cfg-sys-1.sh | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/env-lambda.sh b/workflows/common/sh/env-lambda.sh index bc21f858..b64e3640 100644 --- a/workflows/common/sh/env-lambda.sh +++ b/workflows/common/sh/env-lambda.sh @@ -3,7 +3,7 @@ # Environment settings for Lambda (Swift, Python, R, Tcl, etc.) 
# Everything is installed in here: -SFW=/home/woz/Public/sfw +SFW=/homes/woz/Public/sfw SWIFT=$SFW/swift-t/2022-10-19 PY=$SFW/Anaconda diff --git a/workflows/mlrMBO/test/cfg-sys-1.sh b/workflows/mlrMBO/test/cfg-sys-1.sh index 147cdd7a..5720edcd 100644 --- a/workflows/mlrMBO/test/cfg-sys-1.sh +++ b/workflows/mlrMBO/test/cfg-sys-1.sh @@ -35,6 +35,13 @@ BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} # This timeout is implemented with the shell command 'timeout' export SH_TIMEOUT=${SH_TIMEOUT:-} +# Resident task worker rank for mlrMBO algorithm +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi + # Ignore errors: If 1, unknown errors will be reported to model.log # but will not bring down the Swift workflow. See model.sh . export IGNORE_ERRORS=0 From 090c4012a181849f27f30442dcfa7e0afe3ad43e Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 26 Oct 2022 11:20:34 -0700 Subject: [PATCH 297/601] o Fix LD_LIBRARY_PATH --- workflows/common/sh/env-lambda.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-lambda.sh b/workflows/common/sh/env-lambda.sh index b64e3640..e3951c64 100644 --- a/workflows/common/sh/env-lambda.sh +++ b/workflows/common/sh/env-lambda.sh @@ -13,7 +13,7 @@ R=$SFW/R-4.1.0 PATH=$SWIFT/stc/bin:$PATH PATH=$PY/bin:$PATH -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:$R/lib/R/lib +export LD_LIBRARY_PATH=$R/lib/R/lib:${LD_LIBRARY_PATH} # How to run CANDLE models: SWIFT_IMPL="app" From 703b5e09ad4506fb5f451c867d77499b442c4f4a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Oct 2022 13:21:24 -0500 Subject: [PATCH 298/601] New langs-app-lambda --- workflows/common/sh/langs-app-lambda.sh | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 workflows/common/sh/langs-app-lambda.sh diff --git a/workflows/common/sh/langs-app-lambda.sh b/workflows/common/sh/langs-app-lambda.sh new file mode 100644 index 00000000..8133fff3 --- /dev/null +++ b/workflows/common/sh/langs-app-lambda.sh @@ -0,0 +1,27 @@ + +# LANGS APP LAMBDA + +echo "langs-app-lambda ..." + +SFW=/home/woz/Public/sfw + +PY=$SFW/Anaconda + +PATH=$PY/bin:$PATH + +echo "Programs:" +which python + +PYTHONPATH=${APP_PYTHONPATH:-}:${PYTHONPATH:-} + +# Cf. utils.sh +show PYTHONHOME +log_path LD_LIBRARY_PATH +log_path PYTHONPATH + +echo "APP_PYTHONPATH: ${APP_PYTHONPATH:-}" +echo "PYTHONPATH: $PYTHONPATH" +echo "PYTHONHOME: ${PYTHONHOME:-}" +export PYTHONPATH + +echo "langs-app-lambda done." 
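
Patches 295 and 296 above add the same resident-worker default to the cfg-sys files, and the arithmetic is easy to get wrong when PROCS changes. The sketch below only illustrates the rank layout those settings imply: the variable names (PROCS, PPN, TURBINE_RESIDENT_WORK_WORKERS, RESIDENT_WORK_RANKS) are the ones used by the cfg-sys scripts, but the checker script itself is hypothetical and not part of any patch.

#!/bin/bash
# Hypothetical sketch of the rank layout implied by the cfg-sys defaults:
# one rank for the Swift/T server, one resident task worker running the
# mlrMBO algorithm via EQ/R, and the remaining ranks as model workers.
set -eu

PROCS=${PROCS:-6}   # total MPI processes
PPN=${PPN:-6}       # processes per node

export TURBINE_RESIDENT_WORK_WORKERS=1
export RESIDENT_WORK_RANKS=$(( PROCS - 2 ))

# Ranks not used by the server or the resident worker run the models:
MODEL_WORKERS=$(( PROCS - 1 - TURBINE_RESIDENT_WORK_WORKERS ))

echo "PROCS=$PROCS PPN=$PPN"
echo "resident worker rank:  $RESIDENT_WORK_RANKS"
echo "concurrent model runs: $MODEL_WORKERS"

if (( MODEL_WORKERS < 1 ))
then
  echo "PROCS=$PROCS is too small for an mlrMBO run" >&2
  exit 1
fi

With the default PROCS=6 from cfg-sys-1.sh this reports rank 4 as the resident worker and four concurrent model runs.
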
From 6cac6e948d3a18771ecb3e31c7ea520376007b94 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Oct 2022 13:25:34 -0500 Subject: [PATCH 299/601] Handle empty LD_LIBRARY_PATH --- workflows/common/sh/env-lambda.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-lambda.sh b/workflows/common/sh/env-lambda.sh index e3951c64..1a60d65d 100644 --- a/workflows/common/sh/env-lambda.sh +++ b/workflows/common/sh/env-lambda.sh @@ -13,7 +13,7 @@ R=$SFW/R-4.1.0 PATH=$SWIFT/stc/bin:$PATH PATH=$PY/bin:$PATH -export LD_LIBRARY_PATH=$R/lib/R/lib:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=$R/lib/R/lib:${LD_LIBRARY_PATH:-} # How to run CANDLE models: SWIFT_IMPL="app" From f935e9d572400f689ca1e7edbd4b1d3ec3b878de Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Oct 2022 14:08:32 -0500 Subject: [PATCH 300/601] Do not auto-add Benchmarks/common to sys.path --- workflows/common/python/model_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 20646873..13e05d62 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -22,10 +22,11 @@ python_dir = os.getenv("MODEL_PYTHON_DIR") if python_dir: sys.path.append(python_dir) -benchmarks_root = os.getenv("BENCHMARKS_ROOT") -if benchmarks_root: - sys.path.append(benchmarks_root+'/common') +# This is for candle_lib, which is not in Benchmarks any more +# benchmarks_root = os.getenv("BENCHMARKS_ROOT") +# if benchmarks_root: +# sys.path.append(benchmarks_root+'/common') # Report PYTHONPATH for debugging print("sys.path:") From 724957e1558424ae7692cacf8bc94801d39d40c9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Oct 2022 14:08:53 -0500 Subject: [PATCH 301/601] Better configuration logging --- workflows/common/sh/langs-app-lambda.sh | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/workflows/common/sh/langs-app-lambda.sh b/workflows/common/sh/langs-app-lambda.sh index 8133fff3..82e0943b 100644 --- a/workflows/common/sh/langs-app-lambda.sh +++ b/workflows/common/sh/langs-app-lambda.sh @@ -12,16 +12,12 @@ PATH=$PY/bin:$PATH echo "Programs:" which python -PYTHONPATH=${APP_PYTHONPATH:-}:${PYTHONPATH:-} +export PYTHONPATH=${APP_PYTHONPATH:-}:${PYTHONPATH:-} # Cf. utils.sh -show PYTHONHOME -log_path LD_LIBRARY_PATH +log_path APP_PYTHONPATH log_path PYTHONPATH - -echo "APP_PYTHONPATH: ${APP_PYTHONPATH:-}" -echo "PYTHONPATH: $PYTHONPATH" -echo "PYTHONHOME: ${PYTHONHOME:-}" -export PYTHONPATH +log_path LD_LIBRARY_PATH +show PYTHONHOME echo "langs-app-lambda done." 
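The logging calls that close langs-app-lambda.sh above (`log_path`, `show`) come from workflows/common/sh/utils.sh, which is not included in this series, so their exact definitions may differ from the stand-ins below. A rough sketch of helpers with the behavior the script appears to rely on, printing a variable's value and a PATH-style variable one entry per line:

```bash
# Hypothetical stand-ins for the utils.sh helpers used in
# langs-app-lambda.sh; the real implementations may differ.

# show VAR... : print NAME=VALUE, tolerating unset variables
show()
{
  local v
  for v in "$@"
  do
    printf "%s=%s\n" "$v" "${!v:-}"
  done
}

# log_path VAR : print a colon-separated path variable, one entry per line
log_path()
{
  echo "$1:"
  echo "${!1:-}" | tr ':' '\n' | sed 's/^/  /'
}

# Example mirroring the calls at the end of langs-app-lambda.sh:
PYTHONPATH=/tmp/app:/tmp/common log_path PYTHONPATH
show PYTHONHOME
```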
From 32a6b60c2c2022187e2be13a6fdec004bedb837c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Oct 2022 14:09:19 -0500 Subject: [PATCH 302/601] Clean up --- workflows/mlrMBO/swift/workflow.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 0197485c..ce5290db 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -16,7 +16,6 @@ fi BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd ) export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Attn1:$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/examples/ADRP:$BENCHMARKS_ROOT/examples/xform-smiles -# :$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/Pilot3/P3B1:$BENCHMARKS_ROOT/Pilot3/P3B3:$BENCHMARKS_ROOT/Pilot3/P3B4 export BENCHMARK_TIMEOUT export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} From bf2377352fa97d07553668da4ad8c96c0d15775d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Oct 2022 14:09:35 -0500 Subject: [PATCH 303/601] Remove Benchmarks/common from PYTHONPATH, APP_PYTHONPATH --- workflows/mlrMBO/swift/workflow.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index ce5290db..92e88f41 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -57,9 +57,9 @@ fi echo "Running "$MODEL_NAME "workflow" # Set PYTHONPATH for BENCHMARK related stuff -PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common +PYTHONPATH+=:$BENCHMARK_DIR # :$BENCHMARKS_ROOT/common # This is now candle_lib # Set PYTHONPATH for BENCHMARK related stuff in obj_app mode -export APP_PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common +export APP_PYTHONPATH+=:$BENCHMARK_DIR # :$BENCHMARKS_ROOT/common # This is now candle_lib source_site env $SITE source_site sched $SITE From 202a1bad6674dc39b4e638582fa19c6a12459c21 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Oct 2022 14:11:05 -0500 Subject: [PATCH 304/601] Add check for CANDLE_DATA_DIR before submitting workflow --- workflows/mlrMBO/swift/workflow.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 92e88f41..26eb9f1d 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -160,6 +160,12 @@ else STDOUT="" fi +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + echo "CANDLE_DATA_DIR is not set in the environment! Exiting..." + exit 1 +fi + # ALW 2021-01-21: Please don't comment out the "-o $TURBINE_OUTPUT/workflow.tic" option below; otherwise, we get permissions issues on Biowulf. Thanks! set -x swift-t -O 0 -n $PROCS \ @@ -186,11 +192,11 @@ swift-t -O 0 -n $PROCS \ -e SH_TIMEOUT \ -e TURBINE_STDOUT \ -e IGNORE_ERRORS \ + -e CANDLE_DATA_DIR \ $WAIT_ARG \ $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ tee $STDOUT - if (( ${PIPESTATUS[0]} )) then echo "workflow.sh: swift-t exited with error!" 
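The CANDLE_DATA_DIR check added in the patch above guards a single variable, but the same idiom extends to any set of variables a workflow needs before submission. A small sketch of that generalization (the variable list is illustrative; at this point workflow.sh itself only tests CANDLE_DATA_DIR):

```bash
# Illustrative generalization of the pre-submission check; not part of
# the patch, which guards CANDLE_DATA_DIR only.
for var in CANDLE_DATA_DIR MODEL_NAME
do
  if [[ ${!var:-} == "" ]]
  then
    echo "$var is not set in the environment! Exiting..."
    exit 1
  fi
done
```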
From 042d6eeb24096f5df56ac7aaafcc284cec994db8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Nov 2022 14:10:05 -0500 Subject: [PATCH 305/601] New Swift/T for Lambda --- workflows/common/sh/env-lambda.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-lambda.sh b/workflows/common/sh/env-lambda.sh index 1a60d65d..6caf7ef1 100644 --- a/workflows/common/sh/env-lambda.sh +++ b/workflows/common/sh/env-lambda.sh @@ -5,7 +5,7 @@ # Everything is installed in here: SFW=/homes/woz/Public/sfw -SWIFT=$SFW/swift-t/2022-10-19 +SWIFT=$SFW/swift-t/2022-11-02 PY=$SFW/Anaconda EQR=$SFW/EQ-R R=$SFW/R-4.1.0 From 6738296b6a9e66515327115bc8e7d82d060d9c2a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 2 Nov 2022 14:11:03 -0500 Subject: [PATCH 306/601] Fix Bash quoting syntax --- workflows/common/sh/model.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index e43df732..d2a8c76b 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -83,11 +83,11 @@ PY_CMD=( "$WORKFLOWS_ROOT/common/python/model_runner.py" "$BENCHMARK_TIMEOUT" ) # The desired model command: -MODEL_CMD="python3 -u ${PY_CMD[@]}" +MODEL_CMD=( python3 -u "${PY_CMD[@]}" ) log "MODEL_CMD: ${MODEL_CMD[@]}" # Run Python! -if $TIMEOUT_CMD ${MODEL_CMD[@]} +if $TIMEOUT_CMD "${MODEL_CMD[@]}" then : # Assume success so we can keep a failed exit code else From f214869e4e0026f9b89f7919f854073b51ca31b3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 3 Nov 2022 11:08:19 -0500 Subject: [PATCH 307/601] Draft IMPROVE integration --- workflows/common/python/model_runner.py | 2 +- workflows/common/python/runner_utils.py | 15 +++++++-- workflows/common/sh/model.sh | 45 +++++++++++++++++++------ workflows/common/swift/obj_app.swift | 12 +++++-- 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 13e05d62..496b3c6a 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -131,7 +131,7 @@ def run(hyper_parameter_map, obj_return): logger.info('run(): START:') sys.stdout.flush() - directory = hyper_parameter_map['instance_directory'] + directory = hyper_parameter_map['instance_directory'] # should be output_dir os.chdir(directory) with open(directory + '/rank.txt', 'w') as fp: diff --git a/workflows/common/python/runner_utils.py b/workflows/common/python/runner_utils.py index 739a8d0b..1d05443e 100644 --- a/workflows/common/python/runner_utils.py +++ b/workflows/common/python/runner_utils.py @@ -3,9 +3,9 @@ from enum import Enum try: - basestring + basestring except NameError: - basestring = str + basestring = str DATA_TYPES = {type(np.float16): 'f16', type(np.float32): 'f32', type(np.float64): 'f64'} @@ -90,3 +90,14 @@ class ModelResult(Enum): SUCCESS = 1 SKIP = 2 ERROR = 3 + + +def main(): + # Need argparse + if sys.argv[1] == "write_params": + # Assume we are in the correct directory + write_params(sys.argv[2], {}) + + +if __name__ == "__main__": + main() diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index d2a8c76b..750d93ef 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -34,7 +34,13 @@ RUNID=$3 # Each model run, runs in its own "instance" directory # Set instance_directory to that and cd into it. 
-INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID +# TODO: rename INSTANCE_DIRECTORY to OUTPUT_DIR +if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] +then + INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/output/$EXPID/run/$RUNID +else # "BENCHMARKS" + INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID +fi # All stdout/stderr after this point goes into model.log ! mkdir -p $INSTANCE_DIRECTORY @@ -74,16 +80,35 @@ echo log "USING PYTHON:" $( which python ) echo -# The Python command line arguments: -PY_CMD=( "$WORKFLOWS_ROOT/common/python/model_runner.py" - "$PARAMS" - "$INSTANCE_DIRECTORY" - "$FRAMEWORK" - "$RUNID" - "$BENCHMARK_TIMEOUT" ) +# Construct the desired model command MODEL_CMD based on CANDLE_MODEL_TYPE: +if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] +then + # No model_runner, need to write parameters.txt explicitly: + python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params "$PARAMS" + MODEL_CMD=( singularity exec --nv $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET + $CANDLE_DATA_DIR $INSTANCE_DIRECTORY/parameters.txt ) + # train.sh must write $INSTANCE_DIRECTORY/result.txt ! + # or + # Suggest: + # grep "CANDLE_RESULT: " $INSTANCE_DIRECTORY/model.log + # grep "CANDLE_ERROR:" + RESULT=$( sed -n '/val_loss:/{s/val_loss: \(.*\)/\1/;p}' | tail -1 ) + log "found result: $RESULT" + echo $RESULT > $INSTANCE_DIRECTORY/result.txt +else # "BENCHMARKS" + + # The Python command line arguments: + PY_CMD=( "$WORKFLOWS_ROOT/common/python/model_runner.py" + "$PARAMS" + "$INSTANCE_DIRECTORY" + "$FRAMEWORK" + "$RUNID" + "$BENCHMARK_TIMEOUT" ) + + MODEL_CMD=( python3 -u "${PY_CMD[@]}" ) + # model_runner/runner_utils writes result.txt +fi -# The desired model command: -MODEL_CMD=( python3 -u "${PY_CMD[@]}" ) log "MODEL_CMD: ${MODEL_CMD[@]}" # Run Python! diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index fce1cf9a..f381e754 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/obj_app.swift @@ -12,8 +12,15 @@ string model_sh = getenv("MODEL_SH"); string turbine_output = getenv("TURBINE_OUTPUT"); - string outdir = "%s/run/%s" % (turbine_output, run_id); - // printf("running model shell script in: %s", outdir); + string outdir; + if (getenv("CANDLE_MODEL_TYPE") == "SINGULARITY") { + outdir = "%s/run/%s" % (turbine_output, run_id); + } else { + outdir = "%s/output/%s/run/%s" % (getenv("CANDLE_DATA_DIR"), getenv("EXPID"), run_id); + } + + printf("running model shell script in: %s", outdir); + // We do not use a file type here because this file may not be created, // which is handled by get_results() string result_file = outdir/"result.txt"; @@ -35,7 +42,6 @@ string model_sh = getenv("MODEL_SH"); string turbine_output = getenv("TURBINE_OUTPUT"); - string outdir = "%s/run/%s" % (turbine_output, run_id); // printf("running model shell script in: %s", outdir); // We do not use a file type here because this file may not be created, // which is handled by get_results() From 57943d40ff6474de8841ea425339b91cf307e412 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 3 Nov 2022 15:52:26 -0700 Subject: [PATCH 308/601] o Fixes to initial draft and moving towards a test case on lambda0 o Need to agree on conventions for loading the initial hyperparameters file on Supervisor and creating and parsing for results for value to be optimized --- workflows/common/python/runner_utils.py | 5 +- workflows/common/sh/model.sh | 15 +++-- workflows/common/swift/obj_app.swift | 4 +- workflows/mlrMBO/swift/workflow.sh | 8 ++- workflows/mlrMBO/swift/workflow.swift | 4 ++ 
workflows/mlrMBO/test/test-nightly.sh | 2 + workflows/mlrMBO/test/test-singularity.sh | 70 +++++++++++++++++++++++ 7 files changed, 99 insertions(+), 9 deletions(-) create mode 100755 workflows/mlrMBO/test/test-singularity.sh diff --git a/workflows/common/python/runner_utils.py b/workflows/common/python/runner_utils.py index 1d05443e..bd587082 100644 --- a/workflows/common/python/runner_utils.py +++ b/workflows/common/python/runner_utils.py @@ -1,5 +1,5 @@ import numpy as np -import json, os +import json, os, sys from enum import Enum try: @@ -95,8 +95,9 @@ class ModelResult(Enum): def main(): # Need argparse if sys.argv[1] == "write_params": + hyper_parameter_map = json.loads(sys.argv[3]) # Assume we are in the correct directory - write_params(sys.argv[2], {}) + write_params(sys.argv[2], hyper_parameter_map) if __name__ == "__main__": diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 750d93ef..8f137dbe 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -63,6 +63,7 @@ log() log "START" log "MODEL_NAME: $MODEL_NAME" log "RUNID: $RUNID" +# log "CANDLE_MODEL_TYPE: $CANDLE_MODEL_TYPE" # Source langs-app-{SITE} from workflow/common/sh/ (cf. utils.sh) if [[ ${WORKFLOWS_ROOT:-} == "" ]] @@ -83,18 +84,24 @@ echo # Construct the desired model command MODEL_CMD based on CANDLE_MODEL_TYPE: if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] then + # No model_runner, need to write parameters.txt explicitly: - python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params "$PARAMS" + # get hyper_parameter_map to pass as 2nd argument + + python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params $PARAMS $INIT_PARAMS_FILE MODEL_CMD=( singularity exec --nv $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET $CANDLE_DATA_DIR $INSTANCE_DIRECTORY/parameters.txt ) # train.sh must write $INSTANCE_DIRECTORY/result.txt ! 
# or # Suggest: + + # Uncomment later # grep "CANDLE_RESULT: " $INSTANCE_DIRECTORY/model.log # grep "CANDLE_ERROR:" - RESULT=$( sed -n '/val_loss:/{s/val_loss: \(.*\)/\1/;p}' | tail -1 ) - log "found result: $RESULT" - echo $RESULT > $INSTANCE_DIRECTORY/result.txt + # RESULT=$( sed -n '/val_loss:/{s/val_loss: \(.*\)/\1/;p}' | tail -1 ) + # log "found result: $RESULT" + # echo $RESULT > $INSTANCE_DIRECTORY/result.txt + echo $MODEL_CMD else # "BENCHMARKS" # The Python command line arguments: diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index f381e754..66a4a40b 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/obj_app.swift @@ -13,7 +13,8 @@ string turbine_output = getenv("TURBINE_OUTPUT"); string outdir; - if (getenv("CANDLE_MODEL_TYPE") == "SINGULARITY") { + string myenv = getenv("CANDLE_MODEL_TYPE"); + if (myenv == "SINGULARITY") { outdir = "%s/run/%s" % (turbine_output, run_id); } else { outdir = "%s/output/%s/run/%s" % (getenv("CANDLE_DATA_DIR"), getenv("EXPID"), run_id); @@ -45,6 +46,7 @@ // printf("running model shell script in: %s", outdir); // We do not use a file type here because this file may not be created, // which is handled by get_results() + string outdir = "%s/run/%s" % (turbine_output, run_id); string result_file = outdir/"result.txt"; wait (@prio=prio run_model(model_sh, params, run_id)) { diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 26eb9f1d..f3fb4dbb 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -36,7 +36,7 @@ usage() echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" } -if (( ${#} != 5 )) +if (( ${#} != 7 )) then usage exit 1 @@ -48,13 +48,15 @@ if ! 
{ get_cfg_sys $3 get_cfg_prm $4 MODEL_NAME=$5 + CANDLE_MODEL_TYPE=$6 + CANDLE_IMAGE=$7 } then usage exit 1 fi -echo "Running "$MODEL_NAME "workflow" +echo "Running "$MODEL_NAME "workflow with" $CANDLE_MODEL_TYPE "and image" $CANDLE_IMAGE # Set PYTHONPATH for BENCHMARK related stuff PYTHONPATH+=:$BENCHMARK_DIR # :$BENCHMARKS_ROOT/common # This is now candle_lib @@ -187,6 +189,8 @@ swift-t -O 0 -n $PROCS \ -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ -e MODEL_SH \ -e MODEL_NAME \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ -e SITE \ -e BENCHMARK_TIMEOUT \ -e SH_TIMEOUT \ diff --git a/workflows/mlrMBO/swift/workflow.swift b/workflows/mlrMBO/swift/workflow.swift index 43d264d3..fb211a91 100644 --- a/workflows/mlrMBO/swift/workflow.swift +++ b/workflows/mlrMBO/swift/workflow.swift @@ -35,6 +35,10 @@ string restart_file = argv("restart_file", "DISABLED"); string r_file = argv("r_file", "mlrMBO1.R"); string model_name = getenv("MODEL_NAME"); +string candle_model_type = getenv("CANDLE_MODEL_TYPE"); +string candle_image = getenv("CANDLE_IMAGE"); +string init_params_file = getenv("INIT_PARAMS_FILE"); + printf("CANDLE mlrMBO Workflow"); printf("TURBINE_OUTPUT: " + turbine_output); diff --git a/workflows/mlrMBO/test/test-nightly.sh b/workflows/mlrMBO/test/test-nightly.sh index f6b7b069..378bf8a9 100755 --- a/workflows/mlrMBO/test/test-nightly.sh +++ b/workflows/mlrMBO/test/test-nightly.sh @@ -47,6 +47,8 @@ then export WAIT=1 fi +export CANDLE_MODEL_TYPE="Benchmarks" + # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME diff --git a/workflows/mlrMBO/test/test-singularity.sh b/workflows/mlrMBO/test/test-singularity.sh new file mode 100755 index 00000000..585679a4 --- /dev/null +++ b/workflows/mlrMBO/test/test-singularity.sh @@ -0,0 +1,70 @@ +#!/bin/bash +set -eu + +# MLRMBO TEST NIGHTLY + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-nightly.sh +export CFG_PRM=$THIS/cfg-prm-nightly.sh + +# Specify the R file for This file must be present in the $EMEWS_PROJECT_ROOT/R +export R_FILE=mlrMBO-mbo.R + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE="/software/improve/images/GraphDRP:0.0.1-20221028" +export INIT_PARAMS_FILE="/tmp/test_graphdrp_apartin/graphdrp_default_model.txt" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From f84f9bca99c7d9cdc2b673902b0f0ef09120b228 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 4 Nov 2022 14:26:53 -0500 Subject: [PATCH 309/601] WIP on merging hyperparameters --- workflows/common/python/runner_utils.py | 33 ++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/workflows/common/python/runner_utils.py b/workflows/common/python/runner_utils.py index bd587082..c2a1ba75 100644 --- a/workflows/common/python/runner_utils.py +++ b/workflows/common/python/runner_utils.py @@ -1,3 +1,4 @@ +import configparser import numpy as np import json, os, sys from enum import Enum @@ -92,12 +93,38 @@ class ModelResult(Enum): ERROR = 3 +def read_config_file_dict(file: str) -> dict: + result = {} + config = configparser.ConfigParser() + config.read(file) + + for section in config.sections(): + for k, v in config.items(section): + result[k] = v + return result + + +def merge_params(defaults, params): + result = defaults.copy() + for k, v in params.items(): + print("merge_params(): set " + str(k) + ' = ' + str(v)) + result[k] = v + return result + + def main(): # Need argparse + print("runner_utils.main(): " + str(sys.argv)) if sys.argv[1] == "write_params": - hyper_parameter_map = json.loads(sys.argv[3]) - # Assume we are in the correct directory - write_params(sys.argv[2], hyper_parameter_map) + # Merge params from the user-provided params file and + # the workflow-generated parameters + # Assume we are in the correct directory for this file: + defaults = read_config_file_dict(sys.argv[3]) + # Parse the workflow-provided JSON string: + J = json.loads(sys.argv[2]) + params = merge_params(defaults, J) + print("params: " + str(params)) + write_params(params, {}) if __name__ == "__main__": From cb4396ab39cd860b59505fc5ebbeba1d30367b12 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 4 Nov 2022 14:38:54 -0500 Subject: [PATCH 310/601] Parse config file data types --- workflows/common/python/runner_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/python/runner_utils.py b/workflows/common/python/runner_utils.py index c2a1ba75..4d3f2f02 100644 --- a/workflows/common/python/runner_utils.py +++ b/workflows/common/python/runner_utils.py @@ -100,7 +100,7 @@ def read_config_file_dict(file: str) -> dict: for section in config.sections(): for k, v in config.items(section): - result[k] = v + 
result[k] = eval(v) return result From e6e95d509e353eb0b40350e0f7c118f028b16b25 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 4 Nov 2022 14:42:31 -0500 Subject: [PATCH 311/601] New empty sched-lambda --- workflows/common/sh/sched-lambda.sh | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 workflows/common/sh/sched-lambda.sh diff --git a/workflows/common/sh/sched-lambda.sh b/workflows/common/sh/sched-lambda.sh new file mode 100644 index 00000000..779ed610 --- /dev/null +++ b/workflows/common/sh/sched-lambda.sh @@ -0,0 +1,4 @@ + +# SCHED LAMBDA + +# Empty- Lambda uses normal unscheduled mpiexec execution in Swift/T From b7406527f69fa4e58da5c2180d861cb483b71a33 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Sun, 6 Nov 2022 07:33:06 -0800 Subject: [PATCH 312/601] o Singularity runs work on lambda0 with graphdrp container. o Need to formalize a way to get results out of the container. o Added pre-commit changes to all files. o Also, actions that check for pre-commit with every commit. o Before commiting changes please run " pre-commit run --all-files " --- .github/workflows/pre-commit.yml | 15 + .pre-commit-config.yaml | 22 + archives/py-loc/p.swift | 6 +- archives/templates/README.md | 22 +- .../language_agnostic/submit_candle_job.sh | 2 +- .../language_agnostic/train_model.py | 8 +- archives/templates/model_params/mnist1.txt | 2 +- archives/templates/model_params/uno1.txt | 2 +- archives/templates/models/mnist/mnist.py | 12 +- archives/templates/models/mnist/mnist_mlp.py | 75 +- archives/templates/models/resnet.py | 476 ++-- archives/templates/models/unet.py | 511 ++-- archives/templates/models/uno.py | 406 +-- .../models/wrapper_compliant/mnist_mlp.py | 78 +- archives/templates/run_without_candle.sh | 2 +- .../scripts/candle_compliant_wrapper.py | 66 +- .../templates/scripts/copy_candle_template | 2 +- .../scripts/copy_candle_template-new | 2 +- archives/templates/scripts/model_wrapper.sh | 2 +- archives/templates/scripts/restart.py | 129 +- .../templates/scripts/run_without_candle.sh | 2 +- archives/templates/scripts/run_workflows.sh | 2 +- .../templates/scripts/submit_candle_job.sh | 2 +- .../templates/scripts/wrapper_connector.py | 28 +- .../templates/workflow_settings/mlrmbo1.sh | 2 +- .../workflow_settings/upf-default.txt | 2 +- archives/templates/workflow_settings/upf1.txt | 2 +- archives/templates/workflow_settings/upf3.txt | 2 +- archives/workflows/auen41_ff/auen41_ff.py | 46 +- archives/workflows/p1b1_hyperopt/Readme.md | 78 +- .../workflows/p1b1_hyperopt/data/.gitignore | 2 +- .../p1b1_hyperopt/ext/EQ-Py/EQPy.swift | 8 +- .../workflows/p1b1_hyperopt/ext/EQ-Py/eqpy.py | 5 +- .../p1b1_hyperopt/swift/cori_settings.sh | 2 - .../simple_hyperopt_example/Readme.md | 26 +- .../simple_hyperopt_example/ext/EQ-Py/eqpy.py | 5 +- .../swift/cori_settings.sh | 2 - .../swift/simple_workflow.sh | 1 - .../simple_mlrMBO_example/R/mlrMBO_utils.R | 2 +- .../R/test/mlrMBO_utils_tests.R | 27 +- .../R/test/simple_mlrMBO_run_test.R | 2 +- .../simple_mlrMBO_example/R/test/test_utils.R | 4 +- .../workflows/simple_mlrMBO_example/README.md | 63 +- .../ext/EQ-R/eqr/BlockingQueue.h | 2 +- .../ext/EQ-R/eqr/settings.mk | 4 +- .../workflows/simple_uq/python/permute.py | 19 +- .../simple_uq/python/test-permute.py | 11 +- archives/workflows/simple_uq/swift/junk.py | 1 - .../workflows/simple_uq/swift/obj_func.py | 15 +- docs/format.css | 1 - docs/home.html | 1927 +++++++------ docs/summit.txt | 2 +- docs/user_guide.adoc | 5 +- docs/user_guide.html | 2406 ++++++++++------- python/eqpy/eqpy.py | 4 
+- python/hyperopt/Readme.md | 52 +- .../hyperopt/eqpy_hyperopt/hyperopt_runner.py | 57 +- python/hyperopt/tests/test_hyperopt.py | 76 +- scratch/csv2f64/f64_2csv.c | 7 +- scratch/csv2f64/f64_2hdf.py | 19 +- scratch/csv2f64/hdf2f64.py | 16 +- scratch/csv2f64/inject-noise.py | 19 +- scratch/csv2f64/test/data-4x3.csv | 2 - scratch/csv2f64/test/data-5x3.csv | 1 - scratch/csv2f64/test/err-4x3-1.csv | 2 - scratch/csv2f64/test/err-4x3-2.csv | 2 - scratch/fake-lbann/test_1.py | 1 + scratch/histawk/hist.awk | 2 +- scratch/horovod/horovod-1.py | 2 +- scratch/horovod/test-2.swift | 2 +- scratch/horovod2/test-2.C | 3 +- scratch/horovod2/test-5-1.py | 13 +- scratch/indices/i1.swift | 3 +- scratch/launch-opts/README.adoc | 4 +- scratch/load/load.py | 8 +- scratch/py-eval/py/err.py | 2 +- scratch/py-eval/py/import-stringio.py | 2 + scratch/py-eval/py/numpy-array.py | 1 - scratch/py-eval/py/numpy-print-A.py | 2 - scratch/resizer/resize.py | 45 +- scratch/swift-tests/fake-model.py | 1 - scripts/shrink-log.py | 24 +- spack/spack.yaml | 33 +- workflows/GA/README.md | 170 +- workflows/GA/data/adrp_param_space_ga.json | 1 - workflows/GA/data/combo_param_space_ga.json | 48 +- workflows/GA/data/nt3_param_space_ga.json | 133 +- workflows/GA/data/p1b1_param_space_ga.json | 39 +- workflows/GA/data/tc1_param_space_ga.json | 13 +- workflows/GA/swift/workflow.sh | 1 - workflows/GA/test/cfg-prm-1.sh | 4 +- workflows/GA/test/cfg-prm-summit.sh | 4 +- workflows/GA/test/test-1.sh | 1 - workflows/async-horovod/Problem.py | 21 +- workflows/async-horovod/Task.py | 23 +- workflows/async-horovod/main.py | 125 +- workflows/async-horovod/uno-1.json | 6 +- workflows/async-horovod/utils.py | 24 +- workflows/async-local/Problem.py | 17 +- workflows/async-local/Task.py | 8 +- workflows/async-local/main.py | 45 +- workflows/async-local/utils.py | 12 +- workflows/async-search/README.md | 43 +- workflows/async-search/python/as_problem.py | 34 +- .../async-search/python/as_problem_tc1.py | 18 +- workflows/async-search/python/async-search.py | 89 +- workflows/async-search/python/utils.py | 16 +- workflows/async-search/swift/workflow.sh | 2 +- workflows/common/R/mlrMBO-default.R | 16 +- workflows/common/R/mlrMBO-ls1.R | 38 +- workflows/common/R/mlrMBO-ls2.R | 38 +- workflows/common/R/mlrMBO-ls3.R | 38 +- workflows/common/R/mlrMBO-mbo.R | 18 +- workflows/common/R/mlrMBO-rs.R | 18 +- workflows/common/R/mlrMBO1.R | 4 +- workflows/common/R/mlrMBO2.R | 24 +- workflows/common/R/mlrMBO2a.R | 20 +- workflows/common/R/mlrMBO_km.R | 10 +- workflows/common/R/test/ils-test.R | 72 +- .../R/test/learner-discrete-param-bug.R | 30 +- .../common/R/test/mlrMBOMixedIntegerTest11a.R | 19 +- workflows/common/R/test/test_utils.R | 4 +- workflows/common/db/candle_sql.py | 47 +- workflows/common/ext/EQ-Py/eqpy.py | 13 +- workflows/common/ext/EQ-R/eqr/BlockingQueue.h | 2 +- workflows/common/python/deap_ga.py | 110 +- .../common/python/dummy_baseline_keras2.py | 12 +- workflows/common/python/ga_utils.py | 83 +- workflows/common/python/log_runner.py | 21 +- workflows/common/python/log_tools.py | 13 +- .../common/python/model_abstention_runner.py | 165 +- workflows/common/python/model_runner.py | 242 +- workflows/common/python/runner_utils.py | 55 +- workflows/common/python/utils.py | 41 +- workflows/common/sh/env-summit-i.sh | 1 - workflows/common/sh/langs-app-local.sh | 1 - workflows/common/sh/model.sh | 16 +- workflows/common/sh/run_logger.sh | 2 +- workflows/common/sh/sched-local-as.sh | 2 +- workflows/common/sh/sched-local.sh | 2 +- 
workflows/common/sh/sched-summit-tf2.sh | 1 - workflows/common/swift/candle_utils.swift | 2 +- .../common/swift/obj_abstention_py.swift | 4 +- workflows/common/swift/obj_app.swift | 16 +- workflows/cp-leaveout/README-chained.md | 106 +- workflows/cp-leaveout/db/README.adoc | 1 - workflows/cp-leaveout/db/diff-dbs.sh | 1 - workflows/cp-leaveout/db/reset-node.sh | 2 - workflows/cp-leaveout/py/README.md | 39 +- workflows/cp-leaveout/py/data_setup.py | 49 +- workflows/cp-leaveout/py/planargs.py | 147 +- workflows/cp-leaveout/py/plangen.py | 615 +++-- workflows/cp-leaveout/py/run_chained.py | 289 +- workflows/cp-leaveout/py/tests/.gitignore | 2 +- .../cp-leaveout/py/tests/test_run_chained.py | 54 +- workflows/cp-leaveout/scripts/Node.py | 153 +- workflows/cp-leaveout/scripts/avg-stage.py | 24 +- workflows/cp-leaveout/scripts/avg-utils.py | 1 - workflows/cp-leaveout/scripts/check-db-pkl.py | 19 +- .../cp-leaveout/scripts/compare-errors.py | 24 +- .../cp-leaveout/scripts/compare-losses.py | 12 +- .../cp-leaveout/scripts/compute-node-count.py | 3 +- workflows/cp-leaveout/scripts/data-size.py | 16 +- .../cp-leaveout/scripts/describe-node.py | 7 +- workflows/cp-leaveout/scripts/epoch-time.py | 25 +- .../scripts/extract-holdout-errors.sh | 2 +- .../scripts/extract-holdout-errors.test | 1 - .../cp-leaveout/scripts/extract-node-info.py | 22 +- .../scripts/find-loss-increases.py | 147 +- workflows/cp-leaveout/scripts/leaf-stats.py | 77 +- .../cp-leaveout/scripts/list-node-singles.py | 5 +- workflows/cp-leaveout/scripts/list-nodes.py | 7 +- .../cp-leaveout/scripts/loss-histogram.py | 1 - workflows/cp-leaveout/scripts/node-times.py | 19 +- .../scripts/plot-holdout-errors.py | 26 +- .../cp-leaveout/scripts/plot_io_times.py | 23 +- .../cp-leaveout/scripts/print-node-info.py | 18 +- .../cp-leaveout/scripts/report_leaves.py | 16 +- .../cp-leaveout/scripts/report_stopping.py | 19 +- workflows/cp-leaveout/scripts/stage-avg.py | 11 +- workflows/cp-leaveout/scripts/time-nvm.data | 1 - .../cp-leaveout/scripts/workflow-stats.py | 68 +- .../cp-leaveout/swift/cpl-upf-workflow.sh | 2 +- .../cp-leaveout/swift/cpl-upf-workflow.swift | 4 +- .../cp-leaveout/test-chained/cfg-stage-sys.sh | 9 +- workflows/cp-leaveout/test-chained/cfg.json | 50 +- workflows/cp-leaveout/test/test-1.sh | 2 +- .../cp-leaveout/test/test-numpy-delete.py | 3 +- workflows/cp1/README.adoc | 90 +- .../cp1/data/upf_use_exported_no_nci.txt | 1 - workflows/cp1/db/db-hpo-init.py | 13 +- workflows/cp1/db/db-hpo-list.py | 49 +- workflows/cp1/db/db-hpo-setup.py | 48 +- workflows/cp1/db/hpo-defns-1.yaml | 20 +- workflows/cp1/nested_me_ex/.gitignore | 2 +- workflows/cp1/nested_me_ex/README.md | 87 +- workflows/cp1/nested_me_ex/ext/EQ-Py/eqpy.py | 20 +- workflows/cp1/nested_me_ex/python/me1.py | 16 +- workflows/cp1/nested_me_ex/python/me2.py | 27 +- .../cp1/nested_me_ex/python/task_cache.py | 28 +- workflows/cp1/nested_me_ex/swift/workflow.sh | 2 +- .../cp1/nested_me_ex/swift/workflow.swift | 10 +- workflows/cp1/scripts/parse_infer_results.py | 99 +- workflows/cp1/scripts/parse_start_stop.py | 46 +- workflows/cp1/scripts/parse_start_stop_upf.py | 70 +- workflows/cp1/scripts/plots.R | 14 +- workflows/cp1/sh/infer.sh | 2 +- workflows/cp1/swift/infer_workflow.swift | 3 +- workflows/cp1/swift/nci_workflow.swift | 10 +- workflows/cp1/swift/upf_workflow.sh | 2 +- workflows/cp1/swift/upf_workflow.swift | 3 +- workflows/cp1/swift/workflow.swift | 8 +- workflows/cp1/test/cfg-prm-1.sh | 2 +- workflows/cp1/test/cfg-sys-1.sh | 2 +- workflows/cp1/test/cfg-sys-3.sh | 2 +- 
workflows/cp1/test/create-new-test.sh | 3 - workflows/cp1/test_infer/cfg-prm-1.sh | 2 - workflows/cp1/test_infer/cfg-prm-250.sh | 1 - workflows/cp1/test_upf/cfg-prm-1.sh | 1 - workflows/grid/README.md | 12 +- workflows/grid/data/settings.json | 17 +- workflows/grid/python/computeStats.py | 30 +- workflows/grid/python/determineParameters.py | 51 +- workflows/grid/python/evaluateOne.py | 62 +- workflows/grid/python/p1b1_runner.py | 29 +- workflows/grid/swift/grid-sweep.swift | 1 - workflows/grid/swift/workflow.sh | 2 +- workflows/grid/test/cfg-prm-1.sh | 1 - workflows/grid/test/cfg-sys-1.sh | 1 - workflows/mlrMBO/.gitignore | 1 - workflows/mlrMBO/README.md | 57 +- workflows/mlrMBO/data/adrp_nightly.R | 1 - workflows/mlrMBO/data/combo_hps_exp_01.R | 17 +- workflows/mlrMBO/data/combo_nightly.R | 12 +- workflows/mlrMBO/data/p1b1_hps_exp_01.R | 24 +- workflows/mlrMBO/data/p1b1_nightly.R | 24 +- workflows/mlrMBO/swift/workflow.sh | 20 +- .../test-1000-01-mbo/cfg-prm-restart.sh | 6 +- .../mlrMBO/test-1000-01-mbo/cfg-sys-1.sh | 2 - .../mlrMBO/test-1000-01-mbo/test-restart.sh | 4 +- .../mlrMBO/test-1000-01-rs/cfg-prm-restart.sh | 6 +- workflows/mlrMBO/test-1000-01-rs/cfg-sys-1.sh | 2 - .../mlrMBO/test-1000-01-rs/test-restart.sh | 4 +- workflows/mlrMBO/test/cfg-prm-30.sh | 4 +- workflows/mlrMBO/test/cfg-prm-restart.sh | 6 +- workflows/mlrMBO/test/cfg-sys-30.sh | 1 - workflows/mlrMBO/test/create-new-test.sh | 3 - workflows/mlrMBO/test/restart-combo.csv | 1 - workflows/mlrMBO/test/test-30.sh | 4 +- workflows/mlrMBO/test/test-restart.sh | 4 +- workflows/one-shot/load.py | 5 +- workflows/pbt/Readme.md | 179 +- workflows/pbt/data/tc1_params_full.json | 13 +- workflows/pbt/data/tc1_params_small.json | 13 +- workflows/pbt/models/mnist/mnist_cnn.py | 63 +- .../pbt/models/tc1/tc1_baseline_keras2.py | 338 ++- workflows/pbt/models/tc1/tc1_runner.py | 58 +- workflows/pbt/python/file_test.py | 19 +- workflows/pbt/python/pbt.py | 234 +- workflows/pbt/python/pbt_utils.py | 196 +- workflows/pbt/python/tc1_pbt.py | 70 +- workflows/pbt/python/tc1_pbt_ds.py | 85 +- workflows/pbt/python/test/pbt_tests.py | 31 +- workflows/pbt/scripts/common.m4 | 2 +- workflows/pbt/scripts/local.cfg | 2 +- workflows/pbt/scripts/local_submit.cfg | 2 +- workflows/pbt/scripts/local_submit.m4 | 2 +- workflows/pbt/scripts/pbt_run.sh | 1 - workflows/pbt/scripts/theta.cfg | 2 +- workflows/pbt/scripts/theta_submit.cfg | 2 +- workflows/pbt/scripts/theta_submit.m4 | 2 +- workflows/pbt/scripts/titan.cfg | 2 +- workflows/pbt/scripts/titan_submit.cfg | 2 +- workflows/pbt/src/Readme.md | 2 +- workflows/random/README.md | 19 +- workflows/random/data/settings.json | 24 +- workflows/random/python/computeStats.py | 29 +- .../random/python/determineParameters.py | 70 +- workflows/random/python/evaluateOne.py | 62 +- workflows/random/python/p1b1_runner.py | 29 +- workflows/random/python/test/run_test_p1b1.sh | 2 +- workflows/random/python/test/test_p1b1.py | 15 +- workflows/random/swift/cooley_workflow.sh | 3 +- workflows/random/swift/workflow.sh | 4 +- workflows/random/test/cfg-prm-1.sh | 1 - workflows/random/test/cfg-sys-1.sh | 1 - workflows/random/test/test.sh | 1 - workflows/test-horovod/make.sh | 2 +- workflows/test-horovod/test.py | 4 +- workflows/upf/README.md | 51 +- workflows/upf/test/upf-infer-orig.txt | 1 - .../uq-noise/scripts/plot-extract-logs.py | 44 +- workflows/uq-noise/scripts/plot-extract.py | 16 +- .../uq-noise/swift/workflow-abstention.sh | 3 +- .../uq-noise/swift/workflow-abstention.swift | 1 - .../uq-noise/swift/workflow-gauss-abs.sh | 
3 +- .../uq-noise/swift/workflow-gauss-abs.swift | 2 - workflows/uq-noise/swift/workflow-gnoise.sh | 1 - .../uq-noise/swift/workflow-gnoise.swift | 1 - workflows/uq-noise/swift/workflow-noise.sh | 1 - workflows/uq-noise/swift/workflow-noise.swift | 1 - workflows/uq-noise/swift/workflow.sh | 1 - workflows/uq-noise/swift/workflow.swift | 1 - workflows/uq-noise/swift/xy_workflow.swift | 1 - .../xcorr/CandleFeatureSelectionFunction.py | 82 +- workflows/xcorr/CandlePilotWorkflow.py | 41 +- workflows/xcorr/README.adoc | 22 +- workflows/xcorr/db-init.py | 54 +- workflows/xcorr/db-insert-junk.py | 24 +- workflows/xcorr/list-records.py | 14 +- workflows/xcorr/make-fake-data.py | 29 +- workflows/xcorr/record.py | 8 +- workflows/xcorr/tests/uno_xcorr_tests.py | 30 +- workflows/xcorr/uno_xcorr.py | 79 +- workflows/xcorr/xcorr.py | 43 +- workflows/xcorr/xcorr_db.py | 115 +- 316 files changed, 8297 insertions(+), 6227 deletions(-) create mode 100644 .github/workflows/pre-commit.yml create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..bc4bfa46 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,15 @@ +name: pre-commit + +on: + pull_request: + push: + branches: + - master + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4.3.0 + - uses: pre-commit/action@v3.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..81743682 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: +- repo: https://github.com/pre-commit/mirrors-yapf # To format the code to conform YAPF + rev: v0.31.0 + hooks: + - id: yapf + args: ['--in-place', '--recursive', '--style', 'google'] + +- repo: https://github.com/myint/docformatter # To format the doc strings to conform PEP257 + rev: v1.4 + hooks: + - id: docformatter + args: [--in-place] + +- repo: https://github.com/pre-commit/pre-commit-hooks # Some common pre-commit hooks + rev: v3.4.0 + hooks: + - id: check-yaml # Checks the syntax of .yaml files. + args: [--allow-multiple-documents] + exclude: 'meta.yaml' # Exclude this because it gives an error for '%' in Line 1 and couldn't fix yet + - id: end-of-file-fixer # Makes sure files end with a newline. + - id: trailing-whitespace # Checks for any tabs or spaces after the last non-whitespace character on the line. + - id: check-docstring-first # Checks that code comes after the docstrings. diff --git a/archives/py-loc/p.swift b/archives/py-loc/p.swift index c6803a9e..67255e9e 100644 --- a/archives/py-loc/p.swift +++ b/archives/py-loc/p.swift @@ -5,11 +5,11 @@ import location; L0 = locationFromRank(0); L1 = locationFromRank(1); - + @location=L0 python_persist("L = []"); @location=L1 python_persist("L = []"); -string D[]; -foreach j in [0:9] { +string D[]; +foreach j in [0:9] { L = locationFromRank(j%%2); D[j] = @location=L python_persist("L.append(repr(2+%i)) " % j); } diff --git a/archives/templates/README.md b/archives/templates/README.md index fc271770..74b1a745 100644 --- a/archives/templates/README.md +++ b/archives/templates/README.md @@ -16,17 +16,17 @@ In more detail, here are the steps required for running an arbitrary workflow on 1. Ensure the `$SITE` and `$CANDLE` variables are exported to the environment as specified [here](#CANDLE-settings-at-different-SITEs). 1. Copy the submission script `$CANDLE/Supervisor/templates/submit_candle_job.sh` to a working directory. 1. 
Specify the model in the submission script: - 1. Set the `$MODEL_PYTHON_SCRIPT` variable to one of the models in the `$CANDLE/Supervisor/templates/models` directory (currently either "resnet", "unet", "uno", or "mnist_mlp"). Or, specify your own [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) Python model by setting both the `$MODEL_PYTHON_DIR` and `$MODEL_PYTHON_SCRIPT` variables as appropriate. - 1. Specify the corresponding default model parameters by setting the `$DEFAULT_PARAMS_FILE` variable to one of the files in the `$CANDLE/Supervisor/templates/model_params` directory. Or, copy one of these template files to the working directory, modify it accordingly, and point the `$DEFAULT_PARAMS_FILE` variable to this file. + 1. Set the `$MODEL_PYTHON_SCRIPT` variable to one of the models in the `$CANDLE/Supervisor/templates/models` directory (currently either "resnet", "unet", "uno", or "mnist_mlp"). Or, specify your own [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) Python model by setting both the `$MODEL_PYTHON_DIR` and `$MODEL_PYTHON_SCRIPT` variables as appropriate. + 1. Specify the corresponding default model parameters by setting the `$DEFAULT_PARAMS_FILE` variable to one of the files in the `$CANDLE/Supervisor/templates/model_params` directory. Or, copy one of these template files to the working directory, modify it accordingly, and point the `$DEFAULT_PARAMS_FILE` variable to this file. 1. Specify the workflow in the submission script: 1. Set the `$WORKFLOW_TYPE` variable as appropriate (currently supported are "upf", and, to a less-tested extent, "mlrMBO"). - 1. Specify the corresponding workflow settings by setting the `$WORKFLOW_SETTINGS_FILE` variable to one of the files in the `$CANDLE/Supervisor/templates/workflow_settings` directory. Or, copy one of these template files to the working directory, modify it accordingly, and point the `$WORKFLOW_SETTINGS_FILE` variable to this file. + 1. Specify the corresponding workflow settings by setting the `$WORKFLOW_SETTINGS_FILE` variable to one of the files in the `$CANDLE/Supervisor/templates/workflow_settings` directory. Or, copy one of these template files to the working directory, modify it accordingly, and point the `$WORKFLOW_SETTINGS_FILE` variable to this file. 1. Adjust any other variables in the submission script such as the output directory (specified by `$EXPERIMENTS`), the scheduler settings, etc. 1. Run the script from a submit node like `./submit_candle_job.sh`. ## Background -In general, it would be nice to allow for an arbitrary model (U-Net, ResNet, etc.) to be run using an arbitrary workflow (UPF, mlrMBO, etc.), all in an external working directory. For example, here is a sample submission script: +In general, it would be nice to allow for an arbitrary model (U-Net, ResNet, etc.) to be run using an arbitrary workflow (UPF, mlrMBO, etc.), all in an external working directory. 
For example, here is a sample submission script: ```bash #!/bin/bash @@ -60,13 +60,13 @@ export WORKFLOW_SETTINGS_FILE="/home/weismanal/notebook/2019-02-28/unet/upf1.txt $CANDLE/Supervisor/workflows/$WORKFLOW_TYPE/swift/workflow.sh $SITE -a $CANDLE/Supervisor/workflows/common/sh/cfg-sys-$SITE.sh $WORKFLOW_SETTINGS_FILE ``` -When this script is run (no arguments accepted) on a Biowulf submit node, the necessarily [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) file `$MODEL_PYTHON_DIR/$MODEL_PYTHON_SCRIPT.py` will be run using the default parameters specified in `$DEFAULT_PARAMS_FILE`. The CANDLE workflow used will be UPF (specified by `$WORKFLOW_TYPE`) and will be run using the parameters specified in `$WORKFLOW_SETTINGS_FILE`. The results of the job will be output in `$EXPERIMENTS`. Note that we can choose a different workflow by simply changing the value of the `$WORKFLOW_TYPE` variable, e.g., +When this script is run (no arguments accepted) on a Biowulf submit node, the necessarily [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) file `$MODEL_PYTHON_DIR/$MODEL_PYTHON_SCRIPT.py` will be run using the default parameters specified in `$DEFAULT_PARAMS_FILE`. The CANDLE workflow used will be UPF (specified by `$WORKFLOW_TYPE`) and will be run using the parameters specified in `$WORKFLOW_SETTINGS_FILE`. The results of the job will be output in `$EXPERIMENTS`. Note that we can choose a different workflow by simply changing the value of the `$WORKFLOW_TYPE` variable, e.g., ```bash export WORKFLOW_TYPE="mlrMBO" ``` -In the sample submission script above, the Python script containing the model (my_specialized_unet.py), the default model parameters (default_params.txt), and the unrolled parameter file (upf1.txt) are all specified in the "unet" subdirectory of the working directory "/home/weismanal/notebook/2019-02-28". However, often a model, its default parameters, and a workflow's settings can be reused. +In the sample submission script above, the Python script containing the model (my_specialized_unet.py), the default model parameters (default_params.txt), and the unrolled parameter file (upf1.txt) are all specified in the "unet" subdirectory of the working directory "/home/weismanal/notebook/2019-02-28". However, often a model, its default parameters, and a workflow's settings can be reused. Thus, we provide templates of these three types of files in the `$CANDLE/Supervisor/templates` directory, the current structure of which is: @@ -102,7 +102,7 @@ export WORKFLOW_SETTINGS_FILE="/home/weismanal/notebook/2019-02-28/unet/upf1.txt export WORKFLOW_SETTINGS_FILE="$CANDLE/Supervisor/templates/workflow_settings/upf1.txt" ``` -The template submission script located at `$CANDLE/Supervisor/templates/submit_candle_job.sh` utilizes all three of these types of templates and will just work (running an HPO on the MNIST dataset) as long as the `$CANDLE` and `$SITE` variables are set correctly. +The template submission script located at `$CANDLE/Supervisor/templates/submit_candle_job.sh` utilizes all three of these types of templates and will just work (running an HPO on the MNIST dataset) as long as the `$CANDLE` and `$SITE` variables are set correctly. 
## Notes @@ -119,10 +119,10 @@ mymodel_common = candle.Benchmark(file_path, os.getenv("DEFAULT_PARAMS_FILE"), ' I'd recommend this be added to the standard method for making a model [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html). -Note further that `$DEFAULT_PARAMS_FILE` must be a full pathname. Otherwise, if we just used the filename "default_params.txt" hardcoded into the `$MODEL_PYTHON_SCRIPT`, the script would look for this global parameter file in the same directory that it's in (i.e., `$MODEL_PYTHON_DIR`), but that would preclude using a `$MODEL_PYTHON_SCRIPT` that's a symbolic link. In that case, we'd have to always copy the `$MODEL_PYTHON_SCRIPT` to the current working directory, which is inefficient because this leads to unnecessary duplication of code. +Note further that `$DEFAULT_PARAMS_FILE` must be a full pathname. Otherwise, if we just used the filename "default_params.txt" hardcoded into the `$MODEL_PYTHON_SCRIPT`, the script would look for this global parameter file in the same directory that it's in (i.e., `$MODEL_PYTHON_DIR`), but that would preclude using a `$MODEL_PYTHON_SCRIPT` that's a symbolic link. In that case, we'd have to always copy the `$MODEL_PYTHON_SCRIPT` to the current working directory, which is inefficient because this leads to unnecessary duplication of code. ### CANDLE settings at different SITEs -`$SITE` | `$CANDLE` -:---: | :---: -biowulf | /data/BIDS-HPC/public/candle \ No newline at end of file +| `$SITE` | `$CANDLE` | +| :-----: | :--------------------------: | +| biowulf | /data/BIDS-HPC/public/candle | diff --git a/archives/templates/language_agnostic/submit_candle_job.sh b/archives/templates/language_agnostic/submit_candle_job.sh index 990cc07a..51ed573c 100755 --- a/archives/templates/language_agnostic/submit_candle_job.sh +++ b/archives/templates/language_agnostic/submit_candle_job.sh @@ -8,7 +8,7 @@ export SITE="biowulf" # Job specification export EXPERIMENTS="$MY_DIR" #TODO GZ: These 2 variables are not needed -export MODEL_NAME="mnist_upf_test" +export MODEL_NAME="mnist_upf_test" export OBJ_RETURN="val_loss" # Scheduler settings diff --git a/archives/templates/language_agnostic/train_model.py b/archives/templates/language_agnostic/train_model.py index 5013c6a4..9af290a1 100755 --- a/archives/templates/language_agnostic/train_model.py +++ b/archives/templates/language_agnostic/train_model.py @@ -1,8 +1,8 @@ -import sys -import pickle import os +import pickle import random +import sys -#Generate a random loss function +# Generate a random loss function print(str(sys.argv)) -print(random.uniform(0,1)) +print(random.uniform(0, 1)) diff --git a/archives/templates/model_params/mnist1.txt b/archives/templates/model_params/mnist1.txt index 430bec5a..3a33c6ed 100644 --- a/archives/templates/model_params/mnist1.txt +++ b/archives/templates/model_params/mnist1.txt @@ -3,4 +3,4 @@ epochs=20 batch_size=128 activation='relu' optimizer='rmsprop' -num_filters=32 \ No newline at end of file +num_filters=32 diff --git a/archives/templates/model_params/uno1.txt b/archives/templates/model_params/uno1.txt index 12fbf6b7..8b83f3d9 100644 --- a/archives/templates/model_params/uno1.txt +++ b/archives/templates/model_params/uno1.txt @@ -51,4 +51,4 @@ use_landmark_genes = True validation_split = 0.2 verbose = None warmup_lr = False -save='save/uno' \ No newline at end of file +save='save/uno' diff --git a/archives/templates/models/mnist/mnist.py b/archives/templates/models/mnist/mnist.py index 5c5e2837..de2605b1 
100644 --- a/archives/templates/models/mnist/mnist.py +++ b/archives/templates/models/mnist/mnist.py @@ -1,13 +1,14 @@ # add candle_keras library in path -candle_lib = '/data/BIDS-HPC/public/candle/Candle/common' +candle_lib = "/data/BIDS-HPC/public/candle/Candle/common" import sys -sys.path.append(candle_lib) +sys.path.append(candle_lib) import os -#import sys + +# import sys file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +lib_path = os.path.abspath(os.path.join(file_path, "..", "..", "common")) sys.path.append(lib_path) import candle_keras as candle @@ -19,10 +20,11 @@ additional_definitions = None required = None + class MNIST(candle.Benchmark): + def set_locals(self): if required is not None: self.required = set(required) if additional_definitions is not None: self.additional_definitions = additional_definitions - diff --git a/archives/templates/models/mnist/mnist_mlp.py b/archives/templates/models/mnist/mnist_mlp.py index c0c13c12..d2751896 100644 --- a/archives/templates/models/mnist/mnist_mlp.py +++ b/archives/templates/models/mnist/mnist_mlp.py @@ -1,62 +1,64 @@ -import mnist import os -from keras.callbacks import CSVLogger +import mnist from keras import backend as K +from keras.callbacks import CSVLogger + def initialize_parameters(): - mnist_common = mnist.MNIST(mnist.file_path, + mnist_common = mnist.MNIST( + mnist.file_path, os.getenv("DEFAULT_PARAMS_FILE"), - 'keras', - prog='mnist_mlp', - desc='MNIST example' + "keras", + prog="mnist_mlp", + desc="MNIST example", ) import candle_keras as candle # Initialize parameters gParameters = candle.initialize_parameters(mnist_common) - csv_logger = CSVLogger('{}/params.log'.format(gParameters)) + csv_logger = CSVLogger("{}/params.log".format(gParameters)) return gParameters + def run(gParameters): ########################################## # Your DL start here. See mnist_mlp.py # ########################################## - '''Trains a simple deep NN on the MNIST dataset. + """Trains a simple deep NN on the MNIST dataset. - Gets to 98.40% test accuracy after 20 epochs - (there is *a lot* of margin for parameter tuning). - 2 seconds per epoch on a K520 GPU. - ''' + Gets to 98.40% test accuracy after 20 epochs (there is *a lot* of + margin for parameter tuning). 2 seconds per epoch on a K520 GPU. 
+ """ # from __future__ import print_function import keras from keras.datasets import mnist - from keras.models import Sequential from keras.layers import Dense, Dropout + from keras.models import Sequential from keras.optimizers import RMSprop - batch_size = gParameters['batch_size'] + batch_size = gParameters["batch_size"] num_classes = 10 - epochs = gParameters['epochs'] + epochs = gParameters["epochs"] - activation = gParameters['activation'] - optimizer = gParameters['optimizer'] + activation = gParameters["activation"] + optimizer = gParameters["optimizer"] # the data, split between train and test sets (x_train, y_train), (x_test, y_test) = mnist.load_data() x_train = x_train.reshape(60000, 784) x_test = x_test.reshape(10000, 784) - x_train = x_train.astype('float32') - x_test = x_test.astype('float32') + x_train = x_train.astype("float32") + x_test = x_test.astype("float32") x_train /= 255 x_test /= 255 - print(x_train.shape[0], 'train samples') - print(x_test.shape[0], 'test samples') + print(x_train.shape[0], "train samples") + print(x_test.shape[0], "test samples") # convert class vectors to binary class matrices y_train = keras.utils.to_categorical(y_train, num_classes) @@ -67,32 +69,37 @@ def run(gParameters): model.add(Dropout(0.2)) model.add(Dense(512, activation=activation)) model.add(Dropout(0.2)) - model.add(Dense(num_classes, activation='softmax')) + model.add(Dense(num_classes, activation="softmax")) model.summary() - model.compile(loss='categorical_crossentropy', - optimizer=optimizer, - metrics=['accuracy']) - - history = model.fit(x_train, y_train, - batch_size=batch_size, - epochs=epochs, - verbose=1, - validation_data=(x_test, y_test)) + model.compile(loss="categorical_crossentropy", + optimizer=optimizer, + metrics=["accuracy"]) + + history = model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(x_test, y_test), + ) score = model.evaluate(x_test, y_test, verbose=0) - print('Test loss:', score[0]) - print('Test accuracy:', score[1]) + print("Test loss:", score[0]) + print("Test accuracy:", score[1]) ########################################## # End of mnist_mlp.py #################### ########################################## return history + def main(): gParameters = initialize_parameters() run(gParameters) -if __name__ == '__main__': + +if __name__ == "__main__": main() try: K.clear_session() diff --git a/archives/templates/models/resnet.py b/archives/templates/models/resnet.py index 778e09f5..1de769cf 100644 --- a/archives/templates/models/resnet.py +++ b/archives/templates/models/resnet.py @@ -1,321 +1,439 @@ -from keras import backend as K import os +from keras import backend as K + # Parameters -candle_lib = '/data/BIDS-HPC/public/candle/Candle/common' +candle_lib = "/data/BIDS-HPC/public/candle/Candle/common" + def initialize_parameters(): - print('Initializing parameters...') - + print("Initializing parameters...") + # Obtain the path of the directory of this script file_path = os.path.dirname(os.path.realpath(__file__)) # Import the CANDLE library import sys + sys.path.append(candle_lib) import candle_keras as candle # Instantiate the candle.Benchmark class - mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') + mymodel_common = candle.Benchmark( + file_path, + os.getenv("DEFAULT_PARAMS_FILE"), + "keras", + prog="myprog", + desc="My model", + ) # Get a dictionary of the model hyperparamters gParameters = 
candle.initialize_parameters(mymodel_common) # Return the dictionary of the hyperparameters - return(gParameters) - + return gParameters + + def run(gParameters): - print('Running model...') + print("Running model...") #### Begin model input ########################################################################################## - - def get_model(model_json_fname,modelwtsfname): + + def get_model(model_json_fname, modelwtsfname): # This is only for prediction if os.path.isfile(model_json_fname): - # Model reconstruction from JSON file - with open(model_json_fname, 'r') as f: + # Model reconstruction from JSON file + with open(model_json_fname, "r") as f: model = model_from_json(f.read()) else: - model = get_unet() - - #model.summary() + model = get_unet() + + # model.summary() # Load weights into the new model model.load_weights(modelwtsfname) - return model - - def focal_loss(gamma=2., alpha=.25): + return model + + def focal_loss(gamma=2.0, alpha=0.25): + def focal_loss_fixed(y_true, y_pred): pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred)) pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred)) - return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))-K.sum((1-alpha) * K.pow( pt_0, gamma) * K.log(1. - pt_0)) + return -K.sum( + alpha * K.pow(1.0 - pt_1, gamma) * K.log(pt_1)) - K.sum( + (1 - alpha) * K.pow(pt_0, gamma) * K.log(1.0 - pt_0)) + return focal_loss_fixed - + def jaccard_coef(y_true, y_pred): smooth = 1.0 intersection = K.sum(y_true * y_pred, axis=[-0, -1, 2]) sum_ = K.sum(y_true + y_pred, axis=[-0, -1, 2]) - + jac = (intersection + smooth) / (sum_ - intersection + smooth) - + return K.mean(jac) - + def jaccard_coef_int(y_true, y_pred): smooth = 1.0 y_pred_pos = K.round(K.clip(y_pred, 0, 1)) - + intersection = K.sum(y_true * y_pred_pos, axis=[-0, -1, 2]) sum_ = K.sum(y_true + y_pred_pos, axis=[-0, -1, 2]) - + jac = (intersection + smooth) / (sum_ - intersection + smooth) - + return K.mean(jac) - + def jaccard_coef_loss(y_true, y_pred): - return -K.log(jaccard_coef(y_true, y_pred)) + binary_crossentropy(y_pred, y_true) - + return -K.log(jaccard_coef(y_true, y_pred)) + binary_crossentropy( + y_pred, y_true) + def dice_coef_batch(y_true, y_pred): smooth = 1.0 intersection = K.sum(y_true * y_pred, axis=[-0, -1, 2]) sum_ = K.sum(y_true + y_pred, axis=[-0, -1, 2]) - - dice = ((2.0*intersection) + smooth) / (sum_ + intersection + smooth) - + + dice = ((2.0 * intersection) + smooth) / (sum_ + intersection + smooth) + return K.mean(dice) - + def dice_coef(y_true, y_pred): smooth = 1.0 y_true_f = K.flatten(y_true) y_pred_f = K.flatten(y_pred) intersection = K.sum(y_true_f * y_pred_f) - dice_smooth = ((2. 
* intersection) + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth) - return (dice_smooth) - + dice_smooth = ((2.0 * intersection) + + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth) + return dice_smooth + def dice_coef_loss(y_true, y_pred): return -dice_coef(y_true, y_pred) - + def dice_coef_batch_loss(y_true, y_pred): return -dice_coef_batch(y_true, y_pred) - - #Define the neural network + + # Define the neural network def get_unet(): droprate = 0.25 filt_size = 32 inputs = Input((None, None, 1)) - conv1 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(inputs) - conv1 = Dropout(droprate)(conv1) - conv1 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv1) + conv1 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(inputs) + conv1 = Dropout(droprate)(conv1) + conv1 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv1) pool1 = MaxPooling2D(pool_size=(2, 2))(conv1) - filt_size = filt_size*2 - - conv2 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(pool1) - conv2 = Dropout(droprate)(conv2) - conv2 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv2) + filt_size = filt_size * 2 + + conv2 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(pool1) + conv2 = Dropout(droprate)(conv2) + conv2 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv2) pool2 = MaxPooling2D(pool_size=(2, 2))(conv2) - filt_size = filt_size*2 - - conv3 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(pool2) - conv3 = Dropout(droprate)(conv3) - conv3 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv3) + filt_size = filt_size * 2 + + conv3 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(pool2) + conv3 = Dropout(droprate)(conv3) + conv3 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv3) pool3 = MaxPooling2D(pool_size=(2, 2))(conv3) - filt_size = filt_size*2 - - conv4 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(pool3) - conv4 = Dropout(droprate)(conv4) - conv4 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv4) + filt_size = filt_size * 2 + + conv4 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(pool3) + conv4 = Dropout(droprate)(conv4) + conv4 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv4) pool4 = MaxPooling2D(pool_size=(2, 2))(conv4) - filt_size = filt_size*2 - - conv5 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(pool4) + filt_size = filt_size * 2 + + conv5 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(pool4) conv5 = Dropout(droprate)(conv5) - conv5 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv5) - - filt_size = filt_size/2 - - up6 = concatenate([Conv2DTranspose(filt_size, (2, 2), strides=(2, 2), padding='same')(conv5), conv4], axis=3) - conv6 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(up6) + conv5 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv5) + + filt_size = filt_size / 2 + + up6 = concatenate( + [ + Conv2DTranspose( + filt_size, (2, 2), strides=(2, 2), padding="same")(conv5), + conv4, + ], + axis=3, + ) + conv6 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(up6) conv6 = Dropout(droprate)(conv6) - conv6 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv6) - - filt_size = filt_size/2 - - up7 = concatenate([Conv2DTranspose(filt_size, (2, 2), strides=(2, 2), padding='same')(conv6), conv3], axis=3) - conv7 = 
Conv2D(filt_size, (3, 3), activation='relu', padding='same')(up7) + conv6 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv6) + + filt_size = filt_size / 2 + + up7 = concatenate( + [ + Conv2DTranspose( + filt_size, (2, 2), strides=(2, 2), padding="same")(conv6), + conv3, + ], + axis=3, + ) + conv7 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(up7) conv7 = Dropout(droprate)(conv7) - conv7 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv7) - - filt_size = filt_size/2 - - up8 = concatenate([Conv2DTranspose(filt_size, (2, 2), strides=(2, 2), padding='same')(conv7), conv2], axis=3) - conv8 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(up8) + conv7 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv7) + + filt_size = filt_size / 2 + + up8 = concatenate( + [ + Conv2DTranspose( + filt_size, (2, 2), strides=(2, 2), padding="same")(conv7), + conv2, + ], + axis=3, + ) + conv8 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(up8) conv8 = Dropout(droprate)(conv8) - conv8 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv8) - filt_size = filt_size/2 - - up9 = concatenate([Conv2DTranspose(filt_size, (2, 2), strides=(2, 2), padding='same')(conv8), conv1], axis=3) - conv9 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(up9) + conv8 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv8) + filt_size = filt_size / 2 + + up9 = concatenate( + [ + Conv2DTranspose( + filt_size, (2, 2), strides=(2, 2), padding="same")(conv8), + conv1, + ], + axis=3, + ) + conv9 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(up9) conv9 = Dropout(droprate)(conv9) - conv9 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv9) - - - conv10 = Conv2D(1, (1, 1), activation='sigmoid')(conv9) - + conv9 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv9) + + conv10 = Conv2D(1, (1, 1), activation="sigmoid")(conv9) + model = Model(inputs=[inputs], outputs=[conv10]) - - #model.compile(optimizer=Adam(lr=1e-5), loss=dice_coef_loss, metrics=[dice_coef]) - #model.compile(optimizer=Nadam(lr=1e-3), loss=dice_coef_loss, metrics=[dice_coef]) - #model.compile(optimizer=Adadelta(), loss=dice_coef_loss, metrics=[dice_coef]) - + + # model.compile(optimizer=Adam(lr=1e-5), loss=dice_coef_loss, metrics=[dice_coef]) + # model.compile(optimizer=Nadam(lr=1e-3), loss=dice_coef_loss, metrics=[dice_coef]) + # model.compile(optimizer=Adadelta(), loss=dice_coef_loss, metrics=[dice_coef]) + return model - - def save_model_to_json(model,model_json_fname): - - #model = unet.UResNet152(input_shape=(None, None, 3), classes=1,encoder_weights="imagenet11k") - #model = get_unet() - - #model.summary() + + def save_model_to_json(model, model_json_fname): + + # model = unet.UResNet152(input_shape=(None, None, 3), classes=1,encoder_weights="imagenet11k") + # model = get_unet() + + # model.summary() # serialize model to JSON model_json = model.to_json() with open(model_json_fname, "w") as json_file: - json_file.write(model_json) - - def preprocess_data(do_prediction,inputnpyfname,targetnpyfname,expandChannel,backbone): + json_file.write(model_json) + + def preprocess_data(do_prediction, inputnpyfname, targetnpyfname, + expandChannel, backbone): # Preprocess the data (beyond what I already did before) - - print('-'*30) - print('Loading and preprocessing data...') - print('-'*30) - + + print("-" * 30) + print("Loading and preprocessing data...") + print("-" * 30) + # Load, 
normalize, and cast the data - imgs_input = ( np.load(inputnpyfname).astype('float32') / (2**16-1) * (2**8-1) ).astype('uint8') - print('Input images information:') + imgs_input = (np.load(inputnpyfname).astype("float32") / (2**16 - 1) * + (2**8 - 1)).astype("uint8") + print("Input images information:") print(imgs_input.shape) print(imgs_input.dtype) - hist,bins = np.histogram(imgs_input) + hist, bins = np.histogram(imgs_input) print(hist) print(bins) if not do_prediction: - imgs_mask_train = np.load(targetnpyfname).astype('uint8') - print('Input masks information:') + imgs_mask_train = np.load(targetnpyfname).astype("uint8") + print("Input masks information:") print(imgs_mask_train.shape) print(imgs_mask_train.dtype) - hist,bins = np.histogram(imgs_mask_train) + hist, bins = np.histogram(imgs_mask_train) print(hist) print(bins) - + # Make the grayscale images RGB since that's what the model expects apparently - if expandChannel: - imgs_input = np.stack((imgs_input,)*3, -1) + if expandChannel: + imgs_input = np.stack((imgs_input,) * 3, -1) else: - imgs_input = np.expand_dims(imgs_input, 3) - print('New shape of input images:') + imgs_input = np.expand_dims(imgs_input, 3) + print("New shape of input images:") print(imgs_input.shape) if not do_prediction: - imgs_mask_train = np.expand_dims(imgs_mask_train, 3) - print('New shape of masks:') - print(imgs_mask_train.shape) - + imgs_mask_train = np.expand_dims(imgs_mask_train, 3) + print("New shape of masks:") + print(imgs_mask_train.shape) + # Preprocess as per https://github.com/qubvel/segmentation_models preprocessing_fn = get_preprocessing(backbone) imgs_input = preprocessing_fn(imgs_input) - + # Return appropriate variables if not do_prediction: - return(imgs_input,imgs_mask_train) + return (imgs_input, imgs_mask_train) else: - return(imgs_input) + return imgs_input # Import relevant modules and functions import sys - sys.path.append(gParameters['segmentation_models_repo']) - import numpy as np - from keras.models import Model - from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, Conv2DTranspose, Dropout - from keras.optimizers import Adam - from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping,CSVLogger - from keras.layers.normalization import BatchNormalization - from keras.backend import binary_crossentropy - import keras + + sys.path.append(gParameters["segmentation_models_repo"]) import random + + import keras + import numpy as np import tensorflow as tf - from keras.models import model_from_json + from keras.backend import binary_crossentropy + from keras.callbacks import ( + CSVLogger, + EarlyStopping, + ModelCheckpoint, + ReduceLROnPlateau, + ) + from keras.layers import ( + Conv2D, + Conv2DTranspose, + Dropout, + Input, + MaxPooling2D, + concatenate, + ) + from keras.layers.normalization import BatchNormalization + from keras.models import Model, model_from_json + from keras.optimizers import Adam from segmentation_models import Unet from segmentation_models.backbones import get_preprocessing - K.set_image_data_format('channels_last') # TF dimension ordering in this code - + + K.set_image_data_format( + "channels_last") # TF dimension ordering in this code + # Basically constants expandChannel = True - modelwtsfname = 'model_weights.h5' - model_json_fname = 'model.json' - csvfname = 'model.csv' - - do_prediction = gParameters['predict'] - if not do_prediction: # Train... 
- print('Training...') + modelwtsfname = "model_weights.h5" + model_json_fname = "model.json" + csvfname = "model.csv" + + do_prediction = gParameters["predict"] + if not do_prediction: # Train... + print("Training...") # Parameters - inputnpyfname = gParameters['images'] - labels = gParameters['labels'] - initialize = gParameters['initialize'] - backbone = gParameters['backbone'] - encoder = gParameters['encoder'] - lr = float(gParameters['lr']) - batch_size = gParameters['batch_size'] - obj_return = gParameters['obj_return'] - epochs = gParameters['epochs'] + inputnpyfname = gParameters["images"] + labels = gParameters["labels"] + initialize = gParameters["initialize"] + backbone = gParameters["backbone"] + encoder = gParameters["encoder"] + lr = float(gParameters["lr"]) + batch_size = gParameters["batch_size"] + obj_return = gParameters["obj_return"] + epochs = gParameters["epochs"] # Preprocess the data - imgs_train,imgs_mask_train = preprocess_data(do_prediction,inputnpyfname,labels,expandChannel,backbone) + imgs_train, imgs_mask_train = preprocess_data(do_prediction, + inputnpyfname, labels, + expandChannel, backbone) # Load, save, and compile the model model = Unet(backbone_name=backbone, encoder_weights=encoder) - save_model_to_json(model,model_json_fname) - model.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy', metrics=['binary_crossentropy','mean_squared_error',dice_coef, dice_coef_batch, focal_loss()]) + save_model_to_json(model, model_json_fname) + model.compile( + optimizer=Adam(lr=lr), + loss="binary_crossentropy", + metrics=[ + "binary_crossentropy", + "mean_squared_error", + dice_coef, + dice_coef_batch, + focal_loss(), + ], + ) # Load previous weights for restarting, if desired and possible if os.path.isfile(initialize): - print('-'*30) - print('Loading previous weights ...') + print("-" * 30) + print("Loading previous weights ...") model.load_weights(initialize) # Set up the training callback functions - model_checkpoint = ModelCheckpoint(modelwtsfname, monitor=obj_return, save_best_only=True) - reduce_lr = ReduceLROnPlateau(monitor=obj_return, factor=0.1,patience=100, min_lr=0.001,verbose=1) - model_es = EarlyStopping(monitor=obj_return, min_delta=0.00000001, patience=100, verbose=1, mode='auto') + model_checkpoint = ModelCheckpoint(modelwtsfname, + monitor=obj_return, + save_best_only=True) + reduce_lr = ReduceLROnPlateau(monitor=obj_return, + factor=0.1, + patience=100, + min_lr=0.001, + verbose=1) + model_es = EarlyStopping( + monitor=obj_return, + min_delta=0.00000001, + patience=100, + verbose=1, + mode="auto", + ) csv_logger = CSVLogger(csvfname, append=True) # Train the model - history_callback = model.fit(imgs_train, imgs_mask_train, batch_size=batch_size, epochs=epochs, verbose=2, shuffle=True, validation_split=0.10, callbacks=[model_checkpoint, reduce_lr, model_es, csv_logger]) + history_callback = model.fit( + imgs_train, + imgs_mask_train, + batch_size=batch_size, + epochs=epochs, + verbose=2, + shuffle=True, + validation_split=0.10, + callbacks=[model_checkpoint, reduce_lr, model_es, csv_logger], + ) print("Minimum validation loss:") print(min(history_callback.history[obj_return])) - else: # ...or predict - print('Inferring...') + else: # ...or predict + print("Inferring...") # Parameters - inputnpyfname = gParameters['images'] - initialize = gParameters['initialize'] - backbone = gParameters['backbone'] + inputnpyfname = gParameters["images"] + initialize = gParameters["initialize"] + backbone = gParameters["backbone"] # lr = 
float(gParameters['lr']) # this isn't needed but we're keeping it for the U-Net, where it is "needed" # Preprocess the data - imgs_infer = preprocess_data(do_prediction,inputnpyfname,'',expandChannel,backbone) + imgs_infer = preprocess_data(do_prediction, inputnpyfname, "", + expandChannel, backbone) # Load the model - #model = get_model(model_json_fname,initialize) - model = get_model(os.path.dirname(initialize)+'/'+model_json_fname,initialize) - + # model = get_model(model_json_fname,initialize) + model = get_model( + os.path.dirname(initialize) + "/" + model_json_fname, initialize) + # Run inference imgs_test_predict = model.predict(imgs_infer, batch_size=1, verbose=1) # Save the predicted masks - np.save('mask_predictions.npy', np.squeeze(np.round(imgs_test_predict).astype('uint8'))) + np.save( + "mask_predictions.npy", + np.squeeze(np.round(imgs_test_predict).astype("uint8")), + ) history_callback = None - + #### End model input ############################################################################################ - - return(history_callback) + + return history_callback + def main(): - print('Running main program...') + print("Running main program...") gParameters = initialize_parameters() run(gParameters) -if __name__ == '__main__': + +if __name__ == "__main__": main() try: K.clear_session() except AttributeError: - pass \ No newline at end of file + pass diff --git a/archives/templates/models/unet.py b/archives/templates/models/unet.py index 0f609877..8dfbf697 100644 --- a/archives/templates/models/unet.py +++ b/archives/templates/models/unet.py @@ -1,12 +1,13 @@ # Import relevant modules -from keras import backend as K import numpy as np +from keras import backend as K # Parameters -candle_lib = '/data/BIDS-HPC/public/candle/Candle/common' +candle_lib = "/data/BIDS-HPC/public/candle/Candle/common" + def initialize_parameters(): - print('Initializing parameters...') + print("Initializing parameters...") import os @@ -15,20 +16,28 @@ def initialize_parameters(): # Import the CANDLE library import sys + sys.path.append(candle_lib) import candle_keras as candle # Instantiate the candle.Benchmark class - mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') + mymodel_common = candle.Benchmark( + file_path, + os.getenv("DEFAULT_PARAMS_FILE"), + "keras", + prog="myprog", + desc="My model", + ) # Get a dictionary of the model hyperparamters gParameters = candle.initialize_parameters(mymodel_common) # Return the dictionary of the hyperparameters - return(gParameters) + return gParameters + def run(gParameters): - print('Running model...') + print("Running model...") #### Begin model input ########################################################################################## # Currently based off run_unet.py @@ -40,10 +49,10 @@ def focal_loss(labels, logits, gamma=0, alpha=1.0): Notice: logits is probability after softmax gradient is d(Fl)/d(p_t) not d(Fl)/d(x) as described in paper d(Fl)/d(p_t) * [p_t(1-p_t)] = d(Fl)/d(x) - - Focal Loss for Dense Object Detection, + + Focal Loss for Dense Object Detection, https://doi.org/10.1016/j.ajodo.2005.02.022 - + :param labels: ground truth labels, shape of [batch_size] :param logits: model's output, shape of [batch_size, num_cls] :param gamma: @@ -53,87 +62,125 @@ def focal_loss(labels, logits, gamma=0, alpha=1.0): import tensorflow as tf - epsilon = 1.e-9 + epsilon = 1.0e-9 labels = tf.to_int64(labels) labels = tf.convert_to_tensor(labels, tf.int64) logits = 
tf.convert_to_tensor(logits, tf.float32) num_cls = logits.shape[1] - + model_out = tf.add(logits, epsilon) onehot_labels = tf.one_hot(labels, num_cls) ce = tf.multiply(onehot_labels, -tf.log(model_out)) - weight = tf.multiply(onehot_labels, tf.pow(tf.subtract(1., model_out), gamma)) + weight = tf.multiply(onehot_labels, + tf.pow(tf.subtract(1.0, model_out), gamma)) fl = tf.multiply(alpha, tf.multiply(weight, ce)) reduced_fl = tf.reduce_max(fl, axis=1) # reduced_fl = tf.reduce_sum(fl, axis=1) # same as reduce_max return reduced_fl def dice_coef(y_true, y_pred): - smooth = 1. - intersection = K.sum(y_true * y_pred, axis=[1,2,3]) - union = K.sum(y_true, axis=[1,2,3]) + K.sum(y_pred, axis=[1,2,3]) - dc = K.mean( (2. * intersection + smooth) / (union + smooth), axis=0) + smooth = 1.0 + intersection = K.sum(y_true * y_pred, axis=[1, 2, 3]) + union = K.sum(y_true, axis=[1, 2, 3]) + K.sum(y_pred, axis=[1, 2, 3]) + dc = K.mean((2.0 * intersection + smooth) / (union + smooth), axis=0) return dc def dice_coef_loss(y_true, y_pred): return -dice_coef(y_true, y_pred) - def get_unet(img_rows, img_cols, n_layers, filter_size, dropout, activation_func, conv_size, loss_func, last_activation, batch_norm, learning_rate): - - print('-'*30) - print('Creating and compiling model...') - print('-'*30) - print (img_rows) - print (img_cols) + def get_unet( + img_rows, + img_cols, + n_layers, + filter_size, + dropout, + activation_func, + conv_size, + loss_func, + last_activation, + batch_norm, + learning_rate, + ): + + print("-" * 30) + print("Creating and compiling model...") + print("-" * 30) + print(img_rows) + print(img_cols) inputs = Input((img_rows, img_cols, 1)) - conv_layers=[] - pool_layers=[inputs] - conv_filter=(conv_size, conv_size ) + conv_layers = [] + pool_layers = [inputs] + conv_filter = (conv_size, conv_size) for i in range(n_layers): - conv = Conv2D(filter_size, conv_filter, activation=activation_func, padding='same')(pool_layers[i]) + conv = Conv2D(filter_size, + conv_filter, + activation=activation_func, + padding="same")(pool_layers[i]) conv = BatchNormalization()(conv) if batch_norm else conv if dropout != None: conv = Dropout(dropout)(conv) - conv = Conv2D(filter_size, conv_filter, activation=activation_func, padding='same')(conv) + conv = Conv2D(filter_size, + conv_filter, + activation=activation_func, + padding="same")(conv) conv = BatchNormalization()(conv) if batch_norm else conv pool = MaxPooling2D(pool_size=(2, 2))(conv) conv_layers.append(conv) pool_layers.append(pool) - filter_size *=2 - - filter_size /=4 + filter_size *= 2 + + filter_size /= 4 - for i in range(n_layers-1): + for i in range(n_layers - 1): filter_size = int(filter_size) - up = concatenate([Conv2DTranspose(filter_size, (2, 2), strides=(2, 2), padding='same')(conv_layers[-1]), conv_layers[n_layers-i-2]], axis=3) - conv = Conv2D(filter_size, conv_filter, activation=activation_func, padding='same')(up) + up = concatenate( + [ + Conv2DTranspose(filter_size, (2, 2), + strides=(2, 2), + padding="same")(conv_layers[-1]), + conv_layers[n_layers - i - 2], + ], + axis=3, + ) + conv = Conv2D(filter_size, + conv_filter, + activation=activation_func, + padding="same")(up) conv = BatchNormalization()(conv) if batch_norm else conv - if dropout != None: + if dropout != None: conv = Dropout(dropout)(conv) - conv = Conv2D(filter_size, conv_filter, activation=activation_func, padding='same')(conv) + conv = Conv2D(filter_size, + conv_filter, + activation=activation_func, + padding="same")(conv) conv = BatchNormalization()(conv) if 
batch_norm else conv conv_layers.append(conv) filter_size /= 2 - #For binary classification, last activation should be sigmoid. + # For binary classification, last activation should be sigmoid. # if loss_func == 'dice': # last_activation = 'sigmoid' # else: # print ("WARNING: last_activation set to None") # last_activation = None - last_conv = Conv2D(1, (1, 1), activation=last_activation)(conv_layers[-1]) + last_conv = Conv2D(1, (1, 1), + activation=last_activation)(conv_layers[-1]) conv_layers.append(last_conv) - + model = Model(inputs=[inputs], outputs=[last_conv]) - - if loss_func == 'dice': - model.compile(optimizer=Adam(lr=learning_rate), loss=dice_coef_loss, metrics=[dice_coef]) + + if loss_func == "dice": + model.compile( + optimizer=Adam(lr=learning_rate), + loss=dice_coef_loss, + metrics=[dice_coef], + ) else: - #Any Keras loss function will be passed - model.compile(optimizer=Adam(lr=learning_rate), loss = loss_func) + # Any Keras loss function will be passed + model.compile(optimizer=Adam(lr=learning_rate), loss=loss_func) model.summary() model_json = model.to_json() with open("model.json", "w") as json_file: @@ -142,220 +189,328 @@ def get_unet(img_rows, img_cols, n_layers, filter_size, dropout, activation_func def get_images(images, masks, normalize_mask=False): - print('-'*30) - print('Loading and preprocessing train data...') - print('-'*30) + print("-" * 30) + print("Loading and preprocessing train data...") + print("-" * 30) - imgs_train = preprocess_images(images) - imgs_mask_train = preprocess_masks(masks, normalize_mask) + imgs_train = preprocess_images(images) + imgs_mask_train = preprocess_masks(masks, normalize_mask) - #Shuffle the images + # Shuffle the images np.random.seed(10) shuffled_id = np.random.permutation(imgs_train.shape[0]) imgs_train = imgs_train[shuffled_id] imgs_mask_train = imgs_mask_train[shuffled_id] - assert(np.amax(imgs_mask_train) <= 1) - assert(np.amin(imgs_mask_train) >= 0) - return_images = imgs_train - return_masks = imgs_mask_train + assert np.amax(imgs_mask_train) <= 1 + assert np.amin(imgs_mask_train) >= 0 + return_images = imgs_train + return_masks = imgs_mask_train - print (np.shape(return_images)) - print (np.shape(return_masks)) + print(np.shape(return_images)) + print(np.shape(return_masks)) return [return_images, return_masks] - def evaluate_params(images, labels, batch_size, epochs, obj_return, initialize, n_layers, filter_size, dropout, activation_func, conv_size, loss_func, last_activation, batch_norm, learning_rate): - - images , masks = get_images(images,labels) - - print("Training images histogram") + def evaluate_params( + images, + labels, + batch_size, + epochs, + obj_return, + initialize, + n_layers, + filter_size, + dropout, + activation_func, + conv_size, + loss_func, + last_activation, + batch_norm, + learning_rate, + ): + + images, masks = get_images(images, labels) + + print("Training images histogram") hist, bin_edges = np.histogram(images) print(hist) print(bin_edges) - - print("Training masks histogram") + + print("Training masks histogram") hist, bin_edges = np.histogram(masks) print(hist) print(bin_edges) - - #Get the images size + + # Get the images size img_rows = np.shape(images)[1] img_cols = np.shape(images)[2] - - model = get_unet(img_rows, img_cols, n_layers, filter_size, dropout, activation_func, conv_size, loss_func, last_activation, batch_norm, learning_rate) - - history_callback = train(model, images, masks, batch_size, epochs, obj_return, initialize=initialize) - return history_callback # note that 
history_callback is what's returned by model.fit() + + model = get_unet( + img_rows, + img_cols, + n_layers, + filter_size, + dropout, + activation_func, + conv_size, + loss_func, + last_activation, + batch_norm, + learning_rate, + ) + + history_callback = train(model, + images, + masks, + batch_size, + epochs, + obj_return, + initialize=initialize) + return history_callback # note that history_callback is what's returned by model.fit() def preprocess_images(images): imgs_train = np.squeeze(np.load(images)) if imgs_train.ndim != 3: - raise Exception("Error: The number of dimensions for images should equal 3, after squeezing the shape is:{0}".format(np.shape(images))) - imgs_train = imgs_train.astype('float32') + raise Exception( + "Error: The number of dimensions for images should equal 3, after squeezing the shape is:{0}" + .format(np.shape(images))) + imgs_train = imgs_train.astype("float32") print("MAX before:{0}".format(np.amax(imgs_train))) - #Normalize all number between 0 and 1. - uint16_info = np.iinfo('uint16') + # Normalize all number between 0 and 1. + uint16_info = np.iinfo("uint16") imgs_train = imgs_train / uint16_info.max print("MAX after:{0}".format(np.amax(imgs_train))) - imgs_train = np.expand_dims(imgs_train, axis= 3) + imgs_train = np.expand_dims(imgs_train, axis=3) return imgs_train def preprocess_masks(masks, normalize_mask=False): imgs_mask_train = np.squeeze(np.load(masks)) if imgs_mask_train.ndim != 3: - raise Exception("Error: The number of dimensions for masks should equal 3, after squeezing the shape is:{0}".format(np.shape(masks))) - imgs_mask_train = imgs_mask_train.astype('float32') + raise Exception( + "Error: The number of dimensions for masks should equal 3, after squeezing the shape is:{0}" + .format(np.shape(masks))) + imgs_mask_train = imgs_mask_train.astype("float32") if normalize_mask: - imgs_mask_train /= 255. 
# scale masks to [0, 1] - imgs_mask_train = np.expand_dims(imgs_mask_train, axis= 3) + imgs_mask_train /= 255.0 # scale masks to [0, 1] + imgs_mask_train = np.expand_dims(imgs_mask_train, axis=3) return imgs_mask_train - def train(model, imgs_train, imgs_mask_train, batch_size, epochs, obj_return, initialize=None): - - model_checkpoint = ModelCheckpoint(modelwtsfname, monitor=obj_return, save_best_only=True) - reduce_lr = ReduceLROnPlateau(monitor=obj_return, factor=0.1,patience=100, verbose=1) - model_es = EarlyStopping(monitor=obj_return, min_delta=0.000001, patience=400, verbose=1, mode='auto') - csv_logger = CSVLogger('training.csv') - - print('-'*30) - print('Fitting model...') - print('-'*30) - + def train( + model, + imgs_train, + imgs_mask_train, + batch_size, + epochs, + obj_return, + initialize=None, + ): + + model_checkpoint = ModelCheckpoint(modelwtsfname, + monitor=obj_return, + save_best_only=True) + reduce_lr = ReduceLROnPlateau(monitor=obj_return, + factor=0.1, + patience=100, + verbose=1) + model_es = EarlyStopping(monitor=obj_return, + min_delta=0.000001, + patience=400, + verbose=1, + mode="auto") + csv_logger = CSVLogger("training.csv") + + print("-" * 30) + print("Fitting model...") + print("-" * 30) + if initialize != None: print("Initializing the model using:{0}\n", initialize) model.load_weights(initialize) - - #test_call=TestCallback((imgs_train,imgs_mask_train)) - + + # test_call=TestCallback((imgs_train,imgs_mask_train)) + print(np.shape(imgs_train)) print(np.shape(imgs_mask_train)) - #return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=3000, verbose=2, shuffle=True, - return model.fit(imgs_train, imgs_mask_train, batch_size=batch_size, epochs=epochs, verbose=2, shuffle=True, - #return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=1500, verbose=2, shuffle=True, - #return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=4, verbose=2, shuffle=True, - validation_split=0.10, callbacks=[model_checkpoint, reduce_lr, model_es, csv_logger]) + # return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=3000, verbose=2, shuffle=True, + return model.fit( + imgs_train, + imgs_mask_train, + batch_size=batch_size, + epochs=epochs, + verbose=2, + shuffle=True, + # return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=1500, verbose=2, shuffle=True, + # return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=4, verbose=2, shuffle=True, + validation_split=0.10, + callbacks=[model_checkpoint, reduce_lr, model_es, csv_logger], + ) def predict(model, weights, images): - print('-'*30) - print('Loading and preprocessing test data...') - print('-'*30) - - #imgs_test = np.load('./data_python/1CDT_Green_Red_FarRed_Annotated_FISH_Dilation4Conn1Iter_Testing_128by128_normalize.npy') - #imgs_mask_test = np.load('.//data_python/1CDT_Green_Red_FarRed_Annotated_FISH_Dilation4Conn1Iter_Testing_128by128_normalize_Mask.npy') - #imgs_test = imgs_test.astype('float32') - - #imgs_train = np.load('../data_python/1CDT_Green_Red_Annotated_FISH_Dilation8Conn1Iter_Training_128by128.npy') - #imgs_train = imgs_train.astype('float32') - #mean = np.mean(imgs_train) # mean for data centering - #std = np.std(imgs_train) # std for data normalization - #del imgs_train - #imgs_test -= mean - #imgs_test /= std - - print('-'*30) - print('Loading saved weights...') - print('-'*30) + print("-" * 30) + print("Loading and preprocessing test data...") + print("-" * 30) + + # imgs_test = 
np.load('./data_python/1CDT_Green_Red_FarRed_Annotated_FISH_Dilation4Conn1Iter_Testing_128by128_normalize.npy') + # imgs_mask_test = np.load('.//data_python/1CDT_Green_Red_FarRed_Annotated_FISH_Dilation4Conn1Iter_Testing_128by128_normalize_Mask.npy') + # imgs_test = imgs_test.astype('float32') + + # imgs_train = np.load('../data_python/1CDT_Green_Red_Annotated_FISH_Dilation8Conn1Iter_Training_128by128.npy') + # imgs_train = imgs_train.astype('float32') + # mean = np.mean(imgs_train) # mean for data centering + # std = np.std(imgs_train) # std for data normalization + # del imgs_train + # imgs_test -= mean + # imgs_test /= std + + print("-" * 30) + print("Loading saved weights...") + print("-" * 30) model.load_weights(weights) - print('-'*30) - print('Predicting masks on test data...') - print('-'*30) - #imgs_test = np.expand_dims(imgs_test,3) - - print ('{0}'.format(np.shape(images))) - print ('{0}'.format(type(images))) + print("-" * 30) + print("Predicting masks on test data...") + print("-" * 30) + # imgs_test = np.expand_dims(imgs_test,3) + print("{0}".format(np.shape(images))) + print("{0}".format(type(images))) - print("Inference images histogram") + print("Inference images histogram") hist, bin_edges = np.histogram(images) print(hist) print(bin_edges) - imgs_mask_test = model.predict(images, batch_size = 1,verbose=1) + imgs_mask_test = model.predict(images, batch_size=1, verbose=1) - print("Inference predictions histogram") + print("Inference predictions histogram") hist, bin_edges = np.histogram(imgs_mask_test) print(hist) print(bin_edges) - - #np.save('mask_predictions.npy', np.squeeze(imgs_mask_test)) - np.save('mask_predictions.npy', np.squeeze(np.round(imgs_mask_test).astype('uint8'))) + + # np.save('mask_predictions.npy', np.squeeze(imgs_mask_test)) + np.save("mask_predictions.npy", + np.squeeze(np.round(imgs_mask_test).astype("uint8"))) # Import relevant modules and functions + import pickle + + from keras.callbacks import ( + Callback, + CSVLogger, + EarlyStopping, + ModelCheckpoint, + ReduceLROnPlateau, + ) + from keras.layers import ( + BatchNormalization, + Conv2D, + Conv2DTranspose, + Dropout, + Input, + MaxPooling2D, + concatenate, + ) from keras.models import Model - from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, Conv2DTranspose, Dropout, BatchNormalization from keras.optimizers import Adam - from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping, CSVLogger, Callback - import pickle # Basically a constant - modelwtsfname = 'model_weights.h5' + modelwtsfname = "model_weights.h5" - if not gParameters['predict']: - print('Training...') + if not gParameters["predict"]: + print("Training...") # Parameters - n_layers = gParameters['nlayers'] - filter_size = gParameters['num_filters'] - dropout = gParameters['dropout'] - activation_func = gParameters['activation'] - conv_size = gParameters['conv_size'] - loss_func = gParameters['loss_func'] - last_activation = gParameters['last_act'] - batch_norm = gParameters['batch_norm'] - learning_rate = float(gParameters['lr']) - images = gParameters['images'] - labels = gParameters['labels'] - batch_size = gParameters['batch_size'] - epochs = gParameters['epochs'] - obj_return = gParameters['obj_return'] - initialize = gParameters['initialize'] - - history_callback = evaluate_params(images, labels, batch_size, epochs, obj_return, initialize, n_layers, filter_size, dropout, activation_func, conv_size, loss_func, last_activation, batch_norm, learning_rate) # note that history_callback is what's 
returned by model.fit() + n_layers = gParameters["nlayers"] + filter_size = gParameters["num_filters"] + dropout = gParameters["dropout"] + activation_func = gParameters["activation"] + conv_size = gParameters["conv_size"] + loss_func = gParameters["loss_func"] + last_activation = gParameters["last_act"] + batch_norm = gParameters["batch_norm"] + learning_rate = float(gParameters["lr"]) + images = gParameters["images"] + labels = gParameters["labels"] + batch_size = gParameters["batch_size"] + epochs = gParameters["epochs"] + obj_return = gParameters["obj_return"] + initialize = gParameters["initialize"] + + history_callback = evaluate_params( + images, + labels, + batch_size, + epochs, + obj_return, + initialize, + n_layers, + filter_size, + dropout, + activation_func, + conv_size, + loss_func, + last_activation, + batch_norm, + learning_rate, + ) # note that history_callback is what's returned by model.fit() print("Minimum validation loss:") print(min(history_callback.history[obj_return])) - #Save the history as pickle object - pickle.dump(history_callback.history, open( "fit_history.p", "wb" ) ) + # Save the history as pickle object + pickle.dump(history_callback.history, open("fit_history.p", "wb")) else: - print('Inferring...') + print("Inferring...") # Parameters - n_layers = gParameters['nlayers'] - filter_size = gParameters['num_filters'] - dropout = gParameters['dropout'] - activation_func = gParameters['activation'] - conv_size = gParameters['conv_size'] - loss_func = gParameters['loss_func'] - last_activation = gParameters['last_act'] - batch_norm = gParameters['batch_norm'] - learning_rate = float(gParameters['lr']) - images = gParameters['images'] - initialize = gParameters['initialize'] - - #It is not necessary to pass masks for prediction, but I am just following the function - #prototype for now. + n_layers = gParameters["nlayers"] + filter_size = gParameters["num_filters"] + dropout = gParameters["dropout"] + activation_func = gParameters["activation"] + conv_size = gParameters["conv_size"] + loss_func = gParameters["loss_func"] + last_activation = gParameters["last_act"] + batch_norm = gParameters["batch_norm"] + learning_rate = float(gParameters["lr"]) + images = gParameters["images"] + initialize = gParameters["initialize"] + + # It is not necessary to pass masks for prediction, but I am just following the function + # prototype for now. 
images = preprocess_images(images) - #Get the images size + # Get the images size img_rows = np.shape(images)[1] img_cols = np.shape(images)[2] - model = get_unet(img_rows, img_cols, n_layers, filter_size, dropout, activation_func, conv_size, loss_func, last_activation, batch_norm, learning_rate) + model = get_unet( + img_rows, + img_cols, + n_layers, + filter_size, + dropout, + activation_func, + conv_size, + loss_func, + last_activation, + batch_norm, + learning_rate, + ) weights = initialize predict(model, weights, images) history_callback = None - + #### End model input ############################################################################################ - - return(history_callback) + + return history_callback + def main(): - print('Running main program...') + print("Running main program...") gParameters = initialize_parameters() run(gParameters) -if __name__ == '__main__': + +if __name__ == "__main__": main() try: K.clear_session() except AttributeError: - pass \ No newline at end of file + pass diff --git a/archives/templates/models/uno.py b/archives/templates/models/uno.py index 8a6ae339..a8c39555 100644 --- a/archives/templates/models/uno.py +++ b/archives/templates/models/uno.py @@ -1,10 +1,10 @@ #! /usr/bin/env python -#Note this file (model.py) is the same as that in Benchmarks/Pilot1/Uno/uno_baseline_keras2.py except with the following change:: +# Note this file (model.py) is the same as that in Benchmarks/Pilot1/Uno/uno_baseline_keras2.py except with the following change:: # -#- unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', -#+ #mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') -#+ unoBmk = benchmark.BenchmarkUno(benchmark.file_path, os.getenv("DEFAULT_PARAMS_FILE"), 'keras', +# - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', +# + #mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') +# + unoBmk = benchmark.BenchmarkUno(benchmark.file_path, os.getenv("DEFAULT_PARAMS_FILE"), 'keras', from __future__ import division, print_function @@ -15,46 +15,50 @@ import random import threading +import keras + +# For non-interactive plotting +import matplotlib as mpl import numpy as np import pandas as pd - -import keras from keras import backend as K from keras import optimizers +from keras.callbacks import ( + Callback, + LearningRateScheduler, + ModelCheckpoint, + ReduceLROnPlateau, + TensorBoard, +) +from keras.layers import Dense, Dropout, Input from keras.models import Model -from keras.layers import Input, Dense, Dropout -from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard from keras.utils import get_custom_objects from keras.utils.vis_utils import plot_model -from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error -from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold from scipy.stats.stats import pearsonr +from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score +from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold -# For non-interactive plotting -import matplotlib as mpl -mpl.use('Agg') +mpl.use("Agg") +import candle_keras as candle import matplotlib.pyplot as plt - import uno as benchmark -import candle_keras as candle - import uno_data -from uno_data import CombinedDataLoader, CombinedDataGenerator - +from uno_data import 
CombinedDataGenerator, CombinedDataLoader logger = logging.getLogger(__name__) -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" def set_seed(seed): - os.environ['PYTHONHASHSEED'] = '0' + os.environ["PYTHONHASHSEED"] = "0" np.random.seed(seed) random.seed(seed) - if K.backend() == 'tensorflow': + if K.backend() == "tensorflow": import tensorflow as tf + tf.set_random_seed(seed) # session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) # sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) @@ -63,7 +67,7 @@ def set_seed(seed): # Uncommit when running on an optimized tensorflow where NUM_INTER_THREADS and # NUM_INTRA_THREADS env vars are set. # session_conf = tf.ConfigProto(inter_op_parallelism_threads=int(os.environ['NUM_INTER_THREADS']), - # intra_op_parallelism_threads=int(os.environ['NUM_INTRA_THREADS'])) + # intra_op_parallelism_threads=int(os.environ['NUM_INTRA_THREADS'])) # sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) # K.set_session(sess) @@ -77,11 +81,13 @@ def verify_path(path): def set_up_logger(logfile, verbose): verify_path(logfile) fh = logging.FileHandler(logfile) - fh.setFormatter(logging.Formatter("[%(asctime)s %(process)d] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")) + fh.setFormatter( + logging.Formatter("[%(asctime)s %(process)d] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S")) fh.setLevel(logging.DEBUG) sh = logging.StreamHandler() - sh.setFormatter(logging.Formatter('')) + sh.setFormatter(logging.Formatter("")) sh.setLevel(logging.DEBUG if verbose else logging.INFO) for log in [logger, uno_data.logger]: @@ -91,37 +97,37 @@ def set_up_logger(logfile, verbose): def extension_from_parameters(args): - """Construct string for saving model with annotation of parameters""" - ext = '' - ext += '.A={}'.format(args.activation) - ext += '.B={}'.format(args.batch_size) - ext += '.E={}'.format(args.epochs) - ext += '.O={}'.format(args.optimizer) + """Construct string for saving model with annotation of parameters.""" + ext = "" + ext += ".A={}".format(args.activation) + ext += ".B={}".format(args.batch_size) + ext += ".E={}".format(args.epochs) + ext += ".O={}".format(args.optimizer) # ext += '.LEN={}'.format(args.maxlen) - ext += '.LR={}'.format(args.learning_rate) - ext += '.CF={}'.format(''.join([x[0] for x in sorted(args.cell_features)])) - ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) + ext += ".LR={}".format(args.learning_rate) + ext += ".CF={}".format("".join([x[0] for x in sorted(args.cell_features)])) + ext += ".DF={}".format("".join([x[0] for x in sorted(args.drug_features)])) if args.feature_subsample > 0: - ext += '.FS={}'.format(args.feature_subsample) + ext += ".FS={}".format(args.feature_subsample) if args.drop > 0: - ext += '.DR={}'.format(args.drop) + ext += ".DR={}".format(args.drop) if args.warmup_lr: - ext += '.wu_lr' + ext += ".wu_lr" if args.reduce_lr: - ext += '.re_lr' + ext += ".re_lr" if args.residual: - ext += '.res' + ext += ".res" if args.use_landmark_genes: - ext += '.L1000' + ext += ".L1000" if args.no_gen: - ext += '.ng' + ext += ".ng" for i, n in enumerate(args.dense): if n > 0: - ext += '.D{}={}'.format(i+1, n) + ext += ".D{}={}".format(i + 1, n) if args.dense_feature_layers != args.dense: for i, n in enumerate(args.dense): if n > 0: - ext += '.FD{}={}'.format(i+1, n) + ext += ".FD{}={}".format(i + 1, n) return ext @@ -134,9 +140,9 @@ def discretize(y, bins=5): def r2(y_true, y_pred): - SS_res = 
K.sum(K.square(y_true - y_pred)) + SS_res = K.sum(K.square(y_true - y_pred)) SS_tot = K.sum(K.square(y_true - K.mean(y_true))) - return (1 - SS_res/(SS_tot + K.epsilon())) + return 1 - SS_res / (SS_tot + K.epsilon()) def mae(y_true, y_pred): @@ -148,56 +154,63 @@ def evaluate_prediction(y_true, y_pred): mae = mean_absolute_error(y_true, y_pred) r2 = r2_score(y_true, y_pred) corr, _ = pearsonr(y_true, y_pred) - return {'mse': mse, 'mae': mae, 'r2': r2, 'corr': corr} + return {"mse": mse, "mae": mae, "r2": r2, "corr": corr} -def log_evaluation(metric_outputs, description='Comparing y_true and y_pred:'): +def log_evaluation(metric_outputs, description="Comparing y_true and y_pred:"): logger.info(description) for metric, value in metric_outputs.items(): - logger.info(' {}: {:.4f}'.format(metric, value)) + logger.info(" {}: {:.4f}".format(metric, value)) -def plot_history(out, history, metric='loss', title=None): - title = title or 'model {}'.format(metric) - val_metric = 'val_{}'.format(metric) +def plot_history(out, history, metric="loss", title=None): + title = title or "model {}".format(metric) + val_metric = "val_{}".format(metric) plt.figure(figsize=(8, 6)) - plt.plot(history.history[metric], marker='o') - plt.plot(history.history[val_metric], marker='d') + plt.plot(history.history[metric], marker="o") + plt.plot(history.history[val_metric], marker="d") plt.title(title) plt.ylabel(metric) - plt.xlabel('epoch') - plt.legend(['train_{}'.format(metric), 'val_{}'.format(metric)], loc='upper center') - png = '{}.plot.{}.png'.format(out, metric) - plt.savefig(png, bbox_inches='tight') + plt.xlabel("epoch") + plt.legend(["train_{}".format(metric), "val_{}".format(metric)], + loc="upper center") + png = "{}.plot.{}.png".format(out, metric) + plt.savefig(png, bbox_inches="tight") class LoggingCallback(Callback): + def __init__(self, print_fcn=print): Callback.__init__(self) self.print_fcn = print_fcn def on_epoch_end(self, epoch, logs={}): - msg = "[Epoch: %i] %s" % (epoch, ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items()))) + msg = "[Epoch: %i] %s" % ( + epoch, + ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items())), + ) self.print_fcn(msg) class PermanentDropout(Dropout): + def __init__(self, rate, **kwargs): super(PermanentDropout, self).__init__(rate, **kwargs) self.uses_learning_phase = False def call(self, x, mask=None): - if 0. 
< self.rate < 1.: + if 0.0 < self.rate < 1.0: noise_shape = self._get_noise_shape(x) x = K.dropout(x, self.rate, noise_shape) return x class ModelRecorder(Callback): + def __init__(self, save_all_models=False): Callback.__init__(self) self.save_all_models = save_all_models - get_custom_objects()['PermanentDropout'] = PermanentDropout + get_custom_objects()["PermanentDropout"] = PermanentDropout def on_train_begin(self, logs={}): self.val_losses = [] @@ -205,16 +218,22 @@ def on_train_begin(self, logs={}): self.best_model = None def on_epoch_end(self, epoch, logs={}): - val_loss = logs.get('val_loss') + val_loss = logs.get("val_loss") self.val_losses.append(val_loss) if val_loss < self.best_val_loss: self.best_model = keras.models.clone_model(self.model) self.best_val_loss = val_loss -def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], - activation='relu', residual=False, - dropout_rate=0, permanent_dropout=True): +def build_feature_model( + input_shape, + name="", + dense_layers=[1000, 1000], + activation="relu", + residual=False, + dropout_rate=0, + permanent_dropout=True, +): x_input = Input(shape=input_shape) h = x_input for i, layer in enumerate(dense_layers): @@ -238,13 +257,17 @@ def build_model(loader, args, permanent_dropout=True, silent=False): input_models = {} dropout_rate = args.drop for fea_type, shape in loader.feature_shapes.items(): - base_type = fea_type.split('.')[0] - if base_type in ['cell', 'drug']: - box = build_feature_model(input_shape=shape, name=fea_type, - dense_layers=args.dense_feature_layers, - dropout_rate=dropout_rate, permanent_dropout=permanent_dropout) + base_type = fea_type.split(".")[0] + if base_type in ["cell", "drug"]: + box = build_feature_model( + input_shape=shape, + name=fea_type, + dense_layers=args.dense_feature_layers, + dropout_rate=dropout_rate, + permanent_dropout=permanent_dropout, + ) if not silent: - logger.debug('Feature encoding submodel for %s:', fea_type) + logger.debug("Feature encoding submodel for %s:", fea_type) box.summary(print_fn=logger.debug) input_models[fea_type] = box @@ -252,7 +275,7 @@ def build_model(loader, args, permanent_dropout=True, silent=False): encoded_inputs = [] for fea_name, fea_type in loader.input_features.items(): shape = loader.feature_shapes[fea_type] - fea_input = Input(shape, name='input.'+fea_name) + fea_input = Input(shape, name="input." 
+ fea_name) inputs.append(fea_input) if fea_type in input_models: input_model = input_models[fea_type] @@ -285,18 +308,25 @@ def build_model(loader, args, permanent_dropout=True, silent=False): def initialize_parameters(): # Build benchmark object - #mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, os.getenv("DEFAULT_PARAMS_FILE"), 'keras', - prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') - + # mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') + unoBmk = benchmark.BenchmarkUno( + benchmark.file_path, + os.getenv("DEFAULT_PARAMS_FILE"), + "keras", + prog="uno_baseline", + desc= + "Build neural network based models to predict tumor response to single and paired drugs.", + ) + # Initialize parameters gParameters = candle.initialize_parameters(unoBmk) - #benchmark.logger.info('Params: {}'.format(gParameters)) + # benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters class Struct: + def __init__(self, **entries): self.__dict__.update(entries) @@ -307,79 +337,106 @@ def run(params): ext = extension_from_parameters(args) verify_path(args.save) prefix = args.save + ext - logfile = args.logfile if args.logfile else prefix+'.log' + logfile = args.logfile if args.logfile else prefix + ".log" set_up_logger(logfile, args.verbose) - logger.info('Params: {}'.format(params)) + logger.info("Params: {}".format(params)) loader = CombinedDataLoader(seed=args.rng_seed) - loader.load(cache=args.cache, - ncols=args.feature_subsample, - cell_features=args.cell_features, - drug_features=args.drug_features, - drug_median_response_min=args.drug_median_response_min, - drug_median_response_max=args.drug_median_response_max, - use_landmark_genes=args.use_landmark_genes, - use_filtered_genes=args.use_filtered_genes, - preprocess_rnaseq=args.preprocess_rnaseq, - single=args.single, - train_sources=args.train_sources, - test_sources=args.test_sources, - embed_feature_source=not args.no_feature_source, - encode_response_source=not args.no_response_source, - ) + loader.load( + cache=args.cache, + ncols=args.feature_subsample, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + ) val_split = args.validation_split train_split = 1 - val_split if args.export_data: fname = args.export_data - loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, - cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug) - train_gen = CombinedDataGenerator(loader, batch_size=args.batch_size, shuffle=args.shuffle) - val_gen = CombinedDataGenerator(loader, partition='val', batch_size=args.batch_size, shuffle=args.shuffle) - x_train_list, y_train = train_gen.get_slice(size=train_gen.size, dataframe=True, single=args.single) - x_val_list, y_val = val_gen.get_slice(size=val_gen.size, dataframe=True, single=args.single) + loader.partition_data( + cv_folds=args.cv, + 
train_split=train_split, + val_split=val_split, + cell_types=args.cell_types, + by_cell=args.by_cell, + by_drug=args.by_drug, + ) + train_gen = CombinedDataGenerator(loader, + batch_size=args.batch_size, + shuffle=args.shuffle) + val_gen = CombinedDataGenerator(loader, + partition="val", + batch_size=args.batch_size, + shuffle=args.shuffle) + x_train_list, y_train = train_gen.get_slice(size=train_gen.size, + dataframe=True, + single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, + dataframe=True, + single=args.single) df_train = pd.concat([y_train] + x_train_list, axis=1) df_val = pd.concat([y_val] + x_val_list, axis=1) df = pd.concat([df_train, df_val]).reset_index(drop=True) if args.growth_bins > 1: - df = uno_data.discretize(df, 'Growth', bins=args.growth_bins) - df.to_csv(fname, sep='\t', index=False, float_format="%.3g") + df = uno_data.discretize(df, "Growth", bins=args.growth_bins) + df.to_csv(fname, sep="\t", index=False, float_format="%.3g") return - loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, - cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug) + loader.partition_data( + cv_folds=args.cv, + train_split=train_split, + val_split=val_split, + cell_types=args.cell_types, + by_cell=args.by_cell, + by_drug=args.by_drug, + ) model = build_model(loader, args) - logger.info('Combined model:') + logger.info("Combined model:") model.summary(print_fn=logger.info) # plot_model(model, to_file=prefix+'.model.png', show_shapes=True) if args.cp: model_json = model.to_json() - with open(prefix+'.model.json', 'w') as f: + with open(prefix + ".model.json", "w") as f: print(model_json, file=f) def warmup_scheduler(epoch): - lr = args.learning_rate or base_lr * args.batch_size/100 + lr = args.learning_rate or base_lr * args.batch_size / 100 if epoch <= 5: - K.set_value(model.optimizer.lr, (base_lr * (5-epoch) + lr * epoch) / 5) - logger.debug('Epoch {}: lr={:.5g}'.format(epoch, K.get_value(model.optimizer.lr))) + K.set_value(model.optimizer.lr, + (base_lr * (5 - epoch) + lr * epoch) / 5) + logger.debug("Epoch {}: lr={:.5g}".format( + epoch, K.get_value(model.optimizer.lr))) return K.get_value(model.optimizer.lr) df_pred_list = [] - cv_ext = '' + cv_ext = "" cv = args.cv if args.cv > 1 else 1 for fold in range(cv): if args.cv > 1: - logger.info('Cross validation fold {}/{}:'.format(fold+1, cv)) - cv_ext = '.cv{}'.format(fold+1) + logger.info("Cross validation fold {}/{}:".format(fold + 1, cv)) + cv_ext = ".cv{}".format(fold + 1) model = build_model(loader, args, silent=True) - optimizer = optimizers.deserialize({'class_name': args.optimizer, 'config': {}}) + optimizer = optimizers.deserialize({ + "class_name": args.optimizer, + "config": {} + }) base_lr = args.base_lr or K.get_value(optimizer.lr) if args.learning_rate: K.set_value(optimizer.lr, args.learning_rate) @@ -390,17 +447,24 @@ def warmup_scheduler(epoch): params.update(candle.compute_trainable_params(model)) candle_monitor = candle.CandleRemoteMonitor(params=params) - timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + timeout_monitor = candle.TerminateOnTimeOut(params["timeout"]) - reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) + reduce_lr = ReduceLROnPlateau(monitor="val_loss", + factor=0.5, + patience=5, + min_lr=0.00001) warmup_lr = LearningRateScheduler(warmup_scheduler) - checkpointer = ModelCheckpoint(prefix+cv_ext+'.weights.h5', save_best_only=True, save_weights_only=True) + checkpointer = 
ModelCheckpoint(prefix + cv_ext + ".weights.h5", + save_best_only=True, + save_weights_only=True) tensorboard = TensorBoard(log_dir="tb/tb{}{}".format(ext, cv_ext)) history_logger = LoggingCallback(logger.debug) model_recorder = ModelRecorder() # callbacks = [history_logger, model_recorder] - callbacks = [candle_monitor, timeout_monitor, history_logger, model_recorder] + callbacks = [ + candle_monitor, timeout_monitor, history_logger, model_recorder + ] if args.reduce_lr: callbacks.append(reduce_lr) if args.warmup_lr: @@ -410,41 +474,66 @@ def warmup_scheduler(epoch): if args.tb: callbacks.append(tensorboard) - train_gen = CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle) - val_gen = CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle) + train_gen = CombinedDataGenerator(loader, + fold=fold, + batch_size=args.batch_size, + shuffle=args.shuffle) + val_gen = CombinedDataGenerator( + loader, + partition="val", + fold=fold, + batch_size=args.batch_size, + shuffle=args.shuffle, + ) df_val = val_gen.get_response(copy=True) - y_val = df_val['Growth'].values + y_val = df_val["Growth"].values y_shuf = np.random.permutation(y_val) - log_evaluation(evaluate_prediction(y_val, y_shuf), - description='Between random pairs in y_val:') + log_evaluation( + evaluate_prediction(y_val, y_shuf), + description="Between random pairs in y_val:", + ) if args.no_gen: - x_train_list, y_train = train_gen.get_slice(size=train_gen.size, single=args.single) - x_val_list, y_val = val_gen.get_slice(size=val_gen.size, single=args.single) - history = model.fit(x_train_list, y_train, - batch_size=args.batch_size, - epochs=args.epochs, - callbacks=callbacks, - validation_data=(x_val_list, y_val)) + x_train_list, y_train = train_gen.get_slice(size=train_gen.size, + single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, + single=args.single) + history = model.fit( + x_train_list, + y_train, + batch_size=args.batch_size, + epochs=args.epochs, + callbacks=callbacks, + validation_data=(x_val_list, y_val), + ) else: - logger.info('Data points per epoch: train = %d, val = %d',train_gen.size, val_gen.size) - logger.info('Steps per epoch: train = %d, val = %d',train_gen.steps, val_gen.steps) - history = model.fit_generator(train_gen.flow(single=args.single), train_gen.steps, - epochs=args.epochs, - callbacks=callbacks, - validation_data=val_gen.flow(single=args.single), - validation_steps=val_gen.steps) + logger.info( + "Data points per epoch: train = %d, val = %d", + train_gen.size, + val_gen.size, + ) + logger.info("Steps per epoch: train = %d, val = %d", + train_gen.steps, val_gen.steps) + history = model.fit_generator( + train_gen.flow(single=args.single), + train_gen.steps, + epochs=args.epochs, + callbacks=callbacks, + validation_data=val_gen.flow(single=args.single), + validation_steps=val_gen.steps, + ) if args.cp: - model.load_weights(prefix+cv_ext+'.weights.h5') + model.load_weights(prefix + cv_ext + ".weights.h5") # model = model_recorder.best_model if args.no_gen: y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) else: val_gen.reset() - y_val_pred = model.predict_generator(val_gen.flow(single=args.single), val_gen.steps) + y_val_pred = model.predict_generator( + val_gen.flow(single=args.single), val_gen.steps) y_val_pred = y_val_pred[:val_gen.size] y_val_pred = y_val_pred.flatten() @@ -452,39 +541,52 @@ def warmup_scheduler(epoch): scores = evaluate_prediction(y_val, y_val_pred) 
log_evaluation(scores) - df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred-y_val) + df_val = df_val.assign(PredictedGrowth=y_val_pred, + GrowthError=y_val_pred - y_val) df_pred_list.append(df_val) - plot_history(prefix, history, 'loss') - plot_history(prefix, history, 'r2') + plot_history(prefix, history, "loss") + plot_history(prefix, history, "r2") - pred_fname = prefix + '.predicted.tsv' + pred_fname = prefix + ".predicted.tsv" df_pred = pd.concat(df_pred_list) - df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) - df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + df_pred.sort_values( + ["Source", "Sample", "Drug1", "Drug2", "Dose1", "Dose2", "Growth"], + inplace=True) + df_pred.to_csv(pred_fname, sep="\t", index=False, float_format="%.4g") if args.cv > 1: - scores = evaluate_prediction(df_pred['Growth'], df_pred['PredictedGrowth']) - log_evaluation(scores, description='Combining cross validation folds:') + scores = evaluate_prediction(df_pred["Growth"], + df_pred["PredictedGrowth"]) + log_evaluation(scores, description="Combining cross validation folds:") for test_source in loader.test_sep_sources: - test_gen = CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size, source=test_source) + test_gen = CombinedDataGenerator(loader, + partition="test", + batch_size=args.batch_size, + source=test_source) df_test = test_gen.get_response(copy=True) - y_test = df_test['Growth'].values + y_test = df_test["Growth"].values n_test = len(y_test) if n_test == 0: continue if args.no_gen: - x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + x_test_list, y_test = test_gen.get_slice(size=test_gen.size, + single=args.single) y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) else: - y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) + y_test_pred = model.predict_generator( + test_gen.flow(single=args.single), test_gen.steps) y_test_pred = y_test_pred[:test_gen.size] y_test_pred = y_test_pred.flatten() scores = evaluate_prediction(y_test, y_test_pred) - log_evaluation(scores, description='Testing on data from {} ({})'.format(test_source, n_test)) + log_evaluation( + scores, + description="Testing on data from {} ({})".format( + test_source, n_test), + ) - if K.backend() == 'tensorflow': + if K.backend() == "tensorflow": K.clear_session() logger.handlers = [] @@ -497,7 +599,7 @@ def main(): run(params) -if __name__ == '__main__': +if __name__ == "__main__": main() - if K.backend() == 'tensorflow': + if K.backend() == "tensorflow": K.clear_session() diff --git a/archives/templates/models/wrapper_compliant/mnist_mlp.py b/archives/templates/models/wrapper_compliant/mnist_mlp.py index c059d1c4..06f11171 100644 --- a/archives/templates/models/wrapper_compliant/mnist_mlp.py +++ b/archives/templates/models/wrapper_compliant/mnist_mlp.py @@ -1,45 +1,46 @@ # Run the wrapper_connector script, which (1) appends $SUPP_PYTHONPATH to the Python environment if it's defined and (2) defines the function for loading the hyperparameters -import sys, os -sys.path.append(os.getenv("CANDLE")+'/Supervisor/templates/scripts') +import os +import sys + +sys.path.append(os.getenv("CANDLE") + "/Supervisor/templates/scripts") import wrapper_connector -gParameters = wrapper_connector.load_params('params.json') -################ ADD MODEL BELOW USING gParameters DICTIONARY AS CURRENT HYPERPARAMETER SET; DO NOT MODIFY ABOVE 
####################################### +gParameters = wrapper_connector.load_params("params.json") +################ ADD MODEL BELOW USING gParameters DICTIONARY AS CURRENT HYPERPARAMETER SET; DO NOT MODIFY ABOVE ####################################### ########################################## # Your DL start here. See mnist_mlp.py # ########################################## -'''Trains a simple deep NN on the MNIST dataset. +# Trains a simple deep NN on the MNIST dataset. -Gets to 98.40% test accuracy after 20 epochs -(there is *a lot* of margin for parameter tuning). -2 seconds per epoch on a K520 GPU. -''' +# Gets to 98.40% test accuracy after 20 epochs +# (there is *a lot* of margin for parameter tuning). +# 2 seconds per epoch on a K520 GPU. import keras from keras.datasets import mnist -from keras.models import Sequential from keras.layers import Dense, Dropout +from keras.models import Sequential from keras.optimizers import RMSprop -batch_size = gParameters['batch_size'] +batch_size = gParameters["batch_size"] num_classes = 10 -epochs = gParameters['epochs'] +epochs = gParameters["epochs"] -activation = gParameters['activation'] -optimizer = gParameters['optimizer'] +activation = gParameters["activation"] +optimizer = gParameters["optimizer"] # the data, split between train and test sets (x_train, y_train), (x_test, y_test) = mnist.load_data() x_train = x_train.reshape(60000, 784) x_test = x_test.reshape(10000, 784) -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') +x_train = x_train.astype("float32") +x_test = x_test.astype("float32") x_train /= 255 x_test /= 255 -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') +print(x_train.shape[0], "train samples") +print(x_test.shape[0], "test samples") # convert class vectors to binary class matrices y_train = keras.utils.to_categorical(y_train, num_classes) @@ -50,36 +51,43 @@ model.add(Dropout(0.2)) model.add(Dense(512, activation=activation)) model.add(Dropout(0.2)) -model.add(Dense(num_classes, activation='softmax')) +model.add(Dense(num_classes, activation="softmax")) model.summary() -model.compile(loss='categorical_crossentropy', - optimizer=optimizer, - metrics=['accuracy']) +model.compile(loss="categorical_crossentropy", + optimizer=optimizer, + metrics=["accuracy"]) -history = model.fit(x_train, y_train, - batch_size=batch_size, - epochs=epochs, - verbose=1, - validation_data=(x_test, y_test)) +history = model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(x_test, y_test), +) score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) +print("Test loss:", score[0]) +print("Test accuracy:", score[1]) ########################################## # End of mnist_mlp.py #################### ########################################## - ################ ADD MODEL ABOVE USING gParameters DICTIONARY AS CURRENT HYPERPARAMETER SET; DO NOT MODIFY BELOW ####################################### # Ensure that above you DEFINE the history object (as in, e.g., the return value of model.fit()) or val_to_return (a single number) in your model; below we essentially RETURN those values -try: history +try: + history except NameError: - try: val_to_return + try: + val_to_return except NameError: - print("Error: Neither a history object nor a val_to_return variable was defined upon running the model on the current hyperparameter set; exiting") + print( + "Error: Neither a history object nor a 
val_to_return variable was defined upon running the model on the current hyperparameter set; exiting" + ) exit else: - wrapper_connector.write_history_from_value(val_to_return, 'val_to_return.json') + wrapper_connector.write_history_from_value(val_to_return, + "val_to_return.json") else: - wrapper_connector.write_history(history, 'val_to_return.json') \ No newline at end of file + wrapper_connector.write_history(history, "val_to_return.json") diff --git a/archives/templates/run_without_candle.sh b/archives/templates/run_without_candle.sh index f4d5f170..84e5f3cf 100755 --- a/archives/templates/run_without_candle.sh +++ b/archives/templates/run_without_candle.sh @@ -17,4 +17,4 @@ module load python/3.6 export DEFAULT_PARAMS_FILE="$CANDLE/Supervisor/templates/model_params/mnist1.txt" # Run the model -python $CANDLE/Supervisor/templates/models/mnist/mnist_mlp.py \ No newline at end of file +python $CANDLE/Supervisor/templates/models/mnist/mnist_mlp.py diff --git a/archives/templates/scripts/candle_compliant_wrapper.py b/archives/templates/scripts/candle_compliant_wrapper.py index 815eb833..d426deff 100644 --- a/archives/templates/scripts/candle_compliant_wrapper.py +++ b/archives/templates/scripts/candle_compliant_wrapper.py @@ -1,59 +1,91 @@ # This file should generally follow the standard CANDLE-compliance procedure + def initialize_parameters(): # Add the candle_keras library to the Python path - import sys, os - sys.path.append(os.getenv("CANDLE")+'/Candle/common') + import os + import sys + + sys.path.append(os.getenv("CANDLE") + "/Candle/common") # Instantiate the Benchmark class (the values of the prog and desc parameters don't really matter) import candle_keras as candle - mymodel_common = candle.Benchmark(os.path.dirname(os.path.realpath(__file__)), os.getenv("DEFAULT_PARAMS_FILE"), 'keras', prog='myprogram', desc='My CANDLE example') + + mymodel_common = candle.Benchmark( + os.path.dirname(os.path.realpath(__file__)), + os.getenv("DEFAULT_PARAMS_FILE"), + "keras", + prog="myprogram", + desc="My CANDLE example", + ) # Read the parameters (in a dictionary format) pointed to by the environment variable DEFAULT_PARAMS_FILE gParameters = candle.initialize_parameters(mymodel_common) # Return this dictionary of parameters - return(gParameters) + return gParameters + def run(gParameters): # Define the dummy history class; defining it here to keep this file aligned with the standard CANDLE-compliance procedure class HistoryDummy: + def __init__(self, mynum): - self.history = {'val_loss': [mynum], 'val_corr': [mynum], 'val_dice_coef': [mynum]} + self.history = { + "val_loss": [mynum], + "val_corr": [mynum], + "val_dice_coef": [mynum], + } # Reformat a value that doesn't have an analogous field in the JSON format - gParameters['datatype'] = str(gParameters['datatype']) + gParameters["datatype"] = str(gParameters["datatype"]) # Write the current set of hyperparameters to a JSON file import json - with open('params.json', 'w') as outfile: + + with open("params.json", "w") as outfile: json.dump(gParameters, outfile) # Run the wrapper script model_wrapper.sh where the environment is defined and the model (whether in Python or R) is called - myfile = open('subprocess_out_and_err.txt','w') - import subprocess, os - print('Starting run of model_wrapper.sh from candle_compliant_wrapper.py...') - subprocess.run(['bash', os.getenv("CANDLE")+'/Supervisor/templates/scripts/model_wrapper.sh'], stdout=myfile, stderr=subprocess.STDOUT) - print('Finished run of model_wrapper.sh from 
candle_compliant_wrapper.py') + myfile = open("subprocess_out_and_err.txt", "w") + import os + import subprocess + + print( + "Starting run of model_wrapper.sh from candle_compliant_wrapper.py...") + subprocess.run( + [ + "bash", + os.getenv("CANDLE") + + "/Supervisor/templates/scripts/model_wrapper.sh", + ], + stdout=myfile, + stderr=subprocess.STDOUT, + ) + print("Finished run of model_wrapper.sh from candle_compliant_wrapper.py") myfile.close() # Read in the history.history dictionary containing the result from the JSON file created by the model history = HistoryDummy(4444) import json - with open('val_to_return.json') as infile: + + with open("val_to_return.json") as infile: history.history = json.load(infile) - return(history) - + return history + + def main(): gParameters = initialize_parameters() run(gParameters) -if __name__ == '__main__': + +if __name__ == "__main__": main() try: from keras import backend as K + K.clear_session() except AttributeError: - pass \ No newline at end of file + pass diff --git a/archives/templates/scripts/copy_candle_template b/archives/templates/scripts/copy_candle_template index 46137fdf..29046a1e 100755 --- a/archives/templates/scripts/copy_candle_template +++ b/archives/templates/scripts/copy_candle_template @@ -33,4 +33,4 @@ if [ "a$ret1" == "a0" ] && [ "a$ret2" == "a0" ]; then echo -e " (2) First modify $submission_script using https://cbiit.github.io/fnlcr-bids-hpc/documentation/candle/how_to_modify_the_candle_templates as a guide and then submit your own CANDLE job by running './$submission_script' (no 'sbatch' needed)\n" else echo -e "\nAn error occurred; see error message(s) above\n" -fi \ No newline at end of file +fi diff --git a/archives/templates/scripts/copy_candle_template-new b/archives/templates/scripts/copy_candle_template-new index 79960adb..8effbdf0 100755 --- a/archives/templates/scripts/copy_candle_template-new +++ b/archives/templates/scripts/copy_candle_template-new @@ -33,4 +33,4 @@ if [ "a$ret1" == "a0" ] && [ "a$ret2" == "a0" ]; then echo -e " (2) First modify $submission_script using https://cbiit.github.io/fnlcr-bids-hpc/documentation/candle/how_to_modify_the_candle_templates as a guide and then submit your own CANDLE job by running './$submission_script' (no 'sbatch' needed)\n" else echo -e "\nAn error occurred; see error message(s) above\n" -fi \ No newline at end of file +fi diff --git a/archives/templates/scripts/model_wrapper.sh b/archives/templates/scripts/model_wrapper.sh index e31e0300..78903174 100644 --- a/archives/templates/scripts/model_wrapper.sh +++ b/archives/templates/scripts/model_wrapper.sh @@ -43,4 +43,4 @@ elif [ "x$suffix" == "xr" ]; then fi # Display timing information -echo "MODEL_WRAPPER.SH END TIME: $(date +%s)" \ No newline at end of file +echo "MODEL_WRAPPER.SH END TIME: $(date +%s)" diff --git a/archives/templates/scripts/restart.py b/archives/templates/scripts/restart.py index 9a5b89c3..7b791fa9 100644 --- a/archives/templates/scripts/restart.py +++ b/archives/templates/scripts/restart.py @@ -1,9 +1,9 @@ -import os import datetime -import pandas as pd -import numpy as np import json +import os +import numpy as np +import pandas as pd result_file = "result.txt" params_log = "params.json" @@ -11,55 +11,59 @@ objective_str = "objective" eval_dir = "eval_dir" config_json = "configuration.json" -TIME_FORMAT='%Y-%m-%d %H:%M:%S' +TIME_FORMAT = "%Y-%m-%d %H:%M:%S" start = "start_time" stop = "stop_time" -eval_key = 'id' +eval_key = "id" exp_dir = "EXPERIMENTS" upf_space = "WORKFLOW_SETTINGS_FILE" - + 
def grep(model_log): """ Parse the log file to generate the start and stop times - Arguments: + Arguments: model_log: filepath The log file for the evaluation returns: dict Dictionary with start and stop times. - + """ import subprocess global TIME_FORMAT global start - global stop - - output = subprocess.check_output(['grep', '-E', "RUN START|RUN STOP", model_log]) + global stop + + output = subprocess.check_output( + ["grep", "-E", "RUN START|RUN STOP", model_log]) lines = output.decode("utf-8") result = {} - for line in lines.split('\n'): - idx = line.find(' __main') + for line in lines.split("\n"): + idx = line.find(" __main") if idx != -1: ts = line[0:idx] dt = datetime.datetime.strptime(ts, TIME_FORMAT).timestamp() - if line.endswith('START'): + if line.endswith("START"): result[start] = dt else: result[stop] = dt - + return result + def get_immediate_subdirectories(a_dir): - return [name for name in os.listdir(a_dir) - if os.path.isdir(os.path.join(a_dir, name))] + return [ + name for name in os.listdir(a_dir) + if os.path.isdir(os.path.join(a_dir, name)) + ] def get_successful_evaluations(all_eval): """ Returns a data frame with the evaluations that run successfully only Arguments - all_eval: dataframe + all_eval: dataframe Dataframe that includes all evaluations Returns: @@ -67,49 +71,54 @@ def get_successful_evaluations(all_eval): """ global objective_str - #For now return all evaluations that a result value. + # For now return all evaluations that a result value. u = ~all_eval[objective_str].isnull() - return all_eval[u] + return all_eval[u] + def get_remaining_evaluations(upf_file, all_eval): - """ - Generate a upf file with that contains all the evaluations that did not - complete successuflly - + """Generate a upf file with that contains all the evaluations that did not + complete successuflly. 
+ Arguments: - upf_file: filename + upf_file: filename The orignial file that contains the parameter space all_eval: dataframe The dataframe that has attemped simulation parameters - + Return: str A str that contains information the upf info for the configuration that did not complete """ - #Read and parse the originla upf + # Read and parse the originla upf global eval_key if os.path.exists(upf_file): - with open(upf_file, 'r') as upf: - upf_str = upf.read() - else: + with open(upf_file, "r") as upf: + upf_str = upf.read() + else: raise Exception("The upf file {} does not exist".format(upf_file)) - #parse the upf string to a list of dictionaries - lines = upf_str.split('\n') - lines = [l for l in lines if l.strip() != ''] + # parse the upf string to a list of dictionaries + lines = upf_str.split("\n") + lines = [l for l in lines if l.strip() != ""] params = [] for configuration in lines: params.append(eval(configuration)) - + total_ids = set([x[eval_key] for x in params]) - success_eval_df = get_successful_evaluations(all_eval) + success_eval_df = get_successful_evaluations(all_eval) success_ids = set(success_eval_df[eval_key].tolist()) remaining_ids = total_ids.difference(success_ids) - new_upf = [json.dumps(config) for config in params if config[eval_key] in remaining_ids] + new_upf = [ + json.dumps(config) + for config in params + if config[eval_key] in remaining_ids + ] return "\n".join(new_upf) + def all_runs_log(exp_dir): """ Gather information about all the runs in an experiment @@ -117,24 +126,25 @@ def all_runs_log(exp_dir): exp_dir: str Path to the experiment directory - Returns: Dataframe - Every evaluation will occupy a row + Returns: Dataframe + Every evaluation will occupy a row """ eval_list = [] - launch_dirs = get_immediate_subdirectories(exp_dir) - for launch in launch_dirs: - run_dir = os.path.join(exp_dir, launch, "run") - #print(run_dir) + launch_dirs = get_immediate_subdirectories(exp_dir) + for launch in launch_dirs: + run_dir = os.path.join(exp_dir, launch, "run") + # print(run_dir) eval_dirs = get_immediate_subdirectories(run_dir) for evaluation in eval_dirs: eval_dir = os.path.join(run_dir, evaluation) eval_dic = single_evaluation_log(eval_dir) - eval_list.append(pd.Series(eval_dic, index = eval_dic.keys())) + eval_list.append(pd.Series(eval_dic, index=eval_dic.keys())) df = pd.DataFrame(eval_list) return df + def single_evaluation_log(evaluation_dir): """ Checks if the an evaluation is successful and generate evaluation parameters @@ -146,38 +156,38 @@ def single_evaluation_log(evaluation_dir): Dictionary with all the parameters of the evaluation and the objective value """ - global result_file - global params_log - global eval_log - global objective_str + global result_file + global params_log + global eval_log + global objective_str global eval_dir - global config_json + global config_json eval_dic = {} - #See if evaluation completed successfully if resutls.txt contains a float + # See if evaluation completed successfully if resutls.txt contains a float result_path = os.path.join(evaluation_dir, result_file) if not os.path.exists(result_path): obj_value = np.nan else: - with open(result_path,mode='r') as result: + with open(result_path, mode="r") as result: obj_str = result.read() try: obj_value = float(obj_str) except Exception as e: - obj_value = np.nan + obj_value = np.nan eval_dic[objective_str] = obj_value - #Read the parameters dictionary + # Read the parameters dictionary params_path = os.path.join(evaluation_dir, params_log) if 
os.path.exists(params_path): - with open(params_path, 'r') as f: + with open(params_path, "r") as f: model_params = json.load(f) eval_dic.update(model_params) - #Read the timing metadata + # Read the timing metadata model_log = os.path.join(evaluation_dir, eval_log) if os.path.exists(model_log): timing_dic = grep(model_log) @@ -185,14 +195,17 @@ def single_evaluation_log(evaluation_dir): return eval_dic + if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description = 'Restart a UPF experiment') - parser.add_argument('submit_args', help='The biowulf submission configuration') + parser = argparse.ArgumentParser(description="Restart a UPF experiment") + + parser.add_argument("submit_args", + help="The biowulf submission configuration") args = parser.parse_args() - with open(args.submit_args) as json_file: + with open(args.submit_args) as json_file: config_json = json.load(json_file) experiment = config_json[exp_dir] @@ -201,4 +214,4 @@ def single_evaluation_log(evaluation_dir): status = all_runs_log(experiment) new_upf = get_remaining_evaluations(upf_file, status) if new_upf != "": - print(new_upf) \ No newline at end of file + print(new_upf) diff --git a/archives/templates/scripts/run_without_candle.sh b/archives/templates/scripts/run_without_candle.sh index 4b026f81..c79adc26 100755 --- a/archives/templates/scripts/run_without_candle.sh +++ b/archives/templates/scripts/run_without_candle.sh @@ -12,4 +12,4 @@ #SBATCH --job-name=mnist_test_no_candle export USE_CANDLE=0 -./submit_candle_job.sh \ No newline at end of file +./submit_candle_job.sh diff --git a/archives/templates/scripts/run_workflows.sh b/archives/templates/scripts/run_workflows.sh index c040d4d0..014cf29f 100755 --- a/archives/templates/scripts/run_workflows.sh +++ b/archives/templates/scripts/run_workflows.sh @@ -109,4 +109,4 @@ if [ "${USE_CANDLE:-1}" -eq 1 ]; then # ...otherwise, run the wrapper alone, outside of CANDLE else python "$MODEL_PYTHON_DIR/$MODEL_PYTHON_SCRIPT.py" -fi \ No newline at end of file +fi diff --git a/archives/templates/scripts/submit_candle_job.sh b/archives/templates/scripts/submit_candle_job.sh index 80a6a444..566db7aa 100755 --- a/archives/templates/scripts/submit_candle_job.sh +++ b/archives/templates/scripts/submit_candle_job.sh @@ -32,4 +32,4 @@ export USE_CANDLE=1 # if not already set, as in e.g. 
by run_without_candle.sh, s ################ MODIFY ONLY ABOVE; DO NOT MODIFY BELOW #################################################################### -$CANDLE/Supervisor/templates/scripts/run_workflows.sh \ No newline at end of file +$CANDLE/Supervisor/templates/scripts/run_workflows.sh diff --git a/archives/templates/scripts/wrapper_connector.py b/archives/templates/scripts/wrapper_connector.py index e1e3ca92..eb44d81e 100644 --- a/archives/templates/scripts/wrapper_connector.py +++ b/archives/templates/scripts/wrapper_connector.py @@ -1,22 +1,36 @@ # If it's defined in the environment, append $SUPP_PYTHONPATH to the Python path -import os, json -supp_pythonpath = os.getenv('SUPP_PYTHONPATH') +import json +import os + +supp_pythonpath = os.getenv("SUPP_PYTHONPATH") if supp_pythonpath is not None: import sys + sys.path.append(supp_pythonpath) # Load the hyperparameter dictionary stored in the JSON file params.json def load_params(params_json_file): with open(params_json_file) as infile: - return(json.load(infile)) + return json.load(infile) + # Write the history.history dictionary to a JSON file def write_history(history, val_to_return_json_file): - with open(val_to_return_json_file, 'w') as outfile: + with open(val_to_return_json_file, "w") as outfile: json.dump(history.history, outfile) + # Make a history.history dictionary from a return value and write it to a JSON file -def write_history_from_value(val_to_return, val_to_return_json_file): # val_to_return_json_file should be val_to_return.json to match the value in candle_compliant_wrapper.py - with open(val_to_return_json_file, 'w') as outfile: - json.dump({'val_loss': [val_to_return], 'val_corr': [val_to_return], 'val_dice_coef': [val_to_return]}, outfile) \ No newline at end of file +def write_history_from_value( + val_to_return, val_to_return_json_file +): # val_to_return_json_file should be val_to_return.json to match the value in candle_compliant_wrapper.py + with open(val_to_return_json_file, "w") as outfile: + json.dump( + { + "val_loss": [val_to_return], + "val_corr": [val_to_return], + "val_dice_coef": [val_to_return], + }, + outfile, + ) diff --git a/archives/templates/workflow_settings/mlrmbo1.sh b/archives/templates/workflow_settings/mlrmbo1.sh index fba6629f..851a4f29 100644 --- a/archives/templates/workflow_settings/mlrmbo1.sh +++ b/archives/templates/workflow_settings/mlrmbo1.sh @@ -5,4 +5,4 @@ MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRENT_EVALUATIONS:-1} MAX_ITERATIONS=${MAX_ITERATIONS:-3} MAX_BUDGET=${MAX_BUDGET:-180} DESIGN_SIZE=${DESIGN_SIZE:-9} -PARAM_SET_FILE=${PARAM_SET_FILE:-$CANDLE/Supervisor/workflows/$WORKFLOW_TYPE/data/nt3_hps_exp_01.R} \ No newline at end of file +PARAM_SET_FILE=${PARAM_SET_FILE:-$CANDLE/Supervisor/workflows/$WORKFLOW_TYPE/data/nt3_hps_exp_01.R} diff --git a/archives/templates/workflow_settings/upf-default.txt b/archives/templates/workflow_settings/upf-default.txt index 2e78b091..c70294d7 100644 --- a/archives/templates/workflow_settings/upf-default.txt +++ b/archives/templates/workflow_settings/upf-default.txt @@ -1 +1 @@ -{"id": "default_run"} \ No newline at end of file +{"id": "default_run"} diff --git a/archives/templates/workflow_settings/upf1.txt b/archives/templates/workflow_settings/upf1.txt index 3607213d..cabc1dc0 100644 --- a/archives/templates/workflow_settings/upf1.txt +++ b/archives/templates/workflow_settings/upf1.txt @@ -1 +1 @@ -{"id": "mytest", "batch_size": 2048, "learning_rate": 0.00001} \ No newline at end of file +{"id": "mytest", "batch_size": 2048, "learning_rate": 
0.00001} diff --git a/archives/templates/workflow_settings/upf3.txt b/archives/templates/workflow_settings/upf3.txt index aafae5db..76b9a800 100644 --- a/archives/templates/workflow_settings/upf3.txt +++ b/archives/templates/workflow_settings/upf3.txt @@ -4,4 +4,4 @@ {"id": "hpset_04", "epochs": 30, "activation": "relu"} {"id": "hpset_05", "epochs": 10, "batch_size": 128} {"id": "hpset_06", "epochs": 10, "batch_size": 256} -{"id": "hpset_07", "epochs": 10, "batch_size": 512} \ No newline at end of file +{"id": "hpset_07", "epochs": 10, "batch_size": 512} diff --git a/archives/workflows/auen41_ff/auen41_ff.py b/archives/workflows/auen41_ff/auen41_ff.py index 0236dbe5..b757ad36 100644 --- a/archives/workflows/auen41_ff/auen41_ff.py +++ b/archives/workflows/auen41_ff/auen41_ff.py @@ -1,27 +1,27 @@ -import pandas as pd -import numpy as np - -from keras.layers import Input, Dense, Dropout -from keras.models import Model - -import time import json +import time import matplotlib as mpl +import numpy as np +import pandas as pd +from keras.layers import Dense, Dropout, Input +from keras.models import Model + mpl.use('Agg') import matplotlib.pyplot as plt EPOCH = 10 BATCH = 50 -P = 60025 # 245 x 245 -N1 = 2000 -NE = 600 # encoded dim +P = 60025 # 245 x 245 +N1 = 2000 +NE = 600 # encoded dim F_MAX = 33.3 -DR = 0.2 +DR = 0.2 class AutoEncoder(): + def __init__(self, trainFileName, testFileName, metaDataDict): self.train = None self.test = None @@ -65,7 +65,8 @@ def createEncoder(self): end = time.time() encoded_input = Input(shape=(NE,)) self.encoder = Model(input_vector, encoded) - self.decoder = Model(encoded_input, + self.decoder = Model( + encoded_input, self.ae.layers[-1](self.ae.layers[-2](encoded_input))) self.ae.compile(optimizer='rmsprop', loss='mean_squared_error') self.initTime = end - start @@ -74,8 +75,11 @@ def createEncoder(self): def trainEncoder(self): start = time.time() - self.ae.fit(self.x_train, self.x_train, batch_size=BATCH, - nb_epoch=EPOCH, validation_data=[self.x_test, self.x_test]) + self.ae.fit(self.x_train, + self.x_train, + batch_size=BATCH, + nb_epoch=EPOCH, + validation_data=[self.x_test, self.x_test]) end = time.time() self.trainTime = end - start @@ -106,17 +110,19 @@ def plotResults(self): plt.title("Histogram of Errors with 'auto' bins") plt.savefig('histogram.png') + def saveJsonResult(jsonResult, jsonFilename): f = open(jsonFilename, 'w') f.write('[\n') for i, val in enumerate(jsonResult): - if i < len(jsonResult)-1: - f.write('\t'+val+',\n') + if i < len(jsonResult) - 1: + f.write('\t' + val + ',\n') else: - f.write('\t'+val+'\n') + f.write('\t' + val + '\n') f.write(']\n') f.close() + def go(dir): # runs = 5 jsonResult = [] @@ -127,14 +133,14 @@ def go(dir): metaDataDict['benchmark-name'] = 'benchmark1' metaDataDict['type'] = 'autoencoder' # for i in range(runs): - autoencode = AutoEncoder(dir+'/breast.train.csv', - dir+'/breast.test.csv', - metaDataDict) + autoencode = AutoEncoder(dir + '/breast.train.csv', + dir + '/breast.test.csv', metaDataDict) jsonResult.append(autoencode.resultJson) print jsonResult saveJsonResult(jsonResult, 'jsonResults.json') return repr(jsonResult) # return "OK" + if __name__ == '__main__': go('.') diff --git a/archives/workflows/p1b1_hyperopt/Readme.md b/archives/workflows/p1b1_hyperopt/Readme.md index dc5b209c..357139e4 100644 --- a/archives/workflows/p1b1_hyperopt/Readme.md +++ b/archives/workflows/p1b1_hyperopt/Readme.md @@ -1,4 +1,4 @@ -# P1B1 hyperopt Workflow # +# P1B1 hyperopt Workflow The P1B1 hyperopt workflow evaluates a 
modified version of the P1B1 benchmark autoencoder using hyperparameters provided by a hyperopt instance. The P1B1 @@ -8,14 +8,14 @@ loss. Requirements: -* Python 2.7 -* P1B1 Autoencoder - git@github.com:ECP-CANDLE/Benchmarks.git. Clone and switch -to the supervisor branch. -* P1B1 Data - `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.train.csv` and `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.test.csv`. Download these into some suitable directory (e.g. `workflows/p1b1_hyperopt/data`) -* Hyperopt - http://hyperopt.github.io/hyperopt/ -* Keras - https://keras.io. The supervisor branch of P1B1 should work with -both version 1 and 2. -* Swift-t with Python 2.7 enabled - http://swift-lang.org/Swift-T/ +- Python 2.7 +- P1B1 Autoencoder - git@github.com:ECP-CANDLE/Benchmarks.git. Clone and switch + to the supervisor branch. +- P1B1 Data - `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.train.csv` and `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.test.csv`. Download these into some suitable directory (e.g. `workflows/p1b1_hyperopt/data`) +- Hyperopt - http://hyperopt.github.io/hyperopt/ +- Keras - https://keras.io. The supervisor branch of P1B1 should work with + both version 1 and 2. +- Swift-t with Python 2.7 enabled - http://swift-lang.org/Swift-T/ This workflow also uses code included in this repository: the EMEWS EQ/Py extension (`workflows/p1b1_hyperopt/ext/EQ-Py`) and the eqpy hyperopt bridge code @@ -31,26 +31,25 @@ p1b1_hyperopt/ swift/ ``` - * `data` - model input etc. data, such as the hyperopt space description. - * `etc` - additional code used by EMEWS - * `ext/EQ-Py` - swift-t EQ\Py extension - * `swift/workflow.swift` - the swift workflow script - * `swift/workflow.sh` - generic launch script to set the appropriate enviroment variables etc. and then launch the swift workflow script - * `swift/cori_settings.sh` - settings specific to the Cori supercomputer - * `swift/cori_workflow.sh` - launch script customized for the Cori supercomputer - * `swift/cooley_workflow.sh` - launch script customized for the Cooley supercomputer +- `data` - model input etc. data, such as the hyperopt space description. +- `etc` - additional code used by EMEWS +- `ext/EQ-Py` - swift-t EQ\Py extension +- `swift/workflow.swift` - the swift workflow script +- `swift/workflow.sh` - generic launch script to set the appropriate enviroment variables etc. and then launch the swift workflow script +- `swift/cori_settings.sh` - settings specific to the Cori supercomputer +- `swift/cori_workflow.sh` - launch script customized for the Cori supercomputer +- `swift/cooley_workflow.sh` - launch script customized for the Cooley supercomputer +## Running the Workflow - ## Running the Workflow ## +The launch scripts in the `swift` directory can be used to run the workflow. +Copy the `workflow.sh` and edit it as appropriate. The swift script takes +4 arguments, each of which is set in the launch script. - The launch scripts in the `swift` directory can be used to run the workflow. - Copy the `workflow.sh` and edit it as appropriate. The swift script takes - 4 arguments, each of which is set in the launch script. - - * EVALUATIONS - the total number of runs to perform - * PARAM_BATCH_SIZE - the number of hyperparameter sets to evaluate in parallel. Hyperopt will produce this many sets of hyperparameters each iteration until EVALUATIONS has been reached. - * SPACE_FILE - the path of the file that defines hyperopt's hyperparameter space (e.g. 
EMEWS_PROJECT_ROOT/data/space_description.txt) - * DATA_DIRECTORY - the directory containing the test and training data. The files themselves are assumed to be named `P1B1.train.csv` and `P1B1.test.csv` +- EVALUATIONS - the total number of runs to perform +- PARAM_BATCH_SIZE - the number of hyperparameter sets to evaluate in parallel. Hyperopt will produce this many sets of hyperparameters each iteration until EVALUATIONS has been reached. +- SPACE_FILE - the path of the file that defines hyperopt's hyperparameter space (e.g. EMEWS_PROJECT_ROOT/data/space_description.txt) +- DATA_DIRECTORY - the directory containing the test and training data. The files themselves are assumed to be named `P1B1.train.csv` and `P1B1.test.csv` The launch script also sets PYTHONPATH to include the swift-t EQ-Py extension, the eqpy hyperopt bridge, and the location of the P1B1 python code. Only the @@ -63,35 +62,34 @@ directory where X is the experiment id. A copy of the launch script that was used to launch the workflow will also be written to this directory. -### Running on Cori ### +### Running on Cori 0. You can debug on the login node with `nice swift/workflow.sh ID` 1. The Cori workflow uses Cori's existing deeplearing environment. This includes -Keras, but NOT hyperopt. To install hyperopt, if you haven't already: + Keras, but NOT hyperopt. To install hyperopt, if you haven't already: + +``` +module load deeplearning +pip install --user hyperopt +``` - ``` - module load deeplearning - pip install --user hyperopt - ``` 2. Source the `swift/cori_settings.sh` file to load the required modules etc: - ```source cori_settings``` + `source cori_settings` 3. In the swift directory, run the `cori_workflow.sh` launch script with an -experiment id. For example, - - ```./cori_workflow.sh T1``` - + experiment id. For example, +`./cori_workflow.sh T1` -### Running on Cooley ### +### Running on Cooley Cooley uses this python: `/soft/analytics/conda/env/Candle_ML/lib/python2.7/` with hyperopt, keras etc. already installed. 0. You can debug on the login node with `nice swift/workflow.sh ID` 1. Add this Swift/T to your PATH: `~wozniak/Public/sfw/x86_64/login/swift-t-conda/stc/bin` 2. In the swift directory, run the `cooley_workflow.sh` launch scrip with an -experiment id. For example, + experiment id. 
For example, - ```./cooley_workflow.sh T1``` +`./cooley_workflow.sh T1` diff --git a/archives/workflows/p1b1_hyperopt/data/.gitignore b/archives/workflows/p1b1_hyperopt/data/.gitignore index 6d363580..b244c341 100644 --- a/archives/workflows/p1b1_hyperopt/data/.gitignore +++ b/archives/workflows/p1b1_hyperopt/data/.gitignore @@ -1,2 +1,2 @@ P1B1.test.csv -P1B1.train.csv \ No newline at end of file +P1B1.train.csv diff --git a/archives/workflows/p1b1_hyperopt/ext/EQ-Py/EQPy.swift b/archives/workflows/p1b1_hyperopt/ext/EQ-Py/EQPy.swift index b9dcf3e1..ac2c13f5 100644 --- a/archives/workflows/p1b1_hyperopt/ext/EQ-Py/EQPy.swift +++ b/archives/workflows/p1b1_hyperopt/ext/EQ-Py/EQPy.swift @@ -11,10 +11,10 @@ pragma worktypedef resident_work; string init_package_string = """ -import eqpy -import %s -import threading -p = threading.Thread(target=%s.run) +import eqpy +import %s +import threading +p = threading.Thread(target=%s.run) p.start() """; diff --git a/archives/workflows/p1b1_hyperopt/ext/EQ-Py/eqpy.py b/archives/workflows/p1b1_hyperopt/ext/EQ-Py/eqpy.py index 1c739bb7..582c13d2 100644 --- a/archives/workflows/p1b1_hyperopt/ext/EQ-Py/eqpy.py +++ b/archives/workflows/p1b1_hyperopt/ext/EQ-Py/eqpy.py @@ -1,5 +1,5 @@ -import threading import sys +import threading try: from queue import Queue @@ -10,13 +10,14 @@ input_q = Queue() output_q = Queue() + def OUT_put(string_params): output_q.put(string_params) + def IN_get(): global input_q # print("IN_get() ...") result = input_q.get() # print("IN_get(): " + result) return result - diff --git a/archives/workflows/p1b1_hyperopt/swift/cori_settings.sh b/archives/workflows/p1b1_hyperopt/swift/cori_settings.sh index af95237f..0f8d8c85 100644 --- a/archives/workflows/p1b1_hyperopt/swift/cori_settings.sh +++ b/archives/workflows/p1b1_hyperopt/swift/cori_settings.sh @@ -4,5 +4,3 @@ module swap PrgEnv-intel PrgEnv-gnu export PATH=/global/homes/w/wozniak/Public/sfw/compute/swift-t/stc/bin:$PATH #export PATH=/global/homes/w/wozniak/Public/sfw/login/swift-t/stc/bin:$PATH - - diff --git a/archives/workflows/simple_hyperopt_example/Readme.md b/archives/workflows/simple_hyperopt_example/Readme.md index bdc7d60f..76845027 100644 --- a/archives/workflows/simple_hyperopt_example/Readme.md +++ b/archives/workflows/simple_hyperopt_example/Readme.md @@ -1,14 +1,14 @@ -# Simple Example of EMEWS Integration with hyperopt # +# Simple Example of EMEWS Integration with hyperopt This directory contains a simple example of integrating hyperopt with EMEWS. Requirements: -* Python 2.7 or 3 -* hyperopt : (http://hyperopt.github.io/hyperopt/). Install with -`pip install hyperopt` -* Swift/T with python extension +- Python 2.7 or 3 +- hyperopt : (http://hyperopt.github.io/hyperopt/). Install with + `pip install hyperopt` +- Swift/T with python extension Run the example with `swift/simple_workflow.sh`. That should properly set the PYTHONPATH, but it does assume that swift-t is in your PATH already. @@ -42,7 +42,8 @@ returned back to hyperopt via the eqpy_hyperopt package. The swift workflow in `swift/swift_run_eqpy.swift` performs the following steps: 1. Initialize the eqpy_hyperopt python with the hyperopt algorithm parameters. -These are formated as a string representation of a python dictionary. + These are formated as a string representation of a python dictionary. + ``` {'space' : %s, 'algo' : %s, @@ -50,22 +51,25 @@ These are formated as a string representation of a python dictionary. 
'param_batch_size' : %d, 'seed' : %d} ``` + These are explained in the Readme for eqpy_hyperopt in this repository. 2. Request a list of parameter sets from hyperopt. The list is a ";" separated -string of python dictionaries. For example, + string of python dictionaries. For example, + ``` {'x': [-1.5477895914281512]};{'x': [1.23432434]};{'x': [0.32343]} ``` + If there were more parameters in addition to 'x', those would appear in the dictionary as well. 3. Split the list of parameters into an array and execute the model on -each element in that array in parallel. As explained above, executing the model consists -of pasting in the parameters in the python 'model' code and executing that -with a swift python call. + each element in that array in parallel. As explained above, executing the model consists + of pasting in the parameters in the python 'model' code and executing that + with a swift python call. 4. Repeat 2 and 3 until the maximum number of evaluations has been reached -(`max_evals`). + (`max_evals`). 5. Print and write out the best parameter set found by hyperopt. diff --git a/archives/workflows/simple_hyperopt_example/ext/EQ-Py/eqpy.py b/archives/workflows/simple_hyperopt_example/ext/EQ-Py/eqpy.py index a544f020..99a8debc 100644 --- a/archives/workflows/simple_hyperopt_example/ext/EQ-Py/eqpy.py +++ b/archives/workflows/simple_hyperopt_example/ext/EQ-Py/eqpy.py @@ -1,5 +1,5 @@ -import threading import sys +import threading try: from queue import Queue @@ -10,11 +10,12 @@ input_q = Queue() output_q = Queue() + def OUT_put(string_params): output_q.put(string_params) + def IN_get(): global input_q result = input_q.get() return result - diff --git a/archives/workflows/simple_hyperopt_example/swift/cori_settings.sh b/archives/workflows/simple_hyperopt_example/swift/cori_settings.sh index af95237f..0f8d8c85 100644 --- a/archives/workflows/simple_hyperopt_example/swift/cori_settings.sh +++ b/archives/workflows/simple_hyperopt_example/swift/cori_settings.sh @@ -4,5 +4,3 @@ module swap PrgEnv-intel PrgEnv-gnu export PATH=/global/homes/w/wozniak/Public/sfw/compute/swift-t/stc/bin:$PATH #export PATH=/global/homes/w/wozniak/Public/sfw/login/swift-t/stc/bin:$PATH - - diff --git a/archives/workflows/simple_hyperopt_example/swift/simple_workflow.sh b/archives/workflows/simple_hyperopt_example/swift/simple_workflow.sh index ea76b7a7..58ede262 100755 --- a/archives/workflows/simple_hyperopt_example/swift/simple_workflow.sh +++ b/archives/workflows/simple_hyperopt_example/swift/simple_workflow.sh @@ -108,4 +108,3 @@ swift-t -O0 -l -n $PROCS $MACHINE -p -I $EQPY -r $EQPY \ -e PATH=$PATH \ -e PYTHONPATH=$PYTHONPATH \ $EMEWS_PROJECT_ROOT/swift/$SWIFT_FILE $CMD_LINE_ARGS - diff --git a/archives/workflows/simple_mlrMBO_example/R/mlrMBO_utils.R b/archives/workflows/simple_mlrMBO_example/R/mlrMBO_utils.R index 5a405c6c..c88c74f6 100644 --- a/archives/workflows/simple_mlrMBO_example/R/mlrMBO_utils.R +++ b/archives/workflows/simple_mlrMBO_example/R/mlrMBO_utils.R @@ -30,4 +30,4 @@ append_extras_if_exist <- function(res_element,x){ result_with_extras_if_exist <- function(res,time_value){ lapply(res, function(x) append_extras_if_exist(c(list(y=x[1]), list(time=time_value)),x)) -} \ No newline at end of file +} diff --git a/archives/workflows/simple_mlrMBO_example/R/test/mlrMBO_utils_tests.R b/archives/workflows/simple_mlrMBO_example/R/test/mlrMBO_utils_tests.R index 7db8cb2b..a2ac7c9f 100644 --- a/archives/workflows/simple_mlrMBO_example/R/test/mlrMBO_utils_tests.R +++ 
b/archives/workflows/simple_mlrMBO_example/R/test/mlrMBO_utils_tests.R @@ -3,7 +3,7 @@ require(testthat) test_that("list_to_string works",{ l = list(x1 = -4.5, x2 = 6.3) - expected_string = "-4.5, 6.3" + expected_string = "-4.5, 6.3" result_string = list_to_string(l) # print(result_string) expect_equal(expected_string,result_string) @@ -13,7 +13,7 @@ test_that("elements_of_lists_to_string works",{ l1 = list(x1 = -4.5, x2 = 6.3) l2 = list(x1 = 7.6, x2 = 0.3) l3 = list(l1,l2) - expected_string = "-4.5, 6.3;7.6, 0.3" + expected_string = "-4.5, 6.3;7.6, 0.3" result_string = elements_of_lists_to_string(l3) # print(result_string) expect_equal(expected_string,result_string) @@ -21,16 +21,16 @@ test_that("elements_of_lists_to_string works",{ test_that("append_extras_if_exist works",{ x = c(1,2,3) - res_element = list(y = 1, time = 2.3) + res_element = list(y = 1, time = 2.3) new_res_element = append_extras_if_exist(res_element,x) - expected_res_element = list(y = 1, time = 2.3, user.extras = list(2,3)) + expected_res_element = list(y = 1, time = 2.3, user.extras = list(2,3)) # print(new_res_element) expect_equal(expected_res_element,new_res_element, info = "length(x) > 1") - + x = c(3) - res_element = list(y = 3, time = 2.3) + res_element = list(y = 3, time = 2.3) new_res_element = append_extras_if_exist(res_element,x) - expected_res_element = list(y = 3, time = 2.3) + expected_res_element = list(y = 3, time = 2.3) # print(new_res_element) expect_equal(expected_res_element,new_res_element, info = "length(x) == 1") }) @@ -40,24 +40,23 @@ test_that("result_with_extras_if_exist works",{ new_res = result_with_extras_if_exist(list_of_vectors,4.5) expected_res = list(list(y = 1, time = 4.5, user.extras = list(2,3)), list(y = 4, time = 4.5, user.extras = list(5,6)), - list(y = 7, time = 4.5, user.extras = list(8,9))) + list(y = 7, time = 4.5, user.extras = list(8,9))) # print(new_res_element) expect_equal(expected_res,new_res, info = "length(x) > 1, uniform") - + list_of_vectors = list(c(1,2,3),c(4,6),c(7)) new_res = result_with_extras_if_exist(list_of_vectors,4.5) expected_res = list(list(y = 1, time = 4.5, user.extras = list(2,3)), list(y = 4, time = 4.5, user.extras = list(6)), - list(y = 7, time = 4.5)) + list(y = 7, time = 4.5)) # print(new_res_element) expect_equal(expected_res,new_res, info = "length(x) mixed") - + list_of_vectors = list(c(1),c(4),c(7)) new_res = result_with_extras_if_exist(list_of_vectors,4.5) expected_res = list(list(y = 1, time = 4.5), list(y = 4, time = 4.5), - list(y = 7, time = 4.5)) + list(y = 7, time = 4.5)) # print(new_res_element) - expect_equal(expected_res,new_res, info = "length(x) == 1") + expect_equal(expected_res,new_res, info = "length(x) == 1") }) - diff --git a/archives/workflows/simple_mlrMBO_example/R/test/simple_mlrMBO_run_test.R b/archives/workflows/simple_mlrMBO_example/R/test/simple_mlrMBO_run_test.R index 109264d2..962b9b46 100644 --- a/archives/workflows/simple_mlrMBO_example/R/test/simple_mlrMBO_run_test.R +++ b/archives/workflows/simple_mlrMBO_example/R/test/simple_mlrMBO_run_test.R @@ -29,4 +29,4 @@ IN_get <- function(){ ## Assumes working directory is ../ source("simple_mlrMBO.R") -## Look at result with: readRDS("final_res.Rds") \ No newline at end of file +## Look at result with: readRDS("final_res.Rds") diff --git a/archives/workflows/simple_mlrMBO_example/R/test/test_utils.R b/archives/workflows/simple_mlrMBO_example/R/test/test_utils.R index adbeedbc..8fb4a7ef 100644 --- a/archives/workflows/simple_mlrMBO_example/R/test/test_utils.R +++ 
b/archives/workflows/simple_mlrMBO_example/R/test/test_utils.R @@ -1,4 +1,4 @@ -# Split the string pushed into OUT_put into +# Split the string pushed into OUT_put into # list of numerical vectors (used in simple_mlrMBO_run_test.R) split.into.param.lines <- function(x){ res1 <- unlist(strsplit(x,split = ";")) @@ -7,4 +7,4 @@ split.into.param.lines <- function(x){ make.into.q.res <- function(x){ paste0(x,collapse = ";") -} \ No newline at end of file +} diff --git a/archives/workflows/simple_mlrMBO_example/README.md b/archives/workflows/simple_mlrMBO_example/README.md index 04c20954..0a3c0afa 100644 --- a/archives/workflows/simple_mlrMBO_example/README.md +++ b/archives/workflows/simple_mlrMBO_example/README.md @@ -1,33 +1,36 @@ -# Simple Example of EMEWS Integration with mlrMBO # +# Simple Example of EMEWS Integration with mlrMBO This directory contains a simple example of integrating mlrMBO with EMEWS. Requirements: -* R 3.2+ -* All required R packages can be installed with -`install.packages("")` - * mlrMBO and dependencies : (https://mlr-org.github.io/mlrMBO/). - * parallelMap : (https://cran.r-project.org/web/packages/parallelMap/index.html) - * DiceKriging and dependencies : (https://cran.r-project.org/web/packages/DiceKriging/index.html) - * rgenoud : (https://cran.r-project.org/web/packages/rgenoud/index.html) - * testthat (for testing) : (https://cran.r-project.org/web/packages/testthat/index.html) -* Swift/T with R extension -* Compiled EQ/R, instructions in `ext/EQ-R/eqr/COMPILING.txt` +- R 3.2+ +- All required R packages can be installed with + `install.packages("")` + - mlrMBO and dependencies : (https://mlr-org.github.io/mlrMBO/). + - parallelMap : (https://cran.r-project.org/web/packages/parallelMap/index.html) + - DiceKriging and dependencies : (https://cran.r-project.org/web/packages/DiceKriging/index.html) + - rgenoud : (https://cran.r-project.org/web/packages/rgenoud/index.html) + - testthat (for testing) : (https://cran.r-project.org/web/packages/testthat/index.html) +- Swift/T with R extension +- Compiled EQ/R, instructions in `ext/EQ-R/eqr/COMPILING.txt` Run the example with `./swift_run_eqr.sh `. That assume that swift-t is in your PATH already. ## Workflow details + The workflow attempts to minimize the example function `sum(x^2)` for a two dimensional space `(x1,x2)` defined by the variables: + ```R "x1": lower = -5, upper = 5 "x2": lower = -10, upper = 20 ``` and using existing capabilities from mlrMBO: -* **expected improvement** for the infill criterion -* **constant liar** for multi-point proposals + +- **expected improvement** for the infill criterion +- **constant liar** for multi-point proposals The example uses **multi-point proposals** for concurrency in the iterative steps, defined via a `pp=` argument within the `swift/swift_run_eqr.sh` script. Maximum algorithm iteration is defined via a `it=` argument, also within the `swift/swift_run_eqr.sh` script. @@ -36,15 +39,16 @@ The mlrMBO algorithm is defined in `R/simple_mlrMBO.R` and it controls the overa As indicated above, the workflow is run with `./swift_run_eqr.sh `. When the workflow completes, the results from running `mbo` are saved to the experiment directory in `experiments/experiment_ID/final_res.Rds` and can be loaded within an R session using `readRDS("/final_res.Rds")`. ## Testing the R components + The `R/test` directory contains tests for the R components in the workflow and for running the mlrMBO algorithm without Swift/T. 
-* `mlrMBO_utils_tests.R`: unit tests for `R/mlrMBO_utils.R`, which provides R components to the workflow (run using the testthat library's `test_file("/mlrMBO_utils_tests.R")` function) -* `simple_mlrMBO_run_test.R`: script that provides R implementations for the EQ/R `OUT_put` and `IN_get` calls to be able to run `R/simple_mlrMBO.R` at smaller scales for testing without Swift/T (run from the `R` directory via `source("test/simple_mlrMBO_run_test.R")`) -* `test_utils_tests.R`: tests for functions in `R/test/test_utils.R` which are used to make `simple_mlrMBO_run_test.R` work (run using `test_file("/test_utils_tests.R")`) -*(Below is the information that was generated when the simple_mlrMBO_example EMEWS project was created.)* +- `mlrMBO_utils_tests.R`: unit tests for `R/mlrMBO_utils.R`, which provides R components to the workflow (run using the testthat library's `test_file("/mlrMBO_utils_tests.R")` function) +- `simple_mlrMBO_run_test.R`: script that provides R implementations for the EQ/R `OUT_put` and `IN_get` calls to be able to run `R/simple_mlrMBO.R` at smaller scales for testing without Swift/T (run from the `R` directory via `source("test/simple_mlrMBO_run_test.R")`) +- `test_utils_tests.R`: tests for functions in `R/test/test_utils.R` which are used to make `simple_mlrMBO_run_test.R` work (run using `test_file("/test_utils_tests.R")`) + +_(Below is the information that was generated when the simple_mlrMBO_example EMEWS project was created.)_ -EMEWS project template ------------------------ +## EMEWS project template You have just created an EMEWS project. The project consists of the following directories: @@ -62,18 +66,19 @@ simple_mlrMBO_example/ swift/ README.md ``` + The directories are intended to contain the following: - * `data` - model input etc. data - * `etc` - additional code used by EMEWS - * `ext` - swift-t extensions such as eqpy, eqr - * `python` - python code (e.g. model exploration algorithms written in python) - * `python/test` - tests of the python code - * `R` - R code (e.g. model exploration algorithms written R) - * `R/test` - tests of the R code - * `scripts` - any necessary scripts (e.g. scripts to launch a model), excluding - scripts used to run the workflow. - * `swift` - swift code +- `data` - model input etc. data +- `etc` - additional code used by EMEWS +- `ext` - swift-t extensions such as eqpy, eqr +- `python` - python code (e.g. model exploration algorithms written in python) +- `python/test` - tests of the python code +- `R` - R code (e.g. model exploration algorithms written R) +- `R/test` - tests of the R code +- `scripts` - any necessary scripts (e.g. scripts to launch a model), excluding + scripts used to run the workflow. +- `swift` - swift code Use the subtemplates to customize this structure for particular types of workflows. These are: sweep, eqpy, and eqr. 
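For readers of the simple_mlrMBO_example README above, here is a minimal standalone sketch of the mlrMBO setup it describes — the two-variable `sum(x^2)` objective, expected-improvement infill, and constant-liar multi-point proposals. This is not the workflow's own `R/simple_mlrMBO.R` (which talks to Swift/T through the EQ/R `OUT_put`/`IN_get` queues); the propose-point and iteration counts below stand in for the `pp=` and `it=` launch-script arguments and are illustrative assumptions.

```R
# Standalone approximation of the simple_mlrMBO_example settings.
# Requires: mlrMBO, smoof, ParamHelpers, DiceKriging, rgenoud (see README).
library(mlrMBO)

# Objective from the README: minimize sum(x^2) over the (x1, x2) box.
obj.fun <- makeSingleObjectiveFunction(
  name = "sum_of_squares",
  fn = function(x) sum(x^2),
  par.set = makeParamSet(
    makeNumericParam("x1", lower = -5,  upper = 5),
    makeNumericParam("x2", lower = -10, upper = 20)
  ),
  minimize = TRUE
)

# Expected improvement infill with constant-liar multi-point proposals;
# propose.points and iters are placeholders for the pp= and it= arguments.
ctrl <- makeMBOControl(propose.points = 4)
ctrl <- setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI())
ctrl <- setMBOControlMultiPoint(ctrl, method = "cl", cl.lie = min)
ctrl <- setMBOControlTermination(ctrl, iters = 3)

res <- mbo(obj.fun, control = ctrl, show.info = TRUE)
res$x  # best (x1, x2) found
res$y  # best objective value
```

Because every evaluation here runs in-process, this sketch is only useful for checking the algorithm settings; in the actual workflow the evaluations are farmed out concurrently by Swift/T.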
diff --git a/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/BlockingQueue.h b/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/BlockingQueue.h index c9dfd41c..a9f983da 100644 --- a/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/BlockingQueue.h +++ b/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/BlockingQueue.h @@ -24,7 +24,7 @@ class BlockingQueue { } this->d_condition.notify_one(); } - + T pop() { std::unique_lock lock(this->d_mutex); // [ capture-list ] ( params ) { body } diff --git a/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/settings.mk b/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/settings.mk index c8dacae6..7906db3c 100644 --- a/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/settings.mk +++ b/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/settings.mk @@ -1,6 +1,6 @@ -CXXFLAGS = -g -O0 -fPIC -std=c++0x -I/usr/local/include -I/Library/Frameworks/R.framework/Versions/3.3/Resources/include -I/Users/jozik/Library/R/3.3/library/Rcpp/include -I/Users/jozik/Library/R/3.3/library/RInside/include -CPPFLAGS = -I/usr/local/include -I/Library/Frameworks/R.framework/Versions/3.3/Resources/include -I/Users/jozik/Library/R/3.3/library/Rcpp/include -I/Users/jozik/Library/R/3.3/library/RInside/include +CXXFLAGS = -g -O0 -fPIC -std=c++0x -I/usr/local/include -I/Library/Frameworks/R.framework/Versions/3.3/Resources/include -I/Users/jozik/Library/R/3.3/library/Rcpp/include -I/Users/jozik/Library/R/3.3/library/RInside/include +CPPFLAGS = -I/usr/local/include -I/Library/Frameworks/R.framework/Versions/3.3/Resources/include -I/Users/jozik/Library/R/3.3/library/Rcpp/include -I/Users/jozik/Library/R/3.3/library/RInside/include LDFLAGS = -L/Users/jozik/Library/R/3.3/library/RInside/lib -lRInside -L/Library/Frameworks/R.framework/Versions/3.3/Resources/lib -lR -L/usr/local/lib -ltcl8.6 -Wl,-rpath -Wl,/usr/local/lib -Wl,-rpath -Wl,/Library/Frameworks/R.framework/Versions/3.3/Resources/lib -Wl,-rpath -Wl,/Users/jozik/Library/R/3.3/library/RInside/lib TCL_VERSION = 8.6 diff --git a/archives/workflows/simple_uq/python/permute.py b/archives/workflows/simple_uq/python/permute.py index 48fa530f..f47507e8 100644 --- a/archives/workflows/simple_uq/python/permute.py +++ b/archives/workflows/simple_uq/python/permute.py @@ -1,22 +1,25 @@ - from random import randint + class State: seed = None size = None training = None + state = State() + def configure(seed, size, training): global state state.seed = seed state.size = size state.training = training print("permute: configure(seed=%i, size=%i, training=%i)" % - (seed, size, training)) + (seed, size, training)) return "OK" + def get(): global state result = [] @@ -27,25 +30,27 @@ def get(): n = state.training for i in range(0, state.training): # print(pool) - i = randint(0,n+1) + i = randint(0, n + 1) v = pool[i] result.append(v) del pool[i] - n = n-1 + n = n - 1 return result + def validation(size, training): - """ Obtain the validation set corresponding to the given training set """ + """Obtain the validation set corresponding to the given training set.""" result = [] for i in range(0, size): if i not in training: result.append(i) return result + def get_tv(): - """ Get training and validation """ + """Get training and validation.""" global state t = get() v = validation(state.size, t) # return str([t, v]) - return t,v + return t, v diff --git a/archives/workflows/simple_uq/python/test-permute.py b/archives/workflows/simple_uq/python/test-permute.py index c36d7d5d..8413de3a 100644 --- 
a/archives/workflows/simple_uq/python/test-permute.py +++ b/archives/workflows/simple_uq/python/test-permute.py @@ -1,11 +1,10 @@ - import permute size = 10 validation = 2 -permute.configure(seed=10101, size=size, training=size-validation) +permute.configure(seed=10101, size=size, training=size - validation) -for i in range(0,9): - training = permute.get() - validation = permute.validation(size, training) - print str(training) + " " + str(validation) +for i in range(0, 9): + training = permute.get() + validation = permute.validation(size, training) + print str(training) + " " + str(validation) diff --git a/archives/workflows/simple_uq/swift/junk.py b/archives/workflows/simple_uq/swift/junk.py index f7fc57d0..5287911d 100644 --- a/archives/workflows/simple_uq/swift/junk.py +++ b/archives/workflows/simple_uq/swift/junk.py @@ -1,3 +1,2 @@ - inputs = eval(permutation_sets) training, validation = inputs diff --git a/archives/workflows/simple_uq/swift/obj_func.py b/archives/workflows/simple_uq/swift/obj_func.py index bbbc3bcd..49a355f1 100644 --- a/archives/workflows/simple_uq/swift/obj_func.py +++ b/archives/workflows/simple_uq/swift/obj_func.py @@ -1,4 +1,3 @@ - # OBJ FUNC PY import os @@ -18,22 +17,24 @@ size = 10 validation = 2 -permute.configure(seed=int(index)+10101, size=size, training=size-validation) +permute.configure(seed=int(index) + 10101, + size=size, + training=size - validation) training, validation = permute.get_tv() log = directory + "/" + "run.log" with open(log, "w") as fp: - fp.write("training: " + str(training) + "\n") + fp.write("training: " + str(training) + "\n") fp.write("validation: " + str(validation) + "\n\n") # Funny function result = float(0.0) -multiplier = float(10*10*10) -for i in range(0,5): - result = result + training[i]*multiplier +multiplier = float(10 * 10 * 10) +for i in range(0, 5): + result = result + training[i] * multiplier multiplier /= 10 with open(output, "w") as fp: # fp.write("training: " + str(training) + "\n") - fp.write(str(result)+"\n") + fp.write(str(result) + "\n") diff --git a/docs/format.css b/docs/format.css index c1223e5d..596beb36 100644 --- a/docs/format.css +++ b/docs/format.css @@ -1,4 +1,3 @@ - /* Asciidoc customizations */ a:visited { diff --git a/docs/home.html b/docs/home.html index 868defef..0b551b14 100644 --- a/docs/home.html +++ b/docs/home.html @@ -1,850 +1,1115 @@ - + - - - -CANDLE Supervisor Home Page - - - - - -
This is the main home page about CANDLE Supervisor effort with links to workflows and other supporting information.

Workflows

The workflows are currently indexed in the README visible here.

Database integration

The database work is described in the README visible here.

Swift installations

Theta

This is linked to Python and R but currently without ML libs.

Other Theta ESP notes are here: https://collab.cels.anl.gov/display/ESP

Python

Installed in:

    /projects/Candle_ECP/swift/deps/Python-2.7.12

To run this installation, you must set:

    $ export LD_LIBRARY_PATH=/projects/Candle_ECP/swift/deps/Python-2.7.12/lib

Cori

This uses the system-installed Python with ML libs in module: tensorflow/intel-head

Titan

This is a CANDLE-only installation. It uses the OLCF-provided Python deeplearning module (Python 3.6 plus TensorFlow, Theano, and Keras) and R 3.3.2.

Add to PATH: /lustre/atlas2/csc249/proj-shared/sfw/swift-t/stc/bin

Run with:

    $ export TITAN=true
    $ export PROJECT=... QUEUE=...
    $ export LD_LIBRARY_PATH=/sw/xk6/deeplearning/1.0/sles11.3_gnu4.9.3/lib:/sw/xk6/deeplearning/1.0/sles11.3_gnu4.9.3/cuda/lib64:/opt/gcc/4.9.3/snos/lib64:/sw/xk6/r/3.3.2/sles11.3_gnu4.9.3x/lib64/R/lib
    $ swift-t -m cray -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH workflow.swift

On Titan, do module load autoconf to get Autoconf 2.69.

Cooley

This uses the system-installed Python with ML libs at: /soft/analytics/conda/env/Candle_ML

JLSE KNL

This does not yet have Python.

JLSE Prasanna

This uses a VirtualEnv Python at /home/pbalapra/.virtualenvs

Swift-T: ~wozniak/Public/sfw/icc/swift-t-pb/stc/bin
+ + diff --git a/docs/summit.txt b/docs/summit.txt index dd2ce52b..46285aad 100644 --- a/docs/summit.txt +++ b/docs/summit.txt @@ -17,7 +17,7 @@ $ make install-binaries install-libraries install-headers == Install Python -$ ./configure +$ ./configure --prefix=/gpfs/alpine/world-shared/med106/sw/gcc-7.4.0/Python-3.5.1 --enable-shared $ nice make -j diff --git a/docs/user_guide.adoc b/docs/user_guide.adoc index e197d27c..c54e3ae8 100644 --- a/docs/user_guide.adoc +++ b/docs/user_guide.adoc @@ -203,7 +203,7 @@ $ export WALLTIME=00:10:00 $ ./test/upf-1.sh theta -// or +// or $ QUEUE=debug-cache-quad PROJECT=myproject PROCS=3 WALLTIME=00:10:00 ./test/upf-1.sh theta ---- @@ -252,7 +252,7 @@ result: 2.10822688904 <1> `output.txt` contains stdout and stderr of this experiment. This is helpful to debug errors. <2> `run` directory contains the output files. You will see two directories that are corresponding the IDs configured in upf-1.txt <3> a copy of configuration files are available so that you can trace what were passed to this experiment. -<4> stdout of test0. After 10 epoches, validation loss was 2.1082. +<4> stdout of test0. After 10 epoches, validation loss was 2.1082. == Running mlrMBO based Hyperparameters Optimization (HPO) on Theta @@ -315,4 +315,3 @@ You can specify the HPO search strategy. As you can see in `test/cfg-prm-1.sh`, * `MAX_ITERATIONS` is a number of iterations. * `PROPOSE_POINTS` is a number of parameter sets that CANDLE will evaluate in each iteration. So, if `MAX_ITERATION=3` and `PROPOSE_POINTS=5`, CANDLE will be ended up evaluating 25 params (10 + 3 x 5). * `MAX_BUDGET` should be greater than total evaluations. In this example, 45. - diff --git a/docs/user_guide.html b/docs/user_guide.html index e7da7374..ee657c0f 100644 --- a/docs/user_guide.html +++ b/docs/user_guide.html @@ -1,1012 +1,1394 @@ - - - - - - -CANDLE Library User Guide - - - - - -
CANDLE Library User Guide

The CANDLE library provides a wrapper class and utility functions that enable users to run their own deep learning code on the high-performance computers that CANDLE supports. With the current version of the CANDLE library, users can run hyperparameter optimization (the mlrMBO workflow) or parallel execution (the upf workflow). Due to the design of both workflows, users are required to implement certain methods (explained in section 1) and modify several config files (section 2). This user guide provides an overview of the structure and explains parameters and variables as needed.

How to write CANDLE compliant deep learning code

Minimum requirements

CANDLE requires two methods: initialize_parameters() and run().

Initialize_parameters Method

In the initialize_parameters method, we construct a class and build the parameter set that will be used inside your deep learning code (the run method). CANDLE provides some common parameters such as batch_size, epochs, etc. In addition, you can construct your own parameters (see the Argument Specification section below). Finally, initialize_parameters should return a Python dictionary, called gParameters (global parameters) in this document.

Run Method

You can place your deep learning code in the run(Dict) method. You can use parameter values like gParameters['batch_size'].

We have an example that converts a simple MNIST neural net, mnist_mlp.py provided by the Keras Team, into CANDLE compliant form. In this example, you will see how the initialize_parameters method is implemented and how the actual NN code was transplanted into the run method.

Finally, run() returns the history object. This can be omitted for the upf workflow, but it is required for the HPO workflow.

1. The next section explains where the common.MNIST class comes from.
2. initialize_parameters returns the parameter dictionary.
3. The run method receives the parameter dictionary.
4. run returns the history object.
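Putting these pieces together, a minimal sketch of such a script looks like the following. This is a hypothetical skeleton, not the actual Benchmarks code: the hard-coded dictionary and the train_model stand-in replace the real Benchmark class and Keras calls shown in the linked example.

```python
# Hypothetical skeleton of a CANDLE compliant script (not the real mnist_mlp).

def initialize_parameters():
    # In the real example this builds a Benchmark object (common.MNIST) and
    # merges the default model file with command line arguments; here the
    # dictionary is hard-coded so the sketch is self-contained.
    gParameters = {"batch_size": 128, "epochs": 10, "activation": "relu"}
    return gParameters


def train_model(gParameters):
    # Stand-in for the actual Keras code (model.fit(...)), which would read
    # settings such as gParameters["batch_size"].
    return {"val_loss": [0.5, 0.4, 0.3]}  # placeholder history-like object


def run(gParameters):
    history = train_model(gParameters)
    return history  # may be omitted for upf, required for HPO


if __name__ == "__main__":
    params = initialize_parameters()
    run(params)
```

The important points are only the two entry points and the returned history; everything inside run() is your existing model code.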

Argument Specification

In order to take advantage of the CANDLE framework, a model needs to be able to modify its parameters either by reading them from the default_model file or by overwriting those parameters via an appropriate command line argument. We standardized frequently used ML keywords, as well as certain other keywords which are used by the CANDLE scripts. We recommend that users be aware of these arguments to avoid conflicts. For the CANDLE built-in command line arguments, please see default_utils.py.

Adding keyword

In order to simplify the process of adding keywords, we require the user to provide a list of metadata describing how to parse each argument.

    [{
      'name':'shared_nnet_spec', // <1>
      'nargs':'+', // <2>
      'type': int, // <3>
      'help':'network structure of shared layer' // <4>
    }, ...]

1. required. Name of the parameter.
2. optional. The number of command-line arguments.
3. required. The type to which the command-line arguments should be converted.
4. optional. A brief description of what the argument does. You can add default, choices, and action as needed.
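For example, a hypothetical additional_definitions list with two custom keywords could look like this; the second entry (dropout) is invented purely for illustration and is not part of the documented benchmark.

```python
# Hypothetical keyword definitions; "dropout" is an illustrative addition.
additional_definitions = [
    {"name": "shared_nnet_spec",
     "nargs": "+",
     "type": int,
     "help": "network structure of shared layer"},
    {"name": "dropout",
     "type": float,
     "default": 0.1,
     "help": "dropout rate applied after each dense layer"},
]
```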

Building Class

When you have a list of additional parameters, you need to pass the definitions to be parsed. Even if you don't have any additional parameters, this is generally recommended, since you can build your own shared methods and shared data processing code.

Please take a look at this example. This is the source of the common.MNIST class definition.

    additional_definitions = None
    required = None

    class MNIST(default_utils.Benchmark):
        def set_locals(self):
            if required is not None:
                self.required = set(required)
            if additional_definitions is not None:
                self.additional_definitions = additional_definitions

Thread Optimization

On some HPC machines, such as Theta, performance is greatly improved if we let CANDLE handle threads. So it is generally recommended to have code like lines 14 to 21 in this example.
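The exact lines are in the example linked above; as a hedged sketch, under the assumption that the benchmark runs on TensorFlow 1.x with the Keras backend, thread handling of this kind typically looks like the following (the environment variable and thread counts are placeholders, not values prescribed by CANDLE):

```python
import os

import tensorflow as tf
from keras import backend as K

# Assumed source for the thread count; CANDLE/Swift-T launchers commonly
# export something like OMP_NUM_THREADS for each model process.
intra_threads = int(os.environ.get("OMP_NUM_THREADS", "4"))

# Limit TensorFlow's own thread pools so the launcher controls parallelism.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=intra_threads,
                              inter_op_parallelism_threads=2)
K.set_session(tf.Session(config=session_conf))
```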

How to run CANDLE compliant code in Theta

As mentioned above, we offer two different workflows in CANDLE: Unrolled Parameter File (UPF) and Hyper Parameter Optimization (HPO). The UPF workflow allows you to run parallel multi-node executions with different parameters, while the HPO workflow searches for the best hyperparameter values based on the mlrMBO algorithm.

Running UPF on Theta

Step 1. Check out the Supervisor repo

    $ git clone https://github.com/ECP-CANDLE/Supervisor.git

Step 2. Move to the upf workflow directory

    $ cd Supervisor/workflow/upf

Step 3. Set env variables. In test/cfg-sys-1.sh, you will need to set BENCHMARK_DIR to point to the directory where your script is located, and MODEL_PYTHON_SCRIPT to name the script you want to run.

    BENCHMARK_DIR=directory_where_my_script_locates
    MODEL_PYTHON_SCRIPT=my_script

Step 4. Set the execution plan. Check test/upf-1.txt for the parameter configuration and modify it as needed. This file contains multiple JSON documents; each JSON document contains the command line parameters for one run. For example,

    {"id": "test0", "epochs": 10}
    {"id": "test1", "epochs": 20}

This will invoke two instances, which will run 10 epochs and 20 epochs respectively.

Step 5. Submit your job. You will need to set QUEUE, PROJECT, PROCS, and WALLTIME. You can configure these in cfg-sys-1.sh (see Step 3), set them as env variables, or provide them on the command line (see below).

    $ export QUEUE=default
    $ export PROJECT=myproject
    $ export PROCS=3
    $ export WALLTIME=01:00:00

    $ ./test/upf-1.sh theta upf-1.txt

    // or

    $ QUEUE=default PROJECT=myproject PROCS=3 WALLTIME=01:00:00 ./test/upf-1.sh theta upf-1.txt

Step 6. Check queue status

    $ qstat -h user_name -f

Running mlrMBO based Hyperparameter Optimization (HPO) on Theta

Step 1. Check out the Supervisor repo

    $ git clone https://github.com/ECP-CANDLE/Supervisor.git

Step 2. Move to the mlrMBO workflow directory

    $ cd Supervisor/workflow/mlrMBO

Step 3. Set env variables. In test/cfg-sys-1.sh, you will need to set BENCHMARK_DIR to point to the directory where your script is located, and MODEL_PYTHON_SCRIPT to name the script you want to run.

    BENCHMARK_DIR=directory_where_my_script_locates
    MODEL_PYTHON_SCRIPT=my_script

Step 4. Configure the hyperparameters. In this step, we configure the parameter sets that will be iteratively evaluated. For example, you can create workflow/data/mnist.R as below.

    param.set <- makeParamSet(
      makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512)),
      makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")),
      makeDiscreteParam("optimizer", values=c("adam", "sgd", "rmsprop")),
      makeIntegerParam("epochs", lower=20, upper=20)
    )

In this example, we are varying four parameters: batch_size, activation, optimizer, and epochs. The entire parameter space is 5 x 3 x 3 x 1 = 45 combinations.

After creating this file, we need to point to it:

    $ export PARAM_SET_FILE=mnist.R

Step 5. Submit your job.

    $ ./test/test-1.sh mnist theta

The first argument is MODEL_NAME. If the name is registered in test/cfg-prm-1.sh, the pre-configured parameter file will be used. Otherwise, CANDLE will use the PARAM_SET_FILE we configured in Step 4.

You can specify the HPO search strategy. As you can see in test/cfg-prm-1.sh, you can configure PROPOSE_POINTS, MAX_CONCURRENT_EVALUATIONS, MAX_ITERATIONS, MAX_BUDGET, and DESIGN_SIZE.

- DESIGN_SIZE is the number of parameter sets evaluated at the beginning of the HPO search. In this example, CANDLE will select 10 random parameter sets out of the 45 (see Step 4 for the breakdown).
- MAX_ITERATIONS is the number of iterations.
- PROPOSE_POINTS is the number of parameter sets that CANDLE will evaluate in each iteration. So, if MAX_ITERATIONS=3 and PROPOSE_POINTS=5, CANDLE will end up evaluating 25 parameter sets (10 + 3 x 5).
- MAX_BUDGET should be greater than the total number of evaluations; in this example, 45.
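As a worked example of how these settings interact, hypothetical cfg-prm-1.sh values consistent with the numbers above would be:

```
# Hypothetical cfg-prm-1.sh settings (illustrative values only):
# 10 initial points + 3 iterations x 5 proposals = 25 evaluations <= MAX_BUDGET
DESIGN_SIZE=10
MAX_ITERATIONS=3
PROPOSE_POINTS=5
MAX_CONCURRENT_EVALUATIONS=5
MAX_BUDGET=45
```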

+ + + diff --git a/python/eqpy/eqpy.py b/python/eqpy/eqpy.py index 30cfc3e7..99a8debc 100644 --- a/python/eqpy/eqpy.py +++ b/python/eqpy/eqpy.py @@ -1,5 +1,5 @@ -import threading import sys +import threading try: from queue import Queue @@ -10,9 +10,11 @@ input_q = Queue() output_q = Queue() + def OUT_put(string_params): output_q.put(string_params) + def IN_get(): global input_q result = input_q.get() diff --git a/python/hyperopt/Readme.md b/python/hyperopt/Readme.md index 028e92b6..f1132a0e 100644 --- a/python/hyperopt/Readme.md +++ b/python/hyperopt/Readme.md @@ -1,21 +1,24 @@ -# EQPy-enabled Hyperopt # +# EQPy-enabled Hyperopt Files: -* eqpy_hyperopt/ - eqpy_hyperopt python package -* tests/ - unit tests for eqpy_hyperopt +- eqpy_hyperopt/ - eqpy_hyperopt python package +- tests/ - unit tests for eqpy_hyperopt eqpy_hyperopt/hyperopt_runner.py contains code that integrates hyperopt with a swift script via eqpy get and put calls. -Initialize eqpy_hyperopt from swift with +Initialize eqpy_hyperopt from swift with + ``` EQPy_init_package(ME,"eqpy_hyperopt.hyperopt_runner") ``` + On initialization eqpy_hyperopt will put an empty string in the output queue as handshake for swift to receive. Swift should then send a string containing the hyperopt parameters. This string should be formatted as a python dictionary. For example, + ``` { 'space' : hyperopt.hp.uniform(\'x\', -2, 2), @@ -24,24 +27,25 @@ python dictionary. For example, 'param_batch_size' : 10 } ``` + The elements of the dictionary are: -* space : see https://github.com/hyperopt/hyperopt/wiki/FMin#2-defining-a-search-space - The set of possible arguments to the model. +- space : see https://github.com/hyperopt/hyperopt/wiki/FMin#2-defining-a-search-space + The set of possible arguments to the model. -* algo : search algorithm - This object, such as `hyperopt.rand.suggest` and - `hyperopt.tpe.suggest` provides logic for sequential search of the - hyperparameter space. +- algo : search algorithm + This object, such as `hyperopt.rand.suggest` and + `hyperopt.tpe.suggest` provides logic for sequential search of the + hyperparameter space. -* max_evals : int - Allow up to this many function evaluations before returning. +- max_evals : int + Allow up to this many function evaluations before returning. -* param_batch_size : int - Retrieve at most this many new parameters sets from the search - algorithm for evaluation up to max_evals. Note that the actual - number of new parameter sets to evaluate is dependent on the - search algorithm. +- param_batch_size : int + Retrieve at most this many new parameters sets from the search + algorithm for evaluation up to max_evals. Note that the actual + number of new parameter sets to evaluate is dependent on the + search algorithm. Once these are received eqpy_hyperopt will initialize hyperopt and put the first of set (up to `param_batch_size`) in size in the output queue for swift @@ -50,9 +54,11 @@ model with these parameters. The evaluation results should be returned as a "," separated string where is element is a single number. For example, + ``` -1.23434,0.42422,-0.0001 ``` + The order of the results in the results string should match the order of the parameters (i.e. the first number in the results string is the result of the first model evaluation). @@ -61,7 +67,8 @@ When the `max_evals` number of evaluations has occurred, eqpy_hyperopt will put "FINAL" in the output queue, and then put the best parameters in the output queue. 
-## Tests ## +## Tests + The tests test basic eqpy_hyperopt functionality by running it 'stand-alone' without any eqpy mediated interation and also using eqpy but in a pure python context. @@ -70,7 +77,8 @@ Run the unit tests from within the tests directory with `python -m unittest test_hyperopt` - Source settings.sh to set the PYTHONPATH correctly. - - ## Misc ## - Pymongo / BSON were causing issues on Cori so that's "monkey patched" by setting `hyperopt.base.have_bson = False`. +Source settings.sh to set the PYTHONPATH correctly. + +## Misc + +Pymongo / BSON were causing issues on Cori so that's "monkey patched" by setting `hyperopt.base.have_bson = False`. diff --git a/python/hyperopt/eqpy_hyperopt/hyperopt_runner.py b/python/hyperopt/eqpy_hyperopt/hyperopt_runner.py index ee97269b..be590726 100644 --- a/python/hyperopt/eqpy_hyperopt/hyperopt_runner.py +++ b/python/hyperopt/eqpy_hyperopt/hyperopt_runner.py @@ -1,18 +1,20 @@ from __future__ import print_function -import numpy as np -import eqpy -from hyperopt import base, hp +import eqpy import hyperopt +import numpy as np +from hyperopt import base, hp # monkey patch hyperopt not to use bson. We don't # use any of the pymongo / bson parts of hyperopt and # they cause issues when running on Cori's compute node base.have_bson = False + class Runner: - def __init__(self, algo, domain, max_evals, param_batch_size, trials, rstate): + def __init__(self, algo, domain, max_evals, param_batch_size, trials, + rstate): self.algo = algo self.domain = domain self.max_evals = max_evals @@ -27,13 +29,13 @@ def run(self): if n_to_enqueue + done > self.max_evals: n_to_enqueue = self.max_evals - done - #print("to enqueue {}".format(n_to_enqueue)) + # print("to enqueue {}".format(n_to_enqueue)) new_ids = self.trials.new_trial_ids(n_to_enqueue) - #print("new_ids size: {}".format(len(new_ids))) + # print("new_ids size: {}".format(len(new_ids))) self.trials.refresh() new_trials = self.algo(new_ids, self.domain, self.trials, - self.rstate.randint(2 ** 31 - 1)) + self.rstate.randint(2**31 - 1)) if len(new_trials): self.trials.insert_trial_docs(new_trials) self.trials.refresh() @@ -45,16 +47,20 @@ def run(self): self.trials.refresh() def evaluate(self): - new_trials = [t for t in self.trials._dynamic_trials if t['state'] == base.JOB_STATE_NEW] - params = [t['misc']['vals'] for t in new_trials] + new_trials = [ + t for t in self.trials._dynamic_trials + if t["state"] == base.JOB_STATE_NEW + ] + params = [t["misc"]["vals"] for t in new_trials] rvals = self.domain.fn(params) for i in range(len(new_trials)): t = new_trials[i] - t['result'] = rvals[i] - t['state'] = base.JOB_STATE_DONE + t["result"] = rvals[i] + t["state"] = base.JOB_STATE_DONE self.trials.refresh() + def eqpy_func(params): retvals = [] # unpack and send to out @@ -64,10 +70,11 @@ def eqpy_func(params): # get result and format for hyperopt result = eqpy.IN_get() split_result = result.split(",") - return [{'loss': float(x), 'status' : base.STATUS_OK} for x in split_result] + return [{"loss": float(x), "status": base.STATUS_OK} for x in split_result] + def run(): - """run function for eqpy based run""" + """run function for eqpy based run.""" eqpy.OUT_put("") # params should be formatted as a dictionary @@ -76,14 +83,22 @@ def run(): trials = base.Trials() rstate = None - if 'seed' in hp_dict: - rstate = np.random.RandomState(hp_dict['seed']) - - fmin(eqpy_func, hp_dict['space'], hp_dict['algo'], hp_dict['max_evals'], - hp_dict['param_batch_size'], trials, rstate) + if "seed" in hp_dict: + rstate = 
np.random.RandomState(hp_dict["seed"]) + + fmin( + eqpy_func, + hp_dict["space"], + hp_dict["algo"], + hp_dict["max_evals"], + hp_dict["param_batch_size"], + trials, + rstate, + ) eqpy.OUT_put("FINAL") eqpy.OUT_put(str(trials.argmin)) + def fmin(fn, space, algo, max_evals, param_batch_size, trials, rstate=None): """Minimize a function over a hyperparameter space. @@ -128,7 +143,8 @@ def fmin(fn, space, algo, max_evals, param_batch_size, trials, rstate=None): a trials object, then that trials object will be affected by side-effect of this call. - rstate : numpy.RandomState, default numpy.random""" + rstate : numpy.RandomState, default numpy.random + """ if rstate is None: rstate = np.random.RandomState() @@ -136,6 +152,5 @@ def fmin(fn, space, algo, max_evals, param_batch_size, trials, rstate=None): # need a domain to pass to the algorithm to provide the space domain = base.Domain(fn, space, pass_expr_memo_ctrl=None) - runner = Runner(algo, domain, max_evals, param_batch_size, - trials, rstate) + runner = Runner(algo, domain, max_evals, param_batch_size, trials, rstate) runner.run() diff --git a/python/hyperopt/tests/test_hyperopt.py b/python/hyperopt/tests/test_hyperopt.py index 98b9de12..763862e1 100644 --- a/python/hyperopt/tests/test_hyperopt.py +++ b/python/hyperopt/tests/test_hyperopt.py @@ -1,58 +1,71 @@ from __future__ import print_function -import eqpy_hyperopt.hyperopt_runner as hr -from hyperopt import hp, base, tpe, rand -import numpy as np +import ast import math - import threading -import eqpy -import ast - import unittest +import eqpy +import eqpy_hyperopt.hyperopt_runner as hr +import numpy as np +from hyperopt import base, hp, rand, tpe def math_sin_func(params): retvals = [] - #print("len params: {}".format(len(params))) + # print("len params: {}".format(len(params))) for p in params: - x = p['x'][0] + x = p["x"][0] r = math.sin(x) - retvals.append({'loss': float(r), 'status': base.STATUS_OK}) + retvals.append({"loss": float(r), "status": base.STATUS_OK}) return retvals + class TestHyperopt(unittest.TestCase): def test_simple_rand(self): - space = hp.uniform('x', -2, 2) + space = hp.uniform("x", -2, 2) max_evals = 100 trials = base.Trials() - algo = rand.suggest #tpe.suggest + algo = rand.suggest # tpe.suggest param_batch_size = 10 # if seed is changed then the test will fail rstate = np.random.RandomState(42) - hr.fmin(math_sin_func, space, algo, max_evals, - param_batch_size, trials, rstate=rstate) + hr.fmin( + math_sin_func, + space, + algo, + max_evals, + param_batch_size, + trials, + rstate=rstate, + ) self.assertEqual(len(trials.results), 100) - self.assertAlmostEqual(trials.argmin['x'], -1.5805633657891858) + self.assertAlmostEqual(trials.argmin["x"], -1.5805633657891858) def test_simple_tpe(self): - space = hp.uniform('x', -2, 2) + space = hp.uniform("x", -2, 2) max_evals = 100 trials = base.Trials() - algo = tpe.suggest #tpe.suggest + algo = tpe.suggest # tpe.suggest # note that tpe won't always return more than 1 # parameter conbimation max_parallel_param_count = 10 # if seed is changed then the test will fail rstate = np.random.RandomState(42) - hr.fmin(math_sin_func, space, algo, max_evals, - max_parallel_param_count, trials, rstate=rstate) + hr.fmin( + math_sin_func, + space, + algo, + max_evals, + max_parallel_param_count, + trials, + rstate=rstate, + ) self.assertEqual(len(trials.results), 100) - self.assertAlmostEqual(trials.argmin['x'], -1.5708577298673572) + self.assertAlmostEqual(trials.argmin["x"], -1.5708577298673572) def test_eqpy(self): p = 
threading.Thread(target=hr.run) @@ -66,25 +79,27 @@ def test_eqpy(self): eqpy.input_q.put(hp_params_dict) # gets initial set of candidate parameters result = eqpy.output_q.get() - while (True): + while True: # result = {'x': [1.8382913715287232]};{...} split_result = result.split(";") - rs = ",".join([str(math.sin(ast.literal_eval(r)['x'][0])) for r in split_result]) + rs = ",".join([ + str(math.sin(ast.literal_eval(r)["x"][0])) for r in split_result + ]) # iff algo is rand.suggest, then len(split_result) should # equal max_parallel_param_count self.assertEqual(len(split_result), 10) eqpy.input_q.put(rs) # get the next set of candidate parameters result = eqpy.output_q.get() - if (result == "FINAL"): + if result == "FINAL": break # get final result self.assertEqual("{'x': -1.5477895914281512}", eqpy.output_q.get()) def test_no_seed(self): - """ Tests that passing no seed to eqpy_hyperopt doesn't raise - an exception """ + """Tests that passing no seed to eqpy_hyperopt doesn't raise an + exception.""" p = threading.Thread(target=hr.run) p.start() @@ -97,21 +112,24 @@ def test_no_seed(self): eqpy.input_q.put(hp_params_dict) # gets initial set of candidate parameters result = eqpy.output_q.get() - while (True): + while True: # result = {'x': [1.8382913715287232]};{...} split_result = result.split(";") - rs = ",".join([str(math.sin(ast.literal_eval(r)['x'][0])) for r in split_result]) + rs = ",".join([ + str(math.sin(ast.literal_eval(r)["x"][0])) for r in split_result + ]) # iff algo is rand.suggest, then len(split_result) should # equal max_parallel_param_count self.assertEqual(len(split_result), 10) eqpy.input_q.put(rs) # get the next set of candidate parameters result = eqpy.output_q.get() - if (result == "FINAL"): + if result == "FINAL": break # get final result self.assertTrue(len(eqpy.output_q.get()) > 0) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/scratch/csv2f64/f64_2csv.c b/scratch/csv2f64/f64_2csv.c index 0cdf2c29..43c74005 100644 --- a/scratch/csv2f64/f64_2csv.c +++ b/scratch/csv2f64/f64_2csv.c @@ -37,7 +37,7 @@ main(int argc, char* argv[]) int total_rows, total_cols; sscanf(argv[3], "%i", &total_rows); sscanf(argv[4], "%i", &total_cols); - + bool result = convert(input, output, total_rows, total_cols); if (!result) { @@ -93,7 +93,7 @@ convert_fps(FILE* fp_i, FILE* fp_o, int total_rows, int total_cols) int cols = 1; // current column counter int rows = 1; // current row counter size_t actual_r = 0; // actual number of items read at last fread - int offset = 0; // starting offset in chars + int offset = 0; // starting offset in chars bool b; int i = 0; while (true) @@ -119,7 +119,7 @@ convert_fps(FILE* fp_i, FILE* fp_o, int total_rows, int total_cols) fprintf(fp_o, "\n"); rows++; } - + free(chars); free(floats); return true; @@ -185,4 +185,3 @@ format_double(int total_cols, double value, return true; } - diff --git a/scratch/csv2f64/f64_2hdf.py b/scratch/csv2f64/f64_2hdf.py index 9e1bc741..fdae655f 100644 --- a/scratch/csv2f64/f64_2hdf.py +++ b/scratch/csv2f64/f64_2hdf.py @@ -1,30 +1,27 @@ #!/usr/bin/env python3 +import argparse import sys import h5py import numpy as np -import argparse parser = argparse.ArgumentParser() -parser.add_argument('input', - help='The input f64 file') -parser.add_argument('output', - help='The output HDF file') +parser.add_argument("input", help="The input f64 file") +parser.add_argument("output", help="The output HDF file") args = parser.parse_args(sys.argv[1:]) print(args.input, args.output) - -f = 
h5py.File(args.output, 'r+') +f = h5py.File(args.output, "r+") print(f.keys()) -ds = f['conv1d_1']['conv1d_1']['kernel:0'] -a = ds[:,:,:] +ds = f["conv1d_1"]["conv1d_1"]["kernel:0"] +a = ds[:, :, :] # print(ds.shape) # print(ds.dtype) -a8 = a.astype('float64') +a8 = a.astype("float64") # print(a[0,0,0]) -a8 = np.fromfile(args.input, dtype='float64') +a8 = np.fromfile(args.input, dtype="float64") diff --git a/scratch/csv2f64/hdf2f64.py b/scratch/csv2f64/hdf2f64.py index 8bd4fd35..3b111c4d 100644 --- a/scratch/csv2f64/hdf2f64.py +++ b/scratch/csv2f64/hdf2f64.py @@ -1,26 +1,24 @@ #!/usr/bin/env python3 +import argparse import sys import h5py -import argparse parser = argparse.ArgumentParser() -parser.add_argument('input', - help='The input H5 file') -parser.add_argument('output', - help='The output f64 file') +parser.add_argument("input", help="The input H5 file") +parser.add_argument("output", help="The output f64 file") args = parser.parse_args(sys.argv[1:]) print(args) -f = h5py.File(args.input, 'r') +f = h5py.File(args.input, "r") print(f.keys()) -ds = f['conv1d_1']['conv1d_1']['kernel:0'] -a = ds[:,:,:] +ds = f["conv1d_1"]["conv1d_1"]["kernel:0"] +a = ds[:, :, :] # print(ds.shape) # print(ds.dtype) -a8 = a.astype('float64') +a8 = a.astype("float64") a8.tofile(args.output) diff --git a/scratch/csv2f64/inject-noise.py b/scratch/csv2f64/inject-noise.py index 7079c86f..3d263936 100644 --- a/scratch/csv2f64/inject-noise.py +++ b/scratch/csv2f64/inject-noise.py @@ -1,15 +1,14 @@ #!/usr/bin/env python3 -import random, sys +import argparse +import random +import sys import numpy as np -import argparse parser = argparse.ArgumentParser() -parser.add_argument('file', - help='The file to modify') -parser.add_argument('rate', - help='The fraction to modify') +parser.add_argument("file", help="The file to modify") +parser.add_argument("rate", help="The fraction to modify") args = parser.parse_args(sys.argv[1:]) @@ -17,9 +16,9 @@ rate = float(args.rate) -a8 = np.fromfile(args.file, dtype='float64') -print('input size: ', a8.shape[0]) -print('flip pct: ', rate, '%') +a8 = np.fromfile(args.file, dtype="float64") +print("input size: ", a8.shape[0]) +print("flip pct: ", rate, "%") rate = rate / 100 flips = 0 @@ -30,4 +29,4 @@ flips += 1 a8.tofile(args.file) -print('flipped: ', flips) +print("flipped: ", flips) diff --git a/scratch/csv2f64/test/data-4x3.csv b/scratch/csv2f64/test/data-4x3.csv index 0eb6ecb8..551dac9a 100644 --- a/scratch/csv2f64/test/data-4x3.csv +++ b/scratch/csv2f64/test/data-4x3.csv @@ -2,5 +2,3 @@ 0.2,5,100 6,5,10 70,6.2,-2 - - diff --git a/scratch/csv2f64/test/data-5x3.csv b/scratch/csv2f64/test/data-5x3.csv index 2b7e069f..b1398260 100644 --- a/scratch/csv2f64/test/data-5x3.csv +++ b/scratch/csv2f64/test/data-5x3.csv @@ -3,4 +3,3 @@ 6,-16.5,10 70,6.2,-2 42,-32,22 - diff --git a/scratch/csv2f64/test/err-4x3-1.csv b/scratch/csv2f64/test/err-4x3-1.csv index a73da841..e4583f91 100644 --- a/scratch/csv2f64/test/err-4x3-1.csv +++ b/scratch/csv2f64/test/err-4x3-1.csv @@ -2,5 +2,3 @@ 0.2,,5,100 6,5,10 70,6.2,-2 - - diff --git a/scratch/csv2f64/test/err-4x3-2.csv b/scratch/csv2f64/test/err-4x3-2.csv index a77de647..ae6a53f8 100644 --- a/scratch/csv2f64/test/err-4x3-2.csv +++ b/scratch/csv2f64/test/err-4x3-2.csv @@ -2,5 +2,3 @@ 0.2,,5,100 6,5,10x 70,6.2,-2 - - diff --git a/scratch/fake-lbann/test_1.py b/scratch/fake-lbann/test_1.py index 4d1c9d75..9f1e4500 100644 --- a/scratch/fake-lbann/test_1.py +++ b/scratch/fake-lbann/test_1.py @@ -1,4 +1,5 @@ import os + import fl_interface comm = 
os.getenv("COMM") diff --git a/scratch/histawk/hist.awk b/scratch/histawk/hist.awk index f968525e..e05c75f5 100644 --- a/scratch/histawk/hist.awk +++ b/scratch/histawk/hist.awk @@ -12,7 +12,7 @@ BEGIN { } } -{ +{ C[$0] = C[$0]+1; } diff --git a/scratch/horovod/horovod-1.py b/scratch/horovod/horovod-1.py index cddb5c20..de2326a1 100644 --- a/scratch/horovod/horovod-1.py +++ b/scratch/horovod/horovod-1.py @@ -1,2 +1,2 @@ horovod = "/home/wozniak/proj/horovod" -execfile(horovod+"/examples/keras_mnist.py") +execfile(horovod + "/examples/keras_mnist.py") diff --git a/scratch/horovod/test-2.swift b/scratch/horovod/test-2.swift index 81ead2f7..23ed8569 100644 --- a/scratch/horovod/test-2.swift +++ b/scratch/horovod/test-2.swift @@ -6,4 +6,4 @@ int exitcode = @par=2 launch("python", a1); printf("%i", exitcode); string a2[] = [ "/home/nick/Documents/repos/horovod/examples/keras_mnist.py", "Instance_2" ]; -int e2 = @par=2 launch("python", a2); \ No newline at end of file +int e2 = @par=2 launch("python", a2); diff --git a/scratch/horovod2/test-2.C b/scratch/horovod2/test-2.C index 7d6a7ee5..e49e5253 100644 --- a/scratch/horovod2/test-2.C +++ b/scratch/horovod2/test-2.C @@ -5,8 +5,7 @@ #include "controller.h" -int -main() +int main() { printf("OK\n"); return 0; diff --git a/scratch/horovod2/test-5-1.py b/scratch/horovod2/test-5-1.py index 1a1d09fc..ea84ab82 100644 --- a/scratch/horovod2/test-5-1.py +++ b/scratch/horovod2/test-5-1.py @@ -1,19 +1,18 @@ - # TEST 5-1 from __future__ import print_function print("TEST 5-1 PY") +import math + +import horovod.keras as hvd import keras +import tensorflow as tf +from keras import backend as K from keras.datasets import mnist +from keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K -import math -import tensorflow as tf -import horovod.keras as hvd # Horovod: initialize Horovod. hvd.init() diff --git a/scratch/indices/i1.swift b/scratch/indices/i1.swift index 173ec82a..07347170 100644 --- a/scratch/indices/i1.swift +++ b/scratch/indices/i1.swift @@ -29,10 +29,9 @@ printf("A:") => printf("B[\"%s\"]=%i", k0, v0); C[string2int(k0)] = v0; } - + foreach v1, k1 in C { printf("C[%i]=%i", k1, v1); } } - diff --git a/scratch/launch-opts/README.adoc b/scratch/launch-opts/README.adoc index 468b5413..7a287a85 100644 --- a/scratch/launch-opts/README.adoc +++ b/scratch/launch-opts/README.adoc @@ -4,7 +4,7 @@ Edit +example.sh+ to select a +TURBINE_LAUNCH_OPTIONS+. Then: ---- -$ ./example.sh +$ ./example.sh TURBINE-THETA SCRIPT NODES=2 PROCS=2 @@ -15,7 +15,7 @@ running qsub ... Job routed to queue "debug-cache-quad". Memory mode set to cache quad for queue debug-cache-quad JOB_ID=161050 -$ +$ ---- Wait for job completion. 
Then: diff --git a/scratch/load/load.py b/scratch/load/load.py index 96613588..d14e5001 100644 --- a/scratch/load/load.py +++ b/scratch/load/load.py @@ -1,11 +1,10 @@ - # LOAD PY # Convert the date/time markers in the extracted start/stop times # into plottable data -from datetime import datetime import sys +from datetime import datetime load = 0 D = [] @@ -14,15 +13,18 @@ print("usage: load.py START STOP < INPUT") exit(1) + def parse(d): return datetime.strptime(d, "%Y-%m-%d %H:%M:%S") + def emit(d, old_load, load): print("%0.2f %03i" % (d.timestamp() - ts_start, old_load)) print("%0.2f %03i" % (d.timestamp() - ts_start, load)) + start = parse(sys.argv[1]) -stop = parse(sys.argv[2]) +stop = parse(sys.argv[2]) ts_start = start.timestamp() diff --git a/scratch/py-eval/py/err.py b/scratch/py-eval/py/err.py index 3892497c..aa2bdf66 100644 --- a/scratch/py-eval/py/err.py +++ b/scratch/py-eval/py/err.py @@ -1 +1 @@ -2+ +# 2+ diff --git a/scratch/py-eval/py/import-stringio.py b/scratch/py-eval/py/import-stringio.py index 2f93186a..0e4adf21 100644 --- a/scratch/py-eval/py/import-stringio.py +++ b/scratch/py-eval/py/import-stringio.py @@ -1,3 +1,5 @@ from StringIO import StringIO + + def get_string_io(): return StringIO() diff --git a/scratch/py-eval/py/numpy-array.py b/scratch/py-eval/py/numpy-array.py index 99f8c648..676903b5 100644 --- a/scratch/py-eval/py/numpy-array.py +++ b/scratch/py-eval/py/numpy-array.py @@ -1,2 +1 @@ A = numpy.array(3) - diff --git a/scratch/py-eval/py/numpy-print-A.py b/scratch/py-eval/py/numpy-print-A.py index 089fd8aa..c757e0ac 100644 --- a/scratch/py-eval/py/numpy-print-A.py +++ b/scratch/py-eval/py/numpy-print-A.py @@ -1,3 +1 @@ print(A) - - diff --git a/scratch/resizer/resize.py b/scratch/resizer/resize.py index ce75a988..ecd6275b 100644 --- a/scratch/resizer/resize.py +++ b/scratch/resizer/resize.py @@ -1,41 +1,50 @@ - # RESIZE PY description = "Resize and/or add noise to CSV data." + def parse_args(): import argparse + parser = argparse.ArgumentParser(description=description) - parser.add_argument("--resize", action="store", default=1.0, - help=""" + parser.add_argument( + "--resize", + action="store", + default=1.0, + help=""" Output size scale compared to input size as float. Examples: 1.0=same size, 0.5=half size, - 2.0=double size.""") - parser.add_argument("--noise", action="store", default=0.0, - help="""" + 2.0=double size.""", + ) + parser.add_argument( + "--noise", + action="store", + default=0.0, + help="""" Noise injection as float. 
Examples: 0.0=no noise - 0.1=noise +/- 10%""") - parser.add_argument("input", action="store", - help="The input CSV.") - parser.add_argument("output", action="store", - help="The output CSV.") + 0.1=noise +/- 10%""", + ) + parser.add_argument("input", action="store", help="The input CSV.") + parser.add_argument("output", action="store", help="The output CSV.") args = parser.parse_args() argvars = vars(args) # print(str(argvars)) return argvars + def write_data(args, fp, data_out): from random import random + wholes = int(float(args["resize"])) noise = float(args["noise"]) rows, cols = data_out.shape for i in range(0, wholes): for row in range(0, rows): - for col in range(0, cols-1): + for col in range(0, cols - 1): value = data_out[row, col] if noise != 0.0: value = value * (1 - noise) + value * (noise * 2) * random() @@ -48,7 +57,7 @@ def write_data(args, fp, data_out): fp.write("\n") fraction = float(args["resize"]) - wholes for row in range(0, int(fraction * rows)): - for col in range(0, cols-1): + for col in range(0, cols - 1): value = data_out[row, col] if noise != 0.0: value = value * (1 - noise) + value * (noise * 2) * random() @@ -60,20 +69,22 @@ def write_data(args, fp, data_out): fp.write("%f" % value) fp.write("\n") + import sys + import numpy as np args = parse_args() -data_in = np.loadtxt(args["input"], delimiter=",") +data_in = np.loadtxt(args["input"], delimiter=",") data_out = np.copy(data_in) if args["output"] == "/dev/stdout" or args["output"] == "-": fp = sys.stdout -else: +else: fp = open(args["output"], "w") - + write_data(args, fp, data_out) -if fp is not sys.stdout: +if fp is not sys.stdout: fp.close() diff --git a/scratch/swift-tests/fake-model.py b/scratch/swift-tests/fake-model.py index 57726f66..480627a8 100644 --- a/scratch/swift-tests/fake-model.py +++ b/scratch/swift-tests/fake-model.py @@ -1,4 +1,3 @@ - # import something ? 
print("fake-model.py: python works") diff --git a/scripts/shrink-log.py b/scripts/shrink-log.py index 12f9d5ff..74e64900 100644 --- a/scripts/shrink-log.py +++ b/scripts/shrink-log.py @@ -1,4 +1,3 @@ - # SHRINK LOG PY # argv: 2 filenames : tr-*.log and summary-*.log # Called by shrink-log-single.sh @@ -8,10 +7,13 @@ # Removes redundant batch size information # Fixes newline before "Current time" report -import os, re, stat, sys, time +import os +import re +import stat +import sys +import time from collections import deque - # Only 1/shrink_factor training lines are copied shrink_factor = 200 # Number of additional consecutive lines at beginning and end of @@ -26,7 +28,8 @@ def shrink(fp_in, fp_out): starts = 0 # Initial hold_space ETAs are immediately printed line_previous = "" for line in fp_in: - if len(line) == 1: continue # Blank line + if len(line) == 1: + continue # Blank line line = line.replace("\b", "") if "batch:" in line or "Current" in line: line = re.sub("- batch: .* 32.0000 -", "", line) @@ -59,20 +62,20 @@ def hsize(size, decimal_places=2): if size < 1024: return "%4i B" % size size /= 1024 - for unit in ["KB","MB","GB","TB"]: + for unit in ["KB", "MB", "GB", "TB"]: if size < 1024: break size /= 1024 return f"{size:.{decimal_places}f} {unit}" -file_in = sys.argv[1] +file_in = sys.argv[1] file_out = sys.argv[2] # Do not process files that have not changed since the last run # of this script: -if os.path.exists(file_out) and \ - os.path.getmtime(file_in) < os.path.getmtime(file_out): +if os.path.exists( + file_out) and os.path.getmtime(file_in) < os.path.getmtime(file_out): print("skipping: " + file_in) exit() @@ -80,8 +83,7 @@ def hsize(size, decimal_places=2): s0 = os.stat(file_in) z0 = s0[stat.ST_SIZE] h0 = hsize(z0) -print("shrink: %11s %s" % - (h0, file_in)) +print("shrink: %11s %s" % (h0, file_in)) with open(file_in, "r") as fp_in: with open(file_out, "w") as fp_out: @@ -92,7 +94,7 @@ def hsize(size, decimal_places=2): z1 = s1[stat.ST_SIZE] t = t1 - t0 -rate = hsize(z0/t) +rate = hsize(z0 / t) print("shrank: %0.2fs %11s/s %11s -> %11s %s" % (t, rate, hsize(z0), hsize(z1), file_in)) diff --git a/spack/spack.yaml b/spack/spack.yaml index 7e7743f5..eac5b770 100644 --- a/spack/spack.yaml +++ b/spack/spack.yaml @@ -5,18 +5,18 @@ spack: # add package specs to the `specs` list specs: - - py-keras ^py-theano+gpu - - py-scikit-learn - - py-pandas - - py-requests - - py-mdanalysis ^py-matplotlib@:2.2.3+image - - r-mlrmbo ^r-plotly@4.5.6 - - r-rgenoud - - r-dicekriging - - r-randomforest - - r-jsonlite - - stc@develop ^turbine@develop+python+r - - eqr + - py-keras ^py-theano+gpu + - py-scikit-learn + - py-pandas + - py-requests + - py-mdanalysis ^py-matplotlib@:2.2.3+image + - r-mlrmbo ^r-plotly@4.5.6 + - r-rgenoud + - r-dicekriging + - r-randomforest + - r-jsonlite + - stc@develop ^turbine@develop+python+r + - eqr config: {} mirrors: {} modules: @@ -25,12 +25,17 @@ spack: packages: all: providers: - mpi: [mvapich2@2.3 arch=linux-rhel7-x86_64, spectrum-mpi@rolling-release arch=linux-rhel7-ppc64le] + mpi: + [ + mvapich2@2.3 arch=linux-rhel7-x86_64, + spectrum-mpi@rolling-release arch=linux-rhel7-ppc64le, + ] buildable: true version: [] paths: {} modules: {} - compiler: [gcc@7.3.0 arch=linux-rhel7-x86_64, gcc@7.3.1 arch=linux-rhel7-ppc64le] + compiler: + [gcc@7.3.0 arch=linux-rhel7-x86_64, gcc@7.3.1 arch=linux-rhel7-ppc64le] python: buildable: true version: [3.7.2] diff --git a/workflows/GA/README.md b/workflows/GA/README.md index db42def1..bac4e162 100644 --- 
a/workflows/GA/README.md +++ b/workflows/GA/README.md @@ -1,34 +1,33 @@ -# GA (genetic algorithm) based based hyperparameter optimization on CANDLE Benchmarks # +# GA (genetic algorithm) based based hyperparameter optimization on CANDLE Benchmarks -The GA workflow uses the Python deap package (http://deap.readthedocs.io/en/master) to optimize hyperparameters using a genetic algorithm. +The GA workflow uses the Python deap package (http://deap.readthedocs.io/en/master) to optimize hyperparameters using a genetic algorithm. -## Running ## +## Running 1. cd into the **Supervisor/workflows/GA/test** directory 2. Specify the GA parameters in the **cfg-prm-1.sh** file (see [below](#structure) for more information on the GA parameters) 3. Specify the PROCS, QUEUE etc. in **cfg-sys-1.sh** file 4. You will pass the MODEL_NAME, SITE, and optional experiment id arguments to **test-1.sh** file when launching: -`./test-1.sh [expid]` -where `model_name` can be tc1 etc., `machine_name` can be local, cori, theta, titan etc. (see [NOTE](#making_changes) below on creating new SITE files.) + `./test-1.sh [expid]` + where `model_name` can be tc1 etc., `machine_name` can be local, cori, theta, titan etc. (see [NOTE](#making_changes) below on creating new SITE files.) 5. Update the parameter space json file if necessary. The parameter space is defined in json file (see **workflows/GA/data/tc1_param_space_ga.json** for an example with tc1). The -**cfg-prm-1.sh** script will attempt to select the correct json given the model name. Edit that file as appropriate. The parameter space json file is further described [here](#config) + **cfg-prm-1.sh** script will attempt to select the correct json given the model name. Edit that file as appropriate. The parameter space json file is further described [here](#config) 6. The benchmark will be run for the number of processors specified 7. Final objective function values, along with parameters, will be available in the experiments directory in a **finals_results** file and also printed to standard out. - -## User requirements ## +## User requirements What you need to install to run the workflow: -* This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . +- This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . Clone and switch to the `master` branch. Then `cd` to `workflows/GA` (the directory containing this README). -* TC1 or other benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . +- TC1 or other benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . Clone and switch to the `frameworks` branch. -* benchmark data - - See the individual benchmarks README for obtaining the initial data +- benchmark data - + See the individual benchmarks README for obtaining the initial data - Python specific installation requirements: +Python specific installation requirements: 1. pandas 2. deap @@ -41,10 +40,10 @@ directory to the PYTHONPATH specified in **cfg-sys-1.sh**. 
For example, `export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages` - -## Calling sequence ## +## Calling sequence Function calls: + ``` test-1.sh -> swift/workflow.sh -> @@ -61,13 +60,16 @@ test-1.sh -> swift/workflow.sh -> ``` Scheduling scripts: + ``` test-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files ``` + ## Making Changes To create your own SITE files in workflows/common/sh/: + - langs-SITE.sh - langs-app-SITE.sh - modules-SITE.sh @@ -77,17 +79,18 @@ copy existing ones but modify the langs-SITE.sh file to define the EQPy location ### Structure ### -The point of the script structure is that it is easy to make copy and modify the `test-*.sh` script, and the `cfg-*.sh` scripts. These can be checked back -into the repo for use by others. The `test-*.sh` script and the `cfg-*.sh` scripts should simply contain environment variables that control how `workflow.sh` +The point of the script structure is that it is easy to make copy and modify the `test-*.sh` script, and the `cfg-*.sh` scripts. These can be checked back +into the repo for use by others. The `test-*.sh` script and the `cfg-*.sh` scripts should simply contain environment variables that control how `workflow.sh` and `workflow.swift` operate. `test-1.sh` and `cfg-{sys,prm}-1.sh` should be unmodified for simple testing. The relevant parameters for the GA algorithm are defined in `cfg-prm-*.sh` scripts (see example in `cfg-prm-1.sh`). These are: + - SEED: The random seed used by deap in the GA. - NUM_ITERATIONS: The number of iterations the GA should perform. -- POPULATION_SIZE: The maximum number of hyperparameter sets to evaluate in each iteration. -GA_STRATEGY: The algorithm used by the GA. Can be one of "simple" or "mu_plus_lambda". See eaSimple and eaMuPlusLambda at https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms for more information. +- POPULATION_SIZE: The maximum number of hyperparameter sets to evaluate in each iteration. + GA_STRATEGY: The algorithm used by the GA. Can be one of "simple" or "mu_plus_lambda". See eaSimple and eaMuPlusLambda at https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms for more information. ### Hyperparameter Configuration File ### @@ -96,95 +99,109 @@ The GA workflow uses a json format file for defining the hyperparameter space. T The hyperparameter configuration file has a json format consisting of a list of json dictionaries, each one of which defines a hyperparameter. Each dictionary has the following required keys: -* name: the name of the hyperparameter (e.g. *epochs*) -* type: determines how the initial population (i.e. the hyperparameter sets) are initialized from the named parameter and how those values are subsequently mutated by the GA. Type is one of `constant`, `int`, `float`, `logical`, `categorical`, or `ordered`. - * `constant`: - * each model is initialized with the same specifed value - * mutation always returns the same specified value - * `int`: - * each model is initialized with an int randomly drawn from the range defined by `lower` and `upper` bounds - * mutation is peformed by adding the results of a random draw from - a gaussian distribution to the current value, where the gaussian distribution's mu is 0 and its sigma is specified by the `sigma` entry. 
- * `float`: - * each model is initialized with a float randomly drawn from the range defined by `lower` and `upper` bounds - * mutation is peformed by adding the results of a random draw from - a gaussian distribution to the current value, where the gaussian distribution's mu is 0 and its sigma is specified by the `sigma` entry. - * `logical`: - * each model is initialized with a random boolean. - * mutation flips the logical value - * `categorical`: - * each model is initialized with an element chosen at random from the list of elements in `values`. - * mutation chooses an element from the `values` list at random - * `ordered`: - * each model is inititalized with an element chosen at random from the list of elements in `values`. - * given the index of the current value in the list of `values`, mutation selects the element *n* number of indices away, where n is the result of a random draw between 1 and `sigma` and then is negated with a 0.5 probability. +- name: the name of the hyperparameter (e.g. _epochs_) +- type: determines how the initial population (i.e. the hyperparameter sets) are initialized from the named parameter and how those values are subsequently mutated by the GA. Type is one of `constant`, `int`, `float`, `logical`, `categorical`, or `ordered`. + - `constant`: + - each model is initialized with the same specifed value + - mutation always returns the same specified value + - `int`: + - each model is initialized with an int randomly drawn from the range defined by `lower` and `upper` bounds + - mutation is peformed by adding the results of a random draw from + a gaussian distribution to the current value, where the gaussian distribution's mu is 0 and its sigma is specified by the `sigma` entry. + - `float`: + - each model is initialized with a float randomly drawn from the range defined by `lower` and `upper` bounds + - mutation is peformed by adding the results of a random draw from + a gaussian distribution to the current value, where the gaussian distribution's mu is 0 and its sigma is specified by the `sigma` entry. + - `logical`: + - each model is initialized with a random boolean. + - mutation flips the logical value + - `categorical`: + - each model is initialized with an element chosen at random from the list of elements in `values`. + - mutation chooses an element from the `values` list at random + - `ordered`: + - each model is inititalized with an element chosen at random from the list of elements in `values`. + - given the index of the current value in the list of `values`, mutation selects the element _n_ number of indices away, where n is the result of a random draw between 1 and `sigma` and then is negated with a 0.5 probability. The following keys are required depending on value of the `type` key. If the `type` is `constant`: - * `value`: the constant value + +- `value`: the constant value If the `type` is `int`, or `float`: - * `lower`: the lower bound of the range to draw from - * `upper`: the upper bound of the range to draw from - * `sigma`: the sigma value used by the mutation operator (see above). + +- `lower`: the lower bound of the range to draw from +- `upper`: the upper bound of the range to draw from +- `sigma`: the sigma value used by the mutation operator (see above). If the `type` is `categorical`: - * `values`: the list of elements to choose from - * `element_type`: the type of the elements to choose from. 
One of `int`, `float`, `string`, or `logical` + +- `values`: the list of elements to choose from +- `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` If the `type` is `ordered`: - * `values`: the list of elements to choose from - * `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` - * `sigma`: the sigma value used by the mutation operator (see above). + +- `values`: the list of elements to choose from +- `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` +- `sigma`: the sigma value used by the mutation operator (see above). A sample hyperparameter definition file: ```javascript [ { - "name": "activation", - "type": "categorical", - "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + name: "activation", + type: "categorical", + element_type: "string", + values: [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear", + ], }, { - "name": "optimizer", - "type": "categorical", - "element_type": "string", - "values": ["adam", "rmsprop"] + name: "optimizer", + type: "categorical", + element_type: "string", + values: ["adam", "rmsprop"], }, { - "name": "lr", - "type": "float", - "lower": 0.0001, - "upper": 0.01, - "sigma": "0.000495" + name: "lr", + type: "float", + lower: 0.0001, + upper: 0.01, + sigma: "0.000495", }, { - "name": "batch_size", - "type": "ordered", - "element_type": "int", - "values": [16, 32, 64, 128, 256], - "sigma": 1 - } -] + name: "batch_size", + type: "ordered", + element_type: "int", + values: [16, 32, 64, 128, 256], + sigma: 1, + }, +]; ``` Note that any other keys are ignored by the workflow but can be used to add additional information about the hyperparameter. For example, the sample files contain a `comment` entry that contains additional information about that hyperparameter. -### Where to check for output ### +### Where to check for output This includes error output. -When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. +When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. -* On a local system, stdout/stderr for the workflow will go to your terminal. -* On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` +- On a local system, stdout/stderr for the workflow will go to your terminal. +- On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` The individual objective function (model) runs stdout/stderr go into directories of the form: @@ -197,10 +214,7 @@ Each successful run of the workflow will produce a `final_results_2` file. The f - gen: the generation / iteration - nevals: the number of evaluations performed in this generation. In generations after the first, this may be less the total population size as some combinations will already have been evaluated. - avg: the average score -- std: the standard deviation +- std: the standard deviation - min: the minimum score - max: the maximum score - ts: a timestamp recording when this generation finished. 
The value is the number of seconds since the epoch in floating point format - - - diff --git a/workflows/GA/data/adrp_param_space_ga.json b/workflows/GA/data/adrp_param_space_ga.json index 2b86d137..c0f7689d 100644 --- a/workflows/GA/data/adrp_param_space_ga.json +++ b/workflows/GA/data/adrp_param_space_ga.json @@ -46,4 +46,3 @@ "values": ["elu", "relu", "linear"] } ] - diff --git a/workflows/GA/data/combo_param_space_ga.json b/workflows/GA/data/combo_param_space_ga.json index fd2139c4..a392d9df 100644 --- a/workflows/GA/data/combo_param_space_ga.json +++ b/workflows/GA/data/combo_param_space_ga.json @@ -16,22 +16,26 @@ "name": "dense", "type": "categorical", "element_type": "string", - "values": ["1000", - "1000 1000", - "1000 1000 1000", - "1000 1000 1000 1000", - "1000 1000 1000 1000 1000"] + "values": [ + "1000", + "1000 1000", + "1000 1000 1000", + "1000 1000 1000 1000", + "1000 1000 1000 1000 1000" + ] }, { "name": "dense_feature_layers", "type": "categorical", "element_type": "string", - "values": ["1000", - "1000 1000", - "1000 1000 1000", - "1000 1000 1000 1000", - "1000 1000 1000 1000 1000"] + "values": [ + "1000", + "1000 1000", + "1000 1000 1000", + "1000 1000 1000 1000", + "1000 1000 1000 1000 1000" + ] }, { @@ -90,16 +94,16 @@ { "name": "clipnorm", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, { "name": "clipvalue", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, @@ -107,7 +111,7 @@ "name": "decay", "type": "float", "lower": 0, - "upper": 1e01, + "upper": 1e1, "sigma": 0.5 }, @@ -122,8 +126,8 @@ { "name": "rho", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, @@ -131,7 +135,7 @@ "name": "momentum", "type": "float", "lower": 0, - "upper": 1e01, + "upper": 1e1, "sigma": 0.5 }, @@ -143,16 +147,16 @@ { "name": "beta_1", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, { "name": "beta_2", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 } ] diff --git a/workflows/GA/data/nt3_param_space_ga.json b/workflows/GA/data/nt3_param_space_ga.json index 527d07b5..bf946317 100644 --- a/workflows/GA/data/nt3_param_space_ga.json +++ b/workflows/GA/data/nt3_param_space_ga.json @@ -19,18 +19,30 @@ "name": "activation", "type": "categorical", "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + "values": [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear" + ] }, { "name": "dense", "type": "categorical", "element_type": "string", - "values": ["500 100 50", - "1000 500 100 50", - "2000 1000 500 100 50", - "2000 1000 1000 500 100 50", - "2000 1000 1000 1000 500 100 50"] + "values": [ + "500 100 50", + "1000 500 100 50", + "2000 1000 500 100 50", + "2000 1000 1000 500 100 50", + "2000 1000 1000 1000 500 100 50" + ] }, { @@ -60,26 +72,28 @@ "name": "conv", "type": "categorical", "element_type": "string", - "values": ["50 50 50 50 50 1", - "25 25 25 25 25 1", - "64 32 16 32 64 1", - "100 100 100 100 100 1", - "32 20 16 32 10 1"] + "values": [ + "50 50 50 50 50 1", + "25 25 25 25 25 1", + "64 32 16 32 64 1", + "100 100 100 100 100 1", + "32 20 16 32 10 1" + ] }, { "name": "clipnorm", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 
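[Editor's note] As a rough illustration of the initialization and mutation semantics described in the GA README above, the sketch below draws an initial individual from a parameter-space JSON file and applies the gaussian mutation to an `int`/`float` entry. This is not workflow code: the helper names (`init_value`, `mutate_numeric`) are hypothetical, the file path is just the repository's sample file opened locally, and details the README leaves unspecified (rounding of `int` draws, clamping of mutated values to the bounds) are assumptions made here for the sake of a runnable example.

```python
# Illustrative sketch (not workflow code) of the init/mutate semantics
# described in the GA README: int/float entries are initialized uniformly
# in [lower, upper] and mutated by adding a draw from N(0, sigma).
import json
import random

# hypothetical local copy of e.g. workflows/GA/data/tc1_param_space_ga.json
with open("tc1_param_space_ga.json") as fp:
    params = json.load(fp)          # a list of hyperparameter dictionaries

def init_value(p):
    if p["type"] in ("int", "float"):
        v = random.uniform(p["lower"], p["upper"])
        return int(round(v)) if p["type"] == "int" else v
    if p["type"] in ("categorical", "ordered"):
        return random.choice(p["values"])
    if p["type"] == "logical":
        return random.choice([True, False])
    if p["type"] == "constant":
        return p["value"]
    raise ValueError("unknown type: " + p["type"])

def mutate_numeric(p, current):
    # gaussian mutation: mu = 0, sigma taken from the entry
    # (sigma may be stored as a string in the sample files, hence float())
    v = current + random.gauss(0, float(p["sigma"]))
    v = min(max(v, p["lower"]), p["upper"])   # clamping is an assumption
    return int(round(v)) if p["type"] == "int" else v

individual = {p["name"]: init_value(p) for p in params}
print(individual)
```

In the real workflow these operations are performed by the deap-based GA driver; the sketch only restates the per-type rules from the README so they can be experimented with in isolation.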
0.499995 }, { "name": "clipvalue", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, @@ -87,53 +101,52 @@ "name": "decay", "type": "float", "lower": 0, - "upper": 1e01, + "upper": 1e1, "sigma": 0.5 }, + { + "name": "epsilon", + "type": "ordered", + "element_type": "float", + "values": [1e-6, 1e-8, 1e-10, 1e-12, 1e-14], + "sigma": 1 + }, + + { + "name": "rho", + "type": "float", + "lower": 1e-4, + "upper": 1e1, + "sigma": 0.499995 + }, + + { + "name": "momentum", + "type": "float", + "lower": 0, + "upper": 1e1, + "sigma": 0.5 + }, - { - "name": "epsilon", - "type": "ordered", - "element_type": "float", - "values": [1e-6, 1e-8, 1e-10, 1e-12, 1e-14], - "sigma": 1 - }, - - { - "name": "rho", - "type": "float", - "lower": 1e-04, - "upper": 1e01, - "sigma": 0.499995 - }, - - { - "name": "momentum", - "type": "float", - "lower": 0, - "upper": 1e01, - "sigma": 0.5 - }, - - { - "name": "nesterov", - "type": "logical" - }, - - { - "name": "beta_1", - "type": "float", - "lower": 1e-04, - "upper": 1e01, - "sigma": 0.499995 - }, - - { - "name": "beta_2", - "type": "float", - "lower": 1e-04, - "upper": 1e01, - "sigma": 0.499995 - } + { + "name": "nesterov", + "type": "logical" + }, + + { + "name": "beta_1", + "type": "float", + "lower": 1e-4, + "upper": 1e1, + "sigma": 0.499995 + }, + + { + "name": "beta_2", + "type": "float", + "lower": 1e-4, + "upper": 1e1, + "sigma": 0.499995 + } ] diff --git a/workflows/GA/data/p1b1_param_space_ga.json b/workflows/GA/data/p1b1_param_space_ga.json index eae9684b..1def4610 100644 --- a/workflows/GA/data/p1b1_param_space_ga.json +++ b/workflows/GA/data/p1b1_param_space_ga.json @@ -31,12 +31,14 @@ "name": "dense", "type": "categorical", "element_type": "string", - "values": ["1500 500", - "978 978", - "978 978 978", - "978 978 978 978", - "978 978 978 978 978", - "978 978 978 978 978 978"] + "values": [ + "1500 500", + "978 978", + "978 978 978", + "978 978 978 978", + "978 978 978 978 978", + "978 978 978 978 978 978" + ] }, { @@ -95,16 +97,16 @@ { "name": "clipnorm", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, { "name": "clipvalue", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, @@ -112,11 +114,10 @@ "name": "decay", "type": "float", "lower": 0, - "upper": 1e01, + "upper": 1e1, "sigma": 0.5 }, - { "name": "epsilon", "type": "ordered", @@ -128,8 +129,8 @@ { "name": "rho", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, @@ -137,7 +138,7 @@ "name": "momentum", "type": "float", "lower": 0, - "upper": 1e01, + "upper": 1e1, "sigma": 0.5 }, @@ -149,16 +150,16 @@ { "name": "beta_1", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, { "name": "beta_2", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 } ] diff --git a/workflows/GA/data/tc1_param_space_ga.json b/workflows/GA/data/tc1_param_space_ga.json index eb321669..ffa1a4d4 100644 --- a/workflows/GA/data/tc1_param_space_ga.json +++ b/workflows/GA/data/tc1_param_space_ga.json @@ -23,7 +23,17 @@ "name": "activation", "type": "categorical", "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + "values": [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + 
"linear" + ] }, { @@ -62,5 +72,4 @@ "type": "constant", "value": 5 } - ] diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index a39256da..33d1153e 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -170,4 +170,3 @@ fi # Andrew: Needed this so that script to monitor job worked properly (queue_wait... function in utils.sh?) echo $TURBINE_OUTPUT > turbine-directory.txt - diff --git a/workflows/GA/test/cfg-prm-1.sh b/workflows/GA/test/cfg-prm-1.sh index 520afc7c..85893270 100644 --- a/workflows/GA/test/cfg-prm-1.sh +++ b/workflows/GA/test/cfg-prm-1.sh @@ -7,8 +7,8 @@ SEED=${SEED:-1} NUM_ITERATIONS=${NUM_ITERATIONS:-2} # Size of GA population (i.e. the number of parameter sets to evaluate) POPULATION_SIZE=${POPULATION_SIZE:-4} -# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See -# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms +# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See +# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms # for more info. GA_STRATEGY=${STRATEGY:-simple} diff --git a/workflows/GA/test/cfg-prm-summit.sh b/workflows/GA/test/cfg-prm-summit.sh index 58fa0a1e..04981a4f 100644 --- a/workflows/GA/test/cfg-prm-summit.sh +++ b/workflows/GA/test/cfg-prm-summit.sh @@ -7,8 +7,8 @@ SEED=${SEED:-1} NUM_ITERATIONS=${NUM_ITERATIONS:-1} # Size of GA population (i.e. the number of parameter sets to evaluate) POPULATION_SIZE=${POPULATION_SIZE:-274} -# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See -# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms +# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See +# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms # for more info. GA_STRATEGY=${STRATEGY:-simple} diff --git a/workflows/GA/test/test-1.sh b/workflows/GA/test/test-1.sh index 207f0cc2..fe71cc6a 100755 --- a/workflows/GA/test/test-1.sh +++ b/workflows/GA/test/test-1.sh @@ -60,4 +60,3 @@ SCRIPT=$( basename $0 .sh ) #check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID echo "$SCRIPT: SUCCESS" - diff --git a/workflows/async-horovod/Problem.py b/workflows/async-horovod/Problem.py index 75760d58..b039f42d 100644 --- a/workflows/async-horovod/Problem.py +++ b/workflows/async-horovod/Problem.py @@ -1,21 +1,31 @@ - # PROBLEM # The bounding box for the optimization problem # This should be a user plug-in from collections import OrderedDict -class Problem(): + +class Problem: + def __init__(self): space = OrderedDict() - #problem specific parameters + # problem specific parameters # space['drop'] = (0, 0.9) # space['batch_size'] = [16, 32, 64, 128, 256, 512] # space['p3'] = [2 , 4, 8, 16, 32, 64, 128] # space['p4'] = ['a', 'b', 'c'] # space["learning_rate"] = (0,0.009) # Make discrete values - space["learning_rate"] = [ 0.001, 0.002, 0.003, 0.004, 0.005, - 0.006, 0.007, 0.008, 0.009 ] + space["learning_rate"] = [ + 0.001, + 0.002, + 0.003, + 0.004, + 0.005, + 0.006, + 0.007, + 0.008, + 0.009, + ] # Use 5 epochs # Add Horovod PARALLELISM [ 64, 128 , 256, 512 ] # ? 1.5h ? ? 
@@ -24,6 +34,7 @@ def __init__(self): self.params = self.space.keys() self.starting_point = [0.1, 16] + # if __name__ == '__main__': # instance = Problem() # print(instance.space) diff --git a/workflows/async-horovod/Task.py b/workflows/async-horovod/Task.py index c7ab03ee..52c10511 100644 --- a/workflows/async-horovod/Task.py +++ b/workflows/async-horovod/Task.py @@ -1,10 +1,11 @@ - # TASK # This should be a user plug-in from __future__ import print_function + import os + class Task: def __init__(self, logger, output, script, parallelism, number, params): @@ -18,7 +19,8 @@ def __init__(self, logger, output, script, parallelism, number, params): self.params = params def go(self): - import json, subprocess + import json + import subprocess J = json.loads(self.params) learning_rate = J["learning_rate"] @@ -26,9 +28,14 @@ def go(self): self.open_output() try: - args = [ self.script, self.output, "%04i"%self.number, - str(self.parallelism), - "adam", str(learning_rate) ] + args = [ + self.script, + self.output, + "%04i" % self.number, + str(self.parallelism), + "adam", + str(learning_rate), + ] self.logger.debug("task: " + " ".join(args)) self.process = subprocess.Popen(args=args, stdin=None, @@ -37,6 +44,7 @@ def go(self): print("started: ", self.process.pid) except Exception as e: import traceback + traceback.print_exc() print("") print("error while attempting to run: " + " ".join(args)) @@ -51,8 +59,9 @@ def open_output(self): except Exception as e: print("") from utils import fail - fail("Could not open task output file: " + - output_file + "\n" + str(e)) + + fail("Could not open task output file: " + output_file + "\n" + + str(e)) def __del__(self): if self.fd is not None: diff --git a/workflows/async-horovod/main.py b/workflows/async-horovod/main.py index 772fab43..eaa8d70f 100644 --- a/workflows/async-horovod/main.py +++ b/workflows/async-horovod/main.py @@ -1,74 +1,90 @@ - # MAIN PY # The main code for the search algorithm from __future__ import print_function -import logging, os, sys, time - -from utils import * +import logging +import os +import sys +import time from Problem import Problem from Task import Task +from utils import * logger = logging.getLogger(__name__) + def main(): setup_log(logging.INFO) parallelism, points_init, points_max, cfg, output = parse_args() script, launch_delay = read_cfg(cfg) output = setup_run(output) problem, optimizer = setup_optz() - success = search(problem, optimizer, output, script, launch_delay, - parallelism, points_init, points_max) + success = search( + problem, + optimizer, + output, + script, + launch_delay, + parallelism, + points_init, + points_max, + ) print("Workflow success!" 
if success else "Workflow failed!") + def setup_log(level): - """ Note that the log level may be changed by the cfg file """ - logging.basicConfig(level=level, - format='%(asctime)s %(levelname)s: %(message)s', - datefmt='%Y/%m/%d %H:%M:%S') + """Note that the log level may be changed by the cfg file.""" + logging.basicConfig( + level=level, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y/%m/%d %H:%M:%S", + ) logger.debug("DEBUG") + def parse_args(): import argparse + parser = argparse.ArgumentParser() - parser.add_argument("parallelism", - help="Nodes per Keras run") - parser.add_argument("points_init", - help="Number of initial sample points") - parser.add_argument("points_max", - help="Number of total sample points") - parser.add_argument("cfg_file", - help="The cfg file (see README)") + parser.add_argument("parallelism", help="Nodes per Keras run") + parser.add_argument("points_init", help="Number of initial sample points") + parser.add_argument("points_max", help="Number of total sample points") + parser.add_argument("cfg_file", help="The cfg file (see README)") parser.add_argument("output_directory", help="The output directory (see README)") args = parser.parse_args() print_namespace("optimizer settings:", args) - return (int(args.parallelism), - int(args.points_init), - int(args.points_max), - args.cfg_file, - args.output_directory) + return ( + int(args.parallelism), + int(args.points_init), + int(args.points_max), + args.cfg_file, + args.output_directory, + ) + def read_cfg(cfg): import json + try: with open(cfg) as fp: J = json.load(fp) except: fail("Could not open: " + cfg) - defaults = { "launch_delay" : 0, - "log_level" : "INFO" } + defaults = {"launch_delay": 0, "log_level": "INFO"} for d in defaults: if not d in J: J[d] = defaults[d] - check(is_integer(J["launch_delay"]), - "launch_delay must be integer if present: launch_delay='%s'" % - str(J["launch_delay"])) + check( + is_integer(J["launch_delay"]), + "launch_delay must be integer if present: launch_delay='%s'" % + str(J["launch_delay"]), + ) global logger level = string2level(J["log_level"]) @@ -76,6 +92,7 @@ def read_cfg(cfg): return J["script"], J["launch_delay"] + def setup_run(output): if not output[0] == "/": output = os.getcwd() + "/" + output @@ -86,10 +103,10 @@ def setup_run(output): os.makedirs(output) os.chdir(output) except Exception as e: - fail("could not make output directory: " + - output + "\n" + str(e)) + fail("could not make output directory: " + output + "\n" + str(e)) return output + def setup_optz(): logger.debug("setup() START") @@ -101,15 +118,30 @@ def setup_optz(): seed = 42 # Start the optimizer - parDict = { 'kappa' : 1.96 } + parDict = {"kappa": 1.96} space = [problem.space[key] for key in problem.params] - optimizer = Optimizer(space, base_estimator='RF', acq_optimizer='sampling', - acq_func='LCB', acq_func_kwargs={}, random_state=seed) + optimizer = Optimizer( + space, + base_estimator="RF", + acq_optimizer="sampling", + acq_func="LCB", + acq_func_kwargs={}, + random_state=seed, + ) logger.debug("setup() STOP") return (problem, optimizer) -def search(problem, optimizer, output, script, launch_delay, - parallelism, points_init, points_max): + +def search( + problem, + optimizer, + output, + script, + launch_delay, + parallelism, + points_init, + points_max, +): print("search start:") # Create the initial sample points @@ -133,8 +165,12 @@ def search(problem, optimizer, output, script, launch_delay, for i, json in enumerate(jsons): # Note: this puts the task in a background 
process global logger - T = Task(logger, output, script, - parallelism, number=task_count, params=json) + T = Task(logger, + output, + script, + parallelism, + number=task_count, + params=json) status = T.go() if not status: success = False @@ -169,22 +205,23 @@ def search(problem, optimizer, output, script, launch_delay, points = [] return success + def read_val_loss(output, task): - filename = output+"/val_loss-%04i.txt" % task.number + filename = output + "/val_loss-%04i.txt" % task.number try: with open(filename, "r") as fp: result = fp.read() result = result.strip() except Exception as e: - fail("Could not open result file: " + - filename + "\n" + str(e)) + fail("Could not open result file: " + filename + "\n" + str(e)) try: number = float(result) except Exception as e: - fail("Invalid number \"" + result + "\" in result file: " + - filename + "\n" + str(e)) + fail('Invalid number "' + result + '" in result file: ' + filename + + "\n" + str(e)) + + return number - return number -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/workflows/async-horovod/uno-1.json b/workflows/async-horovod/uno-1.json index ba9ef727..18f30904 100644 --- a/workflows/async-horovod/uno-1.json +++ b/workflows/async-horovod/uno-1.json @@ -1,5 +1,5 @@ { - "log_level" : "DEBUG", - "launch_delay": 1, - "script": "/home/wozniak/proj/SV/workflows/async-horovod/run-uno.sh" + "log_level": "DEBUG", + "launch_delay": 1, + "script": "/home/wozniak/proj/SV/workflows/async-horovod/run-uno.sh" } diff --git a/workflows/async-horovod/utils.py b/workflows/async-horovod/utils.py index bfa3f2be..8236dbd3 100644 --- a/workflows/async-horovod/utils.py +++ b/workflows/async-horovod/utils.py @@ -1,7 +1,8 @@ - import json + class MyEncoder(json.JSONEncoder): + def default(self, obj): if isinstance(obj, np.integer): return int(obj) @@ -12,36 +13,42 @@ def default(self, obj): else: return super(MyEncoder, self).default(obj) + def is_integer(v): try: - t = v+1 + t = v + 1 except: return False return True + def check(condition, msg): if not condition: fail(msg) + def fail(msg): print(msg) import sys + sys.exit(1) + def string2level(s): import logging - table = { "" : logging.INFO, - "INFO" : logging.INFO, - "DEBUG" : logging.DEBUG } + + table = {"": logging.INFO, "INFO": logging.INFO, "DEBUG": logging.DEBUG} check(s in table, "Invalid log level: " + s) return table[s] + def depth(l): if isinstance(l, list): return 1 + max(depth(item) for item in l) else: return 0 + def create_list_of_json_strings(list_of_lists, params, super_delim=";"): if len(list_of_lists) == 0: @@ -50,12 +57,12 @@ def create_list_of_json_strings(list_of_lists, params, super_delim=";"): # create string of ; separated jsonified maps result = [] - if (depth(list_of_lists) == 1): + if depth(list_of_lists) == 1: list_of_lists = [list_of_lists] for l in list_of_lists: jmap = {} - for i,p in enumerate(params): + for i, p in enumerate(params): jmap[p] = l[i] jstring = json.dumps(jmap, cls=MyEncoder) @@ -63,9 +70,10 @@ def create_list_of_json_strings(list_of_lists, params, super_delim=";"): return result + def print_namespace(title, ns): print("") print(title) - for k,v in vars(ns).items(): + for k, v in vars(ns).items(): print(" %s %s" % (k, v)) print("") diff --git a/workflows/async-local/Problem.py b/workflows/async-local/Problem.py index f0c0d18e..ed4a8693 100644 --- a/workflows/async-local/Problem.py +++ b/workflows/async-local/Problem.py @@ -1,21 +1,24 @@ - # PROBLEM # The bounding box for the optimization problem # This should be a user 
plug-in from collections import OrderedDict -class Problem(): + + +class Problem: + def __init__(self): space = OrderedDict() - #problem specific parameters - space['drop'] = (0, 0.9) - space['batch_size'] = [16, 32, 64, 128, 256, 512] - #space['p3'] = [2 , 4, 8, 16, 32, 64, 128] - #space['p4'] = ['a', 'b', 'c'] + # problem specific parameters + space["drop"] = (0, 0.9) + space["batch_size"] = [16, 32, 64, 128, 256, 512] + # space['p3'] = [2 , 4, 8, 16, 32, 64, 128] + # space['p4'] = ['a', 'b', 'c'] self.space = space self.params = self.space.keys() self.starting_point = [0.1, 16] + # if __name__ == '__main__': # instance = Problem() # print(instance.space) diff --git a/workflows/async-local/Task.py b/workflows/async-local/Task.py index 7cabd2fe..d9ea6d50 100644 --- a/workflows/async-local/Task.py +++ b/workflows/async-local/Task.py @@ -1,10 +1,11 @@ - # TASK # This should be a user plug-in from __future__ import print_function + import os + class Task: def __init__(self, parallelism, number, params): @@ -16,13 +17,14 @@ def __init__(self, parallelism, number, params): def go(self): import subprocess + # script = "/home/wozniak/proj/SV/workflows/test-horovod/template-theta.sh" script = "/home/wozniak/proj/SV/workflows/async-local/task.sh" try: output = get_output() log = output + ("/%04i.txt" % self.number) self.fd = open(log, "w") - args = [script, str(self.parallelism), self.params], + args = ([script, str(self.parallelism), self.params],) self.process = subprocess.Popen(args=args, stdin=None, stdout=self.fd, @@ -30,6 +32,7 @@ def go(self): print("started: ", self.process.pid) except Exception as e: import traceback + traceback.print_exc() return False return True @@ -39,6 +42,7 @@ def __del__(self): print("closing: " + str(self.number)) self.fd.close() + def get_output(): o = os.getenv("OUTPUT") if o is None: diff --git a/workflows/async-local/main.py b/workflows/async-local/main.py index b75619d3..6a3bb5b6 100644 --- a/workflows/async-local/main.py +++ b/workflows/async-local/main.py @@ -1,42 +1,47 @@ - # MAIN PY # The main code for the search algorithm from __future__ import print_function -import logging, os, sys, time - -from utils import * +import logging +import os +import sys +import time from Problem import Problem from Task import Task +from utils import * logger = logging.getLogger(__name__) + def main(): setup_log() parallelism, points_init, points_max = parse_args() problem, optimizer = setup() - success = search(problem, optimizer, - parallelism, points_init, points_max) + success = search(problem, optimizer, parallelism, points_init, points_max) print("Workflow success!" 
if success else "Workflow failed!") + def setup_log(): - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s: %(message)s', - datefmt='%Y/%m/%d %H:%M:%S') + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y/%m/%d %H:%M:%S", + ) + def parse_args(): import argparse + parser = argparse.ArgumentParser() parser.add_argument("parallelism") parser.add_argument("points_init") parser.add_argument("points_max") args = parser.parse_args() print_namespace("optimizer settings:", args) - return (int(args.parallelism), - int(args.points_init), - int(args.points_max)) + return (int(args.parallelism), int(args.points_init), int(args.points_max)) + def setup(): @@ -49,13 +54,20 @@ def setup(): seed = 42 # Start the optimizer - parDict = { 'kappa' : 1.96 } + parDict = {"kappa": 1.96} space = [problem.space[key] for key in problem.params] - optimizer = Optimizer(space, base_estimator='RF', acq_optimizer='sampling', - acq_func='LCB', acq_func_kwargs={}, random_state=seed) + optimizer = Optimizer( + space, + base_estimator="RF", + acq_optimizer="sampling", + acq_func="LCB", + acq_func_kwargs={}, + random_state=seed, + ) logger.debug("setup() STOP") return (problem, optimizer) + def search(problem, optimizer, parallelism, points_init, points_max): print("search start:") @@ -113,5 +125,6 @@ def search(problem, optimizer, parallelism, points_init, points_max): points = [] return success -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/workflows/async-local/utils.py b/workflows/async-local/utils.py index 89691668..3efadb76 100644 --- a/workflows/async-local/utils.py +++ b/workflows/async-local/utils.py @@ -1,7 +1,8 @@ - import json + class MyEncoder(json.JSONEncoder): + def default(self, obj): if isinstance(obj, np.integer): return int(obj) @@ -12,12 +13,14 @@ def default(self, obj): else: return super(MyEncoder, self).default(obj) + def depth(l): if isinstance(l, list): return 1 + max(depth(item) for item in l) else: return 0 + def create_list_of_json_strings(list_of_lists, params, super_delim=";"): if len(list_of_lists) == 0: @@ -26,12 +29,12 @@ def create_list_of_json_strings(list_of_lists, params, super_delim=";"): # create string of ; separated jsonified maps result = [] - if (depth(list_of_lists) == 1): + if depth(list_of_lists) == 1: list_of_lists = [list_of_lists] for l in list_of_lists: jmap = {} - for i,p in enumerate(params): + for i, p in enumerate(params): jmap[p] = l[i] jstring = json.dumps(jmap, cls=MyEncoder) @@ -39,9 +42,10 @@ def create_list_of_json_strings(list_of_lists, params, super_delim=";"): return result + def print_namespace(title, ns): print("") print(title) - for k,v in vars(ns).items(): + for k, v in vars(ns).items(): print(" %s %s" % (k, v)) print("") diff --git a/workflows/async-search/README.md b/workflows/async-search/README.md index 42eec684..e236f461 100644 --- a/workflows/async-search/README.md +++ b/workflows/async-search/README.md @@ -2,33 +2,33 @@ async-search is an asynchronous iterative optimizer written in Python. It evaluates the best values of hyperparameters for CANDLE "Benchmarks" available here: `git@github.com:ECP-CANDLE/Benchmarks.git` -## Running ## +## Running 1. cd into the **Supervisor/workflows/async-search/test** directory -2. Specify the async-search parameters in the *cfg-prm-1.sh* file (INIT_SIZE, etc.). +2. Specify the async-search parameters in the _cfg-prm-1.sh_ file (INIT_SIZE, etc.). 3. Specify the PROCS, queue etc. 
in **cfg-sys-1.sh** file -(NOTE: currently INIT_SIZE must be at least PROCS-2) + (NOTE: currently INIT_SIZE must be at least PROCS-2) 4. You will pass the MODEL_NAME, SITE, and optional experiment id arguments to **test-1.sh** file when launching: -`./test-1.sh [expid]` -where `model_name` can be tc1 etc., `machine_name` can be local, cori, theta, titan etc. (see [NOTE](#making_changes) below on creating new SITE files.) + `./test-1.sh [expid]` + where `model_name` can be tc1 etc., `machine_name` can be local, cori, theta, titan etc. (see [NOTE](#making_changes) below on creating new SITE files.) 5. The parameter space is defined in a **problem\*.py** file (see **workflows/async-search/python/problem_tc1.py** for an example with tc1.). This is imported as `problem` in **async-search.py**. 6. The benchmark will be run for the number of processors specified 7. Final objective function values, along with parameters, will be available in the experiments directory and also printed - -## User requirements ## +## User requirements What you need to install to run the workflow: -* This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . +- This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . Clone and switch to the `master` branch. Then `cd` to `workflows/async-search` (the directory containing this README). -* TC1 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . +- TC1 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . Clone and switch to the `frameworks` branch. -* benchmark data - - See the individual benchmarks README for obtaining the initial data +- benchmark data - + See the individual benchmarks README for obtaining the initial data + +Python specific installation needed: - Python specific installation needed: ``` conda install h5py conda install scikit-learn @@ -38,9 +38,10 @@ conda install -c conda-forge keras conda install -c conda-forge scikit-optimize ``` -## Calling sequence ## +## Calling sequence Function calls: + ``` test-1.sh -> swift/workflow.sh -> @@ -56,13 +57,16 @@ test-1.sh -> swift/workflow.sh -> ``` Scheduling scripts: + ``` test-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files ``` + ## Making Changes To create your own SITE files in workflows/common/sh/: + - langs-SITE.sh - langs-app-SITE.sh - modules-SITE.sh @@ -70,27 +74,28 @@ To create your own SITE files in workflows/common/sh/: copy existing ones but modify the langs-SITE.sh file to define the EQPy location (see workflows/common/sh/langs-local.sh for an example). -### Structure ### +### Structure -The point of the script structure is that it is easy to make copy and modify the `test-*.sh` script, and the `cfg-*.sh` scripts. These can be checked back into the repo for use by others. The `test-*.sh` script and the `cfg-*.sh` scripts should simply contain environment variables that control how `workflow.sh` and `workflow.swift` operate. +The point of the script structure is that it is easy to make copy and modify the `test-*.sh` script, and the `cfg-*.sh` scripts. These can be checked back into the repo for use by others. The `test-*.sh` script and the `cfg-*.sh` scripts should simply contain environment variables that control how `workflow.sh` and `workflow.swift` operate. `test-1.sh` and `cfg-{sys,prm}-1.sh` should be unmodified for simple testing. The relevant parameters for the asynchronous search algorithm are defined in `cfg-*.sh` scripts (see example in `cfg-prm-1.sh`). These are: + - INIT_SIZE: The number of initial random samples. 
(Note: INIT_SIZE needs to be larger than PROCS-2 for now.) - MAX_EVALS: The maximum number of evaluations/tasks to perform. - NUM_BUFFER: The size of the tasks buffer that should be maintained above the available workers (num_workers) such that if the currently out tasks are less than (num workers + NUM_BUFFER), more tasks are generated. - MAX_THRESHOLD: Under normal circumstances, when a single model evaluation is finished, a new hyper parameter set is produced for evaluation. If the model evaluations occur within 15 seconds of each other, a MAX_THRESHOLD number of evalutions must occur before the corresponding number of new values are produced for evaluation. This can help with performance when many models finish within a few seconds of each other. - N_JOBS: The number of jobs to run in parallel when producing points (i.e. hyperparameter values) for evaluation. -1 will set this to the number of cores. -### Where to check for output ### +### Where to check for output This includes error output. -When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. +When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. -* On a local system, stdout/stderr for the workflow will go to your terminal. -* On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` +- On a local system, stdout/stderr for the workflow will go to your terminal. +- On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` The individual objective function (model) runs stdout/stderr go into directories of the form: diff --git a/workflows/async-search/python/as_problem.py b/workflows/async-search/python/as_problem.py index 4a996453..a8dad8c9 100644 --- a/workflows/async-search/python/as_problem.py +++ b/workflows/async-search/python/as_problem.py @@ -1,21 +1,31 @@ from collections import OrderedDict -class Problem(): + + +class Problem: + def __init__(self): space = OrderedDict() - #problem specific parameters - space['drop'] = (0, 0.9) - space['epochs'] = (2,3) - space['learning_rate'] = (0.00001, 0.1) - space['conv'] = ["50 50 50 50 50 1", "25 25 25 25 25 1", "64 32 16 32 64 1", "100 100 100 100 100 1", "32 20 16 32 10 1"] - space['optimizer'] = ["adam", "sgd", "rmsprop", "adagrad", "adadelta"] - space['batch_size'] = [16, 32, 64, 128, 256, 512] #, 256, 512] - #space['p3'] = [2 , 4, 8, 16, 32, 64, 128] - #space['p4'] = ['a', 'b', 'c'] + # problem specific parameters + space["drop"] = (0, 0.9) + space["epochs"] = (2, 3) + space["learning_rate"] = (0.00001, 0.1) + space["conv"] = [ + "50 50 50 50 50 1", + "25 25 25 25 25 1", + "64 32 16 32 64 1", + "100 100 100 100 100 1", + "32 20 16 32 10 1", + ] + space["optimizer"] = ["adam", "sgd", "rmsprop", "adagrad", "adadelta"] + space["batch_size"] = [16, 32, 64, 128, 256, 512] # , 256, 512] + # space['p3'] = [2 , 4, 8, 16, 32, 64, 128] + # space['p4'] = ['a', 'b', 'c'] self.space = space self.params = self.space.keys() self.starting_point = [0.1, 16] -if __name__ == '__main__': + +if __name__ == "__main__": instance = Problem() print(instance.space) - print(instance.params) \ No newline at end of file + print(instance.params) diff --git a/workflows/async-search/python/as_problem_tc1.py b/workflows/async-search/python/as_problem_tc1.py index b3db05f6..c751385f 100644 --- a/workflows/async-search/python/as_problem_tc1.py +++ b/workflows/async-search/python/as_problem_tc1.py 
@@ -1,17 +1,21 @@ from collections import OrderedDict -class Problem(): + + +class Problem: + def __init__(self): space = OrderedDict() - #problem specific parameters - space['drop'] = (0, 0.9) - space['batch_size'] = [16, 32, 64, 128] #, 256, 512] - #space['p3'] = [2 , 4, 8, 16, 32, 64, 128] - #space['p4'] = ['a', 'b', 'c'] + # problem specific parameters + space["drop"] = (0, 0.9) + space["batch_size"] = [16, 32, 64, 128] # , 256, 512] + # space['p3'] = [2 , 4, 8, 16, 32, 64, 128] + # space['p4'] = ['a', 'b', 'c'] self.space = space self.params = self.space.keys() self.starting_point = [0.1, 16] -if __name__ == '__main__': + +if __name__ == "__main__": instance = Problem() print(instance.space) print(instance.params) diff --git a/workflows/async-search/python/async-search.py b/workflows/async-search/python/async-search.py index 4d5b0df4..1bff744f 100644 --- a/workflows/async-search/python/async-search.py +++ b/workflows/async-search/python/async-search.py @@ -1,17 +1,19 @@ -from mpi4py import MPI -import eqpy -import time -import json -import numpy as np -from skopt import Optimizer -import as_problem as problem import datetime +import json import math import sys +import time + +import as_problem as problem +import eqpy +import numpy as np +from mpi4py import MPI +from skopt import Optimizer # list of ga_utils parameter objects problem_params = None + class MyEncoder(json.JSONEncoder): def default(self, obj): @@ -24,8 +26,10 @@ def default(self, obj): else: return super(MyEncoder, self).default(obj) + def create_points(num): - return(";".join([str(i) for i in range(num)])) + return ";".join([str(i) for i in range(num)]) + def depth(l): if isinstance(l, list): @@ -33,16 +37,17 @@ def depth(l): else: return 0 + def create_list_of_json_strings(list_of_lists, super_delim=";"): # create string of ; separated jsonified maps res = [] global problem_params - if (depth(list_of_lists) == 1): + if depth(list_of_lists) == 1: list_of_lists = [list_of_lists] for l in list_of_lists: jmap = {} - for i,p in enumerate(problem_params): + for i, p in enumerate(problem_params): jmap[p] = l[i] jstring = json.dumps(jmap, cls=MyEncoder) @@ -50,13 +55,14 @@ def create_list_of_json_strings(list_of_lists, super_delim=";"): return res, (super_delim.join(res)) + def run(): start_time = time.time() print("run() start: {}".format(str(datetime.datetime.now()))) - comm = MPI.COMM_WORLD # get MPI communicator object - size = comm.size # total number of processes - rank = comm.rank # rank of this process - status = MPI.Status() # get MPI status object + comm = MPI.COMM_WORLD # get MPI communicator object + size = comm.size # total number of processes + rank = comm.rank # rank of this process + status = MPI.Status() # get MPI status object print("ME rank is {}".format(rank)) instance = problem.Problem() @@ -70,28 +76,37 @@ def run(): eqpy.OUT_put("Params") # initial parameter set telling us the number of times to run the loop initparams = eqpy.IN_get() - (init_size, max_evals, num_workers, num_buffer, seed, max_threshold, n_jobs) = eval('{}'.format(initparams)) + (init_size, max_evals, num_workers, num_buffer, seed, max_threshold, + n_jobs) = eval("{}".format(initparams)) space = [spaceDict[key] for key in params] print(space) parDict = {} resultsList = [] - parDict['kappa'] = 1.96 + parDict["kappa"] = 1.96 # can set to num cores - parDict['n_jobs'] = n_jobs + parDict["n_jobs"] = n_jobs init_x = [] - opt = Optimizer(space, base_estimator='RF', acq_optimizer='sampling', - acq_func='LCB', acq_func_kwargs=parDict, 
random_state=seed) + opt = Optimizer( + space, + base_estimator="RF", + acq_optimizer="sampling", + acq_func="LCB", + acq_func_kwargs=parDict, + random_state=seed, + ) eval_counter = 0 askedDict = {} - print("Master starting with {} init_size, {} max_evals, {} num_workers, {} num_buffer, {} max_threshold".format(init_size,max_evals,num_workers,num_buffer, max_threshold)) + print( + "Master starting with {} init_size, {} max_evals, {} num_workers, {} num_buffer, {} max_threshold" + .format(init_size, max_evals, num_workers, num_buffer, max_threshold)) x = opt.ask(n_points=init_size) res, resstring = create_list_of_json_strings(x) print("Initial design is {}".format(resstring)) - for r,xx in zip(res,x): + for r, xx in zip(res, x): askedDict[r] = xx eqpy.OUT_put(resstring) currently_out = init_size @@ -101,11 +116,11 @@ def run(): group = comm.Get_group() # Assumes only one adlb_server # num_workers + 1 = num_turbine_workers - newgroup = group.Excl([num_workers+1]) - #print("ME newgroup size is {}".format(newgroup.size)) - newcomm = comm.Create_group(newgroup,1) + newgroup = group.Excl([num_workers + 1]) + # print("ME newgroup size is {}".format(newgroup.size)) + newcomm = comm.Create_group(newgroup, 1) nrank = newcomm.rank - #print("ME nrank is {}".format(nrank)) + # print("ME nrank is {}".format(nrank)) counter_threshold = 1 counter = 0 @@ -115,17 +130,17 @@ def run(): print("\neval_counter = {}".format(eval_counter)) data = newcomm.recv(source=MPI.ANY_SOURCE, status=status) counter = counter + 1 - xstring = data['x'] + xstring = data["x"] x = askedDict[xstring] - y = data['cost'] + y = data["cost"] if math.isnan(y): - y=sys.float_info.max + y = sys.float_info.max opt.tell(x, y) - #source = status.Get_source() - #tag = status.Get_tag() + # source = status.Get_source() + # tag = status.Get_tag() elapsed_time = float(time.time() - start_time) - print('elapsed_time:%1.3f'%elapsed_time) + print("elapsed_time:%1.3f" % elapsed_time) results.append(str(data)) eval_counter = eval_counter + 1 currently_out = currently_out - 1 @@ -142,25 +157,27 @@ def run(): counter_threshold = 1 print("counter_threshold: {}".format(counter_threshold)) - print("currently_out:{}, total_out:{}".format(currently_out,total_out)) - if currently_out < num_workers + num_buffer and total_out < max_evals and counter >= counter_threshold: + print("currently_out:{}, total_out:{}".format(currently_out, total_out)) + if (currently_out < num_workers + num_buffer and + total_out < max_evals and counter >= counter_threshold): n_points = counter if n_points + total_out > max_evals: n_points = max_evals - total_out ts = time.time() x = opt.ask(n_points=n_points) res, resstring = create_list_of_json_strings(x) - for r,xx in zip(res,x): + for r, xx in zip(res, x): askedDict[r] = xx eqpy.OUT_put(resstring) - print('point production elapsed_time:%1.3f' % float(time.time() - ts)) + print("point production elapsed_time:%1.3f" % + float(time.time() - ts)) currently_out = currently_out + n_points total_out = total_out + n_points counter = 0 end_iter_time = start_iter_time - print('Search finishing') + print("Search finishing") eqpy.OUT_put("DONE") eqpy.OUT_put(";".join(results)) diff --git a/workflows/async-search/python/utils.py b/workflows/async-search/python/utils.py index e787cf97..8cd5c4f2 100644 --- a/workflows/async-search/python/utils.py +++ b/workflows/async-search/python/utils.py @@ -1,22 +1,22 @@ -from string import Template -import re -import os -import sys -import time +import csv import json import math import os +import re 
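[Editor's note] The buffering logic in `async-search.py` above only asks the optimizer for new points when enough results have come back (`counter >= counter_threshold`), fewer than `num_workers + num_buffer` evaluations are outstanding, and the `max_evals` budget is not exhausted. The fragment below is a simplified, self-contained restatement of that refill condition for readers who want to experiment with the thresholds; it is a sketch, not the workflow code, and the function name is hypothetical.

```python
# Simplified restatement of the refill condition used in async-search.py:
# new points are requested only when (a) results since the last refill have
# reached counter_threshold, (b) fewer than num_workers + num_buffer
# evaluations are outstanding, and (c) the max_evals budget is not exhausted.
def points_to_request(counter, counter_threshold,
                      currently_out, num_workers, num_buffer,
                      total_out, max_evals):
    if (currently_out < num_workers + num_buffer
            and total_out < max_evals
            and counter >= counter_threshold):
        n_points = counter
        if n_points + total_out > max_evals:
            n_points = max_evals - total_out
        return n_points
    return 0

# e.g. with 4 workers, a buffer of 2, and 3 results just received:
print(points_to_request(counter=3, counter_threshold=1,
                        currently_out=3, num_workers=4, num_buffer=2,
                        total_out=10, max_evals=20))   # -> 3
```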
import subprocess -import csv +import sys +import time +from string import Template + def saveResults(resultsList, json_fname, csv_fname): print(resultsList) print(json.dumps(resultsList, indent=4, sort_keys=True)) - with open(json_fname, 'w') as outfile: + with open(json_fname, "w") as outfile: json.dump(resultsList, outfile, indent=4, sort_keys=True) keys = resultsList[0].keys() - with open(csv_fname, 'w') as output_file: + with open(csv_fname, "w") as output_file: dict_writer = csv.DictWriter(output_file, keys) dict_writer.writeheader() dict_writer.writerows(resultsList) diff --git a/workflows/async-search/swift/workflow.sh b/workflows/async-search/swift/workflow.sh index b84fbcac..4f0b5879 100755 --- a/workflows/async-search/swift/workflow.sh +++ b/workflows/async-search/swift/workflow.sh @@ -192,4 +192,4 @@ fi # echo "EXIT CODE: 0" | tee -a $STDOUT # Andrew: Needed this so that script to monitor job worked properly (queue_wait... function in utils.sh?) -echo $TURBINE_OUTPUT > turbine-directory.txt \ No newline at end of file +echo $TURBINE_OUTPUT > turbine-directory.txt diff --git a/workflows/common/R/mlrMBO-default.R b/workflows/common/R/mlrMBO-default.R index 2774704d..5bd52e9f 100644 --- a/workflows/common/R/mlrMBO-default.R +++ b/workflows/common/R/mlrMBO-default.R @@ -70,26 +70,26 @@ restart.file) { print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", - predict.type = "se", + surr.rf = makeLearner("regr.randomForest", + predict.type = "se", fix.factors.prediction = TRUE) #mtry = 6, #se.method = "bootstrap", se.boot = 50, se.ntree = 100) - ctrl = makeMBOControl(n.objectives = 1, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax, trafo.y.fun = makeMBOTrafoFunction('log', log)) - ctrl = setMBOControlInfill(ctrl, + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritCB(), - opt.restarts = 1, + opt.restarts = 1, opt.focussearch.points = 1000) - ctrl = setMBOControlTermination(ctrl, - max.evals = max.budget, + ctrl = setMBOControlTermination(ctrl, + max.evals = max.budget, iters = max.iterations) chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) diff --git a/workflows/common/R/mlrMBO-ls1.R b/workflows/common/R/mlrMBO-ls1.R index 63548d75..4e8360b4 100644 --- a/workflows/common/R/mlrMBO-ls1.R +++ b/workflows/common/R/mlrMBO-ls1.R @@ -68,16 +68,16 @@ main_function <- function(max.budget = 110, max.iterations = 10, design.size=10, propose.points=10, - restart.file="DISABLED", + restart.file="DISABLED", learner1.name = "randomForest") { if (learner1.name == "km"){ print("Using Kriging.") - surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) + surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) - #TODO: Avoid error: + #TODO: Avoid error: # [mbo] 3: latent_dim=2; batch_size=35; learning_rate=0.0762; epochs=8 : y = 0.203 : 29.6 secs : infill_cb - # Error in chol.default(R) : + # Error in chol.default(R) : # the leading minor of order 29 is not positive definite # The issue is mentioned here: https://github.com/mlr-org/mlrMBO/issues/80 # y = MyTrainingData$MyTarget @@ -86,7 +86,7 @@ main_function <- function(max.budget = 110, ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, 
opt.path, ...) .Machine$integer.max * 0.1) - + # y = MyTrainingData$MyTarget # Nuggets = 1e-8*var(y) # setHyperPars(learner = surr.rf, nugget=Nuggets) @@ -98,15 +98,15 @@ main_function <- function(max.budget = 110, } else if (learner1.name == "randomForest"){ print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, - se.method = "bootstrap", + se.method = "bootstrap", se.boot = 2, se.ntree = 10, ntree=1000, mtry=8) - ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1 ) ctrl = setMBOControlTermination(ctrl, max.evals = propose.points) - ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), opt.restarts = 1, opt.focussearch.points = 1000) } else{ @@ -117,7 +117,7 @@ main_function <- function(max.budget = 110, chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -168,29 +168,29 @@ main_function <- function(max.budget = 110, #iterative phase starts while (itr < max_itr){ - + print(sprintf("nevals = %03d", nrow(all_res))) min.index<-which(itr_res$y==min(itr_res$y)) - + par.set.t = par.set0 pars = par.set.t$pars lens = getParamLengths(par.set.t) k = sum(lens) pids = getParamIds(par.set.t, repeated = TRUE, with.nr = TRUE) - + snames = c("y", pids) reqDF = subset(itr_res, select = snames, drop =TRUE) bestDF <- reqDF[min.index,] print("reqDF") print(nrow(reqDF)) print(summary(reqDF)) - + train.model <- randomForest(y ~ ., data=reqDF, ntree=100000, keep.forest=TRUE, importance=TRUE) var.imp <- importance(train.model, type = 1) index <- sort(abs(var.imp[,1]), decreasing = TRUE, index.return = TRUE)$ix - + inputs <- rownames(var.imp)[index] scores <- abs(var.imp[index,1]) norm.scores <- 100 * scores / sum(scores) @@ -200,7 +200,7 @@ main_function <- function(max.budget = 110, rnames <- inputs[remove.index] print('removing:') print(rnames) - + par.set1<-par.set0 pnames<-names(par.set$pars) print(par.set1) @@ -233,7 +233,7 @@ main_function <- function(max.budget = 110, temp<-rbind(design,reqDF[,-1]) design <- head(temp, n = propose.points) yvals <- predict(train.model,design) - + USE_MODEL <- TRUE if(USE_MODEL){ design <- cbind(y=yvals, design) @@ -245,12 +245,12 @@ main_function <- function(max.budget = 110, res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = TRUE) itr_res<-as.data.frame(res$opt.path) itr_res<-tail(itr_res, n = propose.points) - + par.set0<-par.set1 itr <- itr + 1 all_res <- rbind(all_res, itr_res) } - + return(all_res) } diff --git a/workflows/common/R/mlrMBO-ls2.R b/workflows/common/R/mlrMBO-ls2.R index dd7be142..65c0ffc0 100644 --- a/workflows/common/R/mlrMBO-ls2.R +++ b/workflows/common/R/mlrMBO-ls2.R @@ -68,16 +68,16 @@ main_function <- function(max.budget = 110, max.iterations = 10, design.size=10, propose.points=10, - restart.file="DISABLED", + restart.file="DISABLED", learner1.name = "randomForest") { if (learner1.name == "km"){ print("Using Kriging.") - surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) + surr.rf = makeLearner("regr.km", predict.type 
= "se") #covtype = "matern3_2", control = list(trace = FALSE)) - #TODO: Avoid error: + #TODO: Avoid error: # [mbo] 3: latent_dim=2; batch_size=35; learning_rate=0.0762; epochs=8 : y = 0.203 : 29.6 secs : infill_cb - # Error in chol.default(R) : + # Error in chol.default(R) : # the leading minor of order 29 is not positive definite # The issue is mentioned here: https://github.com/mlr-org/mlrMBO/issues/80 # y = MyTrainingData$MyTarget @@ -86,7 +86,7 @@ main_function <- function(max.budget = 110, ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1) - + # y = MyTrainingData$MyTarget # Nuggets = 1e-8*var(y) # setHyperPars(learner = surr.rf, nugget=Nuggets) @@ -98,15 +98,15 @@ main_function <- function(max.budget = 110, } else if (learner1.name == "randomForest"){ print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, - se.method = "bootstrap", + se.method = "bootstrap", se.boot = 2, se.ntree = 10, ntree=1000, mtry=8) - ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1 ) ctrl = setMBOControlTermination(ctrl, max.evals = propose.points) - ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), opt.restarts = 1, opt.focussearch.points = 1000) } else{ @@ -117,7 +117,7 @@ main_function <- function(max.budget = 110, chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -167,28 +167,28 @@ main_function <- function(max.budget = 110, #iterative phase starts while (itr < max_itr){ - + min.index<-which(itr_res$y==min(itr_res$y)) - + par.set.t = par.set0 pars = par.set.t$pars lens = getParamLengths(par.set.t) k = sum(lens) pids = getParamIds(par.set.t, repeated = TRUE, with.nr = TRUE) - + snames = c("y", pids) reqDF = subset(itr_res, select = snames, drop =TRUE) bestDF <- reqDF[min.index,] print("reqDF") print(nrow(reqDF)) print(summary(reqDF)) - + train.model <- randomForest(y ~ ., data=reqDF, ntree=100000, keep.forest=TRUE, importance=TRUE) var.imp <- importance(train.model, type = 1) index <- sort(abs(var.imp[,1]), decreasing = TRUE, index.return = TRUE)$ix - + inputs <- rownames(var.imp)[index] scores <- abs(var.imp[index,1]) norm.scores <- 100 * scores / sum(scores) @@ -198,7 +198,7 @@ main_function <- function(max.budget = 110, rnames <- inputs[remove.index] print('removing:') print(rnames) - + par.set1<-par.set0 pnames<-names(par.set$pars) for (index in c(1:k)){ @@ -228,7 +228,7 @@ main_function <- function(max.budget = 110, temp<-rbind(design,reqDF[,-1]) design <- head(temp, n = propose.points) yvals <- predict(train.model,design) - + USE_MODEL <- FALSE if(USE_MODEL){ design <- cbind(y=yvals, design) @@ -240,12 +240,12 @@ main_function <- function(max.budget = 110, res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = TRUE) itr_res<-as.data.frame(res$opt.path) itr_res<-tail(itr_res, n = propose.points) - + par.set0<-par.set1 itr <- itr + 1 all_res <- rbind(all_res, itr_res) } - + return(all_res) } diff --git 
a/workflows/common/R/mlrMBO-ls3.R b/workflows/common/R/mlrMBO-ls3.R index c12a972b..cdd38db0 100644 --- a/workflows/common/R/mlrMBO-ls3.R +++ b/workflows/common/R/mlrMBO-ls3.R @@ -68,16 +68,16 @@ main_function <- function(max.budget = 110, max.iterations = 10, design.size=10, propose.points=10, - restart.file="DISABLED", + restart.file="DISABLED", learner1.name = "randomForest") { if (learner1.name == "km"){ print("Using Kriging.") - surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) + surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) - #TODO: Avoid error: + #TODO: Avoid error: # [mbo] 3: latent_dim=2; batch_size=35; learning_rate=0.0762; epochs=8 : y = 0.203 : 29.6 secs : infill_cb - # Error in chol.default(R) : + # Error in chol.default(R) : # the leading minor of order 29 is not positive definite # The issue is mentioned here: https://github.com/mlr-org/mlrMBO/issues/80 # y = MyTrainingData$MyTarget @@ -86,7 +86,7 @@ main_function <- function(max.budget = 110, ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1) - + # y = MyTrainingData$MyTarget # Nuggets = 1e-8*var(y) # setHyperPars(learner = surr.rf, nugget=Nuggets) @@ -98,15 +98,15 @@ main_function <- function(max.budget = 110, } else if (learner1.name == "randomForest"){ print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, - se.method = "bootstrap", + se.method = "bootstrap", se.boot = 2, se.ntree = 10, ntree=1000, mtry=8) - ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) 
.Machine$integer.max * 0.1 ) ctrl = setMBOControlTermination(ctrl, max.evals = propose.points) - ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), opt.restarts = 1, opt.focussearch.points = 1000) } else{ @@ -117,7 +117,7 @@ main_function <- function(max.budget = 110, chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -167,28 +167,28 @@ main_function <- function(max.budget = 110, #iterative phase starts while (itr < max_itr){ - + min.index<-which(itr_res$y==min(itr_res$y)) - + par.set.t = par.set0 pars = par.set.t$pars lens = getParamLengths(par.set.t) k = sum(lens) pids = getParamIds(par.set.t, repeated = TRUE, with.nr = TRUE) - + snames = c("y", pids) reqDF = subset(itr_res, select = snames, drop =TRUE) bestDF <- reqDF[min.index,] print("reqDF") print(nrow(reqDF)) print(summary(reqDF)) - + train.model <- randomForest(y ~ ., data=reqDF, ntree=100000, keep.forest=TRUE, importance=TRUE) var.imp <- importance(train.model, type = 1) index <- sort(abs(var.imp[,1]), decreasing = TRUE, index.return = TRUE)$ix - + inputs <- rownames(var.imp)[index] scores <- abs(var.imp[index,1]) norm.scores <- 100 * scores / sum(scores) @@ -198,7 +198,7 @@ main_function <- function(max.budget = 110, rnames <- inputs[remove.index] print('removing:') print(rnames) - + par.set1<-par.set0 pnames<-names(par.set$pars) @@ -235,7 +235,7 @@ main_function <- function(max.budget = 110, temp<-rbind(design,reqDF[,-1]) design <- head(temp, n = propose.points) yvals <- predict(train.model,design) - + USE_MODEL <- TRUE if(USE_MODEL){ design <- cbind(y=yvals, design) @@ -247,12 +247,12 @@ main_function <- function(max.budget = 110, res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = TRUE) itr_res<-as.data.frame(res$opt.path) itr_res<-tail(itr_res, n = propose.points) - + par.set0<-par.set1 itr <- itr + 1 all_res <- rbind(all_res, itr_res) } - + return(all_res) } diff --git a/workflows/common/R/mlrMBO-mbo.R b/workflows/common/R/mlrMBO-mbo.R index 98a9702b..feeeb490 100644 --- a/workflows/common/R/mlrMBO-mbo.R +++ b/workflows/common/R/mlrMBO-mbo.R @@ -71,26 +71,26 @@ restart.file) { print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", - predict.type = "se", + surr.rf = makeLearner("regr.randomForest", + predict.type = "se", fix.factors.prediction = TRUE, - se.method = "jackknife", + se.method = "jackknife", se.boot = 2) - ctrl = makeMBOControl(n.objectives = 1, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) 
.Machine$double.xmax, trafo.y.fun = makeMBOTrafoFunction('log', log)) - ctrl = setMBOControlInfill(ctrl, + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritCB(), - opt.restarts = 1, + opt.restarts = 1, opt.focussearch.points = 1000) - ctrl = setMBOControlTermination(ctrl, - max.evals = max.budget, + ctrl = setMBOControlTermination(ctrl, + max.evals = max.budget, iters = max.iterations) chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) diff --git a/workflows/common/R/mlrMBO-rs.R b/workflows/common/R/mlrMBO-rs.R index 6d352719..0a8ef054 100644 --- a/workflows/common/R/mlrMBO-rs.R +++ b/workflows/common/R/mlrMBO-rs.R @@ -69,25 +69,25 @@ restart.file) { print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", - predict.type = "se", + surr.rf = makeLearner("regr.randomForest", + predict.type = "se", fix.factors.prediction = TRUE) #mtry = 6, #se.method = "bootstrap", se.boot = 50, se.ntree = 100) - ctrl = makeMBOControl(n.objectives = 1, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax ) - ctrl = setMBOControlInfill(ctrl, + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritCB(), - opt.restarts = 1, + opt.restarts = 1, opt.focussearch.points = 1000) - ctrl = setMBOControlTermination(ctrl, - max.evals = max.budget, + ctrl = setMBOControlTermination(ctrl, + max.evals = max.budget, iters = max.iterations) chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -119,7 +119,7 @@ if (is.null(chkpntResults)){ par.set = getParamSet(obj.fun) - + ## represent each discrete value once # get the maximum number of variables max_val_discrete = 0 diff --git a/workflows/common/R/mlrMBO1.R b/workflows/common/R/mlrMBO1.R index 6f02de3c..f651d19e 100644 --- a/workflows/common/R/mlrMBO1.R +++ b/workflows/common/R/mlrMBO1.R @@ -70,7 +70,7 @@ restart.file) { print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, mtry = 6, se.method = "bootstrap", se.boot = 50, se.ntree = 100) @@ -83,7 +83,7 @@ chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart1:", restart.file)) diff --git a/workflows/common/R/mlrMBO2.R b/workflows/common/R/mlrMBO2.R index 7a322909..62f071d6 100644 --- a/workflows/common/R/mlrMBO2.R +++ b/workflows/common/R/mlrMBO2.R @@ -67,16 +67,16 @@ main_function <- function(max.budget = 110, max.iterations = 10, design.size=10, propose.points=10, - restart.file="DISABLED", + restart.file="DISABLED", learner1.name = "randomForest") { if (learner1.name == "km"){ print("Using Kriging.") - surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) + surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) - #TODO: Avoid error: + #TODO: Avoid error: # [mbo] 3: latent_dim=2; batch_size=35; learning_rate=0.0762; epochs=8 : y = 0.203 : 29.6 secs : infill_cb - # Error in chol.default(R) : + # Error in chol.default(R) : # the leading minor of order 29 is not positive definite # The issue is 
mentioned here: https://github.com/mlr-org/mlrMBO/issues/80 # y = MyTrainingData$MyTarget @@ -85,7 +85,7 @@ main_function <- function(max.budget = 110, ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1) - + # y = MyTrainingData$MyTarget # Nuggets = 1e-8*var(y) # setHyperPars(learner = surr.rf, nugget=Nuggets) @@ -97,15 +97,15 @@ main_function <- function(max.budget = 110, } else if (learner1.name == "randomForest"){ print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, - se.method = "bootstrap", + se.method = "bootstrap", se.boot = 2, se.ntree = 10, ntree=1000, mtry=8) - ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1 ) ctrl = setMBOControlTermination(ctrl, max.evals = propose.points) - ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), opt.restarts = 1, opt.focussearch.points = 1000) } else{ @@ -116,7 +116,7 @@ main_function <- function(max.budget = 110, chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -156,10 +156,10 @@ main_function <- function(max.budget = 110, configureMlr(show.info = FALSE, show.learner.output = FALSE, on.learner.warning = "quiet") res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = TRUE) #return(res) - + init_res<-as.data.frame(res$opt.path) min.index<-which(init_res$y==min(init_res$y))[1] - + par.set = getParamSet(obj.fun) pars = par.set$pars lens = getParamLengths(par.set) diff --git a/workflows/common/R/mlrMBO2a.R b/workflows/common/R/mlrMBO2a.R index 274693cf..b182518b 100644 --- a/workflows/common/R/mlrMBO2a.R +++ b/workflows/common/R/mlrMBO2a.R @@ -71,7 +71,7 @@ restart.file) { print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, mtry = 6, se.method = "bootstrap", se.boot = 50, se.ntree = 100) @@ -84,7 +84,7 @@ chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -135,30 +135,30 @@ #iterative phase starts while (itr < max_itr){ - + print(sprintf("nevals = %03d", nrow(all_res))) min.index<-which(itr_res$y==min(itr_res$y)) - + par.set.t = par.set0 pars = par.set.t$pars lens = getParamLengths(par.set.t) k = sum(lens) pids = getParamIds(par.set.t, repeated = TRUE, with.nr = TRUE) - + snames = c("y", pids) reqDF = subset(itr_res, select = snames, drop =TRUE) bestDF <- reqDF[min.index,] print("reqDF") print(nrow(reqDF)) print(summary(reqDF)) - + train.model <- randomForest(log(y) ~ ., data=reqDF, ntree=100000, keep.forest=TRUE, importance=TRUE) var.imp <- importance(train.model, type = 1) var.imp[which(var.imp[,1] < 0),1]<-0 index <- sort(abs(var.imp[,1]), decreasing = TRUE, index.return = TRUE)$ix - + inputs <- rownames(var.imp)[index] scores <- abs(var.imp[index,1]) norm.scores <- 100 * scores / sum(scores) @@ -168,7 
+168,7 @@ rnames <- inputs[remove.index] print('removing:') print(rnames) - + par.set1<-par.set0 pnames<-names(par.set$pars) print(par.set1) @@ -201,7 +201,7 @@ temp<-rbind(design,reqDF[,-1]) design <- head(temp, n = propose.points) yvals <- predict(train.model,design) - + USE_MODEL <- FALSE #TRUE if(USE_MODEL){ design <- cbind(y=yvals, design) @@ -213,7 +213,7 @@ res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = TRUE) itr_res<-as.data.frame(res$opt.path) itr_res<-tail(itr_res, n = propose.points) - + par.set0<-par.set1 itr <- itr + 1 all_res <- rbind(all_res, itr_res) diff --git a/workflows/common/R/mlrMBO_km.R b/workflows/common/R/mlrMBO_km.R index 186a1417..ca706f0f 100644 --- a/workflows/common/R/mlrMBO_km.R +++ b/workflows/common/R/mlrMBO_km.R @@ -69,11 +69,11 @@ main_function <- function(max.budget = 110, propose.points=10, restart.file) { print("Using Kriging.") - surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) + surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) - #TODO: Avoid error: + #TODO: Avoid error: # [mbo] 3: latent_dim=2; batch_size=35; learning_rate=0.0762; epochs=8 : y = 0.203 : 29.6 secs : infill_cb - # Error in chol.default(R) : + # Error in chol.default(R) : # the leading minor of order 29 is not positive definite # The issue is mentioned here: https://github.com/mlr-org/mlrMBO/issues/80 # y = MyTrainingData$MyTarget @@ -82,7 +82,7 @@ main_function <- function(max.budget = 110, ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1) - + # y = MyTrainingData$MyTarget # Nuggets = 1e-8*var(y) # setHyperPars(learner = surr.rf, nugget=Nuggets) @@ -93,7 +93,7 @@ main_function <- function(max.budget = 110, chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) diff --git a/workflows/common/R/test/ils-test.R b/workflows/common/R/test/ils-test.R index a94b1667..71321d83 100644 --- a/workflows/common/R/test/ils-test.R +++ b/workflows/common/R/test/ils-test.R @@ -10,48 +10,48 @@ library(randomForest) fun = function(x) { x = as.list(x) - res = 0 + res = 0 print(x) print(paste(x,sep=",",collapse=";")) r = as.numeric(x$batch_size) i = as.numeric(x$drop) res<-r+i - + if(x$model=="ae"){ res<-res*1000 } - + if(x$activation == "relu"){ res<-res*1000 } - + if(x$optimizer == "sgd"){ res<-res*1000 } - + if(x$optimizer == "sgd"){ res<-res*1000 - } - + } + if(as.numeric(x$reduce_lr)){ res<-res*1000 } - + return(res) } par.set = makeParamSet( # we optimize for ae and vae separately makeDiscreteParam("model", values=c("ae")), - + # makeDiscreteParam("latent_dim", values=c(2, 8, 32, 128, 512)), makeIntegerParam("latent_dim", lower=1, upper=9, trafo = function(x) 2L^x), # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(0)), - - + + # use consecutive 978-neuron layers to facilitate residual connections # makeDiscreteParam("dense", values=c("1500 500", # "978 978", @@ -59,21 +59,21 @@ par.set = makeParamSet( # "978 978 978 978", # "978 978 978 978 978", # "978 978 978 978 978 978")), - + makeDiscreteParam("residual", values=c(1, 0)), - + makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")), - + makeDiscreteParam("optimizer", values=c("adam", "sgd")), - 
+ makeNumericParam("learning_rate", lower=0.00001, upper=0.1), - + makeDiscreteParam("reduce_lr", values=c(1, 0)), - + makeDiscreteParam("warmup_lr", values=c(1, 0)), - + makeNumericParam("drop", lower=0, upper=0.9), - + makeIntegerParam("epochs", lower=2, upper=3) ) @@ -89,13 +89,13 @@ max.budget <- 1500 propose.points<-9 max.iterations<-5 -ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, +ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, trafo.y.fun = makeMBOTrafoFunction('log', log), impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax ) ctrl = setMBOControlTermination(ctrl, max.evals = max.budget, iters = max.iterations) -ctrl = setMBOControlInfill(ctrl, - crit = makeMBOInfillCritCB(), - opt.restarts = 1, +ctrl = setMBOControlInfill(ctrl, + crit = makeMBOInfillCritCB(), + opt.restarts = 1, opt.focussearch.points = 1000) # d1 = generateGridDesign(par.set, trafo = TRUE) @@ -151,10 +151,10 @@ ptm <- proc.time() # dummy objective function simple.obj.fun = function(x){} -surr.rf = makeLearner("regr.randomForest", - predict.type = "se", +surr.rf = makeLearner("regr.randomForest", + predict.type = "se", fix.factors.prediction = TRUE, - se.method = "jackknife", + se.method = "jackknife", se.boot = 8) @@ -178,20 +178,20 @@ surr.rf = makeLearner("regr.randomForest", time <-(proc.time() - ptm) print(sprintf("nevals = %03d; itr = %03d; time = %5.5f;", nrow(all_res), itr, as.numeric(time[3]))) min.index<-which(itr_res$y==min(itr_res$y)) - + par.set.t = par.set0 pars = par.set.t$pars lens = getParamLengths(par.set.t) k = sum(lens) pids = getParamIds(par.set.t, repeated = TRUE, with.nr = TRUE) - + snames = c("y", pids) reqDF = subset(itr_res, select = snames, drop =TRUE) bestDF <- reqDF[min.index,] print("reqDF") print(nrow(reqDF)) print(summary(reqDF)) - + print("itr-rf") train.model <- randomForest(log(y) ~ ., data=reqDF, ntree=10000, keep.forest=TRUE, importance=TRUE) var.imp <- importance(train.model, type = 1) @@ -199,7 +199,7 @@ surr.rf = makeLearner("regr.randomForest", index <- sort(abs(var.imp[,1]), decreasing = TRUE, index.return = TRUE)$ix - + inputs <- rownames(var.imp)[index] scores <- var.imp[index,1] remove.index <- which(scores >= 0.9*max(scores)) @@ -208,7 +208,7 @@ surr.rf = makeLearner("regr.randomForest", print('removing:') print(rnames) - + par.set1<-par.set0 pnames<-names(par.set$pars) print(par.set1) @@ -239,7 +239,7 @@ surr.rf = makeLearner("regr.randomForest", } else { par.set1$pars[[index]]<-makeNumericParam(p, lower=ll, upper=uu, trafo = trafo) } - } + } } } } @@ -258,8 +258,8 @@ surr.rf = makeLearner("regr.randomForest", temp<-rbind(design,reqDF[,-1]) design <- head(temp, n = propose.points) - - + + USE_MODEL <- TRUE if(USE_MODEL){ yvals <- predict(train.model,design) @@ -270,13 +270,13 @@ surr.rf = makeLearner("regr.randomForest", } print("mbo-itr") print(yvals) - + print(summary(yvals)) res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = FALSE) itr_res<-as.data.frame(res$opt.path) itr_res<-cbind(itr_res, stime = as.numeric(time[3])) itr_res<-tail(itr_res, n = propose.points) - + par.set0<-par.set1 itr <- itr + 1 print("bug msg:") diff --git a/workflows/common/R/test/learner-discrete-param-bug.R b/workflows/common/R/test/learner-discrete-param-bug.R index 4af45b36..1b62abc2 100644 --- a/workflows/common/R/test/learner-discrete-param-bug.R +++ b/workflows/common/R/test/learner-discrete-param-bug.R @@ -16,27 +16,27 @@ fun = function(x) { r = as.numeric(x$batch_size) i = 
as.numeric(x$drop) res<-r+i - + if(x$model=="ae"){ res<-res*1000 } - + if(x$activation == "relu"){ res<-res*1000 } - + if(x$optimizer == "sgd"){ res<-res*1000 } - + if(x$optimizer == "sgd"){ res<-res*1000 - } - + } + if(as.numeric(x$reduce_lr)){ res<-res*1000 } - + return(res) } @@ -49,7 +49,7 @@ par.set = makeParamSet( # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(1)), # large batch_size only makes sense when warmup_lr is on - # makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512, 1024), + # makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512, 1024), makeIntegerParam("batch_size", lower=5, upper=10, trafo = function(x) 2L^x), # use consecutive 978-neuron layers to facilitate residual connections makeDiscreteParam("dense", values=c("1500 500", @@ -80,13 +80,13 @@ max.budget <- 1500 propose.points<-9 max.iterations<-5 -ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, +ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, trafo.y.fun = makeMBOTrafoFunction('log', log), impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax ) ctrl = setMBOControlTermination(ctrl, max.evals = max.budget, iters = max.iterations) -ctrl = setMBOControlInfill(ctrl, - crit = makeMBOInfillCritCB(), - opt.restarts = 1, +ctrl = setMBOControlInfill(ctrl, + crit = makeMBOInfillCritCB(), + opt.restarts = 1, opt.focussearch.points = 1000) # d1 = generateGridDesign(par.set, trafo = TRUE) @@ -138,10 +138,10 @@ for (v in par.set$pars){ design=mydesign -surr.rf = makeLearner("regr.randomForest", - predict.type = "se", +surr.rf = makeLearner("regr.randomForest", + predict.type = "se", fix.factors.prediction = TRUE, - se.method = "jackknife", + se.method = "jackknife", se.boot = 8) diff --git a/workflows/common/R/test/mlrMBOMixedIntegerTest11a.R b/workflows/common/R/test/mlrMBOMixedIntegerTest11a.R index eec3496b..68aaa098 100644 --- a/workflows/common/R/test/mlrMBOMixedIntegerTest11a.R +++ b/workflows/common/R/test/mlrMBOMixedIntegerTest11a.R @@ -20,23 +20,23 @@ fun = function(x) { if(x$model=="ae"){ res<-res*1000 } - + if(x$activation == "relu"){ res<-res*1000 } - + if(x$optimizer == "sgd"){ res<-res*1000 } if(x$optimizer == "sgd"){ res<-res*1000 - } - + } + if(as.numeric(x$reduce_lr)){ res<-res*1000 } - + return(res) } @@ -49,7 +49,7 @@ par.set = makeParamSet( # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(1)), # large batch_size only makes sense when warmup_lr is on - # makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512, 1024), + # makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512, 1024), makeIntegerParam("batch_size", lower=5, upper=10, trafo = function(x) 2L^x), # use consecutive 978-neuron layers to facilitate residual connections makeDiscreteParam("dense", values=c("1500 500", @@ -79,13 +79,13 @@ obj.fun = makeSingleObjectiveFunction( max.budget <- 1500 propose.points<-5 -ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, +ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, trafo.y.fun = makeMBOTrafoFunction('log', log), impute.y.fun = function(x, y, opt.path, ...) 
.Machine$double.xmax ) ctrl = setMBOControlTermination(ctrl, max.evals = max.budget) -ctrl = setMBOControlInfill(ctrl, +ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritCB(), - opt.restarts = 1, + opt.restarts = 1, opt.focussearch.points = 1000) @@ -94,4 +94,3 @@ design = head(design, n = propose.points) configureMlr(show.info = FALSE, show.learner.output = FALSE, on.learner.warning = "quiet") res = mbo(obj.fun, design = design, learner = NULL, control = ctrl, show.info = TRUE) - diff --git a/workflows/common/R/test/test_utils.R b/workflows/common/R/test/test_utils.R index 7fac7440..0ebb9dfa 100644 --- a/workflows/common/R/test/test_utils.R +++ b/workflows/common/R/test/test_utils.R @@ -1,4 +1,4 @@ -# Split the string pushed into OUT_put into +# Split the string pushed into OUT_put into # list of numerical vectors (used in simple_mlrMBO_run_test.R) split.into.param.lines <- function(x){ res1 <- unlist(strsplit(x,split = ";")) @@ -17,4 +17,4 @@ split.json.into.dummy.param.lines <- function(x){ make.into.q.res <- function(x){ paste0(x,collapse = ";") -} \ No newline at end of file +} diff --git a/workflows/common/db/candle_sql.py b/workflows/common/db/candle_sql.py index 1f653d51..c92faaf2 100644 --- a/workflows/common/db/candle_sql.py +++ b/workflows/common/db/candle_sql.py @@ -1,34 +1,31 @@ - import datetime import logging import os import sqlite3 import sys + def setup_db(db_file): - """ - Convenience function to use from Swift/T - """ - if 'DB' not in globals(): - rank = os.getenv('PMIX_RANK') - print('rank %s Connecting to DB...' % rank) + """Convenience function to use from Swift/T.""" + if "DB" not in globals(): + rank = os.getenv("PMIX_RANK") + print("rank %s Connecting to DB..." % rank) global DB DB = candle_sql(db_file) return DB + class candle_sql: def __init__(self, db_file, log=False): - """ - Sets up a wrapper around the SQL connection and cursor objects - Also caches dicts that convert between names and ids for the - features and studies tables - """ - #self.conn = sqlite3.connect(db_file) - #self.cursor = self.conn.cursor() + """Sets up a wrapper around the SQL connection and cursor objects Also + caches dicts that convert between names and ids for the features and + studies tables.""" + # self.conn = sqlite3.connect(db_file) + # self.cursor = self.conn.cursor() self.db_file = db_file - self.autoclose = True - self.logger = None # Default + self.autoclose = True + self.logger = None # Default if log: logging.basicConfig(format="SQL: %(message)s") self.logger = logging.getLogger("candle_sql") @@ -40,10 +37,11 @@ def connect(self): self.cursor.execute("PRAGMA busy_timeout = 30000") def insert(self, table, names, values): - """ Do a SQL insert """ - names_tpl = sql_tuple(names) + """Do a SQL insert.""" + names_tpl = sql_tuple(names) values_tpl = sql_tuple(values) - cmd = "insert into {} {} values {};".format(table, names_tpl, values_tpl) + cmd = "insert into {} {} values {};".format(table, names_tpl, + values_tpl) self.execute(cmd) rowid = str(self.cursor.lastrowid) return rowid @@ -75,19 +73,22 @@ def __del__(self): def q(s): - """ Quote the given string """ + """Quote the given string.""" return "'" + str(s) + "'" + def qL(L): - """ Quote each list entry as a string """ + """Quote each list entry as a string.""" return map(q, L) + def qA(*args): - """ Quote each argument as a string """ + """Quote each argument as a string.""" return map(q, args) + def sql_tuple(L): - """ Make the given list into a SQL-formatted tuple """ + """Make the given list into a SQL-formatted 
tuple.""" result = "" result += "(" result += ",".join(L) diff --git a/workflows/common/ext/EQ-Py/eqpy.py b/workflows/common/ext/EQ-Py/eqpy.py index ace77806..96ac970f 100644 --- a/workflows/common/ext/EQ-Py/eqpy.py +++ b/workflows/common/ext/EQ-Py/eqpy.py @@ -1,6 +1,7 @@ -import threading +import importlib import sys -import importlib, traceback +import threading +import traceback EQPY_ABORT = "EQPY_ABORT" @@ -17,6 +18,7 @@ aborted = False wait_info = None + class WaitInfo: def __init__(self): @@ -27,6 +29,7 @@ def getWait(self): self.wait += 1 return self.wait + class ThreadRunner(threading.Thread): def __init__(self, runnable): @@ -41,6 +44,7 @@ def run(self): # tuple of type, value and traceback self.exc = traceback.format_exc() + def init(pkg): global p, wait_info wait_info = WaitInfo() @@ -48,6 +52,7 @@ def init(pkg): p = ThreadRunner(imported_pkg) p.start() + def output_q_get(): global output_q, aborted wait = wait_info.getWait() @@ -71,10 +76,12 @@ def output_q_get(): return result + def OUT_put(string_params): output_q.put(string_params) + def IN_get(): - #global input_q + # global input_q result = input_q.get() return result diff --git a/workflows/common/ext/EQ-R/eqr/BlockingQueue.h b/workflows/common/ext/EQ-R/eqr/BlockingQueue.h index c9dfd41c..a9f983da 100644 --- a/workflows/common/ext/EQ-R/eqr/BlockingQueue.h +++ b/workflows/common/ext/EQ-R/eqr/BlockingQueue.h @@ -24,7 +24,7 @@ class BlockingQueue { } this->d_condition.notify_one(); } - + T pop() { std::unique_lock lock(this->d_mutex); // [ capture-list ] ( params ) { body } diff --git a/workflows/common/python/deap_ga.py b/workflows/common/python/deap_ga.py index 046ff2f6..550dd318 100644 --- a/workflows/common/python/deap_ga.py +++ b/workflows/common/python/deap_ga.py @@ -1,51 +1,51 @@ -import threading -import random -import time -import math import csv import json +import math +import random +import threading import time +import eqpy +import ga_utils import numpy as np - -from deap import base -from deap import creator -from deap import tools -from deap import algorithms - -import eqpy, ga_utils +from deap import algorithms, base, creator, tools # list of ga_utils parameter objects ga_params = None + def obj_func(x): return 0 + # {"batch_size":512,"epochs":51,"activation":"softsign", -#"dense":"2000 1000 1000 500 100 50","optimizer":"adagrad","drop":0.1378, -#"learning_rate":0.0301,"conv":"25 25 25 25 25 1"} +# "dense":"2000 1000 1000 500 100 50","optimizer":"adagrad","drop":0.1378, +# "learning_rate":0.0301,"conv":"25 25 25 25 25 1"} def create_list_of_json_strings(list_of_lists, super_delim=";"): # create string of ; separated jsonified maps res = [] global ga_params for l in list_of_lists: jmap = {} - for i,p in enumerate(ga_params): + for i, p in enumerate(ga_params): jmap[p.name] = l[i] jstring = json.dumps(jmap) res.append(jstring) - return (super_delim.join(res)) + return super_delim.join(res) + def create_fitnesses(params_string): - """return equivalent length tuple list + """return equivalent length tuple list. 
+ :type params_string: str """ params = params_string.split(";") # get length res = [(i,) for i in range(len(params))] - return (res) + return res + def queue_map(obj_func, pops): # Note that the obj_func is not used @@ -55,15 +55,15 @@ def queue_map(obj_func, pops): return [] eqpy.OUT_put(create_list_of_json_strings(pops)) result = eqpy.IN_get() - split_result = result.split(';') + split_result = result.split(";") # TODO determine if max'ing or min'ing and use -9999999 or 99999999 - return [(float(x),) if not math.isnan(float(x)) else (float(99999999),) for x in split_result] - #return [(float(x),) for x in split_result] + return [(float(x),) if not math.isnan(float(x)) else (float(99999999),) + for x in split_result] + # return [(float(x),) for x in split_result] + def make_random_params(): - """ - Performs initial random draw on each parameter - """ + """Performs initial random draw on each parameter.""" global ga_params draws = [] @@ -72,26 +72,31 @@ def make_random_params(): return draws + def parse_init_params(params_file): init_params = [] with open(params_file) as f_in: reader = csv.reader(f_in) header = next(reader) for row in reader: - init_params.append(dict(zip(header,row))) + init_params.append(dict(zip(header, row))) return init_params + def update_init_pop(pop, params_file): global ga_params print("Reading initial population from {}".format(params_file)) init_params = parse_init_params(params_file) if len(pop) > len(init_params): - raise ValueError("Not enough initial params to set the population: size of init params < population size") + raise ValueError( + "Not enough initial params to set the population: size of init params < population size" + ) for i, indiv in enumerate(pop): for j, param in enumerate(ga_params): indiv[j] = param.parse(init_params[i][param.name]) + # keep as reference for log type # def mutGaussian_log(x, mu, sigma, mi, mx, indpb): # if random.random() < indpb: @@ -101,11 +106,10 @@ def update_init_pop(pop, params_file): # x = math.pow(10, logx) # return x + # Returns a tuple of one individual def custom_mutate(individual, indpb): - """ - Mutates the values in list individual with probability indpb - """ + """Mutates the values in list individual with probability indpb.""" # Note, if we had some aggregate constraint on the individual # (e.g. 
individual[1] * individual[2] < 10), we could copy @@ -116,15 +120,18 @@ def custom_mutate(individual, indpb): for i, param in enumerate(ga_params): individual[i] = param.mutate(individual[i], mu=0, indpb=indpb) - return individual, + return (individual,) + def cxUniform(ind1, ind2, indpb): c1, c2 = tools.cxUniform(ind1, ind2, indpb) return (c1, c2) + def timestamp(scores): return str(time.time()) + def run(): """ :param num_iter: number of generations @@ -138,7 +145,8 @@ def run(): params = eqpy.IN_get() # parse params - (num_iter, num_pop, seed, strategy, mut_prob, ga_params_file, param_file) = eval('{}'.format(params)) + (num_iter, num_pop, seed, strategy, mut_prob, ga_params_file, + param_file) = eval("{}".format(params)) random.seed(seed) global ga_params ga_params = ga_utils.create_parameters(ga_params_file) @@ -172,20 +180,37 @@ def run(): # num_iter-1 generations since the initial population is evaluated once first mutpb = mut_prob start_time = time.time() - if strategy == 'simple': - pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=mutpb, ngen=num_iter - 1, - stats=stats, halloffame=hof, verbose=True) - elif strategy == 'mu_plus_lambda': + if strategy == "simple": + pop, log = algorithms.eaSimple( + pop, + toolbox, + cxpb=0.5, + mutpb=mutpb, + ngen=num_iter - 1, + stats=stats, + halloffame=hof, + verbose=True, + ) + elif strategy == "mu_plus_lambda": mu = int(math.floor(float(num_pop) * 0.5)) lam = int(math.floor(float(num_pop) * 0.5)) if mu + lam < num_pop: mu += num_pop - (mu + lam) - pop, log = algorithms.eaMuPlusLambda(pop, toolbox, mu=mu, lambda_=lam, - cxpb=0.5, mutpb=mutpb, ngen=num_iter - 1, - stats=stats, halloffame=hof, verbose=True) + pop, log = algorithms.eaMuPlusLambda( + pop, + toolbox, + mu=mu, + lambda_=lam, + cxpb=0.5, + mutpb=mutpb, + ngen=num_iter - 1, + stats=stats, + halloffame=hof, + verbose=True, + ) else: - raise NameError('invalid strategy: {}'.format(strategy)) + raise NameError("invalid strategy: {}".format(strategy)) end_time = time.time() @@ -193,5 +218,10 @@ def run(): eqpy.OUT_put("DONE") # return the final population - eqpy.OUT_put("{}\n{}\n{}\n{}\n{}".format(create_list_of_json_strings(pop), ';'.join(fitnesses), - start_time, log, end_time)) + eqpy.OUT_put("{}\n{}\n{}\n{}\n{}".format( + create_list_of_json_strings(pop), + ";".join(fitnesses), + start_time, + log, + end_time, + )) diff --git a/workflows/common/python/dummy_baseline_keras2.py b/workflows/common/python/dummy_baseline_keras2.py index c0edfa5b..64b10a22 100644 --- a/workflows/common/python/dummy_baseline_keras2.py +++ b/workflows/common/python/dummy_baseline_keras2.py @@ -1,18 +1,22 @@ - # DUMMY BASELINE KERAS2 # To support workflow debugging + def initialize_parameters(): - return {} # empty dictionary + return {} # empty dictionary + class fake_history: + def __init__(self, x): - self.history = {"val_loss":[x]} + self.history = {"val_loss": [x]} + def run(params): print("RUNNING DUMMY: " + str(params)) import random - #value = float(len(str(params))) + random.random() + + # value = float(len(str(params))) + random.random() value = random.random() result = fake_history(value) return result diff --git a/workflows/common/python/ga_utils.py b/workflows/common/python/ga_utils.py index 2454dfde..f5bc4a3d 100644 --- a/workflows/common/python/ga_utils.py +++ b/workflows/common/python/ga_utils.py @@ -1,6 +1,10 @@ from __future__ import print_function -import random, json, sys, math +import json +import math +import random +import sys + def is_number(s): try: @@ -9,6 +13,7 @@ def 
is_number(s): except ValueError: return False + class ConstantParameter(object): def __init__(self, name, value): @@ -28,6 +33,7 @@ def parse(self, s): return int(s) return s + class NumericParameter(object): def __init__(self, name, lower, upper, sigma): @@ -56,6 +62,7 @@ def mutate(self, x, mu, indpb): def parse(self, s): return int(s) + class FloatParameter(NumericParameter): def __init__(self, name, lower, upper, sigma): @@ -71,9 +78,11 @@ def mutate(self, x, mu, indpb): def parse(self, s): return float(s) -#import logging -#logging.basicConfig() -#log = logging.getLogger("a") + +# import logging +# logging.basicConfig() +# log = logging.getLogger("a") + def str_to_bool(s): if s.lower() == "true": @@ -81,30 +90,35 @@ def str_to_bool(s): else: return False + class ListParameter(object): def __init__(self, name, categories, element_type): self.name = name self.categories = categories - if element_type == 'float': + if element_type == "float": self.parse_func = float - elif element_type == 'int': + elif element_type == "int": self.parse_func = int - elif element_type == 'string': + elif element_type == "string": self.parse_func = str - elif element_type == 'logical': + elif element_type == "logical": self.parse_func = str_to_bool else: - raise ValueError("Invalid type: {} - must be one of 'float', 'int', 'string', or 'logical'") + raise ValueError( + "Invalid type: {} - must be one of 'float', 'int', 'string', or 'logical'" + ) def parse(self, s): return self.parse_func(s) + class CategoricalParameter(ListParameter): def __init__(self, name, categories, element_type): - super(CategoricalParameter, self).__init__(name, categories, element_type) + super(CategoricalParameter, self).__init__(name, categories, + element_type) def randomDraw(self): i = random.randint(0, len(self.categories) - 1) @@ -119,6 +133,7 @@ def mutate(self, x, mu, indpb): x = a return x + class OrderedParameter(ListParameter): def __init__(self, name, categories, sigma, element_type): @@ -145,6 +160,7 @@ def mutate(self, x, mu, indpb): x = self.categories[n] return x + class LogicalParameter: def __init__(self, name): @@ -164,48 +180,51 @@ def parse(self, s): else: return False + def create_parameters(param_file, ignore_sigma=False): with open(param_file) as json_file: data = json.load(json_file) params = [] for item in data: - name = item['name'] - t = item['type'] + name = item["name"] + t = item["type"] if ignore_sigma: - sigma = float('nan') - if t == 'int' or t == 'float': - lower = item['lower'] - upper = item['upper'] + sigma = float("nan") + if t == "int" or t == "float": + lower = item["lower"] + upper = item["upper"] if not ignore_sigma: - sigma = item['sigma'] + sigma = item["sigma"] - if t == 'int': - params.append(IntParameter(name, int(lower), int(upper), - int(sigma))) + if t == "int": + params.append( + IntParameter(name, int(lower), int(upper), int(sigma))) else: - params.append(FloatParameter(name, float(lower), float(upper), - float(sigma))) + params.append( + FloatParameter(name, float(lower), float(upper), + float(sigma))) - elif t == 'categorical': - vs = item['values'] - element_type = item['element_type'] + elif t == "categorical": + vs = item["values"] + element_type = item["element_type"] params.append(CategoricalParameter(name, vs, element_type)) - elif t == 'logical': + elif t == "logical": params.append(LogicalParameter(name)) elif t == "ordered": - vs = item['values'] + vs = item["values"] if not ignore_sigma: - sigma = item['sigma'] - element_type = item['element_type'] + sigma = 
item["sigma"] + element_type = item["element_type"] params.append(OrderedParameter(name, vs, sigma, element_type)) - elif t == 'constant': - vs = item['value'] + elif t == "constant": + vs = item["value"] params.append(ConstantParameter(name, vs)) return params -if __name__ == '__main__': + +if __name__ == "__main__": create_parameters(sys.argv[1]) diff --git a/workflows/common/python/log_runner.py b/workflows/common/python/log_runner.py index dea00252..f23709ef 100644 --- a/workflows/common/python/log_runner.py +++ b/workflows/common/python/log_runner.py @@ -1,27 +1,32 @@ import sys + import exp_logger + def log_start(): parameter_map = {} - parameter_map['pp'] = sys.argv[2] - parameter_map['iterations'] = sys.argv[3] - parameter_map['params'] = "\"\"\"{}\"\"\"".format(sys.argv[4]) - parameter_map['algorithm'] = sys.argv[5] - parameter_map['experiment_id'] = sys.argv[6] - sys_env = "\"\"\"{}\"\"\"".format(sys.argv[7]) + parameter_map["pp"] = sys.argv[2] + parameter_map["iterations"] = sys.argv[3] + parameter_map["params"] = '"""{}"""'.format(sys.argv[4]) + parameter_map["algorithm"] = sys.argv[5] + parameter_map["experiment_id"] = sys.argv[6] + sys_env = '"""{}"""'.format(sys.argv[7]) exp_logger.start(parameter_map, sys_env) + def log_end(): exp_id = sys.argv[2] exp_logger.end(exp_id) + def main(): print(sys.argv) - if sys.argv[1] == 'start': + if sys.argv[1] == "start": log_start() else: log_end() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/workflows/common/python/log_tools.py b/workflows/common/python/log_tools.py index d60cf489..f4648413 100644 --- a/workflows/common/python/log_tools.py +++ b/workflows/common/python/log_tools.py @@ -1,4 +1,3 @@ - # LOG TOOLS # Standardize some Python logging techniques @@ -7,33 +6,32 @@ logger = None + def get_logger(logger, name, stream=sys.stdout): - """ Set up logging """ + """Set up logging.""" if logger is not None: return logger import logging + logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) h = logging.StreamHandler(stream=stream) - fmtr = logging.Formatter('%(asctime)s %(name)s %(levelname)-5s %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') + fmtr = logging.Formatter("%(asctime)s %(name)s %(levelname)-5s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S") h.setFormatter(fmtr) logger.addHandler(h) return logger - # def log(msg): # global logger # logger.info(msg) - # def log_info(msg): # global logger # logger = get_logger(logger) # logger.info(msg) - # def debug(msg): # global logger # logger = get_logger(logger) @@ -42,4 +40,5 @@ def get_logger(logger, name, stream=sys.stdout): def timestamp(): from datetime import datetime + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") diff --git a/workflows/common/python/model_abstention_runner.py b/workflows/common/python/model_abstention_runner.py index 5747f40c..129f53fe 100644 --- a/workflows/common/python/model_abstention_runner.py +++ b/workflows/common/python/model_abstention_runner.py @@ -1,18 +1,18 @@ - # MODEL RUNNER PY # See __main__ section for usage -import sys +import importlib import json +import math import os +import sys import time + +import log_tools import numpy as np -import importlib import runner_utils from runner_utils import ModelResult -import log_tools -import math logger = None @@ -26,30 +26,32 @@ # append ${BENCHMARKS_ROOT}/common to $PATH if variable is set benchmarks_root = os.getenv("BENCHMARKS_ROOT") if benchmarks_root: - sys.path.append(benchmarks_root+"/common") + sys.path.append(benchmarks_root + "/common") # import 
candle_lrn_crv print("sys.path:") -for i in range(0, len(sys.path)-1): +for i in range(0, len(sys.path) - 1): print("%2i: %s" % (i, sys.path[i])) print("") + def import_pkg(framework, model_name): # The model_name is the short form of the Benchmark: e.g., 'nt3' # The module_name is the name of the Python module: e.g., 'nt3_baseline_keras2' print("model_name: ", model_name) module_name = os.getenv("MODEL_PYTHON_SCRIPT") - if framework == 'keras': + if framework == "keras": if module_name == None or module_name == "": module_name = "{}_abstention_keras2".format(model_name) - print ("module_name:", module_name) + print("module_name:", module_name) pkg = importlib.import_module(module_name) - elif framework == 'pytorch': + elif framework == "pytorch": import torch + if module_name == None or module_name == "": module_name = "{}_baseline_pytorch".format(model_name) - print ("module_name:", module_name) + print("module_name:", module_name) pkg = importlib.import_module(module_name) else: raise ValueError("Framework must either be `keras' or `pytorch' " + @@ -62,58 +64,67 @@ def log(msg): global logger logger.debug(msg) + def timestamp(): from datetime import datetime + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + def setup_perf(params): - return { 'top': setup_perf_top(params), - 'nvidia': setup_perf_nvidia(params) } + return {"top": setup_perf_top(params), "nvidia": setup_perf_nvidia(params)} def setup_perf_top(params): - if 'perf_top' not in params: + if "perf_top" not in params: return None - if params['perf_top'] == '0': + if params["perf_top"] == "0": return None try: - delay = int(params['perf_top']) + delay = int(params["perf_top"]) except: - msg = 'setup_perf_top(): params[perf_top] not an int: got: "%s"' % \ - params['perf_top'] + msg = ('setup_perf_top(): params[perf_top] not an int: got: "%s"' % + params["perf_top"]) print(msg) raise Exception(msg) import subprocess - with open('perf-top.log', 'a') as fp_out: - fp_out.write('model_runner: start: %s\n\n' % timestamp()) - P = subprocess.Popen(['top', '-b', '-d', params['perf_top']], - stdout=fp_out, - stderr=subprocess.STDOUT) + + with open("perf-top.log", "a") as fp_out: + fp_out.write("model_runner: start: %s\n\n" % timestamp()) + P = subprocess.Popen( + ["top", "-b", "-d", params["perf_top"]], + stdout=fp_out, + stderr=subprocess.STDOUT, + ) return P + def setup_perf_nvidia(params): - if 'perf_nvidia' not in params: + if "perf_nvidia" not in params: return None - if params['perf_nvidia'] == '0': + if params["perf_nvidia"] == "0": return None try: - delay = int(params['perf_nvidia']) + delay = int(params["perf_nvidia"]) except: - msg = 'setup_perf_nvidia(): params[perf_nvidia] not an int: ' + \ - 'got: "%s"' % params['perf_nvidia'] + msg = ("setup_perf_nvidia(): params[perf_nvidia] not an int: " + + 'got: "%s"' % params["perf_nvidia"]) print(msg) raise Exception(msg) import subprocess - with open('perf-nvidia.log', 'a') as fp_out: - fp_out.write('model_runner: start: %s\n\n' % timestamp()) - P = subprocess.Popen(['nvidia-smi', '--loop='+params['perf_top']], - stdout=fp_out, - stderr=subprocess.STDOUT) + + with open("perf-nvidia.log", "a") as fp_out: + fp_out.write("model_runner: start: %s\n\n" % timestamp()) + P = subprocess.Popen( + ["nvidia-smi", "--loop=" + params["perf_top"]], + stdout=fp_out, + stderr=subprocess.STDOUT, + ) return P def stop_perf(Ps): - for s in ['top', 'nvidia']: + for s in ["top", "nvidia"]: if Ps[s] is not None: Ps[s].terminate() @@ -121,28 +132,28 @@ def stop_perf(Ps): def run(hyper_parameter_map, 
obj_return): start = time.time() global logger - logger = log_tools.get_logger(logger, 'MODEL RUNNER') + logger = log_tools.get_logger(logger, "MODEL RUNNER") log("START:") sys.stdout.flush() - directory = hyper_parameter_map['instance_directory'] + directory = hyper_parameter_map["instance_directory"] os.chdir(directory) - with open(directory + '/rank.txt', 'w') as fp: - fp.write(str(os.getenv('ADLB_RANK_SELF')) + '\n') + with open(directory + "/rank.txt", "w") as fp: + fp.write(str(os.getenv("ADLB_RANK_SELF")) + "\n") - framework = hyper_parameter_map['framework'] - model_name = hyper_parameter_map['model_name'] + framework = hyper_parameter_map["framework"] + model_name = hyper_parameter_map["model_name"] pkg = import_pkg(framework, model_name) runner_utils.format_params(hyper_parameter_map) params_arg = {} - if 'config_file' in hyper_parameter_map: - config_file = hyper_parameter_map['config_file'] + if "config_file" in hyper_parameter_map: + config_file = hyper_parameter_map["config_file"] logger.info('specified config_file: "%s"' % config_file) - params_arg = { 'default_model': config_file } + params_arg = {"default_model": config_file} # params is a python dictionary params = setup_params(pkg, hyper_parameter_map, params_arg) @@ -152,7 +163,7 @@ def run(hyper_parameter_map, obj_return): # Run the model! history = pkg.run(params) - if framework == 'keras': + if framework == "keras": runner_utils.keras_clear_session(framework) # Default result if there is no val_loss (as in infer.py) @@ -172,7 +183,7 @@ def run(hyper_parameter_map, obj_return): def get_obj_return(): obj_return = os.getenv("OBJ_RETURN") - valid_obj_returns = [ "loss", "val_loss", "val_corr", "val_acc" ] + valid_obj_returns = ["loss", "val_loss", "val_corr", "val_acc"] if obj_return == None: raise Exception("No OBJ_RETURN was in the environment!") if obj_return not in valid_obj_returns: @@ -180,6 +191,7 @@ def get_obj_return(): str(valid_obj_returns)) return obj_return + def load_pre_post(hyper_parameter_map, key): module = None if key in hyper_parameter_map: @@ -187,8 +199,9 @@ def load_pre_post(hyper_parameter_map, key): module = importlib.import_module(module_name) return module + def run_pre(hyper_parameter_map): - module = load_pre_post(hyper_parameter_map, 'pre_module') + module = load_pre_post(hyper_parameter_map, "pre_module") result = ModelResult.SUCCESS if module != None: logger.debug("PRE RUN START") @@ -196,15 +209,17 @@ def run_pre(hyper_parameter_map): logger.debug("PRE RUN STOP") return result + def run_post(hyper_parameter_map, output_map): - module = load_pre_post(hyper_parameter_map, 'post_module') + module = load_pre_post(hyper_parameter_map, "post_module") if module != None: logger.debug("POST RUN START") module.post_run(hyper_parameter_map, output_map) logger.debug("POST RUN STOP") + def run_model(hyper_parameter_map): - instance_directory = hyper_parameter_map['instance_directory'] + instance_directory = hyper_parameter_map["instance_directory"] os.chdir(instance_directory) global logger logger = log_tools.get_logger(logger, "MODEL RUNNER") @@ -218,23 +233,27 @@ def run_model(hyper_parameter_map): sys.stdout.flush() return ("SKIP", "HISTORY_EMPTY") else: - assert(result == ModelResult.SUCCESS) # proceed... + assert result == ModelResult.SUCCESS # proceed... 
result, history = run(hyper_parameter_map, obj_return) runner_utils.write_output(result, instance_directory) - runner_utils.write_output(json.dumps(history, cls=runner_utils.FromNPEncoder), - instance_directory, 'history.txt') + runner_utils.write_output( + json.dumps(history, cls=runner_utils.FromNPEncoder), + instance_directory, + "history.txt", + ) run_post(hyper_parameter_map, {}) log("RUN STOP") return (result, history) + def setup_params(pkg, hyper_parameter_map, params_arg): params = pkg.initialize_parameters(**params_arg) log("PARAM UPDATE START") - for k,v in hyper_parameter_map.items(): + for k, v in hyper_parameter_map.items(): if k == "dense" or k == "dense_feature_layers": - if(type(v) != list): + if type(v) != list: v = v.split(" ") v = [int(i) for i in v] if k == "cell_features": @@ -252,19 +271,18 @@ def setup_params(pkg, hyper_parameter_map, params_arg): def get_results(history, obj_return): - """ - Return the history entry that the user requested. + """Return the history entry that the user requested. + history: The Keras history object """ values = history.history[obj_return] # Default: the last value in the history result = values[-1] - known_params = [ "loss", "val_loss", "val_corr", "val_dice_coef" ] + known_params = ["loss", "val_loss", "val_corr", "val_dice_coef"] if obj_return not in known_params: raise ValueError("Unsupported objective function: " + - "use obj_param to specify one of " + - str(known_params)) + "use obj_param to specify one of " + str(known_params)) # Fix NaNs: if math.isnan(result): @@ -279,32 +297,35 @@ def get_results(history, obj_return): history_result = history.history.copy() return result, history_result + # Usage: see how sys.argv is unpacked below: -if __name__ == '__main__': +if __name__ == "__main__": logger = log_tools.get_logger(logger, "MODEL_RUNNER") log("RUN START") - ( _, # The Python program name (unused) - param_string, - instance_directory, - framework, - runid, - benchmark_timeout ) = sys.argv + ( + _, # The Python program name (unused) + param_string, + instance_directory, + framework, + runid, + benchmark_timeout, + ) = sys.argv hyper_parameter_map = runner_utils.init(param_string, instance_directory, framework, - out_dir_key='save') - hyper_parameter_map['model_name'] = os.getenv("MODEL_NAME") - if hyper_parameter_map['model_name'] == None: + out_dir_key="save") + hyper_parameter_map["model_name"] = os.getenv("MODEL_NAME") + if hyper_parameter_map["model_name"] == None: raise Exception("No MODEL_NAME was in the environment!") - hyper_parameter_map['experiment_id'] = os.getenv("EXPID") - hyper_parameter_map['run_id'] = runid - hyper_parameter_map['timeout'] = float(benchmark_timeout) + hyper_parameter_map["experiment_id"] = os.getenv("EXPID") + hyper_parameter_map["run_id"] = runid + hyper_parameter_map["timeout"] = float(benchmark_timeout) # tensorflow.__init__ calls _os.path.basename(_sys.argv[0]) # so we need to create a synthetic argv. 
# if (not hasattr(sys, 'argv')) or (len(sys.argv) == 0): # sys.argv = ['nt3_tc1'] - sys.argv = ['null'] + sys.argv = ["null"] run_model(hyper_parameter_map) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 496b3c6a..2c4e2708 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -1,21 +1,21 @@ - # MODEL RUNNER PY # See __main__ section for usage +import importlib import json import math import os import sys import time -import importlib + import runner_utils -from runner_utils import ModelResult from log_tools import * +from runner_utils import ModelResult logger = None -print('MODEL RUNNER...') +print("MODEL RUNNER...") # Set PYTHONPATH: # Let MODEL_PYTHON_DIR override default Benchmarks model locations @@ -30,9 +30,9 @@ # Report PYTHONPATH for debugging print("sys.path:") -for i in range(0, len(sys.path)-1): - print('%2i: %s' % (i, sys.path[i])) -print('') +for i in range(0, len(sys.path) - 1): + print("%2i: %s" % (i, sys.path[i])) +print("") def import_pkg(framework, model_name): @@ -41,13 +41,14 @@ def import_pkg(framework, model_name): # e.g., 'nt3_baseline_keras2' print("model_name: ", model_name) module_name = os.getenv("MODEL_PYTHON_SCRIPT") - if framework == 'keras': + if framework == "keras": if module_name is None or module_name == "": module_name = "{}_baseline_keras2".format(model_name) print("module_name: " + module_name) pkg = importlib.import_module(module_name) - elif framework == 'pytorch': + elif framework == "pytorch": import torch # noqa: F401 + if module_name is None or module_name == "": module_name = "{}_baseline_pytorch".format(model_name) print("module_name: " + module_name) @@ -67,58 +68,60 @@ def log(msg): def timestamp(): from datetime import datetime + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") def setup_perf(params): - return { 'top': setup_perf_top(params), - 'nvidia': setup_perf_nvidia(params) } + return {"top": setup_perf_top(params), "nvidia": setup_perf_nvidia(params)} def setup_perf_top(params): - if 'perf_top' not in params: + if "perf_top" not in params: return None - if params['perf_top'] == '0': + if params["perf_top"] == "0": return None try: - delay = int(params['perf_top']) + delay = int(params["perf_top"]) except Exception: - msg = 'setup_perf_top(): params[perf_top] not an int: got: "%s"' % \ - params['perf_top'] + msg = ('setup_perf_top(): params[perf_top] not an int: got: "%s"' % + params["perf_top"]) print(msg) raise Exception(msg) import subprocess - with open('perf-top.log', 'a') as fp_out: - fp_out.write('model_runner: start: %s\n\n' % timestamp()) - P = subprocess.Popen(['top', '-b', '-d', delay], + + with open("perf-top.log", "a") as fp_out: + fp_out.write("model_runner: start: %s\n\n" % timestamp()) + P = subprocess.Popen(["top", "-b", "-d", delay], stdout=fp_out, stderr=subprocess.STDOUT) return P def setup_perf_nvidia(params): - if 'perf_nvidia' not in params: + if "perf_nvidia" not in params: return None - if params['perf_nvidia'] == '0': + if params["perf_nvidia"] == "0": return None try: - delay = int(params['perf_nvidia']) + delay = int(params["perf_nvidia"]) except Exception: - msg = 'setup_perf_nvidia(): params[perf_nvidia] not an int: ' + \ - 'got: "%s"' % params['perf_nvidia'] + msg = ("setup_perf_nvidia(): params[perf_nvidia] not an int: " + + 'got: "%s"' % params["perf_nvidia"]) print(msg) raise Exception(msg) import subprocess - with open('perf-nvidia.log', 'a') as fp_out: - fp_out.write('model_runner: start: %s\n\n' 
% timestamp()) - P = subprocess.Popen(['nvidia-smi', '--loop=%i' % delay], + + with open("perf-nvidia.log", "a") as fp_out: + fp_out.write("model_runner: start: %s\n\n" % timestamp()) + P = subprocess.Popen(["nvidia-smi", "--loop=%i" % delay], stdout=fp_out, stderr=subprocess.STDOUT) return P def stop_perf(Ps): - for s in ['top', 'nvidia']: + for s in ["top", "nvidia"]: if Ps[s] is not None: Ps[s].terminate() @@ -126,39 +129,40 @@ def stop_perf(Ps): def run(hyper_parameter_map, obj_return): start = time.time() global logger - logger = get_logger(logger, 'MODEL RUNNER') + logger = get_logger(logger, "MODEL RUNNER") - logger.info('run(): START:') + logger.info("run(): START:") sys.stdout.flush() - directory = hyper_parameter_map['instance_directory'] # should be output_dir + directory = hyper_parameter_map[ + "instance_directory"] # should be output_dir os.chdir(directory) - with open(directory + '/rank.txt', 'w') as fp: - fp.write(str(os.getenv('ADLB_RANK_SELF')) + '\n') + with open(directory + "/rank.txt", "w") as fp: + fp.write(str(os.getenv("ADLB_RANK_SELF")) + "\n") - framework = hyper_parameter_map['framework'] - model_name = hyper_parameter_map['model_name'] + framework = hyper_parameter_map["framework"] + model_name = hyper_parameter_map["model_name"] pkg = import_pkg(framework, model_name) runner_utils.format_params(hyper_parameter_map) params_arg = {} - if 'CANDLE_DEFAULT_MODEL_FILE' in os.environ: - config_file = os.getenv('CANDLE_DEFAULT_MODEL_FILE') + if "CANDLE_DEFAULT_MODEL_FILE" in os.environ: + config_file = os.getenv("CANDLE_DEFAULT_MODEL_FILE") logger.info('CANDLE_DEFAULT_MODEL_FILE: "%s"' % config_file) - params_arg = { 'default_model': config_file } - if 'config_file' in hyper_parameter_map: - config_file = hyper_parameter_map['config_file'] + params_arg = {"default_model": config_file} + if "config_file" in hyper_parameter_map: + config_file = hyper_parameter_map["config_file"] logger.info('specified config_file: "%s"' % config_file) - params_arg = { 'default_model': config_file } + params_arg = {"default_model": config_file} # params is a Python dictionary params = setup_params(pkg, hyper_parameter_map, params_arg) Ps = setup_perf(params) - history = None + history = None exception = False # Run the model! @@ -173,7 +177,7 @@ def run(hyper_parameter_map, obj_return): exception = True log("PKG RUN STOP") - if framework == 'keras': + if framework == "keras": runner_utils.keras_clear_session(framework) stop_perf(Ps) @@ -181,8 +185,8 @@ def run(hyper_parameter_map, obj_return): duration = finish - start # check for epochs if not present set to 1, used for checking early stopping in function get_results - if 'epochs' in hyper_parameter_map: - epochs = hyper_parameter_map['epochs'] + if "epochs" in hyper_parameter_map: + epochs = hyper_parameter_map["epochs"] else: epochs = 1 @@ -190,8 +194,8 @@ def run(hyper_parameter_map, obj_return): result = 0 history_result = {} if not exception: - logger.info('DONE: run_id %s in %0.2f seconds.' % - (hyper_parameter_map['run_id'], duration)) + logger.info("DONE: run_id %s in %0.2f seconds." 
% + (hyper_parameter_map["run_id"], duration)) if history is not None: if history == "EPOCHS_COMPLETED_ALREADY": result, history_result = "EPOCHS_COMPLETED_ALREADY", None @@ -205,12 +209,12 @@ def run(hyper_parameter_map, obj_return): def get_obj_return(): - obj_return = os.getenv('OBJ_RETURN') - valid_obj_returns = [ 'loss', 'val_loss', 'val_corr', 'val_acc' ] + obj_return = os.getenv("OBJ_RETURN") + valid_obj_returns = ["loss", "val_loss", "val_corr", "val_acc"] if obj_return is None: - raise Exception('No OBJ_RETURN was in the environment!') + raise Exception("No OBJ_RETURN was in the environment!") if obj_return not in valid_obj_returns: - raise Exception('Invalid value for OBJ_RETURN: use: ' + + raise Exception("Invalid value for OBJ_RETURN: use: " + str(valid_obj_returns)) return obj_return @@ -224,159 +228,159 @@ def load_pre_post(hyper_parameter_map, key): def run_pre(hyper_parameter_map): - module = load_pre_post(hyper_parameter_map, 'pre_module') + module = load_pre_post(hyper_parameter_map, "pre_module") result = ModelResult.SUCCESS if module is not None: - logger.debug('PRE RUN START') + logger.debug("PRE RUN START") result = module.pre_run(hyper_parameter_map) - logger.debug('PRE RUN STOP') + logger.debug("PRE RUN STOP") return result def run_post(hyper_parameter_map, output_map): - module = load_pre_post(hyper_parameter_map, 'post_module') + module = load_pre_post(hyper_parameter_map, "post_module") if module is not None: - logger.debug('POST RUN START') + logger.debug("POST RUN START") module.post_run(hyper_parameter_map, output_map) - logger.debug('POST RUN STOP') + logger.debug("POST RUN STOP") def run_model(hyper_parameter_map): # In-memory Python runs may not create sys.argv - if 'argv' not in dir(sys): + if "argv" not in dir(sys): # This is needed for CANDLE Benchmarks finalize_parameters(): - sys.argv = ['null'] - instance_directory = hyper_parameter_map['instance_directory'] + sys.argv = ["null"] + instance_directory = hyper_parameter_map["instance_directory"] os.chdir(instance_directory) global logger - logger = get_logger(logger, 'MODEL RUNNER') + logger = get_logger(logger, "MODEL RUNNER") obj_return = get_obj_return() # logger.info("run_model: node: " + hyper_parameter_map['node']) - directory = hyper_parameter_map['instance_directory'] + directory = hyper_parameter_map["instance_directory"] os.chdir(directory) - if os.path.exists('stop.marker'): - logger.info('stop.marker exists!') - return ('SKIP', 'STOP_MARKER') + if os.path.exists("stop.marker"): + logger.info("stop.marker exists!") + return ("SKIP", "STOP_MARKER") result = run_pre(hyper_parameter_map) if result == ModelResult.ERROR: - print('run_pre() returned ERROR!') + print("run_pre() returned ERROR!") exit(1) elif result == ModelResult.SKIP: - logger.info('run_pre() returned SKIP ...') + logger.info("run_pre() returned SKIP ...") sys.stdout.flush() - return ('SKIP', 'HISTORY_EMPTY') + return ("SKIP", "HISTORY_EMPTY") else: - assert(result == ModelResult.SUCCESS) # proceed... + assert result == ModelResult.SUCCESS # proceed... 
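    # Descriptive note: run() (defined above) trains the model and returns the
    # objective value selected by OBJ_RETURN together with a copy of the Keras
    # history dict; see get_results() for how that value is extracted.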
result, history = run(hyper_parameter_map, obj_return) runner_utils.write_output(result, directory) - runner_utils.write_output(json.dumps(history, - cls=runner_utils.FromNPEncoder), - directory, 'history.txt') + runner_utils.write_output( + json.dumps(history, cls=runner_utils.FromNPEncoder), directory, + "history.txt") run_post(hyper_parameter_map, {}) - logger.info('RUN STOP') + logger.info("RUN STOP") return (result, history) def setup_params(pkg, hyper_parameter_map, params_arg): params = pkg.initialize_parameters(**params_arg) - logger.debug('PARAM UPDATE START') - for k,v in hyper_parameter_map.items(): - if k == 'dense' or k == 'dense_feature_layers': - if(type(v) != list): - v = v.split(' ') + logger.debug("PARAM UPDATE START") + for k, v in hyper_parameter_map.items(): + if k == "dense" or k == "dense_feature_layers": + if type(v) != list: + v = v.split(" ") v = [int(i) for i in v] - if k == 'cell_features': + if k == "cell_features": cp_str = v v = list() v.append(cp_str) - logger.debug(str(k) + ' = ' + str(v)) + logger.debug(str(k) + " = " + str(v)) params[k] = v - logger.debug('PARAM UPDATE STOP') + logger.debug("PARAM UPDATE STOP") - logger.debug('WRITE_PARAMS START') + logger.debug("WRITE_PARAMS START") runner_utils.write_params(params, hyper_parameter_map) - logger.debug('WRITE_PARAMS STOP') + logger.debug("WRITE_PARAMS STOP") return params def get_results(history, obj_return, epochs_expected): - """ - Return the history entry that the user requested. + """Return the history entry that the user requested. + Also checks for early stopping and if so marks the directory. history: The Keras history object """ logger.debug('get_results(): "%s"' % obj_return) - known_params = [ 'loss', 'val_loss', 'val_corr', 'val_dice_coef' ] + known_params = ["loss", "val_loss", "val_corr", "val_dice_coef"] if obj_return not in known_params: - raise ValueError('Unsupported objective function return ' + - 'key: "' + obj_return + '" - ' + - 'use obj_param to specify one of ' + - str(known_params)) + raise ValueError("Unsupported objective function return " + 'key: "' + + obj_return + '" - ' + + "use obj_param to specify one of " + str(known_params)) if obj_return in history.history: # Good value values = history.history[obj_return] if len(values) < epochs_expected: - msg = 'early stopping: %i/%i' % \ - (len(values), epochs_expected) - logger.info('get_results(): ' + msg) - with open('stop.marker', 'w') as fp: - fp.write(msg + '\n') + msg = "early stopping: %i/%i" % (len(values), epochs_expected) + logger.info("get_results(): " + msg) + with open("stop.marker", "w") as fp: + fp.write(msg + "\n") # Default: the last value in the history result = values[-1] else: - logger.warning('get_results(): objective function return key ' + - 'not found: ' + - 'key: "' + obj_return + '" - ' + - 'history: ' + str(history.history.keys())) - logger.warning('get_results(): returning NaN') + logger.warning("get_results(): objective function return key " + + "not found: " + 'key: "' + obj_return + '" - ' + + "history: " + str(history.history.keys())) + logger.warning("get_results(): returning NaN") result = math.nan # Fix NaNs: if math.isnan(result): - if obj_return == 'val_corr' or obj_return == 'val_dice_coef': + if obj_return == "val_corr" or obj_return == "val_dice_coef": # Return the negative result result = -result else: # Just return a large number result = 999999999 - print('result: ' + obj_return + ': ' + str(result)) + print("result: " + obj_return + ": " + str(result)) history_result = 
history.history.copy() return result, history_result # Usage: see how sys.argv is unpacked below: -if __name__ == '__main__': - logger = get_logger(logger, 'MODEL_RUNNER') - logger.info('main: RUN START') +if __name__ == "__main__": + logger = get_logger(logger, "MODEL_RUNNER") + logger.info("main: RUN START") import sys - ( _, # The Python program name (unused) - param_string, - instance_directory, - framework, - runid, - benchmark_timeout ) = sys.argv + + ( + _, # The Python program name (unused) + param_string, + instance_directory, + framework, + runid, + benchmark_timeout, + ) = sys.argv hyper_parameter_map = runner_utils.init(param_string, instance_directory, framework, - out_dir_key='save') - hyper_parameter_map['model_name'] = os.getenv('MODEL_NAME') - if hyper_parameter_map['model_name'] is None: - raise Exception('No MODEL_NAME was in the environment!') - hyper_parameter_map['experiment_id'] = os.getenv('EXPID') - hyper_parameter_map['run_id'] = runid - hyper_parameter_map['timeout'] = float(benchmark_timeout) + out_dir_key="save") + hyper_parameter_map["model_name"] = os.getenv("MODEL_NAME") + if hyper_parameter_map["model_name"] is None: + raise Exception("No MODEL_NAME was in the environment!") + hyper_parameter_map["experiment_id"] = os.getenv("EXPID") + hyper_parameter_map["run_id"] = runid + hyper_parameter_map["timeout"] = float(benchmark_timeout) # tensorflow.__init__ calls _os.path.basename(_sys.argv[0]) # so we need to create a synthetic argv. # if (not hasattr(sys, 'argv')) or (len(sys.argv) == 0): # sys.argv = ['nt3_tc1'] - sys.argv = ['null'] + sys.argv = ["null"] run_model(hyper_parameter_map) diff --git a/workflows/common/python/runner_utils.py b/workflows/common/python/runner_utils.py index 4d3f2f02..08142002 100644 --- a/workflows/common/python/runner_utils.py +++ b/workflows/common/python/runner_utils.py @@ -1,16 +1,26 @@ +import json +import os +import sys import configparser -import numpy as np -import json, os, sys + from enum import Enum +import numpy as np + try: basestring except NameError: basestring = str -DATA_TYPES = {type(np.float16): 'f16', type(np.float32): 'f32', type(np.float64): 'f64'} +DATA_TYPES = { + type(np.float16): "f16", + type(np.float32): "f32", + type(np.float64): "f64" +} + class FromNPEncoder(json.JSONEncoder): + def default(self, obj): if isinstance(obj, np.integer): return int(obj) @@ -21,24 +31,27 @@ def default(self, obj): else: return super(FromNPEncoder, self).default(obj) -def write_output(result, instance_directory, fname='result.txt'): - with open('{}/{}'.format(instance_directory, fname), 'w') as f_out: + +def write_output(result, instance_directory, fname="result.txt"): + with open("{}/{}".format(instance_directory, fname), "w") as f_out: f_out.write("{}\n".format(result)) + def init(param_string, instance_directory, framework, out_dir_key): - #with open(param_file) as f_in: + # with open(param_file) as f_in: # hyper_parameter_map = json.load(f_in) hyper_parameter_map = json.loads(param_string.strip()) if not os.path.exists(instance_directory): os.makedirs(instance_directory) - hyper_parameter_map['framework'] = framework - hyper_parameter_map[out_dir_key] = '{}/output'.format(instance_directory) - hyper_parameter_map['instance_directory'] = instance_directory + hyper_parameter_map["framework"] = framework + hyper_parameter_map[out_dir_key] = "{}/output".format(instance_directory) + hyper_parameter_map["instance_directory"] = instance_directory return hyper_parameter_map + def is_numeric(val): try: float(val) @@ -46,8 +59,9 
@@ def is_numeric(val): except ValueError: return False + def format_params(hyper_parameter_map): - for k,v in hyper_parameter_map.items(): + for k, v in hyper_parameter_map.items(): vals = str(v).split(" ") if len(vals) > 1 and is_numeric(vals[0]): # assume this should be a list @@ -56,37 +70,42 @@ def format_params(hyper_parameter_map): else: hyper_parameter_map[k] = [int(x) for x in vals] + def write_params(params, hyper_parameter_map): - parent_dir = hyper_parameter_map['instance_directory'] if 'instance_directory' in hyper_parameter_map else '.' + parent_dir = (hyper_parameter_map["instance_directory"] + if "instance_directory" in hyper_parameter_map else ".") f = "{}/parameters.txt".format(parent_dir) - montr=[] # Monitor params + montr = [] # Monitor params with open(f, "w") as f_out: f_out.write("[Global Params]\n") - for k,v in params.items(): + for k, v in params.items(): if type(v) in DATA_TYPES: v = DATA_TYPES[type(v)] if isinstance(v, basestring): v = "'{}'".format(v) - if(k =='solr_root' or k == 'timeout' ): + if k == "solr_root" or k == "timeout": # this must written at the end - montr.append((k,v)) + montr.append((k, v)) else: f_out.write("{}={}\n".format(k, v)) f_out.write("[Monitor Params]\n") for kv in montr: f_out.write("{}={}\n".format(*kv)) + def keras_clear_session(framework): - if framework == 'keras': + if framework == "keras": # works around this error: # https://github.com/tensorflow/tensorflow/issues/3388 try: from tensorflow.keras import backend as K + K.clear_session() - except AttributeError: # theano does not have this function + except AttributeError: # theano does not have this function pass + class ModelResult(Enum): SUCCESS = 1 SKIP = 2 @@ -128,4 +147,4 @@ def main(): if __name__ == "__main__": - main() + main() diff --git a/workflows/common/python/utils.py b/workflows/common/python/utils.py index ac18f20c..139eeedb 100644 --- a/workflows/common/python/utils.py +++ b/workflows/common/python/utils.py @@ -1,4 +1,3 @@ - # UTILS PY import os @@ -12,13 +11,13 @@ def fail(*args): def fail1(message): - """ Fail with message, return exit code 1 """ + """Fail with message, return exit code 1.""" print(message) exit(1) def fail3(e, code, message): - """ Fail with message due to Exception e , return exit code """ + """Fail with message due to Exception e , return exit code.""" print(message) print(str(e)) exit(code) @@ -31,20 +30,21 @@ def avg(values): def append(filename, text): try: - with open(filename, 'a') as fp: + with open(filename, "a") as fp: fp.write(text) - fp.write('\n') + fp.write("\n") except Exception as e: - fail(e, os.EX_IOERR, 'Could not append to: ' + filename) + fail(e, os.EX_IOERR, "Could not append to: " + filename) + import re -class Matcher: - """ Abstract class for use with Grepper """ +class Matcher: + """Abstract class for use with Grepper.""" def __init__(self, regexp): - self.regexp = regexp + self.regexp = regexp self.pattern = re.compile(self.regexp) def match(self, line): @@ -54,25 +54,26 @@ def match(self, line): self.run(line) def run(self, line): - """ User code should override this """ + """User code should override this.""" pass def reset(self): - """ User code should override this """ + """User code should override this.""" pass class Grepper: def __init__(self, matchers): - """ matchers: List of Matchers """ + """matchers: List of Matchers""" self.matchers = matchers def grep(self, filename): with open(filename, "r") as fp: while True: line = fp.readline() - if len(line) == 0: break + if len(line) == 0: + break for matcher in 
self.matchers: matcher.match(line) @@ -82,12 +83,14 @@ def reset(self): def columnPrint(D, aligns): - """ D: a dict mapping a header string to a list of string data """ + """D: a dict mapping a header string to a list of string data""" """ aligns: a string "llrlr" for left or right alignment by column """ headers = D.keys() - assert len(aligns) == len(headers), \ - "Length of aligns (%i) does not match headers (%i)!" % \ - (len(aligns), len(headers)) + assert len(aligns) == len( + headers), "Length of aligns (%i) does not match headers (%i)!" % ( + len(aligns), + len(headers), + ) # Format specs for headers fmth = "" @@ -105,11 +108,11 @@ def columnPrint(D, aligns): # Header is always left-aligned fmth += "%%-%is " % maxstr sign = "-" if aligns[index] == "l" else "" - fmtd += "%%%s%is " % (sign, maxstr) + fmtd += "%%%s%is " % (sign, maxstr) index += 1 # Start printing print(fmth % tuple(headers)) - for i in range(0, maxlist-1): + for i in range(0, maxlist - 1): L = [] for header in headers: L.append(D[header][i]) diff --git a/workflows/common/sh/env-summit-i.sh b/workflows/common/sh/env-summit-i.sh index fb3d61c1..34afbbbb 100644 --- a/workflows/common/sh/env-summit-i.sh +++ b/workflows/common/sh/env-summit-i.sh @@ -74,4 +74,3 @@ then export TURBINE_RESIDENT_WORK_WORKERS=1 export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) fi - diff --git a/workflows/common/sh/langs-app-local.sh b/workflows/common/sh/langs-app-local.sh index b4cac56f..855a25b7 100644 --- a/workflows/common/sh/langs-app-local.sh +++ b/workflows/common/sh/langs-app-local.sh @@ -14,4 +14,3 @@ APP_PYTHONPATH=${APP_PYTHONPATH:-} PYTHONPATH+=":$APP_PYTHONPATH" export PYTHONPATH - diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 8f137dbe..12536259 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -87,7 +87,7 @@ then # No model_runner, need to write parameters.txt explicitly: # get hyper_parameter_map to pass as 2nd argument - + python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params $PARAMS $INIT_PARAMS_FILE MODEL_CMD=( singularity exec --nv $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET $CANDLE_DATA_DIR $INSTANCE_DIRECTORY/parameters.txt ) @@ -101,7 +101,19 @@ then # RESULT=$( sed -n '/val_loss:/{s/val_loss: \(.*\)/\1/;p}' | tail -1 ) # log "found result: $RESULT" # echo $RESULT > $INSTANCE_DIRECTORY/result.txt - echo $MODEL_CMD + + + # TODO: Add wait for the above and standardize getting results from container. + echo $MODEL_CMD & + PID=$! + # FIX: This doesn't work. 
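+      # NOTE: "echo $MODEL_CMD &" only prints the command string in the
+      # background, so $PID belongs to the echo process rather than to the
+      # model run, and the wait below returns immediately.  A minimal sketch
+      # of the intended launch-and-wait, assuming MODEL_CMD is the array
+      # built above and the container writes its output to model.log:
+      #   "${MODEL_CMD[@]}" > model.log 2>&1 &
+      #   PID=$!
+      #   wait $PID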
+ wait $PID + + + # get results of the format Loss: xxx last occurence of in the model.log file + RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1) + echo $RESULT > $INSTANCE_DIRECTORY/result.txt + else # "BENCHMARKS" # The Python command line arguments: diff --git a/workflows/common/sh/run_logger.sh b/workflows/common/sh/run_logger.sh index e0dd2458..40b2d700 100644 --- a/workflows/common/sh/run_logger.sh +++ b/workflows/common/sh/run_logger.sh @@ -17,7 +17,7 @@ fi # "start" propose_points, max_iterations, ps, algorithm, exp_id, sys_env if [ $CMD == "start" ] - then + then SITE=$9 source $WORKFLOWS_ROOT/common/sh/utils.sh source_site langs-app $SITE diff --git a/workflows/common/sh/sched-local-as.sh b/workflows/common/sh/sched-local-as.sh index c5625211..25294442 100644 --- a/workflows/common/sh/sched-local-as.sh +++ b/workflows/common/sh/sched-local-as.sh @@ -8,4 +8,4 @@ MACHINE="" export LOCAL true # Default PROJECT for CANDLE -export PROJECT=NONE \ No newline at end of file +export PROJECT=NONE diff --git a/workflows/common/sh/sched-local.sh b/workflows/common/sh/sched-local.sh index c5625211..25294442 100644 --- a/workflows/common/sh/sched-local.sh +++ b/workflows/common/sh/sched-local.sh @@ -8,4 +8,4 @@ MACHINE="" export LOCAL true # Default PROJECT for CANDLE -export PROJECT=NONE \ No newline at end of file +export PROJECT=NONE diff --git a/workflows/common/sh/sched-summit-tf2.sh b/workflows/common/sh/sched-summit-tf2.sh index fe965bff..48038d49 100644 --- a/workflows/common/sh/sched-summit-tf2.sh +++ b/workflows/common/sh/sched-summit-tf2.sh @@ -17,4 +17,3 @@ fi export PROJECT=${PROJECT:-MED106} # export TURBINE_OUTPUT_SOFTLINK=/dev/null - diff --git a/workflows/common/swift/candle_utils.swift b/workflows/common/swift/candle_utils.swift index afaf494f..9f033b05 100644 --- a/workflows/common/swift/candle_utils.swift +++ b/workflows/common/swift/candle_utils.swift @@ -9,7 +9,7 @@ puts "" puts "report_env() ..." 
puts "" global env -# puts [ array names env ] +# puts [ array names env ] puts "TURBINE_HOME: $env(TURBINE_HOME)" puts "" set tokens [ split $env(PATH) ":" ] diff --git a/workflows/common/swift/obj_abstention_py.swift b/workflows/common/swift/obj_abstention_py.swift index fe65c2a2..89075f72 100644 --- a/workflows/common/swift/obj_abstention_py.swift +++ b/workflows/common/swift/obj_abstention_py.swift @@ -8,8 +8,8 @@ string code_template = try: import sys, traceback, json, os import model_abstention_runner - import tensorflow - from tensorflow import keras + import tensorflow + from tensorflow import keras obj_result = '-100' outdir = '%s' diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index 66a4a40b..315790af 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/obj_app.swift @@ -13,12 +13,16 @@ string turbine_output = getenv("TURBINE_OUTPUT"); string outdir; - string myenv = getenv("CANDLE_MODEL_TYPE"); - if (myenv == "SINGULARITY") { - outdir = "%s/run/%s" % (turbine_output, run_id); - } else { - outdir = "%s/output/%s/run/%s" % (getenv("CANDLE_DATA_DIR"), getenv("EXPID"), run_id); - } + + outdir = "%s/run/%s" % (turbine_output, run_id); + + // Comment: this is not needed as turbine_output has already been adjusted + // string myenv = getenv("CANDLE_MODEL_TYPE"); + // if (myenv == "SINGULARITY") { + // outdir = "%s/run/%s" % (turbine_output, run_id); + // } else { + // // outdir = "%s/output/%s/run/%s" % (getenv("CANDLE_DATA_DIR"), getenv("EXPID"), run_id); + // } printf("running model shell script in: %s", outdir); diff --git a/workflows/cp-leaveout/README-chained.md b/workflows/cp-leaveout/README-chained.md index 64d4d335..102a8e46 100644 --- a/workflows/cp-leaveout/README-chained.md +++ b/workflows/cp-leaveout/README-chained.md @@ -1,41 +1,40 @@ -# Challenge Problem: Leave Out - Job Chained Workflow # +# Challenge Problem: Leave Out - Job Chained Workflow This workflow runs the CP Leave Out workflow using job chaining. Each stage of the workflow will be submitted as a separate job where subsequent stages are only run when the previous job on which they depend has successfully completed. -For example, if the workflow configuration consists of an initial 4 Uno model runs, and a -subsequent 16 model runs where each of those model runs require the trained weights -of one of the initial 4 as input, then the first 4 will be submitted as a job, and -the second 16 as a job that will only begin running when the first has successfully +For example, if the workflow configuration consists of an initial 4 Uno model runs, and a +subsequent 16 model runs where each of those model runs require the trained weights +of one of the initial 4 as input, then the first 4 will be submitted as a job, and +the second 16 as a job that will only begin running when the first has successfully completed. ## Requirements -* Check out Benchmarks branch loocv into a compute-node writeable directory, - e.g., /gpfs/alpine/med106/scratch/$USER - * Edit uno_baseline_keras2.py to replace uno_default_model.txt with uno_auc_model.txt - * Set `BENCHMARKS_ROOT` in your submission script (see below), -e.g., test-1.sh, to this compute node writable Benchmarks directory. -* The following data files are required: - * A plan json file (e.g., `plangen_cell1593-p4_drug1779-p1.json`) - * A dataframe file (e.g., `top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather`), a feather or parquet - file will be faster. 
+- Check out Benchmarks branch loocv into a compute-node writeable directory,
+  e.g., /gpfs/alpine/med106/scratch/$USER
+  - Edit uno_baseline_keras2.py to replace uno_default_model.txt with uno_auc_model.txt
+  - Set `BENCHMARKS_ROOT` in your submission script (see below),
+    e.g., test-1.sh, to this compute node writable Benchmarks directory.
+- The following data files are required:
+  - A plan json file (e.g., `plangen_cell1593-p4_drug1779-p1.json`)
+  - A dataframe file (e.g., `top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather`), a feather or parquet
+    file will be faster.

+## Running the Workflow

-## Running the Workflow ##
-
-Sample files for configuring and running the workflow are in the `test-chained` directory.
+Sample files for configuring and running the workflow are in the `test-chained` directory.
 The workflow itself is launched using the python script `py/run_chained.py`. Essentially,
 `run_chained.py` does the following:

 1. Reads a configuration file specifying what data files to use, how many stages to run,
-and how to configure each of those stages (e.g. PROCS, WALLTIME, etc.),
+   and how to configure each of those stages (e.g. PROCS, WALLTIME, etc.),
 2. Generates a UPF file for each stage where each UPF file contains the node ids to run for that stage,
-3. Runs each stage as a separate UPF-style workflow job, managing the job and parent model weight location dependencies appropriately.
+3. Runs each stage as a separate UPF-style workflow job, managing the job and parent model weight location dependencies appropriately.

 Each individual stage job submission launched by `run_chained.py` follows the pattern of the other Supervisor workflows where
-a *test* submission script is executed which in turn sources *sys* and *prm* configurations, and then
-calls another script (e.g., `swift/cpl-upf-workflow.sh`) that performs further configuration and executes the swift script
+a _test_ submission script is executed which in turn sources _sys_ and _prm_ configurations, and then
+calls another script (e.g., `swift/cpl-upf-workflow.sh`) that performs further configuration and executes the swift script
 (e.g., `swift/cpl-upf-workflow.swift`).

 `run_chained.py` performs this individual job submission for each stage by:
@@ -53,17 +52,17 @@ usage: run_chained.py [-h] --config CONFIG [--stages STAGES] [--dry_run]
                       [--first_stage_parent_directory FIRST_STAGE_PARENT_DIRECTORY]
 ```

-* --config - the path of the workflow configuration file
-* --stages - the number of stages to run. This will override the value specified in the configuration file
-* --dry_run - executes the workflow, displaying the configuration for each stage, but does **not** submit any jobs
-* --first_stage - the stage at which to start the workflow. The stage count starts with *1* and a `first_stage` of *1* corresponds to the initial parentless stage. This will override the value specified in the configuration file
-* --first_stage_parent_directory - the file system location of the first stage's parent stage, when `first_stage` is greater than 1. This will override the value specified in the configuration file
+- --config - the path of the workflow configuration file
+- --stages - the number of stages to run. This will override the value specified in the configuration file
+- --dry_run - executes the workflow, displaying the configuration for each stage, but does **not** submit any jobs
+- --first_stage - the stage at which to start the workflow. The stage count starts with _1_ and a `first_stage` of _1_ corresponds to the initial parentless stage. This will override the value specified in the configuration file
+- --first_stage_parent_directory - the file system location of the first stage's parent stage, when `first_stage` is greater than 1. This will override the value specified in the configuration file

 Of these only `--config` is required.

 The `first_stage` argument can be used to continue a previously run job chaining workflow.
 For example, if the previous workflow ran stages 1 and 2. Then a `first_stage` argument of 3 and
-a `first_stage_parent_directory` argument that points to the experiment directory of the previously run stage 2 will continue the previous workflow starting at stage 3.
+a `first_stage_parent_directory` argument that points to the experiment directory of the previously run stage 2 will continue the previous workflow starting at stage 3.

 `run_chained.py` should be run from within the test-chained directory.

@@ -71,24 +70,23 @@ a `first_stage_parent_directory` argument that points to the experiment director

 The configuration file has the following json format (see `test-chained/cfg.json` for an example):

-* site: the name of the hpc site (e.g. "summit")
-* plan: the path to the challenge problem leave one out plan file
-* submit_script: the script used for the individual stage job submission (e.g. test-chained/test-1.sh)
-* upf_directory: the directory where the upf files are written out to
-* stages: the number of stages to run. -1 = run all the stages
-* first_stage: the stage at which to start the workflow. A value of 1 means the initial parentless stage.
-* first_stage_parent_directory: the file system location of the first stage's parent stage, when `first_stage` is greater than 1.
-* stage_cfg_script: the staget configuration script (e.g. `test-chained/cfg-stage-sys.sh`) sourced by the
-submit script to set the configuration (WALLTIME etc.) for each individual stage run.
-Environment variables specified in the "stage_cfgs" (see below) will override those in this file.
-* stage_cfgs: a list of optional stage configurations, where each configuration is a json map. By default, if no
-stage configuration is defined for a particular stage or PROCS and PPN are not defined in that
-stage configuration, then PROCS will be set to the number of plan nodes to run (i.e., the length of the UPF file) + 1 and PPN will be set to 1. In this way, the default is to run all the Uno model runs
-concurrently. For the other environment variables in a stage configuration, the defaults in the
-stage_cfg_script will be used. All the key value pairs in a stage configuration except for *stage* are preserved as environment variables when the submit_script is called and will override those (e.g., WALLTIME, etc.) in the stage_cfg_script. A stage configuration map can have the following entries.
- * stage: the stage number
- * X: where X is an environment variable from the stage_cfg_script, e.g. WALLTIME, PROCS, PPN, etc.
-
+- site: the name of the hpc site (e.g. "summit")
+- plan: the path to the challenge problem leave one out plan file
+- submit_script: the script used for the individual stage job submission (e.g. test-chained/test-1.sh)
+- upf_directory: the directory where the upf files are written out to
+- stages: the number of stages to run. -1 = run all the stages
+- first_stage: the stage at which to start the workflow. A value of 1 means the initial parentless stage.
+- first_stage_parent_directory: the file system location of the first stage's parent stage, when `first_stage` is greater than 1.
+- stage_cfg_script: the stage configuration script (e.g. `test-chained/cfg-stage-sys.sh`) sourced by the
+  submit script to set the configuration (WALLTIME etc.) for each individual stage run.
+  Environment variables specified in the "stage_cfgs" (see below) will override those in this file.
+- stage_cfgs: a list of optional stage configurations, where each configuration is a json map. By default, if no
+  stage configuration is defined for a particular stage or PROCS and PPN are not defined in that
+  stage configuration, then PROCS will be set to the number of plan nodes to run (i.e., the length of the UPF file) + 1 and PPN will be set to 1. In this way, the default is to run all the Uno model runs
+  concurrently. For the other environment variables in a stage configuration, the defaults in the
+  stage_cfg_script will be used. All the key value pairs in a stage configuration except for _stage_ are preserved as environment variables when the submit_script is called and will override those (e.g., WALLTIME, etc.) in the stage_cfg_script. A stage configuration map can have the following entries.
+  - stage: the stage number
+  - X: where X is an environment variable from the stage_cfg_script, e.g. WALLTIME, PROCS, PPN, etc.

 ### An Example Run

@@ -114,9 +112,9 @@ Resovled Stage Configuration:
     PPN: 1
     WALLTIME: 01:00:00
     TURBINE_DIRECTIVE: \n#BSUB -alloc_flags "NVME maximizegpfs"\n## JOB 0
-    TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1
+    TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1
     BENCHMARK_TIMEOUT: -1
-    SH_TIMEOUT:
+    SH_TIMEOUT:
     IGNORE_ERRORS: 0
 CPL-UPF-WORKFLOW.SH: Running model: uno for EXPID: X134
 sourcing /autofs/nccs-svm1_proj/med106/ncollier/repos/Supervisor/workflows/common/sh/env-summit.sh
@@ -157,9 +155,9 @@ Resovled Stage Configuration:
     PPN: 1
     WALLTIME: 00:45:00
     TURBINE_DIRECTIVE: \n#BSUB -alloc_flags "NVME maximizegpfs"\n#BSUB -w done(704496)
-    TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1
+    TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1
     BENCHMARK_TIMEOUT: -1
-    SH_TIMEOUT:
+    SH_TIMEOUT:
     IGNORE_ERRORS: 0
 CPL-UPF-WORKFLOW.SH: Running model: uno for EXPID: X135
 sourcing /autofs/nccs-svm1_proj/med106/ncollier/repos/Supervisor/workflows/common/sh/env-summit.sh
@@ -215,9 +213,9 @@ Resovled Stage Configuration:
     PPN: 1
     WALLTIME: 01:00:00
     TURBINE_DIRECTIVE: \n#BSUB -alloc_flags "NVME maximizegpfs"\n## JOB 0
-    TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1
+    TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1
     BENCHMARK_TIMEOUT: -1
-    SH_TIMEOUT:
+    SH_TIMEOUT:
     IGNORE_ERRORS: 0
@@ -228,8 +226,8 @@ Resovled Stage Configuration:
     PPN: 1
     WALLTIME: 00:45:00
     TURBINE_DIRECTIVE: \n#BSUB -alloc_flags "NVME maximizegpfs"\n#BSUB -w done()
-    TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1
+    TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1
     BENCHMARK_TIMEOUT: -1
-    SH_TIMEOUT:
+    SH_TIMEOUT:
     IGNORE_ERRORS: 0
-```
\ No newline at end of file
+```
diff --git a/workflows/cp-leaveout/db/README.adoc b/workflows/cp-leaveout/db/README.adoc
index 13608c11..770c5779 100644
--- a/workflows/cp-leaveout/db/README.adoc
+++ b/workflows/cp-leaveout/db/README.adoc
@@ -35,4 +35,3 @@ Reset (delete) DB nodes, forcing them to be re-run
 ----
 $ db/reset-node.sh experiments/X085/restarts-1/cplo.db 1.2.3.2
 ----
-
diff --git a/workflows/cp-leaveout/db/diff-dbs.sh b/workflows/cp-leaveout/db/diff-dbs.sh
index a208750e..78026b08 100755
--- a/workflows/cp-leaveout/db/diff-dbs.sh
+++ b/workflows/cp-leaveout/db/diff-dbs.sh
@@ -23,4 +23,3 @@ sqlite3 $DB2 < $THIS/print-db.sql > $TXT2
 diff $TXT1 $TXT2

 rm $TXT1 $TXT2
-
diff --git a/workflows/cp-leaveout/db/reset-node.sh b/workflows/cp-leaveout/db/reset-node.sh
index fe52381f..eba72930 100755
--- a/workflows/cp-leaveout/db/reset-node.sh
+++ b/workflows/cp-leaveout/db/reset-node.sh
@@ -19,5 +19,3 @@ EOF
 # UPDATE runhist SET status="RESET" WHERE (subplan_id LIKE "${NODE}%") ;
 # EOF
-
-
diff --git a/workflows/cp-leaveout/py/README.md b/workflows/cp-leaveout/py/README.md
index b195320b..d4f6a18f 100644
--- a/workflows/cp-leaveout/py/README.md
+++ b/workflows/cp-leaveout/py/README.md
@@ -1,36 +1,38 @@
-# Uno: Milestone 13 Transfer Learning
+# Uno: Milestone 13 Transfer Learning
+
 This README discusses the use of the `plangen.py` script to partition feature sets for experiments with large scale transfer learning and parallel model training. The utility does the following:

-* Accept text files containing lists of feature names of arbitray length, each is called a feature-set
-* Generate unique combinations of features from each feature set, setting the stage for transfer learning (partitioning)
-* Construct a tree depicting how successive, parallel training sessions can be scheduled upon the completion of a predecessor/parent (planning)
+- Accept text files containing lists of feature names of arbitrary length, each of which is called a feature-set
+- Generate unique combinations of features from each feature set, setting the stage for transfer learning (partitioning)
+- Construct a tree depicting how successive, parallel training sessions can be scheduled upon the completion of a predecessor/parent (planning)

 ## Overview
+
 A number of partitioning schemes and data representation strategies have been discussed. The focus here is the configuration agreed upon at the May 2019 CANDLE hack-a-thon. Specifically:

-* There are two feature sets, cell-lines and drugs.
-* In a prototype implementation, each feature-set will contain eight entries. The target configuration will have 1000 cell features and 1000 drug features.
-* Partitioning is accomplished by recursively splitting the cell vs drug graph into quadrants.
-* Each such partitioning presents four training opportunities, each uniquely combines three quadrants and omits one.
-* The omitted quadrant defines validation data for the training run. Partitioning/planning recurs on this quadrant to define successors.
-* The four training operations can be scheduled to run in parallel once the training of their common parent completes.
-* The partitioning scheme as well as the training parent/child relationships will be expressed in a JSON document.
+- There are two feature sets, cell-lines and drugs.
+- In a prototype implementation, each feature-set will contain eight entries. The target configuration will have 1000 cell features and 1000 drug features.
+- Partitioning is accomplished by recursively splitting the cell vs drug graph into quadrants.
+- Each such partitioning presents four training opportunities, each uniquely combines three quadrants and omits one.
+- The omitted quadrant defines validation data for the training run. Partitioning/planning recurs on this quadrant to define successors.
+- The four training operations can be scheduled to run in parallel once the training of their common parent completes.
+- The partitioning scheme as well as the training parent/child relationships will be expressed in a JSON document.

 ## Running the script

-`plangen.py` arguments are defined in `planargs.py`. `sample-command-line` is a script that demonstrates the parameters used to accomplish the objectives outlined above. Refer to that sample when reading the argument descriptions below. `--help` gives a brief summary of all arguments.
+`plangen.py` arguments are defined in `planargs.py`. `sample-command-line` is a script that demonstrates the parameters used to accomplish the objectives outlined above. Refer to that sample when reading the argument descriptions below. `--help` gives a brief summary of all arguments.

 The critical parameters are `--fs_names`, `--fs_paths` and `--fs_parts`. In each, `fs` stands for feature_set. Each parameter is required and each must specify the same number of values. `--fs_names` takes two or more values providing feature set names such as `cells` and `drugs`.

 `fs_paths` takes path specifications for the corresponding feature-set files. All of the usual file search rules apply, they can be relative or absolute paths. Optionally, `--in_dir` can be used to provide common high-level qualification.
+`fs_parts` defines the partitioning scheme for each of the feature-sets. So in our scenario above, `--fs_parts 2 2` specifies that at each iteration, both the `cells` and `drugs` feature-sets will be halved, giving the quadrants discussed above at each iteration. Non-symmetric partitioning may prove useful when the number of feature-set line items diverges from the "square" model.

 `--in_dir` is optional. It can be used to simplify the coding of `--fs_paths` path names. The rules of os.path.join() apply.

 `--out_dir` is optional. It can be used to place output files, the JSON format plan in particular, to a specific directory.

-`--debug` is optional. If specified, the final plan dictionary is pretty-printed. This is quite a bit easier to read than the JSON file.
+`--debug` is optional. If specified, the final plan dictionary is pretty-printed. This is quite a bit easier to read than the JSON file.

 `--test` is optional. If specified, a demonstration of the plan tree navigation API is run. See below.

@@ -40,7 +42,7 @@ plangen.cell8-p2.drug8-p2.json is a sample plan constructed using cell and drug

 ## Plan tree navigation and content retrieval

-Given a JSON-format "plan tree" generated by `plangen.py` and loaded by `load_plan()`, the navigation and retrieval functions described below are used to navigate predecessor/successor (i.e. parent/child) relationships useful for synchronizing transfer learning training suites. A plan tree is a true tree. It has a single "root" node at its origin and any number of successor nodes. The root is the predecessor of these successors. Every node in the tree, except for the root, has a single predecessor and zero or more successors. In a transfer learning environment, the successors of a given training session inherit the model of their predecessor once that predecessor completes.
+Given a JSON-format "plan tree" generated by `plangen.py` and loaded by `load_plan()`, the navigation and retrieval functions described below are used to navigate predecessor/successor (i.e. parent/child) relationships useful for synchronizing transfer learning training suites. A plan tree is a true tree. It has a single "root" node at its origin and any number of successor nodes. The root is the predecessor of these successors. Every node in the tree, except for the root, has a single predecessor and zero or more successors. In a transfer learning environment, the successors of a given training session inherit the model of their predecessor once that predecessor completes. Each plan tree node is named (it is a dictionary) - the root node is usually named '1' and its value is a dictionary of metadata including the arguments that were used to generate it. Use the `get_node()` function without the `node_name` argument to acquire the root name and its associated metadata. All successor node names are derived from their parent by appending sequence numbers separated by a delmiter - but this is of no concern to a program navigating the tree, the names are opaque. @@ -53,7 +55,6 @@ Each plan tree node is named (it is a dictionary) - the root node is usually nam ## Contact Richard Turgeon - -Created: 2019-06-07 + +Created: 2019-06-07 Modified: 2019-06-18 - diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 953d0d20..c77589d4 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -1,28 +1,33 @@ - # DATA SETUP PY -import datetime, os, sys, time - -from pathlib import Path +import datetime +import os +import sys +import time import traceback -from runner_utils import ModelResult +from pathlib import Path + import topN_to_uno +from runner_utils import ModelResult + class TopN_Args: + def __init__(self, dataframe_from, node, plan, output): self.dataframe_from = dataframe_from self.node = node self.plan = plan self.fold = None - self.incremental = 'True' + self.incremental = "True" self.cell_feature_selection = None self.drug_feature_selection = None self.output = output + def setup_nvm(params): # username = os.environ['USER'] # No longer works on Summit 2021-10-13 username = params["user"] - nvme_enabled = Path('/mnt/bb/{}'.format(username)).exists() + nvme_enabled = Path("/mnt/bb/{}".format(username)).exists() # nvme_enabled = True print("NVMe: %r" % nvme_enabled) if not nvme_enabled: @@ -36,7 +41,7 @@ def setup_nvm(params): count = dest.write_bytes(src.read_bytes()) stop = time.time() duration = stop - start - rate = count / duration / (1024*1024) + rate = count / duration / (1024 * 1024) print("File copy completed. Original dataframe " + "copied to NVM in %0.1f seconds (%0.1f MB/s)." % (duration, rate)) @@ -44,8 +49,7 @@ def setup_nvm(params): print("File copy skipped. " + "Original dataframe already exists in NVM.") except Exception as e: - print("Error occurred in copying original dataframe\n" + - str(e)) + print("Error occurred in copying original dataframe\n" + str(e)) traceback.print_exc() return ModelResult.ERROR params["dataframe_from"] = dest.resolve() @@ -55,21 +59,25 @@ def setup_nvm(params): def pre_run(params): - import sys, time + import sys + import time + print("data_setup.pre_run(): node: '%s' ..." 
% params["node"]) sys.stdout.flush() # softlink to cache & config file # build node specific training/validation dataset - args = TopN_Args(params["dataframe_from"], - params["node"], - params["plan"], - params["use_exported_data"]) + args = TopN_Args( + params["dataframe_from"], + params["node"], + params["plan"], + params["use_exported_data"], + ) data = params["benchmark_data"] try: - for filename in [ "uno_auc_model.txt" ]: # "cache", + for filename in ["uno_auc_model.txt"]: # "cache", if not os.path.islink(filename): src = f"{data}/{filename}" print("data_setup: src: (%s)" % src) @@ -84,8 +92,7 @@ def pre_run(params): return ModelResult.ERROR try: - print("data_setup: build_dataframe(output=%s) ..." % - args.output) + print("data_setup: build_dataframe(output=%s) ..." % args.output) sys.stdout.flush() if not os.path.exists(args.output): params = setup_nvm(params) @@ -112,12 +119,11 @@ def pre_run(params): directory = params["instance_directory"] with open(directory + "/NO-DATA.txt", "a") as fp: ts = datetime.datetime.now() - iso = ts.isoformat(sep=' ', timespec='seconds') + iso = ts.isoformat(sep=" ", timespec="seconds") fp.write(iso + "\n") return ModelResult.SKIP except ValueError: - print("data_setup: caught ValueError for node: '%s'" % - params["node"]) + print("data_setup: caught ValueError for node: '%s'" % params["node"]) sys.stdout.flush() traceback.print_exc(file=sys.stdout) return ModelResult.ERROR @@ -131,6 +137,7 @@ def pre_run(params): sys.stdout.flush() return ModelResult.SUCCESS + def post_run(params, output_dict): print("data_setup(): post_run") sys.stdout.flush() diff --git a/workflows/cp-leaveout/py/planargs.py b/workflows/cp-leaveout/py/planargs.py index 27a1cd60..2bb6b785 100644 --- a/workflows/cp-leaveout/py/planargs.py +++ b/workflows/cp-leaveout/py/planargs.py @@ -1,90 +1,107 @@ -""" -plangen command line argument definitions -""" +"""plangen command line argument definitions.""" +import argparse +import glob import os import sys -import glob -import argparse -partitioning_strategies = ['leaveout', 'undefined1', 'undefined2'] # to be completed ????????????? +partitioning_strategies = [ + "leaveout", + "undefined1", + "undefined2", +] # to be completed ????????????? 
+ def parse_arguments(): - parser = argparse.ArgumentParser( - description='feature-set partioning' - ) + parser = argparse.ArgumentParser(description="feature-set partioning") - parser.add_argument('--in_dir', + parser.add_argument("--in_dir", type=str, - help='Directory containing feature-set list files') + help="Directory containing feature-set list files") - parser.add_argument('--out_dir', - default='results', - type=str, - help='Directory to contain generated plan files') + parser.add_argument( + "--out_dir", + default="results", + type=str, + help="Directory to contain generated plan files", + ) - parser.add_argument('--json', - action='store_true', - help='Generate plan in JSON format') + parser.add_argument("--json", + action="store_true", + help="Generate plan in JSON format") - parser.add_argument('--overwrite', - action='store_true', - help='Accept non-empty out_dir, contents overwritten') + parser.add_argument( + "--overwrite", + action="store_true", + help="Accept non-empty out_dir, contents overwritten", + ) - parser.add_argument ('--partition_strategy', - choices=partitioning_strategies, - default=partitioning_strategies[0], - help='Specify a feature-set partitioning strategy') + parser.add_argument( + "--partition_strategy", + choices=partitioning_strategies, + default=partitioning_strategies[0], + help="Specify a feature-set partitioning strategy", + ) # The following fs_* arguments are required, the number of values specified for each - # must match, and at least two values are required for each - - parser.add_argument('--fs_names', - required=True, - type=str, - nargs='+', - help='Specify a list of (arbitrary) feature-set names') + # must match, and at least two values are required for each + + parser.add_argument( + "--fs_names", + required=True, + type=str, + nargs="+", + help="Specify a list of (arbitrary) feature-set names", + ) - parser.add_argument('--fs_paths', - required=True, - type=str, - nargs='+', - help='Specify a list of feature-set file paths') + parser.add_argument( + "--fs_paths", + required=True, + type=str, + nargs="+", + help="Specify a list of feature-set file paths", + ) - parser.add_argument('--fs_parts', - required=True, - type=int, - nargs='+', - help='Specify a list of partition counts') + parser.add_argument( + "--fs_parts", + required=True, + type=int, + nargs="+", + help="Specify a list of partition counts", + ) - parser.add_argument('--first_parts', - required=False, - type=int, - nargs='+', - help='Optionally, specify a list of first pass partition counts') + parser.add_argument( + "--first_parts", + required=False, + type=int, + nargs="+", + help="Optionally, specify a list of first pass partition counts", + ) - # misc + # misc - parser.add_argument('--maxdepth', - type=int, - default=0, - help='Apply a constraint to the plan tree depth') + parser.add_argument( + "--maxdepth", + type=int, + default=0, + help="Apply a constraint to the plan tree depth", + ) - parser.add_argument('--verbose', - action='store_true', - help='Verbosity') + parser.add_argument("--verbose", action="store_true", help="Verbosity") - parser.add_argument('--debug', - action='store_true', - help='Show complete plan tree structure') + parser.add_argument("--debug", + action="store_true", + help="Show complete plan tree structure") - parser.add_argument('--print_tree', - action='store_true', - help='Dump the complete plan tree - potentially lengthy!') + parser.add_argument( + "--print_tree", + action="store_true", + help="Dump the complete plan tree - potentially 
lengthy!", + ) - parser.add_argument('--test', - action='store_true', - help='Test plan navigation and entry retrieval') + parser.add_argument("--test", + action="store_true", + help="Test plan navigation and entry retrieval") - args= parser.parse_args() + args = parser.parse_args() return args diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index 43554b06..c5275cb0 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -1,33 +1,29 @@ - -from collections import deque -from collections import namedtuple -from enum import Enum import glob import itertools as it import json -import numpy as np import os -import sys import sqlite3 -from sqlite3 import Error as db_Error +import sys import traceback +from abc import ABC, abstractmethod # abstract class support +from collections import OrderedDict, deque, namedtuple +from datetime import datetime +from enum import Enum +from pprint import pprint as pp +from sqlite3 import Error as db_Error +import numpy as np import planargs - -from abc import ABC, abstractmethod # abstract class support -from collections import OrderedDict from scipy.special import comb -from pprint import pprint as pp -from datetime import datetime -ISO_TIMESTAMP = "seconds" # timestamp to ISO string -ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp +ISO_TIMESTAMP = "seconds" # timestamp to ISO string +ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp DEBUG_SQL = True def isempty(path): """Determine whether the given directory is empty.""" - flist = glob.glob(os.path.join(path,'*')) + flist = glob.glob(os.path.join(path, "*")) return flist == [] @@ -85,7 +81,9 @@ def validate_args(args): reqd_lengths = [nbr_feature_sets] * 4 if test_lengths != reqd_lengths: - sys.exit("Error: The lengths of all feature set definition args (fs_<>) must be identical") + sys.exit( + "Error: The lengths of all feature set definition args (fs_<>) must be identical" + ) if nbr_feature_sets <= 1: sys.exit("Error: Partitioning requires multiple feature sets") @@ -98,13 +96,19 @@ def validate_args(args): # validate input and output directories if args.in_dir and not os.path.isdir(args.in_dir): - sys.exit("Error: --in_dir must designate a directory, '%s' is not valid" % args.in_dir) + sys.exit( + "Error: --in_dir must designate a directory, '%s' is not valid" % + args.in_dir) if not os.path.isdir(args.out_dir): - sys.exit("Error: --out_dir must designate a directory, '%s' is not valid" % args.out_dir) + sys.exit( + "Error: --out_dir must designate a directory, '%s' is not valid" % + args.out_dir) if not args.overwrite and not isempty(args.out_dir): - sys.exit("Error: --out_dir '%s' is not empty, --overwrite not specified" % args.out_dir) + sys.exit( + "Error: --out_dir '%s' is not empty, --overwrite not specified" % + args.out_dir) if verbose: print("Writing plan information to %s" % os.path.abspath(args.out_dir)) @@ -114,7 +118,7 @@ def validate_args(args): args.fs_lines = [] file_error = False if args.in_dir == None: - args.in_dir = '' # prepare for use in os.path.join() + args.in_dir = "" # prepare for use in os.path.join() for i, path in enumerate(args.fs_paths): fullpath = os.path.join(args.in_dir, path) @@ -122,23 +126,23 @@ def validate_args(args): file_error = True print("Error: %s file not found" % fullpath) else: - with open(fullpath, 'r') as f: # read text and sanitize + with open(fullpath, "r") as f: # read text and sanitize raw_lines = f.readlines() text = [line.strip() for 
line in raw_lines] - text = [l for l in text if l != ''] + text = [l for l in text if l != ""] fs_content.append(text) args.fs_lines.append(len(text)) if verbose: - print("Loading '%s' feature set definition from %s - %d lines" - % (args.fs_names[i], fullpath, len(text))) + print("Loading '%s' feature set definition from %s - %d lines" % + (args.fs_names[i], fullpath, len(text))) if file_error: sys.exit("Terminating due to error") # construct a partitioning object exporting a partion() function - if args.partition_strategy == 'leaveout': + if args.partition_strategy == "leaveout": generator = LeaveoutSubsetGenerator() # return feature-set contents lists @@ -157,47 +161,40 @@ class SubsetGenerator(ABC): partitioning schemes. Subclasses should implement their specializations. """ - def __init__(self, name=''): + def __init__(self, name=""): self.name = name self.term_msg = "Terminating due to error" @abstractmethod - def partition( - self, - base, - size=None, - count=None, - name='-unspecified-' - ): + def partition(self, base, size=None, count=None, name="-unspecified-"): """Partition a feature-set array. - Partition the 'base', a list of elements, using the abstract arguments - 'size' and 'count' to tailor the implementation's algorithm. 'name' is - used in error reporting and is optional. + Partition the 'base', a list of elements, using the abstract + arguments 'size' and 'count' to tailor the implementation's + algorithm. 'name' is used in error reporting and is optional. """ validate(self, base, size, count, name) return [] def get_plan_label(self, plan_dict, root_name): root = plan_dict[root_name] - return root['label'] + return root["label"] - def _validation_error(self, base_len, size, count, name='-unspecified-'): - """Provide a common error reporting function. """ + def _validation_error(self, base_len, size, count, name="-unspecified-"): + """Provide a common error reporting function.""" print("Base list length: %d requested %d sublists of length %d" % - (base_len, count, size)) + (base_len, count, size)) - def validate(self, base, size=None, count=None, name='-unspecified-'): + def validate(self, base, size=None, count=None, name="-unspecified-"): """Provide basic request validation, specific generators may impose - additional requirements. - """ + additional requirements.""" berror = False base_len = len(base) if size == None or size <= 0 or size > base_len: berror = True else: - unique_combos = comb(base_len, size) # implements N take K + unique_combos = comb(base_len, size) # implements N take K if count > unique_combos: berror = True if berror: @@ -205,22 +202,27 @@ def validate(self, base, size=None, count=None, name='-unspecified-'): return not berror + # # UNDER EVALUATION ????????????????????????????????????????????????????? # + class IterativeSubsetGenerator(SubsetGenerator): - """ Tom Brettin method... subset generation via iteration over base""" + """Tom Brettin method... 
+ + subset generation via iteration over base + """ + def __init__(self): - SubsetGenerator.__init__(self, 'IterativeSubsetGenerator') + SubsetGenerator.__init__(self, "IterativeSubsetGenerator") def partition(self, base, size=None, count=0, name=None): - """ """ + """""" if size is None: print("Error: Unspecified list partitioning size") sys.exit(3) - """ base_len = len(base) if count == 0: # a simplification useful in the iterative approach @@ -248,12 +250,13 @@ def partition(self, base, size=None, count=0, name=None): if org >= base_len: org = org % base_len if org == 0 and i > 0: - print("Warning: %d sublists of %s completed short of the requested %d" + print( + "Warning: %d sublists of %s completed short of the requested %d" % (i, name, count)) break end = org + size - sublist = np_base.take(range(org, end), mode='wrap') + sublist = np_base.take(range(org, end), mode="wrap") print(sublist) selected_sublists.append(sublist) @@ -263,43 +266,52 @@ def partition(self, base, size=None, count=0, name=None): class LeaveoutSubsetGenerator(SubsetGenerator): """CANDLE milestone 13 style feature set partitioning. - All SubsetGenerator subclasses are required to implement partition(), - plan_init() and plan_term() functions. + All SubsetGenerator subclasses are required to implement + partition(), plan_init() and plan_term() functions. """ def __init__(self): - SubsetGenerator.__init__(self, 'LeaveoutSubsetGenerator') + SubsetGenerator.__init__(self, "LeaveoutSubsetGenerator") self.strategy = "leaveout" - def plan_init(self, fs_names, fs_paths, fs_lines, fs_parts, maxdepth, root_name='1'): - """Initialize - collect plan metadata """ + def plan_init(self, + fs_names, + fs_paths, + fs_lines, + fs_parts, + maxdepth, + root_name="1"): + """Initialize - collect plan metadata""" currtime = datetime.now() - details = {'fs_names': fs_names, 'fs_filepaths':fs_paths, 'fs_parts': fs_parts} - details['create_date'] = currtime.isoformat(timespec=ISO_TIMESTAMP) - details['strategy'] = self.strategy - - label = '' + details = { + "fs_names": fs_names, + "fs_filepaths": fs_paths, + "fs_parts": fs_parts + } + details["create_date"] = currtime.isoformat(timespec=ISO_TIMESTAMP) + details["strategy"] = self.strategy + + label = "" for i in range(len(fs_names)): if i != 0: - label += '_' - s = '{}{}-p{}'.format(fs_names[i], fs_lines[i], fs_parts[i]) + label += "_" + s = "{}{}-p{}".format(fs_names[i], fs_lines[i], fs_parts[i]) label += s if maxdepth > 0: - label += '-maxdepth{}'.format(maxdepth) + label += "-maxdepth{}".format(maxdepth) - details['label'] = label + details["label"] = label plan_dict = OrderedDict() plan_dict[root_name] = details return root_name, plan_dict def plan_term(self, plan_dict, root_name, nbr_subplans): - """Completion - post plan summary metadata """ + """Completion - post plan summary metadata""" meta = plan_dict[root_name] - meta['nbr_subplans'] = nbr_subplans - + meta["nbr_subplans"] = nbr_subplans - def partition(self, base, size='n/a', count=None, name=None): + def partition(self, base, size="n/a", count=None, name=None): """Partition a feature-set list into lists of equal sized elements. This partitioner accepts a list of feature-set names and returns @@ -331,7 +343,7 @@ def partition(self, base, size='n/a', count=None, name=None): """ base_len = len(base) - if base_len < count: # can partition any further? + if base_len < count: # can partition any further? 
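            # Fewer features remain than the requested number of partitions:
            # return one singleton sublist per remaining feature.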
return [[feature] for feature in base] size = base_len // count @@ -348,19 +360,23 @@ def partition(self, base, size='n/a', count=None, name=None): return sublists -#------------------------------------------------------------------------------ + +# ------------------------------------------------------------------------------ # Database support, table and column definitions, DDL and DML # Refer to the plan_prep() function for a discussion of the "planstat" and # "runhist" tables defined below. -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + class RunType(Enum): RUN_ALL = 0 RESTART = 1 -class RunStat(Enum): # subplan execution status - SCHEDULED = 'scheduled' - COMPLETE = 'complete' + +class RunStat(Enum): # subplan execution status + SCHEDULED = "scheduled" + COMPLETE = "complete" + # planstat table, rows are returned via the PlanstatRow namedtuple @@ -373,15 +389,12 @@ class RunStat(Enum): # subplan execution status nbr_subplans INTEGER ); """ -PlanstatRow = namedtuple('PlanstatRow', +PlanstatRow = namedtuple( + "PlanstatRow", [ - 'rowid', - 'plan_name', - 'create_date', - 'feature_sets', - 'partitions', - 'nbr_subplans' - ] + "rowid", "plan_name", "create_date", "feature_sets", "partitions", + "nbr_subplans" + ], ) _select_row_from_planstat = """ @@ -422,23 +435,24 @@ class RunStat(Enum): # subplan execution status PRIMARY KEY (plan_id, subplan_id) ); """ -RunhistRow = namedtuple('RunhistRow', +RunhistRow = namedtuple( + "RunhistRow", [ - 'plan_id', - 'subplan_id', - 'status', - 'start_time', - 'stop_time', - 'run_mins', - 'loss', - 'mae', - 'r2', - 'val_loss', - 'val_mae', - 'val_r2', - 'lr', - 'other_info' - ] + "plan_id", + "subplan_id", + "status", + "start_time", + "stop_time", + "run_mins", + "loss", + "mae", + "r2", + "val_loss", + "val_mae", + "val_r2", + "lr", + "other_info", + ], ) _select_row_from_runhist = """ @@ -490,7 +504,7 @@ def log(msg): fp.flush() -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ # "Plan management" Database functions # # db_connect - establish database connection returning conn handle @@ -500,7 +514,8 @@ def log(msg): # stop_subplan - stop a subplan, update RunhistRow # get_subplan_runhist - return a RunhistRow for a given subplan # plan_remove - remove all database records for the named plan -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): """Execute a SQL statement. @@ -536,13 +551,12 @@ def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): except db_Error as e: db_exception = True - print('execute_sql_stmt: caught exception') - print('execute_sql_stmt:', stmt) - print('execute_sql_stmt:', e) + print("execute_sql_stmt: caught exception") + print("execute_sql_stmt:", stmt) + print("execute_sql_stmt:", e) info = sys.exc_info() s = traceback.format_tb(info[2]) - print('PLANGEN TRACEBACK:\n' + - str(e) + ' ... \n' + ''.join(s)) + print("PLANGEN TRACEBACK:\n" + str(e) + " ... 
\n" + "".join(s)) sys.stdout.flush() if not trap_exception: raise @@ -572,7 +586,7 @@ def db_connect(db_path): A connection handle is returned upon success, else None """ - if db_path == ':memory:' or not os.path.exists(db_path): + if db_path == ":memory:" or not os.path.exists(db_path): prev_allocated = False else: prev_allocated = True @@ -580,12 +594,12 @@ def db_connect(db_path): try: conn = sqlite3.connect(db_path) except db_Error as error: - print('db_connect', error) + print("db_connect", error) raise # create plan management tables on initial database allocation if conn and not prev_allocated: - complete = execute_sql_stmt(conn, _planstat_ddl) + complete = execute_sql_stmt(conn, _planstat_ddl) complete &= execute_sql_stmt(conn, _runhist_ddl) if complete: @@ -612,10 +626,10 @@ def plan_remove(db_path, plan_path): conn = db_connect(db_path) plan_key = _get_planstat_key(plan_path) stmt = _select_row_from_planstat.format(plan_key) - csr = conn.cursor() + csr = conn.cursor() execute_sql_stmt(conn, stmt, cursor=csr) nrow = csr.rowcount - row = csr.fetchone() + row = csr.fetchone() print("%d run history rows deleted" % nrow) @@ -623,8 +637,8 @@ def plan_remove(db_path, plan_path): print("Error: CLEANUP request failed - %s has not been run" % plan_key) status = -1 else: - plan_rec = PlanstatRow._make(row) # column-name addressable - rowid = plan_rec.rowid # the unique rowid is the plan uniquifier + plan_rec = PlanstatRow._make(row) # column-name addressable + rowid = plan_rec.rowid # the unique rowid is the plan uniquifier _delete_runhistory(conn, rowid) stmt = _delete_planstat_plan.format(rowid) status = execute_sql_stmt(conn, stmt) @@ -675,53 +689,55 @@ def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): """ # load the plan and retrieve identity info - plan_dict = load_plan(plan_path) - create_date = get_plan_create_date(plan_dict) + plan_dict = load_plan(plan_path) + create_date = get_plan_create_date(plan_dict) feature_sets = get_plan_fs_names(plan_dict) - partitions = get_plan_fs_parts(plan_dict) - nbr_subplans = get_plan_nbr_subplans(plan_dict) + partitions = get_plan_fs_parts(plan_dict) + nbr_subplans = get_plan_nbr_subplans(plan_dict) # determine if a plan of the given name has already been registered conn = db_connect(db_path) plan_key = _get_planstat_key(plan_path) stmt = _select_row_from_planstat.format(plan_key) - csr = conn.cursor() + csr = conn.cursor() execute_sql_stmt(conn, stmt, cursor=csr) - row = csr.fetchone() + row = csr.fetchone() if not row: rowid = -1 else: - plan_rec = PlanstatRow._make(row) # column-name addressable - rowid = plan_rec.rowid # the unique rowid will be the uniquifier returned + plan_rec = PlanstatRow._make(row) # column-name addressable + rowid = plan_rec.rowid # the unique rowid will be the uniquifier returned # compare run_type to initial expectations error = False if run_type == RunType.RUN_ALL and rowid > 0: - print("Error: RUN_ALL specified but plan: %s has already been defined" % plan_key) + print("Error: RUN_ALL specified but plan: %s has already been defined" % + plan_key) error = True elif run_type == RunType.RESTART and rowid < 0: - print("Warning: RESTART specified but plan: %s has not been previously run" % plan_key) - - elif rowid > 0 and create_date != create_date: # DEBUG ???????????????????????????????????? 
plan_rec.create_date: - print("Error: RESTART specified but the signature of the previously defined plan: %s does not match" % plan_key) + print( + "Warning: RESTART specified but plan: %s has not been previously run" + % plan_key) + + elif (rowid > 0 and create_date != create_date + ): # DEBUG ???????????????????????????????????? plan_rec.create_date: + print( + "Error: RESTART specified but the signature of the previously defined plan: %s does not match" + % plan_key) error = True # register new plans acquiring the uniquifying plan_id used to compose runhistory table keys if not error and rowid < 0: feature_sets = str(feature_sets) - feature_sets = feature_sets.replace("'", "") # create string literal from list of str - partitions = str(partitions) # create string literal from list of int - - stmt = _insert_planstat_plan.format( - plan_key, - create_date, - feature_sets, - partitions, - nbr_subplans - ) + feature_sets = feature_sets.replace( + "'", "") # create string literal from list of str + partitions = str(partitions) # create string literal from list of int + + stmt = _insert_planstat_plan.format(plan_key, create_date, feature_sets, + partitions, nbr_subplans) status = execute_sql_stmt(conn, stmt, cursor=csr) rowid = csr.lastrowid @@ -736,7 +752,11 @@ def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): return rowid -def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=None): +def start_subplan(db_path, + plan_path, + plan_id=None, + subplan_id=None, + run_type=None): """Schedule the execution of a subplan. This function writes a RunhistRow record to the runhist table indicating that @@ -760,7 +780,7 @@ def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=No print("plangen: start_subplan: subplan_id=%s" % subplan_id) sys.stdout.flush() conn = db_connect(db_path) - csr = conn.cursor() + csr = conn.cursor() skip = False print("plangen: start_subplan: run_type: '%s'" % str(run_type)) @@ -769,7 +789,7 @@ def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=No sys.stdout.flush() # skip previously completed work if RESTART - if 'RESTART' in str(run_type): + if "RESTART" in str(run_type): print("plangen: start_subplan: checking restart: %i" % plan_id) sys.stdout.flush() stmt = _select_row_from_runhist.format(plan_id, subplan_id) @@ -792,12 +812,9 @@ def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=No currtime = datetime.now() start_time = currtime.isoformat(timespec=ISO_TIMESTAMP) - stmt = _insupd_scheduled_runhist.format( - plan_id, - subplan_id, - RunStat.SCHEDULED.name, - start_time - ) + stmt = _insupd_scheduled_runhist.format(plan_id, subplan_id, + RunStat.SCHEDULED.name, + start_time) execute_sql_stmt(conn, stmt, cursor=csr) @@ -832,21 +849,23 @@ def stop_subplan(db_path, plan_id=None, subplan_id=None, comp_info_dict={}): """ conn = db_connect(db_path) - csr = conn.cursor() - curr_time = datetime.now() + csr = conn.cursor() + curr_time = datetime.now() stop_time = curr_time.isoformat(timespec=ISO_TIMESTAMP) - comp_dict = dict( - loss=0.0, mae=0.0, r2=0.0, - val_loss=0.0, val_mae=0.0, val_r2=0.0, - lr=0.0 - ) + comp_dict = dict(loss=0.0, + mae=0.0, + r2=0.0, + val_loss=0.0, + val_mae=0.0, + val_r2=0.0, + lr=0.0) comp_info_dict = extract_history(comp_info_dict) remainder = _acquire_actuals(comp_dict, comp_info_dict) if len(remainder) == 0: - other_info = '' + other_info = "" else: other_info = json.dumps(remainder) @@ -855,30 +874,31 @@ def stop_subplan(db_path, 
plan_id=None, subplan_id=None, comp_info_dict={}): execute_sql_stmt(conn, stmt, csr) row = csr.fetchone() - if row: # expected, caller error if already marked COMPLETED + if row: # expected, caller error if already marked COMPLETED runhist_rec = RunhistRow._make(row) if runhist_rec.status != RunStat.COMPLETE.name: - start_time = datetime.strptime(runhist_rec.start_time, ISO_TIMESTAMP_ENCODE) - duration = curr_time - start_time - run_mins = int((duration.total_seconds() + 59) / 60) + start_time = datetime.strptime(runhist_rec.start_time, + ISO_TIMESTAMP_ENCODE) + duration = curr_time - start_time + run_mins = int((duration.total_seconds() + 59) / 60) # update runhist record stmt = _insupd_completed_runhist.format( - # column values + # column values RunStat.COMPLETE.name, stop_time, run_mins, - comp_dict['loss'], - comp_dict['mae'], - comp_dict['r2'], - comp_dict['val_loss'], - comp_dict['val_mae'], - comp_dict['val_r2'], - comp_dict['lr'], + comp_dict["loss"], + comp_dict["mae"], + comp_dict["r2"], + comp_dict["val_loss"], + comp_dict["val_mae"], + comp_dict["val_r2"], + comp_dict["lr"], other_info, - # key spec + # key spec plan_id, - subplan_id + subplan_id, ) execute_sql_stmt(conn, stmt) @@ -930,7 +950,7 @@ def get_subplan_runhist(db_path, plan_id=None, subplan_id=None): """ conn = db_connect(db_path) stmt = _select_row_from_runhist.format(plan_id, subplan_id) - csr = conn.cursor() + csr = conn.cursor() execute_sql_stmt(conn, stmt, csr) row = csr.fetchone() @@ -950,19 +970,19 @@ def _acquire_actuals(dft_dict, actuals_dict): dft_dict[key] = actuals[key] actuals.pop(key) - return actuals # possibly empty + return actuals # possibly empty def _get_planstat_key(plan_path): """Extract the name portion of a plan from a filepath.""" basename = os.path.basename(plan_path) - basepfx = basename.split(sep='.') + basepfx = basename.split(sep=".") return basepfx[0] def _delete_runhistory(conn, plan_id): """Delete RunhistRows containing the given plan_id.""" - csr = conn.cursor() + csr = conn.cursor() stmt = _delete_from_runhistory.format(plan_id) execute_sql_stmt(conn, stmt, cursor=csr, trap_exception=True) rowcount = csr.rowcount @@ -971,9 +991,10 @@ def _delete_runhistory(conn, plan_id): return rowcount -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ # Plan navigation, content retrieval -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + def load_plan(filepath): """Load a JSON transfer learning plan. @@ -989,30 +1010,36 @@ def load_plan(filepath): An entry-ordered plan in OrderedDict format is returned. 
""" - with open(filepath, 'r') as f: + with open(filepath, "r") as f: ordered_plan_dict = json.load(f, object_pairs_hook=OrderedDict) return ordered_plan_dict + def get_plan_create_date(plan_dict): _, value = _get_first_entry(plan_dict) - return value['create_date'] + return value["create_date"] + def get_plan_fs_names(plan_dict): _, value = _get_first_entry(plan_dict) - return value['fs_names'] + return value["fs_names"] + def get_plan_fs_parts(plan_dict): _, value = _get_first_entry(plan_dict) - return value['fs_parts'] + return value["fs_parts"] + def get_plan_nbr_subplans(plan_dict): _, value = _get_first_entry(plan_dict) - return value['nbr_subplans'] + return value["nbr_subplans"] + def _get_first_entry(ordered_dict): key, value = next(iter(ordered_dict.items())) return key, value + def get_subplan(plan_dict, subplan_id=None): """Retrieve the content of a named subplan or the root plan. @@ -1048,12 +1075,12 @@ def get_predecessor(plan_dict, subplan_id): is specified None is returned. """ - segments = subplan_id.split(sep='.') + segments = subplan_id.split(sep=".") if len(segments) <= 1: subplan_id = None else: segments.pop() - subplan_id = '.'.join(segments) + subplan_id = ".".join(segments) return subplan_id @@ -1073,7 +1100,7 @@ def get_successors(plan_dict, subplan_id): """ successor_names = [] for i in it.count(start=1): - new_name = subplan_id + '.' + str(i) + new_name = subplan_id + "." + str(i) value = plan_dict.get(new_name) if not value: break @@ -1082,12 +1109,17 @@ def get_successors(plan_dict, subplan_id): return successor_names -def _get_named_set(plan_dict, subplan_id, section_tag, fs_name, collector, parent_features=None): - """ """ +def _get_named_set(plan_dict, + subplan_id, + section_tag, + fs_name, + collector, + parent_features=None): + """""" while True: content, _ = get_subplan(plan_dict, subplan_id) - assert(content) + assert content section = content[section_tag] for i, section_features in enumerate(section): @@ -1146,37 +1178,37 @@ def get_subplan_features(plan_dict, subplan_id, parent_features=False): return None, None, None, None # peek inside the training set to capture active feature-set names - train_set = content['train'][0] + train_set = content["train"][0] fs_names = [name for name in train_set.keys()] # categorize the results result = {} result[0] = fs_names - result['train'] = {} - result['val'] = {} + result["train"] = {} + result["val"] = {} - for set_name, pf in [('train', True), ('val', False)]: + for set_name, pf in [("train", True), ("val", False)]: if pf == True: pf = parent_features for fs_name in fs_names: collector = [] - _get_named_set( - plan_dict, - subplan_id, - set_name, - fs_name, - collector, - parent_features=pf - ) + _get_named_set(plan_dict, + subplan_id, + set_name, + fs_name, + collector, + parent_features=pf) result[set_name][fs_name] = collector - return result, result[0], result['train'], result['val'] + return result, result[0], result["train"], result["val"] + -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ # Plan construction -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + def build_dictionary_from_lists(seq_list, names): """Create a dictionary with 'names' as labels and 'seq_list' values.""" @@ -1186,7 +1218,12 @@ def build_dictionary_from_lists(seq_list, names): return dict -def 
build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_pfx='', plan_pfx=''): +def build_plan_tree(args, + feature_set_content, + parent_plan_id="", + depth=0, + data_pfx="", + plan_pfx=""): """Generate a plan supporting training, transfer-learning, resume-training. ADD GENERAL DOC @@ -1225,9 +1262,9 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ all_parts = [] - #flat_partitions = [] # preserve, used for file-based approach - #files = [] # preserve, used for file-based approach - #sequence = 0 # preserve, used for file-based approach + # flat_partitions = [] # preserve, used for file-based approach + # files = [] # preserve, used for file-based approach + # sequence = 0 # preserve, used for file-based approach xxx = False for i in range(len(args.fs_names)): @@ -1236,7 +1273,8 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ if depth == 0: count = args.first_parts[i] feature_set_name = args.fs_names[i] - partitions = args.generator.partition(feature_set_content[i], count=count) + partitions = args.generator.partition(feature_set_content[i], + count=count) all_parts.append(partitions) # acquire a cross-product of all feature-set partitions @@ -1258,19 +1296,26 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ train.append(section) # generate next depth/level (successor) plans - curr_plan_id = '{}.{}'.format(parent_plan_id, step + 1) - args.plan_dict[curr_plan_id] = {'val': val, 'train': train} - data_name = '{}.{}'.format(data_pfx, step + 1) - plan_name = '{}.{}'.format(plan_pfx, step + 1) + curr_plan_id = "{}.{}".format(parent_plan_id, step + 1) + args.plan_dict[curr_plan_id] = {"val": val, "train": train} + data_name = "{}.{}".format(data_pfx, step + 1) + plan_name = "{}.{}".format(plan_pfx, step + 1) # depth-first, shorthand representation of tree showing first feature names if args.debug: - indent = ' ' * (depth * 4) + indent = " " * (depth * 4) print(indent, curr_plan_id) - indent += ' ' * 4 + indent += " " * 4 fs = parts_xprod[step] for i in range(len(fs)): - print(indent, args.fs_names[i], 'count:', len(fs[i]), 'first:', fs[i][0]) + print( + indent, + args.fs_names[i], + "count:", + len(fs[i]), + "first:", + fs[i][0], + ) substeps += build_plan_tree( args, @@ -1278,12 +1323,11 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ parent_plan_id=curr_plan_id, depth=curr_depth, data_pfx=data_name, - plan_pfx=plan_name + plan_pfx=plan_name, ) steps += substeps return steps - """ # THIS IS A WORK-IN-PROGRESS ... 
GENERATING FILES FOR DATA AND PLAN @@ -1347,44 +1391,46 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ return """ + def write_file(fname, title, string_list): """Write text expressed as an array of lines to file.""" - with open(fname, 'w') as f: + with open(fname, "w") as f: for line in string_list: f.write(line) + def write_dict_to_json(dictionary, fname): """Write dictionary to a json file.""" - with open(fname, 'w') as f: + with open(fname, "w") as f: json.dump(dictionary, f) -#---------------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------------- # various hard-coded lists, test cases - the synthetic feature-sets remain useful -#---------------------------------------------------------------------------------- +# ---------------------------------------------------------------------------------- -""" -synthetic_cell_names = ['cell_' + '%04d' % (x) for x in range(1000)] -synthetic_drug_names = ['drug_' + '%04d' % (x) for x in range(1000)] -""" +# synthetic_cell_names = ['cell_' + '%04d' % (x) for x in range(1000)] +# synthetic_drug_names = ['drug_' + '%04d' % (x) for x in range(1000)] -#---------------------------------------------------------------------------------- +# ---------------------------------------------------------------------------------- # mainline -#---------------------------------------------------------------------------------- +# ---------------------------------------------------------------------------------- + def main(): # Acquire and validate arguments args = planargs.parse_arguments() - args.json = True # the only available option thus far + args.json = True # the only available option thus far generator, feature_set_content = validate_args(args) args.generator = generator root_name, args.plan_dict = generator.plan_init( - fs_names = args.fs_names, # validated cmdline arg - fs_paths = args.fs_paths, # validated cmdline arg - fs_lines = args.fs_lines, # created by validate_args - fs_parts = args.fs_parts, # validated cmdline arg - maxdepth = args.maxdepth + fs_names=args.fs_names, # validated cmdline arg + fs_paths=args.fs_paths, # validated cmdline arg + fs_lines=args.fs_lines, # created by validate_args + fs_parts=args.fs_parts, # validated cmdline arg + maxdepth=args.maxdepth, ) # feature_set_content = [cell_names, drug_names] @@ -1402,23 +1448,26 @@ def main(): break # Plan generation - data_fname_pfx = os.path.join(args.out_dir, 'DATA.1') - plan_fname_pfx = os.path.join(args.out_dir, 'PLAN.1') + data_fname_pfx = os.path.join(args.out_dir, "DATA.1") + plan_fname_pfx = os.path.join(args.out_dir, "PLAN.1") steps = build_plan_tree( - args, # command line argument namespace - feature_set_content, # for example [[cell1 ... celln] [drug1 ... drugn]] - parent_plan_id=root_name, # name of root plan, subplan names created from this stem - data_pfx=data_fname_pfx, # DATA file prefix, building block for feature name files - plan_pfx=plan_fname_pfx # PLAN file prefix, building block for plan name files + args, # command line argument namespace + feature_set_content, # for example [[cell1 ... celln] [drug1 ... 
drugn]] + parent_plan_id= + root_name, # name of root plan, subplan names created from this stem + data_pfx= + data_fname_pfx, # DATA file prefix, building block for feature name files + plan_pfx= + plan_fname_pfx, # PLAN file prefix, building block for plan name files ) generator.plan_term(args.plan_dict, root_name, steps) - print("Plan generation complete, total steps: %d" % steps) + print("Plan generation complete, total steps: %d" % steps) if args.json: label = args.generator.get_plan_label(args.plan_dict, root_name) - qualified_name = 'plangen_' + label + '.json' + qualified_name = "plangen_" + label + ".json" json_file_name = os.path.join(args.out_dir, qualified_name) json_abspath = os.path.abspath(json_file_name) write_dict_to_json(args.plan_dict, json_abspath) @@ -1426,22 +1475,24 @@ def main(): if args.print_tree: print("Plan dictionary generated") - pp(args.plan_dict, width=160) # DEBUG comment this out for large plans + pp(args.plan_dict, width=160) # DEBUG comment this out for large plans if args.test: # test1(json_abspath, "test1_sql.db") test2(json_abspath, "test3_sql.db") -#---------------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------------- # sqlite3 API functions -#---------------------------------------------------------------------------------- +# ---------------------------------------------------------------------------------- + def test2(plan_path, db_path): - #run_type = RunType.RESTART + # run_type = RunType.RESTART run_type = RunType.RUN_ALL plan_name = os.path.basename(plan_path) - plan_id = plan_prep(db_path, plan_name, run_type) + plan_id = plan_prep(db_path, plan_name, run_type) plan_dict = load_plan(plan_path) metadata, root_name = get_subplan(plan_dict) @@ -1450,7 +1501,7 @@ def test2(plan_path, db_path): queue.append(root_name) print("Test2 start") - for iloop in it.count(start = 0): + for iloop in it.count(start=0): if len(queue) == 0: print("Test2 complete - proc loop count: %d" % iloop) break @@ -1468,7 +1519,7 @@ def test2(plan_path, db_path): plan_path, plan_id=plan_id, subplan_id=curr_subplan, - run_type=run_type + run_type=run_type, ) if status < 0: @@ -1476,12 +1527,14 @@ def test2(plan_path, db_path): continue completion_status = dict( - loss=['dont', 'want', 'this', 1.1], - mae=['nope', 2.2], + loss=["dont", "want", "this", 1.1], + mae=["nope", 2.2], r2=[3.3], - val_loss=6.6, val_mae=7.7, val_r2=8.8, + val_loss=6.6, + val_mae=7.7, + val_r2=8.8, lr=0.9, - some_new_thing='abc' + some_new_thing="abc", ) scalar_dict = extract_history(completion_status) @@ -1490,20 +1543,21 @@ def test2(plan_path, db_path): db_path, plan_id=plan_id, subplan_id=curr_subplan, - comp_info_dict=scalar_dict + comp_info_dict=scalar_dict, ) print("Completing subplan %6d" % iloop) -#---------------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------------- # def test1(plan_path, db_path): run_type = RunType.RESTART - #run_type = RunType.RUN_ALL + # run_type = RunType.RUN_ALL plan_name = os.path.basename(plan_path) - plan_id = plan_prep(db_path, plan_name, run_type) + plan_id = plan_prep(db_path, plan_name, run_type) - if (plan_id < 0): + if plan_id < 0: sys.exit("Terminating due to database detected error") print("\nBegin plan navigation and subplan retrieval test") @@ -1514,11 +1568,13 @@ def test1(plan_path, db_path): # the root has no parent / predecessor parent_name = 
get_predecessor(plan_dict, root_name) - print("Demonstrate that root \'%s\' predecessor is not defined: %s" % (root_name, parent_name)) + print("Demonstrate that root '%s' predecessor is not defined: %s" % + (root_name, parent_name)) # the root contains metadata, it is not a run specification successor_names = get_successors(plan_dict, root_name) - print("\nThe first runable configurations are defined in %s\n" % successor_names) + print("\nThe first runable configurations are defined in %s\n" % + successor_names) # the root is the predecessor of these first level runables for sname in successor_names: @@ -1527,35 +1583,38 @@ def test1(plan_path, db_path): # run the right subtree print("\nRun the rightmost subtree \n") - for i in it.count(start = 1): + for i in it.count(start=1): listlen = len(successor_names) if listlen == 0: break for name in successor_names: - status = start_subplan( - db_path, - plan_path, - plan_id=plan_id, - subplan_id=name, - run_type=run_type - ) + status = start_subplan(db_path, + plan_path, + plan_id=plan_id, + subplan_id=name, + run_type=run_type) if status < 0: print("subplan: %s skipped, previously processed" % name) - select_one = successor_names[listlen - 1] + select_one = successor_names[listlen - 1] parent_name = get_predecessor(plan_dict, select_one) - print("%-16s is a successor of %-16s - all successors: %s" % (select_one, parent_name, successor_names)) + print("%-16s is a successor of %-16s - all successors: %s" % + (select_one, parent_name, successor_names)) # test feature lists retrieval API get_subplan_features - value,_ = get_subplan(plan_dict, select_one) + value, _ = get_subplan(plan_dict, select_one) if i < 3: for pf in [False, True]: - _, fs_name_list, train_list, val_list = get_subplan_features(plan_dict, select_one, parent_features=pf) - if False: # very verbose, use only as needed! ??????????????????????????????????????????????????????? - print("\nsubplan original:", select_one, "parent features:", pf) + _, fs_name_list, train_list, val_list = get_subplan_features( + plan_dict, select_one, parent_features=pf) + if ( + False + ): # very verbose, use only as needed! ??????????????????????????????????????????????????????? 
+ print("\nsubplan original:", select_one, "parent features:", + pf) pp(plan_dict[select_one]) print("\nflattened TRAIN") pp(train_list) @@ -1563,25 +1622,33 @@ def test1(plan_path, db_path): pp(val_list) # test runhist retrieval api - row = get_subplan_runhist(db_path, plan_id=plan_id, subplan_id=select_one) - #print(row) + row = get_subplan_runhist(db_path, + plan_id=plan_id, + subplan_id=select_one) + # print(row) # post subplan termination - completion_status = dict(mse=1.1, mae=2.2, r_square=.555, misc='no such column', data=123) + completion_status = dict(mse=1.1, + mae=2.2, + r_square=0.555, + misc="no such column", + data=123) stop_subplan( db_path, plan_id=plan_id, subplan_id=select_one, - comp_info_dict=completion_status + comp_info_dict=completion_status, ) successor_names = get_successors(plan_dict, select_one) print("\nEnd of branch reached") + + # plan_remove(db_path, "plangen_cell8-p2_drug8-p2.json") -#---------------------------------------------------------------------------------- +# ---------------------------------------------------------------------------------- if __name__ == "__main__": main() diff --git a/workflows/cp-leaveout/py/run_chained.py b/workflows/cp-leaveout/py/run_chained.py index bd321c3b..c7f8c2fa 100644 --- a/workflows/cp-leaveout/py/run_chained.py +++ b/workflows/cp-leaveout/py/run_chained.py @@ -1,20 +1,39 @@ -import subprocess -import os +import argparse +import io import json +import os +import subprocess import sys -import io -import argparse import plangen + class Config: - - REQS = ['site', 'plan', 'submit_script', 'upf_directory', 'stages', 'stage_cfg_script', 'job_chain_arg'] - STAGE_CFG_KEYS = ['stage', 'PROCS', 'TURBINE_LAUNCH_ARGS', 'TURBINE_DIRECTIVE_ARGS', - 'WALLTIME', 'IGNORE_ERRORS', 'SH_TIMEOUT', 'BENCHMARK_TIMEOUT', - 'PPN'] - INT_KEYS = ['PROCS', 'PPN', 'BENCHMARK_TIMEOUT', 'SH_TIMEOUT', 'IGNORE_ERRORS'] - + + REQS = [ + "site", + "plan", + "submit_script", + "upf_directory", + "stages", + "stage_cfg_script", + "job_chain_arg", + ] + STAGE_CFG_KEYS = [ + "stage", + "PROCS", + "TURBINE_LAUNCH_ARGS", + "TURBINE_DIRECTIVE_ARGS", + "WALLTIME", + "IGNORE_ERRORS", + "SH_TIMEOUT", + "BENCHMARK_TIMEOUT", + "PPN", + ] + INT_KEYS = [ + "PROCS", "PPN", "BENCHMARK_TIMEOUT", "SH_TIMEOUT", "IGNORE_ERRORS" + ] + def __init__(self, cfg): self.cfg = cfg self.stage_cfgs = {} @@ -23,23 +42,28 @@ def validate(self): for r in Config.REQS: if not r in self.cfg: return (False, "Required property '{}' is missing".format(r)) - - self.cfg['stages'] = int(self.cfg['stages']) - if 'stage_cfgs' in self.cfg: - for stage_cfg in self.cfg['stage_cfgs']: - if not 'stage' in stage_cfg: - return (False, "A stage_cfg map is missing required 'stage' property") + self.cfg["stages"] = int(self.cfg["stages"]) + + if "stage_cfgs" in self.cfg: + for stage_cfg in self.cfg["stage_cfgs"]: + if not "stage" in stage_cfg: + return ( + False, + "A stage_cfg map is missing required 'stage' property", + ) for k in stage_cfg: if k not in Config.STAGE_CFG_KEYS: - return (False, "Unknow stage configuration property {}".format(k)) - - stage = int(stage_cfg['stage']) + return ( + False, + "Unknow stage configuration property {}".format(k), + ) + + stage = int(stage_cfg["stage"]) # delete it as its not a proper env var - del stage_cfg['stage'] + del stage_cfg["stage"] self.stage_cfgs[stage] = stage_cfg - return (True,) def get_stage_environment(self, stage): @@ -60,81 +84,105 @@ def update_stage_cfgs(self, runs_per_stage): scfg = self.stage_cfgs[stage] if "PROCS" not in scfg: # + 2: one 
for swift and one for db rank - scfg['PROCS'] = str(runs + 2) + scfg["PROCS"] = str(runs + 2) if "PPN" not in scfg: - scfg['PPN'] = str(1) - + scfg["PPN"] = str(1) + # update any numeric vals to str values as required for env vars self._vars_to_string(scfg) else: # + 2: one for swift and one for db rank - self.stage_cfgs[stage] = {'PROCS' : str(runs + 2), 'PPN' : str(1)} - + self.stage_cfgs[stage] = {"PROCS": str(runs + 2), "PPN": str(1)} + @property def site(self): - return self.cfg['site'] + return self.cfg["site"] @property def plan(self): - return self.cfg['plan'] + return self.cfg["plan"] @property def submit_script(self): - return self.cfg['submit_script'] + return self.cfg["submit_script"] @property def first_stage(self): - return self.cfg['first_stage'] + return self.cfg["first_stage"] @property def first_stage_parent_directory(self): - return self.cfg['first_stage_parent_directory'] + return self.cfg["first_stage_parent_directory"] @first_stage.setter def first_stage(self, value): - self.cfg['first_stage'] = value + self.cfg["first_stage"] = value @first_stage_parent_directory.setter def first_stage_parent_directory(self, value): - self.cfg['first_stage_parent_directory'] = value + self.cfg["first_stage_parent_directory"] = value @property def upf_directory(self): - return self.cfg['upf_directory'] + return self.cfg["upf_directory"] @property def stages(self): - return self.cfg['stages'] - + return self.cfg["stages"] + @stages.setter def stages(self, value): - self.cfg['stages'] = value + self.cfg["stages"] = value @property def stage_cfg_script(self): - return self.cfg['stage_cfg_script'] + return self.cfg["stage_cfg_script"] @property def job_chain_arg(self): - return self.cfg['job_chain_arg'] + return self.cfg["job_chain_arg"] def create_job_chain_directive(self, job_id): - return self.job_chain_arg.replace('', job_id) + return self.job_chain_arg.replace("", job_id) + - def parse_arguments(): parser = argparse.ArgumentParser() # parser.add_argument('--plan', type=str, default='plan.json', # help='plan data file') - parser.add_argument('--stages', type=int, default=-1, - help='number of stages to run (overrides configuration file if non-0)') - parser.add_argument('--config', type=str, default=None, required=True, - help='the configuration file in json format') - parser.add_argument('--dry_run', action='store_true', - help="Runs the workflow with actual job submission, displaying each job's configuration") - - parser.add_argument('--first_stage', type=int, default=1, help='the stage to begin the workflow with') - parser.add_argument('--first_stage_parent_directory', type=str, default='', help='the directory containing the parent model runs for the initial stage, if initial_stage > 1') + parser.add_argument( + "--stages", + type=int, + default=-1, + help="number of stages to run (overrides configuration file if non-0)", + ) + parser.add_argument( + "--config", + type=str, + default=None, + required=True, + help="the configuration file in json format", + ) + parser.add_argument( + "--dry_run", + action="store_true", + help= + "Runs the workflow with actual job submission, displaying each job's configuration", + ) + + parser.add_argument( + "--first_stage", + type=int, + default=1, + help="the stage to begin the workflow with", + ) + parser.add_argument( + "--first_stage_parent_directory", + type=str, + default="", + help= + "the directory containing the parent model runs for the initial stage, if initial_stage > 1", + ) # parser.add_argument('--upf_dir', type=str, default=None, 
required=True, # help='the output directory for the generated upf files') @@ -145,55 +193,67 @@ def parse_arguments(): return parser.parse_args() + def parse_run_vars(outs): - to_prefix = 'TURBINE_OUTPUT=' - job_id_prefix = 'JOB_ID=' + to_prefix = "TURBINE_OUTPUT=" + job_id_prefix = "JOB_ID=" str_io = io.StringIO(outs) - turbine_output = '' - job_id = '' + turbine_output = "" + job_id = "" for line in str_io.readlines(): line = line.strip() if line.startswith(to_prefix): - turbine_output = line[len(to_prefix) : ] + turbine_output = line[len(to_prefix):] elif line.startswith(job_id_prefix): - job_id = line[len(job_id_prefix) : ] - + job_id = line[len(job_id_prefix):] + return (turbine_output, job_id) def run_script(cfg, args, stage): cmd = [cfg.submit_script] + args env = cfg.get_stage_environment(stage) - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) + p = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env) # stderr is redirected to stdout outs, _ = p.communicate() - return outs.decode('utf-8') + return outs.decode("utf-8") + def run_dry_run(upfs, cfg): for i, upf in enumerate(upfs): # UPFS are in stage order stage = i + cfg.first_stage - args = [cfg.site, '-a', cfg.stage_cfg_script, cfg.plan, upf, str(stage)] + args = [cfg.site, "-a", cfg.stage_cfg_script, cfg.plan, upf, str(stage)] if i > 0: - args += ['', '{}'.format(cfg.job_chain_arg)] + args += ["", "{}".format(cfg.job_chain_arg)] elif cfg.first_stage > 1: - args += [cfg.first_stage_parent_directory, '## JOB 0'] + args += [cfg.first_stage_parent_directory, "## JOB 0"] else: - args += ['job0', '## JOB 0'] + args += ["job0", "## JOB 0"] - print('\n########### DRY RUN JOB {}, Stage {} ##############'.format(stage - cfg.first_stage + 1, stage)) - print("Running: {} {}".format(cfg.submit_script, ' '.join(args))) + print("\n########### DRY RUN JOB {}, Stage {} ##############".format( + stage - cfg.first_stage + 1, stage)) + print("Running: {} {}".format(cfg.submit_script, " ".join(args))) env = cfg.get_stage_environment(stage) - if 'TURBINE_DIRECTIVE_ARGS' in env: - env['TURBINE_DIRECTIVE_ARGS'] = '{}\\n{}'.format(args[7], env['TURBINE_DIRECTIVE_ARGS']) + if "TURBINE_DIRECTIVE_ARGS" in env: + env["TURBINE_DIRECTIVE_ARGS"] = "{}\\n{}".format( + args[7], env["TURBINE_DIRECTIVE_ARGS"]) else: - env['TURBINE_DIRECTIVE_ARGS'] = args[7] - p = subprocess.Popen(['bash', "-c", "source {}".format(cfg.stage_cfg_script)], stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, env=env) + env["TURBINE_DIRECTIVE_ARGS"] = args[7] + p = subprocess.Popen( + ["bash", "-c", "source {}".format(cfg.stage_cfg_script)], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env, + ) # stderr is redirected to stdout outs, _ = p.communicate() - print(outs.decode('utf-8')) + print(outs.decode("utf-8")) + def run_upfs(upfs, cfg): job_id = None @@ -201,27 +261,29 @@ def run_upfs(upfs, cfg): for i, upf in enumerate(upfs): # UPFS are in stage order stage = i + cfg.first_stage - args = [cfg.site, '-a', cfg.stage_cfg_script, cfg.plan, upf, str(stage)] + args = [cfg.site, "-a", cfg.stage_cfg_script, cfg.plan, upf, str(stage)] if job_id: # at least second iteration args += [turbine_output, cfg.create_job_chain_directive(job_id)] elif cfg.first_stage > 1: - args += [cfg.first_stage_parent_directory, '## JOB 0'] + args += [cfg.first_stage_parent_directory, "## JOB 0"] else: - args += ['job0', '## JOB 0'] + args += ["job0", "## JOB 0"] outs = run_script(cfg, args, stage) turbine_output, job_id = 
parse_run_vars(outs) exp_id = os.path.basename(turbine_output) - print('\n########### JOB {} - Stage {} - {} - {} ##############'.format(stage - cfg.first_stage + 1,stage, exp_id, job_id)) - print("Running: {} {}".format(cfg.submit_script, ' '.join(args))) + print("\n########### JOB {} - Stage {} - {} - {} ##############".format( + stage - cfg.first_stage + 1, stage, exp_id, job_id)) + print("Running: {} {}".format(cfg.submit_script, " ".join(args))) print(outs) - print('TURBINE_OUTPUT: {}'.format(turbine_output)) - print('JOB_ID: {}\n'.format(job_id)) + print("TURBINE_OUTPUT: {}".format(turbine_output)) + print("JOB_ID: {}\n".format(job_id)) if not job_id: print("JOB_ID NOT FOUND - ABORTING RUNS") break + def get_plan_info(plan_file): plan_dict = plangen.load_plan(plan_file) # key of first entry is the root node @@ -230,42 +292,45 @@ def get_plan_info(plan_file): total_stages = -1 total_nodes = -1 for k in iter_pd: - # has skipped the root node, so we can get + # has skipped the root node, so we can get # the second element in val - vals = (k.split(".")) + vals = k.split(".") n_vals = len(vals) total_stages = max(total_stages, n_vals) total_nodes = max(total_nodes, int(vals[1])) - + return (root_node, total_stages, total_nodes) + def generate_upfs(prefix, cfg, root_nodes, n_nodes): parents = root_nodes - upf_prefix = '{}/{}_'.format(cfg.upf_directory, prefix) + upf_prefix = "{}/{}_".format(cfg.upf_directory, prefix) upfs = [] counts = [] for s in range(cfg.first_stage, cfg.first_stage + cfg.stages): - upf_path = '{}s{}_upf.txt'.format(upf_prefix, s) + upf_path = "{}s{}_upf.txt".format(upf_prefix, s) parents = generate_stage(parents, n_nodes, upf_path) upfs.append(upf_path) counts.append(len(parents)) return (upfs, counts) + def generate_stage(parents, n_nodes, f_path): children = [] - with open(f_path, 'w') as f_out: + with open(f_path, "w") as f_out: for p in parents: for n in range(1, n_nodes + 1): - child = '{}.{}'.format(p, n) - f_out.write('{}\n'.format(child)) + child = "{}.{}".format(p, n) + f_out.write("{}\n".format(child)) children.append(child) # print('Stage {}: {}'.format(stage, ' '.join(children))) return children - + + def parse_config(args): cfg = None - with open(args.config, 'r') as fin: + with open(args.config, "r") as fin: cfg = Config(json.load(fin)) result = cfg.validate() if not result[0]: @@ -274,40 +339,44 @@ def parse_config(args): if args.stages != 0: cfg.stages = args.stages - + if args.first_stage != 1: cfg.first_stage = args.first_stage - - if args.first_stage_parent_directory != '': + + if args.first_stage_parent_directory != "": cfg.first_stage_parent_directory = args.first_stage_parent_directory return cfg + def compute_parent_nodes(root_node, stage, n_nodes): - """ Computes the the parents nodes of the specified stage """ + """Computes the the parents nodes of the specified stage.""" root_nodes = [root_node] for _ in range(1, stage): children = [] for r in root_nodes: for n in range(1, n_nodes + 1): - child = '{}.{}'.format(r, n) + child = "{}.{}".format(r, n) children.append(child) root_nodes = children - + return root_nodes - + def run(args): cfg = parse_config(args) root_node, total_stages, n_nodes = get_plan_info(cfg.plan) if cfg.first_stage > total_stages: - print("First stage must be less than or equal to total number of stages") + print( + "First stage must be less than or equal to total number of stages") sys.exit() - if cfg.first_stage > 1 and ('first_stage_parent_directory' not in cfg.cfg or - cfg.cfg['first_stage_parent_directory'] == ''): - 
print("Missing required 'first_stage_parent_directory' argument, when first_stage > 1") + if cfg.first_stage > 1 and ("first_stage_parent_directory" not in cfg.cfg or + cfg.cfg["first_stage_parent_directory"] == ""): + print( + "Missing required 'first_stage_parent_directory' argument, when first_stage > 1" + ) sys.exit() if cfg.stages == -1 or cfg.stages >= total_stages: @@ -318,15 +387,22 @@ def run(args): upfs, runs_per_stage = generate_upfs(prefix, cfg, root_nodes, n_nodes) cfg.update_stage_cfgs(runs_per_stage) - print("\nTotal Jobs: {}\nTotal Stages: {}\nNodes: {}".format(cfg.stages, cfg.stages, n_nodes)) - print("Site: {}\nPlan: {}\nSubmit Script: {}\nStage Configuration Script:{}\nUPF directory: {}".format(cfg.site, cfg.plan, - cfg.submit_script, cfg.stage_cfg_script, cfg.upf_directory)) + print("\nTotal Jobs: {}\nTotal Stages: {}\nNodes: {}".format( + cfg.stages, cfg.stages, n_nodes)) + print( + "Site: {}\nPlan: {}\nSubmit Script: {}\nStage Configuration Script:{}\nUPF directory: {}" + .format( + cfg.site, + cfg.plan, + cfg.submit_script, + cfg.stage_cfg_script, + cfg.upf_directory, + )) for i, c in enumerate(runs_per_stage): stage = cfg.first_stage + i scfg = cfg.stage_cfgs[stage] - print("\tStage: {}, UPF: {}, Model Runs: {}, PROCS: {}, PPN: {}".format(stage, - os.path.basename(upfs[i]), c, scfg['PROCS'], scfg['PPN'])) - + print("\tStage: {}, UPF: {}, Model Runs: {}, PROCS: {}, PPN: {}".format( + stage, os.path.basename(upfs[i]), c, scfg["PROCS"], scfg["PPN"])) # TODO Add Dry Run -- for each upf source the cfg-sys as a POpen if args.dry_run: @@ -334,6 +410,7 @@ def run(args): else: run_upfs(upfs, cfg) + if __name__ == "__main__": args = parse_arguments() run(args) diff --git a/workflows/cp-leaveout/py/tests/.gitignore b/workflows/cp-leaveout/py/tests/.gitignore index 91273a57..7634d2a9 100644 --- a/workflows/cp-leaveout/py/tests/.gitignore +++ b/workflows/cp-leaveout/py/tests/.gitignore @@ -1 +1 @@ -test_out/ \ No newline at end of file +test_out/ diff --git a/workflows/cp-leaveout/py/tests/test_run_chained.py b/workflows/cp-leaveout/py/tests/test_run_chained.py index 79518cf7..bc343026 100644 --- a/workflows/cp-leaveout/py/tests/test_run_chained.py +++ b/workflows/cp-leaveout/py/tests/test_run_chained.py @@ -1,27 +1,29 @@ # Run with python -m unittest tests.test_run_chained from parent directory - -import unittest import os +import unittest import run_chained + class RunChainedTests(unittest.TestCase): def test_root_nodes(self): - root_node = '1' + root_node = "1" first_stage = 1 n_nodes = 4 - root_nodes = run_chained.compute_parent_nodes(root_node, first_stage, n_nodes) - self.assertEqual(['1'], root_nodes) + root_nodes = run_chained.compute_parent_nodes(root_node, first_stage, + n_nodes) + self.assertEqual(["1"], root_nodes) first_stage = 3 n_nodes = 4 - root_nodes = run_chained.compute_parent_nodes(root_node, first_stage, n_nodes) + root_nodes = run_chained.compute_parent_nodes(root_node, first_stage, + n_nodes) self.assertEqual(16, len(root_nodes)) for a in range(1, 5): for b in range(1, 5): - self.assertTrue('1.{}.{}'.format(a, b) in root_nodes) + self.assertTrue("1.{}.{}".format(a, b) in root_nodes) def read_lines(self, fname): with open(fname) as f_in: @@ -30,37 +32,45 @@ def read_lines(self, fname): def test_upfs(self): - if os.path.exists('./tests/test_out/test_upf_s1_upf.txt'): - os.remove('./tests/test_out/test_upf_s1_upf.txt') + if os.path.exists("./tests/test_out/test_upf_s1_upf.txt"): + os.remove("./tests/test_out/test_upf_s1_upf.txt") - args = {'upf_directory' : 
'./tests/test_out', 'first_stage' : 1, 'stages' : 1} + args = { + "upf_directory": "./tests/test_out", + "first_stage": 1, + "stages": 1 + } cfg = run_chained.Config(args) root_nodes = run_chained.compute_parent_nodes(1, 1, 4) - run_chained.generate_upfs('test_upf', cfg, root_nodes, 4) - vals = self.read_lines('./tests/test_out/test_upf_s1_upf.txt') - self.assertEqual(['1.1', '1.2', '1.3', '1.4'], vals) + run_chained.generate_upfs("test_upf", cfg, root_nodes, 4) + vals = self.read_lines("./tests/test_out/test_upf_s1_upf.txt") + self.assertEqual(["1.1", "1.2", "1.3", "1.4"], vals) - if os.path.exists('./tests/test_out/test_upf_s2_upf.txt'): - os.remove('./tests/test_out/test_upf_s2_upf.txt') - os.remove('./tests/test_out/test_upf_s3_upf.txt') + if os.path.exists("./tests/test_out/test_upf_s2_upf.txt"): + os.remove("./tests/test_out/test_upf_s2_upf.txt") + os.remove("./tests/test_out/test_upf_s3_upf.txt") - args = {'upf_directory' : './tests/test_out', 'first_stage' : 2, 'stages' : 2} + args = { + "upf_directory": "./tests/test_out", + "first_stage": 2, + "stages": 2 + } cfg = run_chained.Config(args) root_nodes = run_chained.compute_parent_nodes(1, 2, 4) - upfs, runs_per_stage = run_chained.generate_upfs('test_upf', cfg, root_nodes, 4) + upfs, runs_per_stage = run_chained.generate_upfs( + "test_upf", cfg, root_nodes, 4) vals = self.read_lines(upfs[0]) self.assertEqual(16, len(vals)) self.assertEqual(16, runs_per_stage[0]) for a in range(1, 5): for b in range(1, 5): - self.assertTrue('1.{}.{}'.format(a, b) in vals) - + self.assertTrue("1.{}.{}".format(a, b) in vals) + vals = self.read_lines(upfs[1]) self.assertEqual(64, len(vals)) self.assertEqual(64, runs_per_stage[1]) for a in range(1, 5): for b in range(1, 5): for c in range(1, 5): - self.assertTrue('1.{}.{}.{}'.format(a, b, c) in vals) - \ No newline at end of file + self.assertTrue("1.{}.{}.{}".format(a, b, c) in vals) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index b25afd94..24ada79d 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -1,4 +1,3 @@ - # NODE PY # The training node information as stored in the logs @@ -22,12 +21,12 @@ def __init__(self, id=None, logger=None): # Number of training steps performed self.steps = 0 # Various error metrics: - self.loss = None + self.loss = None self.val_loss = None - self.mse = None - self.mae = None - self.r2 = None - self.corr = None + self.mse = None + self.mae = None + self.r2 = None + self.corr = None # Differences wrt parent (lower is better) self.loss_delta = None self.val_loss_delta = None @@ -36,11 +35,11 @@ def __init__(self, id=None, logger=None): # Epochs prescribed by the workflow self.epochs_planned = None # Epochs actually run (consider early stopping) - self.epochs_actual = 0 + self.epochs_actual = 0 # Epochs cumulative: include parents' epochs (CP weight-sharing) - self.epochs_cumul = None + self.epochs_cumul = None self.date_start = None - self.date_stop = None + self.date_stop = None # Time to build dataframe self.build_df = None # Time to load initial weights @@ -63,7 +62,7 @@ def __init__(self, id=None, logger=None): def set_id(self, id, logger=None): self.id = id - self.stage = (len(self.id) - 1 ) // 2 + self.stage = (len(self.id) - 1) // 2 self.debug(logger, "SET ID: " + id) def new_segment(self): @@ -83,39 +82,45 @@ def __str__(self): special = " INCOMPLETE!" if self.stopped_early: special = " EARLY STOP!" 
- return "Node [%s]: %s (epochs=%i/%s, loss=%s, val_loss=%s)%s" % \ - (Node.maybe_str_integer(self.stage), - self.id, - self.epochs_actual, - Node.maybe_str_integer(self.epochs_planned), - Node.maybe_str_float(self.loss, "%0.6f"), - Node.maybe_str_float(self.val_loss, "%0.6f"), - special) + return "Node [%s]: %s (epochs=%i/%s, loss=%s, val_loss=%s)%s" % ( + Node.maybe_str_integer(self.stage), + self.id, + self.epochs_actual, + Node.maybe_str_integer(self.epochs_planned), + Node.maybe_str_float(self.loss, "%0.6f"), + Node.maybe_str_float(self.val_loss, "%0.6f"), + special, + ) def str_table(self): - ''' Like str() but uses fixed-width fields ''' + """Like str() but uses fixed-width fields.""" special = "" if not self.complete: special = " INCOMPLETE!" if self.stopped_early: special = " EARLY STOP!" - return "%-13s : %i : %2i / %2i : %s - %s : %s : %s" % \ - (self.id, self.stage, - self.epochs_actual, self.epochs_planned, - self.date_start, self.date_stop, - self.str_errors(), - special) + return "%-13s : %i : %2i / %2i : %s - %s : %s : %s" % ( + self.id, + self.stage, + self.epochs_actual, + self.epochs_planned, + self.date_start, + self.date_stop, + self.str_errors(), + special, + ) def str_errors(self): - ''' Return errors as big string ''' + """Return errors as big string.""" fmt = "%0.6f" - s = ("loss: %s vl: %s mse: %s mae: %s r2: %s corr: %s") % \ - (Node.maybe_str_float(self.loss, fmt), - Node.maybe_str_float(self.val_loss, fmt), - Node.maybe_str_float(self.mse, fmt), - Node.maybe_str_float(self.mae, fmt), - Node.maybe_str_float(self.r2, fmt), - Node.maybe_str_float(self.corr, fmt)) + s = ("loss: %s vl: %s mse: %s mae: %s r2: %s corr: %s") % ( + Node.maybe_str_float(self.loss, fmt), + Node.maybe_str_float(self.val_loss, fmt), + Node.maybe_str_float(self.mse, fmt), + Node.maybe_str_float(self.mae, fmt), + Node.maybe_str_float(self.r2, fmt), + Node.maybe_str_float(self.corr, fmt), + ) return s def maybe_str_integer(i): @@ -179,8 +184,7 @@ def parse_date_stop(self, line, logger=None): if self.epochs_planned is None: self.debug(logger, "STOP : epochs_planned=None") return - if self.epochs_actual == self.epochs_planned or \ - self.stopped_early: + if self.epochs_actual == self.epochs_planned or self.stopped_early: self.complete = True self.debug(logger, "COMPLETE") @@ -194,46 +198,45 @@ def parse_training_done(self, line, logger=None): td = 0 while tokens[td] != Node.training_done: td = td + 1 - stepii = tokens[td-1].split("/") + stepii = tokens[td - 1].split("/") self.steps += int(stepii[0]) - time_s = tokens[td+2] # e.g., "321s" + time_s = tokens[td + 2] # e.g., "321s" self.time += int(time_s[0:-1]) # Always collect losses: early stopping could happen: - self.loss = float(tokens[td+5]) - self.val_loss = float(tokens[td+14]) + self.loss = float(tokens[td + 5]) + self.val_loss = float(tokens[td + 14]) except Exception as e: self.bad_line(line) - raise(e) + raise (e) def parse_val_data(self, fp): - """ - fp is the file pointer to save/python.log - If val data is not found, node.val_data will remain None - """ + """fp is the file pointer to save/python.log If val data is not found, + node.val_data will remain None.""" marker = "val data = " marker_length = len(marker) while True: line = fp.readline() - if line == "": break + if line == "": + break index = line.find("val data =") - if index == -1: continue - tail = line[index+marker_length:] + if index == -1: + continue + tail = line[index + marker_length:] comma = tail.find(",") value_string = tail[:comma] self.val_data = int(value_string) 
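# The scan pattern used by parse_val_data() above -- read save/python.log line
# by line, look for the "val data = " marker, and convert the token before the
# next comma -- can be exercised on its own. A minimal sketch, assuming a
# hypothetical log path and the log line format quoted in the EXAMPLES block at
# the end of this file ("... train data = 5265, val data = 1400, test data = 0"):

def find_val_data(log_path):
    """Return the 'val data' count from a python.log, or None if absent."""
    marker = "val data = "
    with open(log_path) as fp:
        for line in fp:
            index = line.find(marker)
            if index == -1:
                continue
            tail = line[index + len(marker):]
            return int(tail[:tail.find(",")])
    return None  # marker never seen; Node.val_data stays None in that case

# e.g. find_val_data("experiments/X001/run/1.2.3/save/python.log") -> 1400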
def parse_error_data(self, fp): - """ - fp is the file pointer to save/python.log - If lines are not found, node.mse, etc., will remain None - """ + """fp is the file pointer to save/python.log If lines are not found, + node.mse, etc., will remain None.""" marker = "Comparing y_true " # The marker is just after the date: # We search this way for speed. date_len = len("YYYY-MM-DD HH:MM:SS ") # trailing space while True: line = fp.readline() - if line == "": break + if line == "": + break if line.startswith(marker, date_len): line = fp.readline() tokens = check_token(line, 2, "mse:") @@ -271,11 +274,12 @@ def trace(self, logger, message): if logger is None or not self.verbose: return import logging - logger.log(level=logging.DEBUG-5, + + logger.log(level=logging.DEBUG - 5, msg=("NODE: [%s] %s" % (self.id, message))) def get_time_cumul(self, nodes): - ''' Time cumulative including parents' time ''' + """Time cumulative including parents' time.""" parent = self.parent() if parent is None: return self.time @@ -288,7 +292,7 @@ def get_segments(self): return total def get_epochs_cumul(self, nodes): - ''' Epochs cumulative including parents' epochs ''' + """Epochs cumulative including parents' epochs.""" if self.epochs_cumul is not None: return self.epochs_cumul # Initialize: @@ -301,47 +305,44 @@ def get_epochs_cumul(self, nodes): def check_token(line, index, token): - ''' Assert that token is in line at given index ''' + """Assert that token is in line at given index.""" tokens = line.split() if tokens[index] != token: - raise Exception(("could not find token: '%s'\n" + - "in line: '%s'") % (token, line)) + raise Exception( + ("could not find token: '%s'\n" + "in line: '%s'") % (token, line)) return tokens def check(condition, message): - ''' Check condition or raise Exception with given message ''' + """Check condition or raise Exception with given message.""" if not condition: raise Exception(message) -''' -EXAMPLES: - -__init__() +# EXAMPLES: -2019-12-14 09:46:32 MODEL RUNNER DEBUG node = 1.4.2.1 +# __init__() -parse_epochs() ==> self.epochs_planned +# 2019-12-14 09:46:32 MODEL RUNNER DEBUG node = 1.4.2.1 -2019-12-14 09:46:32 MODEL RUNNER DEBUG epochs = 5 +# parse_epochs() ==> self.epochs_planned -parse_epoch_status() (from Keras) +# 2019-12-14 09:46:32 MODEL RUNNER DEBUG epochs = 5 -Epoch 29/50 +# parse_epoch_status() (from Keras) -parse_val_data() ==> self.val_data +# Epoch 29/50 -2020-04-15 13:45:41 CV fold 0: train data = 5265, val data = 1400, test data = 0 +# parse_val_data() ==> self.val_data -stop_early() +# 2020-04-15 13:45:41 CV fold 0: train data = 5265, val data = 1400, test data = 0 -Epoch 00004: early stopping +# stop_early() -training_done() +# Epoch 00004: early stopping -16092/16092 [==============================] - 315s 20ms/step - loss: 0.0065 - mae: 0.0565 - r2: -0.6208 - val_loss: 0.0139 - val_mae: 0.0575 - val_r2: -0.3959 +# training_done() -==> self.epochs_actual, self.val_loss, self.time, self.complete +# 16092/16092 [==============================] - 315s 20ms/step - loss: 0.0065 - mae: 0.0565 - r2: -0.6208 - val_loss: 0.0139 - val_mae: 0.0575 - val_r2: -0.3959 -''' +# ==> self.epochs_actual, self.val_loss, self.time, self.complete diff --git a/workflows/cp-leaveout/scripts/avg-stage.py b/workflows/cp-leaveout/scripts/avg-stage.py index 1ad076b8..abeb70da 100644 --- a/workflows/cp-leaveout/scripts/avg-stage.py +++ b/workflows/cp-leaveout/scripts/avg-stage.py @@ -1,21 +1,22 @@ - # AVG STAGE PY -import argparse, os, pickle, statistics +import argparse +import os 
+import pickle +import statistics from utils import fail STAGE_ANY = 0 parser = argparse.ArgumentParser(description="Finds loss increases.") -parser.add_argument("directory", - help="The experiment directory (EXPID)") -parser.add_argument("--filename", "-f", +parser.add_argument("directory", help="The experiment directory (EXPID)") +parser.add_argument("--filename", + "-f", default="node-info", help="Change the node pkl file name") args = parser.parse_args() - node_pkl = args.directory + "/" + args.filename + ".pkl" try: @@ -31,8 +32,8 @@ total = 0 # stages = { 1:[], 2:[], 3:[], 4:[], 5:[] } # epochs = { 1:[], 2:[], 3:[], 4:[], 5:[] } -times = { 1:[], 2:[], 3:[], 4:[], 5:[] } -vlosses = { 1:[], 2:[], 3:[], 4:[], 5:[] } +times = {1: [], 2: [], 3: [], 4: [], 5: []} +vlosses = {1: [], 2: [], 3: [], 4: [], 5: []} for node_id in data.keys(): node = data[node_id] @@ -40,12 +41,11 @@ continue # stages[node.stage].append(node.time) # epochs[node.stage].append(node.epochs_actual) - times[node.stage].append(node.get_segments()/node.epochs_actual) + times[node.stage].append(node.get_segments() / node.epochs_actual) vlosses[node.stage].append(node.val_loss) if node.stage == 3: - print("%s %0.2f %i" % (node.id, - node.get_segments(), - node.epochs_actual)) + print("%s %0.2f %i" % + (node.id, node.get_segments(), node.epochs_actual)) with open(args.directory + "/times.data", "w") as fp: for stage in times.keys(): diff --git a/workflows/cp-leaveout/scripts/avg-utils.py b/workflows/cp-leaveout/scripts/avg-utils.py index b655944a..a0739b65 100644 --- a/workflows/cp-leaveout/scripts/avg-utils.py +++ b/workflows/cp-leaveout/scripts/avg-utils.py @@ -1,4 +1,3 @@ - import sys import numpy diff --git a/workflows/cp-leaveout/scripts/check-db-pkl.py b/workflows/cp-leaveout/scripts/check-db-pkl.py index ef1ddfaf..cac64bbb 100644 --- a/workflows/cp-leaveout/scripts/check-db-pkl.py +++ b/workflows/cp-leaveout/scripts/check-db-pkl.py @@ -1,26 +1,26 @@ - # CHECK DB PKL PY # WIP -import argparse, os, pickle, sys - +import argparse +import os +import pickle import sqlite3 +import sys from sqlite3 import Error as db_Error from Node import Node from utils import abort -parser = argparse.ArgumentParser(description='Parse all log files') -parser.add_argument('directory', - help='The experiment directory (EXPID)') +parser = argparse.ArgumentParser(description="Parse all log files") +parser.add_argument("directory", help="The experiment directory (EXPID)") args = parser.parse_args() node_pkl = args.directory + "/node-info.pkl" -db_file = args.directory + "/cplo.db" +db_file = args.directory + "/cplo.db" -try: - with open(node_pkl, 'rb') as fp: +try: + with open(node_pkl, "rb") as fp: data = pickle.load(fp) except IOError as e: abort(e, os.EX_IOERR, "Could not load pickle: " + node_pkl) @@ -37,7 +37,6 @@ if d == None: break print(str(d[0])) - cursor.close() conn.close() diff --git a/workflows/cp-leaveout/scripts/compare-errors.py b/workflows/cp-leaveout/scripts/compare-errors.py index 29ac45a0..3d612c9e 100644 --- a/workflows/cp-leaveout/scripts/compare-errors.py +++ b/workflows/cp-leaveout/scripts/compare-errors.py @@ -1,4 +1,3 @@ - # COMPARE ERRORS PY # Input: Provide two experiment DIRECTORIES and OUTPUT file @@ -8,17 +7,15 @@ # Could easily be updated to pull out only one error stat # (see commented code) -import argparse, pickle +import argparse +import pickle parser = argparse.ArgumentParser(description="Parse all log files") -parser.add_argument("directory1", - help="The 1st experiment directory (EXPID)") 
-parser.add_argument("directory2", - help="The 2nd experiment directory (EXPID)") +parser.add_argument("directory1", help="The 1st experiment directory (EXPID)") +parser.add_argument("directory2", help="The 2nd experiment directory (EXPID)") # parser.add_argument("error", # help="The error type to compare") -parser.add_argument("output", - help="The output file") +parser.add_argument("output", help="The output file") args = parser.parse_args() @@ -40,15 +37,17 @@ nodes_2 = pickle.load(fp) # print("%i %i" % (len(nodes_1), len(nodes_2))) + def get_errors(node): return "%f %f %f %f" % (node.mse, node.mae, node.r2, node.corr) + # for node_id in nodes_1: # print(node_id) # exit(1) missing = 0 -count = 0 +count = 0 with open(args.output, "w") as fp: for node_id in nodes_2: if node_id not in nodes_1: @@ -60,10 +59,9 @@ def get_errors(node): errors_1 = get_errors(nodes_1[node_id]) epochs_2 = nodes_2[node_id].get_epochs_cumul(nodes_2) errors_2 = get_errors(nodes_2[node_id]) - fp.write("%2i %s %3i %s %3i %s\n" % (count, node_id, - epochs_1, errors_1, - epochs_2, errors_2)) + fp.write("%2i %s %3i %s %3i %s\n" % + (count, node_id, epochs_1, errors_1, epochs_2, errors_2)) print("compared: %2i" % count) print("missing: %2i" % missing) -print("wrote: %s" % args.output) +print("wrote: %s" % args.output) diff --git a/workflows/cp-leaveout/scripts/compare-losses.py b/workflows/cp-leaveout/scripts/compare-losses.py index 37e6fd52..78294b0a 100644 --- a/workflows/cp-leaveout/scripts/compare-losses.py +++ b/workflows/cp-leaveout/scripts/compare-losses.py @@ -1,16 +1,14 @@ - # COMPARE LOSSES PY # Input: Provide two experiment directories # Output: Stream of NODE_ID LOSS1 LOSS2 -import argparse, pickle +import argparse +import pickle -parser = argparse.ArgumentParser(description='Parse all log files') -parser.add_argument('directory1', - help='The 1st experiment directory (EXPID)') -parser.add_argument('directory2', - help='The 2nd experiment directory (EXPID)') +parser = argparse.ArgumentParser(description="Parse all log files") +parser.add_argument("directory1", help="The 1st experiment directory (EXPID)") +parser.add_argument("directory2", help="The 2nd experiment directory (EXPID)") args = parser.parse_args() diff --git a/workflows/cp-leaveout/scripts/compute-node-count.py b/workflows/cp-leaveout/scripts/compute-node-count.py index 2a7ff86d..1379f4dc 100644 --- a/workflows/cp-leaveout/scripts/compute-node-count.py +++ b/workflows/cp-leaveout/scripts/compute-node-count.py @@ -1,4 +1,3 @@ - # COMPUTE NODE COUNT PY # Simply calculate the node count @@ -7,7 +6,7 @@ S = 5 total = 0 -current = 1 # Number of nodes in current stage +current = 1 # Number of nodes in current stage for stage in range(0, S): current *= 4 print("%i: current: %4i" % (stage, current)) diff --git a/workflows/cp-leaveout/scripts/data-size.py b/workflows/cp-leaveout/scripts/data-size.py index 066ba003..e7235096 100644 --- a/workflows/cp-leaveout/scripts/data-size.py +++ b/workflows/cp-leaveout/scripts/data-size.py @@ -1,15 +1,16 @@ - # DATA SIZE PY # Get the training data size from the file -import argparse, logging, os, sys -import pandas as pd +import argparse +import logging +import os +import sys +import pandas as pd from utils import fail parser = argparse.ArgumentParser(description="Extract the data size") -parser.add_argument("input", - help="The training file") +parser.add_argument("input", help="The training file") args = parser.parse_args() print("data-size.py: opening '%s' ..." 
% args.input) @@ -32,7 +33,7 @@ clms = df_x_train_0.columns print(clms) for clm in clms: - print(df_x_train_0.at[2,clm]) + print(df_x_train_0.at[2, clm]) # print(df_x_train_1.columns) store.close() @@ -61,14 +62,13 @@ print("data-size: OK.") - # total size: (529940, 6215) +# total size: (529940, 6215) # store = pd.HDFStore(args.input, "r", complevel=9, complib="blosc:snappy") # print(str(store)) # print(store.get("y_val")) - # f = h5py.File(args.file, "r") # # print(f.name) diff --git a/workflows/cp-leaveout/scripts/describe-node.py b/workflows/cp-leaveout/scripts/describe-node.py index f79ea7a2..e1a2d2d7 100755 --- a/workflows/cp-leaveout/scripts/describe-node.py +++ b/workflows/cp-leaveout/scripts/describe-node.py @@ -3,11 +3,12 @@ # DESCRIBE NODE PY # -import argparse, json +import argparse +import json parser = argparse.ArgumentParser() -parser.add_argument('plan', type=str, help='Plan data file') -parser.add_argument('node', type=str, help='The node e.g. "1.2.3"') +parser.add_argument("plan", type=str, help="Plan data file") +parser.add_argument("node", type=str, help='The node e.g. "1.2.3"') args = parser.parse_args() try: diff --git a/workflows/cp-leaveout/scripts/epoch-time.py b/workflows/cp-leaveout/scripts/epoch-time.py index 401abf08..1021f8b2 100644 --- a/workflows/cp-leaveout/scripts/epoch-time.py +++ b/workflows/cp-leaveout/scripts/epoch-time.py @@ -1,21 +1,20 @@ - # EPOCH TIME PY # See epoch-time.sh -import datetime, sys - +import datetime +import sys # Main data structure: # map from stage number to list of epoch times in seconds stages = {} -for stage in range(1, 6+1): +for stage in range(1, 6 + 1): stages[stage] = [] # Files processed: progress = 0 -total = 0 +total = 0 -node_current = "NONE" +node_current = "NONE" stage_current = -1 start_current = None @@ -23,13 +22,15 @@ line = sys.stdin.readline() - if len(line) == 0: break # EOF - if len(line) == 1: continue # Blank line + if len(line) == 0: + break # EOF + if len(line) == 1: + continue # Blank line tokens = line.split() if tokens[0] == "epoch-time:": if tokens[1] == "node": - node_current = tokens[2] + node_current = tokens[2] stage_current = int(len(node_current) / 2) start_current = None # print("node: " + node_current) @@ -38,7 +39,7 @@ elif tokens[1] == "total": total = int(tokens[2]) else: - assert(False) + assert False continue if tokens[2] == "UNO" and tokens[3] == "START": @@ -52,13 +53,13 @@ start_current = dt continue start = start_current.timestamp() - stop = dt .timestamp() + stop = dt.timestamp() duration = stop - start # print("epoch complete: " + str(duration)) start_current = dt stages[stage_current].append(duration) -for stage in range(1, 6+1): +for stage in range(1, 6 + 1): n = len(stages[stage]) if n == 0: avg = -1 diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh index 1878ad6a..c3ae51e8 100755 --- a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh @@ -25,7 +25,7 @@ fi EXTRACT_HOLDOUT_ERRORS_AWK=$THIS/extract-holdout-errors.awk # Missing python.logs (usually due to no data): -MISSING=0 +MISSING=0 NODES=( $( ls $DIR/run ) ) # set -x echo "NODES: ${#NODES[@]}" diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.test b/workflows/cp-leaveout/scripts/extract-holdout-errors.test index ceae4f3e..c7c1c637 100644 --- a/workflows/cp-leaveout/scripts/extract-holdout-errors.test +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.test @@ -7,4 
+7,3 @@ 2020-07-07 14:38:50 r2: 0.7352 2020-07-07 14:38:50 corr: 0.8590 2020-07-07 14:40:24 Cache parameter file does not exist: cache/top6_auc.params.json - diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 965e7113..ac1a51d4 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -1,4 +1,3 @@ - # EXTRACT NODE INFO PY # Input: Provide an experiment directory @@ -7,14 +6,16 @@ # Use print-node-info to print the node info # See Node.py for the data structure -import argparse, logging, os, pickle +import argparse +import logging +import os +import pickle -from utils import fail from Node import Node +from utils import fail -parser = argparse.ArgumentParser(description='Parse all log files') -parser.add_argument('directory', - help='The experiment directory (EXPID)') +parser = argparse.ArgumentParser(description="Parse all log files") +parser.add_argument("directory", help="The experiment directory (EXPID)") args = parser.parse_args() @@ -52,8 +53,8 @@ def parse_logs(log_files): total = len(log_files) index = 0 for log_file in log_files: - progress = "%4i/%4i (%2.f%%)" % \ - (index, total, 100.0*index/total) + progress = "%4i/%4i (%2.f%%)" % (index, total, + 100.0 * index / total) logger.info("Opening: %12s %s" % (progress, log_file)) with open(log_file) as fp: parse_log(fp, nodes) @@ -73,7 +74,8 @@ def parse_log(log_fp, nodes): while True: line = log_fp.readline() # print(line) - if line == "": break + if line == "": + break if "DONE: run_id" in line: # This is also a MODEL RUNNER line, # but could be DEBUG or INFO @@ -147,7 +149,7 @@ def parse_build_df(line, logger=None): def trace(message): - logger.log(level=logging.DEBUG-5, msg=message) + logger.log(level=logging.DEBUG - 5, msg=message) # def find_val_data(node): diff --git a/workflows/cp-leaveout/scripts/find-loss-increases.py b/workflows/cp-leaveout/scripts/find-loss-increases.py index 73b9d761..c7072500 100644 --- a/workflows/cp-leaveout/scripts/find-loss-increases.py +++ b/workflows/cp-leaveout/scripts/find-loss-increases.py @@ -1,4 +1,3 @@ - # FIND LOSS INCREASES PY # Brettin email 2019-12-18: @@ -6,39 +5,45 @@ # that when added to the training samples, # cause the performance of the node/model to decrease. 
-import argparse, os, pickle, sys +import argparse +import os +import pickle +import sys from Node import Node from utils import append, avg, fail STAGE_ANY = 0 -parser = argparse.ArgumentParser(description='Finds loss increases.') -parser.add_argument('directory', - help='The experiment directory (EXPID)') -parser.add_argument('--filename', '-f', - default='node-info', - help='Change the node pkl file name') -parser.add_argument('--stage', '-S', +parser = argparse.ArgumentParser(description="Finds loss increases.") +parser.add_argument("directory", help="The experiment directory (EXPID)") +parser.add_argument("--filename", + "-f", + default="node-info", + help="Change the node pkl file name") +parser.add_argument("--stage", + "-S", type=int, default=STAGE_ANY, - help='Select the stage') -parser.add_argument('--token', '-T', default=None, - help='User-readable naming token') + help="Select the stage") +parser.add_argument("--token", + "-T", + default=None, + help="User-readable naming token") args = parser.parse_args() if args.token == None: args.token = os.path.basename(args.directory) -node_pkl = args.directory + '/' + args.filename + '.pkl' +node_pkl = args.directory + "/" + args.filename + ".pkl" try: - with open(node_pkl, 'rb') as fp: + with open(node_pkl, "rb") as fp: # This is a dict ('node_id' -> Node) data = pickle.load(fp) except IOError as e: - fail(e, os.EX_IOERR, 'Could not read: ' + node_pkl) + fail(e, os.EX_IOERR, "Could not read: " + node_pkl) print("total nodes: %i" % len(data)) @@ -46,7 +51,7 @@ # !! Updated upstream node_loss_worst = Node("WORST") node_loss_worst.loss = 0 -node_loss_best = Node("BEST") +node_loss_best = Node("BEST") node_loss_best.loss = 1000 # List of Nodes where loss increased: @@ -57,43 +62,46 @@ leaves = 0 for node_id in data.keys(): # print("node: " + node_id) - parent_id = node_id[0:-2] # '1.2.3' -> '1.2' - if len(parent_id) == 1: # stage=1 + parent_id = node_id[0:-2] # '1.2.3' -> '1.2' + if len(parent_id) == 1: # stage=1 continue if parent_id not in data: print("parent not found.") continue current = data[node_id] - parent = data[parent_id] - if current.stage == 5: leaves += 1 + parent = data[parent_id] + if current.stage == 5: + leaves += 1 if not (args.stage == STAGE_ANY or args.stage == current.stage): continue current.loss_delta = current.loss - parent.loss if current.loss_delta > 0: increases_loss.append(current) - if current.val_loss > node_loss_worst.loss: node_worst = current - if current.val_loss < node_loss_best.loss: node_best = current + if current.val_loss > node_loss_worst.loss: + node_worst = current + if current.val_loss < node_loss_best.loss: + node_best = current total += 1 fraction = 100.0 * len(increases_loss) / total -print('increases_loss/total = %i / %i (%02.f%%)' % \ +print("increases_loss/total = %i / %i (%02.f%%)" % (len(increases_loss), total, fraction)) # Artificial nodes for comparison: node_vl_worst = Node("WORST") node_vl_worst.val_loss = 0 -node_vl_best = Node("BEST") +node_vl_best = Node("BEST") node_vl_best.val_loss = 1000 # == # val_loss: node_worst_val_loss = Node("WORST VAL_LOSS") node_worst_val_loss.val_loss = 0 -node_best_val_loss = Node("BEST VAL_LOSS") +node_best_val_loss = Node("BEST VAL_LOSS") node_best_val_loss.val_loss = 1000 # loss: node_worst_loss = Node("WORST LOSS") node_worst_loss.loss = 0 -node_best_loss = Node("BEST LOSS") +node_best_loss = Node("BEST LOSS") node_best_loss.loss = 1000 # !! Stashed changes @@ -101,7 +109,7 @@ print("STAGE: %i" % args.stage) # !! 
Updated upstream -leaves = 0 # stage 5 Nodes +leaves = 0 # stage 5 Nodes # List of Nodes where val_loss increased: increases_vl = [] @@ -109,23 +117,28 @@ total = 0 for node_id in data.keys(): # print("node: " + node_id) - parent_id = node_id[0:-2] # '1.2.3' -> '1.2' - if len(parent_id) == 1: # stage=1 + parent_id = node_id[0:-2] # '1.2.3' -> '1.2' + if len(parent_id) == 1: # stage=1 continue if parent_id not in data: print("parent not found.") continue current = data[node_id] - parent = data[parent_id] - if current.stage == 5: leaves += 1 + parent = data[parent_id] + if current.stage == 5: + leaves += 1 if not (args.stage == STAGE_ANY or args.stage == current.stage): continue current.val_loss_delta = current.val_loss - parent.val_loss if current.val_loss_delta > 0: increases_vl.append(current) - if current.val_loss > node_vl_worst.val_loss: node_worst = current - if current.val_loss < node_vl_best.val_loss: node_best = current + if current.val_loss > node_vl_worst.val_loss: + node_worst = current + if current.val_loss < node_vl_best.val_loss: + node_best = current total += 1 + + # == def get_increases(): # List of Nodes where loss increased: @@ -136,7 +149,7 @@ def get_increases(): increases_val_loss = [] global node_worst_loss, node_worst_val_loss - global node_best_loss, node_best_val_loss + global node_best_loss, node_best_val_loss # count of Nodes: total = 0 @@ -146,16 +159,17 @@ def get_increases(): parents_missing = 0 for node_id in data.keys(): # print("node: " + node_id) - parent_id = node_id[0:-2] # '1.2.3' -> '1.2' - if len(parent_id) == 1: # stage=1 + parent_id = node_id[0:-2] # '1.2.3' -> '1.2' + if len(parent_id) == 1: # stage=1 continue if parent_id not in data: # print("parent not found.") parents_missing += 1 continue current = data[node_id] - parent = data[parent_id] - if current.stage == 5: leaves += 1 + parent = data[parent_id] + if current.stage == 5: + leaves += 1 if not (args.stage == STAGE_ANY or args.stage == current.stage): continue current.val_loss_delta = current.val_loss - parent.val_loss @@ -166,18 +180,19 @@ def get_increases(): if current.loss_delta > 0: increases_loss.append(current) # Update best/worst: - if current.loss > node_worst_loss.loss: + if current.loss > node_worst_loss.loss: node_worst_loss = current - if current.loss < node_best_loss.loss: - node_best_loss = current + if current.loss < node_best_loss.loss: + node_best_loss = current if current.val_loss > node_worst_val_loss.val_loss: node_worst_val_loss = current if current.val_loss < node_best_val_loss.val_loss: - node_best_val_loss = current + node_best_val_loss = current total += 1 print("parents_missing: %i" % parents_missing) return total, leaves + # total: count of Nodes # leaves: count of stage 5 Nodes total, leaves = get_increases() @@ -185,47 +200,50 @@ def get_increases(): print("leaves: %i" % leaves) -if total == 0: fail('No matching Nodes found!') +if total == 0: + fail("No matching Nodes found!") # !! 
Updated upstream fraction = 100.0 * len(increases_vl) / total -print('increases_vl/total = %i / %i (%02.f%%)' % \ +print("increases_vl/total = %i / %i (%02.f%%)" % (len(increases_vl), total, fraction)) file_increases_vl = "increases-vl-%s.data" % args.token append(file_increases_vl, "%i %5.1f" % (args.stage, fraction)) # == fraction = 100.0 * len(increases_loss) / total -print('increases_loss/total = %i / %i (%02.f%%)' % \ +print("increases_loss/total = %i / %i (%02.f%%)" % (len(increases_loss), total, fraction)) filename = "increases-loss-%s.data" % args.token append(filename, "%i %5.1f" % (args.stage, fraction)) fraction = 100.0 * len(increases_val_loss) / total -print('increases_val_loss/total = %i / %i (%02.f%%)' % \ +print("increases_val_loss/total = %i / %i (%02.f%%)" % (len(increases_val_loss), total, fraction)) filename = "increases-val_loss-%s.data" % args.token append(filename, "%i %5.1f" % (args.stage, fraction)) # !! Stashed changes -print('worst loss: ' + str(node_worst_loss)) -print('best loss: ' + str(node_best_loss)) -print('worst val_loss: ' + str(node_worst_val_loss)) -print('best val_loss: ' + str(node_best_val_loss)) +print("worst loss: " + str(node_worst_loss)) +print("best loss: " + str(node_best_loss)) +print("worst val_loss: " + str(node_worst_val_loss)) +print("best val_loss: " + str(node_best_val_loss)) exit() -print('DELTAS:') +print("DELTAS:") -increases_loss .sort(key=Node.get_loss_delta) +increases_loss.sort(key=Node.get_loss_delta) increases_val_loss.sort(key=Node.get_val_loss_delta) # stopped_early = 0 # for i in increases: # # print('%f %-14s %r' % (i.val_loss_delta, i.id, i.stopped_early)) # if i.stopped_early: stopped_early += 1 + def print_delta(prefix, node): - print(prefix, str(node), 'delta: %f' % node.val_loss_delta) + print(prefix, str(node), "delta: %f" % node.val_loss_delta) + # worst = increases[-1] # print_delta('worst: ', worst) @@ -262,8 +280,9 @@ def print_delta(prefix, node): # print("avg_increase", str(avg_increase)) # print("avg_val_loss", str(avg_val_loss)) + def report_top_loss_deltas(): - print("%-2s %-12s %-8s %-8s %-8s %-8s" % \ + print("%-2s %-12s %-8s %-8s %-8s %-8s" % ("", "node", "loss", "parent", "delta", "ratio")) increases_loss.sort(key=Node.get_loss_delta, reverse=True) ratios = [] @@ -272,14 +291,15 @@ def report_top_loss_deltas(): parent = data[node.parent()] ratio = node.get_loss_delta() / parent.loss print("%2i %-12s %0.6f %0.6f %0.6f %0.6f" % - (index, node.id, node.loss, parent.loss, - node.get_loss_delta(), ratio)) + (index, node.id, node.loss, parent.loss, node.get_loss_delta(), + ratio)) ratios.append(ratio) index += 1 ratios.sort() + def report_top_val_loss_deltas(increases_val_loss): - print("%-2s %-12s %-8s %-8s %-8s %-8s %-8s" % \ + print("%-2s %-12s %-8s %-8s %-8s %-8s %-8s" % ("", "node", "val_loss", "parent", "delta", "ratio", "val_data")) increases_val_loss.sort(key=Node.get_val_loss_delta, reverse=True) ratios = [] @@ -287,13 +307,20 @@ def report_top_val_loss_deltas(increases_val_loss): for node in increases_val_loss: parent = data[node.parent()] ratio = node.get_val_loss_delta() / parent.loss - print("%2i %-12s %0.6f %0.6f %0.6f %0.6f %8i" % - (index, node.id, node.val_loss, parent.val_loss, - node.get_val_loss_delta(), ratio, node.val_data)) + print("%2i %-12s %0.6f %0.6f %0.6f %0.6f %8i" % ( + index, + node.id, + node.val_loss, + parent.val_loss, + node.get_val_loss_delta(), + ratio, + node.val_data, + )) ratios.append(ratio) index += 1 ratios.sort() + report_top_val_loss_deltas(increases_val_loss) # with 
open(outliers_file, "w") as fp: diff --git a/workflows/cp-leaveout/scripts/leaf-stats.py b/workflows/cp-leaveout/scripts/leaf-stats.py index 7bcc11de..2900e510 100644 --- a/workflows/cp-leaveout/scripts/leaf-stats.py +++ b/workflows/cp-leaveout/scripts/leaf-stats.py @@ -1,34 +1,42 @@ - # LEAF STATS PY import argparse import pandas as pd - import utils -parser = argparse.ArgumentParser(description='Print leaf stats') -parser.add_argument('directory', - help='The experiment directory (EXPID)') -parser.add_argument('list', - help='The list of nodes to process') +parser = argparse.ArgumentParser(description="Print leaf stats") +parser.add_argument("directory", help="The experiment directory (EXPID)") +parser.add_argument("list", help="The list of nodes to process") args = parser.parse_args() # Map from node "1.1.1.1.2.3" to cell line "CCLE.KMS11" nodes = {} -with open(args.list, 'r') as fp: +with open(args.list, "r") as fp: while True: line = fp.readline() - if len(line) == 0: break + if len(line) == 0: + break tokens = line.split() node = tokens[0] cell = tokens[1] nodes[node] = cell -columns = [ "CELL", "NODE", "POINTS", "EPOCHS", "MAE", "R2", "VAL_LOSS", - "EARLY", "HO_MSE", "HO_MAE", "HO_R2" ] +columns = [ + "CELL", + "NODE", + "POINTS", + "EPOCHS", + "MAE", + "R2", + "VAL_LOSS", + "EARLY", + "HO_MSE", + "HO_MAE", + "HO_R2", +] df = pd.DataFrame(columns=columns) @@ -57,9 +65,9 @@ def __init__(self): def run(self, line): tokens = line.split() # Remove trailing bracket or comma: - self.epochs = tokens[ 3][0:-1] - self.mae = tokens[ 7][0:-1] - self.r2 = tokens[ 9][0:-1] + self.epochs = tokens[3][0:-1] + self.mae = tokens[7][0:-1] + self.r2 = tokens[9][0:-1] self.val_loss = tokens[11][0:-1] def reset(self): @@ -125,33 +133,40 @@ def reset(self): matcherPoints = MatcherPoints() -matcherStats = MatcherStats() -matcherEarly = MatcherEarly() +matcherStats = MatcherStats() +matcherEarly = MatcherEarly() matcherHO_MSE = MatcherHoldoutMSE() matcherHO_MAE = MatcherHoldoutMAE() -matcherHO_R2 = MatcherHoldoutR2() -grepper = utils.Grepper([matcherPoints, matcherStats, matcherEarly, - matcherHO_MSE, matcherHO_MAE, matcherHO_R2]) +matcherHO_R2 = MatcherHoldoutR2() +grepper = utils.Grepper([ + matcherPoints, + matcherStats, + matcherEarly, + matcherHO_MSE, + matcherHO_MAE, + matcherHO_R2, +]) for node in nodes: cell = nodes[node] log = f"{args.directory}/run/{node}/save/python.log" grepper.grep(log) newrow = pd.DataFrame({ - "CELL" : [cell], - "NODE" : [node], - "POINTS" : [matcherPoints.points], - "EPOCHS" : [matcherStats.epochs], - "MAE" : [matcherStats.mae], - "R2" : [matcherStats.r2], - "VAL_LOSS" : [matcherStats.val_loss], - "EARLY" : [matcherEarly.early], - "HO_MSE" : [matcherHO_MSE.ho_mse], - "HO_MAE" : [matcherHO_MAE.ho_mae], - "HO_R2" : [matcherHO_R2 .ho_r2] + "CELL": [cell], + "NODE": [node], + "POINTS": [matcherPoints.points], + "EPOCHS": [matcherStats.epochs], + "MAE": [matcherStats.mae], + "R2": [matcherStats.r2], + "VAL_LOSS": [matcherStats.val_loss], + "EARLY": [matcherEarly.early], + "HO_MSE": [matcherHO_MSE.ho_mse], + "HO_MAE": [matcherHO_MAE.ho_mae], + "HO_R2": [matcherHO_R2.ho_r2], }) df = pd.concat([df, newrow], ignore_index=True) grepper.reset() from tabulate import tabulate -print(tabulate(df, headers='keys', tablefmt='plain')) + +print(tabulate(df, headers="keys", tablefmt="plain")) diff --git a/workflows/cp-leaveout/scripts/list-node-singles.py b/workflows/cp-leaveout/scripts/list-node-singles.py index 5c5bf367..268cfef2 100755 --- a/workflows/cp-leaveout/scripts/list-node-singles.py 
+++ b/workflows/cp-leaveout/scripts/list-node-singles.py @@ -4,10 +4,11 @@ # Extract the nodes from the JSON file with a single cell line # report the node and cell line -import argparse, json +import argparse +import json parser = argparse.ArgumentParser() -parser.add_argument('plan', type=str, help='Plan data file') +parser.add_argument("plan", type=str, help="Plan data file") args = parser.parse_args() try: diff --git a/workflows/cp-leaveout/scripts/list-nodes.py b/workflows/cp-leaveout/scripts/list-nodes.py index 378cfcb1..2d3b85eb 100755 --- a/workflows/cp-leaveout/scripts/list-nodes.py +++ b/workflows/cp-leaveout/scripts/list-nodes.py @@ -3,10 +3,11 @@ # LIST NODES PY # Extract just the nodes from the JSON file for human inspection -import argparse, json +import argparse +import json parser = argparse.ArgumentParser() -parser.add_argument('plan', type=str, help='Plan data file') +parser.add_argument("plan", type=str, help="Plan data file") args = parser.parse_args() try: @@ -17,4 +18,4 @@ exit(1) for k in J.keys(): - print(k) + print(k) diff --git a/workflows/cp-leaveout/scripts/loss-histogram.py b/workflows/cp-leaveout/scripts/loss-histogram.py index 4e558eb3..a8968f78 100644 --- a/workflows/cp-leaveout/scripts/loss-histogram.py +++ b/workflows/cp-leaveout/scripts/loss-histogram.py @@ -1,4 +1,3 @@ - # LOSS HISTOGRAM # usage: python3 scripts/loss-histogram.py < $D/losses.txt diff --git a/workflows/cp-leaveout/scripts/node-times.py b/workflows/cp-leaveout/scripts/node-times.py index d7833de2..3727b062 100644 --- a/workflows/cp-leaveout/scripts/node-times.py +++ b/workflows/cp-leaveout/scripts/node-times.py @@ -3,19 +3,22 @@ # NODE TIMES PY # -import argparse, json, pickle +import argparse +import json +import pickle import Node parser = argparse.ArgumentParser() -parser.add_argument('dir', type=str, - help='The directory with the node-info.pkl') +parser.add_argument("dir", + type=str, + help="The directory with the node-info.pkl") args = parser.parse_args() node_pkl = args.dir + "/" + "node-info.pkl" try: - with open(node_pkl, 'rb') as fp: + with open(node_pkl, "rb") as fp: D = pickle.load(fp) except Exception as e: print("could not read PKL file: %s\n" % node_pkl + str(e)) @@ -31,7 +34,7 @@ node = D[node_id] fmt = "%Y-%m-%d %H:%M:%S" start = datetime.datetime.strptime(node.date_start, fmt).timestamp() - stop = datetime.datetime.strptime(node.date_stop, fmt).timestamp() + stop = datetime.datetime.strptime(node.date_stop, fmt).timestamp() events.append((start, 1)) events.append((stop, -1)) @@ -40,12 +43,14 @@ node_times_data = args.dir + "/node-times.data" load = 0 + def scale(t): offset = 1594305000 - return (t - offset)/3600 + return (t - offset) / 3600 + with open(node_times_data, "w") as fp: - if len(events) > 0: + if len(events) > 0: for event in events: fp.write("%12.1f %i\n" % (scale(event[0]), load)) load = load + event[1] diff --git a/workflows/cp-leaveout/scripts/plot-holdout-errors.py b/workflows/cp-leaveout/scripts/plot-holdout-errors.py index ceec043f..e70d33b0 100644 --- a/workflows/cp-leaveout/scripts/plot-holdout-errors.py +++ b/workflows/cp-leaveout/scripts/plot-holdout-errors.py @@ -1,33 +1,35 @@ - # PLOT HOLDOUT ERRORS PY # Plots holdout error data from distill-holdout-errors.pl -import pandas +import argparse + import matplotlib.pyplot as plt +import pandas + # This was removed from Pandas 1.6: # Cf. 
https://stackoverflow.com/questions/54473018/where-is-pandas-tools # from pandas.tools.plotting import parallel_coordinates from pandas.plotting import parallel_coordinates -import argparse -parser = argparse.ArgumentParser(description='Make holdout errors plot') -parser.add_argument('stages', type=int, help='Number of stages') -parser.add_argument('file_input', help='The input errors TSV file') -parser.add_argument('file_output', help='The output PNG file') +parser = argparse.ArgumentParser(description="Make holdout errors plot") +parser.add_argument("stages", type=int, help="Number of stages") +parser.add_argument("file_input", help="The input errors TSV file") +parser.add_argument("file_output", help="The output PNG file") args = parser.parse_args() # names = [ 'Stage1','Stage2','Stage3','Stage4', 'Stage5', 'CLASS'] names = [] -for i in range(1, args.stages+1): - names.append("Stage"+str(i)) -names.append('CLASS') +for i in range(1, args.stages + 1): + names.append("Stage" + str(i)) +names.append("CLASS") print(str(names)) -cpdata=pandas.read_csv(args.file_input, sep='\t', header=None, names=names) -p = parallel_coordinates(cpdata, class_column="CLASS", +cpdata = pandas.read_csv(args.file_input, sep="\t", header=None, names=names) +p = parallel_coordinates(cpdata, + class_column="CLASS", colormap=plt.get_cmap("Set2")) # fig = p.gcf() diff --git a/workflows/cp-leaveout/scripts/plot_io_times.py b/workflows/cp-leaveout/scripts/plot_io_times.py index 34fd90ef..478306fa 100644 --- a/workflows/cp-leaveout/scripts/plot_io_times.py +++ b/workflows/cp-leaveout/scripts/plot_io_times.py @@ -1,28 +1,28 @@ - # PLOT IO TIMES PY -import argparse, os, pickle, statistics - +import argparse +import os +import pickle +import statistics from utils import fail -parser = argparse.ArgumentParser(description='Plot I/O stats') -parser.add_argument('directory', - help='The experiment directory (EXPID)') +parser = argparse.ArgumentParser(description="Plot I/O stats") +parser.add_argument("directory", help="The experiment directory (EXPID)") args = parser.parse_args() node_pkl = args.directory + "/node-info.pkl" try: - with open(node_pkl, 'rb') as fp: + with open(node_pkl, "rb") as fp: data = pickle.load(fp) except IOError as e: fail(e, os.EX_IOERR, "Could not read: " + node_pkl) -builds = { 1:[], 2:[], 3:[], 4:[], 5:[] } -loads = { 1:[], 2:[], 3:[], 4:[], 5:[] } -writes = { 1:[], 2:[], 3:[], 4:[], 5:[] } +builds = {1: [], 2: [], 3: [], 4: [], 5: []} +loads = {1: [], 2: [], 3: [], 4: [], 5: []} +writes = {1: [], 2: [], 3: [], 4: [], 5: []} # Print the node info! 
for node in data.values(): @@ -43,7 +43,8 @@ with open(args.directory + "/loads.data", "w") as fp: for stage in loads.keys(): - if stage == 1: continue # stage 1 does not do a load + if stage == 1: + continue # stage 1 does not do a load fp.write("%i " % stage) fp.write("%0.3f\n" % statistics.mean(loads[stage])) diff --git a/workflows/cp-leaveout/scripts/print-node-info.py b/workflows/cp-leaveout/scripts/print-node-info.py index 2884fac4..68f0ab02 100644 --- a/workflows/cp-leaveout/scripts/print-node-info.py +++ b/workflows/cp-leaveout/scripts/print-node-info.py @@ -1,21 +1,22 @@ - # PRINT NODE INFO PY -import argparse, os, pickle, sys +import argparse +import os +import pickle +import sys from Node import Node from utils import fail -parser = argparse.ArgumentParser(description='Print Node info stats') -parser.add_argument('directory', - help='The experiment directory (EXPID)') +parser = argparse.ArgumentParser(description="Print Node info stats") +parser.add_argument("directory", help="The experiment directory (EXPID)") args = parser.parse_args() node_pkl = args.directory + "/node-info.pkl" try: - with open(node_pkl, 'rb') as fp: + with open(node_pkl, "rb") as fp: data = pickle.load(fp) except IOError as e: fail(e, os.EX_IOERR, "Could not read: " + node_pkl) @@ -25,11 +26,12 @@ # print(data) # Print the node info! -count = 0 +count = 0 earlies = 0 for node in data.values(): print(node.str_table()) count += 1 - if node.stopped_early: earlies += 1 + if node.stopped_early: + earlies += 1 print("print-node-info: %i/%i runs stopped early." % (count, earlies)) diff --git a/workflows/cp-leaveout/scripts/report_leaves.py b/workflows/cp-leaveout/scripts/report_leaves.py index 08d7d51a..6ca2736a 100644 --- a/workflows/cp-leaveout/scripts/report_leaves.py +++ b/workflows/cp-leaveout/scripts/report_leaves.py @@ -1,27 +1,27 @@ - # REPORT LEAVES PY -import argparse, os, pickle, sys +import argparse +import os +import pickle +import sys from Node import Node from utils import fail -parser = argparse.ArgumentParser(description= - 'Report nodes with no children.') -parser.add_argument('directory', - help='The experiment directory (EXPID)') +parser = argparse.ArgumentParser(description="Report nodes with no children.") +parser.add_argument("directory", help="The experiment directory (EXPID)") args = parser.parse_args() node_pkl = args.directory + "/node-info.pkl" try: - with open(node_pkl, 'rb') as fp: + with open(node_pkl, "rb") as fp: data = pickle.load(fp) except IOError as e: fail(e, os.EX_IOERR, "Could not read: " + node_pkl) -nodes = data.keys() +nodes = data.keys() leaves = data.copy() for node in nodes: diff --git a/workflows/cp-leaveout/scripts/report_stopping.py b/workflows/cp-leaveout/scripts/report_stopping.py index 3d427ec9..20d1f5e2 100644 --- a/workflows/cp-leaveout/scripts/report_stopping.py +++ b/workflows/cp-leaveout/scripts/report_stopping.py @@ -1,27 +1,27 @@ - # REPORT STOPPING PY -import argparse, os, pickle, sys +import argparse +import os +import pickle +import sys from Node import Node -from utils import fail, avg +from utils import avg, fail -parser = argparse.ArgumentParser(description= - 'Report nodes with no children.') -parser.add_argument('directory', - help='The experiment directory (EXPID)') +parser = argparse.ArgumentParser(description="Report nodes with no children.") +parser.add_argument("directory", help="The experiment directory (EXPID)") args = parser.parse_args() node_pkl = args.directory + "/node-info.pkl" try: - with open(node_pkl, 'rb') as fp: + with 
open(node_pkl, "rb") as fp: data = pickle.load(fp) except IOError as e: fail(e, os.EX_IOERR, "Could not read: " + node_pkl) -stages = { 1:[], 2:[], 3:[], 4:[], 5:[], 6:[] } +stages = {1: [], 2: [], 3: [], 4: [], 5: [], 6: []} for key in data: # print(key) @@ -36,6 +36,5 @@ a = avg(L) print("%i: %0.3f" % (i, a)) - # a = st # 1.3.2.4.2.4.1 diff --git a/workflows/cp-leaveout/scripts/stage-avg.py b/workflows/cp-leaveout/scripts/stage-avg.py index ea938dbe..efdf426b 100755 --- a/workflows/cp-leaveout/scripts/stage-avg.py +++ b/workflows/cp-leaveout/scripts/stage-avg.py @@ -28,21 +28,23 @@ stage, run = tokens[0:2] # print(stage, run) offset = 2 - for index in range(0,len(labels)): + for index in range(0, len(labels)): label = labels[index] if stage not in data[label]: data[label][stage] = [] - data[label][stage].append(tokens[offset+index]) + data[label][stage].append(tokens[offset + index]) # Debug dump of all data: # print(data) + def avg(L): s = 0.0 for v in L: s += float(v) return s / len(L) + def mean_confidence_interval(data, confidence=0.95): import numpy as np import scipy.stats @@ -51,10 +53,11 @@ def mean_confidence_interval(data, confidence=0.95): a = 1.0 * np.array(data) n = len(a) m, se = np.mean(a), scipy.stats.sem(a) - h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1) - c = 100.0 * h / m # Interval scaled to mean + h = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1) + c = 100.0 * h / m # Interval scaled to mean return m, h, c + # Average each data[label][stage] and report print("# %-5s %-6s AVG" % ("STAT", "STAGE")) for label in labels: diff --git a/workflows/cp-leaveout/scripts/time-nvm.data b/workflows/cp-leaveout/scripts/time-nvm.data index fbbf8cd3..56c3d494 100644 --- a/workflows/cp-leaveout/scripts/time-nvm.data +++ b/workflows/cp-leaveout/scripts/time-nvm.data @@ -5,4 +5,3 @@ restart/0 2 16.35 restart/1 16 14.83 restart/2 126 13.75 restart/3 254 11.69 - diff --git a/workflows/cp-leaveout/scripts/workflow-stats.py b/workflows/cp-leaveout/scripts/workflow-stats.py index b86b7e83..40b07492 100644 --- a/workflows/cp-leaveout/scripts/workflow-stats.py +++ b/workflows/cp-leaveout/scripts/workflow-stats.py @@ -1,18 +1,20 @@ - # WORKFLOW STATS PY -import argparse, math, os, pickle, sys +import argparse +import math +import os +import pickle +import sys from Node import Node from utils import fail -parser = argparse.ArgumentParser(description='Print workflow total stats') -parser.add_argument('directory', - help='The experiment directory (EXPID)') -parser.add_argument('--percentiles', action='store_true', - help='If given, run percentiles analysis') -parser.add_argument('--token', default=None, - help='User-readable naming token') +parser = argparse.ArgumentParser(description="Print workflow total stats") +parser.add_argument("directory", help="The experiment directory (EXPID)") +parser.add_argument("--percentiles", + action="store_true", + help="If given, run percentiles analysis") +parser.add_argument("--token", default=None, help="User-readable naming token") args = parser.parse_args() @@ -25,18 +27,18 @@ node_pkl = args.directory + "/node-info.pkl" try: - with open(node_pkl, 'rb') as fp: + with open(node_pkl, "rb") as fp: data = pickle.load(fp) except IOError as e: fail(e, os.EX_IOERR, "Could not read: " + node_pkl) # print(data) + class Statter: - ''' - Compute states for some quantity (epochs_actual, stops, val_loss) - by stage - ''' + """Compute states for some quantity (epochs_actual, stops, val_loss) by + stage.""" + def __init__(self, name=None, token=None): 
self.data = {} self.name = name @@ -61,7 +63,8 @@ def percentile(self, stage, percentile): self.data[stage].sort(reverse=True) n = len(self.data[stage]) i = round(percentile * n) - 1 - if i < 0: i = 0 + if i < 0: + i = 0 return self.data[stage][i] def report_avg(self): @@ -81,40 +84,41 @@ def string_avg(self): return result def string_avg_pct(self): - ''' Average as percentage, i.e., x100 ''' + """Average as percentage, i.e., x100.""" keys = list(self.data.keys()) keys.sort() result = "# %s: avg %%\n" % self.name for key in keys: - result += "%i %6.2f\n" % (key, 100*self.avg(key)) + result += "%i %6.2f\n" % (key, 100 * self.avg(key)) return result def string_percentile(self, percentile): keys = list(self.data.keys()) keys.sort() - result = "# %s: %s: percentile %0.2f\n" % \ - (self.token, self.name, percentile) + result = "# %s: %s: percentile %0.2f\n" % (self.token, self.name, + percentile) for key in keys: result += "%i %0.4f\n" % (key, self.percentile(key, percentile)) return result + epochs = Statter("epochs by stage", token=args.token) -stops = Statter("stops by stage", token=args.token) +stops = Statter("stops by stage", token=args.token) losses = Statter("val_loss by stage", token=args.token) -times = Statter("times by stage", token=args.token) -count = 0 # Total Nodes -steps = 0 # Training steps -tm_s = 0.0 # Total training time +times = Statter("times by stage", token=args.token) +count = 0 # Total Nodes +steps = 0 # Training steps +tm_s = 0.0 # Total training time best_val_loss = Node(id="BEST") best_val_loss.val_loss = 1000 for node in data.values(): count += 1 steps += node.steps - tm_s += node.time + tm_s += node.time epochs.add(node.stage, node.epochs_actual) - stops .add(node.stage, node.stopped_early) + stops.add(node.stage, node.stopped_early) losses.add(node.stage, node.val_loss) - times.add(node.stage, node.total_time(data)) + times.add(node.stage, node.total_time(data)) if node.stage == 5 and node.val_loss < best_val_loss.val_loss: best_val_loss = node @@ -134,18 +138,18 @@ def string_percentile(self, percentile): epochs.report_avg() + def do_percentiles(): for percentile in [0.99, 0.75, 0.50, 0.25, 0.10]: report = losses.string_percentile(percentile) - filename = 'percentile-%s-%0.2f.data' % \ - (args.token, percentile) - with open(filename, 'w') as fp: + filename = "percentile-%s-%0.2f.data" % (args.token, percentile) + with open(filename, "w") as fp: fp.write(report) + if args.percentiles: do_percentiles() print("best_val_loss: %s %0.2f hours , %i steps" % - (str(best_val_loss), - best_val_loss.total_time(data)/3600, + (str(best_val_loss), best_val_loss.total_time(data) / 3600, best_val_loss.steps)) diff --git a/workflows/cp-leaveout/swift/cpl-upf-workflow.sh b/workflows/cp-leaveout/swift/cpl-upf-workflow.sh index d6c60622..8eced6d6 100755 --- a/workflows/cp-leaveout/swift/cpl-upf-workflow.sh +++ b/workflows/cp-leaveout/swift/cpl-upf-workflow.sh @@ -65,7 +65,7 @@ CPL_PY=$EMEWS_PROJECT_ROOT/../cp-leaveout/py PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py: # For plangen, data_setup PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools PYTHONPATH+=:$CPL_PY -PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common # For Benchmarks +PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common # For Benchmarks APP_PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup APP_PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools diff --git a/workflows/cp-leaveout/swift/cpl-upf-workflow.swift b/workflows/cp-leaveout/swift/cpl-upf-workflow.swift index 4bad5ef7..79a9e2d6 100644 --- 
a/workflows/cp-leaveout/swift/cpl-upf-workflow.swift +++ b/workflows/cp-leaveout/swift/cpl-upf-workflow.swift @@ -84,7 +84,7 @@ global const string FRAMEWORK = "keras"; r = obj(json2, node) => string hist_json = read_history(node); db_stop_result = plangen_stop(db_file, node, plan_id, hist_json) => - assert(db_stop_result != "EXCEPTION", "Exception in plangen_stop()!") => + assert(db_stop_result != "EXCEPTION", "Exception in plangen_stop()!") => printf("stop_subplan result: %s", db_stop_result); } else { printf("plan node already marked complete: %s result=%s", node, db_start_result) => @@ -194,6 +194,6 @@ main() { // string result = join(results, ";") => file out<"%s/plan_id.txt" % turbine_output> = write("%s\n" % plan_id); - write_lines(results, "results.txt") => + write_lines(results, "results.txt") => printf("CP LEAVEOUT WORKFLOW: RESULTS: COMPLETE"); } diff --git a/workflows/cp-leaveout/test-chained/cfg-stage-sys.sh b/workflows/cp-leaveout/test-chained/cfg-stage-sys.sh index 77d03789..e27b7fc9 100644 --- a/workflows/cp-leaveout/test-chained/cfg-stage-sys.sh +++ b/workflows/cp-leaveout/test-chained/cfg-stage-sys.sh @@ -10,8 +10,8 @@ export PROCS=${PROCS:-12} # MPI processes per node. This should not exceed PROCS. export PPN=${PPN:-1} -# Benchmark run timeout: benchmark run will timeouT -# after the specified number of seconds. -1 is no timeout. +# Benchmark run timeout: benchmark run will timeouT +# after the specified number of seconds. -1 is no timeout. BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:--1} # Uncomment below to use custom python script to run @@ -35,9 +35,9 @@ export WALLTIME=${WALLTIME:-00:10:00} # queue export QUEUE=${QUEUE:-batch} -# += is necessary here as the job dependency args are +# += is necessary here as the job dependency args are # set via TURBINE_DIRECTIVE -TURBINE_DIRECTIVE="\n#BSUB -q $QUEUE\n#BSUB -alloc_flags \"NVME maximizegpfs\"\n" +TURBINE_DIRECTIVE="\n#BSUB -q $QUEUE\n#BSUB -alloc_flags \"NVME maximizegpfs\"\n" export TURBINE_DIRECTIVE+=${TURBINE_DIRECTIVE_ARGS:-} TURBINE_LAUNCH_OPTIONS="-a1 -c42 -g1" @@ -58,4 +58,3 @@ echo " IGNORE_ERRORS: $IGNORE_ERRORS" # export MAIL_ENABLED=1 # export MAIL_ADDRESS=wozniak@mcs.anl.gov - diff --git a/workflows/cp-leaveout/test-chained/cfg.json b/workflows/cp-leaveout/test-chained/cfg.json index 1dc4e65f..ddc2bc65 100644 --- a/workflows/cp-leaveout/test-chained/cfg.json +++ b/workflows/cp-leaveout/test-chained/cfg.json @@ -1,30 +1,30 @@ { - "site" : "summit", - "plan" : "/gpfs/alpine/med106/scratch/ncollier/job-chain/inputs/plangen_cell1593-p4_drug1779-p1.json", - "submit_script" : "./test-1.sh", - "upf_directory" : "/gpfs/alpine/med106/scratch/ncollier/job-chain/inputs", - "job_chain_arg" : "#BSUB -w done()", - "stages" : "-1", - "first_stage" : 1, - "first_stage_parent_directory" : "", - - "stage_cfg_script" : "./cfg-stage-sys.sh", + "site": "summit", + "plan": "/gpfs/alpine/med106/scratch/ncollier/job-chain/inputs/plangen_cell1593-p4_drug1779-p1.json", + "submit_script": "./test-1.sh", + "upf_directory": "/gpfs/alpine/med106/scratch/ncollier/job-chain/inputs", + "job_chain_arg": "#BSUB -w done()", + "stages": "-1", + "first_stage": 1, + "first_stage_parent_directory": "", - "stage_cfgs" : [ - { - "stage" : 1, - "WALLTIME" : "02:00:00", - "PROCS" : 6 - }, + "stage_cfg_script": "./cfg-stage-sys.sh", - { - "stage" : 2, - "WALLTIME" : "01:00:00" - }, + "stage_cfgs": [ + { + "stage": 1, + "WALLTIME": "02:00:00", + "PROCS": 6 + }, - { - "stage" : 3, - "WALLTIME" : "01:00:00" - } - ] + { + "stage": 2, + "WALLTIME": "01:00:00" + 
}, + + { + "stage": 3, + "WALLTIME": "01:00:00" + } + ] } diff --git a/workflows/cp-leaveout/test/test-1.sh b/workflows/cp-leaveout/test/test-1.sh index fe92d7d5..afc4942d 100755 --- a/workflows/cp-leaveout/test/test-1.sh +++ b/workflows/cp-leaveout/test/test-1.sh @@ -88,7 +88,7 @@ WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) # Wait for job # queue_wait -exit +exit SCRIPT=$( basename $0 .sh ) check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID diff --git a/workflows/cp-leaveout/test/test-numpy-delete.py b/workflows/cp-leaveout/test/test-numpy-delete.py index e8161108..f6539091 100644 --- a/workflows/cp-leaveout/test/test-numpy-delete.py +++ b/workflows/cp-leaveout/test/test-numpy-delete.py @@ -1,7 +1,6 @@ - import numpy as np A = np.eye(4) print(A) -A = np.delete(A,1,axis=0) +A = np.delete(A, 1, axis=0) print(A) diff --git a/workflows/cp1/README.adoc b/workflows/cp1/README.adoc index bd4b9d6b..913e2aea 100644 --- a/workflows/cp1/README.adoc +++ b/workflows/cp1/README.adoc @@ -3,13 +3,13 @@ The Mini Challenge Problem has 4 distinct stages: 1) preprocessing feature selection; 2) HPO optimization using mlrMBO; 3) additional training of the best models identified in 2; and 4) performing inference -on the trained models from 3. +on the trained models from 3. ==== Feature Selection Rather than using all of the ~17,000 genes for prediction of drug response for Uno, we used the COXEN approach (see the _xcorr_ README) to select genes for building the prediction model in a scenario of model adaptation between studies/datasets. The COXEN approach performs statistical analysis to identify predictive and generalizable genes for prediction. We have five public cancer cell line drug response datasets, i.e., NCI-60, CTRP, GDSC, CCLE, and gCSI. These five studies may have common cancer cell lines and drugs included in their experiments, but each of them may also include unique cell lines, drugs, or combinations of the two. Thus, the COXEN approach targeting genes that are both predictive and generalizable can be helpful in the model adaptation between studies. The approach produces _cross correleated feature set_ files that are used as input to the model. A preprocessed data file will have a file name like `CTRP_CCLE_2000_1000_features.txt` -where the two studies are `CTRP` and `CCLE`, and the cross correlation +where the two studies are `CTRP` and `CCLE`, and the cross correlation coefficients are 2000 and 1000 ==== HPO using mlrMBO @@ -43,7 +43,7 @@ develop branch. The Uno benchmarks is in `Benchmarks/Pilot1/Uno`. === Data *Download the raw Uno feature data to the `Benchmarks/Data/Pilot1` directory.* -Note the data is quite large so, depending on the file system and machine, a better +Note the data is quite large so, depending on the file system and machine, a better choice may be to download and symlink to `Benchmarks/Data/Pilot1`. ---- @@ -61,8 +61,8 @@ $ tar xfz candle-cp1-data.tgz ---- *Generate the input data frames from the raw data and the feature files.* -Uno runs much faster with these as input rather than the raw data. -A train and test data frame needs to be created for each cross correlated feature file. +Uno runs much faster with these as input rather than the raw data. +A train and test data frame needs to be created for each cross correlated feature file. 
For example, to generate the data associated with the `CTRP_CCLE_2000_1000.txt` feature file: @@ -72,25 +72,25 @@ $ python python uno_baseline_keras2.py --train_sources CTRP --cell_feature_subse $ python uno_baseline_keras2.py --train_sources CCLE --cell_feature_subset_path CTRP_CCLE_2000_1000_features.txt --no_feature_source True --no_response_source True --preprocess_rnaseq combat --export_data CTRP_CCLE_2000_1000_test.h5 ---- -Note that in the train data creation the train_source is CTRP and in the test data -creation the train source is CCLE. The creation of the training and test data can be -time consuming for the larger datasets. +Note that in the train data creation the train_source is CTRP and in the test data +creation the train source is CCLE. The creation of the training and test data can be +time consuming for the larger datasets. == Running the Workflows === HPO Workflow -. Edit `Supervisor/workflows/cp1/data/studies1.txt` and `Supervisor/workflows/cp1/data/studies2.txt`. -These two study files specify the cross correlation between studies where each study in `studies1.txt` -is cross correlated with each study in `studies2.txt` except where they are the same. Add or remove +. Edit `Supervisor/workflows/cp1/data/studies1.txt` and `Supervisor/workflows/cp1/data/studies2.txt`. +These two study files specify the cross correlation between studies where each study in `studies1.txt` +is cross correlated with each study in `studies2.txt` except where they are the same. Add or remove (or comment out) study names in these files to omit that cross-correlation from the HPO instances. For example, if study1 contains CTRP and study2 contains CCLE and GDSC, then the workflow will run two HPOs: one for the CTRP_CCLE and one for the CTRP_GDSC cross-correlations. . Create a set of _cfg_ and _test_ scripts for an experiment run by copying an existing set, i.e., `cfg-prm-1.sh`, `cfg-sys-1.sh`, and `test-1.sh` -from the `test` directory. Be sure to update the lines in your `test-N.sh` that +from the `test` directory. Be sure to update the lines in your `test-N.sh` that export the `cfg-prm-N.sh`, and `cfg-sys-N.sh` scripts to point to your _cfg_ scripts. Namely, + @@ -103,18 +103,18 @@ export CFG_PRM=$THIS/cfg-prm-N.sh See the comments in `cfg-prm-1.sh`, and `cfg-sys-1.sh` for additional information on the various configuration parameters and how to edit them. -. Launch the run using your `test-N.sh` script, passing SITE, and optional -experiment id as arguments (e.g., `./test-10.sh [expid]`) where +. Launch the run using your `test-N.sh` script, passing SITE, and optional +experiment id as arguments (e.g., `./test-10.sh [expid]`) where site can be one of local, cori, theta, summit etc. All the output from running the workflow will appear in a directory named with the experiment id, either your specified one or the auto-generated one (e.g. X001). Each Uno -model run launched by the mlrMBO instances runs in own directory: -`exp_id/run/W_X_Y_Z` where _W_ is the id of the mlrMBO instance that launched the run, _X_ is the restart number +model run launched by the mlrMBO instances runs in own directory: +`exp_id/run/W_X_Y_Z` where _W_ is the id of the mlrMBO instance that launched the run, _X_ is the restart number (almost always 1 here), _Y_ is the iteration of the mlrMBO instance, and _Z_ is the id of the hyper parameter set produced by mlrMBO instance _W_ and with which Uno was launched. 
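A minimal illustration (editorial, not a script from the repository) of how such a run directory name breaks down; the id `3_1_7_12` is a made-up example:

----
# Illustrative only: decode an HPO run directory name of the form W_X_Y_Z,
# e.g. exp_id/run/3_1_7_12 -> mlrMBO instance 3, restart 1, iteration 7,
# hyperparameter set 12. The example values are hypothetical.
def decode_run_dir(name):
    hpo_id, restart, iteration, param_set = (int(t) for t in name.split("_"))
    return {"hpo_id": hpo_id, "restart": restart,
            "iteration": iteration, "param_set": param_set}

print(decode_run_dir("3_1_7_12"))
----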
-A summary of each Uno run, organized by mlrMBO instance and iteration, will be +A summary of each Uno run, organized by mlrMBO instance and iteration, will be output in `exp_id/hpo_log/X_Y_hpo_runs.txt` where _X_ is the mlrMBO instance id, and _Y_ is the mlrMBO instance's iteration. Each row of this hpo log output contains info for a single Uno run and has the following format: @@ -127,15 +127,15 @@ where the `|` character is the delimiter. * Configuration and launch scripts in `test/` (e.g. `cfg-prm-1.sh`, `test-1.sh`, etc.) * `swift/workflow.swift` - swift file that executes the workflow -* `swift/workflow.sh` - launch script for the swift file. This script is +* `swift/workflow.sh` - launch script for the swift file. This script is configured and launched from the scripts in `test/`. === Further Training Workflow (AKA the UPF workflow) -. Select N number of models from those produced by each HPO instance and +. Select N number of models from those produced by each HPO instance and create the input parameter file (i.e., the "upf" file). The hpo_log results -from the HPO workflow can be used for this. The "Combine HPO logs files +from the HPO workflow can be used for this. The "Combine HPO logs files adding hpo_id and iteration" code in `scripts/plots.R` is an example of how those logs can be concatenated together while adding the hpo_id and iteration as column values. The python jupyter notebook `script/cp1_scripts.ipynb` contains @@ -150,8 +150,8 @@ in the upf file contains the hyperparameters for an Uno run in JSON format. . Create a set of _cfg_ and _test_ scripts for an experiment run by copying an existing set, i.e., `cfg-prm-1.sh`, `cfg-sys-1.sh`, and `test-1.sh` -from the `test_upf/` directory. Note this is *NOT* the `test/` directory. Be sure -to update the lines in your `test-N.sh` that +from the `test_upf/` directory. Note this is *NOT* the `test/` directory. Be sure +to update the lines in your `test-N.sh` that export the `cfg-prm-N.sh`, and `cfg-sys-N.sh` scripts to point to your scripts. Namely, + @@ -164,16 +164,16 @@ export CFG_PRM=$THIS/cfg-prm-N.sh See the comments in `cfg-prm-1.sh`, and `cfg-sys-1.sh` for additional information on the various configuration parameters and how to edit them. -. Launch the run using your `test-N.sh` script, passing SITE, -and optional experiment id as arguments (e.g., `./test-10.sh [expid]`) where +. Launch the run using your `test-N.sh` script, passing SITE, +and optional experiment id as arguments (e.g., `./test-10.sh [expid]`) where site can be one of local, cori, theta, summit etc. All the output from running the workflow will appear in a directory named with the experiment id, either your specified one or the auto-generated one (e.g., X001). Each Uno -model run launched by the workflow runs in own directory: -`exp_id/run/X` where _X_ is the id of the run and corresponds to the index -of the line of input data that was used for that run, that is, `run/0` contains -the output for the run that ran with the 1st line from the upf input +model run launched by the workflow runs in own directory: +`exp_id/run/X` where _X_ is the id of the run and corresponds to the index +of the line of input data that was used for that run, that is, `run/0` contains +the output for the run that ran with the 1st line from the upf input file, `run/1` for the second line and so on. In addition, `inputs.txt` and `results.txt` files are also created. 
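As a rough sketch of what assembling such a upf file can look like (illustrative only, not a repository script): the parameter names are borrowed from `data/upf_use_exported_no_nci.txt`, and the selection of the top-N hyperparameter sets from the hpo_log output is assumed to have been done already.

----
# Illustrative sketch: write a upf file with one JSON hyperparameter
# dict per line. The dicts below are placeholders; in practice they
# would be the N best hyperparameter sets chosen from the hpo_log files.
import json

selected = [
    {"study1": "CTRP", "epochs": 10, "batch_size": 6144, "use_exported": 1},
    {"study1": "GDSC", "epochs": 10, "batch_size": 6144, "use_exported": 1},
]

with open("upf-example.txt", "w") as fp:
    for params in selected:
        fp.write(json.dumps(params) + "\n")
----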
@@ -183,13 +183,13 @@ The first contains the parameters used for each run and the second final val los * Configuration and launch scripts in `test_upf/` (e.g., `cfg-prm-1.sh`, `test-1.sh`, etc.) * `swift/upf_workflow.swift` - swift file that executes the workflow -* `swift/upf_workflow.sh` - launch script for the swift file. This script is +* `swift/upf_workflow.sh` - launch script for the swift file. This script is configured and launched from the scripts in `test_upf/`. === Inference -. Create the inference parameter file. Each line of the inference parameter +. Create the inference parameter file. Each line of the inference parameter file contains the HPO parameters for a single inference run in csv format with the following columns @@ -199,16 +199,16 @@ the following columns For example, `CTRP_GDSC_2000_1000_test.h5,/gpfs/alpine/med106/scratch/ncollier/experiments/full_training_2/run/0/,CTRP_GDSC_2000_1000` + -The test data is part of the data generated as part of the data requirments (see above), -and found in the so-called CACHE_DIR directory as defined in the `cfg-prm-N.sh` files. The +The test data is part of the data generated as part of the data requirments (see above), +and found in the so-called CACHE_DIR directory as defined in the `cfg-prm-N.sh` files. The "directory of the trained model" is a directory that contains a model trained in the further - training workflow. The run label can be an informative label for the run. The python jupyter + training workflow. The run label can be an informative label for the run. The python jupyter notebook `script/cp1_scripts.ipynb` has some sample code for creating this parameter file. . Create a set of _cfg_ and _test_ scripts for an experiment run by copying an existing set, i.e., `cfg-prm-1.sh`, `cfg-sys-1.sh`, and `test-1.sh` -from the `test_infer/` directory. Note this is *NOT* the `test/` directory. -Be sure to update the lines in your `test-N.sh` that +from the `test_infer/` directory. Note this is *NOT* the `test/` directory. +Be sure to update the lines in your `test-N.sh` that export the `cfg-prm-N.sh`, and `cfg-sys-N.sh` scripts to point to your _cfg_ scripts. Namely, + @@ -226,28 +226,28 @@ information on the various configuration parameters and how to edit them. to create multiple copies of the input data to avoid IO contention. If this is unnecessary, then the `infer.sh` should not need to be changed. -. Launch the run using your `test-N.sh` script, passing SITE, and optional experiment id -as arguments (e.g., `./test-10.sh [expid]`) where +. Launch the run using your `test-N.sh` script, passing SITE, and optional experiment id +as arguments (e.g., `./test-10.sh [expid]`) where site can be one of local, cori, theta, summit etc. All the output from running the workflow will appear in a directory named with the experiment id, either your specified one or the auto-generated one. Each Uno -model inference run launched by the workflow runs in its own directory: -`exp_id/run/X` where _X_ is the id of the run and corresponds to the index of the -line of input data that was used for that run. So, `run/0` contains -the output for the run that ran with the 1st line from the input -file, `run/1` for the second line and so on. Each inference run will +model inference run launched by the workflow runs in its own directory: +`exp_id/run/X` where _X_ is the id of the run and corresponds to the index of the +line of input data that was used for that run. 
So, `run/0` contains +the output for the run that ran with the 1st line from the input +file, `run/1` for the second line and so on. Each inference run will produce an `uno_pred.all.tsv` and an `uno_pred.tsv` file. The first contains -the predictions for each feature and the second is an aggregate view +the predictions for each feature and the second is an aggregate view of the first. Additionally a `log.txt` file is created in the experiment directory -that contains the name of the data input file, the model, the output directory, +that contains the name of the data input file, the model, the output directory, and number of predictions performed for each inference run. ==== Associated Files * Configuration and launch scripts in `test_infer/` (e.g. `cfg-prm-1.sh`, `test-1.sh`, etc.) * `swift/infer_workflow.swift` - swift file that executes the workflow -* `swift/infer_workflow.sh` - launch script for the swift file. This script is +* `swift/infer_workflow.sh` - launch script for the swift file. This script is configured and launched from the scripts in `test_infer/`. * `sh/infer.sh` - script used to launch the Uno benchmark's `uno_infer.py` to perform the actual inference. diff --git a/workflows/cp1/data/upf_use_exported_no_nci.txt b/workflows/cp1/data/upf_use_exported_no_nci.txt index 6bc6e2b8..deecf6b7 100644 --- a/workflows/cp1/data/upf_use_exported_no_nci.txt +++ b/workflows/cp1/data/upf_use_exported_no_nci.txt @@ -18,4 +18,3 @@ {"study1": "CTRP", "epochs": 1, "batch_size": 6144, "use_exported" : 1 } {"study1": "gCSI", "epochs": 1, "batch_size": 6144, "use_exported" : 1 } {"study1": "GDSC", "epochs": 1, "batch_size": 6144, "use_exported" : 1 } - diff --git a/workflows/cp1/db/db-hpo-init.py b/workflows/cp1/db/db-hpo-init.py index 88a7e371..b7153cc4 100644 --- a/workflows/cp1/db/db-hpo-init.py +++ b/workflows/cp1/db/db-hpo-init.py @@ -1,22 +1,24 @@ - # DB HPO INIT PY # Initialize the SQLite DB for HPO # See db-hpo-init.sql for the table schema -import os, sys +import os +import sys + import yaml +from xcorr_db import q, xcorr_db -from xcorr_db import xcorr_db, q +DB = xcorr_db("xcorr.db", log=False) -DB = xcorr_db('xcorr.db', log=False) def create_tables(db_hpo_init_sql): - """ Set up the tables defined in the SQL file """ + """Set up the tables defined in the SQL file.""" with open(db_hpo_init_sql) as fp: sqlcode = fp.read() DB.executescript(sqlcode) DB.commit() + # def create_indices(): # """ Create indices after data insertion for speed """ # DB.execute("create index features_index on features(record_id);") @@ -31,6 +33,7 @@ def create_tables(db_hpo_init_sql): success = True except Exception as e: import traceback + print(traceback.format_exc()) if not success: diff --git a/workflows/cp1/db/db-hpo-list.py b/workflows/cp1/db/db-hpo-list.py index e9c88065..ef776165 100644 --- a/workflows/cp1/db/db-hpo-list.py +++ b/workflows/cp1/db/db-hpo-list.py @@ -1,7 +1,7 @@ - # DB HPO LIST -from xcorr_db import xcorr_db, q +from xcorr_db import q, xcorr_db + def list_hpos(): results = [] @@ -9,49 +9,58 @@ def list_hpos(): DB.execute(cmd) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break id, t = row[0:2] print("r") - results.append([id,t]) + results.append([id, t]) return results + def list_params(hpo_id): - """ hpo_id is a string here """ + """hpo_id is a string here.""" results = {} - cmd = "select param_id, name from hpo_hyperparam_defns " + \ - "where hpo_id=%s;" % hpo_id + cmd = ("select param_id, name from hpo_hyperparam_defns " + + "where hpo_id=%s;" % 
hpo_id) DB.execute(cmd) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break param_id, name = row[0:2] - results[param_id] = [ name ] + results[param_id] = [name] for param_id in results.keys(): values = list_values(param_id) results[param_id].append(values) return results + def list_values(param_id): - """ param_id is a string here """ + """param_id is a string here.""" results = [] - cmd = "select value_id, value from hpo_hyperparam_values " + \ - "where param_id=%s;" % param_id + cmd = ("select value_id, value from hpo_hyperparam_values " + + "where param_id=%s;" % param_id) DB.execute(cmd) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break value_id, value = row[0:2] - results.append([value_id,value]) + results.append([value_id, value]) return results + import argparse + parser = argparse.ArgumentParser(description="Query the DB.") parser.add_argument("--hpo", action="store", help="specify HPO ID") -parser.add_argument("--list-hpos", action="store_true", - help="list HPO IDs") -parser.add_argument("--list-params", action="store_true", +parser.add_argument("--list-hpos", action="store_true", help="list HPO IDs") +parser.add_argument("--list-params", + action="store_true", help="list hyperparameters") -parser.add_argument("-v", "--verbose", action="store_true", +parser.add_argument("-v", + "--verbose", + action="store_true", help="echo SQL statements") args = parser.parse_args() argv = vars(args) @@ -59,7 +68,8 @@ def list_values(param_id): if argv["verbose"]: print(str(args)) -DB = xcorr_db('xcorr.db', log=argv["verbose"]) +DB = xcorr_db("xcorr.db", log=argv["verbose"]) + def argv_hpo(): global argv @@ -68,6 +78,7 @@ def argv_hpo(): exit(1) return argv["hpo"] + if argv["list_hpos"]: entries = list_hpos() for entry in entries: diff --git a/workflows/cp1/db/db-hpo-setup.py b/workflows/cp1/db/db-hpo-setup.py index 36367654..a7de9ead 100644 --- a/workflows/cp1/db/db-hpo-setup.py +++ b/workflows/cp1/db/db-hpo-setup.py @@ -1,57 +1,64 @@ - # DB HPO SETUP -import os, sys +import os +import sys + import yaml +from xcorr_db import q, xcorr_db -from xcorr_db import xcorr_db, q +DB = xcorr_db("xcorr.db", log=True) -DB = xcorr_db('xcorr.db', log=True) def ensure_hpo_exists(hpo_id): - cmd = "select hpo_id from hpo_ids where hpo_id="+str(hpo_id)+";" + cmd = "select hpo_id from hpo_ids where hpo_id=" + str(hpo_id) + ";" DB.cursor.execute(cmd) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break print("Found in DB: hpo_id=" + str(hpo_id)) return import datetime - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") DB.insert(table="hpo_ids", names=["hpo_id", "time"], - values=[q(hpo_id),q(ts)]) + values=[q(hpo_id), q(ts)]) print("SQL: created: hpo_id=" + str(hpo_id)) + def insert_hyperparam_defns(hpo_id, yamlfile): - """ - Copy hyperparameter definitions from YAML to SQL - """ + """Copy hyperparameter definitions from YAML to SQL.""" with open(yamlfile) as fp: s = fp.read() y = yaml.load(s) for hp in y: - print("hyperparameter '%s' has %2i values" % \ - (hp, len(y[hp]["values"]))) - param_id = DB.insert(table="hpo_hyperparam_defns", - names=["hpo_id", "name"], - values=[q(hpo_id), q(hp)]) + print("hyperparameter '%s' has %2i values" % (hp, len(y[hp]["values"]))) + param_id = DB.insert( + table="hpo_hyperparam_defns", + names=["hpo_id", "name"], + values=[q(hpo_id), q(hp)], + ) # print("param_id " + str(param_id)) values = 
y[hp]["values"] for p in values: print(" " + p) - DB.insert(table="hpo_hyperparam_values", - names=["param_id","value"], - values=[q(param_id),q(p)]) + DB.insert( + table="hpo_hyperparam_values", + names=["param_id", "value"], + values=[q(param_id), q(p)], + ) + def usage(): print("usage: db-hpo-setup ") + if len(sys.argv) != 3: usage() exit(1) -hpo_id = int(sys.argv[1]) +hpo_id = int(sys.argv[1]) yamlfile = sys.argv[2] # Catch and print all exceptions to improve visibility of success/failure @@ -62,6 +69,7 @@ def usage(): success = True except Exception as e: import traceback + print(traceback.format_exc()) if not success: diff --git a/workflows/cp1/db/hpo-defns-1.yaml b/workflows/cp1/db/hpo-defns-1.yaml index 99591daa..27b53ad3 100644 --- a/workflows/cp1/db/hpo-defns-1.yaml +++ b/workflows/cp1/db/hpo-defns-1.yaml @@ -1,15 +1,19 @@ - # HPO DEFNS 1 # Example for testing with values from Google Sheet "cp1" activation: description: activation - values: [ relu, tanh, sigmoid ] + values: [relu, tanh, sigmoid] dense_feature_layers: description: dense_feature_layers - values: [ "[1000, 1000, 1000]", - "[1000, 1000, 1000,1000]", "[1000, 1000, 1000, 1000, 1000]", - "[1000, 1000, 1000, 1000, 1000, 1000]", - "[2000, 2000, 2000]", - "[2000, 2000, 2000, 2000]", "[2000, 2000, 2000, 2000, 2000]", - "[2000, 2000, 2000, 2000, 2000, 2000]" ] + values: + [ + "[1000, 1000, 1000]", + "[1000, 1000, 1000,1000]", + "[1000, 1000, 1000, 1000, 1000]", + "[1000, 1000, 1000, 1000, 1000, 1000]", + "[2000, 2000, 2000]", + "[2000, 2000, 2000, 2000]", + "[2000, 2000, 2000, 2000, 2000]", + "[2000, 2000, 2000, 2000, 2000, 2000]", + ] diff --git a/workflows/cp1/nested_me_ex/.gitignore b/workflows/cp1/nested_me_ex/.gitignore index f6b35bb3..c0ee542e 100644 --- a/workflows/cp1/nested_me_ex/.gitignore +++ b/workflows/cp1/nested_me_ex/.gitignore @@ -1,4 +1,4 @@ *.tic __pycache__/ *.pyc -experiments/ \ No newline at end of file +experiments/ diff --git a/workflows/cp1/nested_me_ex/README.md b/workflows/cp1/nested_me_ex/README.md index d27aa813..2f2067ac 100644 --- a/workflows/cp1/nested_me_ex/README.md +++ b/workflows/cp1/nested_me_ex/README.md @@ -1,17 +1,17 @@ -## An Example Nested Model Exploration (ME) Workflow ## - -The workflow in `swift/workflow.swift` is a nested workflow where a *me1* resident -task provides parameters to any available *me2* resident tasks. The number of -*me2* resident tasks must be set before hand in `swift/workflow.sh` via -the *TURBINE_RESIDENT_WORK_WORKERS* variable. There must be at least 3 -*TURBINE_RESIDENT_WORK_WORKERS*: one for the *me1* resident task, one for -the *task_cache* (see below) resident task and one for an *me2* -resident task. Any more than 3 and the additional resident tasks are *me2* +## An Example Nested Model Exploration (ME) Workflow + +The workflow in `swift/workflow.swift` is a nested workflow where a _me1_ resident +task provides parameters to any available _me2_ resident tasks. The number of +_me2_ resident tasks must be set before hand in `swift/workflow.sh` via +the _TURBINE_RESIDENT_WORK_WORKERS_ variable. There must be at least 3 +_TURBINE_RESIDENT_WORK_WORKERS_: one for the _me1_ resident task, one for +the _task_cache_ (see below) resident task and one for an _me2_ +resident task. Any more than 3 and the additional resident tasks are _me2_ resident tasks. To run the workflow, edit `swift/workflow.sh` for your machine (i.e. edit -swift-t location, number of PROCS etc.), and -run. The script takes a single argument: an experiment id. 
So, ```./workflow.sh t1```
+swift-t location, number of PROCS etc.), and
+run. The script takes a single argument: an experiment id. So, `./workflow.sh t1`
 
 Note that this is a work in progress and I have seen some seg faults when the entire
 workflow has finished.
 
@@ -20,45 +20,44 @@ The implementation consists of two nested loops driven by these resident tasks.
 The overall flow looks like:
 
 1. Initialization
-2. The *me1* produces sets of parameters
-3. Each parameter set is consumed by an *me2* instance
-4. An *me2* instance produces parameters for model runs
-5. After some number of model runs, the *me2* returns a result to the *me1* and we go back to step 2.
-
-Both loops are typical EMEWS style ME loops where some python code is initialized
-with an *EQPy_init_package* and an *EQPy_run* (this latter call is new and custom
-for this). For the *me1* we can see the initialization in line 133 and run in line 134.
-The *me1* package is in `python/me1.py` which contains some dummy code
+2. The _me1_ produces sets of parameters
+3. Each parameter set is consumed by an _me2_ instance
+4. An _me2_ instance produces parameters for model runs
+5. After some number of model runs, the _me2_ returns a result to the _me1_ and we go back to step 2.
+
+Both loops are typical EMEWS style ME loops where some python code is initialized
+with an _EQPy_init_package_ and an _EQPy_run_ (this latter call is new and custom
+for this). For the _me1_ we can see the initialization in line 133 and run in line 134.
+The _me1_ package is in `python/me1.py` which contains some dummy code
 to exercise the workflow.
 
-The *me1* loop starts on line 151. The *EQPy_get* on line 157 produces the actual
-parameters for the me2 to work on. The *eqpy.OUT_put* on line 19 of
-me1.py is what is sending these parameters from *me1.py*.
+The _me1_ loop starts on line 151. The _EQPy_get_ on line 157 produces the actual
+parameters for the me2 to work on. The _eqpy.OUT_put_ on line 19 of
+me1.py is what is sending these parameters from _me1.py_.
 
-The *me1* loop runs an me2 instance in lines 180-181.
+The _me1_ loop runs an me2 instance in lines 180-181.
 
 ```objc
 string free_rank = EQPy_get(cache_loc);
 results[j] = start_me2(p, i, j, free_rank);
 ```
 
-The *EQPy_get* call gets the rank of an available resident task that can
-be used to run the me2. *start_me2* then runs the
-me2 loop using that resident task.
-
-The placeholder me2 ME is implemented in `python/me2.py`.
-As usual with EMEWS and like the *me1.py* above, this produces parameters and
-passes them to swift for evaluation. The *eqpy.OUT_put(ps)* on line 32 in
- `python/me2.py` produces the parameters and those parameters
- are received by swift on line 72 in `swift/workflow.swift` in the *run_me2*
- loop. Note that currently the *run_model* call on line 95 that receives these parameters
- is just a placeholder. In the actual case, that would call the actual code to run the model.
-
- There's an additional swift resident task that runs the `python/task_cache.py` package.
- This keeps track of which me2 resident tasks are available for work. MPI is used to
- communicate between `task_cache` and `me2`. `task_cache` contains a list of MPI ranks
- that can be used to run `me2` resident tasks. These ranks are pushed into an EQPY
- queue where they can be retreived by the swift workflow. When an `me2` instance completes, its rank is pushed into the queue, indicating that that rank is now free for work. 
`task_cache.init_comm` and `me2.init` create an MPI communicator that they - use to communicate. I couldn't get this work without the back channel MPI. The code - seemed to deadlock at various points. If there's a better way, please let me know. - +The _EQPy_get_ call gets the rank of an available resident task that can +be used to run the me2. _start_me2_ then runs the +me2 loop using that resident task. + +The placeholder me2 ME is implemented in `python/me2.py`. +As usual with EMEWS and like the _me1.py_ above, this produces parameters and +passes them to swift for evaluation. The _eqpy.OUT_put(ps)_ on line 32 in +`python/me2.py` produces the parameters and those parameters +are received by swift on line 72 in `swift/workflow.swift` in the _run_me2_ +loop. Note that currently the _run_model_ call on line 95 that receives these parameters +is just a placeholder. In the actual case, that would call the actual code to run the model. + +There's an additional swift resident task that runs the `python/task_cache.py` package. +This keeps track of which me2 resident tasks are available for work. MPI is used to +communicate between `task_cache` and `me2`. `task_cache` contains a list of MPI ranks +that can be used to run `me2` resident tasks. These ranks are pushed into an EQPY +queue where they can be retreived by the swift workflow. When an `me2` instance completes, its rank is pushed into the queue, indicating that that rank is now free for work. `task_cache.init_comm` and `me2.init` create an MPI communicator that they +use to communicate. I couldn't get this work without the back channel MPI. The code +seemed to deadlock at various points. If there's a better way, please let me know. diff --git a/workflows/cp1/nested_me_ex/ext/EQ-Py/eqpy.py b/workflows/cp1/nested_me_ex/ext/EQ-Py/eqpy.py index e98e7f94..8b2f6618 100644 --- a/workflows/cp1/nested_me_ex/ext/EQ-Py/eqpy.py +++ b/workflows/cp1/nested_me_ex/ext/EQ-Py/eqpy.py @@ -1,6 +1,7 @@ -import threading +import importlib import sys -import importlib, traceback +import threading +import traceback EQPY_ABORT = "EQPY_ABORT" @@ -18,6 +19,7 @@ aborted = False wait_info = None + class WaitInfo: def __init__(self): @@ -28,6 +30,7 @@ def getWait(self): self.wait += 1 return self.wait + class InitializingThreadRunner(threading.Thread): def __init__(self, runnable): @@ -41,7 +44,7 @@ def run(self): except AttributeError: pass - + class ThreadRunner(threading.Thread): def __init__(self, runnable): @@ -56,20 +59,23 @@ def run(self): # tuple of type, value and traceback self.exc = traceback.format_exc() + def init(pkg): global p1, wait_info wait_info = WaitInfo() imported_pkg = importlib.import_module(pkg) - #print(pkg);sys.stdout.flush() + # print(pkg);sys.stdout.flush() p1 = InitializingThreadRunner(imported_pkg) p1.start() + def run(): global p2 p2 = ThreadRunner(p1.runnable) - #print(p.runnable);sys.stdout.flush() + # print(p.runnable);sys.stdout.flush() p2.start() + def output_q_get(): global output_q, aborted wait = wait_info.getWait() @@ -93,15 +99,19 @@ def output_q_get(): return result + import sys + def input_q_put(val): # print("q put {}".format(val));sys.stdout.flush() input_q.put(val) + def OUT_put(string_params): output_q.put(string_params) + def IN_get(): # global input_q result = input_q.get() diff --git a/workflows/cp1/nested_me_ex/python/me1.py b/workflows/cp1/nested_me_ex/python/me1.py index 4edbe111..d2febbca 100644 --- a/workflows/cp1/nested_me_ex/python/me1.py +++ b/workflows/cp1/nested_me_ex/python/me1.py @@ -1,9 +1,10 @@ -import 
eqpy import random +import eqpy # Generates parameters to be used by other MEs + def run(): # gets dummy params for this me params = eqpy.IN_get() @@ -12,15 +13,18 @@ def run(): for _ in range(10): op = [] for _ in range(5): - p = "{},{},{},{}".format(random.randint(1, 10), - random.randint(1, 10), random.randint(1, 10), - random.randint(1, 10)) + p = "{},{},{},{}".format( + random.randint(1, 10), + random.randint(1, 10), + random.randint(1, 10), + random.randint(1, 10), + ) op.append(p) - + ps = ";".join(op) eqpy.OUT_put(ps) # wait to get result back eqpy.IN_get() - + eqpy.OUT_put("DONE") eqpy.OUT_put("final result") diff --git a/workflows/cp1/nested_me_ex/python/me2.py b/workflows/cp1/nested_me_ex/python/me2.py index ea59f57e..acdc8386 100644 --- a/workflows/cp1/nested_me_ex/python/me2.py +++ b/workflows/cp1/nested_me_ex/python/me2.py @@ -1,30 +1,36 @@ -import eqpy, sys +import sys + +import eqpy from mpi4py import MPI + def printf(s): print(s) sys.stdout.flush() + cache_comm = None + def init(): global cache_comm ranks_str = eqpy.IN_get() - ranks = ranks_str.split(',')[1:] - #print(ranks) + ranks = ranks_str.split(",")[1:] + # print(ranks) if cache_comm == None: comm = MPI.COMM_WORLD group = comm.Get_group() cache_group = group.Incl([int(x) for x in ranks]) - #printf("ME newgroup size is {}".format(cache_group.size)) - cache_comm = comm.Create_group(cache_group,1) + # printf("ME newgroup size is {}".format(cache_group.size)) + cache_comm = comm.Create_group(cache_group, 1) + def run(): # my swift-t MPI comm rank, and destination rank for cache_comm rank = eqpy.IN_get() - #printf("AL Start on {}".format(rank)) + # printf("AL Start on {}".format(rank)) param = eqpy.IN_get() - + for _ in range(10): op = [param] * 5 ps = ";".join(op) @@ -34,10 +40,5 @@ def run(): eqpy.OUT_put("DONE") eqpy.OUT_put("42") - data = {'msg' : 'put', 'rank' : rank} + data = {"msg": "put", "rank": rank} cache_comm.send(data, dest=0, tag=1) - - - - - diff --git a/workflows/cp1/nested_me_ex/python/task_cache.py b/workflows/cp1/nested_me_ex/python/task_cache.py index 08965c82..00fab96f 100644 --- a/workflows/cp1/nested_me_ex/python/task_cache.py +++ b/workflows/cp1/nested_me_ex/python/task_cache.py @@ -1,38 +1,42 @@ -import eqpy import sys + +import eqpy from mpi4py import MPI + def printf(s): print(s) sys.stdout.flush() + def init_comm(ranks): - comm = MPI.COMM_WORLD + comm = MPI.COMM_WORLD group = comm.Get_group() cache_group = group.Incl([int(x) for x in ranks]) - #printf("Cache Group size is {}".format(cache_group.size)) - return comm.Create_group(cache_group,1) + # printf("Cache Group size is {}".format(cache_group.size)) + return comm.Create_group(cache_group, 1) + def run(): ranks_str = eqpy.IN_get() - ranks = ranks_str.split(',') + ranks = ranks_str.split(",") # include only the al ranks task_ranks = ranks[2:] - + for r in task_ranks: eqpy.OUT_put(r) # include self and tasks in comm comm = init_comm(ranks[1:]) rank = comm.rank - #printf("task cache rank: {}".format(rank)) + # printf("task cache rank: {}".format(rank)) while True: - status = MPI.Status() + status = MPI.Status() data = comm.recv(source=MPI.ANY_SOURCE, status=status) - msg = data['msg'] - if msg == 'put': + msg = data["msg"] + if msg == "put": # this is its rank in the swift mpi communicator - eqpy.OUT_put(data['rank']) - elif msg == 'DONE': + eqpy.OUT_put(data["rank"]) + elif msg == "DONE": break diff --git a/workflows/cp1/nested_me_ex/swift/workflow.sh b/workflows/cp1/nested_me_ex/swift/workflow.sh index f92b57f1..30809b8a 100755 --- 
a/workflows/cp1/nested_me_ex/swift/workflow.sh +++ b/workflows/cp1/nested_me_ex/swift/workflow.sh @@ -86,4 +86,4 @@ set -x swift-t -n $PROCS $MACHINE -p -r$EQPy -I $EQPy \ -e MPICH_MAX_THREAD_SAFETY=$MPICH_MAX_THREAD_SAFETY \ -e PYTHONPATH=$PYTHONPATH \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift \ No newline at end of file + $EMEWS_PROJECT_ROOT/swift/workflow.swift diff --git a/workflows/cp1/nested_me_ex/swift/workflow.swift b/workflows/cp1/nested_me_ex/swift/workflow.swift index 3fb47d81..31dde226 100644 --- a/workflows/cp1/nested_me_ex/swift/workflow.swift +++ b/workflows/cp1/nested_me_ex/swift/workflow.swift @@ -54,7 +54,7 @@ int CACHE_RANK_IDX = 1; EQPy_run(me2_location) => EQPy_put(me2_location, me2_rank) => EQPy_put(me2_location, params) => - run_me2(me2_location, iter, param_id, me2_rank) => + run_me2(me2_location, iter, param_id, me2_rank) => // get fake results from ME2 run result = get_result(); } @@ -91,7 +91,7 @@ int CACHE_RANK_IDX = 1; foreach p, j in param_array { // TODO update run_model with code to actually - // run the model with the parameters + // run the model with the parameters // produced from the active learning. results[j] = run_model(p, i, j); } @@ -105,7 +105,7 @@ int CACHE_RANK_IDX = 1; (void o) init_tasks_cache() { rank = r_ranks[CACHE_RANK_IDX]; location loc = locationFromRank(string2int(rank)); - EQPy_init_package(loc, "task_cache") => + EQPy_init_package(loc, "task_cache") => EQPy_run(loc) => EQPy_put(loc, join(r_ranks, ",")) => o = propagate(); @@ -122,7 +122,7 @@ int CACHE_RANK_IDX = 1; foreach i in [2 : size(r_ranks) - 1] { init_me2_rank(r_ranks[i]); waiter[i] = r_ranks[i]; - } + } } (void o) start() { @@ -171,7 +171,7 @@ int CACHE_RANK_IDX = 1; string param_array[] = split(params, ";"); string results[]; // printf("%i", size(param_array)); - // Lauch an me2 run for each set of parameters produced by + // Lauch an me2 run for each set of parameters produced by // me1 foreach p, j in param_array { diff --git a/workflows/cp1/scripts/parse_infer_results.py b/workflows/cp1/scripts/parse_infer_results.py index c98562fd..c45ac7ef 100644 --- a/workflows/cp1/scripts/parse_infer_results.py +++ b/workflows/cp1/scripts/parse_infer_results.py @@ -1,82 +1,113 @@ - -import sys import csv +import datetime +import json import subprocess -import datetime, json +import sys from os import path + import numpy as np - #mse: 0.2190,min,max,std - #mae: 0.3251 - #r2: 0.4320 - #corr: 0.6584 +# mse: 0.2190,min,max,std +# mae: 0.3251 +# r2: 0.4320 +# corr: 0.6584 def grep(infer_log): - output = subprocess.check_output(['grep', '-E', "mse:|mae:|r2:|corr:", infer_log]) - lines = output.decode("utf-8").strip().split('\n') + output = subprocess.check_output( + ["grep", "-E", "mse:|mae:|r2:|corr:", infer_log]) + lines = output.decode("utf-8").strip().split("\n") # print(lines) result = [np.nan] * 16 - # id, start, end, train time, epochs + # id, start, end, train time, epochs for line in lines: line = line.strip() - if line.startswith('mse:') and line.find(',') != -1: + if line.startswith("mse:") and line.find(",") != -1: l = line[5:] - result[0], result[1], result[2], result[3] = [float(x) for x in l.split(',')] - elif line.startswith('mae:') and line.find(',') != -1: + result[0], result[1], result[2], result[3] = [ + float(x) for x in l.split(",") + ] + elif line.startswith("mae:") and line.find(",") != -1: l = line[5:] - result[4], result[5], result[6], result[7] = [float(x) for x in l.split(',')] - elif line.startswith('r2') and line.find(',') != -1: + result[4], result[5], 
result[6], result[7] = [ + float(x) for x in l.split(",") + ] + elif line.startswith("r2") and line.find(",") != -1: l = line[3:] - result[8], result[9], result[10], result[11] = [float(x) for x in l.split(',')] - elif line.startswith('corr') and line.find(',') != -1: + result[8], result[9], result[10], result[11] = [ + float(x) for x in l.split(",") + ] + elif line.startswith("corr") and line.find(",") != -1: l = line[6:] - result[12], result[13], result[14], result[15] = [float(x) for x in l.split(',')] + result[12], result[13], result[14], result[15] = [ + float(x) for x in l.split(",") + ] # print(result) return result + def create_params_map(training_file): param_map = {} with open(training_file) as f_in: reader = csv.reader(f_in, delimiter="|") for r in reader: params = json.loads(r[2]) - save_path = params['save_path'] - if save_path[-1] == '/': + save_path = params["save_path"] + if save_path[-1] == "/": save_path = save_path[:-1] param_map[save_path] = params - + return param_map def main(infer_log, training_file, out_file): param_map = create_params_map(training_file) - with open(out_file, 'w') as f_out: + with open(out_file, "w") as f_out: writer = csv.writer(f_out) - writer.writerow(['infer_id', 'model_class', 'instance_directory', 'params', 'model_path', - 'mse_mean', 'mse_std', 'mse_min', 'mse_max', - 'mae_mean', 'mae_std', 'mae_min', 'mae_max', - 'r2_mean', 'r2_std', 'r2_min', 'r2_max', - 'corr_mean', 'corr_std', 'corr_min', 'corr_max']) + writer.writerow([ + "infer_id", + "model_class", + "instance_directory", + "params", + "model_path", + "mse_mean", + "mse_std", + "mse_min", + "mse_max", + "mae_mean", + "mae_std", + "mae_min", + "mae_max", + "r2_mean", + "r2_std", + "r2_min", + "r2_max", + "corr_mean", + "corr_std", + "corr_min", + "corr_max", + ]) with open(infer_log) as f_in: - reader = csv.reader(f_in, delimiter='|') + reader = csv.reader(f_in, delimiter="|") # model class|data file|model|instance_dir for i, row in enumerate(reader): if i % 1000 == 0: - print('ROW: {}'.format(i)) + print("ROW: {}".format(i)) model_class = row[0] instance_dir = row[3] model_dir = path.dirname(row[2]) params = param_map[model_dir] - stats = grep('{}/infer.log'.format(instance_dir)) + stats = grep("{}/infer.log".format(instance_dir)) if not np.isnan(stats[0]): - result = [i, model_class, instance_dir, params, row[2]] + stats + result = [i, model_class, instance_dir, params, row[2] + ] + stats writer.writerow(result) else: - print("{}|{}|{}|{}".format(row[0], row[1], row[2], row[3], row[4])) + print("{}|{}|{}|{}".format(row[0], row[1], row[2], row[3], + row[4])) -if __name__ == '__main__': - # inference log file (e.g. infer_all_4/log.txt), training file (e.g. full_training_2/inputs.txt), output file, +if __name__ == "__main__": + # inference log file (e.g. infer_all_4/log.txt), training file (e.g. 
full_training_2/inputs.txt), output file, main(sys.argv[1], sys.argv[2], sys.argv[3]) diff --git a/workflows/cp1/scripts/parse_start_stop.py b/workflows/cp1/scripts/parse_start_stop.py index 879467d5..9fdca7e0 100644 --- a/workflows/cp1/scripts/parse_start_stop.py +++ b/workflows/cp1/scripts/parse_start_stop.py @@ -1,17 +1,17 @@ -import sys import csv -import subprocess import datetime import os - +import subprocess +import sys from operator import itemgetter -TIME_FORMAT='%Y/%m/%d %H:%M:%S' +TIME_FORMAT = "%Y/%m/%d %H:%M:%S" START = 0 STOP = 1 + def create_counts(timings_file, out_dir): - hpos = {'all' : []} + hpos = {"all": []} with open(timings_file) as f_in: reader = csv.reader(f_in) for row in reader: @@ -21,12 +21,12 @@ def create_counts(timings_file, out_dir): hpos[hpo_id].append(line) else: hpos[hpo_id] = [line] - hpos['all'].append(line) + hpos["all"].append(line) for k in hpos: hpos[k] = sorted(hpos[k], key=itemgetter(0)) - counts = {'all' : []} + counts = {"all": []} for k in hpos: count = 0 for ts, ev in hpos[k]: @@ -34,53 +34,53 @@ def create_counts(timings_file, out_dir): count += 1 else: count -= 1 - + if k in counts: counts[k].append([ts, count]) else: counts[k] = [[ts, count]] - + for k in counts: - with open('{}/{}_counts.csv'.format(out_dir, k), 'w') as f_out: + with open("{}/{}_counts.csv".format(out_dir, k), "w") as f_out: writer = csv.writer(f_out) for item in counts[k]: writer.writerow(item) - def grep(model_log): - output = subprocess.check_output(['grep', '-E', "RUN START|RUN STOP", model_log]) + output = subprocess.check_output( + ["grep", "-E", "RUN START|RUN STOP", model_log]) lines = output.decode("utf-8") result = [] - for line in lines.split('\n'): - idx = line.find(' __main') + for line in lines.split("\n"): + idx = line.find(" __main") if idx != -1: ts = line[0:idx] dt = datetime.datetime.strptime(ts, TIME_FORMAT).timestamp() - if line.endswith('START'): + if line.endswith("START"): result.append((dt, START)) else: result.append((dt, STOP)) - + return result def main(hpos_file, out_file): - with open(out_file, 'w') as f_out: + with open(out_file, "w") as f_out: with open(hpos_file) as f_in: - reader = csv.reader(f_in, delimiter='|') + reader = csv.reader(f_in, delimiter="|") for i, row in enumerate(reader): if i % 1000 == 0: - print('ROW: {}'.format(i)) + print("ROW: {}".format(i)) # hpo_id = row[1] run_dir = row[3] rd = os.path.basename(run_dir) - hpo_id = rd[:rd.find('_')] - result = grep('{}/model.log'.format(run_dir)) + hpo_id = rd[:rd.find("_")] + result = grep("{}/model.log".format(run_dir)) for r in result: - f_out.write('{},{},{}\n'.format(hpo_id, r[0], r[1])) + f_out.write("{},{},{}\n".format(hpo_id, r[0], r[1])) if __name__ == "__main__": - #main(sys.argv[1], sys.argv[2]) + # main(sys.argv[1], sys.argv[2]) create_counts(sys.argv[1], sys.argv[2]) diff --git a/workflows/cp1/scripts/parse_start_stop_upf.py b/workflows/cp1/scripts/parse_start_stop_upf.py index 0b2c7d03..4268905e 100644 --- a/workflows/cp1/scripts/parse_start_stop_upf.py +++ b/workflows/cp1/scripts/parse_start_stop_upf.py @@ -1,15 +1,17 @@ -import sys import csv +import datetime +import json import subprocess -import datetime, json +import sys from os import path -TIME_FORMAT='%Y/%m/%d %H:%M:%S' +TIME_FORMAT = "%Y/%m/%d %H:%M:%S" START = 0 STOP = 1 + def create_counts(timings_file): - hpos = {'all' : []} + hpos = {"all": []} with open(timings_file) as f_in: reader = csv.reader(f_in) for row in reader: @@ -18,12 +20,12 @@ def create_counts(timings_file): hpos[hpo_id].append(row[1:]) 
else: hpos[hpo_id] = [row[1:]] - hpos['all'].append(row[1:]) + hpos["all"].append(row[1:]) for k in hpos: sorted(hpos[k], itemgetter(0)) - counts = {'all' : []} + counts = {"all": []} for k in hpos: count = 0 for ts, ev in hpos[k]: @@ -31,7 +33,7 @@ def create_counts(timings_file): count += 1 else: count -= 1 - + if k in counts: counts[k].append([ts, count]) else: @@ -39,67 +41,73 @@ def create_counts(timings_file): def grep(model_log, rid, model_name): - output = subprocess.check_output(['grep', '-E', "RUN START|RUN STOP", model_log]) + output = subprocess.check_output( + ["grep", "-E", "RUN START|RUN STOP", model_log]) lines = output.decode("utf-8") # id, start, end, train time, epochs result = [int(rid), model_name, -1, -1, -1, -1] complete = False - for line in lines.split('\n'): - idx = line.find(' __main') + for line in lines.split("\n"): + idx = line.find(" __main") if idx != -1: ts = line[0:idx] dt = datetime.datetime.strptime(ts, TIME_FORMAT).timestamp() - if line.endswith('START'): + if line.endswith("START"): result[2] = dt - elif line.endswith('STOP'): + elif line.endswith("STOP"): result[3] = dt complete = True # Current time ....1888.599 # Epoch 2/100 - output = subprocess.check_output(['grep', '-E', "Current time", model_log]) - lines = output.decode("utf-8").strip().split('\n') + output = subprocess.check_output(["grep", "-E", "Current time", model_log]) + lines = output.decode("utf-8").strip().split("\n") line = lines[-1] - ct = line[line.rfind(' ....') + len(' ....') : ].strip() + ct = line[line.rfind(" ....") + len(" ...."):].strip() result[4] = float(ct) - - output = subprocess.check_output(['grep', '-E', "Epoch", model_log]) - lines = output.decode("utf-8").strip().split('\n') + + output = subprocess.check_output(["grep", "-E", "Epoch", model_log]) + lines = output.decode("utf-8").strip().split("\n") if complete: line = lines[-1] else: line = lines[-2] - epochs = line[line.find(' ') : line.find('/') ] + epochs = line[line.find(" "):line.find("/")] result[5] = int(epochs) - return result + def write_results(results): - with open('timings.txt', 'w') as f_out: - + with open("timings.txt", "w") as f_out: + result = results[hpo_id] for r in result: for i in r: - f_out.write('{} {}\n'.format(i[0], i[1])) + f_out.write("{} {}\n".format(i[0], i[1])) + def main(hpos_file, out_file): results = {} - with open(out_file, 'w') as f_out: + with open(out_file, "w") as f_out: writer = csv.writer(f_out) - writer.writerow(['upf_id', 'model_name', 'start_ts', 'end_ts', 'total_train_time', 'epochs']) + writer.writerow([ + "upf_id", "model_name", "start_ts", "end_ts", "total_train_time", + "epochs" + ]) with open(hpos_file) as f_in: - reader = csv.reader(f_in, delimiter='|') + reader = csv.reader(f_in, delimiter="|") for i, row in enumerate(reader): if i % 1000 == 0: - print('ROW: {}'.format(i)) + print("ROW: {}".format(i)) upf_id = row[0] params = json.loads(row[2]) - bname = path.basename(params['use_exported_data']) - model_name = bname[ : bname.find('.')] - run_dir = params['save_path'] - result = grep('{}/model.log'.format(run_dir), upf_id, model_name) + bname = path.basename(params["use_exported_data"]) + model_name = bname[:bname.find(".")] + run_dir = params["save_path"] + result = grep("{}/model.log".format(run_dir), upf_id, + model_name) writer.writerow(result) diff --git a/workflows/cp1/scripts/plots.R b/workflows/cp1/scripts/plots.R index 89c12f18..f9fefafe 100644 --- a/workflows/cp1/scripts/plots.R +++ b/workflows/cp1/scripts/plots.R @@ -52,10 +52,10 @@ ggplot (se, aes(x=start, 
y=hpo_id)) + geom_segment( xend=se$end, yend=se$hpo_id, size = 3 - ) + + ) + xlab('time (minutes)') + ylab('hpo id') + - scale_x_continuous(limits = c(0, max(se$end))) + scale_x_continuous(limits = c(0, max(se$end))) ft <- fread("~/Documents/results/cp1/train_upf_timings.csv") @@ -63,7 +63,7 @@ ft$time_per_epoch <- ft$total_train_time / ft$epochs fwrite(ft, file="~/Documents/results/cp1/train_upf_timings.csv", row.names = F) agg_ft <- ft[, .(min(total_train_time), max(total_train_time), mean(total_train_time), sd(total_train_time), - min(epochs), max(epochs), mean(epochs), sd(epochs), + min(epochs), max(epochs), mean(epochs), sd(epochs), min(time_per_epoch), max(time_per_epoch), mean(time_per_epoch), sd(time_per_epoch)), by=model_name] setnames(agg_ft, c("model_name", "min_train_time", "max_train_time", "mean_train_time", "std_train_time", "min_epochs", "max_epochs", "mean_epochs", "std_epochs", "min_time_per_epoch", "max_time_per_epoch", "mean_time_per_epoch", "std_time_per_epoch")) @@ -72,7 +72,7 @@ fwrite(agg_ft, file="~/Documents/results/cp1/agg_timings_by_model.csv", row.name idt <- fread("~/Documents/results/cp1/inference_results.csv") agg_idt <- idt[, .(min(r2), max(r2), mean(r2), sd(r2), - min(mae), max(mae), mean(mae), sd(mae), + min(mae), max(mae), mean(mae), sd(mae), min(mse), max(mse), mean(mse), sd(mse)), by=model_class] setnames(agg_idt, c("model_class", "min_r2", "max_r2", "mean_r2", "std_r2", "min_mae", "max_mae", "mean_mae", "std_mae", "min_mse", "max_mse", "mean_mse", "std_mse")) @@ -92,7 +92,7 @@ dts = list() i = 1 for (f in fs) { - hpo_dt <- fread(f, col.names = c("run_id", 'xcorr_record_id', 'params', 'instance_dir', 'timestamp', 'val_loss'), + hpo_dt <- fread(f, col.names = c("run_id", 'xcorr_record_id', 'params', 'instance_dir', 'timestamp', 'val_loss'), sep="|") fname = basename(f) vals <- strsplit(fname, "_", fixed=T) @@ -105,7 +105,7 @@ for (f in fs) { results_dir <- '~/Documents/results/cp1/nci_hpo_log/' fs <- Sys.glob(paste0(results_dir, '/*_hpo_runs.txt')) for (f in fs) { - hpo_dt <- fread(f, col.names = c("run_id", 'xcorr_record_id', 'params', 'instance_dir', 'timestamp', 'val_loss'), + hpo_dt <- fread(f, col.names = c("run_id", 'xcorr_record_id', 'params', 'instance_dir', 'timestamp', 'val_loss'), sep="|") fname = basename(f) vals <- strsplit(fname, "_", fixed=T) @@ -130,5 +130,3 @@ ggplot(data=hpos[val_loss < 1e+03], mapping=aes(x=iteration, y=val_loss)) + ylab("Val Loss (log scale)") + scale_y_continuous(trans='log10') + facet_wrap(~ hpo_id, ncol=5) - - diff --git a/workflows/cp1/sh/infer.sh b/workflows/cp1/sh/infer.sh index 750c1fee..2ea1c7de 100755 --- a/workflows/cp1/sh/infer.sh +++ b/workflows/cp1/sh/infer.sh @@ -16,7 +16,7 @@ then exit 1 fi -INSTANCE_DIRECTORY=$1 +INSTANCE_DIRECTORY=$1 DF="$2" MODEL_FILE="$3" N_PRED=$4 diff --git a/workflows/cp1/swift/infer_workflow.swift b/workflows/cp1/swift/infer_workflow.swift index d821adfb..65037628 100644 --- a/workflows/cp1/swift/infer_workflow.swift +++ b/workflows/cp1/swift/infer_workflow.swift @@ -40,7 +40,7 @@ string n_pred = argv("n_pred"); */ app (void o) run_model (string model_sh, string instance_dir, string data_file, string model_file, string run_id) { - // 1 2 3 4 5 + // 1 2 3 4 5 "bash" model_sh instance_dir data_file model_file n_pred run_id; } @@ -75,4 +75,3 @@ main() { write_lines(inputs, "log.txt"); } - diff --git a/workflows/cp1/swift/nci_workflow.swift b/workflows/cp1/swift/nci_workflow.swift index 47a85eea..6739e095 100644 --- a/workflows/cp1/swift/nci_workflow.swift +++ 
b/workflows/cp1/swift/nci_workflow.swift @@ -80,7 +80,7 @@ cache_dir = '%s' if len(cell_feature_subset_path) > 0: params['cell_feature_subset_path'] = cell_feature_subset_path # GDSC_NCI60_1600_800_features.txt - # GDSC_NCI60_2000_1000.h5 + # GDSC_NCI60_2000_1000.h5 import os ex_data_f = os.path.basename(params['cell_feature_subset_path']) idx = ex_data_f.rfind('_features') @@ -130,7 +130,7 @@ record_id = DB.insert_xcorr_record(studies=studies, """; -(string hpo_id) insert_hpo(string xcorr_record_id) +(string hpo_id) insert_hpo(string xcorr_record_id) { hpo_template = """ @@ -145,7 +145,7 @@ hpo_id = DB.insert_hpo_record(%s) hpo_id = python_persist(code, "str(hpo_id)"); } -(string run_id) insert_hpo_run(string hpo_id, string param_string, string run_directory) +(string run_id) insert_hpo_run(string hpo_id, string param_string, string run_directory) { run_template = """ @@ -160,7 +160,7 @@ run_id = DB.insert_hpo_run(%s, '%s', '%s') run_id = python_persist(code, "str(run_id)"); } -(void o) update_hpo_run(string run_id, string result) +(void o) update_hpo_run(string run_id, string result) { update_template = """ @@ -267,7 +267,7 @@ uno_xcorr.coxen_feature_selection(study1, study2, } else { results[j] = result; } - + // update_hpo_run(run_db_id, results[j]); // TODO DB: insert result with record_id } diff --git a/workflows/cp1/swift/upf_workflow.sh b/workflows/cp1/swift/upf_workflow.sh index beed8286..db3e3e95 100755 --- a/workflows/cp1/swift/upf_workflow.sh +++ b/workflows/cp1/swift/upf_workflow.sh @@ -70,7 +70,7 @@ export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT export TURBINE_JOBNAME="JOB:${EXPID}" -if [ -z ${GPU_STRING+x} ]; +if [ -z ${GPU_STRING+x} ]; then GPU_ARG="" else diff --git a/workflows/cp1/swift/upf_workflow.swift b/workflows/cp1/swift/upf_workflow.swift index 4e778176..b3ddd1a5 100644 --- a/workflows/cp1/swift/upf_workflow.swift +++ b/workflows/cp1/swift/upf_workflow.swift @@ -110,7 +110,7 @@ main() { //make_dir(instance) => { string param_code = update_param_template % (params, instance); //printf(param_code); - + updated_param = python_persist(param_code, "params_json"); inputs[i] = "%i|%f|%s" % (i, clock(), updated_param); string result = obj(updated_param, int2string(i)) => @@ -121,4 +121,3 @@ main() { write_lines(inputs, "inputs.txt"); write_lines(results, "results.txt"); } - diff --git a/workflows/cp1/swift/workflow.swift b/workflows/cp1/swift/workflow.swift index 48388aa6..62693846 100644 --- a/workflows/cp1/swift/workflow.swift +++ b/workflows/cp1/swift/workflow.swift @@ -80,7 +80,7 @@ cell_feature_subset_path = '%s' if len(cell_feature_subset_path) > 0: params['cell_feature_subset_path'] = cell_feature_subset_path # GDSC_NCI60_1600_800_features.txt - # GDSC_NCI60_2000_1000.h5 + # GDSC_NCI60_2000_1000.h5 import os ex_data_f = os.path.basename(params['cell_feature_subset_path']) idx = ex_data_f.rfind('_features') @@ -132,7 +132,7 @@ record_id = DB.insert_xcorr_record(studies=studies, """; -(string hpo_id) insert_hpo(string xcorr_record_id) +(string hpo_id) insert_hpo(string xcorr_record_id) { hpo_template = """ @@ -147,7 +147,7 @@ hpo_id = DB.insert_hpo_record(%s) hpo_id = python_persist(code, "str(hpo_id)"); } -(string run_id) insert_hpo_run(string hpo_id, string param_string, string run_directory) +(string run_id) insert_hpo_run(string hpo_id, string param_string, string run_directory) { run_template = """ @@ -162,7 +162,7 @@ run_id = DB.insert_hpo_run(%s, '%s', '%s') run_id = python_persist(code, "str(run_id)"); } -(void o) 
update_hpo_run(string run_id, string result) +(void o) update_hpo_run(string run_id, string result) { update_template = """ diff --git a/workflows/cp1/test/cfg-prm-1.sh b/workflows/cp1/test/cfg-prm-1.sh index 659d8f82..15a87a05 100644 --- a/workflows/cp1/test/cfg-prm-1.sh +++ b/workflows/cp1/test/cfg-prm-1.sh @@ -41,7 +41,7 @@ export DRUG_REPSONSE_DATA=$BENCHMARKS_ROOT/Data/Pilot1/rescaled_combined_single_ # Location of mlrMBO input file. uno_quick_test is # appropriate for testing PARAM_SET_FILE=$EMEWS_PROJECT_ROOT/data/uno_quick_test.R -# Actual useful mlrMBO input file for uno: uno_hpo.R +# Actual useful mlrMBO input file for uno: uno_hpo.R # PARAM_SET_FILE=$EMEWS_PROJECT_ROOT/data/uno_hpo.R if [[ "${PARAM_SET_FILE:-}" == "" ]]; then diff --git a/workflows/cp1/test/cfg-sys-1.sh b/workflows/cp1/test/cfg-sys-1.sh index 11fb3fa5..d7e2f118 100644 --- a/workflows/cp1/test/cfg-sys-1.sh +++ b/workflows/cp1/test/cfg-sys-1.sh @@ -2,7 +2,7 @@ # MLRMBO CFG SYS 1 # The total number of MPI processes including 2 -# for swift internals, and the number of +# for swift internals, and the number of # mlrMBO instances and the number of individual # Uno HPO runs. export PROCS=${PROCS:-6} diff --git a/workflows/cp1/test/cfg-sys-3.sh b/workflows/cp1/test/cfg-sys-3.sh index 6c30a103..d6f4718b 100644 --- a/workflows/cp1/test/cfg-sys-3.sh +++ b/workflows/cp1/test/cfg-sys-3.sh @@ -17,7 +17,7 @@ export PPN=${PPN:-1} # For Theta: # export QUEUE=${QUEUE:-debug-flat-quad} # export QUEUE=R.candle -export QUEUE=default +export QUEUE=default export WALLTIME=${WALLTIME:-01:59} diff --git a/workflows/cp1/test/create-new-test.sh b/workflows/cp1/test/create-new-test.sh index 42c82669..83a6858b 100755 --- a/workflows/cp1/test/create-new-test.sh +++ b/workflows/cp1/test/create-new-test.sh @@ -18,6 +18,3 @@ sed -i -e "s/PROPOSE_POINTS:-5/PROPOSE_POINTS:-$1/g" cfg-prm-$1.sh sed -i -e "s/MAX_CONCURRENT_EVALUATIONS:-1/MAX_CONCURRENT_EVALUATIONS:-$1/g" cfg-prm-$1.sh sed -i -e "s/DESIGN_SIZE:-10/DESIGN_SIZE:-$1/g" cfg-prm-$1.sh sed -i -e "s/MAX_BUDGET:-180/MAX_BUDGET:-$Budget/g" cfg-prm-$1.sh - - - diff --git a/workflows/cp1/test_infer/cfg-prm-1.sh b/workflows/cp1/test_infer/cfg-prm-1.sh index 82de21fd..b232b1df 100644 --- a/workflows/cp1/test_infer/cfg-prm-1.sh +++ b/workflows/cp1/test_infer/cfg-prm-1.sh @@ -10,5 +10,3 @@ XCORR_DATA_DIR=$EMEWS_PROJECT_ROOT/xcorr_data UPF_FILE=$EMEWS_PROJECT_ROOT/data/infer_upf.txt # Number of predictions to make for each inference runs N_PRED=30 - - diff --git a/workflows/cp1/test_infer/cfg-prm-250.sh b/workflows/cp1/test_infer/cfg-prm-250.sh index 2404db84..0ff1d57a 100644 --- a/workflows/cp1/test_infer/cfg-prm-250.sh +++ b/workflows/cp1/test_infer/cfg-prm-250.sh @@ -5,4 +5,3 @@ CACHE_DIR=$EMEWS_PROJECT_ROOT/cache XCORR_DATA_DIR=$EMEWS_PROJECT_ROOT/xcorr_data UPF_FILE=$EMEWS_PROJECT_ROOT/data/infer_upf_a.txt N_PRED=30 - diff --git a/workflows/cp1/test_upf/cfg-prm-1.sh b/workflows/cp1/test_upf/cfg-prm-1.sh index 3b498310..e11a9178 100644 --- a/workflows/cp1/test_upf/cfg-prm-1.sh +++ b/workflows/cp1/test_upf/cfg-prm-1.sh @@ -8,4 +8,3 @@ XCORR_DATA_DIR=$EMEWS_PROJECT_ROOT/xcorr_data # Location of the input file that contains the parameters for each run # 1 per row UPF_FILE=$EMEWS_PROJECT_ROOT/data/upf.txt - diff --git a/workflows/grid/README.md b/workflows/grid/README.md index 247ae873..5c0d11b2 100644 --- a/workflows/grid/README.md +++ b/workflows/grid/README.md @@ -1,14 +1,18 @@ # Simple parameter sweep with Swift, using command line programs + **run** runs **run-sweep.swift**, which runs a 
parameter sweep. It calls command-line programs as follows: + - determineParameters.{sh,py}: Read data/ **settings.json** for sweep parameters, and return as a string for use by Swift program - evaluateOne.{sh,py}: Runs a single experiment. (Calls p1b1_baseline). - computeStats.{sh,py}: Ingests data from all of the experiments and computes simple stats. -Usage: ./run +Usage: ./run Notes: -- **settings.json**: sweep parameters. Parameters must be labeled "1", "2", "3", "4", ... -1: epochs + +- **settings.json**: sweep parameters. Parameters must be labeled "1", "2", "3", "4", ... + 1: epochs + 2. batch_size 3. N1 -4. NE \ No newline at end of file +4. NE diff --git a/workflows/grid/data/settings.json b/workflows/grid/data/settings.json index acb160ba..01a65814 100644 --- a/workflows/grid/data/settings.json +++ b/workflows/grid/data/settings.json @@ -1,11 +1,10 @@ { - "parameters": - { - "epochs": [4, 6, 8 ], - "batch_size": [30, 40], - "N1": [1500], - "NE": [600], - "latent_dim": [2, 8, 16, 32, 64], - "learning_rate": [0.00001, 0.0001, 0.001, 0.1] - } + "parameters": { + "epochs": [4, 6, 8], + "batch_size": [30, 40], + "N1": [1500], + "NE": [600], + "latent_dim": [2, 8, 16, 32, 64], + "learning_rate": [0.00001, 0.0001, 0.001, 0.1] + } } diff --git a/workflows/grid/python/computeStats.py b/workflows/grid/python/computeStats.py index f414c378..4e33ee1b 100644 --- a/workflows/grid/python/computeStats.py +++ b/workflows/grid/python/computeStats.py @@ -1,40 +1,44 @@ +import json +import os import sys from collections import defaultdict -import json, os + def extractVals(A): B = defaultdict(dict) A1 = A.split() for n, val in zip(A1[0::2], A1[1::2]): B[n] = float(val) - return(B) + return B + def computeStats(swiftArrayAsString): A = extractVals(swiftArrayAsString) vals = [] for a in A: vals += [A[a]] - print('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals)))) - - filename = os.environ['TURBINE_OUTPUT']+ "/final_stats.txt" - # writing the val loss to the output file - with open(filename, 'w') as the_file: - the_file.write('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals)))) + print("%d values, with min=%f, max=%f, avg=%f\n" % + (len(vals), min(vals), max(vals), sum(vals) / float(len(vals)))) + filename = os.environ["TURBINE_OUTPUT"] + "/final_stats.txt" + # writing the val loss to the output file + with open(filename, "w") as the_file: + the_file.write( + "%d values, with min=%f, max=%f, avg=%f\n" % + (len(vals), min(vals), max(vals), sum(vals) / float(len(vals)))) -if (len(sys.argv) < 2): - print('requires arg=dataFilename') - sys.exit(1) +if len(sys.argv) < 2: + print("requires arg=dataFilename") + sys.exit(1) dataFilename = sys.argv[1] try: - with open(dataFilename, 'r') as the_file: + with open(dataFilename, "r") as the_file: data = the_file.read() except IOError as e: print("Could not open: %s" % dataFilename) print("PWD is: '%s'" % os.getcwd()) computeStats(data) - diff --git a/workflows/grid/python/determineParameters.py b/workflows/grid/python/determineParameters.py index 574c4819..ad85a36f 100644 --- a/workflows/grid/python/determineParameters.py +++ b/workflows/grid/python/determineParameters.py @@ -1,7 +1,10 @@ -import sys, json, os +import json +import os +import sys # ===== Definitions ========================================================= + def loadSettings(settingsFilename): print("Reading settings: %s" % settingsFilename) try: @@ -15,44 +18,54 @@ def 
loadSettings(settingsFilename): epochs = settings['parameters']["epochs"] batch_size = settings['parameters']["batch_size"] N1 = settings['parameters']["N1"] - NE = settings['parameters']["NE"] - latent_dim = settings['parameters']["latent_dim"] - learning_rate = settings['parameters']["learning_rate"] + NE = settings['parameters']["NE"] + latent_dim = settings['parameters']["latent_dim"] + learning_rate = settings['parameters']["learning_rate"] except KeyError as e: - print("Settings file (%s) does not contain key: %s" % (settingsFilename, str(e))) + print("Settings file (%s) does not contain key: %s" % + (settingsFilename, str(e))) sys.exit(1) - return(epochs, batch_size, N1, NE, latent_dim, learning_rate) + return (epochs, batch_size, N1, NE, latent_dim, learning_rate) + def expand(Vs, fr, to, soFar): soFarNew = [] for s in soFar: for v in Vs[fr]: - if s == '': - soFarNew += [str(v)] - else: - soFarNew += [s+','+str(v)] - if fr==to: - return(soFarNew) + if s == '': + soFarNew += [str(v)] + else: + soFarNew += [s + ',' + str(v)] + if fr == to: + return (soFarNew) else: - return expand(Vs, fr+1, to, soFarNew) + return expand(Vs, fr + 1, to, soFarNew) + # ===== Main program ======================================================== if (len(sys.argv) < 3): - print('requires arg1=settingsFilename and arg2=paramsFilename') - sys.exit(1) + print('requires arg1=settingsFilename and arg2=paramsFilename') + sys.exit(1) settingsFilename = sys.argv[1] -paramsFilename = sys.argv[2] +paramsFilename = sys.argv[2] -epochs, batch_size, N1, NE, latent_dim, learning_rate = loadSettings(settingsFilename) +epochs, batch_size, N1, NE, latent_dim, learning_rate = loadSettings( + settingsFilename) -values = {1:epochs, 2: batch_size, 3: N1, 4: NE, 5: latent_dim, 6: learning_rate} +values = { + 1: epochs, + 2: batch_size, + 3: N1, + 4: NE, + 5: latent_dim, + 6: learning_rate +} print values results = expand(values, 1, len(values), ['']) result = ':'.join(results) with open(paramsFilename, 'w') as the_file: the_file.write(result) - diff --git a/workflows/grid/python/evaluateOne.py b/workflows/grid/python/evaluateOne.py index 00910697..3b823eb6 100644 --- a/workflows/grid/python/evaluateOne.py +++ b/workflows/grid/python/evaluateOne.py @@ -1,48 +1,52 @@ +import json +import os +import socket import sys + import p1b1_runner -import json, os -import socket -if (len(sys.argv) < 3): - print('requires arg1=param and arg2=filename') - sys.exit(1) +if len(sys.argv) < 3: + print("requires arg1=param and arg2=filename") + sys.exit(1) parameterString = sys.argv[1] -filename = sys.argv[2] +filename = sys.argv[2] # print (parameterString) -print ("filename is " + filename) -print (socket.gethostname()) - -#List of hyperparameters - edit this to add or remove a parameter -epochs, batch_size, d1, d2, ld, lr = parameterString.split(',') - -hyper_parameter_map = {'epochs' : int(epochs)} -hyper_parameter_map['framework'] = 'keras' -hyper_parameter_map['batch_size'] = int(batch_size) -hyper_parameter_map['dense'] = [int(d1), int(d2)] -hyper_parameter_map['latent_dim'] = int(ld) -hyper_parameter_map['learning_rate'] = float(lr) - -hyper_parameter_map['run_id'] = parameterString -# hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] -hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+os.environ['PMI_RANK'] -sys.argv = ['p1b1_runner'] +print("filename is " + filename) +print(socket.gethostname()) + +# List of hyperparameters - edit this to add or remove a parameter +epochs, batch_size, d1, d2, 
ld, lr = parameterString.split(",") + +hyper_parameter_map = {"epochs": int(epochs)} +hyper_parameter_map["framework"] = "keras" +hyper_parameter_map["batch_size"] = int(batch_size) +hyper_parameter_map["dense"] = [int(d1), int(d2)] +hyper_parameter_map["latent_dim"] = int(ld) +hyper_parameter_map["learning_rate"] = float(lr) + +hyper_parameter_map["run_id"] = parameterString +# hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] +hyper_parameter_map["save"] = (os.environ["TURBINE_OUTPUT"] + "/output-" + + os.environ["PMI_RANK"]) +sys.argv = ["p1b1_runner"] val_loss = p1b1_runner.run(hyper_parameter_map) -print (val_loss) +print(val_loss) -sfn = os.environ['TURBINE_OUTPUT']+ "/output-"+os.environ['PMI_RANK'] + "/procname-" + parameterString -with open(sfn, 'w') as sfile: +sfn = (os.environ["TURBINE_OUTPUT"] + "/output-" + os.environ["PMI_RANK"] + + "/procname-" + parameterString) +with open(sfn, "w") as sfile: sfile.write(socket.getfqdn()) - proc_id = "-"+ str(os.getpid()) + proc_id = "-" + str(os.getpid()) sfile.write(proc_id) # works around this error: # https://github.com/tensorflow/tensorflow/issues/3388 from keras import backend as K + K.clear_session() # writing the val loss to the output file (result-*) -with open(filename, 'w') as the_file: +with open(filename, "w") as the_file: the_file.write(repr(val_loss)) - diff --git a/workflows/grid/python/p1b1_runner.py b/workflows/grid/python/p1b1_runner.py index 7ceb0c59..ddb43b10 100644 --- a/workflows/grid/python/p1b1_runner.py +++ b/workflows/grid/python/p1b1_runner.py @@ -1,24 +1,30 @@ # tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) # so we need to create a synthetic argv. import sys -if not hasattr(sys, 'argv'): - sys.argv = ['p1b1'] + +if not hasattr(sys, "argv"): + sys.argv = ["p1b1"] import json import os + import p1b1 import runner_utils + def run(hyper_parameter_map): - framework = hyper_parameter_map['framework'] - if framework is 'keras': + framework = hyper_parameter_map["framework"] + if framework is "keras": import p1b1_baseline_keras2 + pkg = p1b1_baseline_keras2 - elif framework is 'mxnet': + elif framework is "mxnet": import p1b1_baseline_mxnet + pkg = p1b1_baseline_mxnet - elif framework is 'neon': + elif framework is "neon": import p1b1_baseline_neon + pkg = p1b1_baseline_neon else: raise ValueError("Invalid framework: {}".format(framework)) @@ -27,23 +33,24 @@ def run(hyper_parameter_map): params = pkg.initialize_parameters() runner_utils.format_params(hyper_parameter_map) - for k,v in hyper_parameter_map.items(): - #if not k in params: + for k, v in hyper_parameter_map.items(): + # if not k in params: # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) params[k] = v print(params) history = pkg.run(params) - if framework is 'keras': + if framework is "keras": # works around this error: # https://github.com/tensorflow/tensorflow/issues/3388 try: from keras import backend as K + K.clear_session() - except AttributeError: # theano does not have this function + except AttributeError: # theano does not have this function pass # use the last validation_loss as the value to minimize - val_loss = history.history['val_loss'] + val_loss = history.history["val_loss"] return val_loss[-1] diff --git a/workflows/grid/swift/grid-sweep.swift b/workflows/grid/swift/grid-sweep.swift index 1344660a..78aceb5b 100644 --- a/workflows/grid/swift/grid-sweep.swift +++ b/workflows/grid/swift/grid-sweep.swift @@ -62,4 +62,3 @@ file tmp = write(repr(results)); //trace("Temporary 
filename is: " + filename(tmp)); computeStats(filename(tmp)); - diff --git a/workflows/grid/swift/workflow.sh b/workflows/grid/swift/workflow.sh index 27d2e909..3eabacbb 100755 --- a/workflows/grid/swift/workflow.sh +++ b/workflows/grid/swift/workflow.sh @@ -93,7 +93,7 @@ CMD_LINE_ARGS=( -param_set_file=$PARAM_SET_FILE -ds=$DESIGN_SIZE -pp=$PROPOSE_POINTS -it=$MAX_ITERATIONS - -settings=$EMEWS_PROJECT_ROOT/data/settings.json + -settings=$EMEWS_PROJECT_ROOT/data/settings.json -exp_id=$EXPID -benchmark_timeout=$BENCHMARK_TIMEOUT -site=$SITE diff --git a/workflows/grid/test/cfg-prm-1.sh b/workflows/grid/test/cfg-prm-1.sh index e7698292..49db0d0a 100644 --- a/workflows/grid/test/cfg-prm-1.sh +++ b/workflows/grid/test/cfg-prm-1.sh @@ -13,4 +13,3 @@ MAX_BUDGET=${MAX_BUDGET:-1800} DESIGN_SIZE=${DESIGN_SIZE:-2} PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/parameter_set.R} MODEL_NAME="p1b1" - diff --git a/workflows/grid/test/cfg-sys-1.sh b/workflows/grid/test/cfg-sys-1.sh index 6e48105f..b0afa605 100644 --- a/workflows/grid/test/cfg-sys-1.sh +++ b/workflows/grid/test/cfg-sys-1.sh @@ -18,4 +18,3 @@ export WALLTIME=${WALLTIME:-01:33:00} # Benchmark run timeout: benchmark run will timeouT # after the specified number of seconds. -1 is no timeout. BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} - diff --git a/workflows/mlrMBO/.gitignore b/workflows/mlrMBO/.gitignore index 8b137891..e69de29b 100644 --- a/workflows/mlrMBO/.gitignore +++ b/workflows/mlrMBO/.gitignore @@ -1 +0,0 @@ - diff --git a/workflows/mlrMBO/README.md b/workflows/mlrMBO/README.md index f3d36ec0..14768da6 100644 --- a/workflows/mlrMBO/README.md +++ b/workflows/mlrMBO/README.md @@ -2,56 +2,57 @@ mlrMBO is an iterative optimizer written in R. It evaluates the best values of hyperparameters for CANDLE "Benchmarks" available here: `git@github.com:ECP-CANDLE/Benchmarks.git` - given set of parameters. -## Running ## +## Running -1. cd into the *~/Supervisor/workflows/mlrMBO/test* directory -2. Specify the MODEL_NAME in *test-1.sh* file, hyperparameters in *cfg-prm-1.txt* -3. Specify the #procs, queue etc. in *cfg-sys-1.sh* file -4. Launch the test by invoking *./upf-1.sh * - where machine_name can be cori, theta, titan etc. +1. cd into the _~/Supervisor/workflows/mlrMBO/test_ directory +2. Specify the MODEL*NAME in \_test-1.sh* file, hyperparameters in _cfg-prm-1.txt_ +3. Specify the #procs, queue etc. in _cfg-sys-1.sh_ file +4. Launch the test by invoking _./upf-1.sh _ + where machine_name can be cori, theta, titan etc. 5. The benchmark will be run for the number of processors specified 6. Final objective function value will be available in the experiments directory and also printed - -## User requirements ## +## User requirements What you need to install to run the workflow: -* This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . +- This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . Clone and `cd` to `workflows/nt3_mlrMBO` (the directory containing this README). -* NT3 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . +- NT3 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . Clone and switch to the `frameworks` branch. 
-* benchmark data - - See the individual benchmarks README for obtaining the initial data +- benchmark data - + See the individual benchmarks README for obtaining the initial data -## Calling sequence ## +## Calling sequence Function calls :- -* test-1.sh -> swift/workflow.sh -> swift/workflow.swift -> -common/swift/obj_app.swift -> common/sh/model.sh -> -common/python/model_runner.py -> 'calls the benchmark' + +- test-1.sh -> swift/workflow.sh -> swift/workflow.swift -> + common/swift/obj_app.swift -> common/sh/model.sh -> + common/python/model_runner.py -> 'calls the benchmark' Scheduling scripts :- -* upf-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files -## Making Changes ## +- upf-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files + +## Making Changes -### Structure ### +### Structure -The point of the script structure is that it is easy to make copy and modify the `test-\*.sh` script, and the `cfg-\*.sh` scripts. These can be checked back into the repo for use by others. The `test-\*.sh` script and the `cfg-\*.sh` scripts should simply contain environment variables that control how `workflow.sh` and `workflow.swift` operate. +The point of the script structure is that it is easy to make copy and modify the `test-\*.sh` script, and the `cfg-\*.sh` scripts. These can be checked back into the repo for use by others. The `test-\*.sh` script and the `cfg-\*.sh` scripts should simply contain environment variables that control how `workflow.sh` and `workflow.swift` operate. `test-1` and `cfg-{sys,prm}-1` should be unmodified for simple testing. -### Calling a different objective function ### +### Calling a different objective function To call a different objective function: 1. Copy `common/swift/obj_app.swift` to a new directory and/or file name. 2. Edit the `app` function body to run your code and return the result. 3. Edit a `test-\*.sh` script to set environment variables: - * `OBJ_DIR`: Set this to the new directory (If changed. Otherwise, `OBJ_DIR` defaults to the absolute path to common/swift .) - * `OBJ_MODULE`: Set this to the Swift file name without suffix (If changed. Otherwise, `OBJ_MODULE` defaults to `obj_app` .) + - `OBJ_DIR`: Set this to the new directory (If changed. Otherwise, `OBJ_DIR` defaults to the absolute path to common/swift .) + - `OBJ_MODULE`: Set this to the Swift file name without suffix (If changed. Otherwise, `OBJ_MODULE` defaults to `obj_app` .) 4. Run it! Simple test for changing objective function: @@ -66,16 +67,16 @@ Swift: Assertion failed!: test-obj-fail.swift was successfully invoked! ... ``` -This indicates that the code in `test_obj_fail.swift` was executed instead of `obj_app.swift` . +This indicates that the code in `test_obj_fail.swift` was executed instead of `obj_app.swift` . -### Where to check for output ### +### Where to check for output This includes error output. -When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. +When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. -* On a local system, stdout/stderr for the workflow will go to your terminal. -* On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` +- On a local system, stdout/stderr for the workflow will go to your terminal. 
+- On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` The individual objective function (model) runs stdout/stderr go into directories of the form: diff --git a/workflows/mlrMBO/data/adrp_nightly.R b/workflows/mlrMBO/data/adrp_nightly.R index e0cc14f4..8a99d9e3 100644 --- a/workflows/mlrMBO/data/adrp_nightly.R +++ b/workflows/mlrMBO/data/adrp_nightly.R @@ -6,4 +6,3 @@ param.set <- makeParamSet( makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop")), makeDiscreteParam("dense", values = c("500 250 125 60 30", "250 125 60 30", "400 150 75 30","300 175 90 45 20","400 200 100 50 25", "350 170 85 40 20")) ) - diff --git a/workflows/mlrMBO/data/combo_hps_exp_01.R b/workflows/mlrMBO/data/combo_hps_exp_01.R index 2088b353..e97b062d 100644 --- a/workflows/mlrMBO/data/combo_hps_exp_01.R +++ b/workflows/mlrMBO/data/combo_hps_exp_01.R @@ -6,12 +6,12 @@ param.set <- makeParamSet( - + makeDiscreteParam("cell_features", values=c("mirna", "expression")), # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(1)), - + # use consecutive 1000-neuron layers to facilitate residual connections makeDiscreteParam("dense", values=c("1000", @@ -32,19 +32,16 @@ param.set <- makeParamSet( makeIntegerParam("batch_size", lower=5, upper=10, trafo = function(x) 2L^x), makeDiscreteParam("residual", values=c(1, 0)), - + makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")), makeDiscreteParam("optimizer", values=c("adam", "sgd", "rmsprop")), - + makeNumericParam("learning_rate", lower=0.00001, upper=0.1), - + makeDiscreteParam("reduce_lr", values=c(1, 0)), - + makeDiscreteParam("warmup_lr", values=c(1, 0)), - + makeIntegerParam("epochs", lower=5, upper=1000) ) - - - diff --git a/workflows/mlrMBO/data/combo_nightly.R b/workflows/mlrMBO/data/combo_nightly.R index 9e3effce..c140ba1a 100644 --- a/workflows/mlrMBO/data/combo_nightly.R +++ b/workflows/mlrMBO/data/combo_nightly.R @@ -6,20 +6,18 @@ param.set <- makeParamSet( - + makeDiscreteParam("cell_features", values=c("expression")), # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(1)), makeDiscreteParam("residual", values=c(1, 0)), - - + + makeDiscreteParam("reduce_lr", values=c(1, 0)), - + makeDiscreteParam("warmup_lr", values=c(1, 0)), - + makeIntegerParam("epochs", lower=1, upper=3) ) - - diff --git a/workflows/mlrMBO/data/p1b1_hps_exp_01.R b/workflows/mlrMBO/data/p1b1_hps_exp_01.R index 62f6ee7e..0da5e977 100644 --- a/workflows/mlrMBO/data/p1b1_hps_exp_01.R +++ b/workflows/mlrMBO/data/p1b1_hps_exp_01.R @@ -8,17 +8,17 @@ param.set <- makeParamSet( # we optimize for ae and vae separately makeDiscreteParam("model", values=c("ae")), - + # makeDiscreteParam("latent_dim", values=c(2, 8, 32, 128, 512)), makeIntegerParam("latent_dim", lower=1, upper=9, trafo = function(x) 2L^x), # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(0)), - + # large batch_size only makes sense when warmup_lr is on #makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512, 1024)), makeIntegerParam("batch_size", lower=5, upper=10, trafo = function(x) 2L^x), - + # use consecutive 978-neuron layers to facilitate residual connections makeDiscreteParam("dense", values=c("1500 500", "978 978", @@ -26,22 +26,20 @@ param.set <- makeParamSet( "978 978 978 978", "978 978 978 978 978", "978 978 978 978 978 978")), - + makeDiscreteParam("residual", 
values=c(1, 0)), - + makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")), - + makeDiscreteParam("optimizer", values=c("adam", "sgd")), - + makeNumericParam("learning_rate", lower=0.00001, upper=0.1), - + makeDiscreteParam("reduce_lr", values=c(1, 0)), - + makeDiscreteParam("warmup_lr", values=c(1, 0)), - + makeNumericParam("dropout", lower=0, upper=0.9), - + makeIntegerParam("epochs", lower=5, upper=1000) ) - - diff --git a/workflows/mlrMBO/data/p1b1_nightly.R b/workflows/mlrMBO/data/p1b1_nightly.R index 0ca68ca0..6b98b406 100644 --- a/workflows/mlrMBO/data/p1b1_nightly.R +++ b/workflows/mlrMBO/data/p1b1_nightly.R @@ -8,15 +8,15 @@ param.set <- makeParamSet( # we optimize for ae and vae separately makeDiscreteParam("model", values=c("ae")), - + # makeDiscreteParam("latent_dim", values=c(2, 8, 32, 128, 512)), makeIntegerParam("latent_dim", lower=1, upper=9, trafo = function(x) 2L^x), # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(0)), - - + + # use consecutive 978-neuron layers to facilitate residual connections # makeDiscreteParam("dense", values=c("1500 500", # "978 978", @@ -24,22 +24,20 @@ param.set <- makeParamSet( # "978 978 978 978", # "978 978 978 978 978", # "978 978 978 978 978 978")), - + makeDiscreteParam("residual", values=c(1, 0)), - + makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")), - + makeDiscreteParam("optimizer", values=c("adam", "sgd")), - + makeNumericParam("learning_rate", lower=0.00001, upper=0.1), - + makeDiscreteParam("reduce_lr", values=c(1, 0)), - + makeDiscreteParam("warmup_lr", values=c(1, 0)), - + makeNumericParam("dropout", lower=0, upper=0.9), - + makeIntegerParam("epochs", lower=2, upper=3) ) - - diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index f3fb4dbb..1549c3c2 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -36,13 +36,14 @@ usage() echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" } -if (( ${#} != 7 )) +if (( ${#} != 7 )) && (( ${#} != 5 )) then usage exit 1 fi -if ! { +if (( ${#} == 7 )) +then get_site $1 # Sets SITE get_expid $2 # Sets EXPID get_cfg_sys $3 @@ -50,13 +51,20 @@ if ! 
{ MODEL_NAME=$5 CANDLE_MODEL_TYPE=$6 CANDLE_IMAGE=$7 - } -then + + elif (( ${#} == 5 )) + then + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 +else usage exit 1 fi -echo "Running "$MODEL_NAME "workflow with" $CANDLE_MODEL_TYPE "and image" $CANDLE_IMAGE +# echo "Running "$MODEL_NAME "workflow with" $CANDLE_MODEL_TYPE "and image" $CANDLE_IMAGE # Set PYTHONPATH for BENCHMARK related stuff PYTHONPATH+=:$BENCHMARK_DIR # :$BENCHMARKS_ROOT/common # This is now candle_lib @@ -189,8 +197,6 @@ swift-t -O 0 -n $PROCS \ -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ -e MODEL_SH \ -e MODEL_NAME \ - -e CANDLE_MODEL_TYPE \ - -e CANDLE_IMAGE \ -e SITE \ -e BENCHMARK_TIMEOUT \ -e SH_TIMEOUT \ diff --git a/workflows/mlrMBO/test-1000-01-mbo/cfg-prm-restart.sh b/workflows/mlrMBO/test-1000-01-mbo/cfg-prm-restart.sh index 9d3afd6f..e1434cf3 100644 --- a/workflows/mlrMBO/test-1000-01-mbo/cfg-prm-restart.sh +++ b/workflows/mlrMBO/test-1000-01-mbo/cfg-prm-restart.sh @@ -2,7 +2,7 @@ # Configuration of parameters: 1 # mlrMBO settings -# How many to runs evaluate per iteration -> +# How many to runs evaluate per iteration -> #Adding the number of restart runs to the budget (9 - for the test case) #This is the minimum number of runs required for restart 9 (greater than 8, which is the design size) MAX_BUDGET=${MAX_BUDGET:-25} @@ -23,9 +23,7 @@ elif [ "$MODEL_NAME" = "p1b3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_hps_exp_01.R} elif [ "$MODEL_NAME" = "p1b2" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_hps_exp_01.R} -else +else echo "Invalid model-" $MODEL_NAME exit fi - - diff --git a/workflows/mlrMBO/test-1000-01-mbo/cfg-sys-1.sh b/workflows/mlrMBO/test-1000-01-mbo/cfg-sys-1.sh index e2781a0b..583de2d9 100644 --- a/workflows/mlrMBO/test-1000-01-mbo/cfg-sys-1.sh +++ b/workflows/mlrMBO/test-1000-01-mbo/cfg-sys-1.sh @@ -36,5 +36,3 @@ export SH_TIMEOUT=${SH_TIMEOUT:-2000} # Ignore errors: If 1, unknown errors will be reported to model.log # but will not bring down the Swift workflow. See model.sh . 
export IGNORE_ERRORS=1 - - diff --git a/workflows/mlrMBO/test-1000-01-mbo/test-restart.sh b/workflows/mlrMBO/test-1000-01-mbo/test-restart.sh index 69b2d3ad..6bd48cf9 100755 --- a/workflows/mlrMBO/test-1000-01-mbo/test-restart.sh +++ b/workflows/mlrMBO/test-1000-01-mbo/test-restart.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" +echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" RUN_DIR="" @@ -14,7 +14,7 @@ then echo "Automatically assigning run directory in ../experiments folder" RUN_DIR="-a" else - echo "Usage test SITE RUN_DIR(optional)" + echo "Usage test SITE RUN_DIR(optional)" exit 1 fi diff --git a/workflows/mlrMBO/test-1000-01-rs/cfg-prm-restart.sh b/workflows/mlrMBO/test-1000-01-rs/cfg-prm-restart.sh index 9d3afd6f..e1434cf3 100644 --- a/workflows/mlrMBO/test-1000-01-rs/cfg-prm-restart.sh +++ b/workflows/mlrMBO/test-1000-01-rs/cfg-prm-restart.sh @@ -2,7 +2,7 @@ # Configuration of parameters: 1 # mlrMBO settings -# How many to runs evaluate per iteration -> +# How many to runs evaluate per iteration -> #Adding the number of restart runs to the budget (9 - for the test case) #This is the minimum number of runs required for restart 9 (greater than 8, which is the design size) MAX_BUDGET=${MAX_BUDGET:-25} @@ -23,9 +23,7 @@ elif [ "$MODEL_NAME" = "p1b3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_hps_exp_01.R} elif [ "$MODEL_NAME" = "p1b2" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_hps_exp_01.R} -else +else echo "Invalid model-" $MODEL_NAME exit fi - - diff --git a/workflows/mlrMBO/test-1000-01-rs/cfg-sys-1.sh b/workflows/mlrMBO/test-1000-01-rs/cfg-sys-1.sh index e2781a0b..583de2d9 100644 --- a/workflows/mlrMBO/test-1000-01-rs/cfg-sys-1.sh +++ b/workflows/mlrMBO/test-1000-01-rs/cfg-sys-1.sh @@ -36,5 +36,3 @@ export SH_TIMEOUT=${SH_TIMEOUT:-2000} # Ignore errors: If 1, unknown errors will be reported to model.log # but will not bring down the Swift workflow. See model.sh . 
export IGNORE_ERRORS=1 - - diff --git a/workflows/mlrMBO/test-1000-01-rs/test-restart.sh b/workflows/mlrMBO/test-1000-01-rs/test-restart.sh index 69b2d3ad..6bd48cf9 100755 --- a/workflows/mlrMBO/test-1000-01-rs/test-restart.sh +++ b/workflows/mlrMBO/test-1000-01-rs/test-restart.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" +echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" RUN_DIR="" @@ -14,7 +14,7 @@ then echo "Automatically assigning run directory in ../experiments folder" RUN_DIR="-a" else - echo "Usage test SITE RUN_DIR(optional)" + echo "Usage test SITE RUN_DIR(optional)" exit 1 fi diff --git a/workflows/mlrMBO/test/cfg-prm-30.sh b/workflows/mlrMBO/test/cfg-prm-30.sh index c00e55fe..f379e9d8 100644 --- a/workflows/mlrMBO/test/cfg-prm-30.sh +++ b/workflows/mlrMBO/test/cfg-prm-30.sh @@ -19,9 +19,7 @@ elif [ "$MODEL_NAME" = "p1b3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_hps_exp_01.R} elif [ "$MODEL_NAME" = "p1b2" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_hps_exp_01.R} -else +else echo "Invalid model-" $MODEL_NAME exit fi - - diff --git a/workflows/mlrMBO/test/cfg-prm-restart.sh b/workflows/mlrMBO/test/cfg-prm-restart.sh index 13c19339..a7581e5b 100644 --- a/workflows/mlrMBO/test/cfg-prm-restart.sh +++ b/workflows/mlrMBO/test/cfg-prm-restart.sh @@ -2,7 +2,7 @@ # Configuration of parameters: 1 # mlrMBO settings -# How many to runs evaluate per iteration -> +# How many to runs evaluate per iteration -> #Adding the number of restart runs to the budget (9 - for the test case) #This is the minimum number of runs required for restart 9 (greater than 8, which is the design size) MAX_BUDGET=${MAX_BUDGET:-25} @@ -24,9 +24,7 @@ elif [ "$MODEL_NAME" = "p1b3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_hps_exp_01.R} elif [ "$MODEL_NAME" = "p1b2" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_hps_exp_01.R} -else +else echo "Invalid model-" $MODEL_NAME exit fi - - diff --git a/workflows/mlrMBO/test/cfg-sys-30.sh b/workflows/mlrMBO/test/cfg-sys-30.sh index f45ef2c5..0994073d 100644 --- a/workflows/mlrMBO/test/cfg-sys-30.sh +++ b/workflows/mlrMBO/test/cfg-sys-30.sh @@ -17,4 +17,3 @@ export WALLTIME=${WALLTIME:-3:00:00} # Benchmark run timeout: benchmark run will timeouT # after the specified number of seconds. -1 is no timeout. 
BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} - diff --git a/workflows/mlrMBO/test/create-new-test.sh b/workflows/mlrMBO/test/create-new-test.sh index 42c82669..83a6858b 100755 --- a/workflows/mlrMBO/test/create-new-test.sh +++ b/workflows/mlrMBO/test/create-new-test.sh @@ -18,6 +18,3 @@ sed -i -e "s/PROPOSE_POINTS:-5/PROPOSE_POINTS:-$1/g" cfg-prm-$1.sh sed -i -e "s/MAX_CONCURRENT_EVALUATIONS:-1/MAX_CONCURRENT_EVALUATIONS:-$1/g" cfg-prm-$1.sh sed -i -e "s/DESIGN_SIZE:-10/DESIGN_SIZE:-$1/g" cfg-prm-$1.sh sed -i -e "s/MAX_BUDGET:-180/MAX_BUDGET:-$Budget/g" cfg-prm-$1.sh - - - diff --git a/workflows/mlrMBO/test/restart-combo.csv b/workflows/mlrMBO/test/restart-combo.csv index 9268db2d..5ebc24c1 100644 --- a/workflows/mlrMBO/test/restart-combo.csv +++ b/workflows/mlrMBO/test/restart-combo.csv @@ -8,4 +8,3 @@ y,batch_size,epochs 41.376564008,512,26 6.5089799458,16,30 20.2991980919,64,23 - diff --git a/workflows/mlrMBO/test/test-30.sh b/workflows/mlrMBO/test/test-30.sh index 0b6789fd..304ad5fb 100755 --- a/workflows/mlrMBO/test/test-30.sh +++ b/workflows/mlrMBO/test/test-30.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" +echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" RUN_DIR="" @@ -14,7 +14,7 @@ then echo "Automatically assigning run directory in ../experiments folder" RUN_DIR="-a" else - echo "Usage test SITE RUN_DIR(optional)" + echo "Usage test SITE RUN_DIR(optional)" exit 1 fi diff --git a/workflows/mlrMBO/test/test-restart.sh b/workflows/mlrMBO/test/test-restart.sh index 6a172ae3..e242c987 100755 --- a/workflows/mlrMBO/test/test-restart.sh +++ b/workflows/mlrMBO/test/test-restart.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" +echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" RUN_DIR="" @@ -14,7 +14,7 @@ then echo "Automatically assigning run directory in ../experiments folder" RUN_DIR="-a" else - echo "Usage test SITE RUN_DIR(optional)" + echo "Usage test SITE RUN_DIR(optional)" exit 1 fi diff --git a/workflows/one-shot/load.py b/workflows/one-shot/load.py index 43b8b3c0..a77fb647 100644 --- a/workflows/one-shot/load.py +++ b/workflows/one-shot/load.py @@ -1,9 +1,10 @@ - # Performance test for pandas.read_csv() import sys + import pandas as pd F = sys.argv[1] -(pd.read_csv(F, header=None, low_memory=False, usecols=None).values).astype('float32') +(pd.read_csv(F, header=None, low_memory=False, + usecols=None).values).astype("float32") diff --git a/workflows/pbt/Readme.md b/workflows/pbt/Readme.md index e1cd16a2..88e61ef6 100644 --- a/workflows/pbt/Readme.md +++ b/workflows/pbt/Readme.md @@ -1,4 +1,4 @@ -# PBT Workflow # +# PBT Workflow PBT is an asynchronous optimization algorithm for jointly optimizing a population of models and their hyperparameters while effectively using a fixed @@ -6,12 +6,12 @@ computational budget. Like a simple parallel grid search, PBT begins by randomly sampling selected hyperparameters and initial weights and training multiple models in parallel using these hyperparameters and weights. However, unlike a parallel search, each training run periodically and -asynchronously runs an *evaluate* method when a model is considered *ready*, comparing its performance against that +asynchronously runs an _evaluate_ method when a model is considered _ready_, comparing its performance against that of other models. 
If it is under-performing, PBT uses two additional methods to -improve performance: *exploit* and *explore*. Exploit leverages the work of the +improve performance: _exploit_ and _explore_. Exploit leverages the work of the population as a whole by replacing an underperforming model with a better one, i.e., by replacing a model’s current weights with those of the better performing -model. Explore attempts to find new better performing hyperparameters by +model. Explore attempts to find new better performing hyperparameters by perturbing those of the better performing model. Training then continues with the new weights and the new hyperparameters. Evaluate, exploit, and explore are performed asynchronously and independently by each model for some specified @@ -30,25 +30,26 @@ necessary.) During the explore, a model perturbs the learning rate of the selected better performing model, and then continues training with the new weights and learning rate. -## Requirements ## +## Requirements -* This workflow: git@github.com:ECP-CANDLE/Supervisor.git. Clone and cd to workflows/pbt (the directory containing this README). +- This workflow: git@github.com:ECP-CANDLE/Supervisor.git. Clone and cd to workflows/pbt (the directory containing this README). -* Python: the PBT workflow has been tested under Python 2.7. +- Python: the PBT workflow has been tested under Python 2.7. -* MPI for Python (mpi4py): http://mpi4py.scipy.org/docs/ +- MPI for Python (mpi4py): http://mpi4py.scipy.org/docs/ -* Keras: https://keras.io +- Keras: https://keras.io -* CANDLE Benchmark Code: git@github.com:ECP-CANDLE/Benchmarks.git. Clone and switch to the frameworks branch. +- CANDLE Benchmark Code: git@github.com:ECP-CANDLE/Benchmarks.git. Clone and switch to the frameworks branch. -* TC1 benchmark data: - ``` +- TC1 benchmark data: + + ``` ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/type_18_300_test.csv ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/type_18_300_train.csv - ``` + ``` - `type_18_300_train.csv` and `type_18_300_test.csv` should be copied into `X/Benchmarks/Data/Pilot1`, where X is wherever you cloned the Benchmark repository. For example, from within X/Benchmarks + `type_18_300_train.csv` and `type_18_300_test.csv` should be copied into `X/Benchmarks/Data/Pilot1`, where X is wherever you cloned the Benchmark repository. For example, from within X/Benchmarks ``` mkdir -p Data/Pilot1 @@ -57,8 +58,8 @@ weights and learning rate. wget ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/type_18_300_train.csv ``` +## Running the Workflow -## Running the Workflow ## The PBT workflow is an MPI application that when given N number of processes, runs N - 1 tc1 models, and uses the remaining process to run a datastore into which the models can put and get model peformance data. The workflow can be run using the scripts in the `scripts` directory. Two scripts are provided: `local_run_pbt.sh` and `sbatch_run_pbt.sh`. The former can be used to run on a local desktop or laptop. The latter can be used to submit the PBT workflow on hpc resources @@ -68,9 +69,9 @@ When run the PBT workflow will create an experiments directory in which the outp each tc1 instance writes is model weights every epoch, and an output.csv file that records the accuracy, loss, learning rate, validation accuracy, and validation loss for each model (identified by MPI rank) each epoch. Additionally each tc1 model run will execute within its own `run_N` instance directory (e.g. 
`run_1`, `run_2` and so forth) within the output directory.

-### local_run_pbt.sh ###
+### local_run_pbt.sh

- `local_run_pbt.sh` takes 3 arguments
+`local_run_pbt.sh` takes 3 arguments

 1. The number of processes to use
 2. An experiment id
@@ -79,7 +80,7 @@ that records the accuracy, loss, learning rate, validation accuracy, and validat
 The experiment id is used as the name of the experiments directory into which the model output will be written as mentioned above. For example, given the location of the `scripts` directory as `workflows/pbt/scripts` and an
 experiment id of `r1`, the experiments directory will be `workflows/pbt/experiments/r1`.

-### sbatch_run_pbt.sh ###
+### sbatch_run_pbt.sh

 `sbatch_run_pbt.sh` takes 2 arguments:

@@ -93,107 +94,119 @@ experiment id of `r1`, the experiments directory will be `workflows/pbt/experime
 `scripts/pbt.sbatch`. That file can be copied and edited as appropriate, setting the queue, walltime, python, etc. for your HPC machine. It is currently configured for NERSC's Cori system.

-### Hyperparameter Configuration File ###
+### Hyperparameter Configuration File

 The PBT workflow uses a json format file for defining the hyperparameter space used by the PBT algorithm. The PBT workflow includes 2 sample
 hyperparameter configuration files for the tc1 model.

-* `data/tc1_params_full.json`: runs the full tc1 model, including the default convolution layer and no feature subsampling.
-* `data/tc1_params_small.json`: runs a faster version of the tc1 model by ommitting the convolution layer and subsampling the features.
+- `data/tc1_params_full.json`: runs the full tc1 model, including the default convolution layer and no feature subsampling.
+- `data/tc1_params_small.json`: runs a faster version of the tc1 model by omitting the convolution layer and subsampling the features.

 The hyperparameter configuration file has a json format consisting of a list of json dictionaries, each one of which defines a hyperparameter. Each dictionary has the following required keys:

-* name: the name of the hyperparameter (e.g. epochs)
-* type: determines how the models are initialized from the named parameter - one of `constant`, `int`, `float`, `logical`, or `categorical`.
- * `constant`: all the tc1 models are initialized with the specifed value
- * `int`: each tc1 model is initialized with an int randomly drawn from the range defined by `lower` and `upper` bounds
- * `float`: each tc1 model is initialized with a float randomly drawn from the range defined by `lower` and `upper` bounds
- * `logical`: each tc1 model is initialized with a random boolean.
- * `categorical`: each tc1 model is initialized with an element chosen at random from the list of elements in `values`.
+- name: the name of the hyperparameter (e.g. epochs)
+- type: determines how the models are initialized from the named parameter - one of `constant`, `int`, `float`, `logical`, or `categorical`.
+  - `constant`: all the tc1 models are initialized with the specified value
+  - `int`: each tc1 model is initialized with an int randomly drawn from the range defined by `lower` and `upper` bounds
+  - `float`: each tc1 model is initialized with a float randomly drawn from the range defined by `lower` and `upper` bounds
+  - `logical`: each tc1 model is initialized with a random boolean.
+  - `categorical`: each tc1 model is initialized with an element chosen at random from the list of elements in `values`.

 The following keys are required depending on the value of the `type` key.
If the `type` is `constant`: - * `value`: the constant value + +- `value`: the constant value If the `type` is `int`, or `float`: - * `lower`: the lower bound of the range to randomly draw from - * `upper`: the upper bound of the range to randomly draw from + +- `lower`: the lower bound of the range to randomly draw from +- `upper`: the upper bound of the range to randomly draw from If the `type` is `categorical`: - * `values`: the list of elements to randomly choose from - * `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` + +- `values`: the list of elements to randomly choose from +- `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` A sample hyperparameter definition file: ```javascript [ { - "name": "epochs", - "type": "constant", - "value": 5 + name: "epochs", + type: "constant", + value: 5, }, { - "name": "activation", - "type": "categorical", - "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + name: "activation", + type: "categorical", + element_type: "string", + values: [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear", + ], }, { - "name": "batch_size", - "type": "categorical", - "element_type": "int", - "values": [32, 64] + name: "batch_size", + type: "categorical", + element_type: "int", + values: [32, 64], }, { - "name": "lr", - "type": "float", - "lower": 0.0001, - "upper": 0.01 - } -] + name: "lr", + type: "float", + lower: 0.0001, + upper: 0.01, + }, +]; ``` Note that any other keys are ignored by the workflow but can be used to add additional information about the hyperparameter. For example, the sample files contain a `comment` entry that contains additional information about that hyperparameter. - -## Workflow Explained ## +## Workflow Explained The workflow consists of 3 parts. The DNN tc1 model in `models/tc1`, the PBT python code in `python/pbt.py` and the python code that runs the tc1 model using PBT, `python/tc1_pbt.py`. -### tc1 ### +### tc1 The tc1 model is a lightly modified version of the CANDLE tc1 benchmark. The code has been updated so that an external Keras callback can be passed through `models/tc1/tc1_runner.run()` and attached to the model. The PBT algorithnm is run via this callback. -### `python/pbt.py` ### +### `python/pbt.py` `pbt.py` provides the model-agnostic framework code for implementing a PBT workflow. It has 4 main components. 1. A PBTMetaDataStore class. This implements an in-memory datastore for the model run performance and hyperparamter data. It also manages a locking scheme for model weight file IO in order to prevent issues with concurrent -file access. + file access. 2. A PBTClient class. This allows an individual instance of a model to communicate with the PBTMetaDataStore, sending it peformance data, querying -performance data for a better performing model, requesting read and write locks for reading other model weights and writing its own. The PBTClient and -PBTMetaDataStore communicate via MPI. + performance data for a better performing model, requesting read and write locks for reading other model weights and writing its own. The PBTClient and + PBTMetaDataStore communicate via MPI. 3. A PBTCallback class. 
This is a Keras callback that given model-specific
-*ready*, *exploit*, and *explore* implementations will pass its current performance data to the data store and write its model's weights
-every epoch. Then when *ready*, it will perform an an *evaluate* to find a better performing model. Assuming one is found, an *exploit* and *explore* be peformed to update its model's weights and hyperparameters appropriately. A PBTCallback uses a PBTClient to ommunicate with a PBTMetaDataStore.
+   _ready_, _exploit_, and _explore_ implementations will pass its current performance data to the data store and write its model's weights
+   every epoch. Then when _ready_, it will perform an _evaluate_ to find a better performing model. Assuming one is found, an _exploit_ and _explore_ will be performed to update its model's weights and hyperparameters appropriately. A PBTCallback uses a PBTClient to communicate with a PBTMetaDataStore.

-4. A PBTWorker interface. This interface defines the API for PBT's *ready*,
-*exploit* and *explore* steps. Client code implements this interface,
-supplying implementations appropriate to that particular workflow.
+4. A PBTWorker interface. This interface defines the API for PBT's _ready_,
+   _exploit_ and _explore_ steps. Client code implements this interface,
+   supplying implementations appropriate to that particular workflow.

-### `python/tc1_pbt.py` ###
+### `python/tc1_pbt.py`

 `tc1_pbt.py` implements PBT for the tc1 model using the classes and functions
 in `pbt.py`. In `tc1_pbt.py`, rank 0 first generates and distributes the hyperparameters to the models running on the other ranks. The ga_utils package is used to read the hyperparameter definition file (see above) and generate,
@@ -204,28 +217,28 @@ PBTMetaDataStore's constructor is passed
 the path of the output directory where the `output.csv` file will be written together with the path to a log file in which user customizable log messages are written. PBTMetaDataStore also takes a reference
-to an *evaluate* function that is used to evaluate a model's current performance
+to an _evaluate_ function that is used to evaluate a model's current performance
 and select a better performing model. That function
-must have the following arguments: a list of dictionaries that contains the metadata for all the models, and a *score* against which model performance is determined. Exactly what the score represents (e.g. the validation loss) is
+must have the following arguments: a list of dictionaries that contains the metadata for all the models, and a _score_ against which model performance is determined. Exactly what the score represents (e.g. the validation loss) is
 domain specific and is provided in the `PBTWorker.pack_data` method described below.

-In `tc1_pbt.py`, `truncation_select` implements this *evaluate* function and is passed to the PBTMetaDataStore. In `truncation_select`, if the specified score is in the top 80% of scores, then an empty dictionary is returned. This empty dictionary indicates that a better performing model was not found and thus
-*exploit* and *explore* should not occur. If the specified score is in the bottom 20% then the data for a model in the top 20% is random selected
-and returned in a python dictionary. The data in this dictionary, the rank of the better performing model and its relevant hyperparameters can then be used in *exploit* and *explore*.
+In `tc1_pbt.py`, `truncation_select` implements this _evaluate_ function and is passed to the PBTMetaDataStore. In `truncation_select`, if the specified score is in the top 80% of scores, then an empty dictionary is returned. This empty dictionary indicates that a better performing model was not found and thus
+_exploit_ and _explore_ should not occur. If the specified score is in the bottom 20% then the data for a model in the top 20% is randomly selected
+and returned in a Python dictionary. The data in this dictionary, the rank of the better performing model and its relevant hyperparameters can then be used in _exploit_ and _explore_.

 With the PBTMetaDataStore initialized on rank 0, all the remaining processes run the tc1 model. A PBTCallback is added to each one of these models. The PBTCallback constructor requires an instance of a class that implements the PBTWorker interface. A PBTCallback calls the 3 methods of a PBTWorker to:

 1. Retrieve a model's metadata and hyperparameters in order to put them in the
-PBTMetaDataStore (`PBTWorker.pack_data`),
-2. Specifies which performance metric to use as the 'score' for model performance (also in `PBTWorker.pack_data`) in an *evaluate*.
-3. Determine when a model is *ready* for a potential exploit and explore (`PBTWorker.ready`),
-4. Perform the *exploit* and *explore* update (`PBTWorker.update`).
+   PBTMetaDataStore (`PBTWorker.pack_data`),
+2. Specify which performance metric to use as the 'score' for model performance (also in `PBTWorker.pack_data`) in an _evaluate_.
+3. Determine when a model is _ready_ for a potential exploit and explore (`PBTWorker.ready`),
+4. Perform the _exploit_ and _explore_ update (`PBTWorker.update`).

 In the tc1 PBT workflow, `tc1_pbt.TC1PBTWorker` implements the `PBTWorker`
-interface. `TC1PBTWorker.pack_data` retrieves a model's current learning rate, and specifies the validation loss as the performance score. `TC1PBTWorker.ready` specifies that the model is *ready* every 5 epochs. (5 is too soon to begin sharing weights, but it serves as an example and does exercise the workflow code within a reasonable amount of time.) `TC1PBTWorker.update` updates the model with a better performing learning rate after having perturbed it. Note that `update` does not need to load the better performing model's weights. That is done automatically in PBTCallback.
+interface. `TC1PBTWorker.pack_data` retrieves a model's current learning rate, and specifies the validation loss as the performance score. `TC1PBTWorker.ready` specifies that the model is _ready_ every 5 epochs. (5 is too soon to begin sharing weights, but it serves as an example and does exercise the workflow code within a reasonable amount of time.) `TC1PBTWorker.update` updates the model with a better performing learning rate after having perturbed it. Note that `update` does not need to load the better performing model's weights. That is done automatically in PBTCallback.

 In sum then, in a PBTCallback at the end of every epoch:

@@ -233,19 +246,19 @@ In sum then, in a PBTCallback at the end of every epoch:
 2. `ready` is called to determine if a model is ready for an exploit / explore update.
 3. If `ready` returns true, then the PBTCallback queries the PBTMetaDataStore for a better performing model using the supplied evaluate function (e.g. `truncation_select`).
 4. If the selection function returns data from a better performing model, then
-`update` is called to update the under performing model with the better performing hyperparameters, and the PBTCallback loads the
-better performing model's weights into the under performing model.
+ `update` is called to update the under performing model with the better performing hyperparameters, and the PBTCallback loads the + better performing model's weights into the under performing model. -## Adapting the Workflow to a Different Model ## +## Adapting the Workflow to a Different Model `tc1_pbt.py` can easily be adapted to work with a different model. The following changes will need to be made: -* A new hyperparameter definition file. The rank 0 -code that reads this file can be re-used. +- A new hyperparameter definition file. The rank 0 + code that reads this file can be re-used. -* A new *evaluate* function. This can be passed to the PBTMetaDataStore -constructor in place of `truncation_select` +- A new _evaluate_ function. This can be passed to the PBTMetaDataStore + constructor in place of `truncation_select` -* A new PBTWorker implementation, implementing `ready`, `pack_data`, and -`update` as appropriate for the new model and workflow. This can be -passed to the PBTCallback in place of `TC1PBTWorker`. +- A new PBTWorker implementation, implementing `ready`, `pack_data`, and + `update` as appropriate for the new model and workflow. This can be + passed to the PBTCallback in place of `TC1PBTWorker`. diff --git a/workflows/pbt/data/tc1_params_full.json b/workflows/pbt/data/tc1_params_full.json index a2162270..03582d2a 100644 --- a/workflows/pbt/data/tc1_params_full.json +++ b/workflows/pbt/data/tc1_params_full.json @@ -16,7 +16,17 @@ "name": "activation", "type": "categorical", "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + "values": [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear" + ] }, { @@ -54,5 +64,4 @@ "type": "constant", "value": 200 } - ] diff --git a/workflows/pbt/data/tc1_params_small.json b/workflows/pbt/data/tc1_params_small.json index ebea49e5..63d60d05 100644 --- a/workflows/pbt/data/tc1_params_small.json +++ b/workflows/pbt/data/tc1_params_small.json @@ -23,7 +23,17 @@ "name": "activation", "type": "categorical", "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + "values": [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear" + ] }, { @@ -59,5 +69,4 @@ "type": "constant", "value": 5 } - ] diff --git a/workflows/pbt/models/mnist/mnist_cnn.py b/workflows/pbt/models/mnist/mnist_cnn.py index fb79ead0..fed76f2b 100644 --- a/workflows/pbt/models/mnist/mnist_cnn.py +++ b/workflows/pbt/models/mnist/mnist_cnn.py @@ -1,17 +1,16 @@ -'''Trains a simple convnet on the MNIST dataset. +"""Trains a simple convnet on the MNIST dataset. -Gets to 99.25% test accuracy after 12 epochs -(there is still a lot of margin for parameter tuning). -16 seconds per epoch on a GRID K520 GPU. -''' +Gets to 99.25% test accuracy after 12 epochs (there is still a lot of +margin for parameter tuning). 16 seconds per epoch on a GRID K520 GPU. 
+""" from __future__ import print_function + import keras +from keras import backend as K from keras.datasets import fashion_mnist +from keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K batch_size = 128 num_classes = 10 @@ -23,7 +22,7 @@ # the data, split between train and test sets (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data() -if K.image_data_format() == 'channels_first': +if K.image_data_format() == "channels_first": x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) input_shape = (1, img_rows, img_cols) @@ -32,39 +31,43 @@ x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) input_shape = (img_rows, img_cols, 1) -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') +x_train = x_train.astype("float32") +x_test = x_test.astype("float32") x_train /= 255 x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') +print("x_train shape:", x_train.shape) +print(x_train.shape[0], "train samples") +print(x_test.shape[0], "test samples") # convert class vectors to binary class matrices y_train = keras.utils.to_categorical(y_train, num_classes) y_test = keras.utils.to_categorical(y_test, num_classes) model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) +model.add( + Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=input_shape)) +model.add(Conv2D(64, (3, 3), activation="relu")) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Dropout(0.25)) model.add(Flatten()) -model.add(Dense(128, activation='relu')) +model.add(Dense(128, activation="relu")) model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) +model.add(Dense(num_classes, activation="softmax")) -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=keras.optimizers.Adadelta(), - metrics=['accuracy']) +model.compile( + loss=keras.losses.categorical_crossentropy, + optimizer=keras.optimizers.Adadelta(), + metrics=["accuracy"], +) -model.fit(x_train, y_train, - batch_size=batch_size, - epochs=epochs, - verbose=1, - validation_data=(x_test, y_test)) +model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(x_test, y_test), +) score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) +print("Test loss:", score[0]) +print("Test accuracy:", score[1]) diff --git a/workflows/pbt/models/tc1/tc1_baseline_keras2.py b/workflows/pbt/models/tc1/tc1_baseline_keras2.py index 6a993d90..dc95132c 100644 --- a/workflows/pbt/models/tc1/tc1_baseline_keras2.py +++ b/workflows/pbt/models/tc1/tc1_baseline_keras2.py @@ -1,49 +1,62 @@ -import pandas as pd -import numpy as np +import argparse +import gzip import os import sys -import gzip -import argparse + +import numpy as np +import pandas as pd + try: import configparser except ImportError: import ConfigParser as configparser from keras import backend as K - -from keras.layers import Input, Dense, Dropout, Activation, Conv1D, MaxPooling1D, Flatten, LocallyConnected1D +from keras.callbacks import CSVLogger, ModelCheckpoint, ReduceLROnPlateau +from keras.layers import ( + 
Activation, + Conv1D, + Dense, + Dropout, + Flatten, + Input, + LocallyConnected1D, + MaxPooling1D, +) +from keras.models import Model, Sequential, model_from_json, model_from_yaml from keras.optimizers import SGD, Adam, RMSprop -from keras.models import Sequential, Model, model_from_json, model_from_yaml from keras.utils import np_utils -from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau - from sklearn.metrics import accuracy_score -from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler +from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path = os.path.abspath(os.path.join(file_path, '..', 'common')) +lib_path = os.path.abspath(os.path.join(file_path, "..", "common")) sys.path.append(lib_path) -lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +lib_path2 = os.path.abspath(os.path.join(file_path, "..", "..", "common")) sys.path.append(lib_path2) import data_utils import p1_common - from solr_keras import CandleRemoteMonitor, TerminateOnTimeOut -#EPOCH = 400 -#BATCH = 20 -#CLASSES = 2 +# EPOCH = 400 +# BATCH = 20 +# CLASSES = 2 + +# PL = 60484 # 1 + 60483 these are the width of the RNAseq datasets +# P = 60483 # 60483 +# DR = 0.1 # Dropout rate -#PL = 60484 # 1 + 60483 these are the width of the RNAseq datasets -#P = 60483 # 60483 -#DR = 0.1 # Dropout rate def common_parser(parser): - parser.add_argument("--config_file", dest='config_file', type=str, - default=os.path.join(file_path, 'tc1_default_model.txt'), - help="specify model configuration file") + parser.add_argument( + "--config_file", + dest="config_file", + type=str, + default=os.path.join(file_path, "tc1_default_model.txt"), + help="specify model configuration file", + ) # Parse has been split between arguments that are common with the default neon parser # and all the other options @@ -52,50 +65,57 @@ def common_parser(parser): return parser + def get_tc1_parser(): - parser = argparse.ArgumentParser(prog='tc1_baseline', formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='Train Autoencoder - Pilot 1 Benchmark 1') + parser = argparse.ArgumentParser( + prog="tc1_baseline", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Train Autoencoder - Pilot 1 Benchmark 1", + ) + + return common_parser(parser) - return common_parser(parser) def read_config_file(file): - config=configparser.ConfigParser() + config = configparser.ConfigParser() config.read(file) - section=config.sections() - fileParams={} - - fileParams['data_url']=eval(config.get(section[0],'data_url')) - fileParams['train_data']=eval(config.get(section[0],'train_data')) - fileParams['test_data']=eval(config.get(section[0],'test_data')) - fileParams['model_name']=eval(config.get(section[0],'model_name')) - fileParams['conv']=eval(config.get(section[0],'conv')) - fileParams['dense']=eval(config.get(section[0],'dense')) - fileParams['activation']=eval(config.get(section[0],'activation')) - fileParams['out_act']=eval(config.get(section[0],'out_act')) - fileParams['loss']=eval(config.get(section[0],'loss')) - fileParams['optimizer']=eval(config.get(section[0],'optimizer')) - fileParams['feature_subsample']=eval(config.get(section[0],'feature_subsample')) - fileParams['metrics']=eval(config.get(section[0],'metrics')) - fileParams['epochs']=eval(config.get(section[0],'epochs')) - fileParams['batch_size']=eval(config.get(section[0],'batch_size')) - 
fileParams['drop']=eval(config.get(section[0],'drop')) - fileParams['classes']=eval(config.get(section[0],'classes')) - fileParams['pool']=eval(config.get(section[0],'pool')) - fileParams['save']=eval(config.get(section[0], 'save')) - fileParams['lr']=eval(config.get(section[0], 'lr')) - fileParams['timeout']=eval(config.get(section[0], 'timeout')) + section = config.sections() + fileParams = {} + + fileParams["data_url"] = eval(config.get(section[0], "data_url")) + fileParams["train_data"] = eval(config.get(section[0], "train_data")) + fileParams["test_data"] = eval(config.get(section[0], "test_data")) + fileParams["model_name"] = eval(config.get(section[0], "model_name")) + fileParams["conv"] = eval(config.get(section[0], "conv")) + fileParams["dense"] = eval(config.get(section[0], "dense")) + fileParams["activation"] = eval(config.get(section[0], "activation")) + fileParams["out_act"] = eval(config.get(section[0], "out_act")) + fileParams["loss"] = eval(config.get(section[0], "loss")) + fileParams["optimizer"] = eval(config.get(section[0], "optimizer")) + fileParams["feature_subsample"] = eval( + config.get(section[0], "feature_subsample")) + fileParams["metrics"] = eval(config.get(section[0], "metrics")) + fileParams["epochs"] = eval(config.get(section[0], "epochs")) + fileParams["batch_size"] = eval(config.get(section[0], "batch_size")) + fileParams["drop"] = eval(config.get(section[0], "drop")) + fileParams["classes"] = eval(config.get(section[0], "classes")) + fileParams["pool"] = eval(config.get(section[0], "pool")) + fileParams["save"] = eval(config.get(section[0], "save")) + fileParams["lr"] = eval(config.get(section[0], "lr")) + fileParams["timeout"] = eval(config.get(section[0], "timeout")) return fileParams + def initialize_parameters(): # Get command-line parameters parser = get_tc1_parser() args = parser.parse_args() - #print('Args:', args) + # print('Args:', args) # Get parameters from configuration file fileParameters = read_config_file(args.config_file) - #print ('Params:', fileParameters) + # print ('Params:', fileParameters) # Consolidate parameter set. 
Command-line parameters overwrite file configuration gParameters = p1_common.args_overwrite_config(args, fileParameters) return gParameters @@ -103,31 +123,33 @@ def initialize_parameters(): def load_data(train_path, test_path, gParameters): - print('Loading data...') - if gParameters['feature_subsample'] > 0: - usecols = list(range(gParameters['feature_subsample'])) + print("Loading data...") + if gParameters["feature_subsample"] > 0: + usecols = list(range(gParameters["feature_subsample"])) else: usecols = None - df_train = (pd.read_csv(train_path, header=None, usecols=usecols).values).astype('float32') - df_test = (pd.read_csv(test_path, header=None, usecols=usecols).values).astype('float32') - print('done') + df_train = (pd.read_csv(train_path, header=None, + usecols=usecols).values).astype("float32") + df_test = (pd.read_csv(test_path, header=None, + usecols=usecols).values).astype("float32") + print("done") - print('df_train shape:', df_train.shape) - print('df_test shape:', df_test.shape) + print("df_train shape:", df_train.shape) + print("df_test shape:", df_test.shape) seqlen = df_train.shape[1] - df_y_train = df_train[:,0].astype('int') - df_y_test = df_test[:,0].astype('int') + df_y_train = df_train[:, 0].astype("int") + df_y_test = df_test[:, 0].astype("int") - Y_train = np_utils.to_categorical(df_y_train,gParameters['classes']) - Y_test = np_utils.to_categorical(df_y_test,gParameters['classes']) + Y_train = np_utils.to_categorical(df_y_train, gParameters["classes"]) + Y_test = np_utils.to_categorical(df_y_test, gParameters["classes"]) df_x_train = df_train[:, 1:seqlen].astype(np.float32) df_x_test = df_test[:, 1:seqlen].astype(np.float32) -# X_train = df_x_train.as_matrix() -# X_test = df_x_test.as_matrix() + # X_train = df_x_train.as_matrix() + # X_test = df_x_test.as_matrix() X_train = df_x_train X_test = df_x_test @@ -144,22 +166,27 @@ def load_data(train_path, test_path, gParameters): def run(gParameters, callbacks): - print ('Params:', gParameters) + print("Params:", gParameters) - file_train = gParameters['train_data'] - file_test = gParameters['test_data'] - url = gParameters['data_url'] + file_train = gParameters["train_data"] + file_test = gParameters["test_data"] + url = gParameters["data_url"] - train_file = data_utils.get_file(file_train, url+file_train, cache_subdir='Pilot1') - test_file = data_utils.get_file(file_test, url+file_test, cache_subdir='Pilot1') + train_file = data_utils.get_file(file_train, + url + file_train, + cache_subdir="Pilot1") + test_file = data_utils.get_file(file_test, + url + file_test, + cache_subdir="Pilot1") - X_train, Y_train, X_test, Y_test = load_data(train_file, test_file, gParameters) + X_train, Y_train, X_test, Y_test = load_data(train_file, test_file, + gParameters) - print('X_train shape:', X_train.shape) - print('X_test shape:', X_test.shape) + print("X_train shape:", X_train.shape) + print("X_test shape:", X_test.shape) - print('Y_train shape:', Y_train.shape) - print('Y_test shape:', Y_test.shape) + print("Y_train shape:", Y_train.shape) + print("Y_test shape:", Y_test.shape) x_train_len = X_train.shape[1] @@ -168,116 +195,139 @@ def run(gParameters, callbacks): X_train = np.expand_dims(X_train, axis=2) X_test = np.expand_dims(X_test, axis=2) - print('X_train shape:', X_train.shape) - print('X_test shape:', X_test.shape) + print("X_train shape:", X_train.shape) + print("X_test shape:", X_test.shape) model = Sequential() dense_first = True - layer_list = list(range(0, len(gParameters['conv']), 3)) + layer_list = list(range(0, 
len(gParameters["conv"]), 3)) for l, i in enumerate(layer_list): - filters = gParameters['conv'][i] - filter_len = gParameters['conv'][i+1] - stride = gParameters['conv'][i+2] - print(i/3, filters, filter_len, stride) - if gParameters['pool']: - pool_list=gParameters['pool'] + filters = gParameters["conv"][i] + filter_len = gParameters["conv"][i + 1] + stride = gParameters["conv"][i + 2] + print(i / 3, filters, filter_len, stride) + if gParameters["pool"]: + pool_list = gParameters["pool"] if type(pool_list) != list: - pool_list=list(pool_list) + pool_list = list(pool_list) if filters <= 0 or filter_len <= 0 or stride <= 0: - break + break dense_first = False - if 'locally_connected' in gParameters: - model.add(LocallyConnected1D(filters, filter_len, strides=stride, padding='valid', input_shape=(x_train_len, 1))) + if "locally_connected" in gParameters: + model.add( + LocallyConnected1D( + filters, + filter_len, + strides=stride, + padding="valid", + input_shape=(x_train_len, 1), + )) else: - #input layer + # input layer if i == 0: - model.add(Conv1D(filters=filters, kernel_size=filter_len, strides=stride, padding='valid', input_shape=(x_train_len, 1))) + model.add( + Conv1D( + filters=filters, + kernel_size=filter_len, + strides=stride, + padding="valid", + input_shape=(x_train_len, 1), + )) else: - model.add(Conv1D(filters=filters, kernel_size=filter_len, strides=stride, padding='valid')) - model.add(Activation(gParameters['activation'])) - if gParameters['pool']: - model.add(MaxPooling1D(pool_size=pool_list[i//3])) + model.add( + Conv1D( + filters=filters, + kernel_size=filter_len, + strides=stride, + padding="valid", + )) + model.add(Activation(gParameters["activation"])) + if gParameters["pool"]: + model.add(MaxPooling1D(pool_size=pool_list[i // 3])) if not dense_first: model.add(Flatten()) - for i, layer in enumerate(gParameters['dense']): + for i, layer in enumerate(gParameters["dense"]): if layer: if i == 0 and dense_first: model.add(Dense(layer, input_shape=(x_train_len, 1))) else: model.add(Dense(layer)) - model.add(Activation(gParameters['activation'])) - if gParameters['drop']: - model.add(Dropout(gParameters['drop'])) + model.add(Activation(gParameters["activation"])) + if gParameters["drop"]: + model.add(Dropout(gParameters["drop"])) if dense_first: model.add(Flatten()) - model.add(Dense(gParameters['classes'])) - - model.add(Activation(gParameters['out_act'])) - -#Reference case -#model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1))) -#model.add(Activation('relu')) -#model.add(MaxPooling1D(pool_size=1)) -#model.add(Conv1D(filters=128, kernel_size=10, strides=1, padding='valid')) -#model.add(Activation('relu')) -#model.add(MaxPooling1D(pool_size=10)) -#model.add(Flatten()) -#model.add(Dense(200)) -#model.add(Activation('relu')) -#model.add(Dropout(0.1)) -#model.add(Dense(20)) -#model.add(Activation('relu')) -#model.add(Dropout(0.1)) -#model.add(Dense(CLASSES)) -#model.add(Activation('softmax')) + model.add(Dense(gParameters["classes"])) + + model.add(Activation(gParameters["out_act"])) + + # Reference case + # model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1))) + # model.add(Activation('relu')) + # model.add(MaxPooling1D(pool_size=1)) + # model.add(Conv1D(filters=128, kernel_size=10, strides=1, padding='valid')) + # model.add(Activation('relu')) + # model.add(MaxPooling1D(pool_size=10)) + # model.add(Flatten()) + # model.add(Dense(200)) + # model.add(Activation('relu')) + # 
model.add(Dropout(0.1)) + # model.add(Dense(20)) + # model.add(Activation('relu')) + # model.add(Dropout(0.1)) + # model.add(Dense(CLASSES)) + # model.add(Activation('softmax')) model.summary() # ["adam", "rmsprop"] - lr = gParameters['lr'] - if gParameters['optimizer'] == 'adam': + lr = gParameters["lr"] + if gParameters["optimizer"] == "adam": optimizer = Adam(lr=lr) - elif gParameters['optimizer'] == 'rmsprop': + elif gParameters["optimizer"] == "rmsprop": optimizer = RMSprop(lr=lr) - model.compile(loss=gParameters['loss'], - optimizer=optimizer, - metrics=[gParameters['metrics']]) + model.compile(loss=gParameters["loss"], + optimizer=optimizer, + metrics=[gParameters["metrics"]]) - output_dir = gParameters['save'] + output_dir = gParameters["save"] if not os.path.exists(output_dir): os.makedirs(output_dir) -# set up a bunch of callbacks to do work during model training.. + # set up a bunch of callbacks to do work during model training.. - model_name = gParameters['model_name'] - path = '{}/{}.autosave.model.h5'.format(output_dir, model_name) + model_name = gParameters["model_name"] + path = "{}/{}.autosave.model.h5".format(output_dir, model_name) # checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) # csv_logger = CSVLogger('{}/training.log'.format(output_dir)) # reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) candleRemoteMonitor = CandleRemoteMonitor(params=gParameters) - #callbacks.append(reduce_lr) - #timeout = 3600 - #timeoutMonitor = TerminateOnTimeOut(timeout) + # callbacks.append(reduce_lr) + # timeout = 3600 + # timeoutMonitor = TerminateOnTimeOut(timeout) callbacks.append(candleRemoteMonitor) - #callbacks.append(timeoutMonitor) - history = model.fit(X_train, Y_train, - batch_size=gParameters['batch_size'], - epochs=gParameters['epochs'], - verbose=0, - validation_data=(X_test, Y_test), - callbacks = callbacks) + # callbacks.append(timeoutMonitor) + history = model.fit( + X_train, + Y_train, + batch_size=gParameters["batch_size"], + epochs=gParameters["epochs"], + verbose=0, + validation_data=(X_test, Y_test), + callbacks=callbacks, + ) score = model.evaluate(X_test, Y_test, verbose=0) - print('Test score:', score[0]) - print('Test accuracy:', score[1]) + print("Test score:", score[0]) + print("Test accuracy:", score[1]) # serialize model to JSON # model_json = model.to_json() @@ -342,14 +392,16 @@ def run(gParameters, callbacks): return history + def main(): gParameters = initialize_parameters() run(gParameters) -if __name__ == '__main__': + +if __name__ == "__main__": main() try: K.clear_session() - except AttributeError: # theano does not have this function + except AttributeError: # theano does not have this function pass diff --git a/workflows/pbt/models/tc1/tc1_runner.py b/workflows/pbt/models/tc1/tc1_runner.py index 3a0b24b0..1e072fd0 100644 --- a/workflows/pbt/models/tc1/tc1_runner.py +++ b/workflows/pbt/models/tc1/tc1_runner.py @@ -1,30 +1,41 @@ # tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) # so we need to create a synthetic argv. 
import sys -if not hasattr(sys, 'argv'): - sys.argv = ['nt3_tc1'] +if not hasattr(sys, "argv"): + sys.argv = ["nt3_tc1"] + +import importlib import json import os + +import log_tools import numpy as np -import importlib import runner_utils -import log_tools logger = None + def import_pkg(framework, model_name): - if framework == 'keras': + if framework == "keras": module_name = "{}_baseline_keras2".format(model_name) pkg = importlib.import_module(module_name) from keras import backend as K - if K.backend() == 'tensorflow' and 'NUM_INTER_THREADS' in os.environ: + + if K.backend() == "tensorflow" and "NUM_INTER_THREADS" in os.environ: import tensorflow as tf - print("Configuring tensorflow with {} inter threads and {} intra threads". - format(os.environ['NUM_INTER_THREADS'], os.environ['NUM_INTRA_THREADS'])) - session_conf = tf.ConfigProto(inter_op_parallelism_threads=int(os.environ['NUM_INTER_THREADS']), - intra_op_parallelism_threads=int(os.environ['NUM_INTRA_THREADS'])) + + print( + "Configuring tensorflow with {} inter threads and {} intra threads" + .format(os.environ["NUM_INTER_THREADS"], + os.environ["NUM_INTRA_THREADS"])) + session_conf = tf.ConfigProto( + inter_op_parallelism_threads=int( + os.environ["NUM_INTER_THREADS"]), + intra_op_parallelism_threads=int( + os.environ["NUM_INTRA_THREADS"]), + ) sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) K.set_session(sess) # elif framework is 'mxnet': @@ -37,21 +48,22 @@ def import_pkg(framework, model_name): raise ValueError("Invalid framework: {}".format(framework)) return pkg + def run(hyper_parameter_map, callbacks): global logger logger = log_tools.get_logger(logger, __name__) - framework = hyper_parameter_map['framework'] - model_name = hyper_parameter_map['model_name'] + framework = hyper_parameter_map["framework"] + model_name = hyper_parameter_map["model_name"] pkg = import_pkg(framework, model_name) runner_utils.format_params(hyper_parameter_map) # params is python dictionary params = pkg.initialize_parameters() - for k,v in hyper_parameter_map.items(): - #if not k in params: + for k, v in hyper_parameter_map.items(): + # if not k in params: # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) params[k] = v @@ -61,12 +73,13 @@ def run(hyper_parameter_map, callbacks): runner_utils.keras_clear_session(framework) # use the last validation_loss as the value to minimize - val_loss = history.history['val_loss'] + val_loss = history.history["val_loss"] result = val_loss[-1] print("result: ", result) return result -if __name__ == '__main__': + +if __name__ == "__main__": logger = log_tools.get_logger(logger, __name__) logger.debug("RUN START") @@ -77,13 +90,14 @@ def run(hyper_parameter_map, callbacks): exp_id = sys.argv[5] run_id = sys.argv[6] benchmark_timeout = int(sys.argv[7]) - hyper_parameter_map = runner_utils.init(param_string, instance_directory, framework, 'save') - hyper_parameter_map['model_name'] = model_name - hyper_parameter_map['experiment_id'] = exp_id - hyper_parameter_map['run_id'] = run_id - hyper_parameter_map['timeout'] = benchmark_timeout + hyper_parameter_map = runner_utils.init(param_string, instance_directory, + framework, "save") + hyper_parameter_map["model_name"] = model_name + hyper_parameter_map["experiment_id"] = exp_id + hyper_parameter_map["run_id"] = run_id + hyper_parameter_map["timeout"] = benchmark_timeout # clear sys.argv so that argparse doesn't object - sys.argv = ['nt3_tc1_runner'] + sys.argv = ["nt3_tc1_runner"] result = run(hyper_parameter_map) 
runner_utils.write_output(result, instance_directory) logger.debug("RUN STOP") diff --git a/workflows/pbt/python/file_test.py b/workflows/pbt/python/file_test.py index d1a8cf09..441cfe87 100644 --- a/workflows/pbt/python/file_test.py +++ b/workflows/pbt/python/file_test.py @@ -1,22 +1,25 @@ -import time, random, sys -from mpi4py import MPI -from pbt_utils import PBTMetaDataStore, PBTClient, Timer +import random +import sys +import time import keras from keras import backend as K +from mpi4py import MPI +from pbt_utils import PBTClient, PBTMetaDataStore, Timer GET = 0 PUT = 1 + def r2(y_true, y_pred): - SS_res = K.sum(K.square(y_true - y_pred)) + SS_res = K.sum(K.square(y_true - y_pred)) SS_tot = K.sum(K.square(y_true - K.mean(y_true))) - return (1 - SS_res/(SS_tot + K.epsilon())) + return 1 - SS_res / (SS_tot + K.epsilon()) def run(comm, worker_comm, model_file): client = PBTClient(comm, 0) - model = keras.models.load_model(model_file, custom_objects={'r2' : r2}) + model = keras.models.load_model(model_file, custom_objects={"r2": r2}) timer = Timer("./timings_{}.csv".format(client.rank)) timer.start() @@ -48,6 +51,7 @@ def run(comm, worker_comm, model_file): timer.close() client.done() + def main(model_file): comm = MPI.COMM_WORLD rank = comm.Get_rank() @@ -60,5 +64,6 @@ def main(model_file): else: run(comm, worker_comm, model_file) -if __name__ == '__main__': + +if __name__ == "__main__": main(sys.argv[1]) diff --git a/workflows/pbt/python/pbt.py b/workflows/pbt/python/pbt.py index 1b0548b8..85116e45 100644 --- a/workflows/pbt/python/pbt.py +++ b/workflows/pbt/python/pbt.py @@ -1,11 +1,13 @@ -from mpi4py import MPI -import time, math, ctypes - +import ctypes +import math +import os.path +import random +import time from collections import deque -import random, os.path import keras import pbt_utils +from mpi4py import MPI try: import cPickle as pkl @@ -18,15 +20,13 @@ from io import BytesIO as IO - class Timer: def __init__(self, fname=None): if fname == None: self.out = None else: - self.out = open(fname, 'w') - + self.out = open(fname, "w") def start(self): self.t = time.time() @@ -45,7 +45,18 @@ def close(self): class MsgType: - LOCKED, UNLOCKED, ACQUIRE_READ_LOCK, RELEASE_READ_LOCK, ACQUIRE_WRITE_LOCK, RELEASE_WRITE_LOCK, GET_DATA, PUT_DATA, LOG, DONE = range(10) + ( + LOCKED, + UNLOCKED, + ACQUIRE_READ_LOCK, + RELEASE_READ_LOCK, + ACQUIRE_WRITE_LOCK, + RELEASE_WRITE_LOCK, + GET_DATA, + PUT_DATA, + LOG, + DONE, + ) = range(10) class Tags: @@ -53,13 +64,12 @@ class Tags: class PBTClient: - """Client of the PBTMetaDataStore, used to request locks, and put and get data - from a PBTMetaDataStore. - """ + """Client of the PBTMetaDataStore, used to request locks, and put and get + data from a PBTMetaDataStore.""" def __init__(self, comm, dest, outdir): - """Initializes the PBT client with a communicator and the destination rank - of the PBTMetaDataStore + """Initializes the PBT client with a communicator and the destination + rank of the PBTMetaDataStore. :param comm: the communicator to use to send / recv messages to the PBTMetaDataStore :param dest: the rank of the PBTMetaDataStore @@ -78,13 +88,13 @@ def acquire_read_lock(self, for_rank): :param for_rank: the rank of the weights file to acquire the lock for. 
""" - msg = {'type' : MsgType.ACQUIRE_READ_LOCK, 'rank' : for_rank} + msg = {"type": MsgType.ACQUIRE_READ_LOCK, "rank": for_rank} status = MPI.Status() - #print("{} requesting read lock: {}".format(self.rank, msg)) + # print("{} requesting read lock: {}".format(self.rank, msg)) self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) # wait for acknowledgement of lock self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) - #print("{} acquired read lock".format(self.rank)) + # print("{} acquired read lock".format(self.rank)) def release_read_lock(self, for_rank): """Releases a previously acquired read lock for the weights file @@ -92,27 +102,28 @@ def release_read_lock(self, for_rank): :param for_rank: the rank of the weights file to release the lock for. """ - msg = {'type' : MsgType.RELEASE_READ_LOCK, 'rank' : for_rank} + msg = {"type": MsgType.RELEASE_READ_LOCK, "rank": for_rank} status = MPI.Status() self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) # wait for acknowledgement of lock release self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) def release_write_lock(self, for_rank): - """Releases the write lock for the specified rank that has been acquired - by the put_data call. + """Releases the write lock for the specified rank that has been + acquired by the put_data call. :param for_rank: the rank of the weights file to release the write lock for. """ - msg = {'type' : MsgType.RELEASE_WRITE_LOCK, 'rank' : for_rank} + msg = {"type": MsgType.RELEASE_WRITE_LOCK, "rank": for_rank} status = MPI.Status() self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) # wait for acknowledgement of lock release self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) def get_data(self, score, lock_weights=True): - """Gets the metadata for a better performing model, assuming there is one. + """Gets the metadata for a better performing model, assuming there is + one. Given a score against which to evaluate model performance, this will return the metadata for a better performing model as dictionary. If there is no better @@ -132,13 +143,17 @@ def get_data(self, score, lock_weights=True): a model's performance ('score'). The dictionary will also contain whatever model hyperparameters client code puts in the datastore. """ - msg = {'type' : MsgType.GET_DATA, 'lock_weights' : lock_weights, 'score': score} + msg = { + "type": MsgType.GET_DATA, + "lock_weights": lock_weights, + "score": score + } self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) status = MPI.Status() result = self.comm.recv(source=self.dest, tag=Tags.SCORE, status=status) if len(result) and lock_weights: - self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) - #print{"{} acquired weights lock".format(self.rank)) + self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) + # print{"{} acquired weights lock".format(self.rank)) return result def put_data(self, data, lock_weights=True): @@ -152,43 +167,46 @@ def put_data(self, data, lock_weights=True): :param lock_weights: if True this method will also acquire the write lock for the weights file associated with the rank in the data dictionary. 
""" - msg = {'type' : MsgType.PUT_DATA, 'data' : data, - 'lock_weights' : lock_weights} + msg = { + "type": MsgType.PUT_DATA, + "data": data, + "lock_weights": lock_weights + } self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) # don't return until the score has actually been put self.comm.recv(source=self.dest, tag=Tags.ACK) status = MPI.Status() if lock_weights: self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) - #print{"{} acquired weights lock".format(self.rank)) + # print{"{} acquired weights lock".format(self.rank)) def log(self, log): - """Logs the specified log message. - """ - msg = {'type': MsgType.LOG, 'log': log} + """Logs the specified log message.""" + msg = {"type": MsgType.LOG, "log": log} self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) def done(self): - """Notifies the PBTMetaDataStore that model associated with this PBTClient - is finished. + """Notifies the PBTMetaDataStore that model associated with this + PBTClient is finished. - No more PBTClient calls should be made after this method is called. + No more PBTClient calls should be made after this method is + called. """ - msg = {'type' : MsgType.DONE} + msg = {"type": MsgType.DONE} self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) - def put(self, data, model): self.put_data(data) - #model.save_weights("{}/weights_{}.h5".format(self.outdir, self.rank)) + # model.save_weights("{}/weights_{}.h5".format(self.outdir, self.rank)) pbt_utils.save_state(model, self.outdir, self.rank) self.release_write_lock(self.rank) def load_state(self, model, data, read_rank): pbt_utils.load_state(model, self.outdir, read_rank) - #model.load_weights("{}/weights_{}.h5".format(self.outdir, read_rank)) + # model.load_weights("{}/weights_{}.h5".format(self.outdir, read_rank)) self.release_read_lock(read_rank) + class DataSpacesPBTClient(PBTClient): def __init__(self, comm, dest, outdir): @@ -220,15 +238,17 @@ def make_comm_arg(self, comm): def put(self, data, model): weights = pkl.dumps(model.get_weights(), pkl.HIGHEST_PROTOCOL) weights_size = len(weights) - data['_weights_size_'] = weights_size + data["_weights_size_"] = weights_size self.put_data(data) - self.lib.pbt_ds_put_weights(self.rank, weights, weights_size, self.mpi_comm_self) + self.lib.pbt_ds_put_weights(self.rank, weights, weights_size, + self.mpi_comm_self) self.release_write_lock(self.rank) def load_weights(self, model, data, read_rank): - weights_size = data['_weights_size_'] + weights_size = data["_weights_size_"] str_weights = ctypes.create_string_buffer(weights_size) - self.lib.pbt_ds_get_weights(read_rank, str_weights, weights_size, self.mpi_comm_self) + self.lib.pbt_ds_get_weights(read_rank, str_weights, weights_size, + self.mpi_comm_self) model.set_weights(pkl.load(IO(str_weights))) self.release_read_lock(read_rank) @@ -239,8 +259,7 @@ def done(self): class DataStoreLock: - """Lock for an individual weights file. 
- """ + """Lock for an individual weights file.""" def __init__(self, comm, source, target): """ @@ -254,12 +273,12 @@ def __init__(self, comm, source, target): self.comm = comm def lock(self): - #print{"Ack for lock '{}' lock from {}".format(self.locked_obj, self.target)) + # print{"Ack for lock '{}' lock from {}".format(self.locked_obj, self.target)) # send the acknowledgement of the lock back to target self.comm.send(MsgType.LOCKED, dest=self.target, tag=Tags.ACK) def unlock(self): - #print{"Ack for unlock '{}' lock from {}".format(self.locked_obj, self.target)) + # print{"Ack for unlock '{}' lock from {}".format(self.locked_obj, self.target)) self.comm.send(MsgType.UNLOCKED, dest=self.target, tag=Tags.ACK) @@ -336,7 +355,7 @@ def __init__(self, comm, outdir, exploiter, log_file, dataspaces=False): for i in range(self.comm.Get_size()): if i != self.rank: self.locks[i] = DataStoreLockManager(self.comm, self.rank) - self.scores[i] = {'score': float('nan')} + self.scores[i] = {"score": float("nan")} self.log_file = log_file self.all_scores = [] self.logs = [] @@ -353,11 +372,11 @@ def write_data(self): f = "{}/output.csv".format(self.outdir) header = self.all_scores[0].keys() if not os.path.isfile(f): - with open(f, 'w') as f_out: + with open(f, "w") as f_out: f_out.write(",".join(header)) f_out.write("\n") - with open(f, 'a') as f_out: + with open(f, "a") as f_out: for item in self.all_scores: for i, h in enumerate(header): if i > 0: @@ -372,7 +391,7 @@ def done(self): self.write_data() def write_logs(self): - with open(self.log_file, 'a') as f_out: + with open(self.log_file, "a") as f_out: for l in self.logs: f_out.write(l) f_out.write("\n") @@ -380,31 +399,31 @@ def write_logs(self): self.logs = [] def acquire_read_lock(self, requesting_rank, key): - #print("{} acquiring read lock for {}".format(requesting_rank, key)) + # print("{} acquiring read lock for {}".format(requesting_rank, key)) lock_manager = self.locks[key] lock_manager.read_lock(requesting_rank) def release_read_lock(self, requesting_rank, key): - #print("{} releasing read lock for {}".format(requesting_rank, key)) + # print("{} releasing read lock for {}".format(requesting_rank, key)) # can get NULL_RANK if score requested but no scores yet lock_manager = self.locks[key] lock_manager.read_unlock(requesting_rank) def acquire_write_lock(self, requesting_rank, key): - #print("{} acquiring write lock for {}".format(requesting_rank, key)) + # print("{} acquiring write lock for {}".format(requesting_rank, key)) lock_manager = self.locks[key] lock_manager.write_lock(requesting_rank) def release_write_lock(self, requesting_rank, key): - #print("{} releasing write lock for {}".format(requesting_rank, key)) + # print("{} releasing write lock for {}".format(requesting_rank, key)) lock_manager = self.locks[key] lock_manager.write_unlock(requesting_rank) def put_data(self, putting_rank, data): """ - :param :data - dictionary of data: val_loss etc. + :param :data - dictionary of data: val_loss etc. 
""" - #print("Putting score {},{}".format(putting_rank, data)) + # print("Putting score {},{}".format(putting_rank, data)) self.all_scores.append(data) live_ranks = self.comm.Get_size() - 1 @@ -414,54 +433,57 @@ def put_data(self, putting_rank, data): self.comm.send(MsgType.PUT_DATA, tag=Tags.ACK, dest=putting_rank) def get_data(self, score): - items = [x for x in self.scores.values() if not math.isnan(x['score'])] + items = [x for x in self.scores.values() if not math.isnan(x["score"])] result = self.exploiter(items, score) return result def run(self): t = time.localtime() start_time = time.time() - self.logs.append("PBT Start: {}".format(time.strftime('%Y-%m-%d %H:%M:%S', t))) + self.logs.append("PBT Start: {}".format( + time.strftime("%Y-%m-%d %H:%M:%S", t))) self.write_logs() - + status = MPI.Status() live_ranks = self.comm.Get_size() - 1 while live_ranks > 0: - msg = self.comm.recv(source=MPI.ANY_SOURCE, tag=Tags.REQUEST, status=status) + msg = self.comm.recv(source=MPI.ANY_SOURCE, + tag=Tags.REQUEST, + status=status) source = status.Get_source() - msg_type = msg['type'] + msg_type = msg["type"] if msg_type == MsgType.ACQUIRE_READ_LOCK: - msg_rank = msg['rank'] + msg_rank = msg["rank"] self.acquire_read_lock(source, msg_rank) elif msg_type == MsgType.RELEASE_READ_LOCK: - msg_rank = msg['rank'] + msg_rank = msg["rank"] self.release_read_lock(source, msg_rank) elif msg_type == MsgType.RELEASE_WRITE_LOCK: - msg_rank = msg['rank'] + msg_rank = msg["rank"] self.release_write_lock(source, msg_rank) elif msg_type == MsgType.PUT_DATA: - data = msg['data'] - lock_weights = msg['lock_weights'] + data = msg["data"] + lock_weights = msg["lock_weights"] self.put_data(source, data) if lock_weights: self.acquire_write_lock(source, source) elif msg_type == MsgType.GET_DATA: - score = msg['score'] + score = msg["score"] result = self.get_data(score) self.comm.send(result, dest=source, tag=Tags.SCORE) - lock_weights = msg['lock_weights'] + lock_weights = msg["lock_weights"] if len(result) and lock_weights: - rank_to_read = result['rank'] + rank_to_read = result["rank"] self.acquire_read_lock(source, rank_to_read) elif msg_type == MsgType.LOG: - log = msg['log'] + log = msg["log"] self.logs.append(log) if len(self.logs) > 20: self.write_logs() @@ -469,24 +491,23 @@ def run(self): elif msg_type == MsgType.DONE: live_ranks -= 1 - t = time.localtime() - self.logs.append("PBT End: {}".format(time.strftime('%Y-%m-%d %H:%M:%S', t))) + self.logs.append("PBT End: {}".format( + time.strftime("%Y-%m-%d %H:%M:%S", t))) self.logs.append("Duration: {}".format(time.time() - start_time)) self.done() print("Done") - + class PBTWorker: - """ PBTCallback uses classes that implement this API to determine - when a model is ready to exploit and explore, to retrieve metadata - and hyperparameters from the model to put in the shared PBTMetaDataStore, - and to perform the model specific exploit and explore update. - """ + """PBTCallback uses classes that implement this API to determine when a + model is ready to exploit and explore, to retrieve metadata and + hyperparameters from the model to put in the shared PBTMetaDataStore, and + to perform the model specific exploit and explore update.""" def ready(self, pbt_client, epoch, model): - """ Returns True if the model is ready for an exploit explore update. + """Returns True if the model is ready for an exploit explore update. :param pbt_client: A PBTClient instance that can be used for logging (i.e. 
pbt_client.log(msg)) @@ -496,10 +517,10 @@ def ready(self, pbt_client, epoch, model): pass def pack_data(self, pbt_client, model, metrics): - """ Packs relevant hyperparameters and selected score metric into a dict to be - passed to the PBTMetaDataStore. A typical implementation will select - one of the metrics (e.g. 'val_loss') from the keras provided metrics - and set that as the 'score' used to determine model peformance. + """Packs relevant hyperparameters and selected score metric into a dict + to be passed to the PBTMetaDataStore. A typical implementation will + select one of the metrics (e.g. 'val_loss') from the keras provided + metrics and set that as the 'score' used to determine model peformance. Any hyperparameters that are updated in an exploit / explore should also be included in the returned dictionary. For example, @@ -521,9 +542,9 @@ def pack_data(self, pbt_client, model, metrics): pass def update(self, epoch, pbt_client, model, data): - """ Updates the specified model by performing an exploit / explore - using the data in data. NOTE that the PBTCallback will load the - new weights into the model. That should NOT be done here. + """Updates the specified model by performing an exploit / explore using + the data in data. NOTE that the PBTCallback will load the new weights + into the model. That should NOT be done here. For example, assuming the pack_data method stores the learing rate as 'lr' and we want to update the specified model's lr to a perturbed @@ -547,6 +568,7 @@ def update(self, epoch, pbt_client, model, data): """ pass + import traceback @@ -570,7 +592,7 @@ class PBTCallback(keras.callbacks.Callback): PUT = 1 def __init__(self, comm, root_rank, outdir, pbt_worker, dataspaces=False): - """ Initializes this PBTCallback. + """Initializes this PBTCallback. 
:param comm: the MPI communicator in which this PBTCallback operates :param root_rank: the rank of the PBTMetaDataStore @@ -579,49 +601,55 @@ def __init__(self, comm, root_rank, outdir, pbt_worker, dataspaces=False): """ if dataspaces: raise ValueError("Dataspaces is not currently supported") - #self.client = DataSpacesPBTClient(comm, root_rank, outdir) + # self.client = DataSpacesPBTClient(comm, root_rank, outdir) else: self.client = PBTClient(comm, root_rank, outdir) self.outdir = outdir - #self.timer = Timer("{}/timings_{}.csv".format(self.outdir, self.client.rank)) + # self.timer = Timer("{}/timings_{}.csv".format(self.outdir, self.client.rank)) self.pbt_worker = pbt_worker def on_batch_end(self, batch, logs): pass def on_epoch_begin(self, epoch, logs): - + t = time.localtime() - self.client.log("Client {} Epoch {} Start: {}".format(self.client.rank, epoch, time.strftime('%Y-%m-%d %H:%M:%S', t))) - + self.client.log("Client {} Epoch {} Start: {}".format( + self.client.rank, epoch, time.strftime("%Y-%m-%d %H:%M:%S", t))) + self.epoch_start = time.time() def on_epoch_end(self, epoch, logs): - metrics = {'epoch': epoch, 'rank': self.client.rank, 'duration' : time.time() - self.epoch_start} - #print("Rank: {}, Epoch: {} end".format(self.client.rank, epoch)) + metrics = { + "epoch": epoch, + "rank": self.client.rank, + "duration": time.time() - self.epoch_start, + } + # print("Rank: {}, Epoch: {} end".format(self.client.rank, epoch)) metrics.update(logs) data = self.pbt_worker.pack_data(self.client, self.model, metrics) self.client.put(data, self.model) - #self.timer.end(PBTCallback.PUT) + # self.timer.end(PBTCallback.PUT) if self.pbt_worker.ready(self.client, self.model, epoch): - result = self.client.get_data(data['score']) + result = self.client.get_data(data["score"]) if len(result): - print("{},{} is ready - updating".format(epoch, self.client.rank)) - rank_to_read = result['rank'] + print("{},{} is ready - updating".format( + epoch, self.client.rank)) + rank_to_read = result["rank"] self.client.load_state(self.model, result, rank_to_read) # update after loading state as loading the state will set the state # of the optimizer etc. self.pbt_worker.update(epoch, self.client, self.model, result) print("{},{} updated".format(epoch, self.client.rank)) - #print("{} loading weights from {}".format(self.client.rank, rank)) - - #else: - # print("{},{} is ready - no update".format(epoch, self.client.rank)) - + # print("{} loading weights from {}".format(self.client.rank, rank)) + + # else: + # print("{},{} is ready - no update".format(epoch, self.client.rank)) def on_train_end(self, logs={}): t = time.localtime() - self.client.log("Client {} End: {}".format(self.client.rank, time.strftime('%Y-%m-%d %H:%M:%S', t))) + self.client.log("Client {} End: {}".format( + self.client.rank, time.strftime("%Y-%m-%d %H:%M:%S", t))) self.client.done() diff --git a/workflows/pbt/python/pbt_utils.py b/workflows/pbt/python/pbt_utils.py index be9965ab..9b4e703d 100644 --- a/workflows/pbt/python/pbt_utils.py +++ b/workflows/pbt/python/pbt_utils.py @@ -1,10 +1,12 @@ -from keras.engine import topology +import json + +import h5py +import keras.backend as K +import numpy as np from keras import optimizers +from keras.engine import topology from keras.models import Sequential -import keras.backend as K -import h5py, json -import numpy as np def get_json_type(obj): """Serialize any object to a JSON-serializable structure. 
@@ -20,9 +22,11 @@ def get_json_type(obj): """ # if obj is a serializable Keras class instance # e.g. optimizer, layer - if hasattr(obj, 'get_config'): - return {'class_name': obj.__class__.__name__, - 'config': obj.get_config()} + if hasattr(obj, "get_config"): + return { + "class_name": obj.__class__.__name__, + "config": obj.get_config() + } # if obj is any numpy type if type(obj).__module__ == np.__name__: @@ -36,142 +40,148 @@ def get_json_type(obj): if type(obj).__name__ == type.__name__: return obj.__name__ - raise TypeError('Not JSON Serializable:', obj) + raise TypeError("Not JSON Serializable:", obj) + def convert_custom_objects(obj, custom_objects={}): - """Handles custom object lookup. - - # Arguments - obj: object, dict, or list. - - # Returns - The same structure, where occurences - of a custom object name have been replaced - with the custom object. - """ - if isinstance(obj, list): - deserialized = [] - for value in obj: - if value in custom_objects: - deserialized.append(custom_objects[value]) - else: - deserialized.append(value) - return deserialized - if isinstance(obj, dict): - deserialized = {} - for key, value in obj.items(): - if value in custom_objects: - deserialized[key] = custom_objects[value] - else: - deserialized[key] = value - return deserialized - if obj in custom_objects: - return custom_objects[obj] - return obj + """Handles custom object lookup. + + # Arguments + obj: object, dict, or list. + + # Returns + The same structure, where occurences + of a custom object name have been replaced + with the custom object. + """ + if isinstance(obj, list): + deserialized = [] + for value in obj: + if value in custom_objects: + deserialized.append(custom_objects[value]) + else: + deserialized.append(value) + return deserialized + if isinstance(obj, dict): + deserialized = {} + for key, value in obj.items(): + if value in custom_objects: + deserialized[key] = custom_objects[value] + else: + deserialized[key] = value + return deserialized + if obj in custom_objects: + return custom_objects[obj] + return obj + def save_optimizer(model, hdf_file): # from save_model in keras.models.py - hdf_file.attrs['training_config'] = json.dumps({ - 'optimizer_config': { - 'class_name': model.optimizer.__class__.__name__, - 'config': model.optimizer.get_config() - }, - 'loss': model.loss, - 'metrics': model.metrics, - 'sample_weight_mode': model.sample_weight_mode, - 'loss_weights': model.loss_weights, - }, default=get_json_type).encode('utf8') + hdf_file.attrs["training_config"] = json.dumps( + { + "optimizer_config": { + "class_name": model.optimizer.__class__.__name__, + "config": model.optimizer.get_config(), + }, + "loss": model.loss, + "metrics": model.metrics, + "sample_weight_mode": model.sample_weight_mode, + "loss_weights": model.loss_weights, + }, + default=get_json_type, + ).encode("utf8") # Save optimizer weights. 
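    # (Saving the optimizer state -- e.g. Adam's moment estimates -- alongside
    # the model weights lets a worker that copies another model's state resume
    # training without restarting from a freshly reset optimizer.)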
- symbolic_weights = getattr(model.optimizer, 'weights') + symbolic_weights = getattr(model.optimizer, "weights") if symbolic_weights: - optimizer_weights_group = hdf_file.create_group('optimizer_weights') + optimizer_weights_group = hdf_file.create_group("optimizer_weights") weight_values = K.batch_get_value(symbolic_weights) weight_names = [] for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)): # Default values of symbolic_weights is /variable for theano - if K.backend() == 'theano': - if hasattr(w, 'name') and w.name != "/variable": + if K.backend() == "theano": + if hasattr(w, "name") and w.name != "/variable": name = str(w.name) else: - name = 'param_' + str(i) + name = "param_" + str(i) else: - if hasattr(w, 'name') and w.name: + if hasattr(w, "name") and w.name: name = str(w.name) else: - name = 'param_' + str(i) - weight_names.append(name.encode('utf8')) + name = "param_" + str(i) + weight_names.append(name.encode("utf8")) - optimizer_weights_group.attrs['weight_names'] = weight_names + optimizer_weights_group.attrs["weight_names"] = weight_names for name, val in zip(weight_names, weight_values): - param_dset = optimizer_weights_group.create_dataset( - name, - val.shape, - dtype=val.dtype) + param_dset = optimizer_weights_group.create_dataset(name, + val.shape, + dtype=val.dtype) if not val.shape: # scalar param_dset[()] = val else: param_dset[:] = val + def load_optimizer(model, hdf_file): - # instantiate optimizer - training_config = hdf_file.attrs.get('training_config') + # instantiate optimizer + training_config = hdf_file.attrs.get("training_config") - training_config = json.loads(training_config.decode('utf-8')) - optimizer_config = training_config['optimizer_config'] - optimizer = optimizers.deserialize(optimizer_config, - custom_objects={}) + training_config = json.loads(training_config.decode("utf-8")) + optimizer_config = training_config["optimizer_config"] + optimizer = optimizers.deserialize(optimizer_config, custom_objects={}) # Recover loss functions and metrics. - loss = convert_custom_objects(training_config['loss']) - metrics = convert_custom_objects(training_config['metrics']) - sample_weight_mode = training_config['sample_weight_mode'] - loss_weights = training_config['loss_weights'] + loss = convert_custom_objects(training_config["loss"]) + metrics = convert_custom_objects(training_config["metrics"]) + sample_weight_mode = training_config["sample_weight_mode"] + loss_weights = training_config["loss_weights"] # Compile model. - model.compile(optimizer=optimizer, - loss=loss, - metrics=metrics, - loss_weights=loss_weights, - sample_weight_mode=sample_weight_mode) + model.compile( + optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + sample_weight_mode=sample_weight_mode, + ) # Set optimizer weights. - if 'optimizer_weights' in hdf_file: + if "optimizer_weights" in hdf_file: # Build train function (to get weight updates). 
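        # (Keras creates the optimizer's weight variables only when the train
        # function is built, so it must be built here before set_weights below
        # can restore them.)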
if isinstance(model, Sequential): model.model._make_train_function() else: model._make_train_function() - optimizer_weights_group = hdf_file['optimizer_weights'] - optimizer_weight_names = [n.decode('utf8') for n in optimizer_weights_group.attrs['weight_names']] - optimizer_weight_values = [optimizer_weights_group[n] for n in optimizer_weight_names] + optimizer_weights_group = hdf_file["optimizer_weights"] + optimizer_weight_names = [ + n.decode("utf8") + for n in optimizer_weights_group.attrs["weight_names"] + ] + optimizer_weight_values = [ + optimizer_weights_group[n] for n in optimizer_weight_names + ] model.optimizer.set_weights(optimizer_weight_values) def save_state(model, outdir, rank): fname = "{}/weights_opt_{}.h5".format(outdir, rank) - with h5py.File(fname, 'w') as f: - model_weights_group = f.create_group('model_weights') + with h5py.File(fname, "w") as f: + model_weights_group = f.create_group("model_weights") topology.save_weights_to_hdf5_group(model_weights_group, model.layers) save_optimizer(model, f) f.flush() + def load_state(model, outdir, rank): fname = "{}/weights_opt_{}.h5".format(outdir, rank) - + # keras.engine.network.py, l. 1124+ - with h5py.File(fname, 'r') as f: - f = h5py.File(fname, mode='r') + with h5py.File(fname, "r") as f: + f = h5py.File(fname, mode="r") weights = f - if 'layer_names' not in f.attrs and 'model_weights' in f: - weights = f['model_weights'] - + if "layer_names" not in f.attrs and "model_weights" in f: + weights = f["model_weights"] + topology.load_weights_from_hdf5_group(weights, model.layers) load_optimizer(model, f) - - - - - - diff --git a/workflows/pbt/python/tc1_pbt.py b/workflows/pbt/python/tc1_pbt.py index 65f9c6f8..a7c56cb3 100644 --- a/workflows/pbt/python/tc1_pbt.py +++ b/workflows/pbt/python/tc1_pbt.py @@ -1,15 +1,18 @@ +import importlib +import math +import os +import random import sys -import importlib, time -from mpi4py import MPI -import os, random, math +import time import ga_utils import pbt - from keras import backend as K +from mpi4py import MPI class TC1PBTWorker: + def __init__(self, rank): self.rank = rank @@ -22,15 +25,15 @@ def ready(self, pbt_client, model, epoch): # return ready def pack_data(self, pbt_client, model, metrics): - """ Packs relevant hyperparameters and selected score metric into a dict to be - passed to the datastore. + """Packs relevant hyperparameters and selected score metric into a dict + to be passed to the datastore. 
- :param metrics: the metrics in keras callback log + :param metrics: the metrics in keras callback log """ lr = float(K.get_value(model.optimizer.lr)) - data = {'lr': lr, 'score': metrics['val_loss']} + data = {"lr": lr, "score": metrics["val_loss"]} data.update(metrics) - #pbt_client.log("{}: putting data".format(self.rank)) + # pbt_client.log("{}: putting data".format(self.rank)) return data def update(self, epoch, pbt_client, model, data): @@ -38,43 +41,44 @@ def update(self, epoch, pbt_client, model, data): # 'score': 0.36156702836354576, 'lr': 0.0010000000474974513, 'val_acc': 0.87870370237915607, # 'val_loss': 0.36156702836354576} # current_lr = float(K.get_value(model.optimizer.lr)) - lr = data['lr'] + lr = data["lr"] draw = random.random() - if draw < .5: + if draw < 0.5: lr = lr * 0.8 else: lr = lr * 1.2 K.set_value(model.optimizer.lr, lr) - #pbt_client.log("{},{},{},{},{}".format(self.rank, epoch, data['rank'], current_lr, lr)) - #pbt_client.log("{}: updating from rank {}, lr from {} to {}".format(self.rank, data['rank'], old_lr, lr)) + # pbt_client.log("{},{},{},{},{}".format(self.rank, epoch, data['rank'], current_lr, lr)) + # pbt_client.log("{}: updating from rank {}, lr from {} to {}".format(self.rank, data['rank'], old_lr, lr)) def truncation_select(data, score): """ - :param data: list of dict containg each ranks' model data as well as - rank itself. - :return a dict that contains all the selected rank's model data, or an - empty dict if no selection + :param data: list of dict containg each ranks' model data as well as + rank itself. + :return a dict that contains all the selected rank's model data, or an + empty dict if no selection """ # e.g. data: [{'acc': 0.87916666666666665, 'loss': 0.38366817765765721, 'rank': 1, # 'score': 0.36156702836354576, 'lr': 0.0010000000474974513, 'val_acc': 0.87870370237915607, # 'val_loss': 0.36156702836354576}, ...] 
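    # For example, with 11 entries sorted by ascending 'score' (score is the
    # val_loss, so lower is better), quintile = int(round(11 / 5.0)) = 2: a
    # caller whose score is at least the second-highest stored score is in the
    # bottom ~20% and gets back a randomly chosen entry from the two best;
    # every other caller gets an empty dict and keeps its current weights.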
- items = sorted(data, key=lambda item: item['score']) + items = sorted(data, key=lambda item: item["score"]) size = len(items) quintile = int(round(size / 5.0)) - if quintile > 0 and score >= items[-quintile]['score']: + if quintile > 0 and score >= items[-quintile]["score"]: # in bottom 20%, so select from top 20% if quintile == 1: idx = 0 else: idx = random.randint(0, quintile - 1) - #print("Returning: {}".format(items[idx])) + # print("Returning: {}".format(items[idx])) return items[idx] else: - #print("Returning nothing") + # print("Returning nothing") return {} + def init_params(params_file, comm): param_factories = ga_utils.create_parameters(params_file, True) params = [{}] @@ -86,6 +90,7 @@ def init_params(params_file, comm): return params + def run_model(comm, rank, hyper_parameter_map, args): exp_dir = args[2] instance_dir = "{}/run_{}/".format(exp_dir, rank) @@ -94,12 +99,12 @@ def run_model(comm, rank, hyper_parameter_map, args): model_name = args[3] - hyper_parameter_map['framework'] = 'keras' - hyper_parameter_map['save'] = '{}/output'.format(instance_dir) - hyper_parameter_map['instance_directory'] = instance_dir - hyper_parameter_map['model_name'] = model_name - hyper_parameter_map['experiment_id'] = args[4] - hyper_parameter_map['run_id'] = rank + hyper_parameter_map["framework"] = "keras" + hyper_parameter_map["save"] = "{}/output".format(instance_dir) + hyper_parameter_map["instance_directory"] = instance_dir + hyper_parameter_map["model_name"] = model_name + hyper_parameter_map["experiment_id"] = args[4] + hyper_parameter_map["run_id"] = rank runner = "{}_runner".format(model_name) sys.argv = [runner] @@ -108,7 +113,8 @@ def run_model(comm, rank, hyper_parameter_map, args): pbt_callback = pbt.PBTCallback(comm, 0, weights_dir, TC1PBTWorker(rank)) t = time.localtime() - pbt_callback.client.log("Client {} Start: {}".format(rank, time.strftime('%Y-%m-%d %H:%M:%S', t))) + pbt_callback.client.log("Client {} Start: {}".format( + rank, time.strftime("%Y-%m-%d %H:%M:%S", t))) try: pkg.run(hyper_parameter_map, [pbt_callback]) except: @@ -124,6 +130,7 @@ def init_dirs(outdir): if not os.path.exists(weights_dir): os.makedirs(weights_dir) + def main(args): comm = MPI.COMM_WORLD rank = comm.Get_rank() @@ -140,14 +147,15 @@ def main(args): init_dirs(outdir) comm.scatter(params, root=0) log_file = "{}/log.txt".format(outdir) - root = pbt.PBTMetaDataStore(comm, outdir, truncation_select, log_file) + root = pbt.PBTMetaDataStore(comm, outdir, truncation_select, + log_file) root.run() else: params = comm.scatter(None, root=0) if len(params) > 0: run_model(comm, rank, params, args) - #print("{}: {}".format(rank, params)) + # print("{}: {}".format(rank, params)) -if __name__ == '__main__': +if __name__ == "__main__": main(sys.argv) diff --git a/workflows/pbt/python/tc1_pbt_ds.py b/workflows/pbt/python/tc1_pbt_ds.py index 9e975938..9d6bd3be 100644 --- a/workflows/pbt/python/tc1_pbt_ds.py +++ b/workflows/pbt/python/tc1_pbt_ds.py @@ -1,14 +1,17 @@ -import sys import importlib -from mpi4py import MPI -import os, random, math +import math +import os +import random +import sys import ga_utils import pbt - from keras import backend as K +from mpi4py import MPI + class TC1PBTWorker: + def __init__(self, rank): self.rank = rank @@ -21,15 +24,15 @@ def ready(self, pbt_client, model, epoch): # return ready def pack_data(self, pbt_client, model, metrics): - """ Packs relevant hyperparameters and selected score metric into a dict to be - passed to the datastore. 
+ """Packs relevant hyperparameters and selected score metric into a dict + to be passed to the datastore. - :param metrics: the metrics in keras callback log + :param metrics: the metrics in keras callback log """ lr = float(K.get_value(model.optimizer.lr)) - data = {'lr': lr, 'score': metrics['val_loss']} + data = {"lr": lr, "score": metrics["val_loss"]} data.update(metrics) - #pbt_client.log("{}: putting data".format(self.rank)) + # pbt_client.log("{}: putting data".format(self.rank)) return data def update(self, epoch, pbt_client, model, data): @@ -37,47 +40,48 @@ def update(self, epoch, pbt_client, model, data): # 'score': 0.36156702836354576, 'lr': 0.0010000000474974513, 'val_acc': 0.87870370237915607, # 'val_loss': 0.36156702836354576} current_lr = float(K.get_value(model.optimizer.lr)) - lr = data['lr'] + lr = data["lr"] draw = random.random() - if draw < .5: + if draw < 0.5: lr = lr * 0.8 else: lr = lr * 1.2 K.set_value(model.optimizer.lr, lr) - pbt_client.log("{},{},{},{},{}".format(self.rank, epoch, data['rank'], current_lr, lr)) - #pbt_client.log("{}: updating from rank {}, lr from {} to {}".format(self.rank, data['rank'], old_lr, lr)) + pbt_client.log("{},{},{},{},{}".format(self.rank, epoch, data["rank"], + current_lr, lr)) + # pbt_client.log("{}: updating from rank {}, lr from {} to {}".format(self.rank, data['rank'], old_lr, lr)) def truncation_select(data, score): """ - :param data: list of dict containg each ranks' model data as well as - rank itself. - :return a dict that contains all the selected rank's model data, or an - empty dict if no selection + :param data: list of dict containg each ranks' model data as well as + rank itself. + :return a dict that contains all the selected rank's model data, or an + empty dict if no selection """ # e.g. data: [{'acc': 0.87916666666666665, 'loss': 0.38366817765765721, 'rank': 1, # 'score': 0.36156702836354576, 'lr': 0.0010000000474974513, 'val_acc': 0.87870370237915607, # 'val_loss': 0.36156702836354576}, ...] - items = sorted(data, key=lambda item: item['score']) + items = sorted(data, key=lambda item: item["score"]) size = len(items) quintile = int(round(size / 5.0)) - if score >= items[-quintile]['score']: + if score >= items[-quintile]["score"]: # in bottom 20%, so select from top 20% idx = random.randint(0, quintile - 1) - #print("Returning: {}".format(items[idx])) + # print("Returning: {}".format(items[idx])) return items[idx] else: - #print("Returning nothing") + # print("Returning nothing") return {} + def random_select(data, score): - """ - Useful for testing to force a weight load. 
- """ + """Useful for testing to force a weight load.""" idx = random.randint(0, len(data) - 1) return data[idx] + def init_params(params_file, comm): param_factories = ga_utils.create_parameters(params_file, True) params = [{}] @@ -89,6 +93,7 @@ def init_params(params_file, comm): return params + def run_model(comm, rank, hyper_parameter_map, args): exp_dir = args[2] @@ -98,21 +103,25 @@ def run_model(comm, rank, hyper_parameter_map, args): model_name = args[3] - hyper_parameter_map['framework'] = 'keras' - hyper_parameter_map['save'] = '{}/output'.format(instance_dir) - hyper_parameter_map['instance_directory'] = instance_dir - hyper_parameter_map['model_name'] = model_name - hyper_parameter_map['experiment_id'] = args[4] - hyper_parameter_map['run_id'] = rank + hyper_parameter_map["framework"] = "keras" + hyper_parameter_map["save"] = "{}/output".format(instance_dir) + hyper_parameter_map["instance_directory"] = instance_dir + hyper_parameter_map["model_name"] = model_name + hyper_parameter_map["experiment_id"] = args[4] + hyper_parameter_map["run_id"] = rank runner = "{}_runner".format(model_name) sys.argv = [runner] pkg = importlib.import_module(runner) weights_dir = "{}/weights".format(exp_dir) - pbt_callback = pbt.PBTCallback(comm, 0, weights_dir, TC1PBTWorker(rank), - dataspaces=True) + pbt_callback = pbt.PBTCallback(comm, + 0, + weights_dir, + TC1PBTWorker(rank), + dataspaces=True) pkg.run(hyper_parameter_map, [pbt_callback]) + def init_dirs(outdir): if not os.path.exists(outdir): os.makedirs(outdir) @@ -121,6 +130,7 @@ def init_dirs(outdir): if not os.path.exists(weights_dir): os.makedirs(weights_dir) + def main(args): comm = MPI.COMM_WORLD rank = comm.Get_rank() @@ -131,15 +141,18 @@ def main(args): init_dirs(outdir) comm.scatter(params, root=0) log_file = "{}/log.txt".format(outdir) - root = pbt.PBTMetaDataStore(comm, outdir, random_select, log_file, - dataspaces=True) + root = pbt.PBTMetaDataStore(comm, + outdir, + random_select, + log_file, + dataspaces=True) root.run() else: params = comm.scatter(None, root=0) run_model(comm, rank, params, args) - #print("{}: {}".format(rank, params)) + # print("{}: {}".format(rank, params)) -if __name__ == '__main__': +if __name__ == "__main__": main(sys.argv) diff --git a/workflows/pbt/python/test/pbt_tests.py b/workflows/pbt/python/test/pbt_tests.py index 79ef4291..68d512c4 100644 --- a/workflows/pbt/python/test/pbt_tests.py +++ b/workflows/pbt/python/test/pbt_tests.py @@ -1,47 +1,50 @@ from __future__ import print_function + import unittest -import tc1_pbt -import pbt_utils -import numpy as np -import keras -from keras.optimizers import Adam +import keras +import numpy as np +import pbt_utils +import tc1_pbt from keras import backend as K +from keras.optimizers import Adam + class TestPBT(unittest.TestCase): def testTruncate(self): data = [] for i in range(0, 11): - data.append({'score': 11 - i, 'rank': i}) + data.append({"score": 11 - i, "rank": i}) - #print(data) + # print(data) for i in range(0, 10): result = tc1_pbt.truncation_select(data, i) self.assertEqual(0, len(result)) for i in range(10, 12): - result = tc1_pbt.truncation_select(data, i) + result = tc1_pbt.truncation_select(data, i) self.assertTrue(len(result) > 0) - score = result['score'] - rank = result['rank'] + score = result["score"] + rank = result["rank"] self.assertTrue(rank == 9 or rank == 10) self.assertTrue(score < 3) + class TestIO(unittest.TestCase): def create_model(self, lr): X, y = np.random.rand(100, 50), np.random.randint(2, size=100) x = 
keras.layers.Input((50,)) - out = keras.layers.Dense(1, activation='sigmoid')(x) + out = keras.layers.Dense(1, activation="sigmoid")(x) model = keras.models.Model(x, out) optimizer = Adam(lr=lr) - model.compile(optimizer=optimizer, loss='binary_crossentropy') + model.compile(optimizer=optimizer, loss="binary_crossentropy") model.fit(X, y, epochs=5) return model def testIO(self): - model = self.create_model(.0001) + model = self.create_model(0.0001) lr = float(K.get_value(model.optimizer.lr)) self.assertAlmostEqual(0.0001, lr) weights = model.get_weights() @@ -59,5 +62,5 @@ def testIO(self): self.assertTrue(np.array_equal(weights[0], model.get_weights()[0])) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/workflows/pbt/scripts/common.m4 b/workflows/pbt/scripts/common.m4 index 040627e0..a3dc922d 100644 --- a/workflows/pbt/scripts/common.m4 +++ b/workflows/pbt/scripts/common.m4 @@ -4,4 +4,4 @@ divert(`-1') changecom(`dnl') define(`getenv', `esyscmd(printf -- "$`$1' ")') define(`getenv_nospace', `esyscmd(printf -- "$`$1'")') -divert \ No newline at end of file +divert diff --git a/workflows/pbt/scripts/local.cfg b/workflows/pbt/scripts/local.cfg index 174ee761..b8694d9a 100644 --- a/workflows/pbt/scripts/local.cfg +++ b/workflows/pbt/scripts/local.cfg @@ -1,3 +1,3 @@ export PROCS=6 export PPN=1 -export EXP_DIR=../experiments/$EXP_ID \ No newline at end of file +export EXP_DIR=../experiments/$EXP_ID diff --git a/workflows/pbt/scripts/local_submit.cfg b/workflows/pbt/scripts/local_submit.cfg index 22dd29df..dac8dfa2 100644 --- a/workflows/pbt/scripts/local_submit.cfg +++ b/workflows/pbt/scripts/local_submit.cfg @@ -7,4 +7,4 @@ PYTHONPATH+=":$ROOT/models/tc1" export PYTHONPATH=$PYTHONPATH -CMD="mpirun -n $PROCS python -u $PBT_PY $PARAMS_FILE $EXP_DIR tc1 $EXP_ID" \ No newline at end of file +CMD="mpirun -n $PROCS python -u $PBT_PY $PARAMS_FILE $EXP_DIR tc1 $EXP_ID" diff --git a/workflows/pbt/scripts/local_submit.m4 b/workflows/pbt/scripts/local_submit.m4 index e1b26d28..c76e0e47 100644 --- a/workflows/pbt/scripts/local_submit.m4 +++ b/workflows/pbt/scripts/local_submit.m4 @@ -1 +1 @@ -# EMPTY \ No newline at end of file +# EMPTY diff --git a/workflows/pbt/scripts/pbt_run.sh b/workflows/pbt/scripts/pbt_run.sh index ef53d3bf..343d09eb 100755 --- a/workflows/pbt/scripts/pbt_run.sh +++ b/workflows/pbt/scripts/pbt_run.sh @@ -50,4 +50,3 @@ source "$SITE"_submit.cfg #echo $CMD $CMD - diff --git a/workflows/pbt/scripts/theta.cfg b/workflows/pbt/scripts/theta.cfg index 0864aa97..aac2a607 100644 --- a/workflows/pbt/scripts/theta.cfg +++ b/workflows/pbt/scripts/theta.cfg @@ -2,4 +2,4 @@ export PROCS=128 export PPN=1 export WALLTIME=01:00:00 export PROJECT=CSC249ADOA01 -export QUEUE=default \ No newline at end of file +export QUEUE=default diff --git a/workflows/pbt/scripts/theta_submit.cfg b/workflows/pbt/scripts/theta_submit.cfg index 4598b840..8583aaea 100644 --- a/workflows/pbt/scripts/theta_submit.cfg +++ b/workflows/pbt/scripts/theta_submit.cfg @@ -6,4 +6,4 @@ PP=${PP//:/\\:} EXPORTS="ROOT=$ROOT:PBT_PY=$PBT_PY:BENCHMARKS=$BENCHMARKS:PP=$PP" EXPORTS+=":SUPERVISOR=$SUPERVISOR:EXP_ID=$EXP_ID:PARAMS_FILE=$P_FILE:EXP_DIR=$EXP_DIR" -CMD="qsub --env $EXPORTS --jobname=$EXP_ID --mode script $SH" \ No newline at end of file +CMD="qsub --env $EXPORTS --jobname=$EXP_ID --mode script $SH" diff --git a/workflows/pbt/scripts/theta_submit.m4 b/workflows/pbt/scripts/theta_submit.m4 index 9c5fe1d6..a65856ae 100644 --- a/workflows/pbt/scripts/theta_submit.m4 +++ 
b/workflows/pbt/scripts/theta_submit.m4 @@ -4,7 +4,7 @@ ifelse(getenv_nospace(PROJECT), `',,#COBALT -A getenv_nospace(PROJECT) )#COBALT -n getenv(NODES) #COBALT -t getenv(WALLTIME) #COBALT -o getenv_nospace(EXP_DIR)/output.txt -#COBALT -e getenv_nospace(EXP_DIR)/output.txt +#COBALT -e getenv_nospace(EXP_DIR)/output.txt #COBALT --cwd getenv(EXP_DIR) export PYTHONPATH=$PP:$PYTHONPATH diff --git a/workflows/pbt/scripts/titan.cfg b/workflows/pbt/scripts/titan.cfg index f0d42843..902c8f97 100644 --- a/workflows/pbt/scripts/titan.cfg +++ b/workflows/pbt/scripts/titan.cfg @@ -4,4 +4,4 @@ export PPN=1 export WALLTIME=00:40:00 export PROJECT=MED106 export QUEUE=batch -export EXP_DIR=$PROJWORK/csc249/ncollier/experiments/$EXP_ID \ No newline at end of file +export EXP_DIR=$PROJWORK/csc249/ncollier/experiments/$EXP_ID diff --git a/workflows/pbt/scripts/titan_submit.cfg b/workflows/pbt/scripts/titan_submit.cfg index 02c082da..9e3cd8f1 100644 --- a/workflows/pbt/scripts/titan_submit.cfg +++ b/workflows/pbt/scripts/titan_submit.cfg @@ -1,4 +1,4 @@ EXPORTS="ROOT=$ROOT,PBT_PY=$PBT_PY,BENCHMARKS=$BENCHMARKS,PP=$PP" EXPORTS+=",SUPERVISOR=$SUPERVISOR,EXP_ID=$EXP_ID,PARAMS_FILE=$P_FILE,EXP_DIR=$EXP_DIR" -export CMD="qsub -v $EXPORTS -d $EXP_DIR -N $EXP_ID $SH" \ No newline at end of file +export CMD="qsub -v $EXPORTS -d $EXP_DIR -N $EXP_ID $SH" diff --git a/workflows/pbt/src/Readme.md b/workflows/pbt/src/Readme.md index 143bad42..4179658c 100644 --- a/workflows/pbt/src/Readme.md +++ b/workflows/pbt/src/Readme.md @@ -1,4 +1,4 @@ -# PBT src folder # +# PBT src folder This folder contains experimental c source code for using Dataspaces (http://personal.cac.rutgers.edu/TASSL/projects/data/index.html) as the PBT datastore, i.e. where the weights and model metadata are stored for querying by other models. diff --git a/workflows/random/README.md b/workflows/random/README.md index ccc8411c..b79fb7b0 100644 --- a/workflows/random/README.md +++ b/workflows/random/README.md @@ -1,5 +1,7 @@ # Simple parameter sweep with Swift -> parameters randomly chosen between specified bounds. + The main program (random-sweep.swift) calls a few app functions as follows: + - determineParameters.{sh,py}: Read data/ **settings.json** for sweep parameters, and return as a string for use by Swift program - evaluateOne.{sh,py}: Runs a single experiment. (Calls p1b1_runner). - computeStats.{sh,py}: Ingests data from all of the experiments and computes simple stats. @@ -7,19 +9,22 @@ The main program (random-sweep.swift) calls a few app functions as follows: Usage: ./run experient_1 Notes: -- **settings.json**: -A. parameters (benchmark parameters) -===================================== -1: epochs + +- **settings.json**: + A. parameters (benchmark parameters) + ===================================== + 1: epochs + 2. batch_size 3. N1 4. NE -B. samples (specifies the number of random samples to prepare) -=============================================================== +# B. samples (specifies the number of random samples to prepare) + 1. num For adding new parameters: + 1. Add to the json file the desired parameters 2. Read params in determineParameters.py: def loadSettings(settingsFilename): -3. Modify the evaluateOne.py file (set to run on keras framework now) \ No newline at end of file +3. 
Modify the evaluateOne.py file (set to run on keras framework now) diff --git a/workflows/random/data/settings.json b/workflows/random/data/settings.json index c0561733..4aca314d 100644 --- a/workflows/random/data/settings.json +++ b/workflows/random/data/settings.json @@ -1,15 +1,13 @@ { - "parameters": - { - "epochs": [4, 8 ], - "batch_size": [30, 40], - "N1": [1500, 1500], - "NE": [600, 600], - "latent_dim": [2, 64], - "learning_rate": [0.00001, 0.1] - }, - "samples": - { - "num": [120] - } + "parameters": { + "epochs": [4, 8], + "batch_size": [30, 40], + "N1": [1500, 1500], + "NE": [600, 600], + "latent_dim": [2, 64], + "learning_rate": [0.00001, 0.1] + }, + "samples": { + "num": [120] + } } diff --git a/workflows/random/python/computeStats.py b/workflows/random/python/computeStats.py index 69704a31..4e33ee1b 100644 --- a/workflows/random/python/computeStats.py +++ b/workflows/random/python/computeStats.py @@ -1,34 +1,41 @@ +import json +import os import sys from collections import defaultdict -import json, os + def extractVals(A): B = defaultdict(dict) A1 = A.split() for n, val in zip(A1[0::2], A1[1::2]): B[n] = float(val) - return(B) + return B + def computeStats(swiftArrayAsString): A = extractVals(swiftArrayAsString) vals = [] for a in A: vals += [A[a]] - print('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals)))) - - filename = os.environ['TURBINE_OUTPUT']+ "/final_stats.txt" + print("%d values, with min=%f, max=%f, avg=%f\n" % + (len(vals), min(vals), max(vals), sum(vals) / float(len(vals)))) + + filename = os.environ["TURBINE_OUTPUT"] + "/final_stats.txt" # writing the val loss to the output file - with open(filename, 'w') as the_file: - the_file.write('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals)))) + with open(filename, "w") as the_file: + the_file.write( + "%d values, with min=%f, max=%f, avg=%f\n" % + (len(vals), min(vals), max(vals), sum(vals) / float(len(vals)))) + -if (len(sys.argv) < 2): - print('requires arg=dataFilename') - sys.exit(1) +if len(sys.argv) < 2: + print("requires arg=dataFilename") + sys.exit(1) dataFilename = sys.argv[1] try: - with open(dataFilename, 'r') as the_file: + with open(dataFilename, "r") as the_file: data = the_file.read() except IOError as e: print("Could not open: %s" % dataFilename) diff --git a/workflows/random/python/determineParameters.py b/workflows/random/python/determineParameters.py index 0fc2a51c..0acd5daf 100644 --- a/workflows/random/python/determineParameters.py +++ b/workflows/random/python/determineParameters.py @@ -1,8 +1,11 @@ -import sys, json, os +import json +import os +import sys from random import randint, uniform # ===== Definitions ========================================================= + def loadSettings(settingsFilename): print("Reading settings: %s" % settingsFilename) try: @@ -13,51 +16,54 @@ def loadSettings(settingsFilename): print("PWD is: '%s'" % os.getcwd()) sys.exit(1) try: - epochs = settings['parameters']["epochs"] - batch_size = settings['parameters']["batch_size"] - N1 = settings['parameters']["N1"] - NE = settings['parameters']["NE"] - latent_dim = settings['parameters']["latent_dim"] - learning_rate = settings['parameters']["learning_rate"] - + epochs = settings["parameters"]["epochs"] + batch_size = settings["parameters"]["batch_size"] + N1 = settings["parameters"]["N1"] + NE = settings["parameters"]["NE"] + latent_dim = settings["parameters"]["latent_dim"] + learning_rate = 
settings["parameters"]["learning_rate"] except KeyError as e: - print("Settings file (%s) does not contain key: %s" % (settingsFilename, str(e))) + print("Settings file (%s) does not contain key: %s" % + (settingsFilename, str(e))) sys.exit(1) try: - samples = settings['samples']["num"] + samples = settings["samples"]["num"] except KeyError as e: - print("Settings file (%s) does not contain key: %s" % (settingsFilename, str(e))) + print("Settings file (%s) does not contain key: %s" % + (settingsFilename, str(e))) sys.exit(1) - return(epochs, batch_size, N1, NE, latent_dim, learning_rate, samples) + return (epochs, batch_size, N1, NE, latent_dim, learning_rate, samples) + # ===== Main program ======================================================== -if (len(sys.argv) < 3): - print('requires arg1=settingsFilename and arg2=paramsFilename') - sys.exit(1) +if len(sys.argv) < 3: + print("requires arg1=settingsFilename and arg2=paramsFilename") + sys.exit(1) settingsFilename = sys.argv[1] -paramsFilename = sys.argv[2] +paramsFilename = sys.argv[2] -print (settingsFilename) -print (paramsFilename) +print(settingsFilename) +print(paramsFilename) -epochs, batch_size, N1, NE, latent_dim, learning_rate, samples = loadSettings(settingsFilename) -result="" +epochs, batch_size, N1, NE, latent_dim, learning_rate, samples = loadSettings( + settingsFilename) +result = "" # select '#samples' random numbers between the range provided in settings.json file for s in range(samples[0]): - t_epoch= randint(epochs[0], epochs[1]) - t_batch_size= randint(batch_size[0], batch_size[1]) - t_N1= randint(N1[0], N1[1]) - t_NE= randint(NE[0], NE[1]) - t_ld= randint(latent_dim[0], latent_dim[1]) - t_lr= uniform(learning_rate[0], learning_rate[1]) - result+=str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_N1) + ',' + str(t_NE) + ',' + str(t_ld)+ ',' + str(t_lr) - if(s < (samples[0]-1)): - result+=":" - -with open(paramsFilename, 'w') as the_file: - the_file.write(result) + t_epoch = randint(epochs[0], epochs[1]) + t_batch_size = randint(batch_size[0], batch_size[1]) + t_N1 = randint(N1[0], N1[1]) + t_NE = randint(NE[0], NE[1]) + t_ld = randint(latent_dim[0], latent_dim[1]) + t_lr = uniform(learning_rate[0], learning_rate[1]) + result += (str(t_epoch) + "," + str(t_batch_size) + "," + str(t_N1) + "," + + str(t_NE) + "," + str(t_ld) + "," + str(t_lr)) + if s < (samples[0] - 1): + result += ":" +with open(paramsFilename, "w") as the_file: + the_file.write(result) diff --git a/workflows/random/python/evaluateOne.py b/workflows/random/python/evaluateOne.py index 00910697..3b823eb6 100644 --- a/workflows/random/python/evaluateOne.py +++ b/workflows/random/python/evaluateOne.py @@ -1,48 +1,52 @@ +import json +import os +import socket import sys + import p1b1_runner -import json, os -import socket -if (len(sys.argv) < 3): - print('requires arg1=param and arg2=filename') - sys.exit(1) +if len(sys.argv) < 3: + print("requires arg1=param and arg2=filename") + sys.exit(1) parameterString = sys.argv[1] -filename = sys.argv[2] +filename = sys.argv[2] # print (parameterString) -print ("filename is " + filename) -print (socket.gethostname()) - -#List of hyperparameters - edit this to add or remove a parameter -epochs, batch_size, d1, d2, ld, lr = parameterString.split(',') - -hyper_parameter_map = {'epochs' : int(epochs)} -hyper_parameter_map['framework'] = 'keras' -hyper_parameter_map['batch_size'] = int(batch_size) -hyper_parameter_map['dense'] = [int(d1), int(d2)] -hyper_parameter_map['latent_dim'] = int(ld) 
-hyper_parameter_map['learning_rate'] = float(lr) - -hyper_parameter_map['run_id'] = parameterString -# hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] -hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+os.environ['PMI_RANK'] -sys.argv = ['p1b1_runner'] +print("filename is " + filename) +print(socket.gethostname()) + +# List of hyperparameters - edit this to add or remove a parameter +epochs, batch_size, d1, d2, ld, lr = parameterString.split(",") + +hyper_parameter_map = {"epochs": int(epochs)} +hyper_parameter_map["framework"] = "keras" +hyper_parameter_map["batch_size"] = int(batch_size) +hyper_parameter_map["dense"] = [int(d1), int(d2)] +hyper_parameter_map["latent_dim"] = int(ld) +hyper_parameter_map["learning_rate"] = float(lr) + +hyper_parameter_map["run_id"] = parameterString +# hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] +hyper_parameter_map["save"] = (os.environ["TURBINE_OUTPUT"] + "/output-" + + os.environ["PMI_RANK"]) +sys.argv = ["p1b1_runner"] val_loss = p1b1_runner.run(hyper_parameter_map) -print (val_loss) +print(val_loss) -sfn = os.environ['TURBINE_OUTPUT']+ "/output-"+os.environ['PMI_RANK'] + "/procname-" + parameterString -with open(sfn, 'w') as sfile: +sfn = (os.environ["TURBINE_OUTPUT"] + "/output-" + os.environ["PMI_RANK"] + + "/procname-" + parameterString) +with open(sfn, "w") as sfile: sfile.write(socket.getfqdn()) - proc_id = "-"+ str(os.getpid()) + proc_id = "-" + str(os.getpid()) sfile.write(proc_id) # works around this error: # https://github.com/tensorflow/tensorflow/issues/3388 from keras import backend as K + K.clear_session() # writing the val loss to the output file (result-*) -with open(filename, 'w') as the_file: +with open(filename, "w") as the_file: the_file.write(repr(val_loss)) - diff --git a/workflows/random/python/p1b1_runner.py b/workflows/random/python/p1b1_runner.py index 7ceb0c59..ddb43b10 100644 --- a/workflows/random/python/p1b1_runner.py +++ b/workflows/random/python/p1b1_runner.py @@ -1,24 +1,30 @@ # tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) # so we need to create a synthetic argv. 
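# (If sys.argv is missing -- as it can be when this runner is invoked from an
# embedded interpreter such as a Swift/T Python task rather than from the
# command line -- that basename() call would fail at import time, hence the
# placeholder argv assigned below.)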
import sys -if not hasattr(sys, 'argv'): - sys.argv = ['p1b1'] + +if not hasattr(sys, "argv"): + sys.argv = ["p1b1"] import json import os + import p1b1 import runner_utils + def run(hyper_parameter_map): - framework = hyper_parameter_map['framework'] - if framework is 'keras': + framework = hyper_parameter_map["framework"] + if framework is "keras": import p1b1_baseline_keras2 + pkg = p1b1_baseline_keras2 - elif framework is 'mxnet': + elif framework is "mxnet": import p1b1_baseline_mxnet + pkg = p1b1_baseline_mxnet - elif framework is 'neon': + elif framework is "neon": import p1b1_baseline_neon + pkg = p1b1_baseline_neon else: raise ValueError("Invalid framework: {}".format(framework)) @@ -27,23 +33,24 @@ def run(hyper_parameter_map): params = pkg.initialize_parameters() runner_utils.format_params(hyper_parameter_map) - for k,v in hyper_parameter_map.items(): - #if not k in params: + for k, v in hyper_parameter_map.items(): + # if not k in params: # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) params[k] = v print(params) history = pkg.run(params) - if framework is 'keras': + if framework is "keras": # works around this error: # https://github.com/tensorflow/tensorflow/issues/3388 try: from keras import backend as K + K.clear_session() - except AttributeError: # theano does not have this function + except AttributeError: # theano does not have this function pass # use the last validation_loss as the value to minimize - val_loss = history.history['val_loss'] + val_loss = history.history["val_loss"] return val_loss[-1] diff --git a/workflows/random/python/test/run_test_p1b1.sh b/workflows/random/python/test/run_test_p1b1.sh index 65e4c62a..14e4964d 100755 --- a/workflows/random/python/test/run_test_p1b1.sh +++ b/workflows/random/python/test/run_test_p1b1.sh @@ -4,4 +4,4 @@ P1B1_DIR=../../../../../Benchmarks/Pilot1/P1B1 export PYTHONPATH="$PWD/..:$P1B1_DIR:../../../common/python" echo $PYTHONPATH -python test_p1b1.py \ No newline at end of file +python test_p1b1.py diff --git a/workflows/random/python/test/test_p1b1.py b/workflows/random/python/test/test_p1b1.py index 192de79b..8c0cdd9e 100644 --- a/workflows/random/python/test/test_p1b1.py +++ b/workflows/random/python/test/test_p1b1.py @@ -1,14 +1,17 @@ import p1b1_runner + def main(): - hyper_parameter_map = {'epochs' : 1} - hyper_parameter_map['batch_size'] = 40 - hyper_parameter_map['dense'] = [1900, 500] - hyper_parameter_map['framework'] = 'keras' - hyper_parameter_map['save'] = './p1bl1_output' + hyper_parameter_map = {"epochs": 1} + hyper_parameter_map["batch_size"] = 40 + hyper_parameter_map["dense"] = [1900, 500] + hyper_parameter_map["framework"] = "keras" + hyper_parameter_map["save"] = "./p1bl1_output" validation_loss = p1b1_runner.run(hyper_parameter_map) print("Validation Loss: ", validation_loss) -if __name__ == '__main__': + + +if __name__ == "__main__": main() diff --git a/workflows/random/swift/cooley_workflow.sh b/workflows/random/swift/cooley_workflow.sh index 50cd66ab..ddd2dfee 100755 --- a/workflows/random/swift/cooley_workflow.sh +++ b/workflows/random/swift/cooley_workflow.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Usage: ./run +# Usage: ./run # if [ "$#" -ne 1 ]; then @@ -76,4 +76,3 @@ set -x export TURBINE_LOG=1 echo swift-t -l -n $PROCS $MACHINE -p $ENVS $EMEWS_PROJECT_ROOT/random-sweep.swift $* --settings=$PWD/../data/settings.json swift-t -l -n $PROCS $MACHINE -p $ENVS $EMEWS_PROJECT_ROOT/random-sweep.swift $* --settings=$PWD/../data/settings.json - diff --git 
a/workflows/random/swift/workflow.sh b/workflows/random/swift/workflow.sh index 08bcef8d..1ea94308 100755 --- a/workflows/random/swift/workflow.sh +++ b/workflows/random/swift/workflow.sh @@ -62,7 +62,7 @@ CMD_LINE_ARGS=( -param_set_file=$PARAM_SET_FILE # settings.json file has all the parameter combinations to be tested #echo swift-t -l -n $PROCS $EMEWS_PROJECT_ROOT/random-sweep.swift $* -#swift-t -l -n $PROCS $EMEWS_PROJECT_ROOT/random-sweep.swift $* --settings=$PWD/../data/settings.json +#swift-t -l -n $PROCS $EMEWS_PROJECT_ROOT/random-sweep.swift $* --settings=$PWD/../data/settings.json @@ -86,5 +86,3 @@ swift-t -n $PROCS \ $( python_envs ) \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ $EMEWS_PROJECT_ROOT/swift/random-sweep.swift ${CMD_LINE_ARGS[@]} - - diff --git a/workflows/random/test/cfg-prm-1.sh b/workflows/random/test/cfg-prm-1.sh index 3db2ca4a..e60e7613 100644 --- a/workflows/random/test/cfg-prm-1.sh +++ b/workflows/random/test/cfg-prm-1.sh @@ -11,4 +11,3 @@ MAX_BUDGET=${MAX_BUDGET:-1800} DESIGN_SIZE=${DESIGN_SIZE:-2} PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/parameter_set.R} MODEL_NAME="p1b1" - diff --git a/workflows/random/test/cfg-sys-1.sh b/workflows/random/test/cfg-sys-1.sh index 6e48105f..b0afa605 100644 --- a/workflows/random/test/cfg-sys-1.sh +++ b/workflows/random/test/cfg-sys-1.sh @@ -18,4 +18,3 @@ export WALLTIME=${WALLTIME:-01:33:00} # Benchmark run timeout: benchmark run will timeouT # after the specified number of seconds. -1 is no timeout. BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} - diff --git a/workflows/random/test/test.sh b/workflows/random/test/test.sh index 5cd5b3d1..57bfd2aa 100755 --- a/workflows/random/test/test.sh +++ b/workflows/random/test/test.sh @@ -35,4 +35,3 @@ SCRIPT=$( basename $0 .sh ) check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID echo "$SCRIPT: SUCCESS" - diff --git a/workflows/test-horovod/make.sh b/workflows/test-horovod/make.sh index 6d416ec9..5463b7f1 100755 --- a/workflows/test-horovod/make.sh +++ b/workflows/test-horovod/make.sh @@ -22,7 +22,7 @@ make # mpicc -c -fPIC $TCL_INCLUDE_SPEC -I$CONTROLLER horovod_wrap.c # mpicc -shared -o libhorovod.so horovod_wrap.o $CONTROLLER/controller.o \ -# -l python2.7 +# -l python2.7 # tclsh make-package.tcl > pkgIndex.tcl # stc -r $PWD test-horovod.swift diff --git a/workflows/test-horovod/test.py b/workflows/test-horovod/test.py index 44e1c1c2..9702a536 100644 --- a/workflows/test-horovod/test.py +++ b/workflows/test-horovod/test.py @@ -1,7 +1,7 @@ - # This is the test Horovod program print("Importing...") -import keras import horovod.keras as hvd +import keras + print("Imported.") diff --git a/workflows/upf/README.md b/workflows/upf/README.md index ccbe4267..42346872 100644 --- a/workflows/upf/README.md +++ b/workflows/upf/README.md @@ -1,47 +1,48 @@ -# Evaluate an Unrolled Parameter File (UPF) # +# Evaluate an Unrolled Parameter File (UPF) This workflow evaluates ensembles of "Benchmark" available here: `git@github.com:ECP-CANDLE/Benchmarks.git` for a given set of parameters. -## Running ## +## Running -1. cd into the *~/Supervisor/workflows/upf/test* directory -2. Specify the MODEL_NAME in *upf-1.sh* file, hyperparameters in *upf-1.txt* -3. Specify the #procs, queue etc. in *cfg-sys-1.sh* file -4. Launch the test by invoking *./upf-1.sh * - where machine_name can be cori, theta, titan etc. +1. cd into the _~/Supervisor/workflows/upf/test_ directory +2. Specify the MODEL*NAME in \_upf-1.sh* file, hyperparameters in _upf-1.txt_ +3. Specify the #procs, queue etc. 
in _cfg-sys-1.sh_ file +4. Launch the test by invoking _./upf-1.sh _ + where machine_name can be cori, theta, titan etc. 5. The benchmark will be run for the number of processors specified 6. Final objective function value will be available in the experiments directory and also printed - -## User requirements ## +## User requirements What you need to install to run the workflow: -* This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . +- This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . Clone and `cd` to `workflows/nt3_mlrMBO` (the directory containing this README). -* NT3 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . +- NT3 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . Clone and switch to the `frameworks` branch. -* benchmark data - - See the individual benchmarks README for obtaining the initial data -* Swift/T with the recently implemented JSON module, +- benchmark data - + See the individual benchmarks README for obtaining the initial data +- Swift/T with the recently implemented JSON module, cf. https://github.com/swift-lang/swift-t/issues/121 -## Calling sequence ## +## Calling sequence Script call stack :- -* upf-1.sh -> swift/workflow.sh -> swift/workflow.swift -> -common/swift/obj_app.swift -> common/sh/model.sh -> -common/python/model_runner.py -> 'calls the benchmark' + +- upf-1.sh -> swift/workflow.sh -> swift/workflow.swift -> + common/swift/obj_app.swift -> common/sh/model.sh -> + common/python/model_runner.py -> 'calls the benchmark' Scheduling scripts :- -* upf-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files -## Infer workflow ## +- upf-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files + +## Infer workflow This workflow assumes you have a data directory (called, say, DATA) containing run directories for processing with the new infer.py script -### Quick start ### +### Quick start ``` $ cd workflows/upf/test @@ -57,8 +58,8 @@ $ ./mk-infer-upf.sh upf-DATA.txt /path/to/DATA/uq.{40..100} $ ./upf-infer.sh cori upf-DATA.txt ``` -### File index ### +### File index -* mk-infer-upf.sh: Assembles the JSON fragments into the UPF -* infer-template.json: M4 template for JSON fragments. Populated by environment variables set in mk-infer-upf.sh -* swift/workflow.{sh,swift}: Normal UPF workflow but newly extracts id from JSON template. The id is used as the run output directory +- mk-infer-upf.sh: Assembles the JSON fragments into the UPF +- infer-template.json: M4 template for JSON fragments. Populated by environment variables set in mk-infer-upf.sh +- swift/workflow.{sh,swift}: Normal UPF workflow but newly extracts id from JSON template. 
The id is used as the run output directory diff --git a/workflows/upf/test/upf-infer-orig.txt b/workflows/upf/test/upf-infer-orig.txt index f5ea51d6..792c12c3 100644 --- a/workflows/upf/test/upf-infer-orig.txt +++ b/workflows/upf/test/upf-infer-orig.txt @@ -1,3 +1,2 @@ {"model_file": "/global/homes/b/brettin/m2924/brettin/washington/uq.1/save/combo.A=relu.B=32.E=50.O=adam.LR=None.CF=r.DF=d.wu_lr.re_lr.res.D1=1000.D2=1000.D3=1000.D4=1000.model.h5", "weights_file": "/global/homes/b/brettin/m2924/brettin/washington/uq.1/save/combo.A=relu.B=32.E=50.O=adam.LR=None.CF=r.DF=d.wu_lr.re_lr.res.D1=1000.D2=1000.D3=1000.D4=1000.weights.h5", "drug_set": "ALMANAC", "sample_set": "GDSC"} {"model_file": "/global/homes/b/brettin/m2924/brettin/washington/uq.2/save/combo.A=relu.B=32.E=50.O=adam.LR=None.CF=r.DF=d.wu_lr.re_lr.res.D1=1000.D2=1000.D3=1000.D4=1000.model.h5", "weights_file": "/global/homes/b/brettin/m2924/brettin/washington/uq.2/save/combo.A=relu.B=32.E=50.O=adam.LR=None.CF=r.DF=d.wu_lr.re_lr.res.D1=1000.D2=1000.D3=1000.D4=1000.weights.h5", "drug_set": "ALMANAC", "sample_set": "GDSC"} - diff --git a/workflows/uq-noise/scripts/plot-extract-logs.py b/workflows/uq-noise/scripts/plot-extract-logs.py index 990f8159..c4145f55 100755 --- a/workflows/uq-noise/scripts/plot-extract-logs.py +++ b/workflows/uq-noise/scripts/plot-extract-logs.py @@ -1,39 +1,44 @@ #!/usr/bin/env python -import os, sys +import argparse +import os +import sys from pprint import pprint -import argparse parser = argparse.ArgumentParser() -parser.add_argument('output', - help='The workflow output directory ' + - '(input to this script)') -parser.add_argument('obj_return', - help='The key to look for in the model.logs, ' + - 'e.g., val_loss or val_acc') -parser.add_argument('data', - help='The extracted data ' + - '(output from this script)') +parser.add_argument("output", + help="The workflow output directory " + + "(input to this script)") +parser.add_argument( + "obj_return", + help="The key to look for in the model.logs, " + + "e.g., val_loss or val_acc", +) +parser.add_argument("data", + help="The extracted data " + "(output from this script)") # print(sys.argv) args = parser.parse_args(sys.argv[1:]) values = {} + def dict_append(D, key, value): if key not in values.keys(): D[key] = [] D[key].append(value) + def tokenize(line): - results = [ token for token in line.split(" ") - if len(token) > 0 ] + results = [token for token in line.split(" ") if len(token) > 0] return results + def is_final_report(line): - return ("/step" in line) + return "/step" in line + def parse_model_log(f, obj_return): - target = obj_return+":" + target = obj_return + ":" with open(f) as fp: for line in fp: tokens = tokenize(line) @@ -47,14 +52,15 @@ def parse_model_log(f, obj_return): if not is_final_report(line): continue tokens = tokenize(line) - for i in range(0, len(tokens)-1): + for i in range(0, len(tokens) - 1): if tokens[i] == target: - value = float(tokens[i+1]) - break # 1 level + value = float(tokens[i + 1]) + break # 1 level if value == "NOTFOUND": print("NOTFOUND " + f) return (noise_level, value) + for d in os.walk(args.output): if "model.log" not in d[2]: continue @@ -71,4 +77,4 @@ def parse_model_log(f, obj_return): # print("noise=%i count=%i", noise, count) # print(values[noise]) s = sum(values[noise]) - fp.write("%8.4f %8.4f # count=%i\n" % (noise, s/count, count)) + fp.write("%8.4f %8.4f # count=%i\n" % (noise, s / count, count)) diff --git a/workflows/uq-noise/scripts/plot-extract.py b/workflows/uq-noise/scripts/plot-extract.py 
index fd14336f..3abc10de 100755 --- a/workflows/uq-noise/scripts/plot-extract.py +++ b/workflows/uq-noise/scripts/plot-extract.py @@ -1,15 +1,16 @@ #!/usr/bin/env python -import os, sys +import os +import sys print(sys.argv) import argparse + parser = argparse.ArgumentParser() -parser.add_argument('output', - help='The workflow output file (input to this script)') -parser.add_argument('data', - help='The extracted data (output from this script)') +parser.add_argument("output", + help="The workflow output file (input to this script)") +parser.add_argument("data", help="The extracted data (output from this script)") args = parser.parse_args(sys.argv[1:]) values = {} @@ -17,8 +18,7 @@ with open(args.output) as fp: for line in fp: tokens = line.split(" ") - if tokens[0] == 'result' and \ - tokens[2] == ":": + if tokens[0] == "result" and tokens[2] == ":": noise = float(tokens[4]) value = float(tokens[6]) if noise not in values.keys(): @@ -32,4 +32,4 @@ for noise in noises: n = len(values[noise]) s = sum(values[noise]) - fp.write("%8.4f %8.4f\n" % (noise, s/n)) + fp.write("%8.4f %8.4f\n" % (noise, s / n)) diff --git a/workflows/uq-noise/swift/workflow-abstention.sh b/workflows/uq-noise/swift/workflow-abstention.sh index 8dc0c779..fe79d41b 100755 --- a/workflows/uq-noise/swift/workflow-abstention.sh +++ b/workflows/uq-noise/swift/workflow-abstention.sh @@ -152,7 +152,7 @@ cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-noise.swift $TURBIN if [[ ${SITE} == "summit" ]] then - export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" fi TURBINE_RESIDENT_WORK_WORKERS=1 @@ -193,4 +193,3 @@ then fi # echo "EXIT CODE: 0" | tee -a $STDOUT - diff --git a/workflows/uq-noise/swift/workflow-abstention.swift b/workflows/uq-noise/swift/workflow-abstention.swift index 3fdda975..676aec61 100644 --- a/workflows/uq-noise/swift/workflow-abstention.swift +++ b/workflows/uq-noise/swift/workflow-abstention.swift @@ -60,4 +60,3 @@ foreach level, i in std_dev_array run_id, std_dev, result); } } - diff --git a/workflows/uq-noise/swift/workflow-gauss-abs.sh b/workflows/uq-noise/swift/workflow-gauss-abs.sh index 3061c3d5..86937f73 100755 --- a/workflows/uq-noise/swift/workflow-gauss-abs.sh +++ b/workflows/uq-noise/swift/workflow-gauss-abs.sh @@ -152,7 +152,7 @@ cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-gauss-abs.swift $TU if [[ ${SITE} == "summit" ]] then - export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" fi TURBINE_RESIDENT_WORK_WORKERS=1 @@ -193,4 +193,3 @@ then fi # echo "EXIT CODE: 0" | tee -a $STDOUT - diff --git a/workflows/uq-noise/swift/workflow-gauss-abs.swift b/workflows/uq-noise/swift/workflow-gauss-abs.swift index fd87b698..a1fbb973 100644 --- a/workflows/uq-noise/swift/workflow-gauss-abs.swift +++ b/workflows/uq-noise/swift/workflow-gauss-abs.swift @@ -69,5 +69,3 @@ foreach level, i in std_dev_array run_id, std_dev, result); } } - - diff --git a/workflows/uq-noise/swift/workflow-gnoise.sh b/workflows/uq-noise/swift/workflow-gnoise.sh index 3369d556..38fc9460 100755 --- a/workflows/uq-noise/swift/workflow-gnoise.sh +++ b/workflows/uq-noise/swift/workflow-gnoise.sh @@ -193,4 +193,3 @@ then fi # echo "EXIT CODE: 0" | tee -a $STDOUT - diff --git a/workflows/uq-noise/swift/workflow-gnoise.swift b/workflows/uq-noise/swift/workflow-gnoise.swift index 1f37a269..6ee4bf11 100644 --- a/workflows/uq-noise/swift/workflow-gnoise.swift +++ b/workflows/uq-noise/swift/workflow-gnoise.swift @@ -62,4 +62,3 @@ 
foreach level, i in std_dev_array run_id, std_dev, result); } } - diff --git a/workflows/uq-noise/swift/workflow-noise.sh b/workflows/uq-noise/swift/workflow-noise.sh index 9efb8247..fef7067a 100755 --- a/workflows/uq-noise/swift/workflow-noise.sh +++ b/workflows/uq-noise/swift/workflow-noise.sh @@ -193,4 +193,3 @@ then fi # echo "EXIT CODE: 0" | tee -a $STDOUT - diff --git a/workflows/uq-noise/swift/workflow-noise.swift b/workflows/uq-noise/swift/workflow-noise.swift index 4c0ea381..52b9dad0 100644 --- a/workflows/uq-noise/swift/workflow-noise.swift +++ b/workflows/uq-noise/swift/workflow-noise.swift @@ -60,4 +60,3 @@ foreach level, i in label_noise_array run_id, label_noise, result); } } - diff --git a/workflows/uq-noise/swift/workflow.sh b/workflows/uq-noise/swift/workflow.sh index b8f3464f..09d70576 100755 --- a/workflows/uq-noise/swift/workflow.sh +++ b/workflows/uq-noise/swift/workflow.sh @@ -196,4 +196,3 @@ then fi # echo "EXIT CODE: 0" | tee -a $STDOUT - diff --git a/workflows/uq-noise/swift/workflow.swift b/workflows/uq-noise/swift/workflow.swift index 9c89f57c..6fec7dd5 100644 --- a/workflows/uq-noise/swift/workflow.swift +++ b/workflows/uq-noise/swift/workflow.swift @@ -58,4 +58,3 @@ foreach levelx, i in x_noise_levels } } } - diff --git a/workflows/uq-noise/swift/xy_workflow.swift b/workflows/uq-noise/swift/xy_workflow.swift index 929feeb7..56c8ea4c 100644 --- a/workflows/uq-noise/swift/xy_workflow.swift +++ b/workflows/uq-noise/swift/xy_workflow.swift @@ -53,4 +53,3 @@ foreach levelx, i in x_noise_levels } } } - diff --git a/workflows/xcorr/CandleFeatureSelectionFunction.py b/workflows/xcorr/CandleFeatureSelectionFunction.py index f3ae13e6..bcfc8216 100644 --- a/workflows/xcorr/CandleFeatureSelectionFunction.py +++ b/workflows/xcorr/CandleFeatureSelectionFunction.py @@ -2,7 +2,6 @@ import pandas as pd - # Use cross-correlation to select the features that are generalizable between data1 and data2. # data1: an array, where rows are samples and columns are features # data2: an array, where rows are samples and columns are features. data1 and data2 should have an equal @@ -17,8 +16,12 @@ def crossCorrelation_FS(data1, data2, cutoff): num = data1.shape[1] cor = [] for i in range(num): - cor.append(np.corrcoef(np.vstack((list(cor1[:i, i]) + list(cor1[(i + 1):, i]), - list(cor2[:i, i]) + list(cor2[(i + 1):, i]))))[0, 1]) + cor.append( + np.corrcoef( + np.vstack(( + list(cor1[:i, i]) + list(cor1[(i + 1):, i]), + list(cor2[:i, i]) + list(cor2[(i + 1):, i]), + )))[0, 1]) cor = np.array(cor) if cutoff < 1: fid = np.where(cor >= cutoff)[0] @@ -27,7 +30,6 @@ def crossCorrelation_FS(data1, data2, cutoff): return sorted(fid) - # Use COXEN approach to select predictive and generalizable genes for prediction. # study1: the name of study 1, should be one of 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60' # study2: the name of study 2, should be one of 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60' @@ -41,26 +43,41 @@ def crossCorrelation_FS(data1, data2, cutoff): # whose cross-correlation coefficient >= cutoffCrossCorrelation are selected. If cutoffCrossCorrelation >= 1, # it must be an integer indicating the number of genes to be selected based on cross-correlation coefficient. 
-def COXEN_FeatureSelection(study1, study2, rnaSeqData, drugResponseData, cutoffCorrelation=200, cutoffCrossCorrelation=100): + +def COXEN_FeatureSelection( + study1, + study2, + rnaSeqData, + drugResponseData, + cutoffCorrelation=200, + cutoffCrossCorrelation=100, +): # get rnaSeq data of study1 and study2 - study = np.array([i.split('.')[0] for i in rnaSeqData.index]) + study = np.array([i.split(".")[0] for i in rnaSeqData.index]) data1 = rnaSeqData.iloc[np.where(study == study1)[0], :] data2 = rnaSeqData.iloc[np.where(study == study2)[0], :] # keep only drug response data of cell lines in data1 - drugResponseData = drugResponseData.iloc[np.where(drugResponseData.SOURCE == study1)[0], :] - drugResponseData = drugResponseData.iloc[np.where(np.isin(drugResponseData.CELLNAME, data1.index))[0], :] + drugResponseData = drugResponseData.iloc[np.where( + drugResponseData.SOURCE == study1)[0], :] + drugResponseData = drugResponseData.iloc[ + np.where(np.isin(drugResponseData.CELLNAME, data1.index))[0], :] # perform the first step of COXEN approach to select predictive genes. To avoid exceeding the memory limit, # the prediction power of genes (i.e. absolute correlation coefficient with drug response) is calculated in batches. batchSize = 100 - numBatch = int(np.ceil(data1.shape[1]/batchSize)) + numBatch = int(np.ceil(data1.shape[1] / batchSize)) cor = np.empty((data1.shape[1], 1)) for i in range(numBatch): - startIndex = i*batchSize - endIndex = min((i+1)*batchSize, data1.shape[1]) - cor_i = np.corrcoef(np.vstack((np.transpose(data1.iloc[:, startIndex:endIndex].loc[drugResponseData.CELLNAME, - :].values), np.reshape(drugResponseData.GROWTH.values, (1, drugResponseData.shape[0]))))) + startIndex = i * batchSize + endIndex = min((i + 1) * batchSize, data1.shape[1]) + cor_i = np.corrcoef( + np.vstack(( + np.transpose(data1.iloc[:, startIndex:endIndex].loc[ + drugResponseData.CELLNAME, :].values), + np.reshape(drugResponseData.GROWTH.values, + (1, drugResponseData.shape[0])), + ))) cor[startIndex:endIndex, 0] = abs(cor_i[:-1, -1]) if cutoffCorrelation < 1: gid1 = np.where(cor >= cutoffCorrelation)[0] @@ -72,22 +89,41 @@ def COXEN_FeatureSelection(study1, study2, rnaSeqData, drugResponseData, cutoffC data2 = data2.iloc[:, gid1] # perform the second step of COXEN approach to select generalizable genes among the predictive genes - gid2 = crossCorrelation_FS(data1.values, data2.values, cutoffCrossCorrelation) + gid2 = crossCorrelation_FS(data1.values, data2.values, + cutoffCrossCorrelation) # return the gene names return data1.columns[gid2] - # Load data. -rnaSeqData = pd.read_csv('/home/nick/Documents/repos/Benchmarks/Data/Pilot1/combined_rnaseq_data_lincs1000_combat', sep='\t', engine='c', na_values=['na', '-', ''], - header=0, index_col=0) -drugResponseData = pd.read_csv('/home/nick/Documents/repos/Benchmarks/Data/Pilot1/rescaled_combined_single_drug_growth', sep='\t', engine='c', - na_values=['na', '-', ''], header=0, index_col=None) +rnaSeqData = pd.read_csv( + "/home/nick/Documents/repos/Benchmarks/Data/Pilot1/combined_rnaseq_data_lincs1000_combat", + sep="\t", + engine="c", + na_values=["na", "-", ""], + header=0, + index_col=0, +) +drugResponseData = pd.read_csv( + "/home/nick/Documents/repos/Benchmarks/Data/Pilot1/rescaled_combined_single_drug_growth", + sep="\t", + engine="c", + na_values=["na", "-", ""], + header=0, + index_col=None, +) # Sample selection and filtering should be done here by selecting a part of drugResponseData or a part of rnaSeqData. 
# The following line of code is just a example randomly selecting 10000 samples through subsetting drugResponseData. -drugResponseData = drugResponseData.iloc[np.random.permutation(drugResponseData.shape[0])[:10000], :] - -selectedGenes = COXEN_FeatureSelection(study1='CTRP', study2='CCLE', rnaSeqData=rnaSeqData, - drugResponseData=drugResponseData, cutoffCorrelation=100, cutoffCrossCorrelation=50) +drugResponseData = drugResponseData.iloc[ + np.random.permutation(drugResponseData.shape[0])[:10000], :] + +selectedGenes = COXEN_FeatureSelection( + study1="CTRP", + study2="CCLE", + rnaSeqData=rnaSeqData, + drugResponseData=drugResponseData, + cutoffCorrelation=100, + cutoffCrossCorrelation=50, +) diff --git a/workflows/xcorr/CandlePilotWorkflow.py b/workflows/xcorr/CandlePilotWorkflow.py index cedc9175..6ccbbe96 100644 --- a/workflows/xcorr/CandlePilotWorkflow.py +++ b/workflows/xcorr/CandlePilotWorkflow.py @@ -1,9 +1,9 @@ import sys + import numpy as np from scipy.stats import ttest_ind - # Could be used for NT3 (f(row) -> binary) # Use t-test to select features that are discriminative between two sample classes # data: an array, where rows are samples and columns are features (e.g., RNA expression row) @@ -16,12 +16,14 @@ def ttest_FS(data, label, cutoff): unique_label = list(set(label)) if len(unique_label) != 2: - print('T-test feature selection needs two sample classes') + print("T-test feature selection needs two sample classes") return None id0 = np.where(label == unique_label[0])[0] id1 = np.where(label == unique_label[1])[0] if len(id0) < 3 or len(id1) < 3: - print('T-test feature selection requires every sample class has at least 3 samples') + print( + "T-test feature selection requires every sample class has at least 3 samples" + ) return None t, p = ttest_ind(a=data[id0, :], b=data[id1, :], axis=0, equal_var=False) if cutoff < 1: @@ -41,16 +43,17 @@ def ttest_FS(data, label, cutoff): # integer indicating the number of features to be selected based on absolute correlation coefficient. # Returns a list of indices of the selected features. def correlation_FS(data, target, cutoff): - cor = np.corrcoef(np.vstack((np.transpose(data), np.reshape(target, (1, len(target)))))) + cor = np.corrcoef( + np.vstack((np.transpose(data), np.reshape(target, (1, len(target)))))) cor = abs(cor[:-1, -1]) if cutoff < 1: fid = np.where(cor >= cutoff)[0] else: - fid = sorted(range(len(cor)), key=lambda x: cor[x], reverse=True)[:int(cutoff)] + fid = sorted(range(len(cor)), key=lambda x: cor[x], + reverse=True)[:int(cutoff)] return sorted(fid) - # Use the COXEN approach to select the features that are generalizable between data1 and data2. # data1: an array, where rows are samples and columns are features # data2: an array, where rows are samples and columns are features. 
data1 and data2 should have an equal @@ -65,32 +68,34 @@ def COXEN_FS(data1, data2, cutoff): num = data1.shape[1] cor = [] for i in range(num): - cor.append(np.corrcoef(np.vstack((list(cor1[:i, i]) + list(cor1[(i + 1):, i]), - list(cor2[:i, i]) + list(cor2[(i + 1):, i]))))[0, 1]) + cor.append( + np.corrcoef( + np.vstack(( + list(cor1[:i, i]) + list(cor1[(i + 1):, i]), + list(cor2[:i, i]) + list(cor2[(i + 1):, i]), + )))[0, 1]) cor = np.array(cor) if cutoff < 1: fid = np.where(cor >= cutoff)[0] else: - fid = sorted(range(num), key=lambda x: cor[x], reverse=True)[:int(cutoff)] + fid = sorted(range(num), key=lambda x: cor[x], + reverse=True)[:int(cutoff)] return sorted(fid) - - - -numF = 10 # Number of features -numS = 50 # Number of samples to be multiplied by 2 +numF = 10 # Number of features +numS = 50 # Number of samples to be multiplied by 2 data1 = np.random.randn(numF, numS) for i in range(numF): - data1[i, :] = data1[i, :] + i/5 + data1[i, :] = data1[i, :] + i / 5 data2 = np.random.randn(numF, numS) data1 = np.hstack((data1, data2)) data1 = np.transpose(data1) label = np.array([0 for i in range(numS)] + [1 for i in range(numS)]) -data3 = np.random.randn(numF, int(numS/2)) +data3 = np.random.randn(numF, int(numS / 2)) for i in range(numF): - data3[i, :] = data3[i, :] + i/5 -data4 = np.random.randn(numF, int(numS/2)) + data3[i, :] = data3[i, :] + i / 5 +data4 = np.random.randn(numF, int(numS / 2)) data3 = np.hstack((data3, data4)) data3 = np.transpose(data3) diff --git a/workflows/xcorr/README.adoc b/workflows/xcorr/README.adoc index 11435d97..ea9f3bf8 100644 --- a/workflows/xcorr/README.adoc +++ b/workflows/xcorr/README.adoc @@ -14,14 +14,14 @@ COXEN approach takes the following steps. between the gene’s expression and the drug response value. Select _n_~1~ genes whose absolute correlation coefficients with drug response are the highest. -. For each of the _n_~1~ genes, do the following: +. For each of the _n_~1~ genes, do the following: .. Calculate its Pearson correlation coefficients with the other _n_~1~-1 genes based on their expression values in dataset 1, which forms a _n_~1~-1-dimensional -vector of Pearson correlation coefficients denoted by _c_~1~. -.. Calculate its Pearson correlation coefficients with the +vector of Pearson correlation coefficients denoted by _c_~1~. +.. Calculate its Pearson correlation coefficients with the . Among the _n_~1~ genes, select _n_~2~ genes whose COXEN scores are the -highest. +highest. With respect to using the results in to train a model, drug response prediction model would be trained using these _n_~2~ genes using @@ -47,11 +47,11 @@ the prediction model. The COXEN algorithm requires two input parameters _n_~1~ a _n_~2~, which are the number of candidate predictive genes and the number of selected genes in final output. These two parameters can be pre-determined before data analysis or tuned through hyperparameter search for identifying -their optimal values to build the prediction model. +their optimal values to build the prediction model. == Code -The COXEN implementation consists of two files: `xcorr.py` and `uno_xcorr.py`. +The COXEN implementation consists of two files: `xcorr.py` and `uno_xcorr.py`. * `xcorr.py` - implements COXEN correlation using numpy arrays to represent the datasets. This code encapsulates steps 1 and 2 in a generic way. See the @@ -59,8 +59,8 @@ documentation comments in each python function for more details. 
* `uno_xcorr.py` - runs the COXEN correlation code in `xcorr.py` on Pilot 1 gene and drug reponse data to produce cross correlated features files that -can be used with the Uno benchmark model. The module needs to be initialized -with gene and drug repsonse data via call to `uno_xcorr.init_uno_xcorr` before +can be used with the Uno benchmark model. The module needs to be initialized +with gene and drug repsonse data via call to `uno_xcorr.init_uno_xcorr` before running the cross correlation. For example, + @@ -79,17 +79,17 @@ a training run. For example, + ---- -uno_xcorr.coxen_feature_selection('CCLE', 'GDSC', 2000, 1000, +uno_xcorr.coxen_feature_selection('CCLE', 'GDSC', 2000, 1000, 'CCLE_GDSC_2000_1000_features.txt') ---- + where 'CCLE' and 'GDSC' are the names of cancer studies in the initialization data each with gene / drug treatment and response values. The call produces -a cross correlation file of the cell features of these two studies using +a cross correlation file of the cell features of these two studies using a correlation cutoff of 2000 (limiting __c_~1~_ in step 2.a above to those values >= 2000), and a cross -correlation cutoff of 1000 (limiting the results of step 2.c above to those +correlation cutoff of 1000 (limiting the results of step 2.c above to those >= 1000). diff --git a/workflows/xcorr/db-init.py b/workflows/xcorr/db-init.py index e88310cd..bc6fdbde 100644 --- a/workflows/xcorr/db-init.py +++ b/workflows/xcorr/db-init.py @@ -1,71 +1,73 @@ - # DB INIT PY # Initialize the SQLite DB # See db-init.sql for the table schema import sys -from xcorr_db import xcorr_db, q - from pathlib import Path + +from xcorr_db import q, xcorr_db + THIS = Path(sys.argv[0]).parent.resolve() -DB = xcorr_db('xcorr.db') +DB = xcorr_db("xcorr.db") DB.connect() + def create_tables(): - """ Set up the tables defined in the SQL file """ + """Set up the tables defined in the SQL file.""" global THIS - with open(str(THIS)+"/db-init.sql") as fp: + with open(str(THIS) + "/db-init.sql") as fp: sqlcode = fp.read() DB.executescript(sqlcode) DB.commit() + def insert_feature_names(): - """ - Copy features from the header of this datafile - into the features table - """ + """Copy features from the header of this datafile into the features + table.""" global THIS - datafile = str(THIS)+"/test_data/combined_rnaseq_data_lincs1000_combat" - #datafile = "test_data/combined_rnaseq_data_combat" + datafile = str(THIS) + "/test_data/combined_rnaseq_data_lincs1000_combat" + # datafile = "test_data/combined_rnaseq_data_combat" with open(datafile) as fp: line = fp.readline() feature_names = line.split("\t") - del feature_names[0] # Remove first token "Sample" + del feature_names[0] # Remove first token "Sample" for name in feature_names: - if name == "": continue + if name == "": + continue name = name.strip() - DB.insert(table="feature_names", - names=["name"], - values=[q(name)]) + DB.insert(table="feature_names", names=["name"], values=[q(name)]) + def insert_study_names(): - """ Copy study names from studies.txt into the DB """ + """Copy study names from studies.txt into the DB.""" global THIS studies = [] - with open(str(THIS)+"/studies.txt") as fp: + with open(str(THIS) + "/studies.txt") as fp: while True: line = fp.readline() - if line == "": break + if line == "": + break tokens = line.split("#") line = tokens[0] line = line.strip() - if line == "": continue + if line == "": + continue studies.append(line) for study in studies: - DB.insert(table="study_names", - names=["name"], - values=[q(study)]) + 
DB.insert(table="study_names", names=["name"], values=[q(study)]) + def create_indices(): - """ Create indices after data insertion for speed """ + """Create indices after data insertion for speed.""" DB.execute("create index features_index on features(record_id);") DB.execute("create index studies_index on studies ( study_id);") + # Catch and print all exceptions to improve visibility of success/failure success = False try: @@ -76,11 +78,13 @@ def create_indices(): success = True except Exception as e: import traceback + print(traceback.format_exc()) if not success: print("DB: !!! INIT FAILED !!!") import sys + sys.exit(1) print("DB: initialized successfully") diff --git a/workflows/xcorr/db-insert-junk.py b/workflows/xcorr/db-insert-junk.py index b701206c..9b1e54a2 100644 --- a/workflows/xcorr/db-insert-junk.py +++ b/workflows/xcorr/db-insert-junk.py @@ -1,4 +1,3 @@ - # DB INSERT JUNK PY # Test SQLite DB # See init-db.sql for the table schema @@ -7,23 +6,22 @@ import random import time -from xcorr_db import xcorr_db, q +from xcorr_db import q, xcorr_db -DB = xcorr_db('xcorr.db') +DB = xcorr_db("xcorr.db") feature_id2name, feature_name2id = DB.read_feature_names() -study_id2name, study_name2id = DB.read_study_names() +study_id2name, study_name2id = DB.read_study_names() feature_names = feature_name2id.keys() -study_names = study_name2id .keys() +study_names = study_name2id.keys() -for i in range(1,4): +for i in range(1, 4): cutoff_corr = 200 cutoff_xcorr = 50 - features = [ feature for feature in feature_names - if random.randint(0,300) == 0 ] - studies = [ study for study in study_names - if random.randint(0,1) == 0 ] - record = ( features, cutoff_corr, cutoff_xcorr ) - DB.insert_xcorr_record(studies, features, - cutoff_corr, cutoff_xcorr) + features = [ + feature for feature in feature_names if random.randint(0, 300) == 0 + ] + studies = [study for study in study_names if random.randint(0, 1) == 0] + record = (features, cutoff_corr, cutoff_xcorr) + DB.insert_xcorr_record(studies, features, cutoff_corr, cutoff_xcorr) diff --git a/workflows/xcorr/list-records.py b/workflows/xcorr/list-records.py index 58ba01f2..5df9c6e0 100644 --- a/workflows/xcorr/list-records.py +++ b/workflows/xcorr/list-records.py @@ -1,14 +1,13 @@ - # LIST RECORDS PY # List all the records in the DB and their metadata from record import Record -from xcorr_db import xcorr_db, q +from xcorr_db import q, xcorr_db DB = xcorr_db("xcorr.db") feature_id2name, feature_name2id = DB.read_feature_names() -study_id2name, study_name2id = DB.read_study_names() +study_id2name, study_name2id = DB.read_study_names() # Main list of records records = [] @@ -18,7 +17,8 @@ DB.execute("select rowid from records;") while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break record_ids.append(row[0]) # Read the record data @@ -33,7 +33,8 @@ DB.execute("select * from features where record_id == %i;" % record_id) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break feature = feature_id2name[row[1]] record.features.append(feature) @@ -41,7 +42,8 @@ DB.execute("select * from studies where record_id == %i;" % record_id) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break study = study_id2name[row[1]] record.studies.append(study) diff --git a/workflows/xcorr/make-fake-data.py b/workflows/xcorr/make-fake-data.py index 3ee940e4..47f00c7c 100644 --- a/workflows/xcorr/make-fake-data.py +++ b/workflows/xcorr/make-fake-data.py @@ -1,27 +1,34 @@ -import 
numpy as np import os +import numpy as np + + def make_fake_data(out_dir): - numF = 10 # Number of features - numS = 50 # Number of samples to be multiplied by 2 + numF = 10 # Number of features + numS = 50 # Number of samples to be multiplied by 2 if not os.path.exists(out_dir): - os.makedirs(out_dir) + os.makedirs(out_dir) for j in range(6): data1 = np.random.randn(numF, numS) for i in range(numF): - data1[i, :] = data1[i, :] + i/5 + data1[i, :] = data1[i, :] + i / 5 data2 = np.random.randn(numF, numS) data1 = np.hstack((data1, data2)) data1 = np.transpose(data1) - data3 = np.random.randn(numF, int(numS/2)) + data3 = np.random.randn(numF, int(numS / 2)) for i in range(numF): - data3[i, :] = data3[i, :] + i/5 - data4 = np.random.randn(numF, int(numS/2)) + data3[i, :] = data3[i, :] + i / 5 + data4 = np.random.randn(numF, int(numS / 2)) data3 = np.hstack((data3, data4)) data3 = np.transpose(data3) - np.savetxt("{}/data{}.tsv".format(out_dir, j * 2), data1, delimiter='\t') - np.savetxt("{}/data{}.tsv".format(out_dir, j * 2 + 1), data3, delimiter='\t') + np.savetxt("{}/data{}.tsv".format(out_dir, j * 2), + data1, + delimiter="\t") + np.savetxt("{}/data{}.tsv".format(out_dir, j * 2 + 1), + data3, + delimiter="\t") + if __name__ == "__main__": - make_fake_data("./test_data") \ No newline at end of file + make_fake_data("./test_data") diff --git a/workflows/xcorr/record.py b/workflows/xcorr/record.py index 96bd92cf..9e16bcbf 100644 --- a/workflows/xcorr/record.py +++ b/workflows/xcorr/record.py @@ -1,17 +1,15 @@ - # RECORD PY # Represent a record in the DB + class Record: def __init__(self): self.features = [] - self.studies = [] + self.studies = [] def scan(self, row): - self.rowid, self.ts, self.cutoff_corr, self.cutoff_xcorr = \ - row[0:4] - + self.rowid, self.ts, self.cutoff_corr, self.cutoff_xcorr = row[0:4] def print(self): print("record: " + str(self.rowid)) diff --git a/workflows/xcorr/tests/uno_xcorr_tests.py b/workflows/xcorr/tests/uno_xcorr_tests.py index c0e3a51b..b537b887 100644 --- a/workflows/xcorr/tests/uno_xcorr_tests.py +++ b/workflows/xcorr/tests/uno_xcorr_tests.py @@ -1,42 +1,46 @@ import unittest -import uno_xcorr + import numpy as np +import uno_xcorr + # Run with: PYTHONPATH=UNO_BENCHMARK_PATH:BENCHMARK_COMMON_PATH python -m unittest tests.uno_sc -# E.g. PYTHONPATH=$HOME/Documents/repos/Benchmarks/Pilot1/Uno:$HOME/Documents/repos/Benchmarks/common +# E.g. 
PYTHONPATH=$HOME/Documents/repos/Benchmarks/Pilot1/Uno:$HOME/Documents/repos/Benchmarks/common # python -m unittest tests.uno_xcorr_tests class TestUnoXcorr(unittest.TestCase): def setUp(self): if uno_xcorr.gene_df is None: - dp = './test_data/rescaled_combined_single_drug_growth.bz2' - rp = './test_data/combined_rnaseq_data_lincs1000_combat.bz2' + dp = "./test_data/rescaled_combined_single_drug_growth.bz2" + rp = "./test_data/combined_rnaseq_data_lincs1000_combat.bz2" uno_xcorr.init_uno_xcorr(rp, dp) def test_init(self): shape = (15198, 943) self.assertEqual(shape[0], uno_xcorr.gene_df.shape[0]) self.assertEqual(shape[1], uno_xcorr.gene_df.shape[1]) - + shape = (27769716, 7) self.assertEqual(shape[0], uno_xcorr.drug_df.shape[0]) self.assertEqual(shape[1], uno_xcorr.drug_df.shape[1]) def test_source(self): - sources = ['CCLE', 'CTRP', 'GDC', 'GDSC', 'NCI60', 'NCIPDM', 'gCSI'] - df_sources = uno_xcorr.gene_df['source'].unique() + sources = ["CCLE", "CTRP", "GDC", "GDSC", "NCI60", "NCIPDM", "gCSI"] + df_sources = uno_xcorr.gene_df["source"].unique() self.assertEqual(sources, list(df_sources)) def test_xcorr(self): np.random.seed(42) - drug_ids = uno_xcorr.drug_df.iloc[np.random.permutation(uno_xcorr.drug_df.shape[0])[:10000], : ].DRUG_ID - f = './test_data/gene_out.txt' - uno_xcorr.coxen_feature_selection('CCLE', 'NCI60', 200, 200, drug_ids, f) + drug_ids = uno_xcorr.drug_df.iloc[np.random.permutation( + uno_xcorr.drug_df.shape[0])[:10000], :].DRUG_ID + f = "./test_data/gene_out.txt" + uno_xcorr.coxen_feature_selection("CCLE", "NCI60", 200, 200, drug_ids, + f) with open(f) as f_in: lines = f_in.readlines() self.assertEquals(200, len(lines)) - - -if __name__ == '__main__': + + +if __name__ == "__main__": unittest.main() diff --git a/workflows/xcorr/uno_xcorr.py b/workflows/xcorr/uno_xcorr.py index 8ab5a88e..2e530a3a 100644 --- a/workflows/xcorr/uno_xcorr.py +++ b/workflows/xcorr/uno_xcorr.py @@ -1,14 +1,15 @@ import os -import pandas as pd -import numpy as np +import numpy as np +import pandas as pd import xcorr gene_df = None drug_df = None + def init_uno_xcorr(rna_seq_path, drug_response_path, drug_ids=None): - """Initialize this package for xcorr and the Uno benchmark + """Initialize this package for xcorr and the Uno benchmark. :param rna_seq_path: path to gene expression data following the format of combined_rnaseq_data_combat @@ -16,51 +17,70 @@ def init_uno_xcorr(rna_seq_path, drug_response_path, drug_ids=None): rescaled_combined_single_drug_growth """ - rank = os.getenv('PMIX_RANK') - print('rank %s Setting up uno_xcorr...' % rank) + rank = os.getenv("PMIX_RANK") + print("rank %s Setting up uno_xcorr..." 
% rank) global gene_df - gene_df = pd.read_csv(rna_seq_path, compression='infer', sep='\t', engine='c', na_values=['na', '-', ''], - header=0, index_col=0) - gene_df['study'] = gene_df.index.str.extract('^([^.]*)', expand=False) + gene_df = pd.read_csv( + rna_seq_path, + compression="infer", + sep="\t", + engine="c", + na_values=["na", "-", ""], + header=0, + index_col=0, + ) + gene_df["study"] = gene_df.index.str.extract("^([^.]*)", expand=False) global drug_df - drug_df = pd.read_csv(drug_response_path, compression='infer', sep='\t', engine='c', - na_values=['na', '-', ''], header=0, index_col=None) + drug_df = pd.read_csv( + drug_response_path, + compression="infer", + sep="\t", + engine="c", + na_values=["na", "-", ""], + header=0, + index_col=None, + ) if drug_ids is not None: - drug_df = drug_df[drug_df['DRUG_ID'].isin(drug_ids)] + drug_df = drug_df[drug_df["DRUG_ID"].isin(drug_ids)] -def select_features(df, study_col, study='all'): - """ Selects and returns a data frame from features whose - study is equal to the specified study. If study is 'all' then - all features are returned. +def select_features(df, study_col, study="all"): + """Selects and returns a data frame from features whose study is equal to + the specified study. If study is 'all' then all features are returned. :param study: a string specifing the study -- one of 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60' or 'all'. """ df1 = df - if study != 'all': + if study != "all": df1 = df1[df1[study_col] == study] return df1 ## TODO: add additional args / functions for additional sample selection -def coxen_feature_selection(study_1, study_2, correlation_cutoff, - cross_correlation_cutoff, drug_ids=None, output_file=None): - - df1 = select_features(gene_df, 'study', study_1) +def coxen_feature_selection( + study_1, + study_2, + correlation_cutoff, + cross_correlation_cutoff, + drug_ids=None, + output_file=None, +): + + df1 = select_features(gene_df, "study", study_1) # add namespace prefix as required by Uno - df1 = df1.drop(['study'], axis=1).add_prefix("rnaseq.") + df1 = df1.drop(["study"], axis=1).add_prefix("rnaseq.") - df2 = select_features(gene_df, 'study', study_2) + df2 = select_features(gene_df, "study", study_2) # add namespace prefix as required by Uno - df2 = df2.drop(['study'], axis=1).add_prefix("rnaseq.") + df2 = df2.drop(["study"], axis=1).add_prefix("rnaseq.") - dr_df = select_features(drug_df, 'SOURCE', study_1) + dr_df = select_features(drug_df, "SOURCE", study_1) if drug_ids is not None: - dr_df = dr_df[dr_df['DRUG_ID'].isin(drug_ids)] + dr_df = dr_df[dr_df["DRUG_ID"].isin(drug_ids)] # keep only drug response data of cell lines in data1 dr_df = dr_df.iloc[np.where(np.isin(dr_df.CELLNAME, df1.index))[0], :] @@ -68,19 +88,20 @@ def coxen_feature_selection(study_1, study_2, correlation_cutoff, # perform the first step of COXEN approach to select predictive genes. To avoid exceeding the memory limit, # the prediction power of genes (i.e. absolute correlation coefficient with drug response) is calculated in batches. 
gid1 = xcorr.correlation_feature_selection(df1, dr_df.GROWTH.values, - dr_df.CELLNAME, correlation_cutoff) + dr_df.CELLNAME, + correlation_cutoff) # keep only predictive genes for data1 and data2 df1 = df1.iloc[:, gid1] df2 = df2.iloc[:, gid1] gid2 = xcorr.cross_correlation_feature_selection(df1.values, df2.values, - cross_correlation_cutoff) + cross_correlation_cutoff) genes = df1.columns[gid2] if output_file is not None: - with open(output_file, 'w') as f_out: + with open(output_file, "w") as f_out: for g in genes: - f_out.write('{}\n'.format(g)) + f_out.write("{}\n".format(g)) return genes diff --git a/workflows/xcorr/xcorr.py b/workflows/xcorr/xcorr.py index 34d603fa..c489d04a 100644 --- a/workflows/xcorr/xcorr.py +++ b/workflows/xcorr/xcorr.py @@ -1,9 +1,12 @@ -import numpy as np import os +import numpy as np + + def correlation_feature_selection(data, targets, labels, cutoff): - """ - Use Pearson correlation coefficient to select predictive features for regression. + """Use Pearson correlation coefficient to select predictive features for + regression. + :param data: an data table, where rows are samples and columns are features :param label: sample labels of data, which match with targets :param targets: a vector of real numbers indicating the regression targets, with a length the same as labels. @@ -14,13 +17,17 @@ def correlation_feature_selection(data, targets, labels, cutoff): """ batchSize = 100 - numBatch = int(np.ceil(data.shape[1]/batchSize)) + numBatch = int(np.ceil(data.shape[1] / batchSize)) cor = np.empty((data.shape[1], 1)) for i in range(numBatch): - startIndex = i*batchSize - endIndex = min((i+1)*batchSize, data.shape[1]) - cor_i = np.corrcoef(np.vstack((np.transpose(data.iloc[:, startIndex:endIndex].loc[labels, - :].values), np.reshape(targets, (1, len(targets)))))) + startIndex = i * batchSize + endIndex = min((i + 1) * batchSize, data.shape[1]) + cor_i = np.corrcoef( + np.vstack(( + np.transpose( + data.iloc[:, startIndex:endIndex].loc[labels, :].values), + np.reshape(targets, (1, len(targets))), + ))) cor[startIndex:endIndex, 0] = abs(cor_i[:-1, -1]) if cutoff < 1: gid = np.where(cor >= cutoff)[0] @@ -31,13 +38,12 @@ def correlation_feature_selection(data, targets, labels, cutoff): def cross_correlation_feature_selection(data1, data2, cutoff): - """ - Use the COXEN approach to select the features that are generalizable between data1 and data2. - data1 and data2 should have an equal number of features. The features in data1 and data2 should - match. - + """Use the COXEN approach to select the features that are generalizable + between data1 and data2. data1 and data2 should have an equal number of + features. The features in data1 and data2 should match. + :param data1: an array, where rows are samples and columns are features - :param data2: an array, where rows are samples and columns are features. + :param data2: an array, where rows are samples and columns are features. :param cutoff: a positive number for selecting generalizable features. If cutoff < 1, this function selects the features with a correlation coefficient >= cutoff. If cutoff >= 1, it must be an integer indicating the number of features to be selected based on correlation coefficient. 
@@ -48,12 +54,15 @@ def cross_correlation_feature_selection(data1, data2, cutoff): num = data1.shape[1] cor = [] for i in range(num): - cor.append(np.corrcoef(np.vstack((list(cor1[:i, i]) + list(cor1[(i + 1):, i]), - list(cor2[:i, i]) + list(cor2[(i + 1):, i]))))[0, 1]) + cor.append( + np.corrcoef( + np.vstack(( + list(cor1[:i, i]) + list(cor1[(i + 1):, i]), + list(cor2[:i, i]) + list(cor2[(i + 1):, i]), + )))[0, 1]) cor = np.array(cor) if cutoff < 1: fid = np.where(cor >= cutoff)[0] else: fid = np.argsort(-cor)[:int(cutoff)] return sorted(fid) - diff --git a/workflows/xcorr/xcorr_db.py b/workflows/xcorr/xcorr_db.py index 10b123fa..6c08ab7a 100644 --- a/workflows/xcorr/xcorr_db.py +++ b/workflows/xcorr/xcorr_db.py @@ -1,4 +1,3 @@ - # XCORR DB PY # DB helper functions @@ -8,33 +7,33 @@ import sqlite3 import sys + def setup_db(db_file): - if 'DB' not in globals(): - rank = os.getenv('PMIX_RANK') - print('rank %s Connecting to DB...' % rank) + if "DB" not in globals(): + rank = os.getenv("PMIX_RANK") + print("rank %s Connecting to DB..." % rank) global DB DB = xcorr_db(db_file) DB.read_feature_names() DB.read_study_names() return DB + class xcorr_db: def __init__(self, db_file, log=False): - """ - Sets up a wrapper around the SQL connection and cursor objects - Also caches dicts that convert between names and ids for the - features and studies tables - """ - #self.conn = sqlite3.connect(db_file) - #self.cursor = self.conn.cursor() + """Sets up a wrapper around the SQL connection and cursor objects Also + caches dicts that convert between names and ids for the features and + studies tables.""" + # self.conn = sqlite3.connect(db_file) + # self.cursor = self.conn.cursor() self.db_file = db_file self.feature_id2name = None self.feature_name2id = None - self.study_id2name = None - self.study_name2id = None - self.autoclose = True - self.logger = None # Default + self.study_id2name = None + self.study_name2id = None + self.autoclose = True + self.logger = None # Default if log: logging.basicConfig(format="SQL: %(message)s") self.logger = logging.getLogger("xcorr_db") @@ -47,54 +46,66 @@ def connect(self): # provisional for cp1 runs def insert_hpo_record(self, record_id): - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") self.connect() with self.conn: - hpo_id = self.insert(table='hpos', names=['xcorr_record_id', 'time'], - values = [str(record_id), q(ts)]) + hpo_id = self.insert( + table="hpos", + names=["xcorr_record_id", "time"], + values=[str(record_id), q(ts)], + ) self.commit() return hpo_id def insert_hpo_run(self, hpo_id, param_string, run_directory): - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") self.connect() with self.conn: - run_id = self.insert(table='hpo_runs', names=['hpoid', 'params', 'run_directory', 'start'], - values = [str(hpo_id), q(param_string), q(run_directory), q(ts)]) + run_id = self.insert( + table="hpo_runs", + names=["hpoid", "params", "run_directory", "start"], + values=[str(hpo_id), + q(param_string), + q(run_directory), + q(ts)], + ) self.commit() return run_id def update_hpo_run(self, run_id, result): - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") sql = "update hpo_runs set obj_result = ?, end = ? where runid = ?" 
self.connect() with self.conn: self.cursor.execute(sql, (result, ts, run_id)) self.commit() - def insert_xcorr_record(self, studies, features, - cutoff_corr, cutoff_xcorr): - """ - Insert a new XCORR record. + def insert_xcorr_record(self, studies, features, cutoff_corr, cutoff_xcorr): + """Insert a new XCORR record. + :return: The ID of the new record """ - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - names = [ "time", "cutoff_corr", "cutoff_xcorr" ] - values = [ q(ts), str(cutoff_corr), str(cutoff_xcorr) ] + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + names = ["time", "cutoff_corr", "cutoff_xcorr"] + values = [q(ts), str(cutoff_corr), str(cutoff_xcorr)] self.connect() with self.conn: record_id = self.insert("records", names, values) for feature in features: feature_id = str(self.feature_name2id[feature]) - self.insert(table="features", - names=[ "record_id", "feature_id"], - values=[ record_id , feature_id ]) + self.insert( + table="features", + names=["record_id", "feature_id"], + values=[record_id, feature_id], + ) for study in studies: study_id = str(self.study_name2id[study]) - self.insert(table="studies", - names=[ "record_id", "study_id"], - values=[ record_id , study_id ]) + self.insert( + table="studies", + names=["record_id", "study_id"], + values=[record_id, study_id], + ) self.commit() self.log("inserted record: " + record_id) return record_id @@ -104,11 +115,13 @@ def scan_features_file(self, filename): with open(filename) as fp: while True: line = fp.readline() - if line == "": break + if line == "": + break tokens = line.split("#") line = tokens[0] line = line.strip() - if line == "": continue + if line == "": + continue line = line.replace("rnaseq.", "") results.append(line) return results @@ -122,10 +135,11 @@ def read_feature_names(self): self.feature_name2id = {} while True: row = self.cursor.fetchone() - if row == None: break + if row == None: + break rowid, name = row[0:2] self.feature_id2name[rowid] = name - self.feature_name2id[name] = rowid + self.feature_name2id[name] = rowid return self.feature_id2name, self.feature_name2id @@ -138,17 +152,19 @@ def read_study_names(self): self.study_name2id = {} while True: row = self.cursor.fetchone() - if row == None: break + if row == None: + break rowid, name = row[0:2] self.study_id2name[rowid] = name - self.study_name2id[name] = rowid + self.study_name2id[name] = rowid return self.study_id2name, self.study_name2id def insert(self, table, names, values): - """ Do a SQL insert """ - names_tpl = sql_tuple(names) + """Do a SQL insert.""" + names_tpl = sql_tuple(names) values_tpl = sql_tuple(values) - cmd = "insert into {} {} values {};".format(table, names_tpl, values_tpl) + cmd = "insert into {} {} values {};".format(table, names_tpl, + values_tpl) self.execute(cmd) rowid = str(self.cursor.lastrowid) return rowid @@ -180,19 +196,22 @@ def __del__(self): def q(s): - """ Quote the given string """ + """Quote the given string.""" return "'" + str(s) + "'" + def qL(L): - """ Quote each list entry as a string """ + """Quote each list entry as a string.""" return map(q, L) + def qA(*args): - """ Quote each argument as a string """ + """Quote each argument as a string.""" return map(q, args) + def sql_tuple(L): - """ Make the given list into a SQL-formatted tuple """ + """Make the given list into a SQL-formatted tuple.""" result = "" result += "(" result += ",".join(L) From 8754b953b9df58eea1eccf7dc284ecdd94ff69a1 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Mon, 7 Nov 2022 20:41:40 -0600 
Subject: [PATCH 313/601] o Rollback all the pre-commit --- workflows/common/sh/model.sh | 16 ++++++++++++++-- workflows/mlrMBO/swift/workflow.sh | 20 +++++++++++++------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 8f137dbe..12536259 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -87,7 +87,7 @@ then # No model_runner, need to write parameters.txt explicitly: # get hyper_parameter_map to pass as 2nd argument - + python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params $PARAMS $INIT_PARAMS_FILE MODEL_CMD=( singularity exec --nv $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET $CANDLE_DATA_DIR $INSTANCE_DIRECTORY/parameters.txt ) @@ -101,7 +101,19 @@ then # RESULT=$( sed -n '/val_loss:/{s/val_loss: \(.*\)/\1/;p}' | tail -1 ) # log "found result: $RESULT" # echo $RESULT > $INSTANCE_DIRECTORY/result.txt - echo $MODEL_CMD + + + # TODO: Add wait for the above and standardize getting results from container. + echo $MODEL_CMD & + PID=$! + # FIX: This doesn't work. + wait $PID + + + # get results of the format Loss: xxx last occurence of in the model.log file + RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1) + echo $RESULT > $INSTANCE_DIRECTORY/result.txt + else # "BENCHMARKS" # The Python command line arguments: diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index f3fb4dbb..1549c3c2 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -36,13 +36,14 @@ usage() echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" } -if (( ${#} != 7 )) +if (( ${#} != 7 )) && (( ${#} != 5 )) then usage exit 1 fi -if ! { +if (( ${#} == 7 )) +then get_site $1 # Sets SITE get_expid $2 # Sets EXPID get_cfg_sys $3 @@ -50,13 +51,20 @@ if ! 
{ MODEL_NAME=$5 CANDLE_MODEL_TYPE=$6 CANDLE_IMAGE=$7 - } -then + + elif (( ${#} == 5 )) + then + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 +else usage exit 1 fi -echo "Running "$MODEL_NAME "workflow with" $CANDLE_MODEL_TYPE "and image" $CANDLE_IMAGE +# echo "Running "$MODEL_NAME "workflow with" $CANDLE_MODEL_TYPE "and image" $CANDLE_IMAGE # Set PYTHONPATH for BENCHMARK related stuff PYTHONPATH+=:$BENCHMARK_DIR # :$BENCHMARKS_ROOT/common # This is now candle_lib @@ -189,8 +197,6 @@ swift-t -O 0 -n $PROCS \ -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ -e MODEL_SH \ -e MODEL_NAME \ - -e CANDLE_MODEL_TYPE \ - -e CANDLE_IMAGE \ -e SITE \ -e BENCHMARK_TIMEOUT \ -e SH_TIMEOUT \ From 2cc240de012e4b719fa2d4fb57058b8efeccbd89 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 9 Nov 2022 13:38:20 -0600 Subject: [PATCH 314/601] Add deletion message --- workflows/common/sh/env-mcs.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index 84c02f38..42810d0d 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -2,6 +2,8 @@ # ENV MCS # Environment settings for ANL/MCS compute nodes +# MCS CLUSTER IS GONE -- DELETE THIS ONCE JENKINS/GCE WORKS -- 2022-11-09 + export PY=/homes/jain/anaconda3/bin/python/ export R=/home/wozniak/Public/sfw/x86_64/R-3.4.1/lib/R/ # Modify to specify the location of SWIFT_T installation From 9b40d188e02ac10a19df53014bd78bba816b7c89 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 9 Nov 2022 13:40:43 -0600 Subject: [PATCH 315/601] Adding env-gce.sh --- workflows/common/sh/env-gce.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 workflows/common/sh/env-gce.sh diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh new file mode 100644 index 00000000..084125ea --- /dev/null +++ b/workflows/common/sh/env-gce.sh @@ -0,0 +1,16 @@ + +# ENV GCE +# Environment settings for ANL/GCE compute nodes + +SFW=/nfs/gce/projects/Swift-T/sfw/x86_64 +SWIFT=$SFW/swift-t/mpich/2022-11-09-Jenkins + +PATH=$SWIFT/stc/bin:$PATH + +# For test output processing: +export LOCAL=1 +export CRAY=0 + +# Cf. utils.sh +log_path LD_LIBRARY_PATH +log_path PYTHONPATH From 03d06143fbd6f5af02c50915940ad5456d2b9fb4 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 10 Nov 2022 04:04:19 -0800 Subject: [PATCH 316/601] o Get closer to running test case from GraphDRP on lambda0 --- workflows/common/sh/model.sh | 37 ++++++++--------------- workflows/mlrMBO/test/test-singularity.sh | 4 +-- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 12536259..6cf798a8 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -89,31 +89,8 @@ then # get hyper_parameter_map to pass as 2nd argument python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params $PARAMS $INIT_PARAMS_FILE - MODEL_CMD=( singularity exec --nv $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET + MODEL_CMD=( singularity exec --nv $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET $CANDLE_DATA_DIR $INSTANCE_DIRECTORY/parameters.txt ) - # train.sh must write $INSTANCE_DIRECTORY/result.txt ! 
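As a reading aid for the MODEL_CMD hunks in this patch and the ones that follow: model.sh collects the command as a bash array and later expands it with "${MODEL_CMD[@]}", optionally under a timeout. A minimal standalone sketch of that idiom, with placeholder paths rather than the real lambda0 values:

    CANDLE_IMAGE=/path/to/Model.sif            # placeholder image
    CANDLE_DATA_DIR=/tmp/candle_data_dir       # placeholder data root
    INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/run/RUN000
    MODEL_CMD=( singularity exec --nv "$CANDLE_IMAGE" train.sh 0
                "$CANDLE_DATA_DIR" "$INSTANCE_DIRECTORY/parameters.txt" )
    echo "MODEL_CMD: ${MODEL_CMD[@]}"          # logged, as in model.sh
    TIMEOUT_CMD="timeout 3600"                 # leave empty to disable the timeout
    $TIMEOUT_CMD "${MODEL_CMD[@]}"             # quoted expansion keeps each argument intact

The array form is what lets the later patches background the same command unchanged.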
- # or - # Suggest: - - # Uncomment later - # grep "CANDLE_RESULT: " $INSTANCE_DIRECTORY/model.log - # grep "CANDLE_ERROR:" - # RESULT=$( sed -n '/val_loss:/{s/val_loss: \(.*\)/\1/;p}' | tail -1 ) - # log "found result: $RESULT" - # echo $RESULT > $INSTANCE_DIRECTORY/result.txt - - - # TODO: Add wait for the above and standardize getting results from container. - echo $MODEL_CMD & - PID=$! - # FIX: This doesn't work. - wait $PID - - - # get results of the format Loss: xxx last occurence of in the model.log file - RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1) - echo $RESULT > $INSTANCE_DIRECTORY/result.txt - else # "BENCHMARKS" # The Python command line arguments: @@ -158,6 +135,18 @@ else fi log "END: SUCCESS" + + echo $MODEL_CMD & + # grep for Singularity process and wai + PID=$(ps ux | awk '/[S]ingularity/{print $2}') + wait $PID + + # get results of the format Loss: xxx last occurence of in the model.log file + RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1) + echo $RESULT > $INSTANCE_DIRECTORY/result.txt + + + exit 0 # Success # Local Variables: diff --git a/workflows/mlrMBO/test/test-singularity.sh b/workflows/mlrMBO/test/test-singularity.sh index 585679a4..1d26d992 100755 --- a/workflows/mlrMBO/test/test-singularity.sh +++ b/workflows/mlrMBO/test/test-singularity.sh @@ -48,8 +48,8 @@ then fi export CANDLE_MODEL_TYPE="SINGULARITY" -export CANDLE_IMAGE="/software/improve/images/GraphDRP:0.0.1-20221028" -export INIT_PARAMS_FILE="/tmp/test_graphdrp_apartin/graphdrp_default_model.txt" +export CANDLE_IMAGE="/software/improve/images/GraphDRP\:0.0.1-20221109.sif" +export INIT_PARAMS_FILE="/homes/jain/IMPROVE/GraphDRP/graphdrp_default_model.txt" # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE From 3d1d393f6c6e3e3468c0bcfd758ac150bea551a8 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 10 Nov 2022 05:30:53 -0800 Subject: [PATCH 317/601] o Cleanup the directory structure --- workflows/common/sh/model.sh | 16 ++++++++------ workflows/common/sh/utils.sh | 13 +++++++++-- workflows/common/swift/obj_app.swift | 2 +- workflows/mlrMBO/data/dummy_nightly.R | 20 +++++++++++++++++ workflows/mlrMBO/swift/workflow.sh | 28 ++++++++++++++---------- workflows/mlrMBO/test/cfg-prm-nightly.sh | 2 +- 6 files changed, 58 insertions(+), 23 deletions(-) create mode 100644 workflows/mlrMBO/data/dummy_nightly.R diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 6cf798a8..1080c90f 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -34,13 +34,15 @@ RUNID=$3 # Each model run, runs in its own "instance" directory # Set instance_directory to that and cd into it. -# TODO: rename INSTANCE_DIRECTORY to OUTPUT_DIR -if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] -then - INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/output/$EXPID/run/$RUNID -else # "BENCHMARKS" - INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID -fi +# # TODO: rename INSTANCE_DIRECTORY to OUTPUT_DIR +# if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] +# then +# INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/output/$EXPID/run/$RUNID +# else # "BENCHMARKS" +# INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID +# fi + +INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID # All stdout/stderr after this point goes into model.log ! 
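The awk line above is what turns the container's log into result.txt; a quick way to check its behavior against a synthetic log (the log content here is made up, not from a real run):

    printf 'epoch 1 Loss: 0.9\nepoch 2 Loss: 0.42\n' > model.log
    # FS="Loss:" splits each line at the literal text "Loss:"; NF>1 keeps only
    # lines that contain it, $2 is whatever follows, and tail -1 takes the last match.
    RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1)
    echo "$RESULT"    # prints " 0.42" (the leading space is preserved)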
mkdir -p $INSTANCE_DIRECTORY diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 93146917..c9835809 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -126,6 +126,7 @@ get_site() export SITE=$1 } + check_experiment() { if [[ -d $TURBINE_OUTPUT ]]; then while true; do @@ -157,10 +158,18 @@ get_expid() echo "get_expid(): could not find EXPID argument!" return 1 fi + + export EXPID=$1 + export CANDLE_MODEL_TYPE=$2 - EXPERIMENTS=${EXPERIMENTS:-$EMEWS_PROJECT_ROOT/experiments} + export EXPERIMENTS="" - export EXPID=$1 + if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] + then + EXPERIMENTS=${EXPERIMENTS:-$CANDLE_DATA_DIR/output/experiments} + else # "BENCHMARKS" + EXPERIMENTS=${EXPERIMENTS:-$EMEWS_PROJECT_ROOT/experiments} + fi local i=0 EXPS E TO diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index 315790af..1965149c 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/obj_app.swift @@ -56,7 +56,7 @@ { obj_result = get_results(result_file); } - // printf("result(%s): %s", run_id, obj_result); + printf("result(%s): %s", run_id, obj_result); } /** diff --git a/workflows/mlrMBO/data/dummy_nightly.R b/workflows/mlrMBO/data/dummy_nightly.R new file mode 100644 index 00000000..797c4e5b --- /dev/null +++ b/workflows/mlrMBO/data/dummy_nightly.R @@ -0,0 +1,20 @@ + +# NT3 Hyperparameter Search - Test 1 +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + makeDiscreteParam("test_batch", values = c(8, 16)), + # makeIntegerParam("epochs", lower = 1, upper = 1), +# makeDiscreteParam("activation", values = c("softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear")), +# makeDiscreteParam("dense", values = c("500 100 50", "1000 500 100 50", "2000 1000 500 100 50", "2000 1000 1000 500 100 50", "2000 1000 1000 1000 500 100 50")), + # makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), + # makeNumericParam("dropout", lower = 0, upper = 0.9), + makeNumericParam("learning_rate", lower = 0.001, upper = 0.1) +# makeDiscreteParam("conv", values = c("50 50 50 50 50 1", "25 25 25 25 25 1", "64 32 16 32 64 1", "100 100 100 100 100 1", "32 20 16 32 10 1")) + ## DEBUG PARAMETERS: DON'T USE THESE IN PRODUCTION RUN + ## makeDiscreteParam("conv", values = c("32 20 16 32 10 1")) +) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 1549c3c2..b05cdcbe 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -42,29 +42,33 @@ then exit 1 fi +CANDLE_MODEL_TYPE="BENCHMARKS" + if (( ${#} == 7 )) then - get_site $1 # Sets SITE - get_expid $2 # Sets EXPID - get_cfg_sys $3 - get_cfg_prm $4 - MODEL_NAME=$5 + echo "HERE", $5 CANDLE_MODEL_TYPE=$6 CANDLE_IMAGE=$7 - elif (( ${#} == 5 )) then - get_site $1 # Sets SITE - get_expid $2 # Sets EXPID - get_cfg_sys $3 - get_cfg_prm $4 - MODEL_NAME=$5 + echo "Not a singularity run" else usage exit 1 fi -# echo "Running "$MODEL_NAME "workflow with" $CANDLE_MODEL_TYPE "and image" $CANDLE_IMAGE +TURBINE_OUTPUT="" +if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] +then + TURBINE_OUTPUT=$CANDLE_DATA_DIR/output/ + echo "Running "$MODEL_NAME "workflow with" $CANDLE_MODEL_TYPE "and image" 
$CANDLE_IMAGE +fi + +get_site $1 # Sets SITE +get_expid $2 $CANDLE_MODEL_TYPE # Sets EXPID +get_cfg_sys $3 +get_cfg_prm $4 +MODEL_NAME=$5 # Set PYTHONPATH for BENCHMARK related stuff PYTHONPATH+=:$BENCHMARK_DIR # :$BENCHMARKS_ROOT/common # This is now candle_lib diff --git a/workflows/mlrMBO/test/cfg-prm-nightly.sh b/workflows/mlrMBO/test/cfg-prm-nightly.sh index 1aef4fec..a19e8ae3 100644 --- a/workflows/mlrMBO/test/cfg-prm-nightly.sh +++ b/workflows/mlrMBO/test/cfg-prm-nightly.sh @@ -29,7 +29,7 @@ elif [ "$MODEL_NAME" = "p1b2" ]; then elif [ "$MODEL_NAME" = "p2b1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_nightly.R} elif [ "$MODEL_NAME" = "dummy" ]; then - PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_nightly.R} + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/dummy_nightly.R} elif [ "$PARAM_SET_FILE" != "" ]; then PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} else From ee66949e440f734d1c9b4e4e2b6d4c35afe4b0b0 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 10 Nov 2022 06:09:04 -0800 Subject: [PATCH 318/601] o Make tests smaller o Add comments --- workflows/common/sh/model.sh | 3 +-- workflows/mlrMBO/test/cfg-prm-nightly.sh | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 1080c90f..e639f3ea 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -91,6 +91,7 @@ then # get hyper_parameter_map to pass as 2nd argument python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params $PARAMS $INIT_PARAMS_FILE + # TODO: May need to bind a directory MODEL_CMD=( singularity exec --nv $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET $CANDLE_DATA_DIR $INSTANCE_DIRECTORY/parameters.txt ) else # "BENCHMARKS" @@ -147,8 +148,6 @@ log "END: SUCCESS" RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1) echo $RESULT > $INSTANCE_DIRECTORY/result.txt - - exit 0 # Success # Local Variables: diff --git a/workflows/mlrMBO/test/cfg-prm-nightly.sh b/workflows/mlrMBO/test/cfg-prm-nightly.sh index a19e8ae3..47ba2936 100644 --- a/workflows/mlrMBO/test/cfg-prm-nightly.sh +++ b/workflows/mlrMBO/test/cfg-prm-nightly.sh @@ -3,11 +3,11 @@ # mlrMBO settings # Total iterations -PROPOSE_POINTS=${PROPOSE_POINTS:-15} +PROPOSE_POINTS=${PROPOSE_POINTS:-5} MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} MAX_ITERATIONS=${MAX_ITERATIONS:-3} MAX_BUDGET=${MAX_BUDGET:-180} -DESIGN_SIZE=${DESIGN_SIZE:-15} +DESIGN_SIZE=${DESIGN_SIZE:-5} # TODO: move the following code to a utility library- # this is a configuration file From d03062842a16b9fc33d104b3ecb32fbf890f1b6a Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 10 Nov 2022 07:54:39 -0800 Subject: [PATCH 319/601] o Fix pre-commit --- workflows/common/sh/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index c9835809..a35bb7a2 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -158,7 +158,7 @@ get_expid() echo "get_expid(): could not find EXPID argument!" 
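For orientation, the two call signatures this arity check distinguishes look roughly as follows; the site, experiment ID, and cfg-sys file name are illustrative, while the image path is the one used by the lambda0 test below:

    # 5-argument form: plain Benchmarks run
    swift/workflow.sh lambda0 exp-1 cfg-sys-1.sh cfg-prm-nightly.sh graphdrp

    # 7-argument form: containerized run, as in test-graphdrp-lambda0.sh
    swift/workflow.sh lambda0 exp-1 cfg-sys-1.sh cfg-prm-nightly.sh graphdrp \
        SINGULARITY /software/improve/images/GraphDRP:0.0.1-20221109.sif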
return 1 fi - + export EXPID=$1 export CANDLE_MODEL_TYPE=$2 From 8ad8338ca455b4033cde0f378576651c3311046b Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 10 Nov 2022 12:00:47 -0800 Subject: [PATCH 320/601] remove comment --- workflows/mlrMBO/swift/workflow.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index b05cdcbe..0ef5ee70 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -46,7 +46,6 @@ CANDLE_MODEL_TYPE="BENCHMARKS" if (( ${#} == 7 )) then - echo "HERE", $5 CANDLE_MODEL_TYPE=$6 CANDLE_IMAGE=$7 elif (( ${#} == 5 )) From a3b32e4e44f3959b2489badaf0e50052ce1d981c Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Sun, 13 Nov 2022 12:52:19 -0800 Subject: [PATCH 321/601] o Make singularity test specific for GraphDRP and lambda0 o Add a parameters (mlrMBO) sample file for graphDRP --- workflows/mlrMBO/data/graphdrp_small.R | 15 +++++++++++++++ workflows/mlrMBO/test/cfg-prm-nightly.sh | 2 ++ ...st-singularity.sh => test-graphdrp-lambda0.sh} | 0 3 files changed, 17 insertions(+) create mode 100644 workflows/mlrMBO/data/graphdrp_small.R rename workflows/mlrMBO/test/{test-singularity.sh => test-graphdrp-lambda0.sh} (100%) diff --git a/workflows/mlrMBO/data/graphdrp_small.R b/workflows/mlrMBO/data/graphdrp_small.R new file mode 100644 index 00000000..07a199d2 --- /dev/null +++ b/workflows/mlrMBO/data/graphdrp_small.R @@ -0,0 +1,15 @@ + +# NT3 Hyperparameter Search - Test 1 +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + makeDiscreteParam("test_batch", values = c(8, 16)), + makeIntegerParam("epochs", lower = 1, upper = 2), + # makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), + # makeNumericParam("dropout", lower = 0, upper = 0.9), + makeNumericParam("learning_rate", lower = 0.001, upper = 0.1) +) diff --git a/workflows/mlrMBO/test/cfg-prm-nightly.sh b/workflows/mlrMBO/test/cfg-prm-nightly.sh index 47ba2936..50367b0a 100644 --- a/workflows/mlrMBO/test/cfg-prm-nightly.sh +++ b/workflows/mlrMBO/test/cfg-prm-nightly.sh @@ -28,6 +28,8 @@ elif [ "$MODEL_NAME" = "p1b2" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_nightly.R} elif [ "$MODEL_NAME" = "p2b1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_nightly.R} +elif [ "$MODEL_NAME" = "graphdrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_small.R} elif [ "$MODEL_NAME" = "dummy" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/dummy_nightly.R} elif [ "$PARAM_SET_FILE" != "" ]; then diff --git a/workflows/mlrMBO/test/test-singularity.sh b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh similarity index 100% rename from workflows/mlrMBO/test/test-singularity.sh rename to workflows/mlrMBO/test/test-graphdrp-lambda0.sh From c2468c8f04e442dd7a9afe898f082a6c5bf16cde Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Sun, 13 Nov 2022 13:12:59 -0800 Subject: [PATCH 322/601] o Change config file path --- workflows/common/sh/model.sh | 2 +- workflows/mlrMBO/test/test-graphdrp-lambda0.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index e639f3ea..ef5ba957 100644 --- 
a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -92,7 +92,7 @@ then python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params $PARAMS $INIT_PARAMS_FILE # TODO: May need to bind a directory - MODEL_CMD=( singularity exec --nv $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET + MODEL_CMD=( singularity exec --bind $CANDLE_DATA_DIR --nv $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET $CANDLE_DATA_DIR $INSTANCE_DIRECTORY/parameters.txt ) else # "BENCHMARKS" diff --git a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh index 1d26d992..26634464 100755 --- a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh +++ b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh @@ -49,7 +49,7 @@ fi export CANDLE_MODEL_TYPE="SINGULARITY" export CANDLE_IMAGE="/software/improve/images/GraphDRP\:0.0.1-20221109.sif" -export INIT_PARAMS_FILE="/homes/jain/IMPROVE/GraphDRP/graphdrp_default_model.txt" +export INIT_PARAMS_FILE="/software/improve/graphdrp_default_model.txt" # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE From 183d6f16f9d8670558a3abbe97eec0b8349c0124 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 16:14:16 -0600 Subject: [PATCH 323/601] Better comments --- workflows/upf/swift/workflow.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index b13e6b1d..7b44edde 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -29,10 +29,10 @@ then fi if ! { - get_site $1 # Sets SITE - get_expid $2 # Sets EXPID, TURBINE_OUTPUT - get_cfg_sys $3 - UPF=$4 + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID, TURBINE_OUTPUT + get_cfg_sys $3 # Sets CFG_SYS + UPF=$4 # The JSON hyperparameter file } then usage From fa7a2c8f20a9aea1820b47fd5b328af1b113b637 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 16:14:33 -0600 Subject: [PATCH 324/601] In get_expid(), give CANDLE_MODEL_TYPE default=BENCHMARKS --- workflows/common/sh/utils.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index a35bb7a2..b9cd5492 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -143,11 +143,13 @@ check_experiment() { get_expid() # Get Experiment IDentifier -# EXPID is the name of the new directory under experiments/ -# If the user provides -a, this function will autogenerate -# a new EXPID under the experiments directory, -# If EXP_SUFFIX is set in the environment, the resulting -# EXPID will have that suffix. +# EXPID: The name of the new directory under experiments/ +# If the user provides -a, this function will autogenerate +# a new EXPID under the experiments directory, +# If EXP_SUFFIX is set in the environment, the resulting +# EXPID will have that suffix. +# CANDLE_MODEL_TYPE: "BENCHMARKS" or "SINGULARITY" +# Defaults to "BENCHMARKS" # RETURN VALUES: EXPID and TURBINE_OUTPUT are exported into the environment # TURBINE_OUTPUT is canonicalized, because it may be soft-linked # to another filesystem (e.g., on Summit), and must be accessible @@ -155,12 +157,12 @@ get_expid() { if (( ${#} < 1 )) then - echo "get_expid(): could not find EXPID argument!" 
+ echo "get_expid(): provide EXPID [CANDLE_MODEL_TYPE?]" return 1 fi export EXPID=$1 - export CANDLE_MODEL_TYPE=$2 + export CANDLE_MODEL_TYPE=${2:-Benchmarks} export EXPERIMENTS="" From 8284c0a36d16d4da961396638b91aa68d75e0774 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 16:17:31 -0600 Subject: [PATCH 325/601] Say EQ/R is not installed on GCE --- workflows/common/sh/env-gce.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh index 084125ea..3730085d 100644 --- a/workflows/common/sh/env-gce.sh +++ b/workflows/common/sh/env-gce.sh @@ -7,6 +7,8 @@ SWIFT=$SFW/swift-t/mpich/2022-11-09-Jenkins PATH=$SWIFT/stc/bin:$PATH +EQR=not-installed + # For test output processing: export LOCAL=1 export CRAY=0 From b9abfff021ec66896e85f0855867aa83f08e73da Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 16:20:01 -0600 Subject: [PATCH 326/601] More GCE settings --- workflows/common/sh/env-gce.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh index 3730085d..e2e8b274 100644 --- a/workflows/common/sh/env-gce.sh +++ b/workflows/common/sh/env-gce.sh @@ -13,6 +13,9 @@ EQR=not-installed export LOCAL=1 export CRAY=0 +LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} +SWIFT_IMPL="app" + # Cf. utils.sh log_path LD_LIBRARY_PATH log_path PYTHONPATH From 65a5329b1c5b60419bc47851d71e99e22cbb325e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 16:30:29 -0600 Subject: [PATCH 327/601] Debug GCE paths --- workflows/common/sh/env-gce.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh index e2e8b274..f5eaa7cf 100644 --- a/workflows/common/sh/env-gce.sh +++ b/workflows/common/sh/env-gce.sh @@ -7,6 +7,9 @@ SWIFT=$SFW/swift-t/mpich/2022-11-09-Jenkins PATH=$SWIFT/stc/bin:$PATH +echo $SWIFT +echo $PATH + EQR=not-installed # For test output processing: @@ -17,5 +20,6 @@ LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} SWIFT_IMPL="app" # Cf. 
utils.sh +log_path PATH log_path LD_LIBRARY_PATH log_path PYTHONPATH From df136f8cb45fb82555722adbbb415e814b3cf9cf Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 16:54:42 -0600 Subject: [PATCH 328/601] Add sched-gce --- workflows/common/sh/sched-gce.sh | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 workflows/common/sh/sched-gce.sh diff --git a/workflows/common/sh/sched-gce.sh b/workflows/common/sh/sched-gce.sh new file mode 100644 index 00000000..9db8ac81 --- /dev/null +++ b/workflows/common/sh/sched-gce.sh @@ -0,0 +1,4 @@ + +# SCHED GCE + +# Nothing: Unscheduled mpiexec execution From fe44a47c003b76784d4c8d7ee8afbd1fe4c76a6d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 16:54:52 -0600 Subject: [PATCH 329/601] Update Swift/T location for GCE --- workflows/common/sh/env-gce.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh index f5eaa7cf..32a982cd 100644 --- a/workflows/common/sh/env-gce.sh +++ b/workflows/common/sh/env-gce.sh @@ -2,7 +2,7 @@ # ENV GCE # Environment settings for ANL/GCE compute nodes -SFW=/nfs/gce/projects/Swift-T/sfw/x86_64 +SFW=/nfs/gce/projects/Swift-T/sfw/x86_64/U20 SWIFT=$SFW/swift-t/mpich/2022-11-09-Jenkins PATH=$SWIFT/stc/bin:$PATH From e5cee464818a5c0c0f0dd4aec17630d8f3c8f2d1 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 16:55:02 -0600 Subject: [PATCH 330/601] Better path formatting --- workflows/common/sh/utils.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index b9cd5492..a85ae129 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -65,7 +65,9 @@ log_path() # Provide the name of the variable (no dollar sign) { echo ${1}: - eval echo \$$1 | tr : '\n' | nl + eval echo \$$1 | tr : '\n' | nl --number-width=2 + echo -- + echo } which_check() From d7f6f1c0f0e8c45ee3bb90e4b5ed6e22525a2067 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 17:00:09 -0600 Subject: [PATCH 331/601] Better formatting --- workflows/common/sh/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index a85ae129..9b21d735 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -65,7 +65,7 @@ log_path() # Provide the name of the variable (no dollar sign) { echo ${1}: - eval echo \$$1 | tr : '\n' | nl --number-width=2 + eval echo \$$1 | tr : '\n' | nl --number-width=2 --number-separator ": " echo -- echo } From 84d55abc11f3b27d83d6dca7c930c8863b31549e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 17:01:48 -0600 Subject: [PATCH 332/601] Handle missing LD_LIBRARY_PATH --- workflows/common/swift/candle_utils.swift | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/workflows/common/swift/candle_utils.swift b/workflows/common/swift/candle_utils.swift index 9f033b05..de021110 100644 --- a/workflows/common/swift/candle_utils.swift +++ b/workflows/common/swift/candle_utils.swift @@ -17,10 +17,13 @@ foreach token $tokens { puts "PATH: $token" } puts "" -set tokens [ split $env(LD_LIBRARY_PATH) ":" ] -foreach token $tokens { - puts "LLP: $token" +if [ info exists env(LD_LIBRARY_PATH) ] { + set tokens [ split $env(LD_LIBRARY_PATH) ":" ] + foreach token $tokens { + puts "LLP: $token" + } } +puts "" if [ info exists env(PYTHONHOME) ] { puts "" puts "PYTHONHOME: 
$env(PYTHONHOME)" From 0c0c2cea832c23d54dc3e10e1fe4cd672c5a8501 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 20:01:05 -0600 Subject: [PATCH 333/601] More GCE settings --- workflows/common/sh/env-gce.sh | 8 +++++++- workflows/common/sh/sched-gce.sh | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 workflows/common/sh/sched-gce.sh diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh index 084125ea..35dc5c91 100644 --- a/workflows/common/sh/env-gce.sh +++ b/workflows/common/sh/env-gce.sh @@ -2,15 +2,21 @@ # ENV GCE # Environment settings for ANL/GCE compute nodes -SFW=/nfs/gce/projects/Swift-T/sfw/x86_64 +SFW=/nfs/gce/projects/Swift-T/sfw/x86_64/U20 SWIFT=$SFW/swift-t/mpich/2022-11-09-Jenkins PATH=$SWIFT/stc/bin:$PATH +EQR=not-installed + +SWIFT_IMPL="app" + # For test output processing: export LOCAL=1 export CRAY=0 +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} + # Cf. utils.sh log_path LD_LIBRARY_PATH log_path PYTHONPATH diff --git a/workflows/common/sh/sched-gce.sh b/workflows/common/sh/sched-gce.sh new file mode 100644 index 00000000..8f266bdc --- /dev/null +++ b/workflows/common/sh/sched-gce.sh @@ -0,0 +1,4 @@ + +# SCHED GCE + +# Nothing- local unscheduled mpiexec execution. From ac933841ffc6ea75f0b74e60bebd829956c8b9b5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 13 Nov 2022 20:05:53 -0600 Subject: [PATCH 334/601] New langs-app-gce --- workflows/common/sh/langs-app-gce.sh | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 workflows/common/sh/langs-app-gce.sh diff --git a/workflows/common/sh/langs-app-gce.sh b/workflows/common/sh/langs-app-gce.sh new file mode 100644 index 00000000..16ec39dc --- /dev/null +++ b/workflows/common/sh/langs-app-gce.sh @@ -0,0 +1,7 @@ + +# LANGS APP GCE + +PATH=/nfs/gce/globalscratch/jain/conda_installs/bin:$PATH + +echo "langs-app-gce: using python:" +which python From db8bbd2194a846ce55d9801a034a1afb639fe9b1 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 14 Nov 2022 16:20:22 -0600 Subject: [PATCH 335/601] Update Swift/T for GCE --- workflows/common/sh/env-gce.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh index dd3c09ed..e3c683ed 100644 --- a/workflows/common/sh/env-gce.sh +++ b/workflows/common/sh/env-gce.sh @@ -3,15 +3,19 @@ # Environment settings for ANL/GCE compute nodes SFW=/nfs/gce/projects/Swift-T/sfw/x86_64/U20 -SWIFT=$SFW/swift-t/mpich/2022-11-09-Jenkins +# Python only: +# SWIFT=$SFW/swift-t/mpich/2022-11-14-Jenkins +# Python+R: +SWIFT=$SFW/swift-t/mpich/2022-11-14-Jenkins PATH=$SWIFT/stc/bin:$PATH echo $SWIFT -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} +# Needed for Swift/T+R +export LD_LIBRARY_PATH=$SFW/R-4.1.0/lib/R/lib -EQR=not-installed +EQR=$SFW/EQ-R SWIFT_IMPL="app" # For test output processing: From 4365eaa9f9535bc0482781076af401e2d7c24be5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 15 Nov 2022 12:27:01 -0600 Subject: [PATCH 336/601] Use APP_PYTHONPATH --- workflows/common/sh/langs-app-gce.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/common/sh/langs-app-gce.sh b/workflows/common/sh/langs-app-gce.sh index 16ec39dc..7bcada0a 100644 --- a/workflows/common/sh/langs-app-gce.sh +++ b/workflows/common/sh/langs-app-gce.sh @@ -5,3 +5,5 @@ PATH=/nfs/gce/globalscratch/jain/conda_installs/bin:$PATH echo "langs-app-gce: using python:" which python + +export PYTHONPATH=${APP_PYTHONPATH:-} From 
17d94669cd22e481d2d01f69da229ea48ab46798 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 16 Nov 2022 14:50:37 -0600 Subject: [PATCH 337/601] o Fix runs for non singularity model.sh, still need to fix objective function def. and standardization of result capture. o Add wait --- scratch/horovod2/test-2.c | 3 +-- workflows/common/sh/model.sh | 11 +++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/scratch/horovod2/test-2.c b/scratch/horovod2/test-2.c index 7d6a7ee5..e49e5253 100644 --- a/scratch/horovod2/test-2.c +++ b/scratch/horovod2/test-2.c @@ -5,8 +5,7 @@ #include "controller.h" -int -main() +int main() { printf("OK\n"); return 0; diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index ef5ba957..7129e27e 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -140,6 +140,9 @@ fi log "END: SUCCESS" echo $MODEL_CMD & + +if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] +then # grep for Singularity process and wai PID=$(ps ux | awk '/[S]ingularity/{print $2}') wait $PID @@ -147,7 +150,15 @@ log "END: SUCCESS" # get results of the format Loss: xxx last occurence of in the model.log file RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1) echo $RESULT > $INSTANCE_DIRECTORY/result.txt +else + PID = $! + wait $PID + # FIXME: just grepping "loss:" and value after it and putting into result txt file + # get results of the format Loss: xxx last occurence of in the model.log file + RESULT=$(awk -v FS="loss:" 'NF>1{print $2}' model.log | tail -1) + echo $RESULT > $INSTANCE_DIRECTORY/result.txt +fi exit 0 # Success # Local Variables: From 1fd396791f2891ba5cc9741a98cb35dd62121a06 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 18 Nov 2022 13:11:10 -0600 Subject: [PATCH 338/601] Set APP_PYTHONPATH for UPF --- workflows/upf/swift/workflow.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 7b44edde..f7301184 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -44,6 +44,9 @@ PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common PYTHONPATH+=:$WORKFLOWS_ROOT/common/python export PYTHONPATH +# Set PYTHONPATH for BENCHMARK related stuff in obj_app mode +export APP_PYTHONPATH+=:$BENCHMARK_DIR # :$BENCHMARKS_ROOT/common # This is now candle_lib + source_site env $SITE source_site sched $SITE From ed161f0dbffe8903ea2c9f7316762e8742880cde Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 18 Nov 2022 13:23:04 -0600 Subject: [PATCH 339/601] WIP: Fix handling of background model run --- workflows/common/sh/model.sh | 58 +++++++++++++++++------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 7129e27e..eace5493 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -111,35 +111,7 @@ fi log "MODEL_CMD: ${MODEL_CMD[@]}" # Run Python! -if $TIMEOUT_CMD "${MODEL_CMD[@]}" -then - : # Assume success so we can keep a failed exit code -else - # $? is the exit status of the most recently executed command - # (i.e the line in the 'if' condition) - CODE=$? - echo # spacer - if (( $CODE == 124 )) - then - log "TIMEOUT ERROR! (timeout=$SH_TIMEOUT)" - # This will trigger a NaN (the result file does not exist) - exit 0 - else - log "MODEL ERROR! (CODE=$CODE)" - if (( ${IGNORE_ERRORS:-0} )) - then - log "IGNORING ERROR." 
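These model.sh hunks move the model into the background and read its exit status back through wait. A stripped-down sketch of that control flow, with sleep standing in for the model command:

    SH_TIMEOUT=5
    timeout $SH_TIMEOUT sleep 10 &    # stand-in for: $TIMEOUT_CMD "${MODEL_CMD[@]}" &
    PID=$!                            # PID of the job just backgrounded
    wait $PID
    CODE=$?                           # exit status of the backgrounded command
    if (( CODE == 124 )); then
      echo "TIMEOUT"                  # timeout(1) reports 124 on expiry
    elif (( CODE )); then
      echo "MODEL ERROR (CODE=$CODE)"
    else
      echo "SUCCESS"
    fi

One caveat worth keeping in mind for model.sh itself, which runs under set -eu: a bare wait that returns non-zero terminates the script before CODE is inspected, so in practice the call may need a guard such as CODE=0; wait $PID || CODE=$?.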
- # This will trigger a NaN (the result file does not exist) - exit 0 - fi - log "ABORTING WORKFLOW (exit 1)" - exit 1 # Unknown error in Python: abort the workflow - fi -fi - -log "END: SUCCESS" - - echo $MODEL_CMD & +$TIMEOUT_CMD "${MODEL_CMD[@]}" & if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] then @@ -151,14 +123,40 @@ then RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1) echo $RESULT > $INSTANCE_DIRECTORY/result.txt else - PID = $! + PID=$! wait $PID + CODE=$? + if (( CODE )) + then + # $? is the exit status of the most recently executed command + # (i.e the line in the 'if' condition) + echo # spacer + if (( $CODE == 124 )) + then + log "TIMEOUT ERROR! (timeout=$SH_TIMEOUT)" + # This will trigger a NaN (the result file does not exist) + exit 0 + else + log "MODEL ERROR! (CODE=$CODE)" + if (( ${IGNORE_ERRORS:-0} )) + then + log "IGNORING ERROR." + # This will trigger a NaN (the result file does not exist) + exit 0 + fi + log "ABORTING WORKFLOW (exit 1)" + exit 1 # Unknown error in Python: abort the workflow + fi + fi # FIXME: just grepping "loss:" and value after it and putting into result txt file # get results of the format Loss: xxx last occurence of in the model.log file RESULT=$(awk -v FS="loss:" 'NF>1{print $2}' model.log | tail -1) echo $RESULT > $INSTANCE_DIRECTORY/result.txt fi + +log "END: SUCCESS" + exit 0 # Success # Local Variables: From 32aeb996b114381c482a6d6040900cba38cff1c2 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 18 Nov 2022 14:25:42 -0600 Subject: [PATCH 340/601] Restore GA test-1 --- workflows/GA/swift/workflow.sh | 13 ++++++++++--- workflows/GA/swift/workflow.swift | 8 ++++++-- workflows/GA/test/cfg-sys-1.sh | 3 +++ workflows/common/sh/env-gce.sh | 2 ++ workflows/known-benchmarks.sh | 26 ++++++++++++++++++++++++++ 5 files changed, 47 insertions(+), 5 deletions(-) create mode 100644 workflows/known-benchmarks.sh diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 33d1153e..d85a4b08 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -56,11 +56,14 @@ fi echo "Running "$MODEL_NAME "workflow" source_site env $SITE -source_site sched $SITE +source_site sched $SITE -# Set PYTHONPATH for BENCHMARK related stuff EQPY=${EQPY:-$WORKFLOWS_ROOT/common/ext/EQ-Py} -PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$EQPY + +# Set PYTHONPATH for BENCHMARK related stuff +source $WORKFLOWS_ROOT/known-benchmarks.sh +PYTHONPATH+=:$EQPY +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python export TURBINE_JOBNAME="JOB:${EXPID}" CMD_LINE_ARGS=( -ga_params=$PARAM_SET_FILE @@ -134,6 +137,9 @@ fi # echo's anything following this to standard out +echo APP_PYPATH $APP_PYTHONPATH + + swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p -I $EQPY -r $EQPY \ @@ -145,6 +151,7 @@ swift-t -O 0 -n $PROCS \ -e BENCHMARKS_ROOT \ -e EMEWS_PROJECT_ROOT \ $( python_envs ) \ + -e APP_PYTHONPATH \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ -e OBJ_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ diff --git a/workflows/GA/swift/workflow.swift b/workflows/GA/swift/workflow.swift index 0d4836ff..dbca265a 100644 --- a/workflows/GA/swift/workflow.swift +++ b/workflows/GA/swift/workflow.swift @@ -10,10 +10,14 @@ import location; import string; import unix; import EQPy; -import R; import assert; import python; +import candle_utils; +report_env(); + +python("import sys ; import csv ; import _csv ; print('HELLO') ; sys.stdout.flush()"); + string emews_root = getenv("EMEWS_PROJECT_ROOT"); string turbine_output = getenv("TURBINE_OUTPUT"); 
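The RESIDENT_WORK_RANKS value read here is supplied from the shell side; the cfg-sys-1.sh hunk a little further down derives it from PROCS. A compact sketch of that arithmetic:

    PROCS=4                                      # total MPI processes given to swift-t
    export TURBINE_RESIDENT_WORK_WORKERS=1       # one rank hosts the resident EQ/Py task
    export RESIDENT_WORK_RANKS=$(( PROCS - 2 ))  # rank index the Swift script reads back
    echo "resident algorithm rank: $RESIDENT_WORK_RANKS of $PROCS ranks"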
string resident_work_ranks = getenv("RESIDENT_WORK_RANKS"); @@ -88,7 +92,7 @@ string FRAMEWORK = "keras"; // (num_iter, num_pop, seed, strategy, mut_prob, ga_params_file) algo_params = "%d,%d,%d,'%s',%f, '%s', '%s'" % (iters, pop, seed, strategy, mut_prob, ga_params_file, init_params_file); - EQPy_init_package(ME,"deap_ga") => + EQPy_init_package(ME, "deap_ga") => EQPy_get(ME) => EQPy_put(ME, algo_params) => loop(ME, ME_rank) => { diff --git a/workflows/GA/test/cfg-sys-1.sh b/workflows/GA/test/cfg-sys-1.sh index 27bd5ac5..af1a6ca3 100644 --- a/workflows/GA/test/cfg-sys-1.sh +++ b/workflows/GA/test/cfg-sys-1.sh @@ -38,6 +38,9 @@ export IGNORE_ERRORS=0 # it may be ncessary to include its location in the PYTHONPATH # export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages +export TURBINE_RESIDENT_WORK_WORKERS=1 +export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + # for running locally, edit as necessary # export PYTHONHOME=$HOME/anaconda3 # export PYTHON=python3.6 diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh index e3c683ed..2bd063c5 100644 --- a/workflows/common/sh/env-gce.sh +++ b/workflows/common/sh/env-gce.sh @@ -15,6 +15,8 @@ echo $SWIFT # Needed for Swift/T+R export LD_LIBRARY_PATH=$SFW/R-4.1.0/lib/R/lib +export PYTHONPATH=${PYTHONPATH:-} + EQR=$SFW/EQ-R SWIFT_IMPL="app" diff --git a/workflows/known-benchmarks.sh b/workflows/known-benchmarks.sh new file mode 100644 index 00000000..469a04f9 --- /dev/null +++ b/workflows/known-benchmarks.sh @@ -0,0 +1,26 @@ + +# Known Benchmarks +# Generate the list of Benchmarks that Supervisor knows about +# To add a Benchmark, ... +# To call an unknown model, set MODEL_NAME and ... + +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd ) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} + +BENCHMARKS_DIRS_BASE="" +BENCHMARKS_DIRS_BASE+=$BENCHMARKS_ROOT/Pilot1/P1B1: +BENCHMARKS_DIRS_BASE+=$BENCHMARKS_ROOT/Pilot1/Attn1: +BENCHMARKS_DIRS_BASE+=$BENCHMARKS_ROOT/Pilot1/NT3: +BENCHMARKS_DIRS_BASE+=$BENCHMARKS_ROOT/examples/ADRP: +BENCHMARKS_DIRS_BASE+=$BENCHMARKS_ROOT/examples/xform-smiles + +export BENCHMARK_TIMEOUT +export BENCHMARK_DIRS=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +# Set PYTHONPATH and/or APP_PYTHONPATH appropriately based on SWIFT_IMPL +# ... + +APP_PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common + PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common + +export APP_PYTHONPATH From 051cb2953580280d25486d334d1af0c6b6abbd3a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 18 Nov 2022 15:00:12 -0600 Subject: [PATCH 341/601] Clean up comments --- workflows/common/sh/model.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index eace5493..ac0f6a09 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -5,6 +5,9 @@ set -eu # Shell wrapper around Keras model +# Note that APP_PYTHONPATH is used by models here and +# not just PYTHONPATH + # Note: Under Swift/T, the initial output from here will go # to the main Swift/T stdout and be mixed with output from # other models. @@ -27,8 +30,8 @@ then exit 1 fi -FRAMEWORK=$1 # Usually "keras" -# JSON string of parameters +FRAMEWORK=$1 # Usually "keras" or "pytorch" +# JSON string of parameters: PARAMS="$2" RUNID=$3 @@ -128,8 +131,6 @@ else CODE=$? if (( CODE )) then - # $? 
is the exit status of the most recently executed command - # (i.e the line in the 'if' condition) echo # spacer if (( $CODE == 124 )) then @@ -149,8 +150,7 @@ else fi fi - # FIXME: just grepping "loss:" and value after it and putting into result txt file - # get results of the format Loss: xxx last occurence of in the model.log file + # Get results from model.log: last occurrence of "loss: xxx" RESULT=$(awk -v FS="loss:" 'NF>1{print $2}' model.log | tail -1) echo $RESULT > $INSTANCE_DIRECTORY/result.txt fi From 7f6e6bc7e2889f7b49a89580af06d14bc63f844f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 18 Nov 2022 15:00:21 -0600 Subject: [PATCH 342/601] More comment docs --- workflows/known-benchmarks.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/known-benchmarks.sh b/workflows/known-benchmarks.sh index 469a04f9..c630b2e5 100644 --- a/workflows/known-benchmarks.sh +++ b/workflows/known-benchmarks.sh @@ -1,8 +1,10 @@ # Known Benchmarks # Generate the list of Benchmarks that Supervisor knows about -# To add a Benchmark, ... -# To call an unknown model, set MODEL_NAME and ... +# To add a Known Benchmark, add its paths to BENCHMARKS_DIRS_BASE below +# To call an unknown model, +# set environment variable MODEL_NAME to the short name +# set PYTHONPATH and/or APP_PYTHONPATH as needed BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd ) export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} From cbb972c42f8942b091513eff32a0448845cdc93d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 15 Dec 2022 14:44:04 -0600 Subject: [PATCH 343/601] New settings for Polaris- basically works --- workflows/common/sh/env-polaris.sh | 18 ++++++++++++++++++ workflows/common/sh/langs-app-polaris.sh | 4 ++++ workflows/common/sh/sched-polaris.sh | 18 ++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 workflows/common/sh/env-polaris.sh create mode 100644 workflows/common/sh/langs-app-polaris.sh create mode 100644 workflows/common/sh/sched-polaris.sh diff --git a/workflows/common/sh/env-polaris.sh b/workflows/common/sh/env-polaris.sh new file mode 100644 index 00000000..d1e4fb54 --- /dev/null +++ b/workflows/common/sh/env-polaris.sh @@ -0,0 +1,18 @@ + +# ENV Polaris + +# SWIFT_IMPL=echo +SWIFT_IMPL=app + +CSC249=/lus/grand/projects/CSC249ADOA01 +ROOT=$CSC249/public/sfw/polaris +SWIFT=$ROOT/swift-t/2022-11-28 + +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +PY=$ROOT/Miniconda +PATH=$PY/bin:$PATH + +EQR=not-installed diff --git a/workflows/common/sh/langs-app-polaris.sh b/workflows/common/sh/langs-app-polaris.sh new file mode 100644 index 00000000..612ead56 --- /dev/null +++ b/workflows/common/sh/langs-app-polaris.sh @@ -0,0 +1,4 @@ + +# LANGS APP Polaris + +# Nothing for now diff --git a/workflows/common/sh/sched-polaris.sh b/workflows/common/sh/sched-polaris.sh new file mode 100644 index 00000000..721a490d --- /dev/null +++ b/workflows/common/sh/sched-polaris.sh @@ -0,0 +1,18 @@ + +# SCHED Summit + +# Scheduler settings for Swift/T/PBS/Polaris + +MACHINE="-m pbs" + +# Default PROJECT for CANDLE +export PROJECT=${PROJECT:-CSC249ADOA01} + +export QUEUE=${QUEUE:-debug} +export WALLTIME=${WALLTIME:-00:10:00} + +# These are Polaris-specific settings - see: +# https://www.alcf.anl.gov/support/user-guides/polaris/hardware-overview/machine-overview +# http://swift-lang.github.io/swift-t/sites.html#_polaris +export TURBINE_POLARIS=1 +export TURBINE_DIRECTIVE='#PBS -l 
filesystems=home:grand' From cc512f654b52b6f11859e137fb0df667e9da76f7 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 15 Dec 2022 14:44:27 -0600 Subject: [PATCH 344/601] Clean up workflow wrapper --- workflows/upf/swift/workflow.sh | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index f7301184..1181abc4 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -85,30 +85,15 @@ cp $CFG_SYS $TURBINE_OUTPUT # Make run directory in advance to reduce contention mkdir -pv $TURBINE_OUTPUT/run -which mpicc which swift-t # module list cp -v $UPF $TURBINE_OUTPUT -site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs to have hyphens in them as Justin implemented for Summit on 2020-10-29, e.g., summit-tf1 - -# ALW 2020-11-15: If we're running the candle wrapper scripts in which -# case if this file were being called then $CANDLE_RUN_WORKFLOW=1, -# don't set $TURBINE_LAUNCH_OPTIONS as this variable and the settings -# in the declaration below are handled by the wrapper scripts -if [[ ${site2} == "summit" && ${CANDLE_RUN_WORKFLOW:-0} != 1 ]] -then - export TURBINE_LAUNCH_OPTIONS="-a1 -g1 -c7" -fi - # TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" TURBINE_STDOUT= -echo OMP_NUM_THREADS ${OMP_NUM_THREADS:-UNSET} -export OMP_NUM_THREADS=1 - log_path LD_LIBRARY_PATH swift-t -n $PROCS \ From be6a36a623ff2ce222cbc8a8bf4738e398bc6b3b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 15 Dec 2022 14:44:39 -0600 Subject: [PATCH 345/601] Update header --- workflows/common/sh/sched-summit.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/sched-summit.sh b/workflows/common/sh/sched-summit.sh index 064176f4..ee173969 100644 --- a/workflows/common/sh/sched-summit.sh +++ b/workflows/common/sh/sched-summit.sh @@ -1,5 +1,7 @@ -# Scheduler settings for Swift/Summit +# SCHED Summit + +# Scheduler settings for Swift/T/LSF/Summit MACHINE="-m lsf" From 4e7044831ab539f12266fc2541717651b416af52 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 15 Dec 2022 16:32:50 -0600 Subject: [PATCH 346/601] Set Python location for Polaris compute node --- workflows/common/sh/langs-app-polaris.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/langs-app-polaris.sh b/workflows/common/sh/langs-app-polaris.sh index 612ead56..4be8be23 100644 --- a/workflows/common/sh/langs-app-polaris.sh +++ b/workflows/common/sh/langs-app-polaris.sh @@ -1,4 +1,4 @@ # LANGS APP Polaris -# Nothing for now +PATH=/grand/CSC249ADOA01/public/sfw/polaris/Miniconda/bin:$PATH From 3892e1b80ea5d38e065264247042502dce4edfe8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 15 Dec 2022 16:33:31 -0600 Subject: [PATCH 347/601] WIP making output directories match --- workflows/common/sh/model.sh | 38 +++++++++++++++++++-------------- workflows/common/sh/utils.sh | 16 +++++--------- workflows/upf/swift/workflow.sh | 19 +++++++++++------ workflows/upf/test/upf-1.txt | 6 +++--- 4 files changed, 43 insertions(+), 36 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index ac0f6a09..5f936117 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -38,18 +38,19 @@ RUNID=$3 # Each model run, runs in its own "instance" directory # Set instance_directory to that and cd into it. 
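The sched-polaris.sh defaults above only apply when the caller has not already exported them, so a longer run can be requested from the submitting shell; the values below are illustrative and the queue name is hypothetical (check the Polaris documentation for real queue names):

    export PROJECT=CSC249ADOA01     # CANDLE allocation, as in sched-polaris.sh
    export QUEUE=prod               # hypothetical queue name
    export WALLTIME=01:00:00
    # then invoke a workflow or test script with SITE=polaris as usual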
# # TODO: rename INSTANCE_DIRECTORY to OUTPUT_DIR -# if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] -# then -# INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/output/$EXPID/run/$RUNID -# else # "BENCHMARKS" -# INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID -# fi - -INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID +set -x +echo CMT $CANDLE_MODEL_TYPE +if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] +then + INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/$MODEL_NAME/Output/$EXPID/$RUNID +else # "BENCHMARKS" + INSTANCE_DIRECTORY=$TURBINE_OUTPUT/$RUNID +fi # All stdout/stderr after this point goes into model.log ! -mkdir -p $INSTANCE_DIRECTORY +mkdir -pv $INSTANCE_DIRECTORY LOG_FILE=$INSTANCE_DIRECTORY/model.log +set +x exec >> $LOG_FILE exec 2>&1 cd $INSTANCE_DIRECTORY @@ -86,17 +87,22 @@ echo log "USING PYTHON:" $( which python ) echo +set -x # Construct the desired model command MODEL_CMD based on CANDLE_MODEL_TYPE: -if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] +if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] then # No model_runner, need to write parameters.txt explicitly: # get hyper_parameter_map to pass as 2nd argument - python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params $PARAMS $INIT_PARAMS_FILE - # TODO: May need to bind a directory - MODEL_CMD=( singularity exec --bind $CANDLE_DATA_DIR --nv $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET - $CANDLE_DATA_DIR $INSTANCE_DIRECTORY/parameters.txt ) + + python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params \ + "$PARAMS" $INSTANCE_DIRECTORY/parameters.txt + MODEL_CMD=( singularity exec --nv + --bind $CANDLE_DATA_DIR:/candle_data_dir + $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET + $CANDLE_DATA_DIR + $INSTANCE_DIRECTORY/parameters.txt ) else # "BENCHMARKS" # The Python command line arguments: @@ -116,9 +122,9 @@ log "MODEL_CMD: ${MODEL_CMD[@]}" # Run Python! 
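A self-contained sketch of the redirection idiom referenced by that comment; after the two exec lines, everything the script and its children print lands in model.log rather than on the terminal:

    mkdir -p run/RUN000                 # placeholder instance directory
    LOG_FILE=run/RUN000/model.log
    exec >> "$LOG_FILE"                 # append all further stdout to the log
    exec 2>&1                           # merge stderr into the same file
    echo "this line goes to model.log, not the terminal"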
$TIMEOUT_CMD "${MODEL_CMD[@]}" & -if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] +if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] then - # grep for Singularity process and wai + # grep for Singularity process and wait PID=$(ps ux | awk '/[S]ingularity/{print $2}') wait $PID diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 9b21d735..e298407d 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -164,13 +164,13 @@ get_expid() fi export EXPID=$1 - export CANDLE_MODEL_TYPE=${2:-Benchmarks} + export CANDLE_MODEL_TYPE=${2:-BENCHMARKS} export EXPERIMENTS="" - if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] + if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] then - EXPERIMENTS=${EXPERIMENTS:-$CANDLE_DATA_DIR/output/experiments} + EXPERIMENTS=${EXPERIMENTS:-$CANDLE_DATA_DIR/$MODEL_NAME/Output} else # "BENCHMARKS" EXPERIMENTS=${EXPERIMENTS:-$EMEWS_PROJECT_ROOT/experiments} fi @@ -192,14 +192,14 @@ get_expid() then for E in ${EXPS[@]} do - EXPID=$( printf "X%03i" $i )${EXP_SUFFIX:-} + EXPID=$( printf "EXP%03i" $i )${EXP_SUFFIX:-} if [[ $E == $EXPID ]] then i=$(( i + 1 )) fi done fi - EXPID=$( printf "X%03i" $i )${EXP_SUFFIX:-} + EXPID=$( printf "EXP%03i" $i )${EXP_SUFFIX:-} export TURBINE_OUTPUT=$EXPERIMENTS/$EXPID check_experiment else @@ -213,12 +213,6 @@ get_expid() exit 1 fi TURBINE_OUTPUT=$TO - - # Andrew: Needed for functionality with George's restart.py script for UPF jobs - if [ -f metadata.json ]; then - mv metadata.json $TURBINE_OUTPUT - fi - } next() diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 1181abc4..8e6ae9ac 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -29,10 +29,10 @@ then fi if ! { - get_site $1 # Sets SITE - get_expid $2 # Sets EXPID, TURBINE_OUTPUT - get_cfg_sys $3 # Sets CFG_SYS - UPF=$4 # The JSON hyperparameter file + get_site $1 # Sets SITE + get_expid $2 "SINGULARITY" # Sets EXPID, TURBINE_OUTPUT + get_cfg_sys $3 # Sets CFG_SYS + UPF=$4 # The JSON hyperparameter file } then usage @@ -57,7 +57,7 @@ then abort "The site '$SITE' did not set the location of EQ/R: this will not work!" fi -export TURBINE_JOBNAME="JOB:${EXPID}" +export TURBINE_JOBNAME="UPF_${EXPID}" OBJ_PARAM_ARG="" if [[ ${OBJ_PARAM:-} != "" ]] @@ -96,6 +96,11 @@ TURBINE_STDOUT= log_path LD_LIBRARY_PATH +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + abort "upf/workflow.sh: Set CANDLE_DATA_DIR!" 
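With this guard in place, the key piece of environment the caller must supply for a containerized UPF run is CANDLE_DATA_DIR; the upf-gdrp-1.sh test added just below exports MODEL_NAME, CANDLE_MODEL_TYPE, and CANDLE_IMAGE itself. A minimal invocation sketch (the data directory path is a placeholder):

    export CANDLE_DATA_DIR=/path/to/candle_data_dir   # placeholder location
    ./test/upf-gdrp-1.sh polaris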
+fi + swift-t -n $PROCS \ -o $TURBINE_OUTPUT/workflow.tic \ ${MACHINE:-} \ @@ -107,13 +112,15 @@ swift-t -n $PROCS \ -e MODEL_SH \ -e SITE \ -e BENCHMARK_TIMEOUT \ - -e MODEL_NAME \ + -e MODEL_NAME=${MODEL_NAME:-MODEL_NULL} \ -e OBJ_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ -e TURBINE_MPI_THREAD=${TURBINE_MPI_THREAD:-1} \ $( python_envs ) \ -e TURBINE_STDOUT=$TURBINE_STDOUT \ -e PYTHONUNBUFFERED=1 \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} # -e PYTHONVERBOSE=1 diff --git a/workflows/upf/test/upf-1.txt b/workflows/upf/test/upf-1.txt index fb4a4730..919b1d07 100644 --- a/workflows/upf/test/upf-1.txt +++ b/workflows/upf/test/upf-1.txt @@ -1,3 +1,3 @@ -{"id": "test1", "epochs": 1} -{"id": "test2", "epochs": 2} -{"id": "test3", "epochs": 3} +{"id": "RUN000", "epochs": 1} +{"id": "RUN001", "epochs": 2} +{"id": "RUN002", "epochs": 3} From afade789d59bbbdf4295ead7454ecd2c6645eabb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 15 Dec 2022 16:34:17 -0600 Subject: [PATCH 348/601] New UPF GDRP runner --- workflows/upf/test/upf-gdrp-1.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 workflows/upf/test/upf-gdrp-1.sh diff --git a/workflows/upf/test/upf-gdrp-1.sh b/workflows/upf/test/upf-gdrp-1.sh new file mode 100755 index 00000000..b696d6f1 --- /dev/null +++ b/workflows/upf/test/upf-gdrp-1.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +# TEST UPF GDRP 1 +# For GraphDRP + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +export OBJ_RETURN="val_loss" +CFG_SYS=$THIS/cfg-sys-1.sh + +export CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif +export CANDLE_MODEL_TYPE="SINGULARITY" +export MODEL_NAME="GraphDRP" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt From cc4bc19b187c8eb75a7ddcaf291350bb7ab54bdc Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 16 Dec 2022 10:04:04 +0000 Subject: [PATCH 349/601] Add singularity --- workflows/common/sh/env-polaris.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/common/sh/env-polaris.sh b/workflows/common/sh/env-polaris.sh index d1e4fb54..340ee835 100644 --- a/workflows/common/sh/env-polaris.sh +++ b/workflows/common/sh/env-polaris.sh @@ -1,5 +1,6 @@ # ENV Polaris +module load singularity # SWIFT_IMPL=echo SWIFT_IMPL=app From 0f90f1232df89b15e7b2f3d7eb5db093f7744396 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 16 Dec 2022 11:28:01 -0600 Subject: [PATCH 350/601] Move 'module load singularity' to compute node side --- workflows/common/sh/env-polaris.sh | 1 - workflows/common/sh/langs-app-polaris.sh | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/env-polaris.sh b/workflows/common/sh/env-polaris.sh index 340ee835..d1e4fb54 100644 --- a/workflows/common/sh/env-polaris.sh +++ b/workflows/common/sh/env-polaris.sh @@ -1,6 +1,5 @@ # ENV Polaris -module load singularity # SWIFT_IMPL=echo SWIFT_IMPL=app diff --git a/workflows/common/sh/langs-app-polaris.sh b/workflows/common/sh/langs-app-polaris.sh index 4be8be23..25ab0278 100644 --- a/workflows/common/sh/langs-app-polaris.sh +++ b/workflows/common/sh/langs-app-polaris.sh @@ -2,3 +2,5 @@ # LANGS APP Polaris PATH=/grand/CSC249ADOA01/public/sfw/polaris/Miniconda/bin:$PATH + +module load 
singularity From 3f328ab384c0b79071e1d31333ee78baa020ecb6 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 16 Dec 2022 11:30:13 -0600 Subject: [PATCH 351/601] Change model.sh API to support Singularity --- workflows/common/python/runner_utils.py | 42 ++++++++++++++-- workflows/common/sh/model.sh | 22 ++++---- workflows/common/swift/obj_app.swift | 67 ++++++++++++------------- workflows/upf/swift/workflow.swift | 10 ++-- 4 files changed, 87 insertions(+), 54 deletions(-) diff --git a/workflows/common/python/runner_utils.py b/workflows/common/python/runner_utils.py index 08142002..17d4fd4e 100644 --- a/workflows/common/python/runner_utils.py +++ b/workflows/common/python/runner_utils.py @@ -94,6 +94,23 @@ def write_params(params, hyper_parameter_map): f_out.write("{}={}\n".format(*kv)) +def expand_params(params, hyper_parameter_map): + parent_dir = (hyper_parameter_map["instance_directory"] + if "instance_directory" in hyper_parameter_map else ".") + result = "" + for k, v in params.items(): + if type(v) in DATA_TYPES: + v = DATA_TYPES[type(v)] + if isinstance(v, basestring): + v = "'{}'".format(v) + if k == "solr_root" or k == "timeout" or k == "id": + # this must written at the end + pass # Not a command-line parameter + else: + result += "--{} {} ".format(k, v) + return result + + def keras_clear_session(framework): if framework == "keras": # works around this error: @@ -133,18 +150,35 @@ def merge_params(defaults, params): def main(): # Need argparse - print("runner_utils.main(): " + str(sys.argv)) if sys.argv[1] == "write_params": # Merge params from the user-provided params file and # the workflow-generated parameters - # Assume we are in the correct directory for this file: - defaults = read_config_file_dict(sys.argv[3]) # Parse the workflow-provided JSON string: J = json.loads(sys.argv[2]) + # Assume we are in the correct directory for this file: + defaults = read_config_file_dict(sys.argv[3]) params = merge_params(defaults, J) print("params: " + str(params)) write_params(params, {}) - + elif sys.argv[1] == "expand_params": + # Merge params from the user-provided params file and + # the workflow-generated parameters and create + # a set of command line flags to pass to CANDLE parser_utils + if not (len(sys.argv) == 3 or len(sys.argv) == 4): + print("runner_utils: bad subcommand args: " + str(sys.argv)) + exit(1) + # Parse the workflow-provided JSON string: + params = json.loads(sys.argv[2]) + if len(sys.argv) == 3: + pass # No defaults, OK + elif len(sys.argv) == 4: + defaults = read_config_file_dict(sys.argv[3]) + params = merge_params(defaults, params) + params = expand_params(params, {}) + print(params) + else: + print("runner_utils: unknown subcommand: " + str(sys.argv)) + exit(1) if __name__ == "__main__": main() diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 5f936117..649b2e78 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -16,16 +16,17 @@ set -eu usage() { - echo "Usage: model.sh FRAMEWORK PARAMS RUNID" + echo "Usage: model.sh FRAMEWORK PARAMS EXPID RUNID" echo "The environment should have:" echo " EMEWS_PROJECT_ROOT|WORKFLOWS_ROOT TURBINE_OUTPUT" echo " SITE OBJ_RETURN BENCHMARK_TIMEOUT" echo " and MODEL_NAME EXPID for model_runner.py" - echo "If SH_TIMEOUT is provided, we run under the shell command timeout" + echo "If SH_TIMEOUT is set, we run under the shell command timeout" } -if (( ${#} != 3 )) +if (( ${#} != 4 )) then + echo "Wrong number of arguments: received ${#} , required: 4" usage exit 1 fi 
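Under the new signature, the wrapper is driven with four positional arguments; a sketch of a direct call, assuming the environment that usage() lists (WORKFLOWS_ROOT, TURBINE_OUTPUT, SITE, MODEL_NAME, and so on) is already set, with an illustrative JSON string:

    PARAMS='{"id": "RUN000", "epochs": 1}'
    bash $WORKFLOWS_ROOT/common/sh/model.sh keras "$PARAMS" EXP001 RUN000

This mirrors the call that obj_app.swift issues further down: "bash" model_sh FRAMEWORK params expid runid.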
@@ -33,7 +34,8 @@ fi FRAMEWORK=$1 # Usually "keras" or "pytorch" # JSON string of parameters: PARAMS="$2" -RUNID=$3 +EXPID=$3 +RUNID=$4 # Each model run, runs in its own "instance" directory # Set instance_directory to that and cd into it. @@ -42,7 +44,9 @@ set -x echo CMT $CANDLE_MODEL_TYPE if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] then + # TODO: Rename "instance" to "run" INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/$MODEL_NAME/Output/$EXPID/$RUNID + INTERNAL_DIRECTORY=$MODEL_NAME/Output/$EXPID/$RUNID else # "BENCHMARKS" INSTANCE_DIRECTORY=$TURBINE_OUTPUT/$RUNID fi @@ -50,6 +54,7 @@ fi # All stdout/stderr after this point goes into model.log ! mkdir -pv $INSTANCE_DIRECTORY LOG_FILE=$INSTANCE_DIRECTORY/model.log +echo "redirecting to: LOG_FILE=$INSTANCE_DIRECTORY/model.log" set +x exec >> $LOG_FILE exec 2>&1 @@ -95,14 +100,13 @@ then # No model_runner, need to write parameters.txt explicitly: # get hyper_parameter_map to pass as 2nd argument - - python3 $WORKFLOWS_ROOT/common/python/runner_utils.py write_params \ - "$PARAMS" $INSTANCE_DIRECTORY/parameters.txt + FLAGS=$( python3 $WORKFLOWS_ROOT/common/python/runner_utils.py expand_params \ + "$PARAMS" ) MODEL_CMD=( singularity exec --nv --bind $CANDLE_DATA_DIR:/candle_data_dir $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET - $CANDLE_DATA_DIR - $INSTANCE_DIRECTORY/parameters.txt ) + /candle_data_dir + $FLAGS ) # $INTERNAL_DIRECTORY/parameters.txt else # "BENCHMARKS" # The Python command line arguments: diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index 1965149c..e2013b5f 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/obj_app.swift @@ -8,65 +8,60 @@ run_id : A string run ID that will be the output directory name */ (string obj_result) obj(string params, - string run_id) { + string expid, + string runid) +{ string model_sh = getenv("MODEL_SH"); string turbine_output = getenv("TURBINE_OUTPUT"); + string model_name = getenv("MODEL_NAME"); string outdir; - outdir = "%s/run/%s" % (turbine_output, run_id); - - // Comment: this is not needed as turbine_output has already been adjusted - // string myenv = getenv("CANDLE_MODEL_TYPE"); - // if (myenv == "SINGULARITY") { - // outdir = "%s/run/%s" % (turbine_output, run_id); - // } else { - // // outdir = "%s/output/%s/run/%s" % (getenv("CANDLE_DATA_DIR"), getenv("EXPID"), run_id); - // } + outdir = "%s/%s/Output/%s/%s" % (turbine_output, model_name, expid, runid); printf("running model shell script in: %s", outdir); // We do not use a file type here because this file may not be created, // which is handled by get_results() string result_file = outdir/"result.txt"; - wait (run_model(model_sh, params, run_id)) + wait (run_model(model_sh, params, expid, runid)) { obj_result = get_results(result_file); } - printf("result(%s): %s", run_id, obj_result); + printf("result(%s): %s", runid, obj_result); } -/** - The main objective function used by the CANDLE/Supervisor - model exploration (optimization) loop. - params : The JSON string of params to be passed to the Benchmark - run_id : A string run ID that will be the output directory name -*/ -(string obj_result) obj_prio(string params, - string run_id, int prio) { - string model_sh = getenv("MODEL_SH"); - string turbine_output = getenv("TURBINE_OUTPUT"); +// /** +// The main objective function used by the CANDLE/Supervisor +// model exploration (optimization) loop. 
+// params : The JSON string of params to be passed to the Benchmark +// run_id : A string run ID that will be the output directory name +// */ +// (string obj_result) obj_prio(string params, +// string run_id, int prio) { +// string model_sh = getenv("MODEL_SH"); +// string turbine_output = getenv("TURBINE_OUTPUT"); - // printf("running model shell script in: %s", outdir); - // We do not use a file type here because this file may not be created, - // which is handled by get_results() - string outdir = "%s/run/%s" % (turbine_output, run_id); - string result_file = outdir/"result.txt"; - wait (@prio=prio run_model(model_sh, params, run_id)) - { - obj_result = get_results(result_file); - } - printf("result(%s): %s", run_id, obj_result); -} +// // printf("running model shell script in: %s", outdir); +// // We do not use a file type here because this file may not be created, +// // which is handled by get_results() +// string outdir = "%s/run/%s" % (turbine_output, run_id); +// string result_file = outdir/"result.txt"; +// wait (@prio=prio run_model(model_sh, params, expidrun_id)) +// { +// obj_result = get_results(result_file); +// } +// printf("result(%s): %s", run_id, obj_result); +// } /** Swift/T app function that runs the Benchmark */ app (void o) run_model (string model_sh, string params, - string runid) + string expid, string runid) { - // 1 2 3 - "bash" model_sh FRAMEWORK params runid; + // 1 2 3 4 + "bash" model_sh FRAMEWORK params expid runid; } /** diff --git a/workflows/upf/swift/workflow.swift b/workflows/upf/swift/workflow.swift index 687a18ab..09b36ef9 100644 --- a/workflows/upf/swift/workflow.swift +++ b/workflows/upf/swift/workflow.swift @@ -17,11 +17,11 @@ report_env(); string FRAMEWORK = "keras"; // Scan command line -file upf = input(argv("f")); -int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +file upf = input(argv("f")); +int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); string model_name = getenv("MODEL_NAME"); -string exp_id = getenv("EXPID"); +string expid = getenv("EXPID"); string turbine_output = getenv("TURBINE_OUTPUT"); // Report some key facts: @@ -38,8 +38,8 @@ string results[]; foreach params,i in upf_lines { printf("params: %s", params); - id = json_get(params, "id"); - results[i] = obj(params, id); + runid = json_get(params, "id"); + results[i] = obj(params, expid, runid); assert(results[i] != "EXCEPTION", "exception in obj()!"); } From 1e060b589daf575d363509a887a253cb3cdfc6a1 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Mon, 19 Dec 2022 21:28:49 +0000 Subject: [PATCH 352/601] o Fixes for Singularity, ran on Polaris --- workflows/common/sh/model.sh | 16 ++++++++++++---- workflows/common/swift/obj_app.swift | 3 ++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 649b2e78..54fb117b 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -40,7 +40,7 @@ RUNID=$4 # Each model run, runs in its own "instance" directory # Set instance_directory to that and cd into it. 
# # TODO: rename INSTANCE_DIRECTORY to OUTPUT_DIR -set -x +#set -x echo CMT $CANDLE_MODEL_TYPE if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] then @@ -92,7 +92,7 @@ echo log "USING PYTHON:" $( which python ) echo -set -x +#set -x # Construct the desired model command MODEL_CMD based on CANDLE_MODEL_TYPE: if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] then @@ -129,11 +129,19 @@ $TIMEOUT_CMD "${MODEL_CMD[@]}" & if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] then # grep for Singularity process and wait + sleep 3 #sleep for sometime so that job id is available PID=$(ps ux | awk '/[S]ingularity/{print $2}') + PID2=$(ps ux | grep '[S]ingularity') + echo $PID, "--is the PID..and PID2:", $PID2 wait $PID - + ls -ltrh + sleep 1 # get results of the format Loss: xxx last occurence of in the model.log file - RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1) + #RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1) + # using set -x will break the following + RES=$(awk -v FS="IMPROVE_RESULT" 'NF>1 {x=$2} END {print x}' model.log) + echo $RES + RESULT="$(echo $RES | grep -Eo '[+-]?[0-9]+([.][0-9]+)?')" echo $RESULT > $INSTANCE_DIRECTORY/result.txt else PID=$! diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index e2013b5f..5a7b4b2c 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/obj_app.swift @@ -17,7 +17,8 @@ string outdir; - outdir = "%s/%s/Output/%s/%s" % (turbine_output, model_name, expid, runid); + outdir = "%s/%s" % (turbine_output, runid); + // outdir = "%s/%s/Output/%s/%s" % (turbine_output, model_name, expid, runid); printf("running model shell script in: %s", outdir); From e89b615893baa906d701639ed7353b06df2efe63 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 20 Dec 2022 14:29:04 -0600 Subject: [PATCH 353/601] Better configuration reporting --- workflows/common/R/install-candle.sh | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/workflows/common/R/install-candle.sh b/workflows/common/R/install-candle.sh index 8c30626b..afaa537e 100755 --- a/workflows/common/R/install-candle.sh +++ b/workflows/common/R/install-candle.sh @@ -23,9 +23,30 @@ done echo "This will install multiple R packages for CANDLE." echo -echo "using R: $( which R )" -echo "using gcc: $( which gcc )" -echo "using gfortran: $( which gfortran )" + +if ! command which R > /dev/null +then + echo "No R found!" 
+ exit 1 +fi + +echo "variables:" +set +u # These variables may be unset +for var in CC CXX FC +do + printf "using %-8s = %s\n" $var ${!var} +done +echo +set -u + +echo "tools:" +for tool in R cc CC gcc g++ ftn gfortran +do + if command which $tool > /dev/null 2>&1 + then + printf "using %-10s %s\n" "${tool}:" $( which $tool ) + fi +done echo if [ $CONFIRM = 1 ] From 30b991d57f0be7b919cd8aa2b318664e17a05c82 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 20 Dec 2022 14:29:28 -0600 Subject: [PATCH 354/601] R settings for Polaris --- workflows/common/sh/env-polaris.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/env-polaris.sh b/workflows/common/sh/env-polaris.sh index d1e4fb54..47aadaa6 100644 --- a/workflows/common/sh/env-polaris.sh +++ b/workflows/common/sh/env-polaris.sh @@ -15,4 +15,7 @@ PATH=$SWIFT/turbine/bin:$PATH PY=$ROOT/Miniconda PATH=$PY/bin:$PATH -EQR=not-installed +R_HOME=$ROOT/R-4.2.2/lib64/R +EQR=$ROOT/EQ-R + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:$R_HOME/lib From 6a2f4a6499b3d6a2eb66238cceb20cd66ee5158e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 20 Dec 2022 14:29:36 -0600 Subject: [PATCH 355/601] Mark this as obsolete --- workflows/known-benchmarks.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/known-benchmarks.sh b/workflows/known-benchmarks.sh index c630b2e5..29239367 100644 --- a/workflows/known-benchmarks.sh +++ b/workflows/known-benchmarks.sh @@ -1,4 +1,6 @@ +# OBSOLETED: see common/sh/set-pythonpath.sh 2022-12-20 + # Known Benchmarks # Generate the list of Benchmarks that Supervisor knows about # To add a Known Benchmark, add its paths to BENCHMARKS_DIRS_BASE below From ce2cd0077eb23df01d5a8184d62bcd4b9ece4519 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 20 Dec 2022 14:29:51 -0600 Subject: [PATCH 356/601] Use realpath --- workflows/upf/swift/workflow.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 8e6ae9ac..1f6dbc8c 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -4,9 +4,9 @@ set -eu # UPF WORKFLOW SH # Autodetect this workflow directory -export EMEWS_PROJECT_ROOT=$( readlink --canonicalize $( dirname $0 )/.. ) -export WORKFLOWS_ROOT=$( readlink --canonicalize $EMEWS_PROJECT_ROOT/.. ) -export BENCHMARKS_ROOT=$( readlink --canonicalize $EMEWS_PROJECT_ROOT/../../../Benchmarks ) +export EMEWS_PROJECT_ROOT=$( realpath $( dirname $0 )/.. ) +export WORKFLOWS_ROOT=$( realpath $EMEWS_PROJECT_ROOT/.. 
) +export BENCHMARKS_ROOT=$( realpath $EMEWS_PROJECT_ROOT/../../../Benchmarks ) BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot3/P3B1:$BENCHMARKS_ROOT/Pilot3/P3B3:$BENCHMARKS_ROOT/Pilot3/P3B4:$BENCHMARKS_ROOT/Pilot3/P3B5 export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} From 22946fb21f48e378cff27e227cb40dcaee7cb9ea Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 20 Dec 2022 14:33:55 -0600 Subject: [PATCH 357/601] Updates for mlrMBO- almost works --- workflows/mlrMBO/swift/workflow.sh | 50 ++++++++------------------- workflows/mlrMBO/swift/workflow.swift | 4 +-- 2 files changed, 16 insertions(+), 38 deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 0ef5ee70..e2a15845 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -8,29 +8,13 @@ set -eu # Autodetect this workflow directory export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) -if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] -then - echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" - exit 1 -fi -BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd ) -export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} -BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Attn1:$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/examples/ADRP:$BENCHMARKS_ROOT/examples/xform-smiles export BENCHMARK_TIMEOUT -export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} SCRIPT_NAME=$(basename $0) # Source some utility functions used by EMEWS in this script source $WORKFLOWS_ROOT/common/sh/utils.sh -#source "${EMEWS_PROJECT_ROOT}/etc/emews_utils.sh" - moved to utils.sh - -# Uncomment to turn on Swift/T logging. Can also set TURBINE_LOG, -# TURBINE_DEBUG, and ADLB_DEBUG to 0 to turn off logging. -# Do not commit with logging enabled, users have run out of disk space -# export TURBINE_LOG=1 TURBINE_DEBUG=1 ADLB_DEBUG=1 - usage() { echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" @@ -69,22 +53,18 @@ get_cfg_sys $3 get_cfg_prm $4 MODEL_NAME=$5 -# Set PYTHONPATH for BENCHMARK related stuff -PYTHONPATH+=:$BENCHMARK_DIR # :$BENCHMARKS_ROOT/common # This is now candle_lib -# Set PYTHONPATH for BENCHMARK related stuff in obj_app mode -export APP_PYTHONPATH+=:$BENCHMARK_DIR # :$BENCHMARKS_ROOT/common # This is now candle_lib - source_site env $SITE source_site sched $SITE -PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs - if [[ ${EQR:-} == "" ]] then abort "The site '$SITE' did not set the location of EQ/R: this will not work!" fi -export TURBINE_JOBNAME="JOB:${EXPID}" +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + +export TURBINE_JOBNAME="MBO_${EXPID}" RESTART_FILE_ARG="" if [[ ${RESTART_FILE:-} != "" ]] @@ -144,10 +124,10 @@ then echo "Turbine will wait for job completion." fi -site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs to have hyphens in them as Justin implemented for Summit on 2020-10-29, e.g., summit-tf1 - -# Use for Summit (LSF needs two %)... 
actually, it may not be LSF as Biowulf (which uses SLURM) seems to need this too now -if [ ${site2:-} == "summit" ] || [ ${site2:-} == "biowulf" ] +# Handle %-escapes in TURBINE_STDOUT +if [ $SITE == "summit" ] || \ + [ $SITE == "biowulf" ] || \ + [ $SITE == "polaris" ] then export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" else @@ -156,8 +136,6 @@ fi mkdir -pv $TURBINE_OUTPUT/out -#swift-t -n $PROCS \ -# -o $TURBINE_OUTPUT/workflow.tic \ if [[ ${MACHINE:-} == "" ]] then STDOUT=$TURBINE_OUTPUT/output.txt @@ -179,7 +157,10 @@ then exit 1 fi -# ALW 2021-01-21: Please don't comment out the "-o $TURBINE_OUTPUT/workflow.tic" option below; otherwise, we get permissions issues on Biowulf. Thanks! +# We use 'swift-t -o' to allow swift-t to prevent scheduler errors +# on Biowulf. Reported by ALW 2021-01-21 + +( set -x swift-t -O 0 -n $PROCS \ -o $TURBINE_OUTPUT/workflow.tic \ @@ -209,6 +190,7 @@ swift-t -O 0 -n $PROCS \ $WAIT_ARG \ $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ tee $STDOUT +) if (( ${PIPESTATUS[0]} )) then @@ -216,8 +198,4 @@ then exit 1 fi -# echo "EXIT CODE: 0" | tee -a $STDOUT - -# Andrew: Needed this so that script to monitor job worked properly (queue_wait... function in utils.sh?) -# ALW 1/14/21: Removing this line again as I may not care about the job monitoring anymore and it clouds up the working directory -#echo $TURBINE_OUTPUT > turbine-directory.txt +echo "EXIT CODE: 0" | tee -a $STDOUT diff --git a/workflows/mlrMBO/swift/workflow.swift b/workflows/mlrMBO/swift/workflow.swift index fb211a91..46012dfa 100644 --- a/workflows/mlrMBO/swift/workflow.swift +++ b/workflows/mlrMBO/swift/workflow.swift @@ -92,8 +92,8 @@ string FRAMEWORK = "keras"; string results[]; foreach param, j in param_array { - results[j] = obj(param, - "%02i_%03i_%04i" % (restart_number,i,j)); + run_id = "%02i_%03i_%04i" % (restart_number,i,j); + results[j] = obj(param, exp_id, run_id); } string result = join(results, ";"); // printf(result); From bd392efd72b31233f033de62efaeff7c69941fff Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Dec 2022 11:36:08 -0600 Subject: [PATCH 358/601] Put prefix on messages --- workflows/common/swift/obj_app.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index 5a7b4b2c..6c11650f 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/obj_app.swift @@ -20,7 +20,7 @@ outdir = "%s/%s" % (turbine_output, runid); // outdir = "%s/%s/Output/%s/%s" % (turbine_output, model_name, expid, runid); - printf("running model shell script in: %s", outdir); + printf("obj_app: running model shell script in: %s", outdir); // We do not use a file type here because this file may not be created, // which is handled by get_results() @@ -29,7 +29,7 @@ { obj_result = get_results(result_file); } - printf("result(%s): %s", runid, obj_result); + printf("obj_app: result(%s): '%s'", runid, obj_result); } // /** From b4627227e7ebadab84ba6ec6a7d9a002d9f3a9fc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Dec 2022 11:36:18 -0600 Subject: [PATCH 359/601] Add Polaris to list of sites that need PYTHONPATH help --- workflows/common/sh/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index e298407d..f6e5573a 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -94,7 +94,7 @@ python_envs() then # We do not currently need this 
except on MCS and Spock: # Swift/T should grab PYTHONPATH automatically - if [[ ${SITE} == "mcs" ]] || [[ ${SITE} == "spock" ]] + if [[ ${SITE} == "mcs" ]] || [[ ${SITE} == "spock" ]] || [[ ${SITE} == "polaris" ]] then # MCS discards PYTHONPATH in subshells RESULT+=( -e PYTHONPATH=$PYTHONPATH ) From edf6a06f1b9a4c8258e25ff4fcaf771de31f7fa7 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Dec 2022 11:37:23 -0600 Subject: [PATCH 360/601] Better messaging --- workflows/mlrMBO/swift/workflow.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index e2a15845..012ea960 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -17,7 +17,8 @@ source $WORKFLOWS_ROOT/common/sh/utils.sh usage() { - echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME " \ + "[CANDLE_MODEL_TYPE] [CANDLE_IMAGE]" } if (( ${#} != 7 )) && (( ${#} != 5 )) @@ -43,8 +44,9 @@ fi TURBINE_OUTPUT="" if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] then - TURBINE_OUTPUT=$CANDLE_DATA_DIR/output/ - echo "Running "$MODEL_NAME "workflow with" $CANDLE_MODEL_TYPE "and image" $CANDLE_IMAGE + TURBINE_OUTPUT=$CANDLE_DATA_DIR/output + printf "Running mlrMBO workflow with model %s and image %s:%s\n" \ + $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE fi get_site $1 # Sets SITE @@ -58,7 +60,8 @@ source_site sched $SITE if [[ ${EQR:-} == "" ]] then - abort "The site '$SITE' did not set the location of EQ/R: this will not work!" + abort "The site '$SITE' did not set the location of EQ/R: " \ + "this will not work!" fi # Set up PYTHONPATH for model From 938e1f14ceafdfa07cf04339c47809cf5e27c16a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Dec 2022 11:37:39 -0600 Subject: [PATCH 361/601] Better if block --- workflows/mlrMBO/swift/workflow.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 012ea960..85025c75 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -27,15 +27,14 @@ then exit 1 fi -CANDLE_MODEL_TYPE="BENCHMARKS" - if (( ${#} == 7 )) then CANDLE_MODEL_TYPE=$6 CANDLE_IMAGE=$7 - elif (( ${#} == 5 )) - then - echo "Not a singularity run" +elif (( ${#} == 5 )) +then + CANDLE_MODEL_TYPE="BENCHMARKS" + CANDLE_IMAGE=NONE else usage exit 1 From faa60f45bbeb221faa7f9876db8e485796386ac1 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 21 Dec 2022 11:37:49 -0600 Subject: [PATCH 362/601] Turn on environment debugging --- workflows/mlrMBO/swift/workflow.swift | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.swift b/workflows/mlrMBO/swift/workflow.swift index 46012dfa..f50d0d75 100644 --- a/workflows/mlrMBO/swift/workflow.swift +++ b/workflows/mlrMBO/swift/workflow.swift @@ -14,11 +14,12 @@ import EQR; import R; import assert; import python; -/* Helper for reporting environment variables common/swift/candle_utils.swift -* import candle_utils; -* -* report_env(); -*/ + +// Helper for reporting environment variables for debugging +// Cf. common/swift/candle_utils.swift +// This can be removed as desired. 
+import candle_utils; +report_env(); string emews_root = getenv("EMEWS_PROJECT_ROOT"); string turbine_output = getenv("TURBINE_OUTPUT"); From 47508048cfd267ac4d02bb4dafa59704ecaa85dc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:22:48 -0600 Subject: [PATCH 363/601] Simplify PID handling --- workflows/common/sh/model.sh | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 54fb117b..af6fa556 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -125,26 +125,20 @@ log "MODEL_CMD: ${MODEL_CMD[@]}" # Run Python! $TIMEOUT_CMD "${MODEL_CMD[@]}" & +PID=$! if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] then - # grep for Singularity process and wait - sleep 3 #sleep for sometime so that job id is available - PID=$(ps ux | awk '/[S]ingularity/{print $2}') - PID2=$(ps ux | grep '[S]ingularity') - echo $PID, "--is the PID..and PID2:", $PID2 wait $PID ls -ltrh - sleep 1 - # get results of the format Loss: xxx last occurence of in the model.log file - #RESULT=$(awk -v FS="Loss:" 'NF>1{print $2}' model.log | tail -1) - # using set -x will break the following + sleep 1 # Wait for initial output + # Get last results of the format "IMPROVE_RESULT xxx" in model.log + # NOTE: Enabling set -x will break the following RES=$(awk -v FS="IMPROVE_RESULT" 'NF>1 {x=$2} END {print x}' model.log) echo $RES RESULT="$(echo $RES | grep -Eo '[+-]?[0-9]+([.][0-9]+)?')" echo $RESULT > $INSTANCE_DIRECTORY/result.txt else - PID=$! wait $PID CODE=$? if (( CODE )) From d0cf3c93303e8d80290ed1fdffd624fae69275db Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:23:18 -0600 Subject: [PATCH 364/601] Update Swift/T for Polaris --- workflows/common/sh/env-polaris.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-polaris.sh b/workflows/common/sh/env-polaris.sh index 47aadaa6..95da0749 100644 --- a/workflows/common/sh/env-polaris.sh +++ b/workflows/common/sh/env-polaris.sh @@ -6,7 +6,7 @@ SWIFT_IMPL=app CSC249=/lus/grand/projects/CSC249ADOA01 ROOT=$CSC249/public/sfw/polaris -SWIFT=$ROOT/swift-t/2022-11-28 +SWIFT=$ROOT/swift-t/2022-12-16 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH From 1242847629f04e6a59c0fadd1992d9d0ca773fbf Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:23:53 -0600 Subject: [PATCH 365/601] Propagate CANDLE/IMPROVE settings --- workflows/mlrMBO/swift/workflow.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 85025c75..21123a95 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -29,8 +29,8 @@ fi if (( ${#} == 7 )) then - CANDLE_MODEL_TYPE=$6 - CANDLE_IMAGE=$7 + export CANDLE_MODEL_TYPE=$6 + export CANDLE_IMAGE=$7 elif (( ${#} == 5 )) then CANDLE_MODEL_TYPE="BENCHMARKS" @@ -189,6 +189,8 @@ swift-t -O 0 -n $PROCS \ -e TURBINE_STDOUT \ -e IGNORE_ERRORS \ -e CANDLE_DATA_DIR \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ $WAIT_ARG \ $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ tee $STDOUT From ecad83b1c88e7f52800710e68e1c1a0ac63b6367 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:24:21 -0600 Subject: [PATCH 366/601] Spelling --- workflows/mlrMBO/test/cfg-sys-nightly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/test/cfg-sys-nightly.sh 
b/workflows/mlrMBO/test/cfg-sys-nightly.sh index ad2d8352..004b09dc 100644 --- a/workflows/mlrMBO/test/cfg-sys-nightly.sh +++ b/workflows/mlrMBO/test/cfg-sys-nightly.sh @@ -2,7 +2,7 @@ # MLRMBO CFG SYS 1 # The number of MPI processes -# Note that 2 processes are reserved for Swift/EMEMS +# Note that 2 processes are reserved for Swift/EMEWS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs export PROCS=${PROCS:-3} From d6482d044d976ecdf2b8bfae70c90b973438b72a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:24:47 -0600 Subject: [PATCH 367/601] Fix header --- workflows/mlrMBO/test/cfg-prm-nightly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/test/cfg-prm-nightly.sh b/workflows/mlrMBO/test/cfg-prm-nightly.sh index 50367b0a..64cb51db 100644 --- a/workflows/mlrMBO/test/cfg-prm-nightly.sh +++ b/workflows/mlrMBO/test/cfg-prm-nightly.sh @@ -1,4 +1,4 @@ -# CFG PRM 1 +# CFG PRM NIGHTLY # mlrMBO settings From 00c3add545ec993a10187123434c2735ad611d14 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:25:03 -0600 Subject: [PATCH 368/601] Better PARAM_SET_FILE handling --- workflows/mlrMBO/test/cfg-prm-nightly.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/mlrMBO/test/cfg-prm-nightly.sh b/workflows/mlrMBO/test/cfg-prm-nightly.sh index 64cb51db..cac57958 100644 --- a/workflows/mlrMBO/test/cfg-prm-nightly.sh +++ b/workflows/mlrMBO/test/cfg-prm-nightly.sh @@ -32,9 +32,9 @@ elif [ "$MODEL_NAME" = "graphdrp" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_small.R} elif [ "$MODEL_NAME" = "dummy" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/dummy_nightly.R} -elif [ "$PARAM_SET_FILE" != "" ]; then +elif [[ "${PARAM_SET_FILE:-}" != "" ]]; then PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} else - echo "Invalid model-" $MODEL_NAME + printf "Could not find PARAM_SET_FILE for model: '%s'\n" $MODEL_NAME exit 1 fi From 8310f247c7755e145a9b9d6de3c3575570ae5585 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:25:39 -0600 Subject: [PATCH 369/601] Do not need PYTHONPATH setting here for Polaris --- workflows/common/sh/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index f6e5573a..ce833f8f 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -94,7 +94,7 @@ python_envs() then # We do not currently need this except on MCS and Spock: # Swift/T should grab PYTHONPATH automatically - if [[ ${SITE} == "mcs" ]] || [[ ${SITE} == "spock" ]] || [[ ${SITE} == "polaris" ]] + if [[ ${SITE} == "mcs" ]] || [[ ${SITE} == "spock" ]] # || [[ ${SITE} == "polaris" ]] then # MCS discards PYTHONPATH in subshells RESULT+=( -e PYTHONPATH=$PYTHONPATH ) From 3c4f7e82ce5f4b5763e9dd4207ff673a568e5589 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:27:51 -0600 Subject: [PATCH 370/601] Initial test for GraphDRP --- workflows/mlrMBO/test/test-gdrp-1.sh | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100755 workflows/mlrMBO/test/test-gdrp-1.sh diff --git a/workflows/mlrMBO/test/test-gdrp-1.sh b/workflows/mlrMBO/test/test-gdrp-1.sh new file mode 100755 index 00000000..d45df5d4 --- /dev/null +++ b/workflows/mlrMBO/test/test-gdrp-1.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -eu + +# TEST MLRMBO GDRP 1 +# For GraphDRP + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + 
exit 1 +fi + +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-nightly.sh +export CFG_PRM=$THIS/cfg-prm-nightly.sh + +# Specify the R file for This file must be present in the $EMEWS_PROJECT_ROOT/R +export R_FILE=mlrMBO-mbo.R + +CANDLE_MODEL_TYPE="SINGULARITY" +CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif + +export MODEL_NAME="graphdrp" + +# Currently ignored: +export OBJ_RETURN="val_loss" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $CFG_PRM $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE From c57ca414bd35d2e448e740d18a7a59dd4de2af0d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:29:19 -0600 Subject: [PATCH 371/601] Clean up and support IMPROVE --- workflows/upf/swift/workflow.sh | 34 ++++++++------------------------- workflows/upf/test/upf-1.sh | 2 ++ 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 1f6dbc8c..75dbc32c 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -6,17 +6,12 @@ set -eu # Autodetect this workflow directory export EMEWS_PROJECT_ROOT=$( realpath $( dirname $0 )/.. ) export WORKFLOWS_ROOT=$( realpath $EMEWS_PROJECT_ROOT/.. ) -export BENCHMARKS_ROOT=$( realpath $EMEWS_PROJECT_ROOT/../../../Benchmarks ) -BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot3/P3B1:$BENCHMARKS_ROOT/Pilot3/P3B3:$BENCHMARKS_ROOT/Pilot3/P3B4:$BENCHMARKS_ROOT/Pilot3/P3B5 -export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} SCRIPT_NAME=$(basename $0) # Source some utility functions used by EMEWS in this script source $WORKFLOWS_ROOT/common/sh/utils.sh -export TURBINE_LOG=0 TURBINE_DEBUG=0 ADLB_DEBUG=0 - usage() { echo "UNROLLED PARAMETER FILE: usage: workflow.sh SITE EXPID CFG_SYS UPF" @@ -39,23 +34,13 @@ then exit 1 fi -# Set PYTHONPATH for BENCHMARK related stuff -PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common -PYTHONPATH+=:$WORKFLOWS_ROOT/common/python -export PYTHONPATH - -# Set PYTHONPATH for BENCHMARK related stuff in obj_app mode -export APP_PYTHONPATH+=:$BENCHMARK_DIR # :$BENCHMARKS_ROOT/common # This is now candle_lib - source_site env $SITE -source_site sched $SITE +source_site sched $SITE -log_path PYTHONPATH +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh -if [[ ${EQR:-} == "" ]] -then - abort "The site '$SITE' did not set the location of EQ/R: this will not work!" -fi +log_path PYTHONPATH export TURBINE_JOBNAME="UPF_${EXPID}" @@ -65,7 +50,6 @@ then OBJ_PARAM_ARG="--obj_param=$OBJ_PARAM" fi -# Andrew: Allows for custom model.sh if desired export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} export BENCHMARK_TIMEOUT @@ -85,22 +69,20 @@ cp $CFG_SYS $TURBINE_OUTPUT # Make run directory in advance to reduce contention mkdir -pv $TURBINE_OUTPUT/run -which swift-t - -# module list - cp -v $UPF $TURBINE_OUTPUT # TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" TURBINE_STDOUT= -log_path LD_LIBRARY_PATH - if [[ ${CANDLE_DATA_DIR:-} == "" ]] then abort "upf/workflow.sh: Set CANDLE_DATA_DIR!" 
fi +export CANDLE_IMAGE=${CANDLE_IMAGE:-} + +which swift-t + swift-t -n $PROCS \ -o $TURBINE_OUTPUT/workflow.tic \ ${MACHINE:-} \ diff --git a/workflows/upf/test/upf-1.sh b/workflows/upf/test/upf-1.sh index baf0a417..80d53808 100755 --- a/workflows/upf/test/upf-1.sh +++ b/workflows/upf/test/upf-1.sh @@ -21,4 +21,6 @@ export EMEWS_PROJECT_ROOT export OBJ_RETURN="val_loss" CFG_SYS=$THIS/cfg-sys-1.sh + +export CANDLE_MODEL_TYPE="BENCHMARKS" $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt From aecff19750c90daf4db00a841d027c014688805b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:30:50 -0600 Subject: [PATCH 372/601] New set-pythonpath to set the PYTHONPATH --- workflows/common/sh/set-pythonpath.sh | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 workflows/common/sh/set-pythonpath.sh diff --git a/workflows/common/sh/set-pythonpath.sh b/workflows/common/sh/set-pythonpath.sh new file mode 100644 index 00000000..48992148 --- /dev/null +++ b/workflows/common/sh/set-pythonpath.sh @@ -0,0 +1,41 @@ + +# SET PYTHONPATH SH +# Sets up BENCHMARKS_ROOT variable and PYTHONPATH for workflows +# For CANDLE models, BENCHMARKS_ROOT is the CANDLE Benchmarks repo +# EMEWS_PROJECT_ROOT should be set by the calling script +# User may set BENCHMARKS_ROOT to override defaults +# BENCHMARKS_ROOT must exist as directory, +# although it may be empty/unused +# Repo structure is Supervisor/workflows/PROJECT , +# with Benchmarks normally alongside Supervisor +# If MODEL_PYTHON_DIR is set, that is added to PYTHONPATH + +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd ) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} + +if [[ ! -d $BENCHMARKS_ROOT ]] +then + echo "Could not find BENCHMARKS_ROOT: '$BENCHMARKS_ROOT'" + return 1 +fi + +# This is now in candle_lib, which should be installed/available +# in the common compute-node Python environment: 2022-12-20 +# APP_PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common +# PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common + +export APP_PYTHONPATH=${APP_PYTHONPATH:-empty} + +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python + +# Add known CANDLE Benchmarks to PYTHONPATH +PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/P1B1 +PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/Attn1 +PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/NT3 +PYTHONPATH+=:$BENCHMARKS_ROOT/examples/ADRP +PYTHONPATH+=:$BENCHMARKS_ROOT/examples/xform-smiles + +if [[ ${MODEL_PYTHON_DIR:-} != "" ]] +then + PYTHONPATH+=:$MODEL_PYTHON_DIR +fi From da919eac0c443ffec544a5e28686d958a74af9f2 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:35:39 -0600 Subject: [PATCH 373/601] Fix header --- workflows/mlrMBO/test/cfg-sys-nightly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/test/cfg-sys-nightly.sh b/workflows/mlrMBO/test/cfg-sys-nightly.sh index 004b09dc..6ab13616 100644 --- a/workflows/mlrMBO/test/cfg-sys-nightly.sh +++ b/workflows/mlrMBO/test/cfg-sys-nightly.sh @@ -1,5 +1,5 @@ -# MLRMBO CFG SYS 1 +# MLRMBO CFG SYS NIGHTLY # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEWS From b1e89dd6633969e935d139b88dfca458b80be68e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:35:53 -0600 Subject: [PATCH 374/601] Fix PPN for single-node runs --- workflows/mlrMBO/test/cfg-sys-nightly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/test/cfg-sys-nightly.sh 
b/workflows/mlrMBO/test/cfg-sys-nightly.sh index 6ab13616..7170e7e0 100644 --- a/workflows/mlrMBO/test/cfg-sys-nightly.sh +++ b/workflows/mlrMBO/test/cfg-sys-nightly.sh @@ -8,7 +8,7 @@ export PROCS=${PROCS:-3} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} +export PPN=${PPN:-3} # For Theta: # export QUEUE=${QUEUE:-debug-flat-quad} From 62d38e05f2f535676207f287f8f31385436c16ef Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 13:37:16 -0600 Subject: [PATCH 375/601] Clean up --- workflows/mlrMBO/test/cfg-sys-nightly.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/workflows/mlrMBO/test/cfg-sys-nightly.sh b/workflows/mlrMBO/test/cfg-sys-nightly.sh index 7170e7e0..1de803a0 100644 --- a/workflows/mlrMBO/test/cfg-sys-nightly.sh +++ b/workflows/mlrMBO/test/cfg-sys-nightly.sh @@ -10,13 +10,7 @@ export PROCS=${PROCS:-3} # Cori has 32 cores per node, 128GB per node export PPN=${PPN:-3} -# For Theta: -# export QUEUE=${QUEUE:-debug-flat-quad} - export WALLTIME=${WALLTIME:-00:05:00} -# export WALLTIME=${WALLTIME:-120} - -#export PROJECT=Candle_ECP # Benchmark run timeout: benchmark run will timeout # after the specified number of seconds. From 602d3a669ba7baf817ded26459024fc3b271343c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 14:55:21 -0600 Subject: [PATCH 376/601] Add node/rank log model.log --- workflows/common/sh/model.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index af6fa556..23e0f59b 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -41,7 +41,6 @@ RUNID=$4 # Set instance_directory to that and cd into it. # # TODO: rename INSTANCE_DIRECTORY to OUTPUT_DIR #set -x -echo CMT $CANDLE_MODEL_TYPE if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] then # TODO: Rename "instance" to "run" @@ -74,6 +73,8 @@ log() log "START" log "MODEL_NAME: $MODEL_NAME" log "RUNID: $RUNID" +log "HOST: $( hostname )" +log "ADLB_RANK_OFFSET: $ADLB_RANK_OFFSET" # log "CANDLE_MODEL_TYPE: $CANDLE_MODEL_TYPE" # Source langs-app-{SITE} from workflow/common/sh/ (cf. utils.sh) From 1530cfd05877f5ccefab6616291e82d168cd0d96 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 15:33:12 -0600 Subject: [PATCH 377/601] Add better mlrMBO output --- workflows/common/R/mlrMBO-mbo.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/common/R/mlrMBO-mbo.R b/workflows/common/R/mlrMBO-mbo.R index feeeb490..75e607fe 100644 --- a/workflows/common/R/mlrMBO-mbo.R +++ b/workflows/common/R/mlrMBO-mbo.R @@ -29,16 +29,16 @@ string_params <- elements_of_lists_to_json(dots[[1L]]) # print(dots) # print(paste0("parallelMap2 called with list_param: ",string_params)) - # print(paste("parallelMap2 called with list size:", length(string_params))) + print(paste("mlrMBO: produced task count: ", length(dots[[1L]]))) OUT_put(string_params) string_results = IN_get() - st = proc.time() - st # Assumes results are in the form a;b;c # Note: can also handle vector returns for each, # i.e., a,b;c,d;e,f res <- string_to_list_of_vectors(string_results) + print(paste("mlrMBO: received result count:", length(res))) # using dummy time return(result_with_extras_if_exist(res,st[3])) } @@ -179,7 +179,7 @@ # This is a string of R code containing arguments to main_function(), # e.g., "max.budget = 110, max.iterations = 10, design.size = 10, ..." 
msg <- IN_get() - print(paste("Received params1 msg: ", msg)) + print(paste("Received mlrMBO configuration parameters msg: ", msg)) # Edit the R code to make a list constructor expression code = paste0("list(",msg,")") From b82668f473edfe023993084915a02a644ff8c677 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 23 Dec 2022 15:38:56 -0600 Subject: [PATCH 378/601] Fix comments --- workflows/mlrMBO/data/graphdrp_small.R | 2 +- workflows/mlrMBO/test/test-gdrp-1.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/mlrMBO/data/graphdrp_small.R b/workflows/mlrMBO/data/graphdrp_small.R index 07a199d2..5d6f5e01 100644 --- a/workflows/mlrMBO/data/graphdrp_small.R +++ b/workflows/mlrMBO/data/graphdrp_small.R @@ -1,5 +1,5 @@ -# NT3 Hyperparameter Search - Test 1 +# GraphDRP Hyperparameter Search - Test "small" # These parameters should stay small for short tests # and use no dense parameters to avoid mlrMBO crashes diff --git a/workflows/mlrMBO/test/test-gdrp-1.sh b/workflows/mlrMBO/test/test-gdrp-1.sh index d45df5d4..2abe7e89 100755 --- a/workflows/mlrMBO/test/test-gdrp-1.sh +++ b/workflows/mlrMBO/test/test-gdrp-1.sh @@ -22,7 +22,7 @@ export EMEWS_PROJECT_ROOT export CFG_SYS=$THIS/cfg-sys-nightly.sh export CFG_PRM=$THIS/cfg-prm-nightly.sh -# Specify the R file for This file must be present in the $EMEWS_PROJECT_ROOT/R +# Specify the mlrMBO algorithm R file export R_FILE=mlrMBO-mbo.R CANDLE_MODEL_TYPE="SINGULARITY" From 1330abca09d7fe9f5a61e0ad5ff1ebd13058e7fd Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Sat, 31 Dec 2022 16:59:44 +0000 Subject: [PATCH 379/601] Make GDRP a little bigger --- workflows/mlrMBO/data/graphdrp_small.R | 6 +++--- workflows/mlrMBO/test/cfg-prm-nightly.sh | 9 +++++---- workflows/mlrMBO/test/cfg-sys-nightly.sh | 7 ++++--- workflows/mlrMBO/test/test-gdrp-1.sh | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/workflows/mlrMBO/data/graphdrp_small.R b/workflows/mlrMBO/data/graphdrp_small.R index 5d6f5e01..7d2eb362 100644 --- a/workflows/mlrMBO/data/graphdrp_small.R +++ b/workflows/mlrMBO/data/graphdrp_small.R @@ -7,9 +7,9 @@ # the parameter names should match names of the arguments expected by the benchmark param.set <- makeParamSet( - makeDiscreteParam("test_batch", values = c(8, 16)), - makeIntegerParam("epochs", lower = 1, upper = 2), +# makeDiscreteParam("test_batch", values = c(8, 16)), + makeIntegerParam("epochs", lower = 50, upper = 60), # makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), - # makeNumericParam("dropout", lower = 0, upper = 0.9), + makeNumericParam("dropout", lower = 0.1, upper = 0.5), makeNumericParam("learning_rate", lower = 0.001, upper = 0.1) ) diff --git a/workflows/mlrMBO/test/cfg-prm-nightly.sh b/workflows/mlrMBO/test/cfg-prm-nightly.sh index cac57958..db39fe83 100644 --- a/workflows/mlrMBO/test/cfg-prm-nightly.sh +++ b/workflows/mlrMBO/test/cfg-prm-nightly.sh @@ -3,11 +3,11 @@ # mlrMBO settings # Total iterations -PROPOSE_POINTS=${PROPOSE_POINTS:-5} +PROPOSE_POINTS=${PROPOSE_POINTS:-10} MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} MAX_ITERATIONS=${MAX_ITERATIONS:-3} MAX_BUDGET=${MAX_BUDGET:-180} -DESIGN_SIZE=${DESIGN_SIZE:-5} +DESIGN_SIZE=${DESIGN_SIZE:-10} # TODO: move the following code to a utility library- # this is a configuration file @@ -35,6 +35,7 @@ elif [ "$MODEL_NAME" = "dummy" ]; then elif [[ "${PARAM_SET_FILE:-}" != "" ]]; then PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} else - printf "Could not find 
PARAM_SET_FILE for model: '%s'\n" $MODEL_NAME - exit 1 + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_small.R} +# printf "Could not find PARAM_SET_FILE for model: '%s'\n" $MODEL_NAME +# exit 1 fi diff --git a/workflows/mlrMBO/test/cfg-sys-nightly.sh b/workflows/mlrMBO/test/cfg-sys-nightly.sh index 1de803a0..9792abc2 100644 --- a/workflows/mlrMBO/test/cfg-sys-nightly.sh +++ b/workflows/mlrMBO/test/cfg-sys-nightly.sh @@ -4,13 +4,14 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEWS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-3} +export PROCS=${PROCS:-16} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-3} +export PPN=${PPN:-4} -export WALLTIME=${WALLTIME:-00:05:00} +export QUEUE=${QUEUE:-debug-scaling} +export WALLTIME=${WALLTIME:-00:60:00} # Benchmark run timeout: benchmark run will timeout # after the specified number of seconds. diff --git a/workflows/mlrMBO/test/test-gdrp-1.sh b/workflows/mlrMBO/test/test-gdrp-1.sh index 2abe7e89..2c7e180d 100755 --- a/workflows/mlrMBO/test/test-gdrp-1.sh +++ b/workflows/mlrMBO/test/test-gdrp-1.sh @@ -23,7 +23,7 @@ export CFG_SYS=$THIS/cfg-sys-nightly.sh export CFG_PRM=$THIS/cfg-prm-nightly.sh # Specify the mlrMBO algorithm R file -export R_FILE=mlrMBO-mbo.R +export R_FILE=mlrMBO-ils.R CANDLE_MODEL_TYPE="SINGULARITY" CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif From be3b526ee939c75c079d1e5ea0bd6020a9c73f3e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 9 Jan 2023 15:27:13 -0600 Subject: [PATCH 380/601] Move APP_PYTHONPATH logic --- workflows/common/sh/langs-app-lambda.sh | 11 ----------- workflows/common/sh/model.sh | 12 ++++++++++-- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/workflows/common/sh/langs-app-lambda.sh b/workflows/common/sh/langs-app-lambda.sh index 82e0943b..51bb3390 100644 --- a/workflows/common/sh/langs-app-lambda.sh +++ b/workflows/common/sh/langs-app-lambda.sh @@ -9,15 +9,4 @@ PY=$SFW/Anaconda PATH=$PY/bin:$PATH -echo "Programs:" -which python - -export PYTHONPATH=${APP_PYTHONPATH:-}:${PYTHONPATH:-} - -# Cf. utils.sh -log_path APP_PYTHONPATH -log_path PYTHONPATH -log_path LD_LIBRARY_PATH -show PYTHONHOME - echo "langs-app-lambda done." diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 23e0f59b..ada0c03e 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -90,10 +90,18 @@ log "PARAMS:" echo $PARAMS | print_json echo -log "USING PYTHON:" $( which python ) +log "USING PYTHON:" $( which python3 ) echo -#set -x +# Cf. 
utils.sh +log_path APP_PYTHONPATH +log_path PYTHONPATH +log_path LD_LIBRARY_PATH +show PYTHONHOME + +# Set up PYTHONPATH for app tasks +export PYTHONPATH=${APP_PYTHONPATH:-}:${PYTHONPATH:-} + # Construct the desired model command MODEL_CMD based on CANDLE_MODEL_TYPE: if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] then From 2dbd5cb5b2f34fc94f786817660da1cbe0a4b629 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 11 Jan 2023 12:21:23 -0800 Subject: [PATCH 381/601] o working version on lambda0 --- workflows/common/sh/set-pythonpath.sh | 3 ++- workflows/mlrMBO/data/graphdrp_small.R | 4 ++-- workflows/mlrMBO/test/cfg-prm-nightly.sh | 4 ++-- workflows/mlrMBO/test/cfg-sys-nightly.sh | 4 ++-- workflows/mlrMBO/test/test-gdrp-1.sh | 6 ++++-- workflows/mlrMBO/test/test-graphdrp-lambda0.sh | 2 +- 6 files changed, 13 insertions(+), 10 deletions(-) diff --git a/workflows/common/sh/set-pythonpath.sh b/workflows/common/sh/set-pythonpath.sh index 48992148..184d76a0 100644 --- a/workflows/common/sh/set-pythonpath.sh +++ b/workflows/common/sh/set-pythonpath.sh @@ -24,7 +24,6 @@ fi # APP_PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common # PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common -export APP_PYTHONPATH=${APP_PYTHONPATH:-empty} PYTHONPATH+=:$WORKFLOWS_ROOT/common/python @@ -35,6 +34,8 @@ PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/NT3 PYTHONPATH+=:$BENCHMARKS_ROOT/examples/ADRP PYTHONPATH+=:$BENCHMARKS_ROOT/examples/xform-smiles +export APP_PYTHONPATH=${APP_PYTHONPATH:-$PYTHONPATH} + if [[ ${MODEL_PYTHON_DIR:-} != "" ]] then PYTHONPATH+=:$MODEL_PYTHON_DIR diff --git a/workflows/mlrMBO/data/graphdrp_small.R b/workflows/mlrMBO/data/graphdrp_small.R index 7d2eb362..785d2bf1 100644 --- a/workflows/mlrMBO/data/graphdrp_small.R +++ b/workflows/mlrMBO/data/graphdrp_small.R @@ -8,8 +8,8 @@ param.set <- makeParamSet( # makeDiscreteParam("test_batch", values = c(8, 16)), - makeIntegerParam("epochs", lower = 50, upper = 60), + makeIntegerParam("epochs", lower = 3, upper = 4), # makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), - makeNumericParam("dropout", lower = 0.1, upper = 0.5), + # makeNumericParam("dropout", lower = 0.1, upper = 0.5), makeNumericParam("learning_rate", lower = 0.001, upper = 0.1) ) diff --git a/workflows/mlrMBO/test/cfg-prm-nightly.sh b/workflows/mlrMBO/test/cfg-prm-nightly.sh index db39fe83..befe3ca0 100644 --- a/workflows/mlrMBO/test/cfg-prm-nightly.sh +++ b/workflows/mlrMBO/test/cfg-prm-nightly.sh @@ -3,11 +3,11 @@ # mlrMBO settings # Total iterations -PROPOSE_POINTS=${PROPOSE_POINTS:-10} +PROPOSE_POINTS=${PROPOSE_POINTS:-5} MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} MAX_ITERATIONS=${MAX_ITERATIONS:-3} MAX_BUDGET=${MAX_BUDGET:-180} -DESIGN_SIZE=${DESIGN_SIZE:-10} +DESIGN_SIZE=${DESIGN_SIZE:-5} # TODO: move the following code to a utility library- # this is a configuration file diff --git a/workflows/mlrMBO/test/cfg-sys-nightly.sh b/workflows/mlrMBO/test/cfg-sys-nightly.sh index 9792abc2..c7d1eaf1 100644 --- a/workflows/mlrMBO/test/cfg-sys-nightly.sh +++ b/workflows/mlrMBO/test/cfg-sys-nightly.sh @@ -4,11 +4,11 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEWS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-16} +export PROCS=${PROCS:-7} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-4} +export PPN=${PPN:-1} export QUEUE=${QUEUE:-debug-scaling} export WALLTIME=${WALLTIME:-00:60:00} diff --git 
a/workflows/mlrMBO/test/test-gdrp-1.sh b/workflows/mlrMBO/test/test-gdrp-1.sh index 2c7e180d..28e137d2 100755 --- a/workflows/mlrMBO/test/test-gdrp-1.sh +++ b/workflows/mlrMBO/test/test-gdrp-1.sh @@ -23,10 +23,12 @@ export CFG_SYS=$THIS/cfg-sys-nightly.sh export CFG_PRM=$THIS/cfg-prm-nightly.sh # Specify the mlrMBO algorithm R file -export R_FILE=mlrMBO-ils.R +export R_FILE=mlrMBO-mbo.R CANDLE_MODEL_TYPE="SINGULARITY" -CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif +# CANDLE_IMAGE=/software/improve/images/GraphDRP.sif # lambda +CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif # Polaris + export MODEL_NAME="graphdrp" diff --git a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh index 26634464..00d55539 100755 --- a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh +++ b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh @@ -48,7 +48,7 @@ then fi export CANDLE_MODEL_TYPE="SINGULARITY" -export CANDLE_IMAGE="/software/improve/images/GraphDRP\:0.0.1-20221109.sif" +export CANDLE_IMAGE="/software/improve/images/GraphDRP.sif" export INIT_PARAMS_FILE="/software/improve/graphdrp_default_model.txt" # Submit job From 9863d050544441d6e14da96e1ea95d9ef88c4d76 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 11 Jan 2023 15:23:25 -0600 Subject: [PATCH 382/601] Configure EQ-Py on Lambda --- workflows/common/sh/env-lambda.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/common/sh/env-lambda.sh b/workflows/common/sh/env-lambda.sh index 6caf7ef1..a533e414 100644 --- a/workflows/common/sh/env-lambda.sh +++ b/workflows/common/sh/env-lambda.sh @@ -7,6 +7,7 @@ SFW=/homes/woz/Public/sfw SWIFT=$SFW/swift-t/2022-11-02 PY=$SFW/Anaconda +EQPY=$SFW/EQ-Py EQR=$SFW/EQ-R R=$SFW/R-4.1.0 @@ -18,6 +19,8 @@ export LD_LIBRARY_PATH=$R/lib/R/lib:${LD_LIBRARY_PATH:-} # How to run CANDLE models: SWIFT_IMPL="app" +PYTHONPATH=$EQPY/src:${PYTHONPATH:-} + # Log settings to output echo "Programs:" which python swift-t | nl From 554a328e93afa357beb09b512b1655b9c5a80580 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 11 Jan 2023 15:36:57 -0600 Subject: [PATCH 383/601] Add Lambda to list of systems with Swift/T PYTHONPATH --- workflows/common/sh/utils.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index ce833f8f..f1f53f73 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -94,7 +94,9 @@ python_envs() then # We do not currently need this except on MCS and Spock: # Swift/T should grab PYTHONPATH automatically - if [[ ${SITE} == "mcs" ]] || [[ ${SITE} == "spock" ]] # || [[ ${SITE} == "polaris" ]] + if [[ ${SITE} == "mcs" ]] || \ + [[ ${SITE} == "spock" ]] || \ + [[ ${SITE} == "lambda" ]] then # MCS discards PYTHONPATH in subshells RESULT+=( -e PYTHONPATH=$PYTHONPATH ) From 97b67b460f4e96bcc2423689e148c6aba1ebe663 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 11 Jan 2023 14:15:12 -0800 Subject: [PATCH 384/601] o Changes for GA --- workflows/GA/swift/workflow.sh | 5 ++++- workflows/GA/swift/workflow.swift | 2 +- workflows/common/sh/env-lambda.sh | 7 +++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index d85a4b08..0640f23d 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -60,6 +60,9 @@ source_site sched $SITE EQPY=${EQPY:-$WORKFLOWS_ROOT/common/ext/EQ-Py} +# Set up PYTHONPATH for model +source 
$WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + # Set PYTHONPATH for BENCHMARK related stuff source $WORKFLOWS_ROOT/known-benchmarks.sh PYTHONPATH+=:$EQPY @@ -139,7 +142,7 @@ fi echo APP_PYPATH $APP_PYTHONPATH - +set -x swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p -I $EQPY -r $EQPY \ diff --git a/workflows/GA/swift/workflow.swift b/workflows/GA/swift/workflow.swift index dbca265a..540811c1 100644 --- a/workflows/GA/swift/workflow.swift +++ b/workflows/GA/swift/workflow.swift @@ -78,7 +78,7 @@ string FRAMEWORK = "keras"; string results[]; foreach param, j in param_array { - results[j] = obj(param, "%00i_%000i_%0000i" % (restart_number,i,j)); + results[j] = obj(param, exp_id, "%00i_%000i_%0000i" % (restart_number,i,j)); } string res = join(results, ";"); // printf(res); diff --git a/workflows/common/sh/env-lambda.sh b/workflows/common/sh/env-lambda.sh index a533e414..4a7cb8ed 100644 --- a/workflows/common/sh/env-lambda.sh +++ b/workflows/common/sh/env-lambda.sh @@ -7,7 +7,7 @@ SFW=/homes/woz/Public/sfw SWIFT=$SFW/swift-t/2022-11-02 PY=$SFW/Anaconda -EQPY=$SFW/EQ-Py +# EQPY=$SFW/EQ-Py EQR=$SFW/EQ-R R=$SFW/R-4.1.0 @@ -19,12 +19,11 @@ export LD_LIBRARY_PATH=$R/lib/R/lib:${LD_LIBRARY_PATH:-} # How to run CANDLE models: SWIFT_IMPL="app" -PYTHONPATH=$EQPY/src:${PYTHONPATH:-} +# PYTHONPATH=$EQPY/src:${PYTHONPATH:-} # Log settings to output echo "Programs:" which python swift-t | nl # Cf. utils.sh show PYTHONHOME -log_path LD_LIBRARY_PATH -log_path PYTHONPATH +log_path LD_LIBRARY_PATH \ No newline at end of file From 627483f459674d9083a4262761ecc5cbf2425999 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 11 Jan 2023 16:35:10 -0600 Subject: [PATCH 385/601] Drop known-benchmarks - see set-pythonpath.sh --- workflows/known-benchmarks.sh | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 workflows/known-benchmarks.sh diff --git a/workflows/known-benchmarks.sh b/workflows/known-benchmarks.sh deleted file mode 100644 index 29239367..00000000 --- a/workflows/known-benchmarks.sh +++ /dev/null @@ -1,30 +0,0 @@ - -# OBSOLETED: see common/sh/set-pythonpath.sh 2022-12-20 - -# Known Benchmarks -# Generate the list of Benchmarks that Supervisor knows about -# To add a Known Benchmark, add its paths to BENCHMARKS_DIRS_BASE below -# To call an unknown model, -# set environment variable MODEL_NAME to the short name -# set PYTHONPATH and/or APP_PYTHONPATH as needed - -BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd ) -export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} - -BENCHMARKS_DIRS_BASE="" -BENCHMARKS_DIRS_BASE+=$BENCHMARKS_ROOT/Pilot1/P1B1: -BENCHMARKS_DIRS_BASE+=$BENCHMARKS_ROOT/Pilot1/Attn1: -BENCHMARKS_DIRS_BASE+=$BENCHMARKS_ROOT/Pilot1/NT3: -BENCHMARKS_DIRS_BASE+=$BENCHMARKS_ROOT/examples/ADRP: -BENCHMARKS_DIRS_BASE+=$BENCHMARKS_ROOT/examples/xform-smiles - -export BENCHMARK_TIMEOUT -export BENCHMARK_DIRS=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} - -# Set PYTHONPATH and/or APP_PYTHONPATH appropriately based on SWIFT_IMPL -# ... 
- -APP_PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common - PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common - -export APP_PYTHONPATH From f904a5d1390715aab87efe5364db3334fba7c291 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 11 Jan 2023 14:36:05 -0800 Subject: [PATCH 386/601] o Run one job at a time and don't use Benchmarks/common for candle lib --- workflows/GA/data/nt3_param_space_ga.json | 4 ++-- workflows/GA/swift/workflow.sh | 5 +---- workflows/GA/test/cfg-sys-1.sh | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/workflows/GA/data/nt3_param_space_ga.json b/workflows/GA/data/nt3_param_space_ga.json index bf946317..9c21f4f4 100644 --- a/workflows/GA/data/nt3_param_space_ga.json +++ b/workflows/GA/data/nt3_param_space_ga.json @@ -10,8 +10,8 @@ { "name": "epochs", "type": "int", - "lower": 100, - "upper": 500, + "lower": 1, + "upper": 5, "sigma": 20 }, diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 0640f23d..a5d1454f 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -13,10 +13,8 @@ then echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" exit 1 fi -export BENCHMARKS_ROOT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) -BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/TC1:$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot2/P2B1 + export BENCHMARK_TIMEOUT -export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} SCRIPT_NAME=$(basename $0) @@ -64,7 +62,6 @@ EQPY=${EQPY:-$WORKFLOWS_ROOT/common/ext/EQ-Py} source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh # Set PYTHONPATH for BENCHMARK related stuff -source $WORKFLOWS_ROOT/known-benchmarks.sh PYTHONPATH+=:$EQPY PYTHONPATH+=:$WORKFLOWS_ROOT/common/python diff --git a/workflows/GA/test/cfg-sys-1.sh b/workflows/GA/test/cfg-sys-1.sh index af1a6ca3..d9380d47 100644 --- a/workflows/GA/test/cfg-sys-1.sh +++ b/workflows/GA/test/cfg-sys-1.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-4} +export PROCS=${PROCS:-3} # MPI processes per node # Cori has 32 cores per node, 128GB per node From 85b126bd6ab282732c1fe9ee0619abc2d1edc96f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 11 Jan 2023 16:51:55 -0600 Subject: [PATCH 387/601] Improve log_path() for unset variables --- workflows/common/sh/utils.sh | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index f1f53f73..4cdfa462 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -61,13 +61,19 @@ show() } log_path() -# Pretty print a colon-separated variable +# Pretty print a colon-separated variable, one entry per line # Provide the name of the variable (no dollar sign) { - echo ${1}: - eval echo \$$1 | tr : '\n' | nl --number-width=2 --number-separator ": " - echo -- - echo + # First, test if $1 is the name of a set shell variable: + if eval test \$\{$1:-\} + then + echo ${1}: + eval echo \$$1 | tr : '\n' | nl + echo -- + echo + else + echo "log_path(): ${1} is unset." 
+ fi } which_check() From 105a04eb1e0440336df12d36ce2722f5d8d105e0 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 12 Jan 2023 13:43:47 -0600 Subject: [PATCH 388/601] Initial CANDLE-compliant model --- models/Comparator/cmp_baseline_keras2.py | 39 ++++++++++++++++++++++++ models/Comparator/cmp_default_model.txt | 6 ++++ 2 files changed, 45 insertions(+) create mode 100644 models/Comparator/cmp_baseline_keras2.py create mode 100644 models/Comparator/cmp_default_model.txt diff --git a/models/Comparator/cmp_baseline_keras2.py b/models/Comparator/cmp_baseline_keras2.py new file mode 100644 index 00000000..97e1b0ba --- /dev/null +++ b/models/Comparator/cmp_baseline_keras2.py @@ -0,0 +1,39 @@ + +import os + +import candle + + +class Comparator(candle.Benchmark): + pass + + +file_path = os.path.dirname(os.path.realpath(__file__)) + + +def initialize_parameters(default_model="cmp_default_model.txt"): + global file_path + bmk = Comparator(file_path, + default_model, + "keras", + prog="cmp_baseline", + desc="Meta-model to compare two models") + # Initialize parameters + gParameters = candle.finalize_parameters(bmk) + return gParameters + + +def run(gParameters): + print("COMPARATOR") + print(str(gParameters)) + global file_path + print("file_path: %s" % file_path) + + +def main(): + gParameters = initialize_parameters() + run(gParameters) + + +if __name__ == "__main__": + main() diff --git a/models/Comparator/cmp_default_model.txt b/models/Comparator/cmp_default_model.txt new file mode 100644 index 00000000..7ee6baff --- /dev/null +++ b/models/Comparator/cmp_default_model.txt @@ -0,0 +1,6 @@ +[Global_Params] + +model_name = 'cmp' + +model1 = 'graphdrp' +model2 = 'graphdrp' From 670bc008c29ecb2b57852f6f23a0e394113c1e18 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 12 Jan 2023 14:31:15 -0600 Subject: [PATCH 389/601] First model call works --- models/Comparator/cmp_baseline_keras2.py | 33 ++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/models/Comparator/cmp_baseline_keras2.py b/models/Comparator/cmp_baseline_keras2.py index 97e1b0ba..2474deef 100644 --- a/models/Comparator/cmp_baseline_keras2.py +++ b/models/Comparator/cmp_baseline_keras2.py @@ -1,6 +1,7 @@ import os - +import subprocess +from pathlib import Path import candle @@ -28,7 +29,35 @@ def run(gParameters): print(str(gParameters)) global file_path print("file_path: %s" % file_path) - + output_dir = gParameters["output_dir"] + expid = gParameters["experiment_id"] + supervisor = Path(file_path).absolute().parent.parent + workflows = supervisor / "workflows" + model_sh = workflows / "common" / "sh" / "model.sh" + print(model_sh) + os.chdir(output_dir) + env = { "WORKFLOWS_ROOT": str(workflows), + "TURBINE_OUTPUT": output_dir, + "EXPID": expid, + "SITE": "lambda", + "OBJ_RETURN": "loss", + "BENCHMARK_TIMEOUT": "120", + "MODEL_NAME": gParameters["model1"], + "CANDLE_MODEL_TYPE": "SINGULARITY", + "CANDLE_DATA_DIR": os.getenv("CANDLE_DATA_DIR"), + "ADLB_RANK_OFFSET": "0", + "CANDLE_IMAGE": "/software/improve/images/GraphDRP.sif" + } + print("env: " + str(env)) + cmd = [ "bash", model_sh, + "keras2", "{}", # empty JSON fragment + expid, + gParameters["run_id"] ] + print("cmd: " + str(cmd)) + with open("model1.log", "w") as model1_log: + subprocess.run(cmd, env=env, + stdout=model1_log, stderr=subprocess.STDOUT) + print("Comparator DONE.") def main(): gParameters = initialize_parameters() From 84ad741035ff0397f893162376fe848e5f87e8c6 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 13 Jan 
2023 08:51:42 -0800 Subject: [PATCH 390/601] o Moving towards containers (Singularity) for GA --- workflows/GA/swift/workflow.sh | 72 +++++++++++++++++++++++-------- workflows/GA/swift/workflow.swift | 6 ++- workflows/GA/test/cfg-prm-1.sh | 2 +- 3 files changed, 60 insertions(+), 20 deletions(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index a5d1454f..45030037 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -30,28 +30,42 @@ source $WORKFLOWS_ROOT/common/sh/utils.sh usage() { - echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" \ + "[CANDLE_MODEL_TYPE] [CANDLE_IMAGE]" } -if (( ${#} != 5 )) +if (( ${#} != 7 )) && (( ${#} != 5 )) then usage exit 1 fi -if ! { - get_site $1 # Sets SITE - get_expid $2 # Sets EXPID - get_cfg_sys $3 - get_cfg_prm $4 - MODEL_NAME=$5 - } +if (( ${#} == 7 )) then + export CANDLE_MODEL_TYPE=$6 + export CANDLE_IMAGE=$7 +elif (( ${#} == 5 )) +then + CANDLE_MODEL_TYPE="BENCHMARKS" + CANDLE_IMAGE=NONE +else usage exit 1 fi -echo "Running "$MODEL_NAME "workflow" +TURBINE_OUTPUT="" +if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] +then + TURBINE_OUTPUT=$CANDLE_DATA_DIR/output + printf "Running mlrMBO workflow with model %s and image %s:%s\n" \ + $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE +fi + +get_site $1 # Sets SITE +get_expid $2 $CANDLE_MODEL_TYPE # Sets EXPID +get_cfg_sys $3 +get_cfg_prm $4 +MODEL_NAME=$5 source_site env $SITE source_site sched $SITE @@ -65,7 +79,20 @@ source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh PYTHONPATH+=:$EQPY PYTHONPATH+=:$WORKFLOWS_ROOT/common/python -export TURBINE_JOBNAME="JOB:${EXPID}" +export TURBINE_JOBNAME="GA_JOB:${EXPID}" +RESTART_FILE_ARG="" +if [[ ${RESTART_FILE:-} != "" ]] +then + RESTART_FILE_ARG="--restart_file=$RESTART_FILE" +fi + +RESTART_NUMBER_ARG="" +if [[ ${RESTART_NUMBER:-} != "" ]] +then + RESTART_NUMBER_ARG="--restart_number=$RESTART_NUMBER" +fi + + CMD_LINE_ARGS=( -ga_params=$PARAM_SET_FILE -seed=$SEED -ni=$NUM_ITERATIONS @@ -108,8 +135,10 @@ then echo "Turbine will wait for job completion." fi -# Use for Summit (LSF needs two %) -if [[ ${SITE:-} == "summit" ]] +# Handle %-escapes in TURBINE_STDOUT +if [ $SITE == "summit" ] || \ + [ $SITE == "biowulf" ] || \ + [ $SITE == "polaris" ] then export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" else @@ -135,10 +164,13 @@ else STDOUT="" fi -# echo's anything following this to standard out - -echo APP_PYPATH $APP_PYTHONPATH +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + echo "CANDLE_DATA_DIR is not set in the environment! Exiting..." + exit 1 +fi +( set -x swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ @@ -155,6 +187,7 @@ swift-t -O 0 -n $PROCS \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ -e OBJ_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ -e MODEL_SH \ -e MODEL_NAME \ -e SITE \ @@ -162,10 +195,13 @@ swift-t -O 0 -n $PROCS \ -e SH_TIMEOUT \ -e TURBINE_STDOUT \ -e IGNORE_ERRORS \ + -e CANDLE_DATA_DIR \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ $WAIT_ARG \ $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ tee $STDOUT - +) if (( ${PIPESTATUS[0]} )) then @@ -173,7 +209,7 @@ then exit 1 fi -# echo "EXIT CODE: 0" | tee -a $STDOUT +echo "EXIT CODE: 0" | tee -a $STDOUT # Andrew: Needed this so that script to monitor job worked properly (queue_wait... function in utils.sh?) 
echo $TURBINE_OUTPUT > turbine-directory.txt diff --git a/workflows/GA/swift/workflow.swift b/workflows/GA/swift/workflow.swift index 540811c1..8b31faa9 100644 --- a/workflows/GA/swift/workflow.swift +++ b/workflows/GA/swift/workflow.swift @@ -25,12 +25,16 @@ string r_ranks[] = split(resident_work_ranks,","); string strategy = argv("strategy"); string ga_params_file = argv("ga_params"); -string init_params_file = argv("init_params", ""); +// string init_params_file = argv("init_params", ""); float mut_prob = string2float(argv("mutation_prob", "0.2")); string exp_id = argv("exp_id"); int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); string model_name = getenv("MODEL_NAME"); +string candle_model_type = getenv("CANDLE_MODEL_TYPE"); +string candle_image = getenv("CANDLE_IMAGE"); +string init_params_file = getenv("INIT_PARAMS_FILE"); + printf("TURBINE_OUTPUT: " + turbine_output); string restart_number = argv("restart_number", "1"); diff --git a/workflows/GA/test/cfg-prm-1.sh b/workflows/GA/test/cfg-prm-1.sh index 85893270..8ad83e4c 100644 --- a/workflows/GA/test/cfg-prm-1.sh +++ b/workflows/GA/test/cfg-prm-1.sh @@ -4,7 +4,7 @@ SEED=${SEED:-1} # Total iterations -NUM_ITERATIONS=${NUM_ITERATIONS:-2} +NUM_ITERATIONS=${NUM_ITERATIONS:-7} # Size of GA population (i.e. the number of parameter sets to evaluate) POPULATION_SIZE=${POPULATION_SIZE:-4} # the GA strategy: one of 'simple' or 'mu_plus_lambda'. See From 3df1628c6f1625dd7da61cc5bc44059d5711509a Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Sun, 15 Jan 2023 12:41:01 -0800 Subject: [PATCH 391/601] o Add GraphDRP Singularity with GA workflow --- .../GA/data/graphdrp_param_space_ga.json | 48 +++++++++++++ workflows/GA/test/cfg-prm-1.sh | 2 + workflows/GA/test/test-graphdrp-lambda0.sh | 70 +++++++++++++++++++ 3 files changed, 120 insertions(+) create mode 100644 workflows/GA/data/graphdrp_param_space_ga.json create mode 100755 workflows/GA/test/test-graphdrp-lambda0.sh diff --git a/workflows/GA/data/graphdrp_param_space_ga.json b/workflows/GA/data/graphdrp_param_space_ga.json new file mode 100644 index 00000000..5ad609bd --- /dev/null +++ b/workflows/GA/data/graphdrp_param_space_ga.json @@ -0,0 +1,48 @@ +[ + { + "name": "activation", + "type": "categorical", + "element_type": "string", + "values": [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear" + ] + }, + + { + "name": "optimizer", + "type": "categorical", + "element_type": "string", + "values": ["adam", "rmsprop"] + }, + + { + "name": "dropout", + "type": "float", + "lower": 0.0, + "upper": 0.9, + "sigma": 0.045 + }, + + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [16, 32, 64, 128, 256], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + } +] diff --git a/workflows/GA/test/cfg-prm-1.sh b/workflows/GA/test/cfg-prm-1.sh index 8ad83e4c..c3bf30cd 100644 --- a/workflows/GA/test/cfg-prm-1.sh +++ b/workflows/GA/test/cfg-prm-1.sh @@ -23,6 +23,8 @@ elif [ "$MODEL_NAME" = "p1b1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json} elif [ "$MODEL_NAME" = "nt3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json} +elif [ "$MODEL_NAME" = "graphdrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_param_space_ga.json} elif [ "$MODEL_NAME" = "tc1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json} # TODO: Uncomment 
when parameter files are available diff --git a/workflows/GA/test/test-graphdrp-lambda0.sh b/workflows/GA/test/test-graphdrp-lambda0.sh new file mode 100755 index 00000000..d2af52e8 --- /dev/null +++ b/workflows/GA/test/test-graphdrp-lambda0.sh @@ -0,0 +1,70 @@ +#!/bin/bash +set -eu + +# MLRMBO TEST NIGHTLY + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# The python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE="/software/improve/images/GraphDRP.sif" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From e9baca4ad156cd0a530c6f55a03798353cbf9cea Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 17 Jan 2023 22:09:00 +0000 Subject: [PATCH 392/601] o GA problem for Polaris GraphDRP --- workflows/GA/swift/workflow.sh | 2 +- workflows/GA/test/cfg-prm-polaris.sh | 42 +++++++++++++++++++++++++ workflows/GA/test/cfg-sys-polaris.sh | 47 ++++++++++++++++++++++++++++ workflows/GA/test/test-polaris.sh | 39 +++++++++++++++++++++++ 4 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 workflows/GA/test/cfg-prm-polaris.sh create mode 100644 workflows/GA/test/cfg-sys-polaris.sh create mode 100755 workflows/GA/test/test-polaris.sh diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 45030037..897bbbe1 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -79,7 +79,7 @@ source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh PYTHONPATH+=:$EQPY PYTHONPATH+=:$WORKFLOWS_ROOT/common/python -export TURBINE_JOBNAME="GA_JOB:${EXPID}" +export TURBINE_JOBNAME="GA_JOB_${EXPID}" RESTART_FILE_ARG="" if [[ ${RESTART_FILE:-} != "" ]] then diff --git a/workflows/GA/test/cfg-prm-polaris.sh b/workflows/GA/test/cfg-prm-polaris.sh new file mode 100644 index 00000000..c3bf30cd --- /dev/null +++ b/workflows/GA/test/cfg-prm-polaris.sh @@ -0,0 +1,42 @@ +# CFG PRM 1 + +# GA settings + +SEED=${SEED:-1} +# Total iterations +NUM_ITERATIONS=${NUM_ITERATIONS:-7} +# Size of GA population (i.e. the number of parameter sets to evaluate) +POPULATION_SIZE=${POPULATION_SIZE:-4} +# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See +# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms +# for more info. 
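For reference, a small sketch of what one candidate drawn from a GA space file such as graphdrp_param_space_ga.json (added above) looks like. The real draw and mutation are done by deap_ga.py using the "sigma" fields and the chosen strategy; this illustration only shows how the "type"/"values"/"lower"/"upper"/"value" fields map to a concrete parameter set, and the file path is assumed relative to the workflow directory.

import json
import random

def sample(space_file):
    # Draw one illustrative parameter set from a GA space definition:
    # categorical/ordered entries use "values", int/float use lower/upper,
    # constant entries use "value".  ("sigma" is only used for mutation.)
    with open(space_file) as fp:
        space = json.load(fp)
    params = {}
    for p in space:
        t = p["type"]
        if t in ("categorical", "ordered"):
            params[p["name"]] = random.choice(p["values"])
        elif t == "int":
            params[p["name"]] = random.randint(p["lower"], p["upper"])
        elif t == "float":
            params[p["name"]] = random.uniform(p["lower"], p["upper"])
        elif t == "constant":
            params[p["name"]] = p["value"]
    return params

print(sample("data/graphdrp_param_space_ga.json"))
# e.g. {'activation': 'relu', 'optimizer': 'adam', 'dropout': 0.31,
#       'batch_size': 64, 'epochs': 2}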
+GA_STRATEGY=${STRATEGY:-simple} + +INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} + +# TODO: move the following code to a utility library- +# this is a configuration file +# Set the ga parameter space definition file for running +if [ "$MODEL_NAME" = "combo" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_param_space_ga.json} +elif [ "$MODEL_NAME" = "p1b1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json} +elif [ "$MODEL_NAME" = "nt3" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json} +elif [ "$MODEL_NAME" = "graphdrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_param_space_ga.json} +elif [ "$MODEL_NAME" = "tc1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json} +# TODO: Uncomment when parameter files are available +# elif [ "$MODEL_NAME" = "p1b3" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p1b2" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p2b1" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_param_space_ga.json} +elif [ "$PARAM_SET_FILE" != "" ]; then + PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} +else + echo "Invalid model-" $MODEL_NAME + exit 1 +fi diff --git a/workflows/GA/test/cfg-sys-polaris.sh b/workflows/GA/test/cfg-sys-polaris.sh new file mode 100644 index 00000000..6ffef4ce --- /dev/null +++ b/workflows/GA/test/cfg-sys-polaris.sh @@ -0,0 +1,47 @@ +# +# COMBO CFG SYS 1 + +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEMS +# The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs +export PROCS=${PROCS:-6} + +# MPI processes per node +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} +export QUEUE=${QUEUE:-debug-scaling} +export WALLTIME=${WALLTIME:-00:05:00} + +#export PROJECT=Candle_ECP + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. +# If set to -1 there is no timeout. +# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g, my_script.py) +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . 
+export IGNORE_ERRORS=0 + +# if the deap python package is not installed with swift-t's embedded python +# it may be ncessary to include its location in the PYTHONPATH +# export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages + +export TURBINE_RESIDENT_WORK_WORKERS=1 +export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + +# for running locally, edit as necessary +# export PYTHONHOME=$HOME/anaconda3 +# export PYTHON=python3.6 +# export SWIFT_T=$HOME/sfw/swift-t-4c8f0afd diff --git a/workflows/GA/test/test-polaris.sh b/workflows/GA/test/test-polaris.sh new file mode 100755 index 00000000..370d81c8 --- /dev/null +++ b/workflows/GA/test/test-polaris.sh @@ -0,0 +1,39 @@ +#!/bin/bash +set -eu + +# TEST MLRMBO GDRP 1 +# For GraphDRP + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-polaris.sh +export CFG_PRM=$THIS/cfg-prm-polaris.sh + +# Specify GA file +export GA_FILE=deap_ga.py + +CANDLE_MODEL_TYPE="SINGULARITY" +# CANDLE_IMAGE=/software/improve/images/GraphDRP.sif # lambda +CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif # Polaris + + +export MODEL_NAME="graphdrp" + +# Currently ignored: +export OBJ_RETURN="val_loss" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $CFG_PRM $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE From 2d1ce31011c140adbf6cb20df08717c20d11f035 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 17 Jan 2023 21:06:09 -0600 Subject: [PATCH 393/601] Provide EQ/Py; export PYTHONPATH --- workflows/common/sh/set-pythonpath.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/workflows/common/sh/set-pythonpath.sh b/workflows/common/sh/set-pythonpath.sh index 184d76a0..23984265 100644 --- a/workflows/common/sh/set-pythonpath.sh +++ b/workflows/common/sh/set-pythonpath.sh @@ -10,7 +10,8 @@ # with Benchmarks normally alongside Supervisor # If MODEL_PYTHON_DIR is set, that is added to PYTHONPATH -BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd ) +SUPERVISOR=$( cd $EMEWS_PROJECT_ROOT/../.. ; /bin/pwd ) +BENCHMARKS_DEFAULT=$( cd $SUPERVISOR/../Benchmarks ; /bin/pwd ) export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} if [[ ! 
-d $BENCHMARKS_ROOT ]] @@ -24,8 +25,10 @@ fi # APP_PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common # PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common - -PYTHONPATH+=:$WORKFLOWS_ROOT/common/python +# Set up Supervisor +export PYTHONPATH +PYTHONPATH+=:$SUPERVISOR/workflows/common/python +PYTHONPATH+=:$SUPERVISOR/workflows/common/ext/EQ-Py # Add known CANDLE Benchmarks to PYTHONPATH PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/P1B1 From 889ccb2041a83ed6c71dbd07d62bf701375b6609 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 17 Jan 2023 21:06:28 -0600 Subject: [PATCH 394/601] Update header --- workflows/GA/test/test-polaris.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/GA/test/test-polaris.sh b/workflows/GA/test/test-polaris.sh index 370d81c8..5a16bba2 100755 --- a/workflows/GA/test/test-polaris.sh +++ b/workflows/GA/test/test-polaris.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -# TEST MLRMBO GDRP 1 +# TEST POLARIS # For GraphDRP if (( ${#} != 1 )) @@ -29,7 +29,6 @@ CANDLE_MODEL_TYPE="SINGULARITY" # CANDLE_IMAGE=/software/improve/images/GraphDRP.sif # lambda CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif # Polaris - export MODEL_NAME="graphdrp" # Currently ignored: From 7557d31636c07d44336ca0f875a05363bffaa8c9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 17 Jan 2023 21:06:42 -0600 Subject: [PATCH 395/601] Shorten job name to fit for PBS --- workflows/GA/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 897bbbe1..6b3c43f2 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -79,7 +79,7 @@ source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh PYTHONPATH+=:$EQPY PYTHONPATH+=:$WORKFLOWS_ROOT/common/python -export TURBINE_JOBNAME="GA_JOB_${EXPID}" +export TURBINE_JOBNAME="GA_${EXPID}" RESTART_FILE_ARG="" if [[ ${RESTART_FILE:-} != "" ]] then From 861fa8dd3318851882ef2b907e1cf189cfa18ece Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 17 Jan 2023 21:06:57 -0600 Subject: [PATCH 396/601] Use single-node job for debugging --- workflows/GA/test/cfg-sys-polaris.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/GA/test/cfg-sys-polaris.sh b/workflows/GA/test/cfg-sys-polaris.sh index 6ffef4ce..6be57c29 100644 --- a/workflows/GA/test/cfg-sys-polaris.sh +++ b/workflows/GA/test/cfg-sys-polaris.sh @@ -8,7 +8,7 @@ export PROCS=${PROCS:-6} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} +export PPN=${PPN:-6} export QUEUE=${QUEUE:-debug-scaling} export WALLTIME=${WALLTIME:-00:05:00} From 17734e823553c7871c5465b39d389fd9cbe551fc Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 18 Jan 2023 17:15:12 +0000 Subject: [PATCH 397/601] o Increase time and make epochs constant for test runs on polaris --- workflows/GA/data/graphdrp_param_space_ga.json | 3 ++- workflows/GA/test/cfg-sys-polaris.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/workflows/GA/data/graphdrp_param_space_ga.json b/workflows/GA/data/graphdrp_param_space_ga.json index 5ad609bd..2853f6d9 100644 --- a/workflows/GA/data/graphdrp_param_space_ga.json +++ b/workflows/GA/data/graphdrp_param_space_ga.json @@ -1,5 +1,6 @@ [ { + "name": "activation", "type": "categorical", "element_type": "string", @@ -43,6 +44,6 @@ { "name": "epochs", "type": "constant", - "value": 5 + "value": 2 } ] diff --git a/workflows/GA/test/cfg-sys-polaris.sh 
b/workflows/GA/test/cfg-sys-polaris.sh index 6be57c29..147c18f9 100644 --- a/workflows/GA/test/cfg-sys-polaris.sh +++ b/workflows/GA/test/cfg-sys-polaris.sh @@ -10,7 +10,7 @@ export PROCS=${PROCS:-6} # Cori has 32 cores per node, 128GB per node export PPN=${PPN:-6} export QUEUE=${QUEUE:-debug-scaling} -export WALLTIME=${WALLTIME:-00:05:00} +export WALLTIME=${WALLTIME:-00:39:00} #export PROJECT=Candle_ECP From b9b5471cd63e97401b0c587a2b9cb74d9b57de74 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 19 Jan 2023 14:12:54 -0600 Subject: [PATCH 398/601] o Add One D test model o Fix format --- models/Comparator/cmp_baseline_keras2.py | 45 +++++---- models/OneD/README.md | 12 +++ models/OneD/oned.py | 28 ++++++ models/OneD/oned_baseline_keras2.py | 93 +++++++++++++++++++ models/OneD/oned_default_model.txt | 3 + .../GA/data/graphdrp_param_space_ga.json | 2 +- workflows/common/python/runner_utils.py | 3 +- workflows/common/sh/env-lambda.sh | 2 +- workflows/common/sh/set-pythonpath.sh | 1 + 9 files changed, 167 insertions(+), 22 deletions(-) create mode 100644 models/OneD/README.md create mode 100644 models/OneD/oned.py create mode 100644 models/OneD/oned_baseline_keras2.py create mode 100644 models/OneD/oned_default_model.txt diff --git a/models/Comparator/cmp_baseline_keras2.py b/models/Comparator/cmp_baseline_keras2.py index 2474deef..957727d3 100644 --- a/models/Comparator/cmp_baseline_keras2.py +++ b/models/Comparator/cmp_baseline_keras2.py @@ -1,4 +1,3 @@ - import os import subprocess from pathlib import Path @@ -36,29 +35,37 @@ def run(gParameters): model_sh = workflows / "common" / "sh" / "model.sh" print(model_sh) os.chdir(output_dir) - env = { "WORKFLOWS_ROOT": str(workflows), - "TURBINE_OUTPUT": output_dir, - "EXPID": expid, - "SITE": "lambda", - "OBJ_RETURN": "loss", - "BENCHMARK_TIMEOUT": "120", - "MODEL_NAME": gParameters["model1"], - "CANDLE_MODEL_TYPE": "SINGULARITY", - "CANDLE_DATA_DIR": os.getenv("CANDLE_DATA_DIR"), - "ADLB_RANK_OFFSET": "0", - "CANDLE_IMAGE": "/software/improve/images/GraphDRP.sif" - } + env = { + "WORKFLOWS_ROOT": str(workflows), + "TURBINE_OUTPUT": output_dir, + "EXPID": expid, + "SITE": "lambda", + "OBJ_RETURN": "loss", + "BENCHMARK_TIMEOUT": "120", + "MODEL_NAME": gParameters["model1"], + "CANDLE_MODEL_TYPE": "SINGULARITY", + "CANDLE_DATA_DIR": os.getenv("CANDLE_DATA_DIR"), + "ADLB_RANK_OFFSET": "0", + "CANDLE_IMAGE": "/software/improve/images/GraphDRP.sif" + } print("env: " + str(env)) - cmd = [ "bash", model_sh, - "keras2", "{}", # empty JSON fragment - expid, - gParameters["run_id"] ] + cmd = [ + "bash", + model_sh, + "keras2", + "{}", # empty JSON fragment + expid, + gParameters["run_id"] + ] print("cmd: " + str(cmd)) with open("model1.log", "w") as model1_log: - subprocess.run(cmd, env=env, - stdout=model1_log, stderr=subprocess.STDOUT) + subprocess.run(cmd, + env=env, + stdout=model1_log, + stderr=subprocess.STDOUT) print("Comparator DONE.") + def main(): gParameters = initialize_parameters() run(gParameters) diff --git a/models/OneD/README.md b/models/OneD/README.md new file mode 100644 index 00000000..c465a5b1 --- /dev/null +++ b/models/OneD/README.md @@ -0,0 +1,12 @@ +# File organization: +- Name the main file where the actual model resides as _baseline_ or <_pytorch>.py +- .py for the Benchmark class +- _default_model.txt + +Please follow the above conventions for naming files, all lowercase filenames. +`model_name` is a required keyword for all models. 
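A rough sketch of what the default_model.txt convention described above amounts to: values come from the <model_name>_default_model.txt file and matching command-line flags override them. candle.finalize_parameters does the real parsing and type handling; this only illustrates the precedence, using the oned_default_model.txt added later in this patch, and the section name lookup here is an assumption.

import configparser

def load_params(default_model_txt, cli_overrides):
    cfg = configparser.ConfigParser()
    cfg.read(default_model_txt)
    section = cfg.sections()[0]       # e.g. [global] or [Global_Params]
    params = dict(cfg[section])       # defaults from <model_name>_default_model.txt
    params.update(cli_overrides)      # matching command-line parameters win
    return params

print(load_params("oned_default_model.txt", {"x": 3.5}))
# e.g. {'model_name': '"1D"', 'x': 3.5}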
+
+This would enable a user to run the model with `python oned_baseline_keras2.py`
+
+Users never change parameters inside the file oned_baseline_keras2.py; any parameters needed for tweaking or optimizing the model
+must be provided via oned_default_model.txt
diff --git a/models/OneD/oned.py b/models/OneD/oned.py
new file mode 100644
index 00000000..ded2ebc6
--- /dev/null
+++ b/models/OneD/oned.py
@@ -0,0 +1,28 @@
+import candle
+import os
+
+# Define any needed additional args to ensure all new args are command-line accessible.
+additional_definitions = [{
+    'name': 'x',
+    'type': float,
+    'nargs': 1,
+    'help': '1D function, derived from cosine mixture'
+}, {
+    'name': 'new_keyword',
+    'type': str,
+    'nargs': 1,
+    'help': 'helpful description'
+}]
+
+# Define args that are required.
+required = None
+
+
+# Extend candle.Benchmark to configure the args
+class IBenchmark(candle.Benchmark):
+
+    def set_locals(self):
+        if required is not None:
+            self.required = set(required)
+        if additional_definitions is not None:
+            self.additional_definitions = additional_definitions
diff --git a/models/OneD/oned_baseline_keras2.py b/models/OneD/oned_baseline_keras2.py
new file mode 100644
index 00000000..0f14b4f7
--- /dev/null
+++ b/models/OneD/oned_baseline_keras2.py
@@ -0,0 +1,93 @@
+import os
+import candle
+from example import IBenchmark
+
+# Just because the tensorflow warnings are a bit verbose
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+# This should be set outside as a user environment variable
+os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir'
+
+# file_path becomes the default location of the example_default_model.txt file
+file_path = os.path.dirname(os.path.realpath(__file__))
+
+
+# In the initialize_parameters() method, we will instantiate the base
+# class, and finally build an argument parser to recognize your customized
+# parameters in addition to the default parameters. The initialize_parameters()
+# method should return a python dictionary, which will be passed to the run()
+# method.
+def initialize_parameters():
+    i_bmk = IBenchmark(
+        file_path,  # this is the path to this file needed to find default_model.txt
+        'example_default_model.txt',  # name of the default_model.txt file
+        'keras',  # framework, choice is keras or pytorch
+        prog='example_baseline',  # basename of the model
+        desc='IMPROVE Benchmark')
+
+    gParameters = candle.finalize_parameters(
+        i_bmk)  # returns the parameter dictionary built from
+    # default_model.txt and overwritten by any
+    # matching command line parameters.
+
+    return gParameters
+
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def func(x, n=1):
+    # "func" takes in two arguments: "x" and "n", n is set to 1.
+    # The function returns a calculation using the input "x" and a default value of "n" equal to 1.
+    # The calculation is a linear combination of three trigonometric functions (sine, cosine)
+    # with the addition of a random normal variable scaled by the input "n".
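+    # In symbols (matching the code below):
+    #   y(x) = 0.02*x + 0.5*sin(1*x + 0.1) + 0.75*cos(0.25*x - 0.3) + n*Normal(0, 0.2)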
+ + y = 0.02 * x + 0.5 * np.sin(1 * x + 0.1) + 0.75 * np.cos( + 0.25 * x - 0.3) + n * np.random.normal(0, 0.2, 1) + return y[0] + + +def run(params): + # fetch data + # preprocess data + # save preprocessed data + # define callbacks + # build / compile model + # train model + # infer using model + # etc + print("running third party code") + + x = params['x'] + y = func(x) + + print("returning training metrics: ", y) + + return { + "val_loss": y, + } # metrics is used by the supervisor when running + # HPO workflows (and possible future non HPO workflows) + + # Dumping results into file, workflow requirement + val_scores = { + 'key': 'val_loss', + 'value': metrics['val_loss'], + 'val_loss': metrics['val_loss'], + } + + with open(params['output_dir'] + "/scores.json", "w", + encoding="utf-8") as f: + json.dump(val_scores, f, ensure_ascii=False, indent=4) + + return metrics # metrics is used by the supervisor when running + # HPO workflows (and possible future non HPO workflows) + + +def main(): + params = initialize_parameters() + scores = run(params) + + +if __name__ == "__main__": + main() diff --git a/models/OneD/oned_default_model.txt b/models/OneD/oned_default_model.txt new file mode 100644 index 00000000..93439811 --- /dev/null +++ b/models/OneD/oned_default_model.txt @@ -0,0 +1,3 @@ +[global] +model_name="1D" +x=1 diff --git a/workflows/GA/data/graphdrp_param_space_ga.json b/workflows/GA/data/graphdrp_param_space_ga.json index 2853f6d9..4701f84c 100644 --- a/workflows/GA/data/graphdrp_param_space_ga.json +++ b/workflows/GA/data/graphdrp_param_space_ga.json @@ -1,6 +1,6 @@ [ { - + "name": "activation", "type": "categorical", "element_type": "string", diff --git a/workflows/common/python/runner_utils.py b/workflows/common/python/runner_utils.py index 17d4fd4e..fdd0fe61 100644 --- a/workflows/common/python/runner_utils.py +++ b/workflows/common/python/runner_utils.py @@ -169,7 +169,7 @@ def main(): exit(1) # Parse the workflow-provided JSON string: params = json.loads(sys.argv[2]) - if len(sys.argv) == 3: + if len(sys.argv) == 3: pass # No defaults, OK elif len(sys.argv) == 4: defaults = read_config_file_dict(sys.argv[3]) @@ -180,5 +180,6 @@ def main(): print("runner_utils: unknown subcommand: " + str(sys.argv)) exit(1) + if __name__ == "__main__": main() diff --git a/workflows/common/sh/env-lambda.sh b/workflows/common/sh/env-lambda.sh index 4a7cb8ed..fc49566b 100644 --- a/workflows/common/sh/env-lambda.sh +++ b/workflows/common/sh/env-lambda.sh @@ -26,4 +26,4 @@ echo "Programs:" which python swift-t | nl # Cf. 
utils.sh show PYTHONHOME -log_path LD_LIBRARY_PATH \ No newline at end of file +log_path LD_LIBRARY_PATH diff --git a/workflows/common/sh/set-pythonpath.sh b/workflows/common/sh/set-pythonpath.sh index 23984265..43150abc 100644 --- a/workflows/common/sh/set-pythonpath.sh +++ b/workflows/common/sh/set-pythonpath.sh @@ -28,6 +28,7 @@ fi # Set up Supervisor export PYTHONPATH PYTHONPATH+=:$SUPERVISOR/workflows/common/python +PYTHONPATH+=:$SUPERVISOR/workflows/models/OneD PYTHONPATH+=:$SUPERVISOR/workflows/common/ext/EQ-Py # Add known CANDLE Benchmarks to PYTHONPATH From 3beb13c8ebf7cdb01c8ff944994cdfd6b961ace4 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 19 Jan 2023 20:15:17 +0000 Subject: [PATCH 399/601] o Bigger GraphDRP HPO setup --- workflows/mlrMBO/data/p1_gdrp.R | 16 +++++++++++ workflows/mlrMBO/test/big-gdrp.sh | 37 +++++++++++++++++++++++++ workflows/mlrMBO/test/cfg-prm-gdrp.sh | 40 +++++++++++++++++++++++++++ workflows/mlrMBO/test/cfg-sys-gdrp.sh | 40 +++++++++++++++++++++++++++ 4 files changed, 133 insertions(+) create mode 100644 workflows/mlrMBO/data/p1_gdrp.R create mode 100755 workflows/mlrMBO/test/big-gdrp.sh create mode 100644 workflows/mlrMBO/test/cfg-prm-gdrp.sh create mode 100644 workflows/mlrMBO/test/cfg-sys-gdrp.sh diff --git a/workflows/mlrMBO/data/p1_gdrp.R b/workflows/mlrMBO/data/p1_gdrp.R new file mode 100644 index 00000000..e311f4b6 --- /dev/null +++ b/workflows/mlrMBO/data/p1_gdrp.R @@ -0,0 +1,16 @@ + +# GraphDRP Hyperparameter Search - Test "small" +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + #makeDiscreteParam("test_batch", values = c(8, 17)), + makeDiscreteParam("batch_size", values = c(8, 256)), + makeIntegerParam("epochs", lower = 100, upper = 101), + makeDiscreteParam("optimizer", values = c("adam", "sgd")), + makeNumericParam("dropout", lower = 0, upper = 0.9), + makeNumericParam("learning_rate", lower = 0.001, upper = 0.1) +) diff --git a/workflows/mlrMBO/test/big-gdrp.sh b/workflows/mlrMBO/test/big-gdrp.sh new file mode 100755 index 00000000..7b390612 --- /dev/null +++ b/workflows/mlrMBO/test/big-gdrp.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -eu + +# TEST MLRMBO GDRP 1 +# For GraphDRP + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. 
; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-gdrp.sh +export CFG_PRM=$THIS/cfg-prm-gdrp.sh + +# Specify the mlrMBO algorithm R file +export R_FILE=mlrMBO-mbo.R + +CANDLE_MODEL_TYPE="SINGULARITY" +CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif + +export MODEL_NAME="graphdrp" + +# Currently ignored: +export OBJ_RETURN="val_loss" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $CFG_PRM $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE diff --git a/workflows/mlrMBO/test/cfg-prm-gdrp.sh b/workflows/mlrMBO/test/cfg-prm-gdrp.sh new file mode 100644 index 00000000..79b96369 --- /dev/null +++ b/workflows/mlrMBO/test/cfg-prm-gdrp.sh @@ -0,0 +1,40 @@ +# CFG PRM NIGHTLY + +# mlrMBO settings + +# Total iterations +PROPOSE_POINTS=${PROPOSE_POINTS:-14} +MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} +MAX_ITERATIONS=${MAX_ITERATIONS:-3} +MAX_BUDGET=${MAX_BUDGET:-180} +DESIGN_SIZE=${DESIGN_SIZE:-14} + +# TODO: move the following code to a utility library- +# this is a configuration file +# Set the R data file for running +if [ "$MODEL_NAME" = "combo" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_nightly.R} +elif [ "$MODEL_NAME" = "attn" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/attn_nightly.R} +elif [ "$MODEL_NAME" = "adrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/adrp_nightly.R} +elif [ "$MODEL_NAME" = "p1b1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_nightly.R} +elif [ "$MODEL_NAME" = "nt3" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_nightly.R} +elif [ "$MODEL_NAME" = "p1b3" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_nightly.R} +elif [ "$MODEL_NAME" = "p1b2" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_nightly.R} +elif [ "$MODEL_NAME" = "p2b1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_nightly.R} +elif [ "$MODEL_NAME" = "graphdrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1_gdrp.R} +elif [ "$MODEL_NAME" = "dummy" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/dummy_nightly.R} +elif [[ "${PARAM_SET_FILE:-}" != "" ]]; then + PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} +else + printf "Could not find PARAM_SET_FILE for model: '%s'\n" $MODEL_NAME + exit 1 +fi diff --git a/workflows/mlrMBO/test/cfg-sys-gdrp.sh b/workflows/mlrMBO/test/cfg-sys-gdrp.sh new file mode 100644 index 00000000..1a7f8176 --- /dev/null +++ b/workflows/mlrMBO/test/cfg-sys-gdrp.sh @@ -0,0 +1,40 @@ + +# MLRMBO CFG SYS NIGHTLY + +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEWS +# The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs +export PROCS=${PROCS:-16} + +# MPI processes per node +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} +export QUEUE=${QUEUE:-prod} +export WALLTIME=${WALLTIME:-10:55:00} + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. +# If set to -1 there is no timeout. +# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g, my_script.py) +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. 
+# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . +export IGNORE_ERRORS=0 + +# Resident task worker rank for mlrMBO algorithm +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi From f7a90884abce5affbc0bbb5889860cd78d8006f3 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 19 Jan 2023 14:47:08 -0800 Subject: [PATCH 400/601] o Few fixes, need more epoch stuff with history object --- models/OneD/oned_baseline_keras2.py | 15 ++++++++++----- workflows/common/sh/set-pythonpath.sh | 2 +- workflows/mlrMBO/test/cfg-prm-nightly.sh | 2 ++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/models/OneD/oned_baseline_keras2.py b/models/OneD/oned_baseline_keras2.py index 0f14b4f7..17bbfef6 100644 --- a/models/OneD/oned_baseline_keras2.py +++ b/models/OneD/oned_baseline_keras2.py @@ -1,6 +1,6 @@ import os import candle -from example import IBenchmark +from oned import IBenchmark # Just because the tensorflow warnings are a bit verbose os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' @@ -8,7 +8,7 @@ # This should be set outside as a user environment variable os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir' -# file_path becomes the default location of the example_default_model.txt file +# file_path becomes the default location of the oned_default_model.txt file file_path = os.path.dirname(os.path.realpath(__file__)) @@ -20,9 +20,9 @@ def initialize_parameters(): i_bmk = IBenchmark( file_path, # this is the path to this file needed to find default_model.txt - 'example_default_model.txt', # name of the default_model.txt file + 'oned_default_model.txt', # name of the default_model.txt file 'keras', # framework, choice is keras or pytorch - prog='example_baseline', # basename of the model + prog='oned_baseline', # basename of the model desc='IMPROVE Benchmark') gParameters = candle.finalize_parameters( @@ -35,7 +35,7 @@ def initialize_parameters(): import numpy as np import matplotlib.pyplot as plt - +import tensorflow as tf def func(x, n=1): # "func" takes in two arguments: "x" and "n", n is set to 1. 
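The hunk below changes run() to hand back a tf.keras History object whose history['val_loss'] carries the objective value. A minimal sketch of that return contract, assuming the supervisor side reads the last 'val_loss' entry from history.history (the actual consumer lives in model_runner.py and is not shown in this patch), and assuming a TF 2.x build where History() can be constructed directly, as the patch itself does:

import numpy as np
import tensorflow as tf

def run(params):
    y = 0.5 * np.sin(params["x"])           # stand-in objective value
    h = tf.keras.callbacks.History()
    h.history.setdefault("val_loss", [])    # mirrors the patch below
    h.history["val_loss"] = np.full(2, y)
    return h

history = run({"x": 3.0})
print(history.history["val_loss"][-1])      # what an HPO driver would read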
@@ -64,6 +64,11 @@ def run(params): print("returning training metrics: ", y) + h=tf.keras.callbacks.History() + h.history.setdefault('val_loss') + + h.history['val_loss']=y + return h return { "val_loss": y, } # metrics is used by the supervisor when running diff --git a/workflows/common/sh/set-pythonpath.sh b/workflows/common/sh/set-pythonpath.sh index 43150abc..23833e3d 100644 --- a/workflows/common/sh/set-pythonpath.sh +++ b/workflows/common/sh/set-pythonpath.sh @@ -28,7 +28,7 @@ fi # Set up Supervisor export PYTHONPATH PYTHONPATH+=:$SUPERVISOR/workflows/common/python -PYTHONPATH+=:$SUPERVISOR/workflows/models/OneD +PYTHONPATH+=:$SUPERVISOR/models/OneD PYTHONPATH+=:$SUPERVISOR/workflows/common/ext/EQ-Py # Add known CANDLE Benchmarks to PYTHONPATH diff --git a/workflows/mlrMBO/test/cfg-prm-nightly.sh b/workflows/mlrMBO/test/cfg-prm-nightly.sh index befe3ca0..9678314a 100644 --- a/workflows/mlrMBO/test/cfg-prm-nightly.sh +++ b/workflows/mlrMBO/test/cfg-prm-nightly.sh @@ -32,6 +32,8 @@ elif [ "$MODEL_NAME" = "graphdrp" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_small.R} elif [ "$MODEL_NAME" = "dummy" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/dummy_nightly.R} +elif [ "$MODEL_NAME" = "oned" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/oned.R} elif [[ "${PARAM_SET_FILE:-}" != "" ]]; then PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} else From 16a0f19854882e6862fbf12c7923507b39ee396b Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 19 Jan 2023 20:04:25 -0800 Subject: [PATCH 401/601] o Add R file and fix one baseline run, mlrMBO still has some Error in makeRegrTask --- workflows/mlrMBO/data/oned.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 workflows/mlrMBO/data/oned.R diff --git a/workflows/mlrMBO/data/oned.R b/workflows/mlrMBO/data/oned.R new file mode 100644 index 00000000..1a897bf7 --- /dev/null +++ b/workflows/mlrMBO/data/oned.R @@ -0,0 +1,14 @@ + +# NT3 Hyperparameter Search - Test 1 +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + makeNumericParam("x", lower = 1, upper = 20) +# makeDiscreteParam("conv", values = c("50 50 50 50 50 1", "25 25 25 25 25 1", "64 32 16 32 64 1", "100 100 100 100 100 1", "32 20 16 32 10 1")) + ## DEBUG PARAMETERS: DON'T USE THESE IN PRODUCTION RUN + ## makeDiscreteParam("conv", values = c("32 20 16 32 10 1")) +) From 440ecce03e5dde95636fd949b83ad4b08e897a83 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Mon, 23 Jan 2023 19:23:19 -0800 Subject: [PATCH 402/601] o Working version of oneD, needed to remove impute.y.fun and transformation function for MBOExampleRun, they might need to be reintroduced. --- models/OneD/oned_baseline_keras2.py | 17 ++++++++++++----- workflows/common/R/mlrMBO-mbo.R | 4 +--- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/models/OneD/oned_baseline_keras2.py b/models/OneD/oned_baseline_keras2.py index 17bbfef6..66b755bb 100644 --- a/models/OneD/oned_baseline_keras2.py +++ b/models/OneD/oned_baseline_keras2.py @@ -37,15 +37,20 @@ def initialize_parameters(): import matplotlib.pyplot as plt import tensorflow as tf + def func(x, n=1): # "func" takes in two arguments: "x" and "n", n is set to 1. 
# The function returns a calculation using the input "x" and a default value of "n" equal to 1. # The calculation is a linear combination of three trigonometric functions (sine, cosine) # with the addition of a random normal variable scaled by the input "n". - y = 0.02 * x + 0.5 * np.sin(1 * x + 0.1) + 0.75 * np.cos( - 0.25 * x - 0.3) + n * np.random.normal(0, 0.2, 1) - return y[0] + #y = 0.02 * x + 0.5 * np.sin(1 * x + 0.1) + 0.75 * np.cos( + # 0.25 * x - 0.3) + n * np.random.normal(0, 0.2, 1) + + # remove random part + y = 0.02 * x + 0.5 * np.sin(1 * x + 0.1) + 0.75 * np.cos(0.25 * x - 0.3) + + return y def run(params): @@ -64,10 +69,12 @@ def run(params): print("returning training metrics: ", y) - h=tf.keras.callbacks.History() + h = tf.keras.callbacks.History() h.history.setdefault('val_loss') - h.history['val_loss']=y + y_array = np.ndarray(2) + y_array.fill(y) + h.history['val_loss'] = y_array return h return { "val_loss": y, diff --git a/workflows/common/R/mlrMBO-mbo.R b/workflows/common/R/mlrMBO-mbo.R index 75e607fe..0630eb14 100644 --- a/workflows/common/R/mlrMBO-mbo.R +++ b/workflows/common/R/mlrMBO-mbo.R @@ -77,9 +77,7 @@ se.method = "jackknife", se.boot = 2) ctrl = makeMBOControl(n.objectives = 1, - propose.points = propose.points, - impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax, - trafo.y.fun = makeMBOTrafoFunction('log', log)) + propose.points = propose.points) ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritCB(), opt.restarts = 1, From d6a191fb02d48e463a3044dda6305d08f5e78349 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Mon, 23 Jan 2023 19:36:42 -0800 Subject: [PATCH 403/601] o OneDim problem now works with GA also. --- workflows/GA/data/oned_param_space_ga.json | 9 +++++++++ workflows/GA/test/cfg-prm-1.sh | 2 ++ 2 files changed, 11 insertions(+) create mode 100644 workflows/GA/data/oned_param_space_ga.json diff --git a/workflows/GA/data/oned_param_space_ga.json b/workflows/GA/data/oned_param_space_ga.json new file mode 100644 index 00000000..9c21d17a --- /dev/null +++ b/workflows/GA/data/oned_param_space_ga.json @@ -0,0 +1,9 @@ +[ + { + "name": "x", + "type": "float", + "lower": 0.0, + "upper": 20.0, + "sigma": 0.1 + } +] diff --git a/workflows/GA/test/cfg-prm-1.sh b/workflows/GA/test/cfg-prm-1.sh index c3bf30cd..f902d438 100644 --- a/workflows/GA/test/cfg-prm-1.sh +++ b/workflows/GA/test/cfg-prm-1.sh @@ -27,6 +27,8 @@ elif [ "$MODEL_NAME" = "graphdrp" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_param_space_ga.json} elif [ "$MODEL_NAME" = "tc1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json} +elif [ "$MODEL_NAME" = "oned" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/oned_param_space_ga.json} # TODO: Uncomment when parameter files are available # elif [ "$MODEL_NAME" = "p1b3" ]; then # PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_param_space_ga.json} From 554cbf5b66c72ea6a9d86f72f01f19a1d7676a63 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 24 Jan 2023 12:17:30 -0600 Subject: [PATCH 404/601] Report critical paths --- workflows/GA/swift/workflow.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 6b3c43f2..1729b3f9 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -172,6 +172,7 @@ fi ( set -x +which python swift-t swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p -I $EQPY -r $EQPY \ From 
4a883ff153f73cea65e182eec262871d203515cf Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 24 Jan 2023 13:53:46 -0600 Subject: [PATCH 405/601] o Add python settings --- workflows/common/sh/env-gce.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh index 2bd063c5..99062319 100644 --- a/workflows/common/sh/env-gce.sh +++ b/workflows/common/sh/env-gce.sh @@ -10,6 +10,10 @@ SWIFT=$SFW/swift-t/mpich/2022-11-14-Jenkins PATH=$SWIFT/stc/bin:$PATH +PYTHON=/nfs/gce/globalscratch/jain/conda_installs/ +export PATH=$PYTHON/bin:$PATH +export PYTHONHOME=$PYTHON + echo $SWIFT # Needed for Swift/T+R From 8d33489b86bff13113e49cceab125426790f12ff Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 24 Jan 2023 18:45:29 -0600 Subject: [PATCH 406/601] o bring back impute, still avoid transformation(MBO) --- workflows/common/R/mlrMBO-mbo.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/common/R/mlrMBO-mbo.R b/workflows/common/R/mlrMBO-mbo.R index 0630eb14..9748269c 100644 --- a/workflows/common/R/mlrMBO-mbo.R +++ b/workflows/common/R/mlrMBO-mbo.R @@ -77,7 +77,8 @@ se.method = "jackknife", se.boot = 2) ctrl = makeMBOControl(n.objectives = 1, - propose.points = propose.points) + propose.points = propose.points, + impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax) ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritCB(), opt.restarts = 1, From 4ee1a5951a34f18932eab28146d906b226303f57 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 24 Jan 2023 21:19:47 -0600 Subject: [PATCH 407/601] Merge - start on extracting learning rate (lr) --- workflows/cp-leaveout/scripts/Node.py | 15 ++++++++++++--- workflows/cp-leaveout/scripts/clean-ckpts-run.sh | 9 +++++---- workflows/cp-leaveout/scripts/clean-ckpts.sh | 4 ++-- .../cp-leaveout/scripts/extract-node-info.py | 8 ++++---- .../cp-leaveout/scripts/extract-node-info.sh | 1 - workflows/cp-leaveout/scripts/touch-exps.zsh | 2 ++ 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 24ada79d..0230b133 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -27,6 +27,8 @@ def __init__(self, id=None, logger=None): self.mae = None self.r2 = None self.corr = None + # The final learning rate: + self.lr = None # Differences wrt parent (lower is better) self.loss_delta = None self.val_loss_delta = None @@ -226,9 +228,11 @@ def parse_val_data(self, fp): value_string = tail[:comma] self.val_data = int(value_string) - def parse_error_data(self, fp): - """fp is the file pointer to save/python.log If lines are not found, - node.mse, etc., will remain None.""" + def parse_python_log(self, fp): + """ + fp is the file pointer to save/python.log + If lines are not found, node.mse, etc., will remain None + """ marker = "Comparing y_true " # The marker is just after the date: # We search this way for speed. 
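The new hunk below picks the final learning rate out of lines that Keras writes into save/python.log. A tiny sketch of the line shape this parsing assumes; the timestamp width (date_len), the "Epoch" wording, and the single '=' on the line are assumptions for illustration, not taken from an actual log:

line = "2023-01-24 21:19:47 Epoch 12/50: ... lr=0.00075"
date_len = len("2023-01-24 21:19:47 ")
if line.startswith("Epoch ", date_len) and "lr=" in line:
    lr = float(line.split("=")[1])   # token after the only '=' on the line
    print(lr)                        # 0.00075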
@@ -237,6 +241,11 @@
             line = fp.readline()
             if line == "": break
+            if line.startswith("Epoch ", date_len) and \
+               "lr=" in line:
+                tokens = line.split("=")
+                self.lr = float(tokens[1])
+                print("%s lr=%0.3f" % (self.id, self.lr))
             if line.startswith(marker, date_len):
                 line = fp.readline()
                 tokens = check_token(line, 2, "mse:")
diff --git a/workflows/cp-leaveout/scripts/clean-ckpts-run.sh b/workflows/cp-leaveout/scripts/clean-ckpts-run.sh
index 68085bc7..6a80b36b 100755
--- a/workflows/cp-leaveout/scripts/clean-ckpts-run.sh
+++ b/workflows/cp-leaveout/scripts/clean-ckpts-run.sh
@@ -19,19 +19,20 @@ then
   exit 1
 fi
 
+echo "RUN: $DIR"
+
 if ! [[ -d $DIR/save/ckpts/epochs ]]
 then
+  echo "No epochs directory."
   exit
 fi
 
 cd $DIR/save/ckpts/epochs
-
+set -x
 MODELS=( $( ls ) )
 
-echo ${MODELS[@]}
-
 N=${#MODELS[@]}
-echo $N
+echo "MODELS: $N"
 
 # Do not clean the last 3 models
 for (( i=0 ; i<$N-3 ; i++ ))
diff --git a/workflows/cp-leaveout/scripts/clean-ckpts.sh b/workflows/cp-leaveout/scripts/clean-ckpts.sh
index 4721f4a7..c4449b76 100755
--- a/workflows/cp-leaveout/scripts/clean-ckpts.sh
+++ b/workflows/cp-leaveout/scripts/clean-ckpts.sh
@@ -18,10 +18,10 @@ then
   exit 1
 fi
 
-RUNS=( $( echo $DIR/run/* ) )
+RUNS=( $( echo $DIR/run/1.1.1.3.1.1 ) )
 
 for RUN in ${RUNS[@]}
 do
-  set -x
   $THIS/clean-ckpts-run.sh $RUN
+  echo
 done
diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py
index ac1a51d4..21540d51 100644
--- a/workflows/cp-leaveout/scripts/extract-node-info.py
+++ b/workflows/cp-leaveout/scripts/extract-node-info.py
@@ -130,9 +130,9 @@ def parse_log(log_fp, nodes):
             node_current.stop_early()
     if node_current is not None and node_current.complete:
         # Store a complete Node in global dict nodes
-        logger.debug("node done.")
+        logger.info("node done.")
         # find_val_data(node_current)  # old format?
- find_error_data(node_current) + parse_python_log(node_current) nodes_found += 1 node_current = None @@ -144,7 +144,7 @@ def parse_build_df(line, logger=None): assert len(tokens) == 6 global build_df build_df = float(tokens[4]) - logger.info("build_df: %0.2f" % build_df) + # logger.info("build_df: %0.2f" % build_df) return build_df @@ -167,7 +167,7 @@ def find_error_data(node): if not os.path.exists(python_log): return with open(python_log) as fp: - node.parse_error_data(fp) + node.parse_python_log(fp) if node.mse is None: logger.fatal("Could not find error data for node: " + node.id) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.sh b/workflows/cp-leaveout/scripts/extract-node-info.sh index d291a47e..103ab8de 100755 --- a/workflows/cp-leaveout/scripts/extract-node-info.sh +++ b/workflows/cp-leaveout/scripts/extract-node-info.sh @@ -20,7 +20,6 @@ then exit 1 fi - # Put all matching file names in this file, one per line # (this could contain thousands of entries, too long for command line): LOG_LIST=$DIR/log-list.txt diff --git a/workflows/cp-leaveout/scripts/touch-exps.zsh b/workflows/cp-leaveout/scripts/touch-exps.zsh index 306fbf5f..ef7d54dc 100755 --- a/workflows/cp-leaveout/scripts/touch-exps.zsh +++ b/workflows/cp-leaveout/scripts/touch-exps.zsh @@ -11,6 +11,8 @@ A=( 750 744 759 763 + 838 + 839 ) { From f29130252f921de00caccedfac7249068f4a0784 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 7 Feb 2023 14:16:39 -0600 Subject: [PATCH 408/601] Merge in branch jpg_crusher --- workflows/common/python/model_runner.py | 17 +- workflows/common/sh/env-summit.sh | 10 +- workflows/common/swift/obj_py.swift | 1 + workflows/cp-leaveout/scripts/clean-top21.py | 92 +++++++ workflows/cp-leaveout/scripts/touch-exps.zsh | 12 + workflows/cp-leaveout/swift/workflow.sh | 2 +- workflows/cp-leaveout/test/test-512.sh | 11 +- workflows/upf/swift/workflow.sh | 1 + workflows/upf/swift/workflow.swift | 4 + workflows/upf/test/upf-1.txt | 259 +++++++++++++++++++ 10 files changed, 398 insertions(+), 11 deletions(-) create mode 100644 workflows/cp-leaveout/scripts/clean-top21.py diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 2c4e2708..8797ba7d 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -8,7 +8,8 @@ import os import sys import time - +import traceback +import importlib import runner_utils from log_tools import * from runner_utils import ModelResult @@ -141,8 +142,10 @@ def run(hyper_parameter_map, obj_return): with open(directory + "/rank.txt", "w") as fp: fp.write(str(os.getenv("ADLB_RANK_SELF")) + "\n") - framework = hyper_parameter_map["framework"] - model_name = hyper_parameter_map["model_name"] + framework = hyper_parameter_map['framework'] + print("framework: " + str(framework)) + sys.stdout.flush() + model_name = hyper_parameter_map['model_name'] pkg = import_pkg(framework, model_name) runner_utils.format_params(hyper_parameter_map) @@ -173,8 +176,16 @@ def run(hyper_parameter_map, obj_return): except Exception as e: logger.warn("RUN EXCEPTION: " + str(e)) print("RUN EXCEPTION: " + str(e)) + info = sys.exc_info() + s = traceback.format_tb(info[2]) + sys.stdout.write('\\n\\nEXCEPTION in model run(): \\n' + + repr(e) + ' ... 
\\n' + ''.join(s)) + sys.stdout.write('\\n') + sys.stdout.flush() + # logger.warn("Caught InvalidArgumentError") exception = True + exit(1) log("PKG RUN STOP") if framework == "keras": diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index 85e6d2a7..a31dcf79 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -2,7 +2,7 @@ # ENV Summit # SWIFT_IMPL=echo -SWIFT_IMPL=app +SWIFT_IMPL=py # Let modules initialize LD_LIBRARY_PATH before changing it: set +eu # modules create errors outside our control @@ -14,7 +14,8 @@ set -eu # From Wozniak MED106=/gpfs/alpine/world-shared/med106 ROOT=$MED106/sw/summit/gcc-7.5.0 -SWIFT=$ROOT/swift-t/2022-04-12 +# SWIFT=$ROOT/swift-t/2022-07-25 # Works +SWIFT=$ROOT/swift-t/m39-2022-09-27 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH @@ -24,8 +25,9 @@ R=$ROOT/R/4.1.3/lib64/R LD_LIBRARY_PATH+=:$R/lib # PY=/gpfs/alpine/world-shared/med106/sw/conda/2021-10-06/envs/CANDLE-2021-10-06 -#PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0 -PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37/ +# PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0 +# PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37/ +PY=/gpfs/alpine/world-shared/med106/sw/conda/m-39-2022-09-15 LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY PATH=$PY/bin:$PATH diff --git a/workflows/common/swift/obj_py.swift b/workflows/common/swift/obj_py.swift index d4a53c62..61791f20 100644 --- a/workflows/common/swift/obj_py.swift +++ b/workflows/common/swift/obj_py.swift @@ -25,6 +25,7 @@ try: J = """%s""" hyper_parameter_map = json.loads(J) hyper_parameter_map['framework'] = 'keras' + hyper_parameter_map['framework'] = 'keras' hyper_parameter_map['save'] = '{}/output'.format(outdir) hyper_parameter_map['instance_directory'] = outdir hyper_parameter_map['model_name'] = '%s' diff --git a/workflows/cp-leaveout/scripts/clean-top21.py b/workflows/cp-leaveout/scripts/clean-top21.py new file mode 100644 index 00000000..c0f41b59 --- /dev/null +++ b/workflows/cp-leaveout/scripts/clean-top21.py @@ -0,0 +1,92 @@ + +# CLEAN TOP21 +# Cleans the top21 file so only LINCS records are present +# File names are hard-coded but easy to change + +import logging + +logger = logging.getLogger("clean-top21") +logger.setLevel(logging.DEBUG) +ch = logging.StreamHandler() +ch.setLevel(logging.DEBUG) +formatter = logging.Formatter("%(asctime)s %(message)s", + datefmt="%H:%M:%S") +ch.setFormatter(formatter) +logger.addHandler(ch) +logger.info("Start") + +import pandas as pd + +logger.info("Pandas") + +SCRATCH = "/gpfs/alpine/med106/scratch/wozniak" +CANDLE_DATA = SCRATCH + "/CANDLE-Data/ChallengeProblem" + +# The original data from Yoo: +original = CANDLE_DATA + "/top21_2020Jul/top21.h5" +lincs1000 = CANDLE_DATA + "/top21_2020Jul/lincs1000" + +# The file we are creating here: +output = CANDLE_DATA + "/top21_2020Jul/top21-cleaned-dd.h5" + +# List of names in LINCS: +lincs = [] +with open(lincs1000, "r") as fp: + while True: + line = fp.readline() + if len(line) == 0: break + lincs.append(line.strip()) + +logger.info("lincs length: %i" % len(lincs)) + +store_in = pd.HDFStore(original, "r") +df = store_in.get("df") + +logger.info("HDF Opened.") + +columns = df.columns.to_list() +logger.info("df columns original: %i" % len(columns)) + +# List of dataframe column names to delete: +delete_these = [] + +count_key = 0 +count_GE_N = 0 +count_GE_Y = 0 +count_DD = 0 +count_other = 0 +for column in columns: + if column.startswith("GE_"): + 
# print("GE " + column) + substring = column[3:] + if substring in lincs: + count_GE_Y += 1 + else: + count_GE_N += 1 + delete_these.append(column) + elif column.startswith("DD_"): + # print("DD " + column) + count_DD += 1 + # delete_these.append(column) + elif column == "AUC" or column == "DRUG" or column == "CELL": + count_key += 1 + else: + print("NO '%s'" % column) + count_other += 1 + +print("count_key: %i" % count_key) +print("count_GE_Y: %i" % count_GE_Y) +print("count_GE_N: %i" % count_GE_N) +print("count_DD: %i" % count_DD) +print("count_other: %i" % count_other) + +logger.info("Scanned.") +logger.info("delete_these: %i" % len(delete_these)) +df.drop(columns=delete_these, inplace=True) +logger.info("df columns after: %i" % len(df.columns.to_list())) + +logger.info("Dropped.") + +df.to_hdf(output, key="df", mode="w") + +logger.info("Wrote.") diff --git a/workflows/cp-leaveout/scripts/touch-exps.zsh b/workflows/cp-leaveout/scripts/touch-exps.zsh index ef7d54dc..8cda3d84 100755 --- a/workflows/cp-leaveout/scripts/touch-exps.zsh +++ b/workflows/cp-leaveout/scripts/touch-exps.zsh @@ -3,6 +3,7 @@ set -eu which python +<<<<<<< Updated upstream A=( 750 746 757 @@ -11,6 +12,17 @@ A=( 750 744 759 763 +======= +A=( # 750 + # 746 + # 757 + # 771 + # 743 + # 744 + # 759 + # 763 + # 828 +>>>>>>> Stashed changes 838 839 ) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 62432c65..b9b4e864 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -175,7 +175,7 @@ then : fi -# which python swift-t java +which python swift-t java if [[ ${MACHINE:-} == "" ]] then diff --git a/workflows/cp-leaveout/test/test-512.sh b/workflows/cp-leaveout/test/test-512.sh index 1b9ef3b4..4e12307f 100755 --- a/workflows/cp-leaveout/test/test-512.sh +++ b/workflows/cp-leaveout/test/test-512.sh @@ -46,10 +46,15 @@ export CFG_PRM=$THIS/cfg-prm-1.sh SCRATCH=/gpfs/alpine/med106/scratch/wozniak # SCRATCH=/usb2/wozniak # CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 -CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem -PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json +CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem/top21_2020Jul +# CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem/old +# PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv -DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.hdf5 +PLAN_JSON=$CANDLE_DATA/plangen_cell703-p4_drug1492-p1.json # NEW 2022-07 +# DATAFRAME_CSV=$CANDLE_DATA/top21.h5 # 2022-07 +DATAFRAME_CSV=$CANDLE_DATA/top21-cleaned-dd.h5 # NEW 2022-10 # BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno # BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno BENCHMARK_DATA=$CANDLE_DATA diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 75dbc32c..1ee5e952 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -92,6 +92,7 @@ swift-t -n $PROCS \ -e BENCHMARKS_ROOT \ -e EMEWS_PROJECT_ROOT \ -e MODEL_SH \ + -e FI_MR_CACHE_MAX_COUNT=0 \ -e SITE \ -e BENCHMARK_TIMEOUT \ -e MODEL_NAME=${MODEL_NAME:-MODEL_NULL} \ diff --git a/workflows/upf/swift/workflow.swift b/workflows/upf/swift/workflow.swift index 09b36ef9..a9c79060 100644 --- a/workflows/upf/swift/workflow.swift +++ 
b/workflows/upf/swift/workflow.swift @@ -17,7 +17,11 @@ report_env(); string FRAMEWORK = "keras"; // Scan command line +<<<<<<< Updated upstream file upf = input(argv("f")); +======= +file upf = input(argv("f")); +>>>>>>> Stashed changes int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); string model_name = getenv("MODEL_NAME"); diff --git a/workflows/upf/test/upf-1.txt b/workflows/upf/test/upf-1.txt index 919b1d07..a055d8c7 100644 --- a/workflows/upf/test/upf-1.txt +++ b/workflows/upf/test/upf-1.txt @@ -1,3 +1,262 @@ +<<<<<<< Updated upstream {"id": "RUN000", "epochs": 1} {"id": "RUN001", "epochs": 2} {"id": "RUN002", "epochs": 3} +======= +{"id": "test1", "epochs": 2} +{"id": "test2", "epochs": 2} +{"id": "test3", "epochs": 2} +{"id": "test4", "epochs": 2} +{"id": "test5", "epochs": 2} +{"id": "test6", "epochs": 2} +{"id": "test7", "epochs": 2} +{"id": "test8", "epochs": 2} +{"id": "test9", "epochs": 2} +{"id": "test11", "epochs": 2} +{"id": "test12", "epochs": 2} +{"id": "test13", "epochs": 2} +{"id": "test14", "epochs": 2} +{"id": "test15", "epochs": 2} +{"id": "test16", "epochs": 2} +{"id": "test17", "epochs": 2} +{"id": "test18", "epochs": 2} +{"id": "test19", "epochs": 2} +{"id": "test31", "epochs": 2} +{"id": "test32", "epochs": 2} +{"id": "test33", "epochs": 2} +{"id": "test34", "epochs": 2} +{"id": "test35", "epochs": 2} +{"id": "test36", "epochs": 2} +{"id": "test37", "epochs": 2} +{"id": "test38", "epochs": 2} +{"id": "test39", "epochs": 2} +{"id": "test111", "epochs": 2} +{"id": "test112", "epochs": 2} +{"id": "test113", "epochs": 2} +{"id": "test114", "epochs": 2} +{"id": "test115", "epochs": 2} +{"id": "test116", "epochs": 2} +{"id": "test117", "epochs": 2} +{"id": "test118", "epochs": 2} +{"id": "test119", "epochs": 2} +{"id": "test21", "epochs": 2} +{"id": "test22", "epochs": 2} +{"id": "test23", "epochs": 2} +{"id": "test24", "epochs": 2} +{"id": "test25", "epochs": 2} +{"id": "test26", "epochs": 2} +{"id": "test27", "epochs": 2} +{"id": "test28", "epochs": 2} +{"id": "test29", "epochs": 2} +{"id": "test211", "epochs": 2} +{"id": "test212", "epochs": 2} +{"id": "test213", "epochs": 2} +{"id": "test214", "epochs": 2} +{"id": "test215", "epochs": 2} +{"id": "test216", "epochs": 2} +{"id": "test217", "epochs": 2} +{"id": "test218", "epochs": 2} +{"id": "test219", "epochs": 2} +{"id": "test51", "epochs": 2} +{"id": "test52", "epochs": 2} +{"id": "test53", "epochs": 2} +{"id": "test54", "epochs": 2} +{"id": "test55", "epochs": 2} +{"id": "test56", "epochs": 2} +{"id": "test57", "epochs": 2} +{"id": "test58", "epochs": 2} +{"id": "test59", "epochs": 2} +{"id": "test511", "epochs": 2} +{"id": "test512", "epochs": 2} +{"id": "test513", "epochs": 2} +{"id": "test514", "epochs": 2} +{"id": "test515", "epochs": 2} +{"id": "test516", "epochs": 2} +{"id": "test517", "epochs": 2} +{"id": "test518", "epochs": 2} +{"id": "test519", "epochs": 2} +{"id": "test531", "epochs": 2} +{"id": "test532", "epochs": 2} +{"id": "test533", "epochs": 2} +{"id": "test534", "epochs": 2} +{"id": "test535", "epochs": 2} +{"id": "test536", "epochs": 2} +{"id": "test537", "epochs": 2} +{"id": "test538", "epochs": 2} +{"id": "test539", "epochs": 2} +{"id": "test5111", "epochs": 2} +{"id": "test5112", "epochs": 2} +{"id": "test5113", "epochs": 2} +{"id": "test5114", "epochs": 2} +{"id": "test5115", "epochs": 2} +{"id": "test5116", "epochs": 2} +{"id": "test5117", "epochs": 2} +{"id": "test5118", "epochs": 2} +{"id": "test5119", "epochs": 2} +{"id": "test521", "epochs": 2} +{"id": "test522", 
"epochs": 2} +{"id": "test523", "epochs": 2} +{"id": "test524", "epochs": 2} +{"id": "test525", "epochs": 2} +{"id": "test526", "epochs": 2} +{"id": "test527", "epochs": 2} +{"id": "test528", "epochs": 2} +{"id": "test529", "epochs": 2} +{"id": "test5211", "epochs": 2} +{"id": "test5212", "epochs": 2} +{"id": "test5213", "epochs": 2} +{"id": "test5214", "epochs": 2} +{"id": "test5215", "epochs": 2} +{"id": "test5216", "epochs": 2} +{"id": "test5217", "epochs": 2} +{"id": "test5218", "epochs": 2} +{"id": "test5219", "epochs": 2} +{"id": "test6211", "epochs": 2} +{"id": "test6212", "epochs": 2} +{"id": "test6213", "epochs": 2} +{"id": "test6214", "epochs": 2} +{"id": "test6215", "epochs": 2} +{"id": "test6216", "epochs": 2} +{"id": "test6217", "epochs": 2} +{"id": "test6218", "epochs": 2} +{"id": "test6219", "epochs": 2} +{"id": "test7211", "epochs": 2} +{"id": "test7212", "epochs": 2} +{"id": "test7213", "epochs": 2} +{"id": "test7214", "epochs": 2} +{"id": "test7215", "epochs": 2} +{"id": "test7216", "epochs": 2} +{"id": "test7217", "epochs": 2} +{"id": "test7218", "epochs": 2} +{"id": "test7219", "epochs": 2} +{"id": "test8218", "epochs": 2} +{"id": "test8219", "epochs": 2} +{"id": "test91", "epochs": 2} +{"id": "test92", "epochs": 2} +{"id": "test93", "epochs": 2} +{"id": "test94", "epochs": 2} +{"id": "test95", "epochs": 2} +{"id": "test96", "epochs": 2} +{"id": "test97", "epochs": 2} +{"id": "test98", "epochs": 2} +{"id": "test99", "epochs": 2} +{"id": "test911", "epochs": 2} +{"id": "test912", "epochs": 2} +{"id": "test913", "epochs": 2} +{"id": "test914", "epochs": 2} +{"id": "test915", "epochs": 2} +{"id": "test916", "epochs": 2} +{"id": "test917", "epochs": 2} +{"id": "test918", "epochs": 2} +{"id": "test919", "epochs": 2} +{"id": "test931", "epochs": 2} +{"id": "test932", "epochs": 2} +{"id": "test933", "epochs": 2} +{"id": "test934", "epochs": 2} +{"id": "test935", "epochs": 2} +{"id": "test936", "epochs": 2} +{"id": "test937", "epochs": 2} +{"id": "test938", "epochs": 2} +{"id": "test939", "epochs": 2} +{"id": "test9111", "epochs": 2} +{"id": "test9112", "epochs": 2} +{"id": "test9113", "epochs": 2} +{"id": "test9114", "epochs": 2} +{"id": "test9115", "epochs": 2} +{"id": "test9116", "epochs": 2} +{"id": "test9117", "epochs": 2} +{"id": "test9118", "epochs": 2} +{"id": "test9119", "epochs": 2} +{"id": "test921", "epochs": 2} +{"id": "test922", "epochs": 2} +{"id": "test923", "epochs": 2} +{"id": "test924", "epochs": 2} +{"id": "test925", "epochs": 2} +{"id": "test926", "epochs": 2} +{"id": "test927", "epochs": 2} +{"id": "test928", "epochs": 2} +{"id": "test929", "epochs": 2} +{"id": "test9211", "epochs": 2} +{"id": "test9212", "epochs": 2} +{"id": "test9213", "epochs": 2} +{"id": "test9214", "epochs": 2} +{"id": "test9215", "epochs": 2} +{"id": "test9216", "epochs": 2} +{"id": "test9217", "epochs": 2} +{"id": "test9218", "epochs": 2} +{"id": "test9219", "epochs": 2} +{"id": "test951", "epochs": 2} +{"id": "test952", "epochs": 2} +{"id": "test953", "epochs": 2} +{"id": "test954", "epochs": 2} +{"id": "test955", "epochs": 2} +{"id": "test956", "epochs": 2} +{"id": "test957", "epochs": 2} +{"id": "test958", "epochs": 2} +{"id": "test959", "epochs": 2} +{"id": "test9511", "epochs": 2} +{"id": "test9512", "epochs": 2} +{"id": "test9513", "epochs": 2} +{"id": "test9514", "epochs": 2} +{"id": "test9515", "epochs": 2} +{"id": "test9516", "epochs": 2} +{"id": "test9517", "epochs": 2} +{"id": "test9518", "epochs": 2} +{"id": "test9519", "epochs": 2} +{"id": "test9531", "epochs": 2} 
+{"id": "test9532", "epochs": 2} +{"id": "test9533", "epochs": 2} +{"id": "test9534", "epochs": 2} +{"id": "test9535", "epochs": 2} +{"id": "test9536", "epochs": 2} +{"id": "test9537", "epochs": 2} +{"id": "test9538", "epochs": 2} +{"id": "test9539", "epochs": 2} +{"id": "test95111", "epochs": 2} +{"id": "test95112", "epochs": 2} +{"id": "test95113", "epochs": 2} +{"id": "test95114", "epochs": 2} +{"id": "test95115", "epochs": 2} +{"id": "test95116", "epochs": 2} +{"id": "test95117", "epochs": 2} +{"id": "test95118", "epochs": 2} +{"id": "test95119", "epochs": 2} +{"id": "test9521", "epochs": 2} +{"id": "test9522", "epochs": 2} +{"id": "test9523", "epochs": 2} +{"id": "test9524", "epochs": 2} +{"id": "test9525", "epochs": 2} +{"id": "test9526", "epochs": 2} +{"id": "test9527", "epochs": 2} +{"id": "test9528", "epochs": 2} +{"id": "test9529", "epochs": 2} +{"id": "test95211", "epochs": 2} +{"id": "test95212", "epochs": 2} +{"id": "test95213", "epochs": 2} +{"id": "test95214", "epochs": 2} +{"id": "test95215", "epochs": 2} +{"id": "test95216", "epochs": 2} +{"id": "test95217", "epochs": 2} +{"id": "test95218", "epochs": 2} +{"id": "test95219", "epochs": 2} +{"id": "test96211", "epochs": 2} +{"id": "test96212", "epochs": 2} +{"id": "test96213", "epochs": 2} +{"id": "test96214", "epochs": 2} +{"id": "test96215", "epochs": 2} +{"id": "test96216", "epochs": 2} +{"id": "test96217", "epochs": 2} +{"id": "test96218", "epochs": 2} +{"id": "test96219", "epochs": 2} +{"id": "test97211", "epochs": 2} +{"id": "test97212", "epochs": 2} +{"id": "test97213", "epochs": 2} +{"id": "test97214", "epochs": 2} +{"id": "test97215", "epochs": 2} +{"id": "test97216", "epochs": 2} +{"id": "test97217", "epochs": 2} +{"id": "test97218", "epochs": 2} +{"id": "test97219", "epochs": 2} +{"id": "test98218", "epochs": 2} +{"id": "test98219", "epochs": 2} +>>>>>>> Stashed changes From 931059da2e558bd8c16e50e06f4417cdcda8fa08 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 8 Feb 2023 13:21:01 -0600 Subject: [PATCH 409/601] Handle errors in workflow.sh parameters --- workflows/upf/swift/workflow.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 75dbc32c..bc005e44 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -24,10 +24,14 @@ then fi if ! 
{ - get_site $1 # Sets SITE - get_expid $2 "SINGULARITY" # Sets EXPID, TURBINE_OUTPUT - get_cfg_sys $3 # Sets CFG_SYS - UPF=$4 # The JSON hyperparameter file + # Sets SITE + # Sets EXPID, TURBINE_OUTPUT + # Sets CFG_SYS + # UPF is the JSON hyperparameter file + get_site $1 && \ + get_expid $2 "SINGULARITY" && \ + get_cfg_sys $3 && \ + UPF=$4 } then usage From a031e6f27e643aaf72c6a951901ba8a57bf14378 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 8 Feb 2023 13:22:11 -0600 Subject: [PATCH 410/601] Clean up comments --- workflows/upf/swift/workflow.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index bc005e44..20693bab 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -59,8 +59,7 @@ export BENCHMARK_TIMEOUT CMD_LINE_ARGS=( -expid=$EXPID -benchmark_timeout=$BENCHMARK_TIMEOUT - -f=$UPF # ALW: keeping it as $UPF to allow $UPF to be a full path - #-f=$TURBINE_OUTPUT/$UPF # Copied to TURBINE_OUTPUT below + -f=$UPF ) USER_VARS=( $CMD_LINE_ARGS ) From 67a70da18d5ac8814f79d785352dada6c1a51f06 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 8 Feb 2023 13:22:28 -0600 Subject: [PATCH 411/601] Add comment about additional env variables --- workflows/upf/swift/workflow.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 20693bab..ac3b6373 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -108,5 +108,7 @@ swift-t -n $PROCS \ -e CANDLE_IMAGE \ $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} +# Can provide this to debug Python settings: # -e PYTHONVERBOSE=1 -# -e PATH=$PATH +# Can provide this if needed to reset PATH: +# -e PATH=$PATH From 7193674597e8ab9793f45638f54f594ab0dcc18e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 8 Feb 2023 13:27:36 -0600 Subject: [PATCH 412/601] Don't run UPF in mode for SINGULARITY by default --- workflows/upf/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index ac3b6373..7bff1ea9 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -29,7 +29,7 @@ if ! { # Sets CFG_SYS # UPF is the JSON hyperparameter file get_site $1 && \ - get_expid $2 "SINGULARITY" && \ + get_expid $2 && \ get_cfg_sys $3 && \ UPF=$4 } From 574acad8155c3b41c9f6d274e3fb3bb6180a7ab0 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 8 Feb 2023 13:45:17 -0600 Subject: [PATCH 413/601] Attempt to fix error capture --- workflows/mlrMBO/swift/workflow.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 21123a95..f125ac38 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -192,9 +192,8 @@ swift-t -O 0 -n $PROCS \ -e CANDLE_MODEL_TYPE \ -e CANDLE_IMAGE \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} ) |& \ tee $STDOUT -) if (( ${PIPESTATUS[0]} )) then From 1dcb5cd98ca476e83ec5fa2d8c0ce18dcc622477 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 13 Feb 2023 14:03:36 -0600 Subject: [PATCH 414/601] Add more comments, etc. 
--- scripts/shrink-log-single.sh | 9 +++++++-- scripts/shrink-log.mk | 4 ++-- scripts/shrink-log.py | 11 ++++++++--- scripts/shrink-logs.sh | 5 +++-- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/scripts/shrink-log-single.sh b/scripts/shrink-log-single.sh index 9d1284c1..26b980eb 100755 --- a/scripts/shrink-log-single.sh +++ b/scripts/shrink-log-single.sh @@ -7,9 +7,10 @@ set -eu INPUT=$1 OUTPUT=$2 -NAME=$( basename --suffix=.log $INPUT ) +NAME=$( basename --suffix=.txt $INPUT ) -T=${INPUT/$NAME/tr} +# Temp file for tr output: +T=$( mktemp --tmpdir=$TMP_SHRINK --suffix .txt tr-XXX ) if [ $INPUT == $T ] then @@ -22,7 +23,11 @@ then THIS=$( readlink --canonicalize $( dirname $0 ) ) fi +# This converts the TensorFlow line overwrite behavior to +# normal newlines: tr "\r" "\n" < $INPUT > $T + +# Does the log parsing and shrinking: python $THIS/shrink-log.py $T $OUTPUT rm $T diff --git a/scripts/shrink-log.mk b/scripts/shrink-log.mk index ccb40fc0..b2294d21 100644 --- a/scripts/shrink-log.mk +++ b/scripts/shrink-log.mk @@ -1,11 +1,11 @@ .DELETE_ON_ERROR: -OUTS = $(wildcard out-*.log) +OUTS = $(wildcard out-*.txt) SUMMARIES = $(subst out-,summary-,$(OUTS)) all: $(SUMMARIES) -summary-%.log: out-%.log +summary-%.txt: out-%.txt @ ${THIS}/shrink-log-single.sh $(<) $(@) diff --git a/scripts/shrink-log.py b/scripts/shrink-log.py index 74e64900..e535f558 100644 --- a/scripts/shrink-log.py +++ b/scripts/shrink-log.py @@ -1,7 +1,7 @@ # SHRINK LOG PY -# argv: 2 filenames : tr-*.log and summary-*.log +# argv: 2 filenames : tr file and summary-*.txt # Called by shrink-log-single.sh -# The tr-*.log file should have used tr to change CR to NL +# The tr file should have used tr to change CR to NL # Removes non-printing characters (backspace) # Reduces the number of training lines in output # Removes redundant batch size information @@ -15,7 +15,7 @@ from collections import deque # Only 1/shrink_factor training lines are copied -shrink_factor = 200 +shrink_factor = 100 # Number of additional consecutive lines at beginning and end of # training that are retained hold_space = 3 @@ -32,6 +32,7 @@ def shrink(fp_in, fp_out): continue # Blank line line = line.replace("\b", "") if "batch:" in line or "Current" in line: + # Found a training line line = re.sub("- batch: .* 32.0000 -", "", line) line = line.replace("Current", "\nCurrent") if starts < hold_space: @@ -46,10 +47,14 @@ def shrink(fp_in, fp_out): fp_out.write(line) else: starts = 0 + # Found a non-training line + # Flush the Q: while len(Q) > 0: fp_out.write(Q.popleft()) if line == line_previous: + # Discard redundant lines continue + # Good line: write it fp_out.write(line) line_previous = line # Done: flush Q: diff --git a/scripts/shrink-logs.sh b/scripts/shrink-logs.sh index 2d9a8207..440434b8 100755 --- a/scripts/shrink-logs.sh +++ b/scripts/shrink-logs.sh @@ -25,7 +25,8 @@ then fi # This is used inside the Makefile below: -mkdir -pv /tmp/$USER/shrink +export TMP_SHRINK=/tmp/$USER/shrink +mkdir -pv $TMP_SHRINK cd $DIR -nice -n 19 make -j 1 -f $THIS/shrink-log.mk +nice -n 19 make -j 4 -f $THIS/shrink-log.mk From 1c565b0f88f60ebd6713e076f36b08941264912c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 13 Feb 2023 14:04:59 -0600 Subject: [PATCH 415/601] Better log parsing --- workflows/cp-leaveout/scripts/Node.py | 18 ++++++++++++------ workflows/cp-leaveout/scripts/README.adoc | 19 +++++++++++++++++++ .../cp-leaveout/scripts/extract-node-info.py | 2 +- .../cp-leaveout/scripts/extract-node-info.sh | 4 ++-- 4 files changed, 34 
insertions(+), 9 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 0230b133..b7eb8054 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -27,8 +27,10 @@ def __init__(self, id=None, logger=None): self.mae = None self.r2 = None self.corr = None + # The first learning rate: + self.lr_first = None # The final learning rate: - self.lr = None + self.lr_final = None # Differences wrt parent (lower is better) self.loss_delta = None self.val_loss_delta = None @@ -241,16 +243,20 @@ def parse_python_log(self, fp): line = fp.readline() if line == "": break - if line.startswith("Epoch ", date_len) and + if line.startswith("Epoch ", date_len) and \ "lr=" in line: - tokens = line.split("=") - self.lr = float(tokens[1]) - print("%s lr=%0.3f" % (self.id, self.lr)) + tokens = line.split("=") + lr = float(tokens[1]) + # print("%s lr=%0.6f" % (self.id, lr)) + if self.lr_first is None: + self.lr_first = lr + else: + self.lr_final = lr if line.startswith(marker, date_len): line = fp.readline() tokens = check_token(line, 2, "mse:") self.mse = float(tokens[3]) - print("mse: " + str(self.mse)) + # print("mse: " + str(self.mse)) line = fp.readline() tokens = check_token(line, 2, "mae:") self.mae = float(tokens[3]) diff --git a/workflows/cp-leaveout/scripts/README.adoc b/workflows/cp-leaveout/scripts/README.adoc index bf827762..40a586f9 100644 --- a/workflows/cp-leaveout/scripts/README.adoc +++ b/workflows/cp-leaveout/scripts/README.adoc @@ -85,6 +85,25 @@ Analytically determine the number of Nodes in the workflow given N and S. Compile workflow statistics +==== Report learning rates + +Dump start and end learning rates into `lrs.txt` + +---- +$ scripts/report-lrs.sh $D > $D/lrs.txt +---- + +=== Data management + +==== mk-log-tar.sh + +Make a tarball of just the important logs (not the big HDF files). + +==== cp-subtree.sh + +Make a copy of experiment run subtrees; includes a random sample of +leaf nodes and all their parents. 
+ === Analysis for model.log files These are not really supported for Summit runs diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 21540d51..ebd58f7e 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -162,7 +162,7 @@ def trace(message): # logger.fatal("Could not find val data for node: " + node.id) -def find_error_data(node): +def parse_python_log(node): python_log = args.directory + "/run/%s/save/python.log" % node.id if not os.path.exists(python_log): return diff --git a/workflows/cp-leaveout/scripts/extract-node-info.sh b/workflows/cp-leaveout/scripts/extract-node-info.sh index 103ab8de..95661a08 100755 --- a/workflows/cp-leaveout/scripts/extract-node-info.sh +++ b/workflows/cp-leaveout/scripts/extract-node-info.sh @@ -29,9 +29,9 @@ RESTARTS=( $DIR/restarts/* ) for RESTART in ${RESTARTS[@]} do - $THIS/shrink-output.sh $RESTART + $SUPERVISOR/scripts/shrink-logs.sh $RESTART done -$THIS/shrink-output.sh $DIR +$SUPERVISOR/scripts/shrink-logs.sh $DIR { for RESTART in ${RESTARTS[@]} From 5e9e6bcba6808db7bfe8b9977dfd8c5ea72c3b20 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Feb 2023 14:06:25 -0600 Subject: [PATCH 416/601] WIP Comparator --- models/Comparator/cmp_baseline_keras2.py | 71 +++++++++++++++--------- models/Comparator/cmp_default_model.txt | 2 +- 2 files changed, 45 insertions(+), 28 deletions(-) diff --git a/models/Comparator/cmp_baseline_keras2.py b/models/Comparator/cmp_baseline_keras2.py index 957727d3..6eba5003 100644 --- a/models/Comparator/cmp_baseline_keras2.py +++ b/models/Comparator/cmp_baseline_keras2.py @@ -30,42 +30,59 @@ def run(gParameters): print("file_path: %s" % file_path) output_dir = gParameters["output_dir"] expid = gParameters["experiment_id"] + runid = gParameters["run_id"] supervisor = Path(file_path).absolute().parent.parent workflows = supervisor / "workflows" - model_sh = workflows / "common" / "sh" / "model.sh" print(model_sh) os.chdir(output_dir) - env = { - "WORKFLOWS_ROOT": str(workflows), - "TURBINE_OUTPUT": output_dir, - "EXPID": expid, - "SITE": "lambda", - "OBJ_RETURN": "loss", - "BENCHMARK_TIMEOUT": "120", - "MODEL_NAME": gParameters["model1"], - "CANDLE_MODEL_TYPE": "SINGULARITY", - "CANDLE_DATA_DIR": os.getenv("CANDLE_DATA_DIR"), - "ADLB_RANK_OFFSET": "0", - "CANDLE_IMAGE": "/software/improve/images/GraphDRP.sif" - } + cmd = make_cmd(str(workflows), expid, runid) + run_dir = Path(os.getenv("CANDLE_DATA_DIR")) \ + / model1 / "Output" / expid / runid print("env: " + str(env)) - cmd = [ - "bash", - model_sh, - "keras2", - "{}", # empty JSON fragment - expid, - gParameters["run_id"] - ] print("cmd: " + str(cmd)) - with open("model1.log", "w") as model1_log: - subprocess.run(cmd, - env=env, - stdout=model1_log, - stderr=subprocess.STDOUT) + results = {} + for i in [ 1, 2 ]: + result = + model_name = gParameters["model%i" % i] + env = make_env(str(workflows), model_name) + with open(run_dir + "/start-%i.log" % i, "w") as fp: + subprocess.run(cmd, env=env, + stdout=start_log, + stderr=subprocess.STDOUT) + run_dir = Path(os.getenv("CANDLE_DATA_DIR")) \ + / model_name / "Output" / expid / runid + with open(run_dir / "result.txt") as fp: + line = fp.readline() + results[i] = int(line) + print("cmp: result %i: %f" % (i, results[i])) print("Comparator DONE.") +def make_env(workflows, model_name): + env = { "WORKFLOWS_ROOT": workflows, + "TURBINE_OUTPUT": output_dir, + "EXPID": expid, + "SITE": 
"lambda", + "OBJ_RETURN": "loss", + "BENCHMARK_TIMEOUT": "120", + "MODEL_NAME": model_name, + "CANDLE_MODEL_TYPE": "SINGULARITY", + "CANDLE_DATA_DIR": os.getenv("CANDLE_DATA_DIR"), + "ADLB_RANK_OFFSET": "0", + "CANDLE_IMAGE": "/software/improve/images/GraphDRP.sif" + } + return env + + +def make_cmd(workflows, expid, runid): + model_sh = workflows / "common" / "sh" / "model.sh" + + cmd = [ "bash", model_sh, + "keras2", "{}", # empty JSON fragment + expid, + runid ] + + def main(): gParameters = initialize_parameters() run(gParameters) diff --git a/models/Comparator/cmp_default_model.txt b/models/Comparator/cmp_default_model.txt index 7ee6baff..dca137c6 100644 --- a/models/Comparator/cmp_default_model.txt +++ b/models/Comparator/cmp_default_model.txt @@ -3,4 +3,4 @@ model_name = 'cmp' model1 = 'graphdrp' -model2 = 'graphdrp' +model2 = 'graphdrp' # 'graphdrp2' From 9fc53e0f839d84af6c10d270d28acd6694991cc8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Feb 2023 14:14:50 -0600 Subject: [PATCH 417/601] Basic Crusher tests --- scratch/crusher/README.adoc | 10 ++++++++++ scratch/crusher/hello.swift | 2 ++ scratch/crusher/py-tf.swift | 18 ++++++++++++++++++ scratch/crusher/py0.swift | 7 +++++++ scratch/crusher/test.sh | 31 +++++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+) create mode 100644 scratch/crusher/README.adoc create mode 100644 scratch/crusher/hello.swift create mode 100644 scratch/crusher/py-tf.swift create mode 100644 scratch/crusher/py0.swift create mode 100755 scratch/crusher/test.sh diff --git a/scratch/crusher/README.adoc b/scratch/crusher/README.adoc new file mode 100644 index 00000000..c38de0b1 --- /dev/null +++ b/scratch/crusher/README.adoc @@ -0,0 +1,10 @@ + +Run with + +---- +$ ./test.sh hello.swift +---- + +Wait for job to complete, then check `turbine_output/output.txt` + +Logs are in `turbine_output/` diff --git a/scratch/crusher/hello.swift b/scratch/crusher/hello.swift new file mode 100644 index 00000000..79ecd0d2 --- /dev/null +++ b/scratch/crusher/hello.swift @@ -0,0 +1,2 @@ +import io; +printf("HELLO"); diff --git a/scratch/crusher/py-tf.swift b/scratch/crusher/py-tf.swift new file mode 100644 index 00000000..87ce81e7 --- /dev/null +++ b/scratch/crusher/py-tf.swift @@ -0,0 +1,18 @@ + +import io; +import python; + +r = python(---- +import sys, traceback +try: + sys.argv = ['python'] + import torch +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + sys.stdout.write('EXCEPTION in Python code: \\n' + repr(e) + ' ... \\n' + ''.join(s)) + sys.stdout.write('\\n') + sys.stdout.flush() +----, + "repr(torch.__version__)"); // +printf("PyTorch version: %s", r); diff --git a/scratch/crusher/py0.swift b/scratch/crusher/py0.swift new file mode 100644 index 00000000..4b262431 --- /dev/null +++ b/scratch/crusher/py0.swift @@ -0,0 +1,7 @@ + +import io; +import python; + +i = python("print(\"python works\")", + "repr(2+2)"); +printf("result of 2+2='%s'", i); diff --git a/scratch/crusher/test.sh b/scratch/crusher/test.sh new file mode 100755 index 00000000..e28c61fe --- /dev/null +++ b/scratch/crusher/test.sh @@ -0,0 +1,31 @@ +#!/bin/bash -l +set -eu + +if (( ${#} != 1 )) +then + echo "Provide the workflow!" 
+ exit 1 +fi + +WORKFLOW=$1 + +MED106=/gpfs/alpine/world-shared/med106 +SWIFT=/gpfs/alpine/world-shared/med106/gounley1/crusher2/swift-t-install + +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +PY=/gpfs/alpine/world-shared/med106/gounley1/crusher2/conda520 + +which swift-t + +export PROJECT=MED106_crusher +export QUEUE=batch +export WALLTIME=00:05:00 +export PROCS=2 +export PPN=2 + +export TURBINE_LAUNCHER=srun + +set -x +swift-t -m slurm -n $PROCS -e PYTHONHOME=$PY $WORKFLOW From 5598477e2e3e5c9506221ff479f2205e02e809d5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Feb 2023 14:15:14 -0600 Subject: [PATCH 418/601] Settings for Crusher --- workflows/common/sh/env-crusher.sh | 30 ++++++++++++++++++++++++++++ workflows/common/sh/sched-crusher.sh | 10 ++++++++++ 2 files changed, 40 insertions(+) create mode 100644 workflows/common/sh/env-crusher.sh create mode 100644 workflows/common/sh/sched-crusher.sh diff --git a/workflows/common/sh/env-crusher.sh b/workflows/common/sh/env-crusher.sh new file mode 100644 index 00000000..927693dc --- /dev/null +++ b/workflows/common/sh/env-crusher.sh @@ -0,0 +1,30 @@ + +# ENV Crusher + +# SWIFT_IMPL=echo +SWIFT_IMPL=py + +# CANDLE software installation root: +MED106=/gpfs/alpine/world-shared/med106 + +# Gounley installation: +ROOT=$MED106/gounley1/crusher2 +SWIFT=$ROOT/swift-t-install + +# Wozniak installation: +# ROOT=$MED106/sw/crusher/gcc-11.2.0 +# SWIFT=$ROOT/swift-t/2022-08-10 + +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# Set up Python: +PY=/gpfs/alpine/med106/world-shared/gounley1/crusher2/conda520tf +export PYTHONHOME=$PY + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Dummy setting: EQ/R is not installed on Spock yet +EQR=not-installed diff --git a/workflows/common/sh/sched-crusher.sh b/workflows/common/sh/sched-crusher.sh new file mode 100644 index 00000000..02e933df --- /dev/null +++ b/workflows/common/sh/sched-crusher.sh @@ -0,0 +1,10 @@ + +# SCHED Crusher + +# Tell Swift/T to use SLURM: +MACHINE="-m slurm" +export TURBINE_LAUNCHER=srun + +# Default CANDLE account settings for Spock: +export PROJECT=${PROJECT:-MED106_crusher} +export QUEUE=${QUEUE:-batch} From d958ae405f662872483ca04484ddbe660ac5e6a8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 14 Feb 2023 14:15:50 -0600 Subject: [PATCH 419/601] New utility scripts --- workflows/cp-leaveout/scripts/cp-subtree.sh | 51 +++++++++++++++++++++ workflows/cp-leaveout/scripts/mk-log-tar.sh | 28 +++++++++++ workflows/cp-leaveout/scripts/report-lrs.sh | 25 ++++++++++ 3 files changed, 104 insertions(+) create mode 100755 workflows/cp-leaveout/scripts/cp-subtree.sh create mode 100755 workflows/cp-leaveout/scripts/mk-log-tar.sh create mode 100755 workflows/cp-leaveout/scripts/report-lrs.sh diff --git a/workflows/cp-leaveout/scripts/cp-subtree.sh b/workflows/cp-leaveout/scripts/cp-subtree.sh new file mode 100755 index 00000000..364d821d --- /dev/null +++ b/workflows/cp-leaveout/scripts/cp-subtree.sh @@ -0,0 +1,51 @@ +#!/bin/zsh -f +set -eu + +# CP SUBTREE SH +# Make a subset of the existing experiment tree +# Selects N leaf nodes at stage STAGE +# Copies those leaf nodes and their parents into output directory OUT + +THIS=$( realpath $( dirname $0 ) ) + +SUPERVISOR=$( realpath $THIS/../../.. 
) +alias shopt=: +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide DIR OUT STAGE N" \ + DIR OUT STAGE N - ${*} + +EXP_ID=${DIR:t} + +mkdir -pv $OUT/$EXP_ID/run +OUT=$( realpath $OUT ) + +cd $DIR/run + +# Make pattern for grep on directory names +P=() +PATTERN="" + +# Don't forget stage 0 == "1." +repeat $(( STAGE + 1 )) P+=( . ) +# Join array P with separator . (dot) +PATTERN="^${(j:.:)P}\$" + +# Pull out N random directories that match pattern +NODES=( $( ls | grep "$PATTERN" | shuf -n $N ) ) + +for NODE in $NODES +do + if [[ -d $OUT/$NODE ]] continue + print "copy: $NODE ..." + cp -r $NODE $OUT/$EXP_ID/run + while true + do + # Parent node: chop off last 2 characters + NODE=${NODE[1,-3]} + if (( ${#NODE} == 1 )) break + if [[ -d $OUT/$NODE ]] break + print "copy: $NODE ..." + cp -r $NODE $OUT/$EXP_ID/run + done +done diff --git a/workflows/cp-leaveout/scripts/mk-log-tar.sh b/workflows/cp-leaveout/scripts/mk-log-tar.sh new file mode 100755 index 00000000..46800f44 --- /dev/null +++ b/workflows/cp-leaveout/scripts/mk-log-tar.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +# MK LOG TAR SH +# Make a tarball with the important logs but not the big datasets + +THIS=$( realpath $( dirname $0 ) ) + +SUPERVISOR=$( realpath $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +cd $DIR + +echo "find in $PWD ..." + +FILES=( $( find . -name python.log -or -name predicted.tsv ) ) + +echo "found ${#FILES[@]} files." +echo "running tar ..." + +TGZ=logs.tgz # PWD==DIR +time nice -n 19 tar cfz $TGZ ${FILES[@]} + +echo "created:" +ls -lh $TGZ diff --git a/workflows/cp-leaveout/scripts/report-lrs.sh b/workflows/cp-leaveout/scripts/report-lrs.sh new file mode 100755 index 00000000..964b25e3 --- /dev/null +++ b/workflows/cp-leaveout/scripts/report-lrs.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# REPORT LRS SH +# Report learning rates by Node + +THIS=$( realpath $( dirname $0 ) ) +CPLO=$( realpath $THIS/.. ) +SUPERVISOR=$( realpath $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! 
[[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/report_lrs.py $DIR From 5f4da72581004052473dfa6d952dee7f5cfd8d1e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 15 Feb 2023 13:13:28 -0600 Subject: [PATCH 420/601] Add usage note --- workflows/cp-leaveout/swift/workflow.swift | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 3d0b1da9..2bdefec0 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -9,6 +9,7 @@ Flags: -N : Number of nodes per stage (see default in code) -S : Number of stages (see default in code) + -E : Number of epochs (see default in Benchmark) -P : Early stopping patience (see default in code) -r : Use RunType.RESTART, default is RunType.RUN_ALL RUN_ALL means this is a fresh run with no prior results From 61105c70e65e00ffd1c5ed0fb0c5fef1819e6d98 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 15 Feb 2023 13:13:57 -0600 Subject: [PATCH 421/601] Check for python --- scripts/shrink-logs.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/shrink-logs.sh b/scripts/shrink-logs.sh index 440434b8..40846f5a 100755 --- a/scripts/shrink-logs.sh +++ b/scripts/shrink-logs.sh @@ -16,6 +16,12 @@ source $SUPERVISOR/workflows/common/sh/utils.sh SIGNATURE -H "Provide an output DIR (e.g., .../experiments/X042/out)!" \ DIR - ${*} +if ! which python 2>&1 > /dev/null +then + echo "shrink-logs.sh: Add python to PATH!" + exit 1 +fi + export PYTHONPATH+=:$SUPERVISOR/workflows/common/python if ! [[ -d $DIR ]] From a25a41e4376c928ca53c1044a78fab96043c1209 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 15 Feb 2023 13:19:15 -0600 Subject: [PATCH 422/601] Fix output file location --- workflows/cp-leaveout/scripts/extract-node-info.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.sh b/workflows/cp-leaveout/scripts/extract-node-info.sh index 95661a08..afd7c06f 100755 --- a/workflows/cp-leaveout/scripts/extract-node-info.sh +++ b/workflows/cp-leaveout/scripts/extract-node-info.sh @@ -29,7 +29,7 @@ RESTARTS=( $DIR/restarts/* ) for RESTART in ${RESTARTS[@]} do - $SUPERVISOR/scripts/shrink-logs.sh $RESTART + $SUPERVISOR/scripts/shrink-logs.sh $RESTART/out done $SUPERVISOR/scripts/shrink-logs.sh $DIR From dcccde769f9dacca05b0000520605709b2b24090 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 16 Feb 2023 16:06:27 -0600 Subject: [PATCH 423/601] Support node selection in print-node-info --- .../cp-leaveout/scripts/print-node-info.py | 37 ++++++++++++++----- .../cp-leaveout/scripts/print-node-info.sh | 7 +--- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/workflows/cp-leaveout/scripts/print-node-info.py b/workflows/cp-leaveout/scripts/print-node-info.py index 68f0ab02..504307b4 100644 --- a/workflows/cp-leaveout/scripts/print-node-info.py +++ b/workflows/cp-leaveout/scripts/print-node-info.py @@ -10,6 +10,8 @@ parser = argparse.ArgumentParser(description="Print Node info stats") parser.add_argument("directory", help="The experiment directory (EXPID)") +parser.add_argument("nodes", default="", nargs="*", + help="Nodes to print (optional, defaults to all)") args = parser.parse_args() @@ -21,17 +23,34 @@ except IOError as e: fail(e, os.EX_IOERR, "Could not read: " + node_pkl) + # Raw data printing: +# print(str(args)) # print(len(data)) 
# print(data) -# Print the node info! -count = 0 -earlies = 0 -for node in data.values(): - print(node.str_table()) - count += 1 - if node.stopped_early: - earlies += 1 -print("print-node-info: %i/%i runs stopped early." % (count, earlies)) +def print_all(data): + # Print the node info! + print("print_all") + count = 0 + earlies = 0 + for node in data.values(): + print(node.str_table()) + count += 1 + if node.stopped_early: + earlies += 1 + print("print-node-info: %i/%i runs stopped early." % + (earlies, count)) + + +def print_selected(data, nodes): + for node_id in nodes: + node = data[node_id] + print(node.str_table()) + + +if args.nodes == "": + print_all(data) +else: + print_selected(data, args.nodes) diff --git a/workflows/cp-leaveout/scripts/print-node-info.sh b/workflows/cp-leaveout/scripts/print-node-info.sh index db36d50c..45c38fec 100755 --- a/workflows/cp-leaveout/scripts/print-node-info.sh +++ b/workflows/cp-leaveout/scripts/print-node-info.sh @@ -8,14 +8,9 @@ set -eu # See Node.str_table() for the output format THIS=$( readlink --canonicalize $( dirname $0 ) ) - SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) -source $SUPERVISOR/workflows/common/sh/utils.sh - -SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ - DIR - ${*} export PYTHONPATH+=:$SUPERVISOR/workflows/common/python set -x -python3 -u $THIS/print-node-info.py $DIR +python3 -u $THIS/print-node-info.py ${*} From 3317309fbc5375b46bb9bfd602a4b5bfa71bc60d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 16 Feb 2023 16:07:29 -0600 Subject: [PATCH 424/601] Document node selection --- workflows/cp-leaveout/scripts/README.adoc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/README.adoc b/workflows/cp-leaveout/scripts/README.adoc index 40a586f9..a9fb34ab 100644 --- a/workflows/cp-leaveout/scripts/README.adoc +++ b/workflows/cp-leaveout/scripts/README.adoc @@ -37,7 +37,7 @@ This makes the Python Pickle containing the Node data. See Node.py . This avoids needing to walk all logs all the time (which takes tens of seconds). ---- -$ scripts/extract-node-info.sh $D +$ scripts/extract-node-info.sh $D [nodes...] ---- The data structure in the Pickle is a simple dictionary mapping node ID strings e.g. "1.2.3.4" to object of type Node. @@ -60,6 +60,8 @@ $ scripts/print-node-info.sh $D ... ---- +If specific node IDs are given on the command line, only those records are printed. 
+ ==== Find loss increases (find-loss-increases) Brettin email 2019-12-18: From 8164bff654ef547f1f093a0d40cf79a9fc2b5e07 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 16 Feb 2023 16:22:09 -0600 Subject: [PATCH 425/601] Handle user error --- workflows/cp-leaveout/scripts/print-node-info.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/print-node-info.py b/workflows/cp-leaveout/scripts/print-node-info.py index 504307b4..c2845e7b 100644 --- a/workflows/cp-leaveout/scripts/print-node-info.py +++ b/workflows/cp-leaveout/scripts/print-node-info.py @@ -46,7 +46,11 @@ def print_all(data): def print_selected(data, nodes): for node_id in nodes: - node = data[node_id] + try: + node = data[node_id] + except KeyError: + print("Could not find node: '%s'" % node_id) + exit(1) print(node.str_table()) From 2b0f4dd52188f0bc419a3245bfb4b7b46eb76e57 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 17 Feb 2023 11:30:02 -0600 Subject: [PATCH 426/601] Fix comment --- workflows/cp-leaveout/swift/workflow.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 2bdefec0..f77117ca 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -4,8 +4,8 @@ Simply run with: 'swift-t workflow.swift ' Or specify the N, S values: - 'swift-t workflow.swift -N=6 -S=6 ' - for 55,986 tasks. + 'swift-t workflow.swift -N=4 -S=6 ' + for ### tasks. Flags: -N : Number of nodes per stage (see default in code) -S : Number of stages (see default in code) From d592047dfd6d76b02a6b8a221bf20ebc1381e94a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 17 Feb 2023 11:30:17 -0600 Subject: [PATCH 427/601] Quick restart example --- workflows/cp-leaveout/swift/workflow.swift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index f77117ca..12634085 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -26,6 +26,9 @@ code that will be pushed into Swift/T for conciseness... 
NOTE: On Summit, you have to use sys.stdout.flush() after Python output on stdout + + RESTART EXAMPLE: + test/test-512.sh summit EXP003 flat -r -N=4 -S=6 -E=5 -P=5 */ import assert; From ba7f98b2e357ee44b06e1a4f3895aca241f79b47 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 17 Feb 2023 11:30:32 -0600 Subject: [PATCH 428/601] WS --- workflows/cp-leaveout/swift/workflow.swift | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 12634085..16a7a763 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -100,7 +100,6 @@ global const string FRAMEWORK = "keras"; run_stage(int N, int S, string this, int stage, void block, string plan_id, string db_file, string runtype) { - // printf("stage: %i this: %s", stage, this); // Run the model void parent = run_single(this, stage, block, plan_id); From d50f77f6fe9e4c5eff9b47a25dd18b043a0be15b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 17 Feb 2023 11:46:34 -0600 Subject: [PATCH 429/601] Zero-pad node IDs --- workflows/cp-leaveout/swift/workflow.swift | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 16a7a763..362e48b6 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -109,7 +109,10 @@ run_stage(int N, int S, string this, int stage, void block, // Recurse to the child stages foreach id_child in [1:N] { - run_stage(N, S, this+"."+id_child, stage+1, parent, + run_stage(N, S, + // We want padded node IDs like "1.01.03" , "1.10.16" + "%s.%02i" % (this, id_child), + stage+1, parent, plan_id, db_file, runtype); } } @@ -154,7 +157,7 @@ run_stage(int N, int S, string this, int stage, void block, printf("run_single(): stop_subplan result: '%s'", result2); v = propagate(obj_result); } - else + else // result1 != 0 { printf("run_single(): plan node already marked complete: " + "%s result=%s", node, result1) => From 097cc9292c7bdcb64ac3ffc6ca44b89fbad1c41b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 17 Feb 2023 11:46:56 -0600 Subject: [PATCH 430/601] Remove debug output --- workflows/cp-leaveout/scripts/Node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index b7eb8054..0769c833 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -150,7 +150,7 @@ def parse_epochs(self, line, logger=None): def parse_load_initial(self, line, logger=None): tokens = line.split() self.load_initial = float(tokens[4]) - print("load_initial: " + str(self.load_initial)) + # print("load_initial: " + str(self.load_initial)) def parse_epoch_status(self, line, logger=None): tokens = line.split() From f91c08f2aa10f6b22dca8e8b7ce75d84bd346e0e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 20 Feb 2023 14:40:58 -0600 Subject: [PATCH 431/601] Fix parent weights location --- workflows/cp-leaveout/swift/workflow.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 362e48b6..d92d5e52 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -190,7 +190,7 @@ run_stage(int N, int S, string this, int stage, void block, if (stage > 1) { n = 
strlen(this); - parent = substring(this, 0, n-2); + parent = substring(this, 0, n-3); result = json_fragment + ---- , "initial_weights": "../%s/save/model.h5" From d81a7c5175dccbcb2b18e449d37ec7b995c754de Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 21 Feb 2023 13:46:36 -0600 Subject: [PATCH 432/601] Drop obj_prio() --- workflows/common/swift/obj_app.swift | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index 6c11650f..2adb4788 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/obj_app.swift @@ -32,29 +32,6 @@ printf("obj_app: result(%s): '%s'", runid, obj_result); } -// /** -// The main objective function used by the CANDLE/Supervisor -// model exploration (optimization) loop. -// params : The JSON string of params to be passed to the Benchmark -// run_id : A string run ID that will be the output directory name -// */ -// (string obj_result) obj_prio(string params, -// string run_id, int prio) { -// string model_sh = getenv("MODEL_SH"); -// string turbine_output = getenv("TURBINE_OUTPUT"); - -// // printf("running model shell script in: %s", outdir); -// // We do not use a file type here because this file may not be created, -// // which is handled by get_results() -// string outdir = "%s/run/%s" % (turbine_output, run_id); -// string result_file = outdir/"result.txt"; -// wait (@prio=prio run_model(model_sh, params, expidrun_id)) -// { -// obj_result = get_results(result_file); -// } -// printf("result(%s): %s", run_id, obj_result); -// } - /** Swift/T app function that runs the Benchmark */ From f0ffa6b1b37f20906f4ffaa8264511266eb8443a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 21 Feb 2023 14:55:25 -0600 Subject: [PATCH 433/601] Log the result from model.sh --- workflows/common/sh/model.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index ada0c03e..8cb09f39 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -173,6 +173,7 @@ else # Get results from model.log: last occurrence of "loss: xxx" RESULT=$(awk -v FS="loss:" 'NF>1{print $2}' model.log | tail -1) + log "RESULT: $RESULT" echo $RESULT > $INSTANCE_DIRECTORY/result.txt fi From 8f265c80488b126d4dcf1c2bdc2e7df8aaf8a4ef Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 21 Feb 2023 14:56:00 -0600 Subject: [PATCH 434/601] Fix call signature between obj_app and obj_py --- workflows/common/swift/obj_py.swift | 6 ++++-- workflows/cp-leaveout/swift/workflow.swift | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/workflows/common/swift/obj_py.swift b/workflows/common/swift/obj_py.swift index 61791f20..177c807e 100644 --- a/workflows/common/swift/obj_py.swift +++ b/workflows/common/swift/obj_py.swift @@ -45,10 +45,12 @@ except Exception as e: obj_result = 'EXCEPTION' ----; -(string obj_result) obj(string params, string iter_indiv_id) { +(string obj_result) obj(string params, + string expid, + string runid) { string outdir = "%s/run/%s" % (turbine_output, iter_indiv_id); string code = code_template % (outdir, params, model_name, - exp_id, iter_indiv_id, benchmark_timeout); + expid, iter_indiv_id, benchmark_timeout); obj_result = python_persist(code, "str(obj_result)"); printf("obj_py:obj(): obj_result: '%s'", obj_result); } diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index d92d5e52..5e096cf0 100644 --- 
a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -138,7 +138,7 @@ run_stage(int N, int S, string this, int stage, void block, if (result1 == "0") { // Run the model - obj_result = obj(json, node); + obj_result = obj(json, exp_id, node); printf("run_single(): completed: node: '%s' result: '%s'", node, obj_result); // Update the DB to complete the model run From 0cce568b33f241dac163cb92ceff944058566b3b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 21 Feb 2023 14:56:15 -0600 Subject: [PATCH 435/601] Fix return if NVM is not enabled --- workflows/cp-leaveout/py/data_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index c77589d4..20e0e8e2 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -31,7 +31,7 @@ def setup_nvm(params): # nvme_enabled = True print("NVMe: %r" % nvme_enabled) if not nvme_enabled: - return + return params # copy original datafrom to NVMe disk space try: src = Path(params["dataframe_from"]) From e5b213335e449478139bdeeb7d2ace440b78f755 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 21 Feb 2023 14:56:27 -0600 Subject: [PATCH 436/601] Fix out-*.txt if not on Summit --- workflows/cp-leaveout/swift/workflow.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index b9b4e864..f14b60d1 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -193,7 +193,12 @@ else fi # TURBINE_STDOUT="" -export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +if [[ $SITE == "Summit" ]] +then + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +else + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" +fi mkdir -pv $TURBINE_OUTPUT/out # set -x From 54d31e5f8b64638f6db73da08686ef40bb63f29a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 21 Feb 2023 14:57:08 -0600 Subject: [PATCH 437/601] Better error handling --- workflows/cp-leaveout/swift/workflow.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 5e096cf0..18af3e5f 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -151,8 +151,8 @@ run_stage(int N, int S, string this, int stage, void block, { result2 = "RETRY"; } - assert(obj_result != "EXCEPTION" && obj_result != "", - "Exception in obj()!"); + assert(obj_result != "", "Error in obj(): result is empty!"); + assert(obj_result != "EXCEPTION", "Exception in obj()!"); assert(result2 != "EXCEPTION", "Exception in plangen_stop()!"); printf("run_single(): stop_subplan result: '%s'", result2); v = propagate(obj_result); From 33203496d63b2f8a8d8617a0c6ab5c64219ba5f4 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 21 Feb 2023 14:57:25 -0600 Subject: [PATCH 438/601] Simple test for Lambda --- workflows/cp-leaveout/test/test-lambda-1.sh | 83 +++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100755 workflows/cp-leaveout/test/test-lambda-1.sh diff --git a/workflows/cp-leaveout/test/test-lambda-1.sh b/workflows/cp-leaveout/test/test-lambda-1.sh new file mode 100755 index 00000000..371f025d --- /dev/null +++ b/workflows/cp-leaveout/test/test-lambda-1.sh @@ -0,0 +1,83 @@ +#!/bin/bash +set -eu + +# CP LEAVEOUT TEST LAMBDA 1 + 
+SCRIPT=$( basename $0 .sh ) + +usage() +{ + echo "Usage: $0 SITE EXPID WORKFLOW_ARGS" +} + +if (( ${#} < 2 )) +then + usage + exit 1 +fi + +SITE=$1 +RUN_DIR=$2 +shift 2 +WORKFLOW_ARGS=$* + +SCRIPT=$( basename $0 .sh ) + +export MODEL_NAME=uno # nt3 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-512.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# # Data files + +# Data files +CANDLE_DATA=$HOME/CANDLE_DATA_DIR/ChallengeProblem/top21_2020Jul +PLAN_JSON=$CANDLE_DATA/plangen_cell703-p4_drug1492-p1.json # NEW 2022-07 +# DATAFRAME_CSV=$CANDLE_DATA/topN.uno.h5 +# DATAFRAME_CSV=$CANDLE_DATA/top21.h5 # 2022-07 +# DATAFRAME_CSV=$CANDLE_DATA/top21-cleaned-dd.h5 # NEW 2022-10 +DATAFRAME_CSV=$CANDLE_DATA/top21-cleaned.h5 +# DATAFRAME_CSV=$CANDLE_DATA/top21_uno_v2.h5 +BENCHMARK_DATA=$CANDLE_DATA + +# What to return from the objective function (Keras model) +# val_loss (default), loss, and val_corr are supported +# export OBJ_RETURN="val_loss" +export OBJ_RETURN="loss" + +for f in $DATAFRAME_CSV $PLAN_JSON +do + if ! [[ -f $f ]] + then + echo "$0: does not exist: $f" + exit 1 + fi +done + +if [[ ! -e $BENCHMARK_DATA/cache ]] +then + echo "$0: The cache does not exist: $BENCHMARK_DATA/cache" + echo "$0: Use mkdir to create this directory" + exit 1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME $WORKFLOW_ARGS \ + --plan_json=$PLAN_JSON \ + --dataframe_csv=$DATAFRAME_CSV \ + --benchmark_data=$BENCHMARK_DATA + +echo "$SCRIPT: OK" + +# Local Variables: +# c-basic-offset: 2; +# End: From 8fc7ca69275ff8ad6cefafc11f7d3040214ddc59 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 22 Feb 2023 11:47:36 -0600 Subject: [PATCH 439/601] Draft settings for Frontier --- workflows/common/sh/env-frontier.sh | 39 +++++++++++++++++++++++ workflows/common/sh/langs-app-frontier.sh | 12 +++++++ workflows/common/sh/sched-frontier.sh | 10 ++++++ 3 files changed, 61 insertions(+) create mode 100644 workflows/common/sh/env-frontier.sh create mode 100644 workflows/common/sh/langs-app-frontier.sh create mode 100644 workflows/common/sh/sched-frontier.sh diff --git a/workflows/common/sh/env-frontier.sh b/workflows/common/sh/env-frontier.sh new file mode 100644 index 00000000..8f3b5d75 --- /dev/null +++ b/workflows/common/sh/env-frontier.sh @@ -0,0 +1,39 @@ + +# ENV Frontier + +# SWIFT_IMPL=echo +SWIFT_IMPL=py + +# From Wozniak +# MED106=/gpfs/alpine/world-shared/med106 +# ROOT=$MED106/sw/summit/gcc-7.5.0 +ROOT=/autofs/nccs-svm1_home1/wozniak/Public/sfw/frontier +# SWIFT=$ROOT/swift-t/2022-07-25 # Works +SWIFT=$ROOT/swift-t/2023-02-22 + +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# R=$ROOT/R/4.1.3/lib64/R +# LD_LIBRARY_PATH+=:$R/lib + +# PY=/gpfs/alpine/world-shared/med106/sw/conda/2021-10-06/envs/CANDLE-2021-10-06 +# PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0 +# PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37/ +# # PY=/gpfs/alpine/world-shared/med106/sw/conda/m-39-2022-09-15 +# LD_LIBRARY_PATH+=:$PY/lib +# export PYTHONHOME=$PY +PATH=$PY/bin:$PATH + +# /gpfs/alpine/world-shared/med106/sw/condaenv-200408 +# export LD_LIBRARY_PATH=$PY/lib:$LD_LIBRARY_PATH + +# EMEWS Queues for R +# EQR=$ROOT/EQ-R + +# EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py + +# For 
test output processing: +LOCAL=0 +CRAY=1 diff --git a/workflows/common/sh/langs-app-frontier.sh b/workflows/common/sh/langs-app-frontier.sh new file mode 100644 index 00000000..372ca1a9 --- /dev/null +++ b/workflows/common/sh/langs-app-frontier.sh @@ -0,0 +1,12 @@ + +# LANGS APP FRONTIER SH + +# Allow for user PYTHONPATH additions: +APP_PYTHONPATH=${APP_PYTHONPATH:-} + +# Overwrite anything else set by the system or Swift/T environment: +export PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_frontier +export LD_LIBRARY_PATH=$PY/lib +export PYTHONHOME=$PY +export PATH=$PYTHONHOME/bin:$PATH +export PYTHONPATH=$PYTHONHOME/lib/python3.9:$PYTHONHOME/lib/python3.9/site-packages:$APP_PYTHONPATH diff --git a/workflows/common/sh/sched-frontier.sh b/workflows/common/sh/sched-frontier.sh new file mode 100644 index 00000000..badff979 --- /dev/null +++ b/workflows/common/sh/sched-frontier.sh @@ -0,0 +1,10 @@ + +# SCHED Frontier + +# Scheduler settings for Swift/T/SLURM/Frontier + +MACHINE="-m slurm" + +# Default PROJECT for CANDLE +#export QUEUE=${QUEUE:-batch} +export PROJECT=${PROJECT:-MED106} From 5b4b26909b35a58911480fb4832908b9cbc01f22 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 22 Feb 2023 11:57:49 -0600 Subject: [PATCH 440/601] Drop reference to EQR here --- workflows/upf/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 18a88d6f..0bc7f938 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -89,7 +89,7 @@ which swift-t swift-t -n $PROCS \ -o $TURBINE_OUTPUT/workflow.tic \ ${MACHINE:-} \ - -p -I $EQR -r $EQR \ + -p \ -I $WORKFLOWS_ROOT/common/swift \ -i obj_$SWIFT_IMPL \ -e BENCHMARKS_ROOT \ From 2d380a8029ef5dbd65f826624a0024cced241a65 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 22 Feb 2023 11:58:30 -0600 Subject: [PATCH 441/601] Fix bad merge --- workflows/upf/swift/workflow.swift | 4 ---- 1 file changed, 4 deletions(-) diff --git a/workflows/upf/swift/workflow.swift b/workflows/upf/swift/workflow.swift index a9c79060..09b36ef9 100644 --- a/workflows/upf/swift/workflow.swift +++ b/workflows/upf/swift/workflow.swift @@ -17,11 +17,7 @@ report_env(); string FRAMEWORK = "keras"; // Scan command line -<<<<<<< Updated upstream file upf = input(argv("f")); -======= -file upf = input(argv("f")); ->>>>>>> Stashed changes int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); string model_name = getenv("MODEL_NAME"); From 7f1aa8f0ef38aa541739898add28adb9f5a6c330 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 22 Feb 2023 12:00:33 -0600 Subject: [PATCH 442/601] Add PY for Frontier --- workflows/common/sh/env-frontier.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/common/sh/env-frontier.sh b/workflows/common/sh/env-frontier.sh index 8f3b5d75..3072dd59 100644 --- a/workflows/common/sh/env-frontier.sh +++ b/workflows/common/sh/env-frontier.sh @@ -18,6 +18,7 @@ PATH=$SWIFT/turbine/bin:$PATH # R=$ROOT/R/4.1.3/lib64/R # LD_LIBRARY_PATH+=:$R/lib +PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_frontier # PY=/gpfs/alpine/world-shared/med106/sw/conda/2021-10-06/envs/CANDLE-2021-10-06 # PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0 # PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37/ From 793e0cc944f2a18059cfe90eb70f061f468453dd Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 22 Feb 2023 12:00:47 -0600 Subject: [PATCH 443/601] Fix obj_py() for iter_indiv_id --- 
workflows/common/swift/obj_py.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/common/swift/obj_py.swift b/workflows/common/swift/obj_py.swift index 177c807e..3229350a 100644 --- a/workflows/common/swift/obj_py.swift +++ b/workflows/common/swift/obj_py.swift @@ -48,9 +48,9 @@ except Exception as e: (string obj_result) obj(string params, string expid, string runid) { - string outdir = "%s/run/%s" % (turbine_output, iter_indiv_id); + string outdir = "%s/run/%s" % (turbine_output, runid); string code = code_template % (outdir, params, model_name, - expid, iter_indiv_id, benchmark_timeout); + expid, runid, benchmark_timeout); obj_result = python_persist(code, "str(obj_result)"); printf("obj_py:obj(): obj_result: '%s'", obj_result); } From 9cbaf4d180810a783983f7b835693eefaf2ad4da Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 22 Feb 2023 12:00:56 -0600 Subject: [PATCH 444/601] Fix bad merge --- workflows/upf/test/upf-1.txt | 259 ----------------------------------- 1 file changed, 259 deletions(-) diff --git a/workflows/upf/test/upf-1.txt b/workflows/upf/test/upf-1.txt index a055d8c7..919b1d07 100644 --- a/workflows/upf/test/upf-1.txt +++ b/workflows/upf/test/upf-1.txt @@ -1,262 +1,3 @@ -<<<<<<< Updated upstream {"id": "RUN000", "epochs": 1} {"id": "RUN001", "epochs": 2} {"id": "RUN002", "epochs": 3} -======= -{"id": "test1", "epochs": 2} -{"id": "test2", "epochs": 2} -{"id": "test3", "epochs": 2} -{"id": "test4", "epochs": 2} -{"id": "test5", "epochs": 2} -{"id": "test6", "epochs": 2} -{"id": "test7", "epochs": 2} -{"id": "test8", "epochs": 2} -{"id": "test9", "epochs": 2} -{"id": "test11", "epochs": 2} -{"id": "test12", "epochs": 2} -{"id": "test13", "epochs": 2} -{"id": "test14", "epochs": 2} -{"id": "test15", "epochs": 2} -{"id": "test16", "epochs": 2} -{"id": "test17", "epochs": 2} -{"id": "test18", "epochs": 2} -{"id": "test19", "epochs": 2} -{"id": "test31", "epochs": 2} -{"id": "test32", "epochs": 2} -{"id": "test33", "epochs": 2} -{"id": "test34", "epochs": 2} -{"id": "test35", "epochs": 2} -{"id": "test36", "epochs": 2} -{"id": "test37", "epochs": 2} -{"id": "test38", "epochs": 2} -{"id": "test39", "epochs": 2} -{"id": "test111", "epochs": 2} -{"id": "test112", "epochs": 2} -{"id": "test113", "epochs": 2} -{"id": "test114", "epochs": 2} -{"id": "test115", "epochs": 2} -{"id": "test116", "epochs": 2} -{"id": "test117", "epochs": 2} -{"id": "test118", "epochs": 2} -{"id": "test119", "epochs": 2} -{"id": "test21", "epochs": 2} -{"id": "test22", "epochs": 2} -{"id": "test23", "epochs": 2} -{"id": "test24", "epochs": 2} -{"id": "test25", "epochs": 2} -{"id": "test26", "epochs": 2} -{"id": "test27", "epochs": 2} -{"id": "test28", "epochs": 2} -{"id": "test29", "epochs": 2} -{"id": "test211", "epochs": 2} -{"id": "test212", "epochs": 2} -{"id": "test213", "epochs": 2} -{"id": "test214", "epochs": 2} -{"id": "test215", "epochs": 2} -{"id": "test216", "epochs": 2} -{"id": "test217", "epochs": 2} -{"id": "test218", "epochs": 2} -{"id": "test219", "epochs": 2} -{"id": "test51", "epochs": 2} -{"id": "test52", "epochs": 2} -{"id": "test53", "epochs": 2} -{"id": "test54", "epochs": 2} -{"id": "test55", "epochs": 2} -{"id": "test56", "epochs": 2} -{"id": "test57", "epochs": 2} -{"id": "test58", "epochs": 2} -{"id": "test59", "epochs": 2} -{"id": "test511", "epochs": 2} -{"id": "test512", "epochs": 2} -{"id": "test513", "epochs": 2} -{"id": "test514", "epochs": 2} -{"id": "test515", "epochs": 2} -{"id": "test516", "epochs": 2} -{"id": "test517", "epochs": 
2} -{"id": "test518", "epochs": 2} -{"id": "test519", "epochs": 2} -{"id": "test531", "epochs": 2} -{"id": "test532", "epochs": 2} -{"id": "test533", "epochs": 2} -{"id": "test534", "epochs": 2} -{"id": "test535", "epochs": 2} -{"id": "test536", "epochs": 2} -{"id": "test537", "epochs": 2} -{"id": "test538", "epochs": 2} -{"id": "test539", "epochs": 2} -{"id": "test5111", "epochs": 2} -{"id": "test5112", "epochs": 2} -{"id": "test5113", "epochs": 2} -{"id": "test5114", "epochs": 2} -{"id": "test5115", "epochs": 2} -{"id": "test5116", "epochs": 2} -{"id": "test5117", "epochs": 2} -{"id": "test5118", "epochs": 2} -{"id": "test5119", "epochs": 2} -{"id": "test521", "epochs": 2} -{"id": "test522", "epochs": 2} -{"id": "test523", "epochs": 2} -{"id": "test524", "epochs": 2} -{"id": "test525", "epochs": 2} -{"id": "test526", "epochs": 2} -{"id": "test527", "epochs": 2} -{"id": "test528", "epochs": 2} -{"id": "test529", "epochs": 2} -{"id": "test5211", "epochs": 2} -{"id": "test5212", "epochs": 2} -{"id": "test5213", "epochs": 2} -{"id": "test5214", "epochs": 2} -{"id": "test5215", "epochs": 2} -{"id": "test5216", "epochs": 2} -{"id": "test5217", "epochs": 2} -{"id": "test5218", "epochs": 2} -{"id": "test5219", "epochs": 2} -{"id": "test6211", "epochs": 2} -{"id": "test6212", "epochs": 2} -{"id": "test6213", "epochs": 2} -{"id": "test6214", "epochs": 2} -{"id": "test6215", "epochs": 2} -{"id": "test6216", "epochs": 2} -{"id": "test6217", "epochs": 2} -{"id": "test6218", "epochs": 2} -{"id": "test6219", "epochs": 2} -{"id": "test7211", "epochs": 2} -{"id": "test7212", "epochs": 2} -{"id": "test7213", "epochs": 2} -{"id": "test7214", "epochs": 2} -{"id": "test7215", "epochs": 2} -{"id": "test7216", "epochs": 2} -{"id": "test7217", "epochs": 2} -{"id": "test7218", "epochs": 2} -{"id": "test7219", "epochs": 2} -{"id": "test8218", "epochs": 2} -{"id": "test8219", "epochs": 2} -{"id": "test91", "epochs": 2} -{"id": "test92", "epochs": 2} -{"id": "test93", "epochs": 2} -{"id": "test94", "epochs": 2} -{"id": "test95", "epochs": 2} -{"id": "test96", "epochs": 2} -{"id": "test97", "epochs": 2} -{"id": "test98", "epochs": 2} -{"id": "test99", "epochs": 2} -{"id": "test911", "epochs": 2} -{"id": "test912", "epochs": 2} -{"id": "test913", "epochs": 2} -{"id": "test914", "epochs": 2} -{"id": "test915", "epochs": 2} -{"id": "test916", "epochs": 2} -{"id": "test917", "epochs": 2} -{"id": "test918", "epochs": 2} -{"id": "test919", "epochs": 2} -{"id": "test931", "epochs": 2} -{"id": "test932", "epochs": 2} -{"id": "test933", "epochs": 2} -{"id": "test934", "epochs": 2} -{"id": "test935", "epochs": 2} -{"id": "test936", "epochs": 2} -{"id": "test937", "epochs": 2} -{"id": "test938", "epochs": 2} -{"id": "test939", "epochs": 2} -{"id": "test9111", "epochs": 2} -{"id": "test9112", "epochs": 2} -{"id": "test9113", "epochs": 2} -{"id": "test9114", "epochs": 2} -{"id": "test9115", "epochs": 2} -{"id": "test9116", "epochs": 2} -{"id": "test9117", "epochs": 2} -{"id": "test9118", "epochs": 2} -{"id": "test9119", "epochs": 2} -{"id": "test921", "epochs": 2} -{"id": "test922", "epochs": 2} -{"id": "test923", "epochs": 2} -{"id": "test924", "epochs": 2} -{"id": "test925", "epochs": 2} -{"id": "test926", "epochs": 2} -{"id": "test927", "epochs": 2} -{"id": "test928", "epochs": 2} -{"id": "test929", "epochs": 2} -{"id": "test9211", "epochs": 2} -{"id": "test9212", "epochs": 2} -{"id": "test9213", "epochs": 2} -{"id": "test9214", "epochs": 2} -{"id": "test9215", "epochs": 2} -{"id": "test9216", "epochs": 2} -{"id": "test9217", 
"epochs": 2} -{"id": "test9218", "epochs": 2} -{"id": "test9219", "epochs": 2} -{"id": "test951", "epochs": 2} -{"id": "test952", "epochs": 2} -{"id": "test953", "epochs": 2} -{"id": "test954", "epochs": 2} -{"id": "test955", "epochs": 2} -{"id": "test956", "epochs": 2} -{"id": "test957", "epochs": 2} -{"id": "test958", "epochs": 2} -{"id": "test959", "epochs": 2} -{"id": "test9511", "epochs": 2} -{"id": "test9512", "epochs": 2} -{"id": "test9513", "epochs": 2} -{"id": "test9514", "epochs": 2} -{"id": "test9515", "epochs": 2} -{"id": "test9516", "epochs": 2} -{"id": "test9517", "epochs": 2} -{"id": "test9518", "epochs": 2} -{"id": "test9519", "epochs": 2} -{"id": "test9531", "epochs": 2} -{"id": "test9532", "epochs": 2} -{"id": "test9533", "epochs": 2} -{"id": "test9534", "epochs": 2} -{"id": "test9535", "epochs": 2} -{"id": "test9536", "epochs": 2} -{"id": "test9537", "epochs": 2} -{"id": "test9538", "epochs": 2} -{"id": "test9539", "epochs": 2} -{"id": "test95111", "epochs": 2} -{"id": "test95112", "epochs": 2} -{"id": "test95113", "epochs": 2} -{"id": "test95114", "epochs": 2} -{"id": "test95115", "epochs": 2} -{"id": "test95116", "epochs": 2} -{"id": "test95117", "epochs": 2} -{"id": "test95118", "epochs": 2} -{"id": "test95119", "epochs": 2} -{"id": "test9521", "epochs": 2} -{"id": "test9522", "epochs": 2} -{"id": "test9523", "epochs": 2} -{"id": "test9524", "epochs": 2} -{"id": "test9525", "epochs": 2} -{"id": "test9526", "epochs": 2} -{"id": "test9527", "epochs": 2} -{"id": "test9528", "epochs": 2} -{"id": "test9529", "epochs": 2} -{"id": "test95211", "epochs": 2} -{"id": "test95212", "epochs": 2} -{"id": "test95213", "epochs": 2} -{"id": "test95214", "epochs": 2} -{"id": "test95215", "epochs": 2} -{"id": "test95216", "epochs": 2} -{"id": "test95217", "epochs": 2} -{"id": "test95218", "epochs": 2} -{"id": "test95219", "epochs": 2} -{"id": "test96211", "epochs": 2} -{"id": "test96212", "epochs": 2} -{"id": "test96213", "epochs": 2} -{"id": "test96214", "epochs": 2} -{"id": "test96215", "epochs": 2} -{"id": "test96216", "epochs": 2} -{"id": "test96217", "epochs": 2} -{"id": "test96218", "epochs": 2} -{"id": "test96219", "epochs": 2} -{"id": "test97211", "epochs": 2} -{"id": "test97212", "epochs": 2} -{"id": "test97213", "epochs": 2} -{"id": "test97214", "epochs": 2} -{"id": "test97215", "epochs": 2} -{"id": "test97216", "epochs": 2} -{"id": "test97217", "epochs": 2} -{"id": "test97218", "epochs": 2} -{"id": "test97219", "epochs": 2} -{"id": "test98218", "epochs": 2} -{"id": "test98219", "epochs": 2} ->>>>>>> Stashed changes From a1a59b1fbf434c1eef846f4ef323da9fabf9281e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 22 Feb 2023 14:55:40 -0600 Subject: [PATCH 445/601] Turn off UNBUFFERED --- workflows/upf/swift/workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 0bc7f938..3362fc96 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -104,12 +104,13 @@ swift-t -n $PROCS \ -e TURBINE_MPI_THREAD=${TURBINE_MPI_THREAD:-1} \ $( python_envs ) \ -e TURBINE_STDOUT=$TURBINE_STDOUT \ - -e PYTHONUNBUFFERED=1 \ -e CANDLE_MODEL_TYPE \ -e CANDLE_IMAGE \ $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} # Can provide this to debug Python settings: # -e PYTHONVERBOSE=1 +# Can provide this if needed for debugging crashes: +# -e PYTHONUNBUFFERED=1 # Can provide this if needed to reset PATH: # -e PATH=$PATH From a03387dc60243a107a5b2852e7f45dfdbbb67fac 
Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 23 Feb 2023 09:50:36 -0600 Subject: [PATCH 446/601] Fixes for Frontier --- workflows/common/sh/env-frontier.sh | 4 ++-- workflows/common/sh/langs-app-frontier.sh | 3 ++- workflows/common/sh/sched-frontier.sh | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/workflows/common/sh/env-frontier.sh b/workflows/common/sh/env-frontier.sh index 3072dd59..5ddfcaf3 100644 --- a/workflows/common/sh/env-frontier.sh +++ b/workflows/common/sh/env-frontier.sh @@ -9,7 +9,7 @@ SWIFT_IMPL=py # ROOT=$MED106/sw/summit/gcc-7.5.0 ROOT=/autofs/nccs-svm1_home1/wozniak/Public/sfw/frontier # SWIFT=$ROOT/swift-t/2022-07-25 # Works -SWIFT=$ROOT/swift-t/2023-02-22 +SWIFT=$ROOT/swift-t/2023-02-23 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH @@ -18,7 +18,7 @@ PATH=$SWIFT/turbine/bin:$PATH # R=$ROOT/R/4.1.3/lib64/R # LD_LIBRARY_PATH+=:$R/lib -PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_frontier +PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 # PY=/gpfs/alpine/world-shared/med106/sw/conda/2021-10-06/envs/CANDLE-2021-10-06 # PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0 # PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37/ diff --git a/workflows/common/sh/langs-app-frontier.sh b/workflows/common/sh/langs-app-frontier.sh index 372ca1a9..f6e0d867 100644 --- a/workflows/common/sh/langs-app-frontier.sh +++ b/workflows/common/sh/langs-app-frontier.sh @@ -5,7 +5,8 @@ APP_PYTHONPATH=${APP_PYTHONPATH:-} # Overwrite anything else set by the system or Swift/T environment: -export PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_frontier +# export PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_frontier +export PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 export LD_LIBRARY_PATH=$PY/lib export PYTHONHOME=$PY export PATH=$PYTHONHOME/bin:$PATH diff --git a/workflows/common/sh/sched-frontier.sh b/workflows/common/sh/sched-frontier.sh index badff979..e494cfa7 100644 --- a/workflows/common/sh/sched-frontier.sh +++ b/workflows/common/sh/sched-frontier.sh @@ -8,3 +8,5 @@ MACHINE="-m slurm" # Default PROJECT for CANDLE #export QUEUE=${QUEUE:-batch} export PROJECT=${PROJECT:-MED106} + +export TURBINE_PRELAUNCH="source activate /gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10" From 93648e56237dd5a5f3707bba8d6a935723cba7b2 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 23 Feb 2023 09:52:12 -0600 Subject: [PATCH 447/601] Fixes for Frontier --- workflows/cp-leaveout/swift/workflow.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index f14b60d1..deb017ac 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -19,7 +19,7 @@ BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/Uno export BENCHMARK_TIMEOUT export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} -PYTHONPATH=${PYTHONPATH:-}:$BENCHMARK_DIR +export PYTHONPATH=${PYTHONPATH:-}:$BENCHMARK_DIR:$BENCHMARKS_ROOT/Pilot1/Uno SCRIPT_NAME=$(basename $0) @@ -193,7 +193,7 @@ else fi # TURBINE_STDOUT="" -if [[ $SITE == "Summit" ]] +if [[ $SITE == "summit" || $SITE == "frontier" ]] then export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" else @@ -201,6 +201,8 @@ else fi mkdir -pv $TURBINE_OUTPUT/out +LD_LIBRARY_PATH=/opt/cray/libfabric/1.15.2.0/lib64 + # set -x swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ @@ -232,9 +234,6 @@ swift-t -O 0 -n $PROCS \ # | \ # tee $STDOUT -# -# -e HIP_VISIBLE_DEVICES="0,1" \ - 
# -e USER # Needed on Summit to find NVME # -j /usr/bin/java # Give this to Swift/T if needed for Java # -e PYTHONUNBUFFERED=1 # May be needed if error output is being lost From 6662ba3bcdf37fd57d3da206ab4f9c7a2ca95cbc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 23 Feb 2023 09:53:37 -0600 Subject: [PATCH 448/601] Default to 1 hour walltime --- workflows/cp-leaveout/test/cfg-sys-512.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/test/cfg-sys-512.sh b/workflows/cp-leaveout/test/cfg-sys-512.sh index 447383d1..3349cd8b 100644 --- a/workflows/cp-leaveout/test/cfg-sys-512.sh +++ b/workflows/cp-leaveout/test/cfg-sys-512.sh @@ -18,7 +18,7 @@ export PROCS=${PROCS:-6} # Cori has 32 cores per node, 128GB per node export PPN=${PPN:-1} -export WALLTIME=${WALLTIME:-12:00:00} +export WALLTIME=${WALLTIME:-01:00:00} # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} From e751f885e600bb1e2cad0e9b1148c57584c8433e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sat, 25 Feb 2023 13:20:47 -0600 Subject: [PATCH 449/601] Set GPUs --- workflows/common/sh/sched-frontier.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/common/sh/sched-frontier.sh b/workflows/common/sh/sched-frontier.sh index e494cfa7..e6b7f824 100644 --- a/workflows/common/sh/sched-frontier.sh +++ b/workflows/common/sh/sched-frontier.sh @@ -10,3 +10,5 @@ MACHINE="-m slurm" export PROJECT=${PROJECT:-MED106} export TURBINE_PRELAUNCH="source activate /gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10" + +export TURBINE_LAUNCH_OPTIONS="--gpus-per-task=1 --gpus-per-node=$PPN" From 8e2481d970155b8f24ee358548c7c18ffee0c45b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sat, 25 Feb 2023 13:20:59 -0600 Subject: [PATCH 450/601] Add Uno to PYTHONPATH --- workflows/common/sh/set-pythonpath.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/common/sh/set-pythonpath.sh b/workflows/common/sh/set-pythonpath.sh index 23833e3d..fcacdd88 100644 --- a/workflows/common/sh/set-pythonpath.sh +++ b/workflows/common/sh/set-pythonpath.sh @@ -35,6 +35,7 @@ PYTHONPATH+=:$SUPERVISOR/workflows/common/ext/EQ-Py PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/P1B1 PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/Attn1 PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/NT3 +PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/Uno PYTHONPATH+=:$BENCHMARKS_ROOT/examples/ADRP PYTHONPATH+=:$BENCHMARKS_ROOT/examples/xform-smiles From 1212fc2b41ad715578f1cd187bd7909ec70b518c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sat, 25 Feb 2023 13:21:10 -0600 Subject: [PATCH 451/601] PYTHONPATH setting for Frontier --- workflows/common/sh/utils.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 4cdfa462..e9160a4b 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -100,9 +100,10 @@ python_envs() then # We do not currently need this except on MCS and Spock: # Swift/T should grab PYTHONPATH automatically - if [[ ${SITE} == "mcs" ]] || \ - [[ ${SITE} == "spock" ]] || \ - [[ ${SITE} == "lambda" ]] + if [[ ${SITE} == "mcs" ]] || \ + [[ ${SITE} == "spock" ]] || \ + [[ ${SITE} == "lambda" ]] || \ + [[ ${SITE} == "frontier" ]] then # MCS discards PYTHONPATH in subshells RESULT+=( -e PYTHONPATH=$PYTHONPATH ) From d455ecd5de2908cb9854997c96f7c5d3c822cd29 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sat, 25 Feb 2023 13:21:54 -0600 Subject: [PATCH 452/601] Add warning message --- workflows/cp-leaveout/py/plangen.py 
| 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index c5275cb0..66f5990f 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -1175,6 +1175,7 @@ def get_subplan_features(plan_dict, subplan_id, parent_features=False): # acquire feature_set names populated in the plan content, _ = get_subplan(plan_dict, subplan_id) if not content: + print("get_subplan() found no content!") return None, None, None, None # peek inside the training set to capture active feature-set names From b057e4cf4a24c03b8e1af66929b7b4e0966dcf12 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sat, 25 Feb 2023 13:22:02 -0600 Subject: [PATCH 453/601] Use smaller job name --- workflows/cp-leaveout/swift/workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index deb017ac..7b4dd0df 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -70,7 +70,8 @@ APP_PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup APP_PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools APP_PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common # For Benchmarks -export TURBINE_JOBNAME="JOB:${EXPID}" +# Job name limit on Frontier: 8 +export TURBINE_JOBNAME=$EXPID if [ -z ${GPU_STRING+x} ]; then From a63c999105da69aa650b391f3f02bb8e748b4545 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sat, 25 Feb 2023 13:22:16 -0600 Subject: [PATCH 454/601] Set default PROCS, PPN to 8 on Frontier --- workflows/cp-leaveout/test/cfg-sys-512.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/test/cfg-sys-512.sh b/workflows/cp-leaveout/test/cfg-sys-512.sh index 3349cd8b..cb329a19 100644 --- a/workflows/cp-leaveout/test/cfg-sys-512.sh +++ b/workflows/cp-leaveout/test/cfg-sys-512.sh @@ -12,11 +12,11 @@ # 3 92 921 12.0 0 # 4 46 91 6.0 0 # 5 1 45 2.0 -export PROCS=${PROCS:-6} +export PROCS=${PROCS:-8} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} +export PPN=${PPN:-8} export WALLTIME=${WALLTIME:-01:00:00} From e717b6f41df7f6c9fd82f9131f9c3cd91be3aa12 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sat, 25 Feb 2023 13:22:54 -0600 Subject: [PATCH 455/601] Record new PLAN_JSON --- workflows/cp-leaveout/test/test-512.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/test/test-512.sh b/workflows/cp-leaveout/test/test-512.sh index 4e12307f..5c0f2aea 100755 --- a/workflows/cp-leaveout/test/test-512.sh +++ b/workflows/cp-leaveout/test/test-512.sh @@ -52,7 +52,9 @@ CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem/top21_2020Jul # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.hdf5 -PLAN_JSON=$CANDLE_DATA/plangen_cell703-p4_drug1492-p1.json # NEW 2022-07 +PLAN_JSON=$CANDLE_DATA/plangen_cell703-p4_drug1492-p1-u.json # 2022-07 +# PLAN_JSON=$CANDLE_DATA/plangen_CELL2917-p4_DRUG2148-p4.json # 2023-02 +# PLAN_JSON=/gpfs/alpine/med106/proj-shared/brettin/Supervisor/workflows/cp-leaveout/plangen_CELL2917-p4_DRUG2148-p4.json # DATAFRAME_CSV=$CANDLE_DATA/top21.h5 # 2022-07 DATAFRAME_CSV=$CANDLE_DATA/top21-cleaned-dd.h5 # NEW 2022-10 # BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno From 
c02933a772cb4294c159c28bc50e3cc59dffc755 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sat, 25 Feb 2023 13:23:11 -0600 Subject: [PATCH 456/601] Set default walltime to 1h --- workflows/upf/test/cfg-sys-1.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/upf/test/cfg-sys-1.sh b/workflows/upf/test/cfg-sys-1.sh index 821d44a6..16eda96c 100644 --- a/workflows/upf/test/cfg-sys-1.sh +++ b/workflows/upf/test/cfg-sys-1.sh @@ -36,8 +36,8 @@ export PPN=${PPN:-2} # export PROJECT=med106 # export TURBINE_LAUNCH_OPTIONS="-a1 -g6 -c7" -# export WALLTIME=${WALLTIME:-0:30} -# echo WALLTIME: $WALLTIME +export WALLTIME=${WALLTIME:-1:00:00} +echo WALLTIME: $WALLTIME # export MAIL_ENABLED=1 # export MAIL_ADDRESS=wozniak@mcs.anl.gov From d88e64ef2fc873b337081fa6b4f3ad0428488985 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sat, 25 Feb 2023 13:23:38 -0600 Subject: [PATCH 457/601] Check for directory --- workflows/cp-leaveout/test/test-512.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/test/test-512.sh b/workflows/cp-leaveout/test/test-512.sh index 5c0f2aea..80c8f6c4 100755 --- a/workflows/cp-leaveout/test/test-512.sh +++ b/workflows/cp-leaveout/test/test-512.sh @@ -82,7 +82,7 @@ do fi done -if [[ ! -e $BENCHMARK_DATA/cache ]] +if [[ ! -d $BENCHMARK_DATA/cache ]] then echo "$0: The cache does not exist: $BENCHMARK_DATA/cache" echo "$0: Use mkdir to create this directory" From 1f316ac81cc965c80d2a9a09eaae2b091842bcbc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sat, 25 Feb 2023 14:15:46 -0600 Subject: [PATCH 458/601] Fail fast if CANDLE_DATA_DIR is not set --- workflows/cp-leaveout/swift/workflow.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 7b4dd0df..c86c0a8f 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -58,7 +58,11 @@ WORKFLOW_ARGS=$* echo "WORKFLOW.SH: Running model: $MODEL_NAME for EXPID: $EXPID" -set +x +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + echo "workflow.sh: You must set CANDLE_DATA_DIR" + exit 1 +fi source_site env $SITE source_site sched $SITE From 8886f6c4af4da009ac51187db660ad32ba43c5dc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 26 Feb 2023 23:35:36 -0600 Subject: [PATCH 459/601] Handle normal timeouts on Frontier --- workflows/cp-leaveout/scripts/check-run.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/check-run.sh b/workflows/cp-leaveout/scripts/check-run.sh index bbb92e3a..8e6e6895 100755 --- a/workflows/cp-leaveout/scripts/check-run.sh +++ b/workflows/cp-leaveout/scripts/check-run.sh @@ -27,11 +27,17 @@ SUCCESS=0 if grep -q "User defined signal 2" $DIR/output.txt then + # Summit time out + echo "Job timed out normally." + SUCCESS=1 + +elif grep -q "DUE TO TIME LIMIT" $DIR/output.txt +then + # Frontier time out echo "Job timed out normally." SUCCESS=1 -fi -if grep -q "TURBINE: EXIT CODE: 0" $DIR/output.txt +elif grep -q "TURBINE: EXIT CODE: 0" $DIR/output.txt then echo "Job completed normally." 
grep "TURBINE: MPIEXEC TIME: " $DIR/output.txt From 6e363a0e3f4f2b99205a02768dc7df05bc810f30 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 26 Feb 2023 23:35:47 -0600 Subject: [PATCH 460/601] Be more verbose --- workflows/cp-leaveout/db/print-db.sh | 1 + workflows/cp-leaveout/db/print-stats.sh | 2 ++ 2 files changed, 3 insertions(+) diff --git a/workflows/cp-leaveout/db/print-db.sh b/workflows/cp-leaveout/db/print-db.sh index edc00b4f..d3fa8868 100755 --- a/workflows/cp-leaveout/db/print-db.sh +++ b/workflows/cp-leaveout/db/print-db.sh @@ -13,4 +13,5 @@ DB=$1 THIS=$( readlink --canonicalize $( dirname $0 ) ) +echo DB: $DB sqlite3 $DB < $THIS/print-db.sql diff --git a/workflows/cp-leaveout/db/print-stats.sh b/workflows/cp-leaveout/db/print-stats.sh index c470afce..20d8edb1 100755 --- a/workflows/cp-leaveout/db/print-stats.sh +++ b/workflows/cp-leaveout/db/print-stats.sh @@ -17,6 +17,8 @@ then exit 1 fi +echo DB: $DB + COMPLETE=$( sqlite3 $DB < Date: Sun, 26 Feb 2023 23:36:03 -0600 Subject: [PATCH 461/601] Remove generated training data by default --- workflows/cp-leaveout/py/data_setup.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 20e0e8e2..d5644766 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -74,6 +74,7 @@ def pre_run(params): params["plan"], params["use_exported_data"], ) + print("TopN_Args: " + str(args)) data = params["benchmark_data"] try: @@ -141,4 +142,11 @@ def pre_run(params): def post_run(params, output_dict): print("data_setup(): post_run") sys.stdout.flush() + if "use_exported_data" in params: + try: + os.remove(params["use_exported_data"]) + except OSError as e: + print("Error: %s - %s." 
% (e.filename, e.strerror)) + else: + print("use_exported_data not in params") return ModelResult.SUCCESS From 38bc5e9dd6e40a1aa1b65158bf6bbbb7bfa02983 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 26 Feb 2023 23:36:33 -0600 Subject: [PATCH 462/601] Clean up --- workflows/cp-leaveout/scripts/clean-ckpts-run.sh | 1 - workflows/cp-leaveout/scripts/clean-ckpts.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/clean-ckpts-run.sh b/workflows/cp-leaveout/scripts/clean-ckpts-run.sh index 6a80b36b..67ef5240 100755 --- a/workflows/cp-leaveout/scripts/clean-ckpts-run.sh +++ b/workflows/cp-leaveout/scripts/clean-ckpts-run.sh @@ -28,7 +28,6 @@ then fi cd $DIR/save/ckpts/epochs -set -x MODELS=( $( ls ) ) N=${#MODELS[@]} diff --git a/workflows/cp-leaveout/scripts/clean-ckpts.sh b/workflows/cp-leaveout/scripts/clean-ckpts.sh index c4449b76..95b787dd 100755 --- a/workflows/cp-leaveout/scripts/clean-ckpts.sh +++ b/workflows/cp-leaveout/scripts/clean-ckpts.sh @@ -18,7 +18,7 @@ then exit 1 fi -RUNS=( $( echo $DIR/run/1.1.1.3.1.1 ) ) +RUNS=( $( echo $DIR/run/* ) ) for RUN in ${RUNS[@]} do From 5507a762d4f9addeadad58e39c2807e1f70140b1 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 26 Feb 2023 23:36:43 -0600 Subject: [PATCH 463/601] Support big plan N=16 --- workflows/cp-leaveout/test/test-512.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/test/test-512.sh b/workflows/cp-leaveout/test/test-512.sh index 80c8f6c4..27d976a4 100755 --- a/workflows/cp-leaveout/test/test-512.sh +++ b/workflows/cp-leaveout/test/test-512.sh @@ -52,9 +52,13 @@ CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem/top21_2020Jul # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.hdf5 +if (( ! ${BIG_PLAN:-0} )) +then PLAN_JSON=$CANDLE_DATA/plangen_cell703-p4_drug1492-p1-u.json # 2022-07 # PLAN_JSON=$CANDLE_DATA/plangen_CELL2917-p4_DRUG2148-p4.json # 2023-02 -# PLAN_JSON=/gpfs/alpine/med106/proj-shared/brettin/Supervisor/workflows/cp-leaveout/plangen_CELL2917-p4_DRUG2148-p4.json +else +PLAN_JSON=/gpfs/alpine/med106/proj-shared/brettin/Supervisor/workflows/cp-leaveout/plangen_CELL2917-p4_DRUG2148-p4.json +fi # DATAFRAME_CSV=$CANDLE_DATA/top21.h5 # 2022-07 DATAFRAME_CSV=$CANDLE_DATA/top21-cleaned-dd.h5 # NEW 2022-10 # BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno From 42fce3ddc33c21bc28f0b5b203db3fbe9609c5ca Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Sun, 26 Feb 2023 23:41:33 -0600 Subject: [PATCH 464/601] Support multi-digit node components --- workflows/cp-leaveout/swift/workflow.swift | 98 ++++++++++++---------- 1 file changed, 56 insertions(+), 42 deletions(-) diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 18af3e5f..4193357c 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -6,6 +6,7 @@ Or specify the N, S values: 'swift-t workflow.swift -N=4 -S=6 ' for ### tasks. 
+ Flags: -N : Number of nodes per stage (see default in code) -S : Number of stages (see default in code) @@ -22,6 +23,8 @@ --benchmark_data= : Used by data_setup to set softlinks to Uno cache and uno_auc_model.txt + NOTE: "token" variables are used to ensure that children do + not run before their parents NOTE: This workflow has some complex Python Exception handling code that will be pushed into Swift/T for conciseness... NOTE: On Summit, you have to use sys.stdout.flush() after @@ -57,7 +60,6 @@ else N = 0; } // Maximum stage number with default -// (tested up to S=7, 21,844 dummy tasks) int S; S_s = argv("S", "2"); assert(strlen(S_s) > 0, "Set argument S with -S=") => @@ -92,35 +94,49 @@ string turbine_output = getenv("TURBINE_OUTPUT"); printf("plangen: runtype:" + runtype); printf("benchmark_data: " + benchmark_data); -// // For compatibility with obj(): +// For compatibility with obj(): global const string FRAMEWORK = "keras"; -/** RUN STAGE: A recursive function that manages the stage dependencies */ +/** + RUN STAGE: A recursive function that manages the stage dependencies + token_parent: Blocks progress until the parent node is done + parent: The parent node, e.g., "1.2.3" + this: The current node, e.g., "1.2.3.4" +*/ (void v) -run_stage(int N, int S, string this, int stage, void block, - string plan_id, string db_file, string runtype) +run_stage(string db_file, string plan_id, string runtype, + void token_parent, int stage, string parent, string this) { - // printf("stage: %i this: %s", stage, this); + printf("stage: %i parent: %s this: %s", stage, parent, this); // Run the model - void parent = run_single(this, stage, block, plan_id); + void token_this = run_single(plan_id, token_parent, stage, + parent, this); if (stage < S) { // Recurse to the child stages - foreach id_child in [1:N] + foreach child in [1:N] { - run_stage(N, S, - // We want padded node IDs like "1.01.03" , "1.10.16" - "%s.%02i" % (this, id_child), - stage+1, parent, - plan_id, db_file, runtype); + run_stage(db_file, plan_id, runtype, + token_this, stage+1, + this, + "%s.%i" % (this, child) + ); // N, S, } } v = propagate(); } -/** RUN SINGLE: Set up and run a single model via obj(), plus the SQL ops */ -(void v) run_single(string node, int stage, void block, string plan_id) +/** + RUN SINGLE: Set up and run a single model via obj(), plus the SQL ops + token: Block on token so that this node does not run until the + parent is complete. 
+ stage: The current stage, e.g., 4 + parent: The parent node, e.g., "1.2.3" + this: The current node, e.g., "1.2.3.4" +*/ +(void v) run_single(string plan_id, void token, int stage, + string parent, string this) { if (stage == 0) { @@ -128,24 +144,24 @@ run_stage(int N, int S, string this, int stage, void block, } else { - json_fragment = make_json_fragment(node, stage); - json = "{\"node\": \"%s\", %s}" % (node, json_fragment); - block => - printf("run_single(): running obj(%s)", node) => + json_fragment = make_json_fragment(parent, this, stage); + json = "{\"node\": \"%s\", %s}" % (this, json_fragment); + token => + printf("run_single(): running obj(%s)", this) => // Insert the model run into the DB - result1 = plangen_start(node, plan_id); + result1 = plangen_start(this, plan_id); assert(result1 != "EXCEPTION", "Exception in plangen_start()!"); if (result1 == "0") { // Run the model - obj_result = obj(json, exp_id, node); + obj_result = obj(json, exp_id, this); printf("run_single(): completed: node: '%s' result: '%s'", - node, obj_result); + this, obj_result); // Update the DB to complete the model run string result2; if (obj_result != "RUN_EXCEPTION") { - result2 = plangen_stop(node, plan_id); + result2 = plangen_stop(this, plan_id); } else { @@ -160,37 +176,35 @@ run_stage(int N, int S, string this, int stage, void block, else // result1 != 0 { printf("run_single(): plan node already marked complete: " + - "%s result=%s", node, result1) => + "%s result=%s", this, result1) => v = propagate(); } } } /** MAKE JSON FRAGMENT: Construct the JSON parameter fragment for the model */ -(string result) make_json_fragment(string this, int stage) +(string result) make_json_fragment(string parent, string this, int stage) { int epochs = compute_epochs(stage); json_fragment = ---- -"pre_module": "data_setup", -"post_module": "data_setup", -"plan": "%s", -"config_file": "uno_auc_model.txt", -"cache": "cache/top6_auc", -"user": "%s", -"dataframe_from": "%s", -"save_weights": "save/model.h5", -"gpus": "0", -"epochs": %i, -"es": "True", -"early_stopping": %i, +"pre_module": "data_setup", +"post_module": "data_setup", +"plan": "%s", +"config_file": "uno_auc_model.txt", +"cache": "cache/top6_auc", +"user": "%s", +"dataframe_from": "%s", +"save_weights": "save/model.h5", +"gpus": "0", +"epochs": %i, +"es": "True", +"early_stopping": %i, "use_exported_data": "topN.uno.h5", -"benchmark_data": "%s" +"benchmark_data": "%s" ---- % (plan_json, user, dataframe_csv, epochs, early_stopping, benchmark_data); if (stage > 1) { - n = strlen(this); - parent = substring(this, 0, n-3); result = json_fragment + ---- , "initial_weights": "../%s/save/model.h5" @@ -217,5 +231,5 @@ assert(plan_id != "-1", "Plan already exists!"); // Kickoff the workflow stage = 0; -run_stage(N, S, "1", stage, propagate(), plan_id, db_file, runtype); -// printf("CP LEAVEOUT WORKFLOW: RESULTS: COMPLETE"); +run_stage(db_file, plan_id, runtype, + propagate(), stage, "", "1"); From 642d9449559c5d61d64cab9a7f3bb283b19938cd Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 1 Mar 2023 16:50:08 -0600 Subject: [PATCH 465/601] sbcast test works --- scratch/sbcast/sbcast-1.sh | 22 ++++++++++++++++++++++ scratch/sbcast/sbcast-1.swift | 13 +++++++++++++ 2 files changed, 35 insertions(+) create mode 100755 scratch/sbcast/sbcast-1.sh create mode 100644 scratch/sbcast/sbcast-1.swift diff --git a/scratch/sbcast/sbcast-1.sh b/scratch/sbcast/sbcast-1.sh new file mode 100755 index 00000000..f6cfb98c --- /dev/null +++ b/scratch/sbcast/sbcast-1.sh @@ -0,0 
+1,22 @@ +#!/bin/bash +set -eu + +# Add Swift/T to PATH +ROOT=/autofs/nccs-svm1_home1/wozniak/Public/sfw/frontier +SWIFT=$ROOT/swift-t/2023-02-23 +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH +# Add Python to PATH +PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 +PATH=$PY/bin:$PATH + +# Set up data +EXPORTED_DATA_DIR="/ccs/home/hm0/med106_proj/Benchmarks/Pilot1/Uno" +EXPORTED_DATA_FILE="top_21_auc_1fold.uno.h5" + +# Scheduler settings +export PROJECT=MED106 +export TURBINE_PRELAUNCH="sbcast -f -F 8 $EXPORTED_DATA_DIR/$EXPORTED_DATA_FILE /dev/shm/$EXPORTED_DATA_FILE" + +# Run the workflow! +swift-t -m slurm sbcast-1.swift diff --git a/scratch/sbcast/sbcast-1.swift b/scratch/sbcast/sbcast-1.swift new file mode 100644 index 00000000..5b4f33bb --- /dev/null +++ b/scratch/sbcast/sbcast-1.swift @@ -0,0 +1,13 @@ + +import io; +import sys; + +app ls(string dir) +{ + "ls" dir ; +} + +local_prefix = "/dev/shm"; + +printf("local_prefix: '%s'", local_prefix) => + ls(local_prefix); From 180d8048bcf91c1bb8c9c681df1896de072b9ea8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 1 Mar 2023 17:09:22 -0600 Subject: [PATCH 466/601] Adding hook-1.tcl mpi-io.sh --- scratch/sbcast/hook-1.tcl | 36 ++++++++++++++++++++++++++++++++++++ scratch/sbcast/mpi-io.sh | 25 +++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 scratch/sbcast/hook-1.tcl create mode 100755 scratch/sbcast/mpi-io.sh diff --git a/scratch/sbcast/hook-1.tcl b/scratch/sbcast/hook-1.tcl new file mode 100644 index 00000000..f835145b --- /dev/null +++ b/scratch/sbcast/hook-1.tcl @@ -0,0 +1,36 @@ + +# HOOK TCL +# This code runs on each leader rank, +# i.e., once per node. + +# Set a root data directory +set root $env(HOME)/data +puts "HOOK HOST: [exec hostname]" + +# Get the leader communicator from ADLB +set comm [ adlb::comm_get leaders ] +# Get my rank among the leaders +set rank [ adlb::comm_rank $comm ] + +# If I am rank=0, construct the list of files to copy +set EXPORTED_DATA_DIR /ccs/home/hm0/med106_proj/Benchmarks/Pilot1/Uno +set EXPORTED_DATA_FILE top_21_auc_1fold.uno.h5 + +if { $rank == 0 } { + set files [ list $EXPORTED_DATA_DIR/$EXPORTED_DATA_FILE ] + puts "files: $files" +} + +# Broadcast the file list to all leaders +turbine::c::bcast $comm 0 files + +# Make a node-local data directory +set LOCAL_PREFIX /dev/shm + +# Copy each file to the node-local directory +foreach f $files { + if { $rank == 0 } { + puts "copying: $f" + } + turbine::c::copy_to $comm $f $LOCAL_PREFIX +} diff --git a/scratch/sbcast/mpi-io.sh b/scratch/sbcast/mpi-io.sh new file mode 100755 index 00000000..1b4a8492 --- /dev/null +++ b/scratch/sbcast/mpi-io.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# Add Swift/T to PATH +ROOT=/autofs/nccs-svm1_home1/wozniak/Public/sfw/frontier +SWIFT=$ROOT/swift-t/2023-02-23 +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH +# Add Python to PATH +PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 +PATH=$PY/bin:$PATH + +# Set up data +EXPORTED_DATA_DIR="/ccs/home/hm0/med106_proj/Benchmarks/Pilot1/Uno" +EXPORTED_DATA_FILE="top_21_auc_1fold.uno.h5" + +# Scheduler settings +export PROJECT=MED106 + +THIS=$( realpath . ) + +# Run the workflow! 
+swift-t -m slurm \ + -e TURBINE_LEADER_HOOK_STARTUP="$( sed 's/#.*//;s/$/;/' $THIS/hook-1.tcl )" \ + sbcast-1.swift From e8a2868d8810d5f252e82ed02241bbed524ede54 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 1 Mar 2023 17:14:33 -0600 Subject: [PATCH 467/601] Adding README.adoc --- scratch/sbcast/README.adoc | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 scratch/sbcast/README.adoc diff --git a/scratch/sbcast/README.adoc b/scratch/sbcast/README.adoc new file mode 100644 index 00000000..3b44dfee --- /dev/null +++ b/scratch/sbcast/README.adoc @@ -0,0 +1,26 @@ + +Two test cases to broadcast a file to /dev/shm on all compute nodes. +Both cases run the same Swift workflow (!), they just use different external settings +to move the data. + +== sbcast + +Usage: +---- +$ ./sbcast-1.sh +---- + +Inserts an `sbcast` command into the `turbine-slurm.sh` script for execution +just before the workflow starts. Does this via `TURBINE_PRELAUNCH` +https://swift-lang.github.io/swift-t/sites.html#turbine_prelaunch[(manual)]. + +== MPI-IO + +Usage: +---- +$ ./mpi-io.sh +---- + +Uses a Turbine leader hook +http://swift-lang.github.io/swift-t/guide.html#hooks[(manual)] +to use MPI-IO to make the file copy. From 2e0c41061eeb605936f732dea675562156ac57fb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 1 Mar 2023 17:18:42 -0600 Subject: [PATCH 468/601] Respect user TURBINE_PRELAUNCH --- scratch/sbcast/{sbcast-1.swift => workflow.swift} | 0 workflows/common/sh/sched-frontier.sh | 3 ++- 2 files changed, 2 insertions(+), 1 deletion(-) rename scratch/sbcast/{sbcast-1.swift => workflow.swift} (100%) diff --git a/scratch/sbcast/sbcast-1.swift b/scratch/sbcast/workflow.swift similarity index 100% rename from scratch/sbcast/sbcast-1.swift rename to scratch/sbcast/workflow.swift diff --git a/workflows/common/sh/sched-frontier.sh b/workflows/common/sh/sched-frontier.sh index e6b7f824..391eb3d7 100644 --- a/workflows/common/sh/sched-frontier.sh +++ b/workflows/common/sh/sched-frontier.sh @@ -9,6 +9,7 @@ MACHINE="-m slurm" #export QUEUE=${QUEUE:-batch} export PROJECT=${PROJECT:-MED106} -export TURBINE_PRELAUNCH="source activate /gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10" +PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 +export TURBINE_PRELAUNCH="source activate $PY ; ${TURBINE_PRELAUNCH:-}" export TURBINE_LAUNCH_OPTIONS="--gpus-per-task=1 --gpus-per-node=$PPN" From a8118036b3bf46ebb9377340e9c9510343e0217c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 1 Mar 2023 17:30:30 -0600 Subject: [PATCH 469/601] Say "guide" --- scratch/sbcast/README.adoc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scratch/sbcast/README.adoc b/scratch/sbcast/README.adoc index 3b44dfee..1f8edaa8 100644 --- a/scratch/sbcast/README.adoc +++ b/scratch/sbcast/README.adoc @@ -1,5 +1,5 @@ -Two test cases to broadcast a file to /dev/shm on all compute nodes. +Two test cases to broadcast a file to `/dev/shm` on all compute nodes. Both cases run the same Swift workflow (!), they just use different external settings to move the data. @@ -12,7 +12,7 @@ $ ./sbcast-1.sh Inserts an `sbcast` command into the `turbine-slurm.sh` script for execution just before the workflow starts. Does this via `TURBINE_PRELAUNCH` -https://swift-lang.github.io/swift-t/sites.html#turbine_prelaunch[(manual)]. +(https://swift-lang.github.io/swift-t/sites.html#turbine_prelaunch[guide]). 
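A quick illustration of the hook hand-off used by mpi-io.sh above: the sed expression
strips Tcl comments and appends a ";" to every line, which appears intended to keep
hook-1.tcl valid Tcl once it is flattened into the single
TURBINE_LEADER_HOOK_STARTUP value. The input line below is only an assumed example:

----
# Reproduce the transformation applied by mpi-io.sh:
echo 'set rank [ adlb::comm_rank $comm ]  # my rank among the leaders' \
  | sed 's/#.*//;s/$/;/'
# Output: "set rank [ adlb::comm_rank $comm ]  ;"
----
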
== MPI-IO @@ -22,5 +22,5 @@ $ ./mpi-io.sh ---- Uses a Turbine leader hook -http://swift-lang.github.io/swift-t/guide.html#hooks[(manual)] +(http://swift-lang.github.io/swift-t/guide.html#hooks[guide]) to use MPI-IO to make the file copy. From a88066469a5558bc26c14245d8e270496ea5664a Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 3 Mar 2023 18:52:07 -0800 Subject: [PATCH 470/601] o Few steps to get the comparator workflow working --- models/Comparator/cmp_baseline_keras2.py | 20 ++++++++++++-------- models/Comparator/cmp_default_model.txt | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/models/Comparator/cmp_baseline_keras2.py b/models/Comparator/cmp_baseline_keras2.py index 6eba5003..528d0030 100644 --- a/models/Comparator/cmp_baseline_keras2.py +++ b/models/Comparator/cmp_baseline_keras2.py @@ -32,20 +32,22 @@ def run(gParameters): expid = gParameters["experiment_id"] runid = gParameters["run_id"] supervisor = Path(file_path).absolute().parent.parent - workflows = supervisor / "workflows" - print(model_sh) + workflows = supervisor / "workflows" + #print(model_sh) + model1 = gParameters["model1"] + model2 = gParameters["model2"] os.chdir(output_dir) cmd = make_cmd(str(workflows), expid, runid) run_dir = Path(os.getenv("CANDLE_DATA_DIR")) \ / model1 / "Output" / expid / runid - print("env: " + str(env)) + #print("env: " + str(env)) print("cmd: " + str(cmd)) - results = {} + results = [] for i in [ 1, 2 ]: - result = model_name = gParameters["model%i" % i] env = make_env(str(workflows), model_name) - with open(run_dir + "/start-%i.log" % i, "w") as fp: + print("command is ", cmd, "/nenv is:", env) + with open(str(run_dir) + "/start-%i.log" % i, "w") as start_log: subprocess.run(cmd, env=env, stdout=start_log, stderr=subprocess.STDOUT) @@ -59,6 +61,8 @@ def run(gParameters): def make_env(workflows, model_name): + output_dir = "./tmp" + expid = 'one_exp' env = { "WORKFLOWS_ROOT": workflows, "TURBINE_OUTPUT": output_dir, "EXPID": expid, @@ -75,13 +79,13 @@ def make_env(workflows, model_name): def make_cmd(workflows, expid, runid): - model_sh = workflows / "common" / "sh" / "model.sh" - + model_sh = workflows + "/common" +"/sh" + "/model.sh" cmd = [ "bash", model_sh, "keras2", "{}", # empty JSON fragment expid, runid ] + return cmd def main(): gParameters = initialize_parameters() diff --git a/models/Comparator/cmp_default_model.txt b/models/Comparator/cmp_default_model.txt index dca137c6..9bcea1e0 100644 --- a/models/Comparator/cmp_default_model.txt +++ b/models/Comparator/cmp_default_model.txt @@ -1,6 +1,6 @@ [Global_Params] -model_name = 'cmp' +model_name = 'graphdrp' model1 = 'graphdrp' model2 = 'graphdrp' # 'graphdrp2' From 9cea2473c495ae9fd73df488d566200c62d5c714 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 8 Mar 2023 13:42:21 -0600 Subject: [PATCH 471/601] Adding cmp-cv/swift/workflow.sh --- workflows/cmp-cv/swift/workflow.sh | 110 +++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100755 workflows/cmp-cv/swift/workflow.sh diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh new file mode 100755 index 00000000..c5fb9ffa --- /dev/null +++ b/workflows/cmp-cv/swift/workflow.sh @@ -0,0 +1,110 @@ +#! /usr/bin/env bash +set -eu + +# CMP-CV WORKFLOW SH + +# Autodetect this workflow directory +export CANDLE_PROJECT_ROOT=$( realpath $( dirname $0 )/.. ) +export WORKFLOWS_ROOT=$( realpath $CANDLE_PROJECT_ROOT/.. 
) + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "CMP-CV: usage: workflow.sh SITE EXPID CFG_SYS PLAN" +} + +if (( ${#} != 4 )) +then + usage + exit 1 +fi + +if ! { + # Sets SITE + # Sets EXPID, TURBINE_OUTPUT + # Sets CFG_SYS + # PLAN is the hyperparameter list file + get_site $1 && \ + get_expid $2 && \ + get_cfg_sys $3 && \ + UPF=$4 + } +then + usage + exit 1 +fi + +source_site env $SITE +source_site sched $SITE + +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + +log_path PYTHONPATH + +export TURBINE_JOBNAME="CMP_${EXPID}" + +export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} +export BENCHMARK_TIMEOUT + +CMD_LINE_ARGS=( --expid=$EXPID + --benchmark_timeout=$BENCHMARK_TIMEOUT + --plan=$PLAN + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Copy settings to TURBINE_OUTPUT for provenance +cp $CFG_SYS $TURBINE_OUTPUT + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run + +cp -v $UPF $TURBINE_OUTPUT + +# TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" +TURBINE_STDOUT= + +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + abort "cmp-cv workflow.sh: Set CANDLE_DATA_DIR!" +fi + +export CANDLE_IMAGE=${CANDLE_IMAGE:-} + +which swift-t + +swift-t -n $PROCS \ + -o $TURBINE_OUTPUT/workflow.tic \ + ${MACHINE:-} \ + -p \ + -I $WORKFLOWS_ROOT/common/swift \ + -i obj_$SWIFT_IMPL \ + -e BENCHMARKS_ROOT \ + -e CANDLE_PROJECT_ROOT \ + -e MODEL_SH \ + -e FI_MR_CACHE_MAX_COUNT=0 \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e MODEL_NAME=${MODEL_NAME:-MODEL_NULL} \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e TURBINE_MPI_THREAD=${TURBINE_MPI_THREAD:-1} \ + $( python_envs ) \ + -e TURBINE_STDOUT=$TURBINE_STDOUT \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} + +# Can provide this to debug Python settings: +# -e PYTHONVERBOSE=1 +# Can provide this if needed for debugging crashes: +# -e PYTHONUNBUFFERED=1 +# Can provide this if needed to reset PATH: +# -e PATH=$PATH From bf00a4377e00d8dafb08ca587cd776bc4d6dbfde Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 8 Mar 2023 13:45:22 -0600 Subject: [PATCH 472/601] Adding cmp-cv/swift/workflow.swift --- workflows/cmp-cv/swift/workflow.swift | 48 +++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 workflows/cmp-cv/swift/workflow.swift diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift new file mode 100644 index 00000000..046bd162 --- /dev/null +++ b/workflows/cmp-cv/swift/workflow.swift @@ -0,0 +1,48 @@ + +/** + CMP-CV WORKFLOW.SWIFT +*/ + +import assert; +import io; +import json; +import files; +import string; +import sys; + +import candle_utils; +report_env(); + +string FRAMEWORK = "keras"; + +// Scan command line +file plan = input(argv("plan")); +int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); + +string model_name = getenv("MODEL_NAME"); +string expid = getenv("EXPID"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +// Report some key facts: +printf("CMP-CV: %s", filename(plan)); +system1("date \"WORKFLOW START: +%Y-%m-%d %H:%M\""); + +// Read unrolled parameter file +string plan_lines[] = file_lines(plan); + +// Resultant output values: +string results[]; + +// Evaluate each parameter set +foreach params, i in plan_lines +{ + printf("params: 
%s", params); + runid = json_get(params, "id"); + results[i] = obj(params, expid, runid); + assert(results[i] != "EXCEPTION", "exception in obj()!"); +} + +// Join all result values into one big semicolon-delimited string +string result = join(results, ";"); +// and print it +printf("WORKFLOW RESULT: " + result); From 83e628c7f79720c5e35746d97b7662e1c5a33f60 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 8 Mar 2023 13:49:58 -0600 Subject: [PATCH 473/601] Initial structure --- workflows/cmp-cv/test/cfg-sys-1.sh | 27 ++++++++++++++++++++++++++ workflows/cmp-cv/test/plan-small-1.txt | 2 ++ workflows/cmp-cv/test/test-small-1.sh | 25 ++++++++++++++++++++++++ 3 files changed, 54 insertions(+) create mode 100644 workflows/cmp-cv/test/cfg-sys-1.sh create mode 100644 workflows/cmp-cv/test/plan-small-1.txt create mode 100755 workflows/cmp-cv/test/test-small-1.sh diff --git a/workflows/cmp-cv/test/cfg-sys-1.sh b/workflows/cmp-cv/test/cfg-sys-1.sh new file mode 100644 index 00000000..5158b14a --- /dev/null +++ b/workflows/cmp-cv/test/cfg-sys-1.sh @@ -0,0 +1,27 @@ + +# CMP-CV CFG SYS 1 + +# Use 1 for interactive workflows +# export INTERACTIVE=1 + +# The number of MPI processes +# Note that 1 process is reserved for Swift/T +# For example, if PROCS=4 that gives you 3 workers, +# i.e., 3 concurrent Keras runs. +export PROCS=${PROCS:-2} + +# MPI processes per node. This should not exceed PROCS. +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-2} + +#export QUEUE=${QUEUE:-batch} + +export WALLTIME=${WALLTIME:-1:00:00} +echo WALLTIME: $WALLTIME + +# export MAIL_ENABLED=1 +# export MAIL_ADDRESS=woz@anl.gov + +# Benchmark run timeout: benchmark run will timeouT +# after the specified number of seconds. -1 is no timeout. +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} diff --git a/workflows/cmp-cv/test/plan-small-1.txt b/workflows/cmp-cv/test/plan-small-1.txt new file mode 100644 index 00000000..618fa71c --- /dev/null +++ b/workflows/cmp-cv/test/plan-small-1.txt @@ -0,0 +1,2 @@ +# PLAN SMALL 1 TXT +{ "hyperparam1": "value1", ... } diff --git a/workflows/cmp-cv/test/test-small-1.sh b/workflows/cmp-cv/test/test-small-1.sh new file mode 100755 index 00000000..c2adf22d --- /dev/null +++ b/workflows/cmp-cv/test/test-small-1.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# CMP-CV TEST SMALL 1 + +if (( ${#} != 2 )) +then + echo "usage: test BENCHMARK_NAME SITE" + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( realpath $( dirname $0 ) ) +CANDLE_PROJECT_ROOT=$( realpath $THIS ) +WORKFLOWS_ROOT=$( realpath $THIS/../.. 
) +export EMEWS_PROJECT_ROOT + +export OBJ_RETURN="val_loss" +CFG_SYS=$THIS/cfg-sys-1.sh + +export CANDLE_MODEL_TYPE="BENCHMARKS" +$CANDLE_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/plan-small-1.txt From 9454737514f60785d9875a818c0c48738fd0e4a6 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 15 Mar 2023 14:16:01 -0500 Subject: [PATCH 474/601] o Use CANDLE_OUTPUT_DIR as this is a required variable for sending o/p from candle_lib to use other o/p options than CANDLE_DATA_DIR --- workflows/mlrMBO/swift/workflow.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index f125ac38..ec6eace7 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -178,6 +178,7 @@ swift-t -O 0 -n $PROCS \ -e EMEWS_PROJECT_ROOT \ $( python_envs ) \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e CANDLE_OUTPUT_DIR=$TURBINE_OUTPUT \ -e OBJ_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ From 3b1c4c7bdeef35a1eafadcfb0bccf25ceb03bff3 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 15 Mar 2023 14:43:34 -0500 Subject: [PATCH 475/601] o Add export of CANDLE_OUTPUT_DIR to workflow.sh for non SINGULARITY case --- workflows/mlrMBO/swift/workflow.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index ec6eace7..f5aebb4e 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -46,6 +46,8 @@ then TURBINE_OUTPUT=$CANDLE_DATA_DIR/output printf "Running mlrMBO workflow with model %s and image %s:%s\n" \ $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE +else + export CANDLE_OUTPUT_DIR=$TURBINE_OUTPUT fi get_site $1 # Sets SITE From bf222c0a188be9b3d9c292f94754ee680ffa5817 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 15 Mar 2023 15:01:09 -0500 Subject: [PATCH 476/601] o Use model.sh to populate CANDLE_OUTPUT_DIR --- workflows/common/sh/model.sh | 1 + workflows/mlrMBO/swift/workflow.sh | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 8cb09f39..e6a8240f 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -48,6 +48,7 @@ then INTERNAL_DIRECTORY=$MODEL_NAME/Output/$EXPID/$RUNID else # "BENCHMARKS" INSTANCE_DIRECTORY=$TURBINE_OUTPUT/$RUNID + export CANDLE_OUTPUT_DIR=$INSTANCE_DIRECTORY fi # All stdout/stderr after this point goes into model.log ! 
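The model.sh hunk above is what makes CANDLE_OUTPUT_DIR visible to the model code;
the workflow.sh change below removes the earlier export added in the previous patch.
A minimal sketch of how a model-side script might consume the variable (everything
here except CANDLE_OUTPUT_DIR and model.log is an illustrative assumption):

----
# Sketch: read the per-run output directory exported by model.sh above.
OUTDIR=${CANDLE_OUTPUT_DIR:-$PWD}   # assumed fallback: current directory
mkdir -p $OUTDIR
echo "model output directory: $OUTDIR"
touch $OUTDIR/model.log             # stdout/stderr would be captured here
----
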
diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index f5aebb4e..f125ac38 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -46,8 +46,6 @@ then TURBINE_OUTPUT=$CANDLE_DATA_DIR/output printf "Running mlrMBO workflow with model %s and image %s:%s\n" \ $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE -else - export CANDLE_OUTPUT_DIR=$TURBINE_OUTPUT fi get_site $1 # Sets SITE @@ -180,7 +178,6 @@ swift-t -O 0 -n $PROCS \ -e EMEWS_PROJECT_ROOT \ $( python_envs ) \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ - -e CANDLE_OUTPUT_DIR=$TURBINE_OUTPUT \ -e OBJ_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ From f0e4eaef5d3bf944db0ca30042b79a82a14a68bc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 16 Mar 2023 15:20:24 -0500 Subject: [PATCH 477/601] WIP cmp-cv workflow --- workflows/cmp-cv/py/compare.py | 21 ++++ workflows/cmp-cv/swift/workflow.swift | 49 +++++--- workflows/cmp-cv/test/make-upf-1.sh | 26 ++++ workflows/cmp-cv/test/models-1.txt | 3 + workflows/cmp-cv/test/upf-1.txt | 165 ++++++++++++++++++++++++++ 5 files changed, 250 insertions(+), 14 deletions(-) create mode 100644 workflows/cmp-cv/py/compare.py create mode 100755 workflows/cmp-cv/test/make-upf-1.sh create mode 100644 workflows/cmp-cv/test/models-1.txt create mode 100644 workflows/cmp-cv/test/upf-1.txt diff --git a/workflows/cmp-cv/py/compare.py b/workflows/cmp-cv/py/compare.py new file mode 100644 index 00000000..0ce13b55 --- /dev/null +++ b/workflows/cmp-cv/py/compare.py @@ -0,0 +1,21 @@ + +CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") + +def compare(exp_id, run_id): + + print(f"compare: run_id={run_id}") + gParams = read_params(exp_id, run_id) + model = gParams("model_name") + + directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" + df_res = pd.read_csv(f"{directory}/test_predictions.csv") + + # a class to calculate errors for subsets of the validation/test set + bmk = Benchmark(fp_path='drug_features.csv') # TODO: have to have a drug features for a common test set + subset_err = bmk.error_by_feature_domains_model(df_res, conditions) + + # collect results for comparison + subset_err.set_index('prop', inplace=True) # TODO: use 'prop' as a parameter and move it to cmp_models.txt + cmp_results[run_id] = subset_err.loc[cmp_prop, 'error'] # this is the property based on which we want to do the comparison + with open(f"{directory}/subset_err.txt", "w") as fp: + fp.write(str(cmp_results[run_id])) diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift index 046bd162..c0922ad8 100644 --- a/workflows/cmp-cv/swift/workflow.swift +++ b/workflows/cmp-cv/swift/workflow.swift @@ -11,38 +11,59 @@ import string; import sys; import candle_utils; -report_env(); +// report_env(); string FRAMEWORK = "keras"; // Scan command line -file plan = input(argv("plan")); +// file plan = input(argv("plan")); +file model_file = input(argv("models")); +file gparams_file = input(argv("gparams")); int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); -string model_name = getenv("MODEL_NAME"); string expid = getenv("EXPID"); string turbine_output = getenv("TURBINE_OUTPUT"); // Report some key facts: -printf("CMP-CV: %s", filename(plan)); +printf("CMP-CV: %s", filename(model_file)); system1("date \"WORKFLOW START: +%Y-%m-%d %H:%M\""); // Read unrolled parameter file -string plan_lines[] = file_lines(plan); +// string plan_lines[] = file_lines(plan); +string model_lines[] = 
file_lines(model_file); + +string gparams_lines[] = file_lines(gparams_file); // Resultant output values: string results[]; +// string run_ids[]; + +compare(string exp_id, string run_id) +{ + python_persist("import compare", + "compare.compare("%s", \"%s\")") % (exp_id, run_id)); +} // Evaluate each parameter set -foreach params, i in plan_lines +foreach model, i in model_lines { - printf("params: %s", params); - runid = json_get(params, "id"); - results[i] = obj(params, expid, runid); - assert(results[i] != "EXCEPTION", "exception in obj()!"); + foreach gparam, j in gparams_lines + { + run_id = i*1000000 + j; + + // printf("model: %s", model); + m = "\"model_name\": \"%s\"" % model; + + gparams = replace(gparam, "MORE_PARAMS", m, 0); + printf(gparams); + results[run_id] = obj(params, expid, runid) => + compare(exp_id, run_id); + + // assert(results[i] != "EXCEPTION", "exception in obj()!"); + } } -// Join all result values into one big semicolon-delimited string -string result = join(results, ";"); -// and print it -printf("WORKFLOW RESULT: " + result); +// // Join all result values into one big semicolon-delimited string +// string result = join(run_ids, ";"); +// // and print it +// printf("WORKFLOW RESULT: " + result); diff --git a/workflows/cmp-cv/test/make-upf-1.sh b/workflows/cmp-cv/test/make-upf-1.sh new file mode 100755 index 00000000..eb5c0049 --- /dev/null +++ b/workflows/cmp-cv/test/make-upf-1.sh @@ -0,0 +1,26 @@ +#!/bin/zsh + + + +OUTPUT=$1 + +# Use ZSH for range operation + +EPOCHS_MIN=10 +EPOCHS_MAX=20 +BATCH_SIZE_MIN=5 +BATCH_SIZE_MAX=7 + + +for EPOCHS in {$EPOCHS_MIN..$EPOCHS_MAX} +do + for BATCH_SIZE in {$BATCH_SIZE_MIN..$BATCH_SIZE_MAX} + do + BS2=$(( 2 ** BATCH_SIZE )) + echo "{" + echo "\"epochs\": $EPOCHS," + echo "\"batch_size\": $BATCH_SIZE," + echo "MORE_PARAMS" + echo "}" + done +done > $OUTPUT diff --git a/workflows/cmp-cv/test/models-1.txt b/workflows/cmp-cv/test/models-1.txt new file mode 100644 index 00000000..169e86d1 --- /dev/null +++ b/workflows/cmp-cv/test/models-1.txt @@ -0,0 +1,3 @@ +DrugCell +SWnet +# tCNN diff --git a/workflows/cmp-cv/test/upf-1.txt b/workflows/cmp-cv/test/upf-1.txt new file mode 100644 index 00000000..c9f5321e --- /dev/null +++ b/workflows/cmp-cv/test/upf-1.txt @@ -0,0 +1,165 @@ +{ +"epochs": 10, +"batch_size": 5, +MORE_PARAMS +} +{ +"epochs": 10, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 10, +"batch_size": 7, +MORE_PARAMS +} +{ +"epochs": 11, +"batch_size": 5, +MORE_PARAMS +} +{ +"epochs": 11, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 11, +"batch_size": 7, +MORE_PARAMS +} +{ +"epochs": 12, +"batch_size": 5, +MORE_PARAMS +} +{ +"epochs": 12, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 12, +"batch_size": 7, +MORE_PARAMS +} +{ +"epochs": 13, +"batch_size": 5, +MORE_PARAMS +} +{ +"epochs": 13, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 13, +"batch_size": 7, +MORE_PARAMS +} +{ +"epochs": 14, +"batch_size": 5, +MORE_PARAMS +} +{ +"epochs": 14, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 14, +"batch_size": 7, +MORE_PARAMS +} +{ +"epochs": 15, +"batch_size": 5, +MORE_PARAMS +} +{ +"epochs": 15, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 15, +"batch_size": 7, +MORE_PARAMS +} +{ +"epochs": 16, +"batch_size": 5, +MORE_PARAMS +} +{ +"epochs": 16, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 16, +"batch_size": 7, +MORE_PARAMS +} +{ +"epochs": 17, +"batch_size": 5, +MORE_PARAMS +} +{ +"epochs": 17, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 17, +"batch_size": 7, +MORE_PARAMS +} +{ +"epochs": 18, +"batch_size": 
5, +MORE_PARAMS +} +{ +"epochs": 18, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 18, +"batch_size": 7, +MORE_PARAMS +} +{ +"epochs": 19, +"batch_size": 5, +MORE_PARAMS +} +{ +"epochs": 19, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 19, +"batch_size": 7, +MORE_PARAMS +} +{ +"epochs": 20, +"batch_size": 5, +MORE_PARAMS +} +{ +"epochs": 20, +"batch_size": 6, +MORE_PARAMS +} +{ +"epochs": 20, +"batch_size": 7, +MORE_PARAMS +} From 853b06568012563fb679b9eab867336e9c3a4809 Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Tue, 21 Mar 2023 22:51:09 -0700 Subject: [PATCH 478/601] workflow to find domain errors --- workflows/cmp-cv/swift/workflow.sh | 15 ++- workflows/cmp-cv/swift/workflow.swift | 53 +++++--- workflows/cmp-cv/test/models-1.txt | 2 +- workflows/cmp-cv/test/test-small-1.sh | 26 ++-- workflows/cmp-cv/test/upf-1.txt | 169 +----------------------- workflows/common/python/compare.py | 162 +++++++++++++++++++++++ workflows/common/python/model_runner.py | 89 +++++++------ workflows/common/sh/model.sh | 7 +- workflows/common/swift/obj_app.swift | 12 +- 9 files changed, 289 insertions(+), 246 deletions(-) create mode 100644 workflows/common/python/compare.py diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh index c5fb9ffa..71edf3ae 100755 --- a/workflows/cmp-cv/swift/workflow.sh +++ b/workflows/cmp-cv/swift/workflow.sh @@ -17,7 +17,7 @@ usage() echo "CMP-CV: usage: workflow.sh SITE EXPID CFG_SYS PLAN" } -if (( ${#} != 4 )) +if (( ${#} != 5 )) then usage exit 1 @@ -32,6 +32,7 @@ if ! { get_expid $2 && \ get_cfg_sys $3 && \ UPF=$4 + MODELS=$5 } then usage @@ -43,6 +44,8 @@ source_site sched $SITE # Set up PYTHONPATH for model source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh +export PYTHONPATH="${PYTHONPATH}:/homes/ac.gpanapitiya/ccmg-mtg/models/to_Candle/DrugCell" +export PYTHONPATH="${PYTHONPATH}:/homes/ac.gpanapitiya/ccmg-mtg/models/to_Candle/SWnet" log_path PYTHONPATH @@ -50,10 +53,12 @@ export TURBINE_JOBNAME="CMP_${EXPID}" export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} export BENCHMARK_TIMEOUT - -CMD_LINE_ARGS=( --expid=$EXPID - --benchmark_timeout=$BENCHMARK_TIMEOUT - --plan=$PLAN +PLAN="PLAN_NOT_DEFINED" +CMD_LINE_ARGS=( -expid=$EXPID + -benchmark_timeout=$BENCHMARK_TIMEOUT + -plan=$PLAN + -models=$MODELS + -gparams=$UPF ) USER_VARS=( $CMD_LINE_ARGS ) diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift index c0922ad8..9e69f7f6 100644 --- a/workflows/cmp-cv/swift/workflow.swift +++ b/workflows/cmp-cv/swift/workflow.swift @@ -13,7 +13,7 @@ import sys; import candle_utils; // report_env(); -string FRAMEWORK = "keras"; +string FRAMEWORK = "pytorch"; // Scan command line // file plan = input(argv("plan")); @@ -38,30 +38,45 @@ string gparams_lines[] = file_lines(gparams_file); string results[]; // string run_ids[]; -compare(string exp_id, string run_id) +// compare(string exp_id, string run_id) +// { +// python_persist("import compare", +// "compare.compare(\"%s\", \"%s\")") % (exp_id, run_id); +// } + +compare(string expid, string runid) { - python_persist("import compare", - "compare.compare("%s", \"%s\")") % (exp_id, run_id)); + python_persist("import compare", "compare.compare(\"%s\", \"%s\")" % (expid, runid) ); + // python_persist("import compare", "compare.compare()"); } // Evaluate each parameter set -foreach model, i in model_lines +// foreach model, i in model_lines +// { +foreach gparam, j in gparams_lines { - foreach gparam, j in gparams_lines - { - run_id = i*1000000 + j; - - 
// printf("model: %s", model); - m = "\"model_name\": \"%s\"" % model; - - gparams = replace(gparam, "MORE_PARAMS", m, 0); - printf(gparams); - results[run_id] = obj(params, expid, runid) => - compare(exp_id, run_id); - - // assert(results[i] != "EXCEPTION", "exception in obj()!"); - } + // runid = i*1000000 + j; + runid = j; + + printf("runid: %s", runid); + // printf("model: %s", model); + + // printf("model: %s", model); + // m = "\"model_name\": \"%s\"" % model; + + // gparams = replace(gparam, "MORE_PARAMS", m, 0); + printf("gparams: %s", gparam); + // printf("GPARAMS: %s", gparams); + model_name = json_get(gparam, "model_name"); + candle_image = json_get(gparam, "candle_image"); + printf("MODEL: %s", model_name); + // printf(gparams); + // results[runid] = obj(gparam, expid, repr(runid) ); + results[runid] = obj(gparam, expid, repr(runid), model_name, candle_image) => compare(expid, repr(runid) ); + + // assert(results[i] != "EXCEPTION", "exception in obj()!"); } +// } // // Join all result values into one big semicolon-delimited string // string result = join(run_ids, ";"); diff --git a/workflows/cmp-cv/test/models-1.txt b/workflows/cmp-cv/test/models-1.txt index 169e86d1..650f4e0b 100644 --- a/workflows/cmp-cv/test/models-1.txt +++ b/workflows/cmp-cv/test/models-1.txt @@ -1,3 +1,3 @@ DrugCell -SWnet +# SWnet # tCNN diff --git a/workflows/cmp-cv/test/test-small-1.sh b/workflows/cmp-cv/test/test-small-1.sh index c2adf22d..26aeb264 100755 --- a/workflows/cmp-cv/test/test-small-1.sh +++ b/workflows/cmp-cv/test/test-small-1.sh @@ -3,23 +3,27 @@ set -eu # CMP-CV TEST SMALL 1 -if (( ${#} != 2 )) -then - echo "usage: test BENCHMARK_NAME SITE" - exit 1 -fi +# if (( ${#} != 2 )) +# then +# echo "usage: test BENCHMARK_NAME SITE" +# exit 1 +# fi -export MODEL_NAME=$1 -SITE=$2 +# export MODEL_NAME=$1 +SITE=$1 # Self-configure -THIS=$( realpath $( dirname $0 ) ) -CANDLE_PROJECT_ROOT=$( realpath $THIS ) -WORKFLOWS_ROOT=$( realpath $THIS/../.. ) +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. 
; /bin/pwd ) export EMEWS_PROJECT_ROOT export OBJ_RETURN="val_loss" CFG_SYS=$THIS/cfg-sys-1.sh +# export MODEL_NAME="DrugCell" +# export CANDLE_IMAGE=/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif +export CANDLE_MODEL_TYPE="SINGULARITY" + export CANDLE_MODEL_TYPE="BENCHMARKS" -$CANDLE_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/plan-small-1.txt +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt $THIS/models-1.txt diff --git a/workflows/cmp-cv/test/upf-1.txt b/workflows/cmp-cv/test/upf-1.txt index c9f5321e..9c1a0121 100644 --- a/workflows/cmp-cv/test/upf-1.txt +++ b/workflows/cmp-cv/test/upf-1.txt @@ -1,165 +1,4 @@ -{ -"epochs": 10, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 10, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 10, -"batch_size": 7, -MORE_PARAMS -} -{ -"epochs": 11, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 11, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 11, -"batch_size": 7, -MORE_PARAMS -} -{ -"epochs": 12, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 12, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 12, -"batch_size": 7, -MORE_PARAMS -} -{ -"epochs": 13, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 13, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 13, -"batch_size": 7, -MORE_PARAMS -} -{ -"epochs": 14, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 14, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 14, -"batch_size": 7, -MORE_PARAMS -} -{ -"epochs": 15, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 15, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 15, -"batch_size": 7, -MORE_PARAMS -} -{ -"epochs": 16, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 16, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 16, -"batch_size": 7, -MORE_PARAMS -} -{ -"epochs": 17, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 17, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 17, -"batch_size": 7, -MORE_PARAMS -} -{ -"epochs": 18, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 18, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 18, -"batch_size": 7, -MORE_PARAMS -} -{ -"epochs": 19, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 19, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 19, -"batch_size": 7, -MORE_PARAMS -} -{ -"epochs": 20, -"batch_size": 5, -MORE_PARAMS -} -{ -"epochs": 20, -"batch_size": 6, -MORE_PARAMS -} -{ -"epochs": 20, -"batch_size": 7, -MORE_PARAMS -} +{"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} +{"id": "RUN001", "epochs": 2, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} +{"id": "RUN002", "epochs": 1, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} +{"id": "RUN003", "epochs": 2, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} \ No newline at end of file diff --git a/workflows/common/python/compare.py b/workflows/common/python/compare.py new file mode 100644 index 00000000..ce84b41a --- /dev/null +++ b/workflows/common/python/compare.py @@ -0,0 +1,162 @@ +import os +import pandas as pd +import pandas as pd +import numpy as np +from sklearn.metrics import mean_squared_error + +conditions = pd.DataFrame([[ 'nAromAtom' , 5, 10 ], + ['nAtom', 20, 50], + ['BertzCT', 800, 1000]], + columns=['prop', 'low', 'high']) +# from cmp_utils import conditions, Benchmark + +CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") + +def compare(exp_id, run_id): + cmp_results={} + 
print(f"compare: run_id={run_id}") + # gParams = read_params(exp_id, run_id) + # model = gParams("model_name") + + model = "DrugCell" # TODO: Hardcoded. have to get this from output dir? + turbine_output = os.getenv("TURBINE_OUTPUT") + + outdir = os.path.join(turbine_output, run_id, 'Output', 'EXP000', 'RUN000') # TODO: Have to fix this + directory = outdir + # directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" + print("reading the predictions....") + df_res = pd.read_csv(f"{directory}/test_predictions.csv") + + # a class to calculate errors for subsets of the validation/test set + print("reading the drug feature file....") + # TODO: Should have to save the above file in this file + bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv') # TODO: have to have a drug features for a common test set + subset_err, final_domain_err = bmk.error_by_feature_domains_model(df_res, conditions) + + # or this + # fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + # subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) + + # collect results for comparison + cmp_prop = 'nAtom' # TODO: Get this from gParameters + subset_err.set_index('prop', inplace=True) # TODO: use 'prop' as a parameter and move it to cmp_models.txt + cmp_results[run_id] = subset_err.loc[cmp_prop, 'error'] # this is the property based on which we want to do the comparison + with open(f"{directory}/subset_err.txt", "w") as fp: + fp.write(str(cmp_results[run_id])) + + return str(cmp_results[run_id]) + + + + +def error_by_feature_domains_model(fp_path, preds, conditions): + + + fps = pd.read_csv(fp_path) + report = [] + preds['err'] = abs(preds['true'] - preds['pred']) + keep = preds.copy() + for i in range(conditions.shape[0]): + + prop = conditions.loc[i, 'prop'] + low = conditions.loc[i, 'low'] + high = conditions.loc[i, 'high'] + + locs = np.logical_and(fps[prop] <= high , fps[prop] > low) + smiles = fps.loc[locs, 'smiles'].values + tmp = preds[preds.smiles.isin(smiles)] + mean_err = tmp.err.mean() + + report.append([prop, low, high, mean_err]) + + keep = keep[keep.smiles.isin(smiles)] + + final_domain_err = keep.err.mean() # return this + report = pd.DataFrame(report, columns=['prop', 'low', 'high', 'error']) + return report, final_domain_err + + + + + +class Benchmark: + + def __init__(self, fp_path): + + self.fps = pd.read_csv(fp_path) + # self.model_preds = model_preds + # self.feature_conditions = feature_conditions + self.reports = {} + + + def error_by_feature_domains_model(self, preds, conditions): + + fps = self.fps + report = [] + preds['err'] = abs(preds['true'] - preds['pred']) + keep = preds.copy() + for i in range(conditions.shape[0]): + + prop = conditions.loc[i, 'prop'] + low = conditions.loc[i, 'low'] + high = conditions.loc[i, 'high'] + + locs = np.logical_and(fps[prop] <= high , fps[prop] > low) + smiles = fps.loc[locs, 'smiles'].values + tmp = preds[preds.smiles.isin(smiles)] + mean_err = tmp.err.mean() + + report.append([prop, low, high, mean_err]) + + keep = keep[keep.smiles.isin(smiles)] + + final_domain_err = keep.err.mean() # return this + report = pd.DataFrame(report, columns=['prop', 'low', 'high', 'error']) + return report, final_domain_err + + + + + + + def error_by_feature_domains(self, feature_conditions): + + results=[] + for model_name, pred in self.model_preds.items(): + + report = self.error_by_feature_domains_model(pred, feature_conditions) + report.loc[:, 'model'] = model_name + results.append(report) + + results = pd.concat(results, axis=0) + results 
= results.loc[:, ['model', 'prop', 'low', 'high', 'error']] + results.reset_index(drop=True, inplace=True) + + return results + + + def rank_by_acc(self, metric='rmse', th=3): + + results={} + for model_name, pred in self.model_preds.items(): + sub = pred[pred.labels > th] + rmse = mean_squared_error(y_true=sub['labels'], y_pred=sub['preds'])**.5 + + results[model_name] = {'rmse': rmse} + + results = pd.DataFrame.from_dict(results) + results = results.T + return results + + + +def create_grid_files(): + + dc_grid = {'epochs': [1, 2], 'lr': [1e-2, 1e-3]} + sw_grid = {'epochs': [3, 4], 'lr': [1e-2, 1e-5]} + + with open('DrugCell_grid.json', 'w') as fp: + json.dump(dc_grid, fp) + + with open('SWnet_CCLE_grid.json', 'w') as fp: + json.dump(sw_grid, fp) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 8797ba7d..0309171a 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -171,50 +171,62 @@ def run(hyper_parameter_map, obj_return): # Run the model! log("PKG RUN START") - try: - history = pkg.run(params) - except Exception as e: - logger.warn("RUN EXCEPTION: " + str(e)) - print("RUN EXCEPTION: " + str(e)) - info = sys.exc_info() - s = traceback.format_tb(info[2]) - sys.stdout.write('\\n\\nEXCEPTION in model run(): \\n' + - repr(e) + ' ... \\n' + ''.join(s)) - sys.stdout.write('\\n') - sys.stdout.flush() - - # logger.warn("Caught InvalidArgumentError") - exception = True - exit(1) - log("PKG RUN STOP") - - if framework == "keras": - runner_utils.keras_clear_session(framework) - - stop_perf(Ps) - finish = time.time() - duration = finish - start # check for epochs if not present set to 1, used for checking early stopping in function get_results if "epochs" in hyper_parameter_map: epochs = hyper_parameter_map["epochs"] else: epochs = 1 + + if framework == 'keras': + + try: + history = pkg.run(params) + except Exception as e: + logger.warn("RUN EXCEPTION: " + str(e)) + print("RUN EXCEPTION: " + str(e)) + info = sys.exc_info() + s = traceback.format_tb(info[2]) + sys.stdout.write('\\n\\nEXCEPTION in model run(): \\n' + + repr(e) + ' ... \\n' + ''.join(s)) + sys.stdout.write('\\n') + sys.stdout.flush() + + # logger.warn("Caught InvalidArgumentError") + exception = True + exit(1) + log("PKG RUN STOP") + + # if framework == "keras": + runner_utils.keras_clear_session(framework) - # Default result if there is no val_loss (as in infer.py) - result = 0 - history_result = {} - if not exception: - logger.info("DONE: run_id %s in %0.2f seconds." % - (hyper_parameter_map["run_id"], duration)) - if history is not None: - if history == "EPOCHS_COMPLETED_ALREADY": - result, history_result = "EPOCHS_COMPLETED_ALREADY", None - else: - result, history_result = get_results(history, obj_return, - epochs) - else: - result, history_result = "RUN_EXCEPTION", None + stop_perf(Ps) + finish = time.time() + duration = finish - start + + # Default result if there is no val_loss (as in infer.py) + result = 0 + history_result = {} + if not exception: + logger.info("DONE: run_id %s in %0.2f seconds." 
% + (hyper_parameter_map["run_id"], duration)) + if history is not None: + if history == "EPOCHS_COMPLETED_ALREADY": + result, history_result = "EPOCHS_COMPLETED_ALREADY", None + else: + result, history_result = get_results(history, obj_return, + epochs) + else: + result, history_result = "RUN_EXCEPTION", None + + elif framework == 'pytorch': + val_scores, infer_scores = pkg.run(params) + class history: + def __init__(self, val_scores): + self.history = {'val_loss': [val_scores['val_loss']] } + + history = history(val_scores) + result, history_result = get_results(history, obj_return, epochs) return (result, history_result) @@ -339,8 +351,9 @@ def get_results(history, obj_return, epochs_expected): logger.info("get_results(): " + msg) with open("stop.marker", "w") as fp: fp.write(msg + "\n") + print("VALUES: ", values, values[-1], type(values[-1])) # Default: the last value in the history - result = values[-1] + result = float(values[-1]) else: logger.warning("get_results(): objective function return key " + "not found: " + 'key: "' + obj_return + '" - ' + diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index e6a8240f..440ef2c4 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -24,7 +24,7 @@ usage() echo "If SH_TIMEOUT is set, we run under the shell command timeout" } -if (( ${#} != 4 )) +if (( ${#} != 6 )) then echo "Wrong number of arguments: received ${#} , required: 4" usage @@ -36,6 +36,9 @@ FRAMEWORK=$1 # Usually "keras" or "pytorch" PARAMS="$2" EXPID=$3 RUNID=$4 +# MODEL_NAME=$5 +export MODEL_NAME=$5 +export CANDLE_IMAGE=$6 # Each model run, runs in its own "instance" directory # Set instance_directory to that and cd into it. @@ -44,7 +47,7 @@ RUNID=$4 if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] then # TODO: Rename "instance" to "run" - INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/$MODEL_NAME/Output/$EXPID/$RUNID + INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/$MODEL_NAME/Output/$EXPID/$RUNID # gihan -removing $ in MODEL_NAME INTERNAL_DIRECTORY=$MODEL_NAME/Output/$EXPID/$RUNID else # "BENCHMARKS" INSTANCE_DIRECTORY=$TURBINE_OUTPUT/$RUNID diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index 2adb4788..9a0f10ca 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/obj_app.swift @@ -9,11 +9,13 @@ */ (string obj_result) obj(string params, string expid, - string runid) + string runid, + string model_name, + string candle_image) { string model_sh = getenv("MODEL_SH"); string turbine_output = getenv("TURBINE_OUTPUT"); - string model_name = getenv("MODEL_NAME"); + // string model_name = getenv("MODEL_NAME"); string outdir; @@ -25,7 +27,7 @@ // We do not use a file type here because this file may not be created, // which is handled by get_results() string result_file = outdir/"result.txt"; - wait (run_model(model_sh, params, expid, runid)) + wait (run_model(model_sh, params, expid, runid, model_name, candle_image)) { obj_result = get_results(result_file); } @@ -36,10 +38,10 @@ Swift/T app function that runs the Benchmark */ app (void o) run_model (string model_sh, string params, - string expid, string runid) + string expid, string runid, string model_name, string candle_image) { // 1 2 3 4 - "bash" model_sh FRAMEWORK params expid runid; + "bash" model_sh FRAMEWORK params expid runid model_name candle_image; } /** From 33d0ca22e09eed04a4f61b834e8a1b5453f8cefe Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Wed, 22 Mar 2023 07:31:04 -0700 Subject: [PATCH 479/601] readme updated --- 
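The README changes below document how the comparison workflow is launched; mechanically, the plumbing added in the previous patch means run_model() ends up invoking model.sh with six positional arguments (framework, params, expid, runid, model_name, candle_image). A small Python sketch of that call in the style of the Comparator's make_cmd; every value here is a placeholder rather than output from a real run:

```
# Sketch: assemble the six-argument model.sh command that run_model()
# produces after the obj_app.swift change. All paths and IDs below are
# placeholders.
def make_model_cmd(workflows, expid, runid, model_name, candle_image):
    model_sh = workflows + "/common/sh/model.sh"
    framework = "pytorch"   # arg 1, as set in cmp-cv/swift/workflow.swift
    params = "{}"           # arg 2, JSON fragment of hyperparameters
    # args 3-6: expid, runid, model_name, candle_image
    return ["bash", model_sh, framework, params,
            expid, runid, model_name, candle_image]

if __name__ == "__main__":
    cmd = make_model_cmd("/path/to/Supervisor/workflows", "EXP000", "RUN000",
                         "DrugCell", "/path/to/sif/DrugCell.sif")
    print(" ".join(cmd))    # pass the list to subprocess.run() to launch it
```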
README.adoc | 6 ++++++ workflows/cmp-cv/test/test-small-1.sh | 10 +++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/README.adoc b/README.adoc index 12f1ce98..8cf508ff 100644 --- a/README.adoc +++ b/README.adoc @@ -1 +1,7 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more information. + +# Running the feature domain based comparison + +- Create the CANDLE_DATA_DIR. Place drug_features.csv in the CANDLE_DATA_DIR +- The paths of the model's directories have to be added to the PYTHONPATH in workflow.sh +- Start the run using the command ./test-small-1.sh SITE, where SITE is the name of the computing system. test-small-1.sh is at workflows/cmp-cv/test diff --git a/workflows/cmp-cv/test/test-small-1.sh b/workflows/cmp-cv/test/test-small-1.sh index 26aeb264..0a5f36a2 100755 --- a/workflows/cmp-cv/test/test-small-1.sh +++ b/workflows/cmp-cv/test/test-small-1.sh @@ -3,11 +3,11 @@ set -eu # CMP-CV TEST SMALL 1 -# if (( ${#} != 2 )) -# then -# echo "usage: test BENCHMARK_NAME SITE" -# exit 1 -# fi +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi # export MODEL_NAME=$1 SITE=$1 From cd127f36e5310e40f995237d86924c1187b8eeb0 Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Wed, 22 Mar 2023 07:34:03 -0700 Subject: [PATCH 480/601] readme updated --- README.adoc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.adoc b/README.adoc index 8cf508ff..4e8c5b92 100644 --- a/README.adoc +++ b/README.adoc @@ -5,3 +5,11 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more in - Create the CANDLE_DATA_DIR. Place drug_features.csv in the CANDLE_DATA_DIR - The paths of the model's directories have to be added to the PYTHONPATH in workflow.sh - Start the run using the command ./test-small-1.sh SITE, where SITE is the name of the computing system. test-small-1.sh is at workflows/cmp-cv/test + + +#### Known issues + +- some input files required for analysis have to be manually added to candle data dir +- outputs get written to 'experiments' not CANDLE_DATA_DIR +- python paths have to be explicitly specified in workflow.sh +- singularity container is not being used even though the CANDLE_MODEL_TYPE=SINGULARITY is specified From 458581fedba67f3144a0011bb5203a30ffb72504 Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Wed, 22 Mar 2023 07:38:49 -0700 Subject: [PATCH 481/601] readme updated --- README.adoc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.adoc b/README.adoc index 4e8c5b92..5e56a824 100644 --- a/README.adoc +++ b/README.adoc @@ -5,6 +5,14 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more in - Create the CANDLE_DATA_DIR. Place drug_features.csv in the CANDLE_DATA_DIR - The paths of the model's directories have to be added to the PYTHONPATH in workflow.sh - Start the run using the command ./test-small-1.sh SITE, where SITE is the name of the computing system. test-small-1.sh is at workflows/cmp-cv/test +- upf-1.txt is used as the input file to specify the model hyperparameters as well as the model name and candle_image location. 
+
+```
+{"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"}
+{"id": "RUN001", "epochs": 2, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"}
+{"id": "RUN002", "epochs": 1, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"}
+{"id": "RUN003", "epochs": 2, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"}
+```

From 50f55ef4046422a80c7378e3b77d4c7b98484cce Mon Sep 17 00:00:00 2001
From: Gihan Panapitiya
Date: Wed, 22 Mar 2023 07:40:30 -0700
Subject: [PATCH 482/601] readme updated

---
 README.adoc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.adoc b/README.adoc
index 5e56a824..de916575 100644
--- a/README.adoc
+++ b/README.adoc
@@ -8,10 +8,10 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more in
 ```
-{"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"}
-{"id": "RUN001", "epochs": 2, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"}
-{"id": "RUN002", "epochs": 1, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"}
-{"id": "RUN003", "epochs": 2, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"}
+{"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/path/to/sif/DrugCell.sif"}
+{"id": "RUN001", "epochs": 2, "model_name": "DrugCell", "candle_image": "/path/to/sif/DrugCell.sif"}
+{"id": "RUN002", "epochs": 1, "model_name": "SWnet_CCLE", "candle_image": "/path/to/sif/SWnet.sif"}
+{"id": "RUN003", "epochs": 2, "model_name": "SWnet_CCLE", "candle_image": "/path/to/sif/SWnet.sif"}
 ```

From c048739afff3146647660ab187b23c56af709513 Mon Sep 17 00:00:00 2001
From: Gihan Panapitiya
Date: Wed, 22 Mar 2023 08:00:16 -0700
Subject: [PATCH 483/601] instructions to run the example

---
 README.adoc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/README.adoc b/README.adoc
index de916575..1f726f49 100644
--- a/README.adoc
+++ b/README.adoc
@@ -3,6 +3,7 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more in
 - Create the CANDLE_DATA_DIR. Place drug_features.csv in the CANDLE_DATA_DIR
+ - drug_features.csv should contain the drug features of at least the test set drug molecules
 - The paths of the model's directories have to be added to the PYTHONPATH in workflow.sh
 - Start the run using the command ./test-small-1.sh SITE, where SITE is the name of the computing system. test-small-1.sh is at workflows/cmp-cv/test
 - upf-1.txt is used as the input file to specify the model hyperparameters as well as the model name and candle_image location.
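The upf-1.txt lines shown above and in the next hunk hold one JSON parameter set per line; the cmp-cv workflow reads them with file_lines and pulls out model_name and candle_image with json_get. A Python sketch of the same per-line parsing; the comment-skipping and missing-key check are illustrative additions, not repository code:

```
# Sketch: parse a UPF file such as upf-1.txt, one JSON object per line.
import json

def read_upf(path="upf-1.txt"):
    runs = []
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            params = json.loads(line)
            # The workflow dispatches on these two fields; raising early
            # here is an illustrative check only.
            for key in ("model_name", "candle_image"):
                if key not in params:
                    raise ValueError(f"missing {key!r} in: {line}")
            runs.append(params)
    return runs

if __name__ == "__main__":
    for i, params in enumerate(read_upf()):
        print(i, params["model_name"], params["candle_image"])
```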
@@ -14,6 +15,19 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more in {"id": "RUN003", "epochs": 2, "model_name": "SWnet_CCLE", "candle_image": "/path/to/sif/SWnet.sif"} ``` +### Running the specific example at workflows/cmp-cv/test + +- Clone Supervisor https://github.com/gihanpanapitiya/Supervisor/tree/gihan_cmp +- Clone the DrugCell and SWnet model directories from https://github.com/gihanpanapitiya/DrugCell/tree/to_candle and https://github.com/gihanpanapitiya/SWnet/tree/to_candle + - Checkout to_candle branches and create the Singularity containers using the command, + ``` + singularity build --fakeroot /path/for/sif/DerugCell.sif /path/to/DrugCell.def + singularity build --fakeroot /path/for/sif/SWnet.sif /path/to/SWnet.def + ``` +- Add /path/for/sif/DerugCell.sif and /path/for/sif/SWnet.sif to the PYTHONPATH in workflow.sh +- Create the CANDLE_DATA_DIR. Place drug_features.csv in the CANDLE_DATA_DIR +- Run the command ./test-small-1.sh SITE + #### Known issues From bceb9742d7b9f304817f7d82da2077703bf57800 Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Wed, 22 Mar 2023 12:45:22 -0700 Subject: [PATCH 484/601] Update README.adoc --- README.adoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.adoc b/README.adoc index 1f726f49..793b1653 100644 --- a/README.adoc +++ b/README.adoc @@ -17,9 +17,9 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more in ### Running the specific example at workflows/cmp-cv/test -- Clone Supervisor https://github.com/gihanpanapitiya/Supervisor/tree/gihan_cmp +- Clone Supervisor from https://github.com/ECP-CANDLE/Supervisor - Clone the DrugCell and SWnet model directories from https://github.com/gihanpanapitiya/DrugCell/tree/to_candle and https://github.com/gihanpanapitiya/SWnet/tree/to_candle - - Checkout to_candle branches and create the Singularity containers using the command, + - Checkout to_candle branches and create the Singularity containers (.sif files) using the command, ``` singularity build --fakeroot /path/for/sif/DerugCell.sif /path/to/DrugCell.def singularity build --fakeroot /path/for/sif/SWnet.sif /path/to/SWnet.def From f7d99f0e95cfea073cd7f40e535d9a185be8e922 Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Wed, 22 Mar 2023 12:45:41 -0700 Subject: [PATCH 485/601] Update README.adoc --- README.adoc | 1 + 1 file changed, 1 insertion(+) diff --git a/README.adoc b/README.adoc index 793b1653..256eb8b6 100644 --- a/README.adoc +++ b/README.adoc @@ -20,6 +20,7 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more in - Clone Supervisor from https://github.com/ECP-CANDLE/Supervisor - Clone the DrugCell and SWnet model directories from https://github.com/gihanpanapitiya/DrugCell/tree/to_candle and https://github.com/gihanpanapitiya/SWnet/tree/to_candle - Checkout to_candle branches and create the Singularity containers (.sif files) using the command, + ``` singularity build --fakeroot /path/for/sif/DerugCell.sif /path/to/DrugCell.def singularity build --fakeroot /path/for/sif/SWnet.sif /path/to/SWnet.def From 3d7e50303bd1101567bb18b4e17aedbe8e50d2f2 Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Wed, 22 Mar 2023 12:45:59 -0700 Subject: [PATCH 486/601] Update README.adoc --- README.adoc | 1 + 1 file changed, 1 insertion(+) diff --git a/README.adoc b/README.adoc index 256eb8b6..79697ba8 100644 --- a/README.adoc +++ b/README.adoc @@ -25,6 +25,7 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more 
in singularity build --fakeroot /path/for/sif/DerugCell.sif /path/to/DrugCell.def singularity build --fakeroot /path/for/sif/SWnet.sif /path/to/SWnet.def ``` + - Add /path/for/sif/DerugCell.sif and /path/for/sif/SWnet.sif to the PYTHONPATH in workflow.sh - Create the CANDLE_DATA_DIR. Place drug_features.csv in the CANDLE_DATA_DIR - Run the command ./test-small-1.sh SITE From fad4c980bb25fc827de014e2e2a97a9ae1226053 Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Wed, 22 Mar 2023 13:59:18 -0700 Subject: [PATCH 487/601] Update model.sh --- workflows/common/sh/model.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 440ef2c4..c3bc8987 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -47,7 +47,7 @@ export CANDLE_IMAGE=$6 if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] then # TODO: Rename "instance" to "run" - INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/$MODEL_NAME/Output/$EXPID/$RUNID # gihan -removing $ in MODEL_NAME + INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/$MODEL_NAME/Output/$EXPID/$RUNID INTERNAL_DIRECTORY=$MODEL_NAME/Output/$EXPID/$RUNID else # "BENCHMARKS" INSTANCE_DIRECTORY=$TURBINE_OUTPUT/$RUNID From 72ea2758a7c70b8e62aed76f675e0478b40ce2b6 Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Thu, 23 Mar 2023 09:49:50 -0700 Subject: [PATCH 488/601] model.sh, obj_app.swift --- workflows/common/sh/model.sh | 26 ++++++---- workflows/common/swift/obj_app.swift | 12 ++--- workflows/common/swift/obj_container.swift | 60 ++++++++++++++++++++++ 3 files changed, 81 insertions(+), 17 deletions(-) create mode 100644 workflows/common/swift/obj_container.swift diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index c3bc8987..bfa0ce9e 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -24,21 +24,27 @@ usage() echo "If SH_TIMEOUT is set, we run under the shell command timeout" } -if (( ${#} != 6 )) +if (( ${#} < 4 )) then - echo "Wrong number of arguments: received ${#} , required: 4" + echo "Wrong number of arguments: received ${#} , required: at least 4" usage exit 1 fi -FRAMEWORK=$1 # Usually "keras" or "pytorch" -# JSON string of parameters: -PARAMS="$2" -EXPID=$3 -RUNID=$4 -# MODEL_NAME=$5 -export MODEL_NAME=$5 -export CANDLE_IMAGE=$6 +if (( ${#} >= 4 )) + then + FRAMEWORK=$1 # Usually "keras" or "pytorch" + # JSON string of parameters: + PARAMS="$2" + EXPID=$3 + RUNID=$4 +fi + +if (( ${#} == 6 )) + then + export MODEL_NAME=$5 + export CANDLE_IMAGE=$6 +fi # Each model run, runs in its own "instance" directory # Set instance_directory to that and cd into it. 
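This hunk lets model.sh accept either four or six arguments, exporting MODEL_NAME and CANDLE_IMAGE when they are supplied; further down, the script picks the per-run instance directory according to CANDLE_MODEL_TYPE. A Python restatement of that directory rule, assuming only the environment variable names used in the scripts; the function itself is illustrative and not part of Supervisor:

```
# Sketch: the instance-directory rule applied by model.sh, in Python form.
import os

def instance_directory(expid, runid, candle_model_type="BENCHMARKS"):
    if candle_model_type == "SINGULARITY":
        base = os.environ["CANDLE_DATA_DIR"]
        model = os.environ["MODEL_NAME"]
        return os.path.join(base, model, "Output", expid, runid)
    # "BENCHMARKS": runs live directly under the workflow's TURBINE_OUTPUT
    return os.path.join(os.environ["TURBINE_OUTPUT"], runid)

if __name__ == "__main__":
    os.environ.setdefault("TURBINE_OUTPUT", "/tmp/turbine-output")
    print(instance_directory("EXP000", "RUN000"))
```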
diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift index 9a0f10ca..2adb4788 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/obj_app.swift @@ -9,13 +9,11 @@ */ (string obj_result) obj(string params, string expid, - string runid, - string model_name, - string candle_image) + string runid) { string model_sh = getenv("MODEL_SH"); string turbine_output = getenv("TURBINE_OUTPUT"); - // string model_name = getenv("MODEL_NAME"); + string model_name = getenv("MODEL_NAME"); string outdir; @@ -27,7 +25,7 @@ // We do not use a file type here because this file may not be created, // which is handled by get_results() string result_file = outdir/"result.txt"; - wait (run_model(model_sh, params, expid, runid, model_name, candle_image)) + wait (run_model(model_sh, params, expid, runid)) { obj_result = get_results(result_file); } @@ -38,10 +36,10 @@ Swift/T app function that runs the Benchmark */ app (void o) run_model (string model_sh, string params, - string expid, string runid, string model_name, string candle_image) + string expid, string runid) { // 1 2 3 4 - "bash" model_sh FRAMEWORK params expid runid model_name candle_image; + "bash" model_sh FRAMEWORK params expid runid; } /** diff --git a/workflows/common/swift/obj_container.swift b/workflows/common/swift/obj_container.swift new file mode 100644 index 00000000..9a0f10ca --- /dev/null +++ b/workflows/common/swift/obj_container.swift @@ -0,0 +1,60 @@ + +// OBJ APP + +/** + The main objective function used by the CANDLE/Supervisor + model exploration (optimization) loop. + params : The JSON string of params to be passed to the Benchmark + run_id : A string run ID that will be the output directory name +*/ +(string obj_result) obj(string params, + string expid, + string runid, + string model_name, + string candle_image) +{ + string model_sh = getenv("MODEL_SH"); + string turbine_output = getenv("TURBINE_OUTPUT"); + // string model_name = getenv("MODEL_NAME"); + + string outdir; + + outdir = "%s/%s" % (turbine_output, runid); + // outdir = "%s/%s/Output/%s/%s" % (turbine_output, model_name, expid, runid); + + printf("obj_app: running model shell script in: %s", outdir); + + // We do not use a file type here because this file may not be created, + // which is handled by get_results() + string result_file = outdir/"result.txt"; + wait (run_model(model_sh, params, expid, runid, model_name, candle_image)) + { + obj_result = get_results(result_file); + } + printf("obj_app: result(%s): '%s'", runid, obj_result); +} + +/** + Swift/T app function that runs the Benchmark +*/ +app (void o) run_model (string model_sh, string params, + string expid, string runid, string model_name, string candle_image) +{ + // 1 2 3 4 + "bash" model_sh FRAMEWORK params expid runid model_name candle_image; +} + +/** + Extracts the Benchmark output if it exists, + else, provides a NaN so the workflow can keep running +*/ +(string obj_result) get_results(string result_file) { + if (file_exists(result_file)) { + file line = input(result_file); + obj_result = trim(read(line)); + } else { + printf("File not found: %s", result_file); + // return with a large value + obj_result = "1e7"; + } +} From dfb18a951d1da77d5f70a4cde71a5a42f2f32ae6 Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Thu, 23 Mar 2023 11:17:33 -0700 Subject: [PATCH 489/601] obj_container --- workflows/cmp-cv/swift/workflow.sh | 1 + workflows/cmp-cv/swift/workflow.swift | 4 +++- workflows/common/swift/obj_container.swift | 2 +- 3 files changed, 5 
insertions(+), 2 deletions(-) diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh index 71edf3ae..34769e37 100755 --- a/workflows/cmp-cv/swift/workflow.sh +++ b/workflows/cmp-cv/swift/workflow.sh @@ -82,6 +82,7 @@ then fi export CANDLE_IMAGE=${CANDLE_IMAGE:-} +# export $SWIFT_IMPL=container which swift-t diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift index 9e69f7f6..05dd4be0 100644 --- a/workflows/cmp-cv/swift/workflow.swift +++ b/workflows/cmp-cv/swift/workflow.swift @@ -72,7 +72,9 @@ foreach gparam, j in gparams_lines printf("MODEL: %s", model_name); // printf(gparams); // results[runid] = obj(gparam, expid, repr(runid) ); - results[runid] = obj(gparam, expid, repr(runid), model_name, candle_image) => compare(expid, repr(runid) ); + results[runid] = obj_container(gparam, expid, repr(runid), model_name, candle_image) => compare(expid, repr(runid) ); + // results[runid] = obj(gparam, expid, repr(runid)); + // => compare(expid, repr(runid) ); // assert(results[i] != "EXCEPTION", "exception in obj()!"); } diff --git a/workflows/common/swift/obj_container.swift b/workflows/common/swift/obj_container.swift index 9a0f10ca..bf30317a 100644 --- a/workflows/common/swift/obj_container.swift +++ b/workflows/common/swift/obj_container.swift @@ -7,7 +7,7 @@ params : The JSON string of params to be passed to the Benchmark run_id : A string run ID that will be the output directory name */ -(string obj_result) obj(string params, +(string obj_result) obj_container(string params, string expid, string runid, string model_name, From 8e89a8ccc0a3e62262469251d9ff0be450088594 Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Thu, 23 Mar 2023 17:09:28 -0700 Subject: [PATCH 490/601] about compare.py --- workflows/common/python/compare.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/workflows/common/python/compare.py b/workflows/common/python/compare.py index ce84b41a..a6566d10 100644 --- a/workflows/common/python/compare.py +++ b/workflows/common/python/compare.py @@ -4,6 +4,18 @@ import numpy as np from sklearn.metrics import mean_squared_error +""" +This script can be used to filter a subset of the test set based on +the properties of the drug molecules. For example, here we can select the +molecules of which the 'prop' is between two values (provided in the +2nd and 3rd elements of each list in the conditions list. We can then +find the prediction errors for this domain. Knowledge of the errors of differnt +molecular groups is helpful to understand the currrent deficiencies of the drug +response models (or any molecular property prediction model in general). This knowledge +is then allow us to improve the models as well as use predictions from the models which +produce highly accurate preidictions for certain domains. +""" + conditions = pd.DataFrame([[ 'nAromAtom' , 5, 10 ], ['nAtom', 20, 50], ['BertzCT', 800, 1000]], From 3a8779e2a084b4f83b520c960b60b9c1fd2d07a9 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 24 Mar 2023 18:27:53 -0500 Subject: [PATCH 491/601] o Fix comments formatting etc. 
--- README.adoc | 2 +- models/Comparator/cmp_baseline_keras2.py | 49 +++++---- workflows/cmp-cv/py/compare.py | 13 ++- workflows/cmp-cv/test/upf-1.txt | 2 +- workflows/common/python/compare.py | 104 +++++++++--------- workflows/common/python/model_runner.py | 15 +-- workflows/cp-leaveout/scripts/Node.py | 6 +- workflows/cp-leaveout/scripts/clean-top21.py | 17 ++- .../cp-leaveout/scripts/print-node-info.py | 8 +- 9 files changed, 111 insertions(+), 105 deletions(-) diff --git a/README.adoc b/README.adoc index 79697ba8..d7283c10 100644 --- a/README.adoc +++ b/README.adoc @@ -25,7 +25,7 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more in singularity build --fakeroot /path/for/sif/DerugCell.sif /path/to/DrugCell.def singularity build --fakeroot /path/for/sif/SWnet.sif /path/to/SWnet.def ``` - + - Add /path/for/sif/DerugCell.sif and /path/for/sif/SWnet.sif to the PYTHONPATH in workflow.sh - Create the CANDLE_DATA_DIR. Place drug_features.csv in the CANDLE_DATA_DIR - Run the command ./test-small-1.sh SITE diff --git a/models/Comparator/cmp_baseline_keras2.py b/models/Comparator/cmp_baseline_keras2.py index 528d0030..8fc4fead 100644 --- a/models/Comparator/cmp_baseline_keras2.py +++ b/models/Comparator/cmp_baseline_keras2.py @@ -32,23 +32,24 @@ def run(gParameters): expid = gParameters["experiment_id"] runid = gParameters["run_id"] supervisor = Path(file_path).absolute().parent.parent - workflows = supervisor / "workflows" + workflows = supervisor / "workflows" #print(model_sh) model1 = gParameters["model1"] model2 = gParameters["model2"] os.chdir(output_dir) - cmd = make_cmd(str(workflows), expid, runid) + cmd = make_cmd(str(workflows), expid, runid) run_dir = Path(os.getenv("CANDLE_DATA_DIR")) \ / model1 / "Output" / expid / runid #print("env: " + str(env)) print("cmd: " + str(cmd)) results = [] - for i in [ 1, 2 ]: + for i in [1, 2]: model_name = gParameters["model%i" % i] env = make_env(str(workflows), model_name) print("command is ", cmd, "/nenv is:", env) with open(str(run_dir) + "/start-%i.log" % i, "w") as start_log: - subprocess.run(cmd, env=env, + subprocess.run(cmd, + env=env, stdout=start_log, stderr=subprocess.STDOUT) run_dir = Path(os.getenv("CANDLE_DATA_DIR")) \ @@ -63,30 +64,36 @@ def run(gParameters): def make_env(workflows, model_name): output_dir = "./tmp" expid = 'one_exp' - env = { "WORKFLOWS_ROOT": workflows, - "TURBINE_OUTPUT": output_dir, - "EXPID": expid, - "SITE": "lambda", - "OBJ_RETURN": "loss", - "BENCHMARK_TIMEOUT": "120", - "MODEL_NAME": model_name, - "CANDLE_MODEL_TYPE": "SINGULARITY", - "CANDLE_DATA_DIR": os.getenv("CANDLE_DATA_DIR"), - "ADLB_RANK_OFFSET": "0", - "CANDLE_IMAGE": "/software/improve/images/GraphDRP.sif" - } + env = { + "WORKFLOWS_ROOT": workflows, + "TURBINE_OUTPUT": output_dir, + "EXPID": expid, + "SITE": "lambda", + "OBJ_RETURN": "loss", + "BENCHMARK_TIMEOUT": "120", + "MODEL_NAME": model_name, + "CANDLE_MODEL_TYPE": "SINGULARITY", + "CANDLE_DATA_DIR": os.getenv("CANDLE_DATA_DIR"), + "ADLB_RANK_OFFSET": "0", + "CANDLE_IMAGE": "/software/improve/images/GraphDRP.sif" + } return env def make_cmd(workflows, expid, runid): - model_sh = workflows + "/common" +"/sh" + "/model.sh" - cmd = [ "bash", model_sh, - "keras2", "{}", # empty JSON fragment - expid, - runid ] + model_sh = workflows + "/common" + "/sh" + "/model.sh" + cmd = [ + "bash", + model_sh, + "keras2", + "{}", # empty JSON fragment + expid, + runid + ] return cmd + def main(): gParameters = initialize_parameters() run(gParameters) diff --git 
a/workflows/cmp-cv/py/compare.py b/workflows/cmp-cv/py/compare.py index 0ce13b55..165182dc 100644 --- a/workflows/cmp-cv/py/compare.py +++ b/workflows/cmp-cv/py/compare.py @@ -1,6 +1,6 @@ - CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") + def compare(exp_id, run_id): print(f"compare: run_id={run_id}") @@ -11,11 +11,16 @@ def compare(exp_id, run_id): df_res = pd.read_csv(f"{directory}/test_predictions.csv") # a class to calculate errors for subsets of the validation/test set - bmk = Benchmark(fp_path='drug_features.csv') # TODO: have to have a drug features for a common test set + bmk = Benchmark(fp_path='drug_features.csv' + ) # TODO: have to have a drug features for a common test set subset_err = bmk.error_by_feature_domains_model(df_res, conditions) # collect results for comparison - subset_err.set_index('prop', inplace=True) # TODO: use 'prop' as a parameter and move it to cmp_models.txt - cmp_results[run_id] = subset_err.loc[cmp_prop, 'error'] # this is the property based on which we want to do the comparison + subset_err.set_index( + 'prop', inplace=True + ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt + cmp_results[run_id] = subset_err.loc[ + cmp_prop, + 'error'] # this is the property based on which we want to do the comparison with open(f"{directory}/subset_err.txt", "w") as fp: fp.write(str(cmp_results[run_id])) diff --git a/workflows/cmp-cv/test/upf-1.txt b/workflows/cmp-cv/test/upf-1.txt index 9c1a0121..bc465052 100644 --- a/workflows/cmp-cv/test/upf-1.txt +++ b/workflows/cmp-cv/test/upf-1.txt @@ -1,4 +1,4 @@ {"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} {"id": "RUN001", "epochs": 2, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} {"id": "RUN002", "epochs": 1, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} -{"id": "RUN003", "epochs": 2, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} \ No newline at end of file +{"id": "RUN003", "epochs": 2, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} diff --git a/workflows/common/python/compare.py b/workflows/common/python/compare.py index a6566d10..5b66d6c8 100644 --- a/workflows/common/python/compare.py +++ b/workflows/common/python/compare.py @@ -1,39 +1,41 @@ +"""This script can be used to filter a subset of the test set based on the +properties of the drug molecules. + +For example, here we can select the molecules of which the 'prop' is +between two values (provided in the 2nd and 3rd elements of each list in +the conditions list. We can then find the prediction errors for this +domain. Knowledge of the errors of differnt molecular groups is helpful +to understand the currrent deficiencies of the drug response models (or +any molecular property prediction model in general). This knowledge is +then allow us to improve the models as well as use predictions from the +models which produce highly accurate preidictions for certain domains. +""" + import os import pandas as pd import pandas as pd import numpy as np from sklearn.metrics import mean_squared_error -""" -This script can be used to filter a subset of the test set based on -the properties of the drug molecules. For example, here we can select the -molecules of which the 'prop' is between two values (provided in the -2nd and 3rd elements of each list in the conditions list. 
We can then -find the prediction errors for this domain. Knowledge of the errors of differnt -molecular groups is helpful to understand the currrent deficiencies of the drug -response models (or any molecular property prediction model in general). This knowledge -is then allow us to improve the models as well as use predictions from the models which -produce highly accurate preidictions for certain domains. -""" - -conditions = pd.DataFrame([[ 'nAromAtom' , 5, 10 ], - ['nAtom', 20, 50], - ['BertzCT', 800, 1000]], - columns=['prop', 'low', 'high']) +conditions = pd.DataFrame( + [['nAromAtom', 5, 10], ['nAtom', 20, 50], ['BertzCT', 800, 1000]], + columns=['prop', 'low', 'high']) # from cmp_utils import conditions, Benchmark CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") + def compare(exp_id, run_id): - cmp_results={} + cmp_results = {} print(f"compare: run_id={run_id}") # gParams = read_params(exp_id, run_id) # model = gParams("model_name") - model = "DrugCell" # TODO: Hardcoded. have to get this from output dir? + model = "DrugCell" # TODO: Hardcoded. have to get this from output dir? turbine_output = os.getenv("TURBINE_OUTPUT") - outdir = os.path.join(turbine_output, run_id, 'Output', 'EXP000', 'RUN000') # TODO: Have to fix this + outdir = os.path.join(turbine_output, run_id, 'Output', 'EXP000', + 'RUN000') # TODO: Have to fix this directory = outdir # directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" print("reading the predictions....") @@ -42,28 +44,31 @@ def compare(exp_id, run_id): # a class to calculate errors for subsets of the validation/test set print("reading the drug feature file....") # TODO: Should have to save the above file in this file - bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv') # TODO: have to have a drug features for a common test set - subset_err, final_domain_err = bmk.error_by_feature_domains_model(df_res, conditions) + bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + ) # TODO: have to have a drug features for a common test set + subset_err, final_domain_err = bmk.error_by_feature_domains_model( + df_res, conditions) # or this # fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' # subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) # collect results for comparison - cmp_prop = 'nAtom' # TODO: Get this from gParameters - subset_err.set_index('prop', inplace=True) # TODO: use 'prop' as a parameter and move it to cmp_models.txt - cmp_results[run_id] = subset_err.loc[cmp_prop, 'error'] # this is the property based on which we want to do the comparison + cmp_prop = 'nAtom' # TODO: Get this from gParameters + subset_err.set_index( + 'prop', inplace=True + ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt + cmp_results[run_id] = subset_err.loc[ + cmp_prop, + 'error'] # this is the property based on which we want to do the comparison with open(f"{directory}/subset_err.txt", "w") as fp: fp.write(str(cmp_results[run_id])) return str(cmp_results[run_id]) - - def error_by_feature_domains_model(fp_path, preds, conditions): - - + fps = pd.read_csv(fp_path) report = [] preds['err'] = abs(preds['true'] - preds['pred']) @@ -74,7 +79,7 @@ def error_by_feature_domains_model(fp_path, preds, conditions): low = conditions.loc[i, 'low'] high = conditions.loc[i, 'high'] - locs = np.logical_and(fps[prop] <= high , fps[prop] > low) + locs = np.logical_and(fps[prop] <= high, fps[prop] > low) smiles = fps.loc[locs, 'smiles'].values tmp = preds[preds.smiles.isin(smiles)] mean_err = tmp.err.mean() @@ 
-83,26 +88,22 @@ def error_by_feature_domains_model(fp_path, preds, conditions): keep = keep[keep.smiles.isin(smiles)] - final_domain_err = keep.err.mean() # return this + final_domain_err = keep.err.mean() # return this report = pd.DataFrame(report, columns=['prop', 'low', 'high', 'error']) return report, final_domain_err - - - class Benchmark: - + def __init__(self, fp_path): - + self.fps = pd.read_csv(fp_path) # self.model_preds = model_preds # self.feature_conditions = feature_conditions self.reports = {} - - + def error_by_feature_domains_model(self, preds, conditions): - + fps = self.fps report = [] preds['err'] = abs(preds['true'] - preds['pred']) @@ -113,7 +114,7 @@ def error_by_feature_domains_model(self, preds, conditions): low = conditions.loc[i, 'low'] high = conditions.loc[i, 'high'] - locs = np.logical_and(fps[prop] <= high , fps[prop] > low) + locs = np.logical_and(fps[prop] <= high, fps[prop] > low) smiles = fps.loc[locs, 'smiles'].values tmp = preds[preds.smiles.isin(smiles)] mean_err = tmp.err.mean() @@ -122,37 +123,33 @@ def error_by_feature_domains_model(self, preds, conditions): keep = keep[keep.smiles.isin(smiles)] - final_domain_err = keep.err.mean() # return this + final_domain_err = keep.err.mean() # return this report = pd.DataFrame(report, columns=['prop', 'low', 'high', 'error']) return report, final_domain_err - - - - - def error_by_feature_domains(self, feature_conditions): - results=[] + results = [] for model_name, pred in self.model_preds.items(): - - report = self.error_by_feature_domains_model(pred, feature_conditions) + + report = self.error_by_feature_domains_model( + pred, feature_conditions) report.loc[:, 'model'] = model_name results.append(report) - + results = pd.concat(results, axis=0) results = results.loc[:, ['model', 'prop', 'low', 'high', 'error']] results.reset_index(drop=True, inplace=True) return results - def rank_by_acc(self, metric='rmse', th=3): - - results={} + + results = {} for model_name, pred in self.model_preds.items(): sub = pred[pred.labels > th] - rmse = mean_squared_error(y_true=sub['labels'], y_pred=sub['preds'])**.5 + rmse = mean_squared_error(y_true=sub['labels'], + y_pred=sub['preds'])**.5 results[model_name] = {'rmse': rmse} @@ -161,7 +158,6 @@ def rank_by_acc(self, metric='rmse', th=3): return results - def create_grid_files(): dc_grid = {'epochs': [1, 2], 'lr': [1e-2, 1e-3]} diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 0309171a..13c01566 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -171,13 +171,12 @@ def run(hyper_parameter_map, obj_return): # Run the model! log("PKG RUN START") - # check for epochs if not present set to 1, used for checking early stopping in function get_results if "epochs" in hyper_parameter_map: epochs = hyper_parameter_map["epochs"] else: epochs = 1 - + if framework == 'keras': try: @@ -187,8 +186,8 @@ def run(hyper_parameter_map, obj_return): print("RUN EXCEPTION: " + str(e)) info = sys.exc_info() s = traceback.format_tb(info[2]) - sys.stdout.write('\\n\\nEXCEPTION in model run(): \\n' + - repr(e) + ' ... \\n' + ''.join(s)) + sys.stdout.write('\\n\\nEXCEPTION in model run(): \\n' + repr(e) + + ' ... 
\\n' + ''.join(s)) sys.stdout.write('\\n') sys.stdout.flush() @@ -214,16 +213,18 @@ def run(hyper_parameter_map, obj_return): if history == "EPOCHS_COMPLETED_ALREADY": result, history_result = "EPOCHS_COMPLETED_ALREADY", None else: - result, history_result = get_results(history, obj_return, - epochs) + result, history_result = get_results( + history, obj_return, epochs) else: result, history_result = "RUN_EXCEPTION", None elif framework == 'pytorch': val_scores, infer_scores = pkg.run(params) + class history: + def __init__(self, val_scores): - self.history = {'val_loss': [val_scores['val_loss']] } + self.history = {'val_loss': [val_scores['val_loss']]} history = history(val_scores) result, history_result = get_results(history, obj_return, epochs) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index 0769c833..ce8fb70c 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -231,10 +231,8 @@ def parse_val_data(self, fp): self.val_data = int(value_string) def parse_python_log(self, fp): - """ - fp is the file pointer to save/python.log - If lines are not found, node.mse, etc., will remain None - """ + """fp is the file pointer to save/python.log If lines are not found, + node.mse, etc., will remain None.""" marker = "Comparing y_true " # The marker is just after the date: # We search this way for speed. diff --git a/workflows/cp-leaveout/scripts/clean-top21.py b/workflows/cp-leaveout/scripts/clean-top21.py index c0f41b59..d16126bd 100644 --- a/workflows/cp-leaveout/scripts/clean-top21.py +++ b/workflows/cp-leaveout/scripts/clean-top21.py @@ -1,4 +1,3 @@ - # CLEAN TOP21 # Cleans the top21 file so only LINCS records are present # File names are hard-coded but easy to change @@ -9,8 +8,7 @@ logger.setLevel(logging.DEBUG) ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) -formatter = logging.Formatter("%(asctime)s %(message)s", - datefmt="%H:%M:%S") +formatter = logging.Formatter("%(asctime)s %(message)s", datefmt="%H:%M:%S") ch.setFormatter(formatter) logger.addHandler(ch) logger.info("Start") @@ -23,7 +21,7 @@ CANDLE_DATA = SCRATCH + "/CANDLE-Data/ChallengeProblem" # The original data from Yoo: -original = CANDLE_DATA + "/top21_2020Jul/top21.h5" +original = CANDLE_DATA + "/top21_2020Jul/top21.h5" lincs1000 = CANDLE_DATA + "/top21_2020Jul/lincs1000" # The file we are creating here: @@ -34,7 +32,8 @@ with open(lincs1000, "r") as fp: while True: line = fp.readline() - if len(line) == 0: break + if len(line) == 0: + break lincs.append(line.strip()) logger.info("lincs length: %i" % len(lincs)) @@ -50,10 +49,10 @@ # List of dataframe column names to delete: delete_these = [] -count_key = 0 -count_GE_N = 0 -count_GE_Y = 0 -count_DD = 0 +count_key = 0 +count_GE_N = 0 +count_GE_Y = 0 +count_DD = 0 count_other = 0 for column in columns: if column.startswith("GE_"): diff --git a/workflows/cp-leaveout/scripts/print-node-info.py b/workflows/cp-leaveout/scripts/print-node-info.py index c2845e7b..95c4cb4f 100644 --- a/workflows/cp-leaveout/scripts/print-node-info.py +++ b/workflows/cp-leaveout/scripts/print-node-info.py @@ -10,7 +10,9 @@ parser = argparse.ArgumentParser(description="Print Node info stats") parser.add_argument("directory", help="The experiment directory (EXPID)") -parser.add_argument("nodes", default="", nargs="*", +parser.add_argument("nodes", + default="", + nargs="*", help="Nodes to print (optional, defaults to all)") args = parser.parse_args() @@ -23,7 +25,6 @@ except IOError as e: fail(e, 
os.EX_IOERR, "Could not read: " + node_pkl) - # Raw data printing: # print(str(args)) # print(len(data)) @@ -40,8 +41,7 @@ def print_all(data): count += 1 if node.stopped_early: earlies += 1 - print("print-node-info: %i/%i runs stopped early." % - (earlies, count)) + print("print-node-info: %i/%i runs stopped early." % (earlies, count)) def print_selected(data, nodes): From c7786d867c52c2f6b9c09811bc8a8d3b32718441 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 24 Mar 2023 18:48:29 -0500 Subject: [PATCH 492/601] o set SWIFT_IMPL, more fixes needed for CDD and singularity runs --- workflows/cmp-cv/swift/workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh index 34769e37..93c0e15e 100755 --- a/workflows/cmp-cv/swift/workflow.sh +++ b/workflows/cmp-cv/swift/workflow.sh @@ -82,7 +82,8 @@ then fi export CANDLE_IMAGE=${CANDLE_IMAGE:-} -# export $SWIFT_IMPL=container + +export SWIFT_IMPL=container which swift-t From d6005e9ec32d0ff0f36ac5c7aa997c5255cc8f6b Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 24 Mar 2023 18:57:54 -0500 Subject: [PATCH 493/601] o Try to run singularity workflows --- workflows/cmp-cv/test/test-small-1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cmp-cv/test/test-small-1.sh b/workflows/cmp-cv/test/test-small-1.sh index 0a5f36a2..0cd88fb5 100755 --- a/workflows/cmp-cv/test/test-small-1.sh +++ b/workflows/cmp-cv/test/test-small-1.sh @@ -25,5 +25,5 @@ CFG_SYS=$THIS/cfg-sys-1.sh # export CANDLE_IMAGE=/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif export CANDLE_MODEL_TYPE="SINGULARITY" -export CANDLE_MODEL_TYPE="BENCHMARKS" +# export CANDLE_MODEL_TYPE="BENCHMARKS" $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt $THIS/models-1.txt From 987ceced9d85f97ab47a8985d623261afd2a04f9 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 24 Mar 2023 19:07:37 -0500 Subject: [PATCH 494/601] o get candle model type from swift file --- workflows/cmp-cv/swift/workflow.swift | 1 + workflows/cmp-cv/test/test-small-1.sh | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift index 05dd4be0..5a2e37ec 100644 --- a/workflows/cmp-cv/swift/workflow.swift +++ b/workflows/cmp-cv/swift/workflow.swift @@ -23,6 +23,7 @@ int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); string expid = getenv("EXPID"); string turbine_output = getenv("TURBINE_OUTPUT"); +string candle_model_type = getenv("CANDLE_MODEL_TYPE"); // Report some key facts: printf("CMP-CV: %s", filename(model_file)); diff --git a/workflows/cmp-cv/test/test-small-1.sh b/workflows/cmp-cv/test/test-small-1.sh index 0cd88fb5..22fc8222 100755 --- a/workflows/cmp-cv/test/test-small-1.sh +++ b/workflows/cmp-cv/test/test-small-1.sh @@ -25,5 +25,4 @@ CFG_SYS=$THIS/cfg-sys-1.sh # export CANDLE_IMAGE=/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif export CANDLE_MODEL_TYPE="SINGULARITY" -# export CANDLE_MODEL_TYPE="BENCHMARKS" $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt $THIS/models-1.txt From 2fded36db732ddc6b282d5b1e1178571bb80f29e Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 24 Mar 2023 19:41:37 -0500 Subject: [PATCH 495/601] o change the default setting logic for CMT --- workflows/common/sh/utils.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh 
index e9160a4b..adf28daa 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -173,7 +173,12 @@ get_expid() fi export EXPID=$1 - export CANDLE_MODEL_TYPE=${2:-BENCHMARKS} + + if [[ -z "${CANDLE_MODEL_TYPE}" ]]; then + CANDLE_MODEL_TYPE="BENCHMARKS" + fi + +echo "CANDLE_MODEL_TYPE is set to: ${CANDLE_MODEL_TYPE}" export EXPERIMENTS="" From 5205a2d416005932bc0f9e9145c99b6f75a1126b Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 24 Mar 2023 19:52:04 -0500 Subject: [PATCH 496/601] o set model name to cmp if none specified for comparison workflows --- workflows/common/sh/utils.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index adf28daa..75dc47ee 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -178,7 +178,13 @@ get_expid() CANDLE_MODEL_TYPE="BENCHMARKS" fi -echo "CANDLE_MODEL_TYPE is set to: ${CANDLE_MODEL_TYPE}" + echo "CANDLE_MODEL_TYPE is set to: ${CANDLE_MODEL_TYPE}" + + if [[ -z "${MODEL_NAME}" ]]; then + MODEL_NAME="cmp" + fi + + echo "MODEL_NAME is set to: ${MODEL_NAME}" export EXPERIMENTS="" From 341367ea64e76aaed8965145e3e7e1f5bd3e0d12 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 24 Mar 2023 20:34:54 -0500 Subject: [PATCH 497/601] o Fix flags for cmp workflow and singularity --- workflows/common/sh/model.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index bfa0ce9e..2796842b 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -121,6 +121,11 @@ then FLAGS=$( python3 $WORKFLOWS_ROOT/common/python/runner_utils.py expand_params \ "$PARAMS" ) + + # Remove --candle image flag and the second argument, assume it is the last argument + export FLAGS="${FLAGS/ --candle_image*/}" + + # The Singularity command line arguments: MODEL_CMD=( singularity exec --nv --bind $CANDLE_DATA_DIR:/candle_data_dir $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET From e8a4e6d971b0a50523b289ce8213dddee88ceba7 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Mon, 3 Apr 2023 16:58:29 -0500 Subject: [PATCH 498/601] o Changes for Mac OSX M1 and also fixes for GA test-1 that would fix Jenkins failure for OneD test problem --- workflows/GA/swift/workflow.sh | 2 +- workflows/GA/test/test-1.sh | 10 ++++-- workflows/common/sh/env-mbook.sh | 44 ++++++++++++++++++++++++++ workflows/common/sh/langs-app-mbook.sh | 25 +++++++++++++++ workflows/common/sh/utils.sh | 2 +- workflows/mlrMBO/data/nt3_nightly.R | 2 +- workflows/mlrMBO/swift/workflow.sh | 2 +- 7 files changed, 81 insertions(+), 6 deletions(-) create mode 100644 workflows/common/sh/env-mbook.sh create mode 100644 workflows/common/sh/langs-app-mbook.sh diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 1729b3f9..9bf7adeb 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -200,7 +200,7 @@ swift-t -O 0 -n $PROCS \ -e CANDLE_MODEL_TYPE \ -e CANDLE_IMAGE \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} | 2>&1 \ tee $STDOUT ) diff --git a/workflows/GA/test/test-1.sh b/workflows/GA/test/test-1.sh index fe71cc6a..3b4cac30 100755 --- a/workflows/GA/test/test-1.sh +++ b/workflows/GA/test/test-1.sh @@ -38,6 +38,12 @@ export CFG_PRM=$THIS/cfg-prm-1.sh # The python GA model exploration algorithm export GA_FILE=deap_ga.py +CANDLE_MODEL_TYPE="BENCHMARKS" +# 
CANDLE_IMAGE=/software/improve/images/GraphDRP.sif # lambda +CANDLE_IMAGE=None # Polaris + +export MODEL_NAME="nt3" + # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported export OBJ_RETURN="val_loss" @@ -48,8 +54,8 @@ then fi # Submit job -$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME - +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $CFG_PRM $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE # Check job output TURBINE_OUTPUT=$( readlink turbine-output ) echo $TURBINE_OUTPUT diff --git a/workflows/common/sh/env-mbook.sh b/workflows/common/sh/env-mbook.sh new file mode 100644 index 00000000..a69a0e06 --- /dev/null +++ b/workflows/common/sh/env-mbook.sh @@ -0,0 +1,44 @@ + +# ENV mbook +# Environment settings for mbook (Swift, Python, R, Tcl, etc.) + +# Everything is installed in here: +SFW=/Users/mbook/install/ + +SWIFT=$SFW/swift-t/ +PY=/opt/homebrew/anaconda3/envs/tensorflow/ +# EQPY=$SFW/EQ-Py +EQR=/Users/mbook/Supervisor/workflows/common/ext/EQ-R/ + +PATH=$SWIFT/stc/bin:$PATH +PATH=$PY/bin:$PATH + +export LD_LIBRARY_PATH=/Library/Frameworks/R.framework/Resources/lib/:${LD_LIBRARY_PATH:-} + +# How to run CANDLE models: +SWIFT_IMPL="app" + +# PYTHONPATH=$EQPY/src:${PYTHONPATH:-} + +# Log settings to output +echo "Programs:" +which python swift-t | nl +# Cf. utils.sh +show PYTHONHOME + +### +export PYTHONHOME=$PY + +PYTHON="$PYTHONHOME/bin/" +export LD_LIBRARY_PATH="$PYTHONHOME/lib" +export PATH="$PYTHONHOME/bin:$PATH" + +COMMON_DIR=$EMEWS_PROJECT_ROOT/../common/python +PYTHONPATH+=":$PYTHONHOME/lib/:" +PYTHONPATH+=":$COMMON_DIR:" + +APP_PYTHONPATH=${APP_PYTHONPATH:-} +PYTHONPATH+=":$APP_PYTHONPATH" +### + +log_path LD_LIBRARY_PATH diff --git a/workflows/common/sh/langs-app-mbook.sh b/workflows/common/sh/langs-app-mbook.sh new file mode 100644 index 00000000..98bbae08 --- /dev/null +++ b/workflows/common/sh/langs-app-mbook.sh @@ -0,0 +1,25 @@ + +# LANGS APP mbook + +echo "langs-app-mbook ..." + +PY=/opt/homebrew/anaconda3/envs/tensorflow/ + +PATH=$PY/bin:$PATH + + +export PYTHONHOME=$PY +PYTHON="$PYTHONHOME/bin/" +export LD_LIBRARY_PATH="$PYTHONHOME/lib" +# export PATH="$PYTHONHOME/bin:$PATH" + +COMMON_DIR=$EMEWS_PROJECT_ROOT/../common/python +PYTHONPATH+=":$PYTHONHOME/lib/:" +PYTHONPATH+=":$COMMON_DIR:" + +APP_PYTHONPATH=${APP_PYTHONPATH:-} +PYTHONPATH+=":$APP_PYTHONPATH" + +export PYTHONPATH + +echo "langs-app-mbook done." 
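# A note on the two mbook files added above (a sketch, not part of the patch):
# assuming the env-$SITE.sh / langs-app-$SITE.sh naming convention used
# elsewhere in workflows/common/sh, these are the files a run started with
# SITE=mbook would pick up, e.g.:
#
#   SITE=mbook
#   source_site env $SITE      # sources workflows/common/sh/env-mbook.sh
#
# The Python/R install paths inside them are specific to that one machine
# and would need editing for any other laptop.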
diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 75dc47ee..e6b757ff 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -623,7 +623,7 @@ pad_keys() { # Pad 1st tokens printf "%-15s " $1 shift - echo ${*} + echo $* } print_json() { diff --git a/workflows/mlrMBO/data/nt3_nightly.R b/workflows/mlrMBO/data/nt3_nightly.R index 7fc9fa31..8124fc2d 100644 --- a/workflows/mlrMBO/data/nt3_nightly.R +++ b/workflows/mlrMBO/data/nt3_nightly.R @@ -11,7 +11,7 @@ param.set <- makeParamSet( makeIntegerParam("epochs", lower = 2, upper = 5), # makeDiscreteParam("activation", values = c("softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear")), # makeDiscreteParam("dense", values = c("500 100 50", "1000 500 100 50", "2000 1000 500 100 50", "2000 1000 1000 500 100 50", "2000 1000 1000 1000 500 100 50")), - makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), + makeDiscreteParam("optimizer", values = c("adam", "sgd")), makeNumericParam("dropout", lower = 0, upper = 0.9), makeNumericParam("learning_rate", lower = 0.00001, upper = 0.1) # makeDiscreteParam("conv", values = c("50 50 50 50 50 1", "25 25 25 25 25 1", "64 32 16 32 64 1", "100 100 100 100 100 1", "32 20 16 32 10 1")) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index f125ac38..28fdeeaf 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -192,7 +192,7 @@ swift-t -O 0 -n $PROCS \ -e CANDLE_MODEL_TYPE \ -e CANDLE_IMAGE \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} ) |& \ + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} ) 2>&1 | \ tee $STDOUT if (( ${PIPESTATUS[0]} )) From 7e1f810743ade93bb4376c82dd41ed4e17e11851 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Mon, 3 Apr 2023 17:05:39 -0500 Subject: [PATCH 499/601] o Remove hard coded name --- workflows/GA/test/test-1.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/GA/test/test-1.sh b/workflows/GA/test/test-1.sh index 3b4cac30..bc36cec4 100755 --- a/workflows/GA/test/test-1.sh +++ b/workflows/GA/test/test-1.sh @@ -42,8 +42,6 @@ CANDLE_MODEL_TYPE="BENCHMARKS" # CANDLE_IMAGE=/software/improve/images/GraphDRP.sif # lambda CANDLE_IMAGE=None # Polaris -export MODEL_NAME="nt3" - # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported export OBJ_RETURN="val_loss" From 691e6594e8aa9b79b0f26cfb437621a8fd7f6adc Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 4 Apr 2023 13:08:57 -0700 Subject: [PATCH 500/601] o Get cmp-cv to run --- workflows/cmp-cv/py/compare.py | 3 ++- workflows/cmp-cv/test/upf-1.txt | 4 ++-- workflows/common/sh/utils.sh | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/workflows/cmp-cv/py/compare.py b/workflows/cmp-cv/py/compare.py index 165182dc..f82b1ba7 100644 --- a/workflows/cmp-cv/py/compare.py +++ b/workflows/cmp-cv/py/compare.py @@ -1,4 +1,5 @@ CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") +MODEL_NAME = os.getenv("MODEL_NAME") def compare(exp_id, run_id): @@ -7,7 +8,7 @@ def compare(exp_id, run_id): gParams = read_params(exp_id, run_id) model = gParams("model_name") - directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" + directory = f"{CANDLE_DATA_DIR}/{model}/Output/{exp_id}/{run_id}" df_res = pd.read_csv(f"{directory}/test_predictions.csv") # a class to calculate errors for subsets of the validation/test set diff --git 
a/workflows/cmp-cv/test/upf-1.txt b/workflows/cmp-cv/test/upf-1.txt index bc465052..04c0473e 100644 --- a/workflows/cmp-cv/test/upf-1.txt +++ b/workflows/cmp-cv/test/upf-1.txt @@ -1,4 +1,4 @@ {"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} {"id": "RUN001", "epochs": 2, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} -{"id": "RUN002", "epochs": 1, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} -{"id": "RUN003", "epochs": 2, "model_name": "SWnet_CCLE", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} +{"id": "RUN002", "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +{"id": "RUN003", "epochs": 2, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 75dc47ee..f7d25814 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -180,9 +180,7 @@ get_expid() echo "CANDLE_MODEL_TYPE is set to: ${CANDLE_MODEL_TYPE}" - if [[ -z "${MODEL_NAME}" ]]; then - MODEL_NAME="cmp" - fi + MODEL_NAME=${MODEL_NAME:-cmp} echo "MODEL_NAME is set to: ${MODEL_NAME}" From e6684e65da64e1f6e19b1f435ee4728a4b6935ad Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 4 Apr 2023 16:29:54 -0500 Subject: [PATCH 501/601] Do not quote strings --- workflows/common/python/runner_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/python/runner_utils.py b/workflows/common/python/runner_utils.py index fdd0fe61..6f111e1f 100644 --- a/workflows/common/python/runner_utils.py +++ b/workflows/common/python/runner_utils.py @@ -102,7 +102,7 @@ def expand_params(params, hyper_parameter_map): if type(v) in DATA_TYPES: v = DATA_TYPES[type(v)] if isinstance(v, basestring): - v = "'{}'".format(v) + v = "{}".format(v) if k == "solr_root" or k == "timeout" or k == "id": # this must written at the end pass # Not a command-line parameter From 25eceaeae9d27216b3a69b8b037e5a40edf64492 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 4 Apr 2023 16:30:15 -0500 Subject: [PATCH 502/601] WS --- workflows/cmp-cv/swift/workflow.swift | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift index 5a2e37ec..f13b6215 100644 --- a/workflows/cmp-cv/swift/workflow.swift +++ b/workflows/cmp-cv/swift/workflow.swift @@ -17,13 +17,13 @@ string FRAMEWORK = "pytorch"; // Scan command line // file plan = input(argv("plan")); -file model_file = input(argv("models")); -file gparams_file = input(argv("gparams")); -int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); +file model_file = input(argv("models")); +file gparams_file = input(argv("gparams")); +int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); -string expid = getenv("EXPID"); -string turbine_output = getenv("TURBINE_OUTPUT"); -string candle_model_type = getenv("CANDLE_MODEL_TYPE"); +string expid = getenv("EXPID"); +string turbine_output = getenv("TURBINE_OUTPUT"); +string candle_model_type = getenv("CANDLE_MODEL_TYPE"); // Report some key facts: printf("CMP-CV: %s", filename(model_file)); From e7f08af98bf97ffe8715fb55961f6cc373e11d3a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 4 Apr 2023 16:30:25 -0500 Subject: [PATCH 503/601] Add 
numbers --- workflows/common/swift/obj_container.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/swift/obj_container.swift b/workflows/common/swift/obj_container.swift index bf30317a..717dd03e 100644 --- a/workflows/common/swift/obj_container.swift +++ b/workflows/common/swift/obj_container.swift @@ -40,7 +40,7 @@ app (void o) run_model (string model_sh, string params, string expid, string runid, string model_name, string candle_image) { - // 1 2 3 4 + // 1 2 3 4 5 6 "bash" model_sh FRAMEWORK params expid runid model_name candle_image; } From 4a3135130f73bf75e2429bafbed7f7071888d249 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 5 Apr 2023 12:03:44 -0700 Subject: [PATCH 504/601] o Write empty results.txt if nothing found, add more Params to singularity call, fix compare.py and wait for drug_features.csv file. cmp-cv runs on lambda. TODO: fix compare --- workflows/cmp-cv/swift/workflow.swift | 7 ++-- workflows/cmp-cv/test/upf-1.txt | 8 ++--- workflows/common/python/compare.py | 49 ++++++++++++++------------- workflows/common/sh/model.sh | 7 ++-- 4 files changed, 38 insertions(+), 33 deletions(-) diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift index f13b6215..7ae87823 100644 --- a/workflows/cmp-cv/swift/workflow.swift +++ b/workflows/cmp-cv/swift/workflow.swift @@ -45,9 +45,10 @@ string results[]; // "compare.compare(\"%s\", \"%s\")") % (exp_id, run_id); // } -compare(string expid, string runid) +compare(string model_name, string expid, string runid) { - python_persist("import compare", "compare.compare(\"%s\", \"%s\")" % (expid, runid) ); + printf("Calling compare with model_name: %s", model_name)=> + python_persist("import compare", "compare.compare(\"%s\", \"%s\", \"%s\")" % (model_name, expid, runid) ); // python_persist("import compare", "compare.compare()"); } @@ -73,7 +74,7 @@ foreach gparam, j in gparams_lines printf("MODEL: %s", model_name); // printf(gparams); // results[runid] = obj(gparam, expid, repr(runid) ); - results[runid] = obj_container(gparam, expid, repr(runid), model_name, candle_image) => compare(expid, repr(runid) ); + results[runid] = obj_container(gparam, expid, repr(runid), model_name, candle_image) => compare(model_name, expid, repr(runid) ); // results[runid] = obj(gparam, expid, repr(runid)); // => compare(expid, repr(runid) ); diff --git a/workflows/cmp-cv/test/upf-1.txt b/workflows/cmp-cv/test/upf-1.txt index 04c0473e..5b762e1c 100644 --- a/workflows/cmp-cv/test/upf-1.txt +++ b/workflows/cmp-cv/test/upf-1.txt @@ -1,4 +1,4 @@ -{"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} -{"id": "RUN001", "epochs": 2, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} -{"id": "RUN002", "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} -{"id": "RUN003", "epochs": 2, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +{"id": "RUN001", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} +{"id": "RUN002", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} +{"id": "RUN003", "epochs": 1, "model_name": "SWnet", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} +{"id": "RUN004", "epochs": 1, "model_name": "SWnet", "candle_image": 
"/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} diff --git a/workflows/common/python/compare.py b/workflows/common/python/compare.py index 5b66d6c8..54a94ba0 100644 --- a/workflows/common/python/compare.py +++ b/workflows/common/python/compare.py @@ -25,17 +25,17 @@ CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") -def compare(exp_id, run_id): +def compare(model_name, exp_id, run_id): cmp_results = {} print(f"compare: run_id={run_id}") # gParams = read_params(exp_id, run_id) # model = gParams("model_name") - model = "DrugCell" # TODO: Hardcoded. have to get this from output dir? - turbine_output = os.getenv("TURBINE_OUTPUT") - - outdir = os.path.join(turbine_output, run_id, 'Output', 'EXP000', - 'RUN000') # TODO: Have to fix this + # model = "DrugCell" # TODO: Hardcoded. have to get this from output dir? + # turbine_output = os.getenv("TURBINE_OUTPUT") + + CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") + outdir = os.path.join(CANDLE_DATA_DIR, model_name, "Output", exp_id, run_id) directory = outdir # directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" print("reading the predictions....") @@ -43,24 +43,25 @@ def compare(exp_id, run_id): # a class to calculate errors for subsets of the validation/test set print("reading the drug feature file....") - # TODO: Should have to save the above file in this file - bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' - ) # TODO: have to have a drug features for a common test set - subset_err, final_domain_err = bmk.error_by_feature_domains_model( - df_res, conditions) - - # or this - # fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' - # subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) - - # collect results for comparison - cmp_prop = 'nAtom' # TODO: Get this from gParameters - subset_err.set_index( - 'prop', inplace=True - ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt - cmp_results[run_id] = subset_err.loc[ - cmp_prop, - 'error'] # this is the property based on which we want to do the comparison + # # TODO: Should have to save the above file in this file + # bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + # ) # TODO: have to have a drug features for a common test set + # subset_err, final_domain_err = bmk.error_by_feature_domains_model( + # df_res, conditions) + + # # or this + # # fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + # # subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) + + # # collect results for comparison + # cmp_prop = 'nAtom' # TODO: Get this from gParameters + # subset_err.set_index( + # 'prop', inplace=True + # ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt + # cmp_results[run_id] = subset_err.loc[ + # cmp_prop, + # 'error'] # this is the property based on which we want to do the comparison + cmp_results[run_id] = -1 # set to -1 for now as we don't have the drug features file with open(f"{directory}/subset_err.txt", "w") as fp: fp.write(str(cmp_results[run_id])) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 2796842b..865a17e5 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -130,7 +130,9 @@ then --bind $CANDLE_DATA_DIR:/candle_data_dir $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET /candle_data_dir - $FLAGS ) # $INTERNAL_DIRECTORY/parameters.txt + $FLAGS # $INTERNAL_DIRECTORY/parameters.txt + --experiment_id $EXPID + --run_id $RUNID) else # "BENCHMARKS" # The Python command line arguments: @@ -160,7 +162,8 @@ then # 
NOTE: Enabling set -x will break the following RES=$(awk -v FS="IMPROVE_RESULT" 'NF>1 {x=$2} END {print x}' model.log) echo $RES - RESULT="$(echo $RES | grep -Eo '[+-]?[0-9]+([.][0-9]+)?')" + RESULT="$(echo $RES | grep -Eo '[+-]?[0-9]+([.][0-9]+)?')" || true + echo $RESULT, ": Result" echo $RESULT > $INSTANCE_DIRECTORY/result.txt else wait $PID From 61d14e3932b613b2372edd5c6f2b664f8fd83f7d Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 5 Apr 2023 16:36:21 -0700 Subject: [PATCH 505/601] o Move compare.py to cmp-cv workflow level --- workflows/cmp-cv/py/compare.py | 178 ++++++++++++++++++++++++++--- workflows/cmp-cv/swift/workflow.sh | 3 +- workflows/common/python/compare.py | 171 --------------------------- 3 files changed, 163 insertions(+), 189 deletions(-) delete mode 100644 workflows/common/python/compare.py diff --git a/workflows/cmp-cv/py/compare.py b/workflows/cmp-cv/py/compare.py index f82b1ba7..54a94ba0 100644 --- a/workflows/cmp-cv/py/compare.py +++ b/workflows/cmp-cv/py/compare.py @@ -1,27 +1,171 @@ -CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") -MODEL_NAME = os.getenv("MODEL_NAME") +"""This script can be used to filter a subset of the test set based on the +properties of the drug molecules. + +For example, here we can select the molecules of which the 'prop' is +between two values (provided in the 2nd and 3rd elements of each list in +the conditions list. We can then find the prediction errors for this +domain. Knowledge of the errors of differnt molecular groups is helpful +to understand the currrent deficiencies of the drug response models (or +any molecular property prediction model in general). This knowledge is +then allow us to improve the models as well as use predictions from the +models which produce highly accurate preidictions for certain domains. +""" + +import os +import pandas as pd +import pandas as pd +import numpy as np +from sklearn.metrics import mean_squared_error +conditions = pd.DataFrame( + [['nAromAtom', 5, 10], ['nAtom', 20, 50], ['BertzCT', 800, 1000]], + columns=['prop', 'low', 'high']) +# from cmp_utils import conditions, Benchmark + +CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") -def compare(exp_id, run_id): +def compare(model_name, exp_id, run_id): + cmp_results = {} print(f"compare: run_id={run_id}") - gParams = read_params(exp_id, run_id) - model = gParams("model_name") + # gParams = read_params(exp_id, run_id) + # model = gParams("model_name") - directory = f"{CANDLE_DATA_DIR}/{model}/Output/{exp_id}/{run_id}" + # model = "DrugCell" # TODO: Hardcoded. have to get this from output dir? 
+ # turbine_output = os.getenv("TURBINE_OUTPUT") + + CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") + outdir = os.path.join(CANDLE_DATA_DIR, model_name, "Output", exp_id, run_id) + directory = outdir + # directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" + print("reading the predictions....") df_res = pd.read_csv(f"{directory}/test_predictions.csv") # a class to calculate errors for subsets of the validation/test set - bmk = Benchmark(fp_path='drug_features.csv' - ) # TODO: have to have a drug features for a common test set - subset_err = bmk.error_by_feature_domains_model(df_res, conditions) - - # collect results for comparison - subset_err.set_index( - 'prop', inplace=True - ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt - cmp_results[run_id] = subset_err.loc[ - cmp_prop, - 'error'] # this is the property based on which we want to do the comparison + print("reading the drug feature file....") + # # TODO: Should have to save the above file in this file + # bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + # ) # TODO: have to have a drug features for a common test set + # subset_err, final_domain_err = bmk.error_by_feature_domains_model( + # df_res, conditions) + + # # or this + # # fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + # # subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) + + # # collect results for comparison + # cmp_prop = 'nAtom' # TODO: Get this from gParameters + # subset_err.set_index( + # 'prop', inplace=True + # ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt + # cmp_results[run_id] = subset_err.loc[ + # cmp_prop, + # 'error'] # this is the property based on which we want to do the comparison + cmp_results[run_id] = -1 # set to -1 for now as we don't have the drug features file with open(f"{directory}/subset_err.txt", "w") as fp: fp.write(str(cmp_results[run_id])) + + return str(cmp_results[run_id]) + + +def error_by_feature_domains_model(fp_path, preds, conditions): + + fps = pd.read_csv(fp_path) + report = [] + preds['err'] = abs(preds['true'] - preds['pred']) + keep = preds.copy() + for i in range(conditions.shape[0]): + + prop = conditions.loc[i, 'prop'] + low = conditions.loc[i, 'low'] + high = conditions.loc[i, 'high'] + + locs = np.logical_and(fps[prop] <= high, fps[prop] > low) + smiles = fps.loc[locs, 'smiles'].values + tmp = preds[preds.smiles.isin(smiles)] + mean_err = tmp.err.mean() + + report.append([prop, low, high, mean_err]) + + keep = keep[keep.smiles.isin(smiles)] + + final_domain_err = keep.err.mean() # return this + report = pd.DataFrame(report, columns=['prop', 'low', 'high', 'error']) + return report, final_domain_err + + +class Benchmark: + + def __init__(self, fp_path): + + self.fps = pd.read_csv(fp_path) + # self.model_preds = model_preds + # self.feature_conditions = feature_conditions + self.reports = {} + + def error_by_feature_domains_model(self, preds, conditions): + + fps = self.fps + report = [] + preds['err'] = abs(preds['true'] - preds['pred']) + keep = preds.copy() + for i in range(conditions.shape[0]): + + prop = conditions.loc[i, 'prop'] + low = conditions.loc[i, 'low'] + high = conditions.loc[i, 'high'] + + locs = np.logical_and(fps[prop] <= high, fps[prop] > low) + smiles = fps.loc[locs, 'smiles'].values + tmp = preds[preds.smiles.isin(smiles)] + mean_err = tmp.err.mean() + + report.append([prop, low, high, mean_err]) + + keep = keep[keep.smiles.isin(smiles)] + + final_domain_err = keep.err.mean() # return this + report = 
pd.DataFrame(report, columns=['prop', 'low', 'high', 'error']) + return report, final_domain_err + + def error_by_feature_domains(self, feature_conditions): + + results = [] + for model_name, pred in self.model_preds.items(): + + report = self.error_by_feature_domains_model( + pred, feature_conditions) + report.loc[:, 'model'] = model_name + results.append(report) + + results = pd.concat(results, axis=0) + results = results.loc[:, ['model', 'prop', 'low', 'high', 'error']] + results.reset_index(drop=True, inplace=True) + + return results + + def rank_by_acc(self, metric='rmse', th=3): + + results = {} + for model_name, pred in self.model_preds.items(): + sub = pred[pred.labels > th] + rmse = mean_squared_error(y_true=sub['labels'], + y_pred=sub['preds'])**.5 + + results[model_name] = {'rmse': rmse} + + results = pd.DataFrame.from_dict(results) + results = results.T + return results + + +def create_grid_files(): + + dc_grid = {'epochs': [1, 2], 'lr': [1e-2, 1e-3]} + sw_grid = {'epochs': [3, 4], 'lr': [1e-2, 1e-5]} + + with open('DrugCell_grid.json', 'w') as fp: + json.dump(dc_grid, fp) + + with open('SWnet_CCLE_grid.json', 'w') as fp: + json.dump(sw_grid, fp) diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh index 93c0e15e..37fc69f0 100755 --- a/workflows/cmp-cv/swift/workflow.sh +++ b/workflows/cmp-cv/swift/workflow.sh @@ -44,9 +44,10 @@ source_site sched $SITE # Set up PYTHONPATH for model source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + export PYTHONPATH="${PYTHONPATH}:/homes/ac.gpanapitiya/ccmg-mtg/models/to_Candle/DrugCell" export PYTHONPATH="${PYTHONPATH}:/homes/ac.gpanapitiya/ccmg-mtg/models/to_Candle/SWnet" - +export PYTHONPATH="${PYTHONPATH}:$WORKFLOWS_ROOT/cmp-cv/py" log_path PYTHONPATH export TURBINE_JOBNAME="CMP_${EXPID}" diff --git a/workflows/common/python/compare.py b/workflows/common/python/compare.py deleted file mode 100644 index 54a94ba0..00000000 --- a/workflows/common/python/compare.py +++ /dev/null @@ -1,171 +0,0 @@ -"""This script can be used to filter a subset of the test set based on the -properties of the drug molecules. - -For example, here we can select the molecules of which the 'prop' is -between two values (provided in the 2nd and 3rd elements of each list in -the conditions list. We can then find the prediction errors for this -domain. Knowledge of the errors of differnt molecular groups is helpful -to understand the currrent deficiencies of the drug response models (or -any molecular property prediction model in general). This knowledge is -then allow us to improve the models as well as use predictions from the -models which produce highly accurate preidictions for certain domains. -""" - -import os -import pandas as pd -import pandas as pd -import numpy as np -from sklearn.metrics import mean_squared_error - -conditions = pd.DataFrame( - [['nAromAtom', 5, 10], ['nAtom', 20, 50], ['BertzCT', 800, 1000]], - columns=['prop', 'low', 'high']) -# from cmp_utils import conditions, Benchmark - -CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") - - -def compare(model_name, exp_id, run_id): - cmp_results = {} - print(f"compare: run_id={run_id}") - # gParams = read_params(exp_id, run_id) - # model = gParams("model_name") - - # model = "DrugCell" # TODO: Hardcoded. have to get this from output dir? 
- # turbine_output = os.getenv("TURBINE_OUTPUT") - - CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") - outdir = os.path.join(CANDLE_DATA_DIR, model_name, "Output", exp_id, run_id) - directory = outdir - # directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" - print("reading the predictions....") - df_res = pd.read_csv(f"{directory}/test_predictions.csv") - - # a class to calculate errors for subsets of the validation/test set - print("reading the drug feature file....") - # # TODO: Should have to save the above file in this file - # bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' - # ) # TODO: have to have a drug features for a common test set - # subset_err, final_domain_err = bmk.error_by_feature_domains_model( - # df_res, conditions) - - # # or this - # # fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' - # # subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) - - # # collect results for comparison - # cmp_prop = 'nAtom' # TODO: Get this from gParameters - # subset_err.set_index( - # 'prop', inplace=True - # ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt - # cmp_results[run_id] = subset_err.loc[ - # cmp_prop, - # 'error'] # this is the property based on which we want to do the comparison - cmp_results[run_id] = -1 # set to -1 for now as we don't have the drug features file - with open(f"{directory}/subset_err.txt", "w") as fp: - fp.write(str(cmp_results[run_id])) - - return str(cmp_results[run_id]) - - -def error_by_feature_domains_model(fp_path, preds, conditions): - - fps = pd.read_csv(fp_path) - report = [] - preds['err'] = abs(preds['true'] - preds['pred']) - keep = preds.copy() - for i in range(conditions.shape[0]): - - prop = conditions.loc[i, 'prop'] - low = conditions.loc[i, 'low'] - high = conditions.loc[i, 'high'] - - locs = np.logical_and(fps[prop] <= high, fps[prop] > low) - smiles = fps.loc[locs, 'smiles'].values - tmp = preds[preds.smiles.isin(smiles)] - mean_err = tmp.err.mean() - - report.append([prop, low, high, mean_err]) - - keep = keep[keep.smiles.isin(smiles)] - - final_domain_err = keep.err.mean() # return this - report = pd.DataFrame(report, columns=['prop', 'low', 'high', 'error']) - return report, final_domain_err - - -class Benchmark: - - def __init__(self, fp_path): - - self.fps = pd.read_csv(fp_path) - # self.model_preds = model_preds - # self.feature_conditions = feature_conditions - self.reports = {} - - def error_by_feature_domains_model(self, preds, conditions): - - fps = self.fps - report = [] - preds['err'] = abs(preds['true'] - preds['pred']) - keep = preds.copy() - for i in range(conditions.shape[0]): - - prop = conditions.loc[i, 'prop'] - low = conditions.loc[i, 'low'] - high = conditions.loc[i, 'high'] - - locs = np.logical_and(fps[prop] <= high, fps[prop] > low) - smiles = fps.loc[locs, 'smiles'].values - tmp = preds[preds.smiles.isin(smiles)] - mean_err = tmp.err.mean() - - report.append([prop, low, high, mean_err]) - - keep = keep[keep.smiles.isin(smiles)] - - final_domain_err = keep.err.mean() # return this - report = pd.DataFrame(report, columns=['prop', 'low', 'high', 'error']) - return report, final_domain_err - - def error_by_feature_domains(self, feature_conditions): - - results = [] - for model_name, pred in self.model_preds.items(): - - report = self.error_by_feature_domains_model( - pred, feature_conditions) - report.loc[:, 'model'] = model_name - results.append(report) - - results = pd.concat(results, axis=0) - results = results.loc[:, ['model', 'prop', 'low', 
'high', 'error']] - results.reset_index(drop=True, inplace=True) - - return results - - def rank_by_acc(self, metric='rmse', th=3): - - results = {} - for model_name, pred in self.model_preds.items(): - sub = pred[pred.labels > th] - rmse = mean_squared_error(y_true=sub['labels'], - y_pred=sub['preds'])**.5 - - results[model_name] = {'rmse': rmse} - - results = pd.DataFrame.from_dict(results) - results = results.T - return results - - -def create_grid_files(): - - dc_grid = {'epochs': [1, 2], 'lr': [1e-2, 1e-3]} - sw_grid = {'epochs': [3, 4], 'lr': [1e-2, 1e-5]} - - with open('DrugCell_grid.json', 'w') as fp: - json.dump(dc_grid, fp) - - with open('SWnet_CCLE_grid.json', 'w') as fp: - json.dump(sw_grid, fp) From a84222266166fab524b969397014b336f731427c Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 5 Apr 2023 17:39:59 -0700 Subject: [PATCH 506/601] o enable drug_features.csv, a manual process to put the file in CDD for now --- workflows/cmp-cv/py/compare.py | 36 ++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/workflows/cmp-cv/py/compare.py b/workflows/cmp-cv/py/compare.py index 54a94ba0..e9916af7 100644 --- a/workflows/cmp-cv/py/compare.py +++ b/workflows/cmp-cv/py/compare.py @@ -43,25 +43,27 @@ def compare(model_name, exp_id, run_id): # a class to calculate errors for subsets of the validation/test set print("reading the drug feature file....") - # # TODO: Should have to save the above file in this file - # bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' - # ) # TODO: have to have a drug features for a common test set - # subset_err, final_domain_err = bmk.error_by_feature_domains_model( - # df_res, conditions) + # TODO: Should have to save the above file in this file + # copy and place the following in your CANDLE_DATA_DIR + # cp /lambda_stor/homes/ac.gpanapitiya/ccmg-mtg/benchmark/drug_features.csv . 
+ bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + ) # TODO: have to have a drug features for a common test set + subset_err, final_domain_err = bmk.error_by_feature_domains_model( + df_res, conditions) # # or this - # # fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' - # # subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) - - # # collect results for comparison - # cmp_prop = 'nAtom' # TODO: Get this from gParameters - # subset_err.set_index( - # 'prop', inplace=True - # ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt - # cmp_results[run_id] = subset_err.loc[ - # cmp_prop, - # 'error'] # this is the property based on which we want to do the comparison - cmp_results[run_id] = -1 # set to -1 for now as we don't have the drug features file + # fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + # subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) + + # collect results for comparison + cmp_prop = 'nAtom' # TODO: Get this from gParameters + subset_err.set_index( + 'prop', inplace=True + ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt + cmp_results[run_id] = subset_err.loc[ + cmp_prop, + 'error'] # this is the property based on which we want to do the comparison + # cmp_results[run_id] = -1 # set to -1 for now as we don't have the drug features file with open(f"{directory}/subset_err.txt", "w") as fp: fp.write(str(cmp_results[run_id])) From 2c100d798e68d265ab76c37cd7628aab4d0a9c2b Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 12 Apr 2023 00:25:55 -0500 Subject: [PATCH 507/601] o Some fixes to the noise workflow --- .../uq-noise/swift/workflow-gauss-abs.sh | 2 +- workflows/uq-noise/swift/workflow.sh | 22 +++++++++++++++++-- workflows/uq-noise/swift/workflow.swift | 2 +- workflows/uq-noise/test/cfg-sys-1.sh | 4 +++- workflows/uq-noise/test/cfg-sys-small.sh | 5 ++++- workflows/uq-noise/test/gauss-abs.sh | 1 + workflows/uq-noise/test/test-1.sh | 2 ++ 7 files changed, 32 insertions(+), 6 deletions(-) diff --git a/workflows/uq-noise/swift/workflow-gauss-abs.sh b/workflows/uq-noise/swift/workflow-gauss-abs.sh index 86937f73..9a015ee1 100755 --- a/workflows/uq-noise/swift/workflow-gauss-abs.sh +++ b/workflows/uq-noise/swift/workflow-gauss-abs.sh @@ -182,7 +182,7 @@ swift-t -n $PROCS \ -e SH_TIMEOUT \ -e IGNORE_ERRORS \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} |& \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} 2>&1 \ tee $STDOUT diff --git a/workflows/uq-noise/swift/workflow.sh b/workflows/uq-noise/swift/workflow.sh index 09d70576..2dc3e65a 100755 --- a/workflows/uq-noise/swift/workflow.sh +++ b/workflows/uq-noise/swift/workflow.sh @@ -56,7 +56,7 @@ source_site env $SITE source_site sched $SITE # Set PYTHONPATH for BENCHMARK related stuff -PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT +PYTHONPATH+=:$BENCHMARK_DIR:$XCORR_ROOT PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT @@ -87,6 +87,12 @@ then # $EMEWS_PROJECT_ROOT/db/db-cplo-init $DB_FILE $UQ_NOISE_ID fi + + +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + + CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT -exp_id=$EXPID -site=$SITE @@ -125,6 +131,18 @@ then echo "Turbine will wait for job completion." 
fi +# Handle %-escapes in TURBINE_STDOUT +if [ $SITE == "summit" ] || \ + [ $SITE == "biowulf" ] || \ + [ $SITE == "polaris" ] +then + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +else + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" +fi + +mkdir -pv $TURBINE_OUTPUT/out + if [[ ${MACHINE:-} == "" ]] then STDOUT=$TURBINE_OUTPUT/output.txt @@ -185,7 +203,7 @@ swift-t -n $PROCS \ -e SH_TIMEOUT \ -e IGNORE_ERRORS \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} |& \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} 2>&1 \ tee $STDOUT diff --git a/workflows/uq-noise/swift/workflow.swift b/workflows/uq-noise/swift/workflow.swift index 6fec7dd5..461c8de9 100644 --- a/workflows/uq-noise/swift/workflow.swift +++ b/workflows/uq-noise/swift/workflow.swift @@ -52,7 +52,7 @@ foreach levelx, i in x_noise_levels " \"epochs\" : 1 } ") % (x_noise_level, y_noise_level); printf("running: %s", params); - result = obj(params, run_id); + result = obj(params, exp_id, run_id); printf("result %s : x_noise %0.3f y_noise %0.3f : %s", run_id, x_noise_level, y_noise_level, result); } diff --git a/workflows/uq-noise/test/cfg-sys-1.sh b/workflows/uq-noise/test/cfg-sys-1.sh index 03183555..932d8daa 100644 --- a/workflows/uq-noise/test/cfg-sys-1.sh +++ b/workflows/uq-noise/test/cfg-sys-1.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-30} +export PROCS=${PROCS:-3} # export PROCS=${PROCS:-128} # MPI processes per node @@ -17,6 +17,8 @@ export PPN=${PPN:-1} export WALLTIME=${WALLTIME:-02:00:00} +RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" diff --git a/workflows/uq-noise/test/cfg-sys-small.sh b/workflows/uq-noise/test/cfg-sys-small.sh index e009c597..9aaed891 100644 --- a/workflows/uq-noise/test/cfg-sys-small.sh +++ b/workflows/uq-noise/test/cfg-sys-small.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-2} +export PROCS=${PROCS:-3} # export PROCS=${PROCS:-128} # MPI processes per node @@ -17,6 +17,9 @@ export QUEUE=${QUEUE:-debug-cache-quad} export WALLTIME=${WALLTIME:-00:15:00} +RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + + # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" diff --git a/workflows/uq-noise/test/gauss-abs.sh b/workflows/uq-noise/test/gauss-abs.sh index 2938256b..7a0fa824 100755 --- a/workflows/uq-noise/test/gauss-abs.sh +++ b/workflows/uq-noise/test/gauss-abs.sh @@ -39,6 +39,7 @@ export CFG_PRM=$THIS/cfg-prm-1.sh # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported export OBJ_RETURN="val_loss" +export CANDLE_MODEL_TYPE="BENCHMARKS" if [[ $SITE == "theta" ]] then diff --git a/workflows/uq-noise/test/test-1.sh b/workflows/uq-noise/test/test-1.sh index eed7152d..9dd15f9d 100755 --- a/workflows/uq-noise/test/test-1.sh +++ b/workflows/uq-noise/test/test-1.sh @@ -40,6 +40,8 @@ export CFG_PRM=$THIS/cfg-prm-1.sh # val_loss (default) and val_corr are supported export OBJ_RETURN="val_loss" +export CANDLE_MODEL_TYPE="BENCHMARKS" + if [[ $SITE == "theta" ]] then export WAIT=1 From 324d2acc10494f248db72eebf7e711ba8980da8b Mon Sep 17 00:00:00 2001 
From: Rajeev Jain Date: Wed, 12 Apr 2023 12:28:54 -0700 Subject: [PATCH 508/601] o Add new workflow cross-study-generalization CSG, also add new param model_script - train.sh preprocess.sh or infer.sh. Fix cmp-cv workflow --- workflows/csg/swift/workflow.sh | 118 +++++++++++++++++++++++++++++ workflows/csg/swift/workflow.swift | 58 ++++++++++++++ workflows/csg/test/cfg-sys-1.sh | 27 +++++++ workflows/csg/test/make-upf-1.sh | 26 +++++++ workflows/csg/test/test-small-1.sh | 28 +++++++ workflows/csg/test/upf-1.txt | 4 + 6 files changed, 261 insertions(+) create mode 100755 workflows/csg/swift/workflow.sh create mode 100644 workflows/csg/swift/workflow.swift create mode 100644 workflows/csg/test/cfg-sys-1.sh create mode 100755 workflows/csg/test/make-upf-1.sh create mode 100755 workflows/csg/test/test-small-1.sh create mode 100644 workflows/csg/test/upf-1.txt diff --git a/workflows/csg/swift/workflow.sh b/workflows/csg/swift/workflow.sh new file mode 100755 index 00000000..37fc69f0 --- /dev/null +++ b/workflows/csg/swift/workflow.sh @@ -0,0 +1,118 @@ +#! /usr/bin/env bash +set -eu + +# CMP-CV WORKFLOW SH + +# Autodetect this workflow directory +export CANDLE_PROJECT_ROOT=$( realpath $( dirname $0 )/.. ) +export WORKFLOWS_ROOT=$( realpath $CANDLE_PROJECT_ROOT/.. ) + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "CMP-CV: usage: workflow.sh SITE EXPID CFG_SYS PLAN" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! { + # Sets SITE + # Sets EXPID, TURBINE_OUTPUT + # Sets CFG_SYS + # PLAN is the hyperparameter list file + get_site $1 && \ + get_expid $2 && \ + get_cfg_sys $3 && \ + UPF=$4 + MODELS=$5 + } +then + usage + exit 1 +fi + +source_site env $SITE +source_site sched $SITE + +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + +export PYTHONPATH="${PYTHONPATH}:/homes/ac.gpanapitiya/ccmg-mtg/models/to_Candle/DrugCell" +export PYTHONPATH="${PYTHONPATH}:/homes/ac.gpanapitiya/ccmg-mtg/models/to_Candle/SWnet" +export PYTHONPATH="${PYTHONPATH}:$WORKFLOWS_ROOT/cmp-cv/py" +log_path PYTHONPATH + +export TURBINE_JOBNAME="CMP_${EXPID}" + +export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} +export BENCHMARK_TIMEOUT +PLAN="PLAN_NOT_DEFINED" +CMD_LINE_ARGS=( -expid=$EXPID + -benchmark_timeout=$BENCHMARK_TIMEOUT + -plan=$PLAN + -models=$MODELS + -gparams=$UPF + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Copy settings to TURBINE_OUTPUT for provenance +cp $CFG_SYS $TURBINE_OUTPUT + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run + +cp -v $UPF $TURBINE_OUTPUT + +# TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" +TURBINE_STDOUT= + +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + abort "cmp-cv workflow.sh: Set CANDLE_DATA_DIR!" 
+fi + +export CANDLE_IMAGE=${CANDLE_IMAGE:-} + +export SWIFT_IMPL=container + +which swift-t + +swift-t -n $PROCS \ + -o $TURBINE_OUTPUT/workflow.tic \ + ${MACHINE:-} \ + -p \ + -I $WORKFLOWS_ROOT/common/swift \ + -i obj_$SWIFT_IMPL \ + -e BENCHMARKS_ROOT \ + -e CANDLE_PROJECT_ROOT \ + -e MODEL_SH \ + -e FI_MR_CACHE_MAX_COUNT=0 \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e MODEL_NAME=${MODEL_NAME:-MODEL_NULL} \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e TURBINE_MPI_THREAD=${TURBINE_MPI_THREAD:-1} \ + $( python_envs ) \ + -e TURBINE_STDOUT=$TURBINE_STDOUT \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} + +# Can provide this to debug Python settings: +# -e PYTHONVERBOSE=1 +# Can provide this if needed for debugging crashes: +# -e PYTHONUNBUFFERED=1 +# Can provide this if needed to reset PATH: +# -e PATH=$PATH diff --git a/workflows/csg/swift/workflow.swift b/workflows/csg/swift/workflow.swift new file mode 100644 index 00000000..07fb0bab --- /dev/null +++ b/workflows/csg/swift/workflow.swift @@ -0,0 +1,58 @@ + +/** + CMP-CV WORKFLOW.SWIFT +*/ + +import assert; +import io; +import json; +import files; +import string; +import sys; + +import candle_utils; +// report_env(); + +string FRAMEWORK = "pytorch"; + +// Scan command line +// file plan = input(argv("plan")); +file model_file = input(argv("models")); +file gparams_file = input(argv("gparams")); +int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); + +string expid = getenv("EXPID"); +string turbine_output = getenv("TURBINE_OUTPUT"); +string candle_model_type = getenv("CANDLE_MODEL_TYPE"); + +// Report some key facts: +printf("Cross-Validation: %s", filename(model_file)); +system1("date \"WORKFLOW START: +%Y-%m-%d %H:%M\""); + +// Read unrolled parameter file +// string plan_lines[] = file_lines(plan); +string model_lines[] = file_lines(model_file); + +string gparams_lines[] = file_lines(gparams_file); + +// Resultant output values: +string results[]; + +foreach gparam, j in gparams_lines +{ + // runid = i*1000000 + j; + runid = j; + + printf("runid: %s", runid); + printf("gparams: %s", gparam); + + model_name = json_get(gparam, "model_name"); + candle_image = json_get(gparam, "candle_image"); + model_script = "train.sh"; + + printf("MODEL: %s", model_name); + + results[runid] = obj_container(gparam, expid, repr(runid), model_name, candle_image, model_script); + +} + diff --git a/workflows/csg/test/cfg-sys-1.sh b/workflows/csg/test/cfg-sys-1.sh new file mode 100644 index 00000000..5158b14a --- /dev/null +++ b/workflows/csg/test/cfg-sys-1.sh @@ -0,0 +1,27 @@ + +# CMP-CV CFG SYS 1 + +# Use 1 for interactive workflows +# export INTERACTIVE=1 + +# The number of MPI processes +# Note that 1 process is reserved for Swift/T +# For example, if PROCS=4 that gives you 3 workers, +# i.e., 3 concurrent Keras runs. +export PROCS=${PROCS:-2} + +# MPI processes per node. This should not exceed PROCS. +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-2} + +#export QUEUE=${QUEUE:-batch} + +export WALLTIME=${WALLTIME:-1:00:00} +echo WALLTIME: $WALLTIME + +# export MAIL_ENABLED=1 +# export MAIL_ADDRESS=woz@anl.gov + +# Benchmark run timeout: benchmark run will timeouT +# after the specified number of seconds. -1 is no timeout. 
+BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} diff --git a/workflows/csg/test/make-upf-1.sh b/workflows/csg/test/make-upf-1.sh new file mode 100755 index 00000000..eb5c0049 --- /dev/null +++ b/workflows/csg/test/make-upf-1.sh @@ -0,0 +1,26 @@ +#!/bin/zsh + + + +OUTPUT=$1 + +# Use ZSH for range operation + +EPOCHS_MIN=10 +EPOCHS_MAX=20 +BATCH_SIZE_MIN=5 +BATCH_SIZE_MAX=7 + + +for EPOCHS in {$EPOCHS_MIN..$EPOCHS_MAX} +do + for BATCH_SIZE in {$BATCH_SIZE_MIN..$BATCH_SIZE_MAX} + do + BS2=$(( 2 ** BATCH_SIZE )) + echo "{" + echo "\"epochs\": $EPOCHS," + echo "\"batch_size\": $BATCH_SIZE," + echo "MORE_PARAMS" + echo "}" + done +done > $OUTPUT diff --git a/workflows/csg/test/test-small-1.sh b/workflows/csg/test/test-small-1.sh new file mode 100755 index 00000000..22fc8222 --- /dev/null +++ b/workflows/csg/test/test-small-1.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +# CMP-CV TEST SMALL 1 + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +# export MODEL_NAME=$1 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +export OBJ_RETURN="val_loss" +CFG_SYS=$THIS/cfg-sys-1.sh + +# export MODEL_NAME="DrugCell" +# export CANDLE_IMAGE=/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif +export CANDLE_MODEL_TYPE="SINGULARITY" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt $THIS/models-1.txt diff --git a/workflows/csg/test/upf-1.txt b/workflows/csg/test/upf-1.txt new file mode 100644 index 00000000..bb88828e --- /dev/null +++ b/workflows/csg/test/upf-1.txt @@ -0,0 +1,4 @@ +{"id": "RUN001", "batch_size": 16, "epochs": 4, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +{"id": "RUN002", "batch_size": 32, "epochs": 3, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +{"id": "RUN003", "batch_size": 64, "epochs": 2, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +{"id": "RUN004", "batch_size": 128, "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} \ No newline at end of file From 5833ddd32fc64005c62d3e8a27020056b1068295 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 12 Apr 2023 12:31:57 -0700 Subject: [PATCH 509/601] o Fix args --- workflows/cmp-cv/swift/workflow.swift | 3 ++- workflows/cmp-cv/test/upf-1.txt | 10 ++++++---- workflows/common/sh/model.sh | 5 +++-- workflows/common/swift/obj_container.swift | 9 +++++---- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift index 7ae87823..1739bc49 100644 --- a/workflows/cmp-cv/swift/workflow.swift +++ b/workflows/cmp-cv/swift/workflow.swift @@ -74,7 +74,8 @@ foreach gparam, j in gparams_lines printf("MODEL: %s", model_name); // printf(gparams); // results[runid] = obj(gparam, expid, repr(runid) ); - results[runid] = obj_container(gparam, expid, repr(runid), model_name, candle_image) => compare(model_name, expid, repr(runid) ); + model_script = "train.sh"; + results[runid] = obj_container(gparam, expid, repr(runid), model_name, candle_image, model_script) => compare(model_name, expid, repr(runid)); // results[runid] = obj(gparam, expid, repr(runid)); // => compare(expid, repr(runid) ); diff --git a/workflows/cmp-cv/test/upf-1.txt b/workflows/cmp-cv/test/upf-1.txt index 5b762e1c..0bd73242 100644 --- a/workflows/cmp-cv/test/upf-1.txt 
+++ b/workflows/cmp-cv/test/upf-1.txt @@ -1,4 +1,6 @@ -{"id": "RUN001", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} -{"id": "RUN002", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} -{"id": "RUN003", "epochs": 1, "model_name": "SWnet", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} -{"id": "RUN004", "epochs": 1, "model_name": "SWnet", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/SWnet.sif"} +{"id": "RUN001", "epochs": 1, "model_name": "DrugCell", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/DrugCell.sif"} +{"id": "RUN002", "epochs": 2, "model_name": "DrugCell", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/DrugCell.sif"} +{"id": "RUN003", "epochs": 1, "model_name": "SWnet", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +{"id": "RUN004", "epochs": 2, "model_name": "SWnet", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} + + diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 865a17e5..81832093 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -40,10 +40,11 @@ if (( ${#} >= 4 )) RUNID=$4 fi -if (( ${#} == 6 )) +if (( ${#} == 7 )) then export MODEL_NAME=$5 export CANDLE_IMAGE=$6 + export MODEL_SCRIPT=$7 fi # Each model run, runs in its own "instance" directory @@ -128,7 +129,7 @@ then # The Singularity command line arguments: MODEL_CMD=( singularity exec --nv --bind $CANDLE_DATA_DIR:/candle_data_dir - $CANDLE_IMAGE train.sh $ADLB_RANK_OFFSET + $CANDLE_IMAGE $MODEL_SCRIPT $ADLB_RANK_OFFSET /candle_data_dir $FLAGS # $INTERNAL_DIRECTORY/parameters.txt --experiment_id $EXPID diff --git a/workflows/common/swift/obj_container.swift b/workflows/common/swift/obj_container.swift index 717dd03e..bfeee236 100644 --- a/workflows/common/swift/obj_container.swift +++ b/workflows/common/swift/obj_container.swift @@ -11,7 +11,8 @@ string expid, string runid, string model_name, - string candle_image) + string candle_image, + string model_script) { string model_sh = getenv("MODEL_SH"); string turbine_output = getenv("TURBINE_OUTPUT"); @@ -27,7 +28,7 @@ // We do not use a file type here because this file may not be created, // which is handled by get_results() string result_file = outdir/"result.txt"; - wait (run_model(model_sh, params, expid, runid, model_name, candle_image)) + wait (run_model(model_sh, params, expid, runid, model_name, candle_image, model_script)) { obj_result = get_results(result_file); } @@ -38,10 +39,10 @@ Swift/T app function that runs the Benchmark */ app (void o) run_model (string model_sh, string params, - string expid, string runid, string model_name, string candle_image) + string expid, string runid, string model_name, string candle_image, string model_script) { // 1 2 3 4 5 6 - "bash" model_sh FRAMEWORK params expid runid model_name candle_image; + "bash" model_sh FRAMEWORK params expid runid model_name candle_image model_script; } /** From 614ef9f90a2059260e4936683074e4f481f08507 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 12 Apr 2023 14:57:24 -0700 Subject: [PATCH 510/601] o Add comments --- workflows/cmp-cv/test/upf-1.txt | 14 +++++++++++++- workflows/csg/swift/workflow.swift | 4 ++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/workflows/cmp-cv/test/upf-1.txt 
b/workflows/cmp-cv/test/upf-1.txt index 0bd73242..86d087d2 100644 --- a/workflows/cmp-cv/test/upf-1.txt +++ b/workflows/cmp-cv/test/upf-1.txt @@ -3,4 +3,16 @@ {"id": "RUN003", "epochs": 1, "model_name": "SWnet", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} {"id": "RUN004", "epochs": 2, "model_name": "SWnet", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} - +#GraphDRP +#{"id": "RUN004", "training_data" : "path/to/dir, +# "testing_data": "path/to/dir", +# "infer_data" : [ path/to/dir ], +# "model_params": name_of_model_params_output_of_training, +# "epochs": 50, +# "model_name": "GraphDRP", +# "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +#{"id": "RUN004", "epochs": 50, "model_name": "GraphDRP", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +#{"id": "RUN004", "epochs": 50, "model_name": "GraphDRP", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +#{"id": "RUN004", "epochs": 50, "model_name": "GraphDRP", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +#{"id": "RUN004", "epochs": 50, "model_name": "DeepTTC", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +#DeepTTC \ No newline at end of file diff --git a/workflows/csg/swift/workflow.swift b/workflows/csg/swift/workflow.swift index 07fb0bab..f0ac2a3e 100644 --- a/workflows/csg/swift/workflow.swift +++ b/workflows/csg/swift/workflow.swift @@ -51,8 +51,8 @@ foreach gparam, j in gparams_lines model_script = "train.sh"; printf("MODEL: %s", model_name); - + // TODO: Add preprocessing script results[runid] = obj_container(gparam, expid, repr(runid), model_name, candle_image, model_script); - + // TODO: Add inference script or loop to do multiple inferences on a trained model } From 8ae9d8baa3a03ca73020dc848dec7785e3899fcc Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 12 Apr 2023 22:23:49 +0000 Subject: [PATCH 511/601] o Fixes for various workflows GA, Noise, CSG on site Polaris ANL --- .../GA/data/graphdrp_param_space_ga.json | 2 +- workflows/GA/test/cfg-prm-polaris.sh | 4 +-- workflows/GA/test/cfg-sys-polaris.sh | 6 ++-- workflows/common/sh/langs-app-polaris.sh | 5 ++++ workflows/csg/test/test-polaris.sh | 29 +++++++++++++++++++ workflows/csg/test/upf-graphdrp-polaris.txt | 4 +++ workflows/uq-noise/test/cfg-sys-small.sh | 2 +- 7 files changed, 45 insertions(+), 7 deletions(-) create mode 100755 workflows/csg/test/test-polaris.sh create mode 100644 workflows/csg/test/upf-graphdrp-polaris.txt diff --git a/workflows/GA/data/graphdrp_param_space_ga.json b/workflows/GA/data/graphdrp_param_space_ga.json index 4701f84c..3421cd63 100644 --- a/workflows/GA/data/graphdrp_param_space_ga.json +++ b/workflows/GA/data/graphdrp_param_space_ga.json @@ -44,6 +44,6 @@ { "name": "epochs", "type": "constant", - "value": 2 + "value": 20 } ] diff --git a/workflows/GA/test/cfg-prm-polaris.sh b/workflows/GA/test/cfg-prm-polaris.sh index c3bf30cd..28862053 100644 --- a/workflows/GA/test/cfg-prm-polaris.sh +++ b/workflows/GA/test/cfg-prm-polaris.sh @@ -4,9 +4,9 @@ SEED=${SEED:-1} # Total iterations -NUM_ITERATIONS=${NUM_ITERATIONS:-7} +NUM_ITERATIONS=${NUM_ITERATIONS:-5} # Size of GA population (i.e. the number of parameter sets to evaluate) -POPULATION_SIZE=${POPULATION_SIZE:-4} +POPULATION_SIZE=${POPULATION_SIZE:-9} # the GA strategy: one of 'simple' or 'mu_plus_lambda'. 
See # https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms # for more info. diff --git a/workflows/GA/test/cfg-sys-polaris.sh b/workflows/GA/test/cfg-sys-polaris.sh index 147c18f9..9af5cbcf 100644 --- a/workflows/GA/test/cfg-sys-polaris.sh +++ b/workflows/GA/test/cfg-sys-polaris.sh @@ -4,13 +4,13 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-6} +export PROCS=${PROCS:-11} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-6} +export PPN=${PPN:-11} export QUEUE=${QUEUE:-debug-scaling} -export WALLTIME=${WALLTIME:-00:39:00} +export WALLTIME=${WALLTIME:-00:59:00} #export PROJECT=Candle_ECP diff --git a/workflows/common/sh/langs-app-polaris.sh b/workflows/common/sh/langs-app-polaris.sh index 25ab0278..0f6e7a72 100644 --- a/workflows/common/sh/langs-app-polaris.sh +++ b/workflows/common/sh/langs-app-polaris.sh @@ -4,3 +4,8 @@ PATH=/grand/CSC249ADOA01/public/sfw/polaris/Miniconda/bin:$PATH module load singularity + +export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 +export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 \ No newline at end of file diff --git a/workflows/csg/test/test-polaris.sh b/workflows/csg/test/test-polaris.sh new file mode 100755 index 00000000..35539d25 --- /dev/null +++ b/workflows/csg/test/test-polaris.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -eu + +# CMP-CV TEST SMALL 1 + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +# export MODEL_NAME=$1 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. 
; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +export OBJ_RETURN="val_loss" +CFG_SYS=$THIS/cfg-sys-1.sh + +# export MODEL_NAME="DrugCell" +# export CANDLE_IMAGE=/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif +export CANDLE_MODEL_TYPE="SINGULARITY" + +# model-1.txt is not used currently +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-graphdrp-polaris.txt $THIS/models-1.txt diff --git a/workflows/csg/test/upf-graphdrp-polaris.txt b/workflows/csg/test/upf-graphdrp-polaris.txt new file mode 100644 index 00000000..71224424 --- /dev/null +++ b/workflows/csg/test/upf-graphdrp-polaris.txt @@ -0,0 +1,4 @@ +{"id": "RUN001", "batch_size": 16, "epochs": 4, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} +{"id": "RUN002", "batch_size": 32, "epochs": 3, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} +{"id": "RUN003", "batch_size": 64, "epochs": 2, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} +{"id": "RUN004", "batch_size": 128, "epochs": 1, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} \ No newline at end of file diff --git a/workflows/uq-noise/test/cfg-sys-small.sh b/workflows/uq-noise/test/cfg-sys-small.sh index 9aaed891..9e63a85e 100644 --- a/workflows/uq-noise/test/cfg-sys-small.sh +++ b/workflows/uq-noise/test/cfg-sys-small.sh @@ -12,7 +12,7 @@ export PROCS=${PROCS:-3} export PPN=${PPN:-1} # For Theta: -export QUEUE=${QUEUE:-debug-cache-quad} +# export QUEUE=${QUEUE:-debug-cache-quad} # export QUEUE=R.candle export WALLTIME=${WALLTIME:-00:15:00} From f51f44342a2725445d738ec2c58bece381fd24c8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 18 Apr 2023 11:41:35 -0500 Subject: [PATCH 512/601] Initial dense-noise workflow --- workflows/dense-noise/swift/workflow.sh | 176 ++++++++++++++++++++ workflows/dense-noise/swift/workflow.swift | 59 +++++++ workflows/dense-noise/test/cfg-prm-1.sh | 4 + workflows/dense-noise/test/cfg-sys-small.sh | 36 ++++ workflows/dense-noise/test/test-1.sh | 63 +++++++ 5 files changed, 338 insertions(+) create mode 100755 workflows/dense-noise/swift/workflow.sh create mode 100644 workflows/dense-noise/swift/workflow.swift create mode 100644 workflows/dense-noise/test/cfg-prm-1.sh create mode 100644 workflows/dense-noise/test/cfg-sys-small.sh create mode 100755 workflows/dense-noise/test/test-1.sh diff --git a/workflows/dense-noise/swift/workflow.sh b/workflows/dense-noise/swift/workflow.sh new file mode 100755 index 00000000..7bd2a163 --- /dev/null +++ b/workflows/dense-noise/swift/workflow.sh @@ -0,0 +1,176 @@ +#! /usr/bin/env bash +set -eu + +# DENSE NOISE WORKFLOW +# Main entry point for DENSE-NOISE workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! 
-d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3 +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! { + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +echo "Running "$MODEL_NAME "workflow" + +source_site env $SITE +source_site sched $SITE + +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner + +export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common + +export TURBINE_JOBNAME="${EXPID}" + +if [ -z ${GPU_STRING+x} ]; +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +mkdir -pv $TURBINE_OUTPUT + +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + +CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT + -exp_id=$EXPID + -site=$SITE + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run +mkdir -pv $TURBINE_OUTPUT/data + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh + +# log_path PYTHONPATH + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion." +fi + +# Handle %-escapes in TURBINE_STDOUT +if [ $SITE == "summit" ] || \ + [ $SITE == "biowulf" ] || \ + [ $SITE == "polaris" ] +then + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +else + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" +fi + +mkdir -pv $TURBINE_OUTPUT/out + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. 
This will just be for + # warnings or unusual messages + # use for summit (slurm needs two %) + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + + #export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + mkdir -pv $TURBINE_OUTPUT/out + STDOUT="" +fi + +#echo ${CMD_LINE_ARGS[@]} + +cd $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow.swift $TURBINE_OUTPUT + +swift-t -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e TURBINE_STDOUT \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} 2>&1 \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" + exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT diff --git a/workflows/dense-noise/swift/workflow.swift b/workflows/dense-noise/swift/workflow.swift new file mode 100644 index 00000000..ea528a95 --- /dev/null +++ b/workflows/dense-noise/swift/workflow.swift @@ -0,0 +1,59 @@ +/* + UQ NOISE SWIFT + Main workflow +*/ + +import assert; +import files; +import io; +import python; +import unix; +import sys; +import string; +import location; +import math; + +string FRAMEWORK = "keras"; + +string xcorr_root = getenv("XCORR_ROOT"); +string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); +string emews_root = getenv("EMEWS_PROJECT_ROOT"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +string exp_id = argv("exp_id"); +int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +string model_name = getenv("MODEL_NAME"); + +printf("UQ NOISE WORKFLOW.SWIFT"); +printf("TURBINE_OUTPUT: " + turbine_output); + +int epochs = 1; + +int neurons[] = [500:1000:250]; + +float y_num_noises = 1; // Number of noise levels to try +float y_noise_levels[] = [0:y_num_noises]; +float noise_step = 10; // Difference between noises + +int num_trials = 1; +int trials[] = [0:num_trials-1]; + +foreach neuron in neurons +{ + foreach levely, j in y_noise_levels + { + foreach trial, k in trials + { + y_noise_level = levely * noise_step; + run_id = "%04i-%0.0f-%02i" % (neuron, y_noise_level, trial); + params = ("{ \"layer_force\" : %i , " + + " \"noise\" : %f , " + + " \"epochs\" : %i } ") % + (neuron, y_noise_level, epochs); + printf("running: %s", params); + result = obj(params, exp_id, run_id); + printf("result %s : neuron %i y_noise %0.3f : %s", + run_id, neuron, y_noise_level, result); + } + } +} diff --git a/workflows/dense-noise/test/cfg-prm-1.sh b/workflows/dense-noise/test/cfg-prm-1.sh new file mode 100644 index 00000000..b44914a5 --- /dev/null +++ b/workflows/dense-noise/test/cfg-prm-1.sh @@ -0,0 +1,4 @@ + +# CFG PRM 1 + +# Empty diff --git a/workflows/dense-noise/test/cfg-sys-small.sh b/workflows/dense-noise/test/cfg-sys-small.sh new file mode 100644 index 00000000..b4f45c39 --- /dev/null +++ b/workflows/dense-noise/test/cfg-sys-small.sh @@ -0,0 +1,36 @@ + +# CFG SYS 1 + +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEMS +# The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs +export PROCS=${PROCS:-2} + +# MPI processes per node +export PPN=${PPN:-1} + 
+export WALLTIME=${WALLTIME:-00:05:00} + +# CANDLE@ALCF: +export PROJECT=CSC249ADOA01 + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. +# If set to -1 there is no timeout. +# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . +export IGNORE_ERRORS=0 + +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +fi diff --git a/workflows/dense-noise/test/test-1.sh b/workflows/dense-noise/test/test-1.sh new file mode 100755 index 00000000..34fcdc5a --- /dev/null +++ b/workflows/dense-noise/test/test-1.sh @@ -0,0 +1,63 @@ +#!/bin/bash +set -eu + +# UQ NOISE TEST 1 + +usage() +{ + echo "Usage: test SITE RUN_DIR MODEL_NAME" + echo " RUN_DIR: use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + SITE=$1 + RUN_DIR=$2 + export MODEL_NAME=$3 +else + usage + exit 1 +fi + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-small.sh +# export CFG_SYS=$THIS/cfg-sys-big.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="BENCHMARKS" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + +# Wait for job +TURBINE_OUTPUT=$( readlink turbine-output ) +queue_wait + +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From c54222cb5f69bbd9593d11b4b2695e99a56fb7d5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 25 Apr 2023 13:33:49 -0500 Subject: [PATCH 513/601] Call everything CANDLE_MODEL_IMPL, "container" is now one of these --- workflows/GA/swift/workflow.sh | 2 +- workflows/async-search/swift/workflow.sh | 2 +- .../swift/workflow_simple_obj_app.sh | 2 +- workflows/cmp-cv/swift/workflow.sh | 4 +- workflows/common/sh/env-biowulf.sh | 2 +- workflows/common/sh/env-cori.sh | 2 +- workflows/common/sh/env-crusher.sh | 4 +- workflows/common/sh/env-default.sh | 2 +- workflows/common/sh/env-dunedin.sh | 2 +- workflows/common/sh/env-frontier.sh | 4 +- workflows/common/sh/env-gce.sh | 2 +- workflows/common/sh/env-lambda.sh | 2 +- workflows/common/sh/env-local.sh | 2 +- workflows/common/sh/env-mbook.sh | 2 +- workflows/common/sh/env-mcs.sh | 2 +- workflows/common/sh/env-pascal.sh | 2 +- workflows/common/sh/env-polaris.sh | 6 +- workflows/common/sh/env-spack.sh | 2 +- workflows/common/sh/env-spock.sh | 4 +- workflows/common/sh/env-summit-i.sh | 4 +- workflows/common/sh/env-summit-login.sh | 6 +- workflows/common/sh/env-summit-tf-2.4.1.sh | 4 +- workflows/common/sh/env-summit-tf1.sh | 4 +- workflows/common/sh/env-summit-tf2.sh | 2 +- workflows/common/sh/env-summit.sh 
| 4 +- workflows/common/sh/env-theta.sh | 2 +- workflows/common/sh/env-titan.sh | 2 +- workflows/common/sh/env-washington.sh | 4 +- workflows/common/sh/model.sh | 75 ++++++++++--------- .../swift/{obj_app.swift => model_app.swift} | 21 ++++-- workflows/common/swift/model_container.swift | 62 +++++++++++++++ .../{obj_echo.swift => model_echo.swift} | 14 +++- .../swift/{obj_py.swift => model_py.swift} | 11 ++- workflows/common/swift/obj_container.swift | 61 --------------- workflows/cp-leaveout/swift/baseline-error.sh | 2 +- .../cp-leaveout/swift/cpl-upf-workflow.sh | 2 +- workflows/cp-leaveout/swift/workflow-tic.sh | 2 +- workflows/cp-leaveout/swift/workflow.sh | 2 +- workflows/cp1/swift/infer_workflow.sh | 2 +- workflows/cp1/swift/upf_workflow.sh | 2 +- workflows/cp1/swift/workflow.sh | 2 +- workflows/csg/swift/workflow.sh | 4 +- workflows/dense-noise/swift/workflow.sh | 54 +++++++------ workflows/dense-noise/swift/workflow.swift | 39 ++++++---- workflows/dense-noise/test/cfg-sys-small.sh | 9 ++- workflows/dense-noise/test/test-1.sh | 15 +--- workflows/grid/swift/workflow.sh | 2 +- workflows/mlrMBO/swift/workflow.sh | 2 +- workflows/upf/swift/workflow.sh | 2 +- .../uq-noise/swift/workflow-abstention.sh | 4 +- .../uq-noise/swift/workflow-gauss-abs.sh | 4 +- workflows/uq-noise/swift/workflow-gnoise.sh | 4 +- workflows/uq-noise/swift/workflow-noise.sh | 4 +- workflows/uq-noise/swift/workflow.sh | 4 +- 54 files changed, 254 insertions(+), 233 deletions(-) rename workflows/common/swift/{obj_app.swift => model_app.swift} (68%) create mode 100644 workflows/common/swift/model_container.swift rename workflows/common/swift/{obj_echo.swift => model_echo.swift} (64%) rename workflows/common/swift/{obj_py.swift => model_py.swift} (82%) delete mode 100644 workflows/common/swift/obj_container.swift diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 9bf7adeb..4619ef0f 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -124,7 +124,7 @@ mkdir -pv $TURBINE_OUTPUT/run # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/async-search/swift/workflow.sh b/workflows/async-search/swift/workflow.sh index 4f0b5879..60c3da05 100755 --- a/workflows/async-search/swift/workflow.sh +++ b/workflows/async-search/swift/workflow.sh @@ -117,7 +117,7 @@ cp $WORKFLOWS_ROOT/async-search/python/$PY_PACKAGE.py $CFG_SYS $CFG_PRM $TURBINE # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/async-search/swift/workflow_simple_obj_app.sh b/workflows/async-search/swift/workflow_simple_obj_app.sh index 1ad012e5..9e1ec741 100755 --- a/workflows/async-search/swift/workflow_simple_obj_app.sh +++ b/workflows/async-search/swift/workflow_simple_obj_app.sh @@ -110,7 +110,7 @@ cp $CFG_SYS $CFG_PRM $TURBINE_OUTPUT # Allow the user to set an objective function # OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -# OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +# OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export 
MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh index 37fc69f0..bffb2625 100755 --- a/workflows/cmp-cv/swift/workflow.sh +++ b/workflows/cmp-cv/swift/workflow.sh @@ -84,7 +84,7 @@ fi export CANDLE_IMAGE=${CANDLE_IMAGE:-} -export SWIFT_IMPL=container +export CANDLE_MODEL_IMPL=container which swift-t @@ -93,7 +93,7 @@ swift-t -n $PROCS \ ${MACHINE:-} \ -p \ -I $WORKFLOWS_ROOT/common/swift \ - -i obj_$SWIFT_IMPL \ + -i model_$CANDLE_MODEL_IMPL \ -e BENCHMARKS_ROOT \ -e CANDLE_PROJECT_ROOT \ -e MODEL_SH \ diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index c53c697f..b7be0362 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -50,7 +50,7 @@ export TURBINE_HOME="$SWIFT_T_INSTALL/turbine" export TURBINE_LOG="1" export ADLB_DEBUG_RANKS="1" export ADLB_DEBUG_HOSTMAP="1" -export SWIFT_IMPL="app" +export CANDLE_MODEL_IMPL="app" # Resident task workers and ranks if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ]; then # if $TURBINE_RESIDENT_WORK_WORKERS is unset... diff --git a/workflows/common/sh/env-cori.sh b/workflows/common/sh/env-cori.sh index 92a64f39..f9ee9323 100644 --- a/workflows/common/sh/env-cori.sh +++ b/workflows/common/sh/env-cori.sh @@ -16,7 +16,7 @@ SWIFT=/global/homes/w/wozniak/Public/sfw/compute/swift-t-2018-06-05 export PATH=$SWIFT/stc/bin:$PATH # On Cori, we have a good Swift/T Python embedded interpreter, # but we use app anyway -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="app" # Python PYTHON=/global/common/cori/software/python/2.7-anaconda/envs/deeplearning diff --git a/workflows/common/sh/env-crusher.sh b/workflows/common/sh/env-crusher.sh index 927693dc..e40081e4 100644 --- a/workflows/common/sh/env-crusher.sh +++ b/workflows/common/sh/env-crusher.sh @@ -1,8 +1,8 @@ # ENV Crusher -# SWIFT_IMPL=echo -SWIFT_IMPL=py +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py # CANDLE software installation root: MED106=/gpfs/alpine/world-shared/med106 diff --git a/workflows/common/sh/env-default.sh b/workflows/common/sh/env-default.sh index 9b575c81..9bfae8e9 100644 --- a/workflows/common/sh/env-default.sh +++ b/workflows/common/sh/env-default.sh @@ -5,7 +5,7 @@ export PYTHONPATH=${EMEWS_PROJECT_ROOT}/python:${PYTHONPATH:-} -SWIFT_IMPL=app +CANDLE_MODEL_IMPL=app # Resident task workers and ranks if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ] diff --git a/workflows/common/sh/env-dunedin.sh b/workflows/common/sh/env-dunedin.sh index 66e0d161..57c4c958 100644 --- a/workflows/common/sh/env-dunedin.sh +++ b/workflows/common/sh/env-dunedin.sh @@ -16,7 +16,7 @@ export R_HOME=/home/wozniak/Public/sfw/R-3.5.3/lib/R # Swift/T export PATH=/home/wozniak/Public/sfw/swift-t/stc/bin:$PATH -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="app" # EMEWS Queues for R # EQR=/opt/EQ-R diff --git a/workflows/common/sh/env-frontier.sh b/workflows/common/sh/env-frontier.sh index 5ddfcaf3..c9e0283d 100644 --- a/workflows/common/sh/env-frontier.sh +++ b/workflows/common/sh/env-frontier.sh @@ -1,8 +1,8 @@ # ENV Frontier -# SWIFT_IMPL=echo -SWIFT_IMPL=py +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py # From Wozniak # MED106=/gpfs/alpine/world-shared/med106 diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh index 99062319..f2f04e07 100644 --- a/workflows/common/sh/env-gce.sh +++ b/workflows/common/sh/env-gce.sh @@ -22,7 +22,7 @@ export LD_LIBRARY_PATH=$SFW/R-4.1.0/lib/R/lib export PYTHONPATH=${PYTHONPATH:-} EQR=$SFW/EQ-R -SWIFT_IMPL="app" 
+CANDLE_MODEL_IMPL="app" # For test output processing: export LOCAL=1 diff --git a/workflows/common/sh/env-lambda.sh b/workflows/common/sh/env-lambda.sh index fc49566b..c2cbb963 100644 --- a/workflows/common/sh/env-lambda.sh +++ b/workflows/common/sh/env-lambda.sh @@ -17,7 +17,7 @@ PATH=$PY/bin:$PATH export LD_LIBRARY_PATH=$R/lib/R/lib:${LD_LIBRARY_PATH:-} # How to run CANDLE models: -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="app" # PYTHONPATH=$EQPY/src:${PYTHONPATH:-} diff --git a/workflows/common/sh/env-local.sh b/workflows/common/sh/env-local.sh index 7646d78d..d7a9de09 100644 --- a/workflows/common/sh/env-local.sh +++ b/workflows/common/sh/env-local.sh @@ -11,7 +11,7 @@ export PYTHONPATH=${PYTHONPATH:-}${PYTHONPATH:+:} PYTHONPATH+=$WORKFLOWS_ROOT/common/python: export PATH=$SWIFT_T/stc/bin:$PATH -SWIFT_IMPL="py" +CANDLE_MODEL_IMPL="py" # EMEWS Queues for R EQR=$WORKFLOWS_ROOT/common/ext/EQ-R diff --git a/workflows/common/sh/env-mbook.sh b/workflows/common/sh/env-mbook.sh index a69a0e06..fa2709fc 100644 --- a/workflows/common/sh/env-mbook.sh +++ b/workflows/common/sh/env-mbook.sh @@ -16,7 +16,7 @@ PATH=$PY/bin:$PATH export LD_LIBRARY_PATH=/Library/Frameworks/R.framework/Resources/lib/:${LD_LIBRARY_PATH:-} # How to run CANDLE models: -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="app" # PYTHONPATH=$EQPY/src:${PYTHONPATH:-} diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index 42810d0d..07a27b31 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -19,7 +19,7 @@ PYTHONPATH+=$WORKFLOWS_ROOT/common/python: export PATH=$SWIFT_T/turbine/bin:$SWIFT_T/stc/bin:$PATH echo $PATH -SWIFT_IMPL="py" +CANDLE_MODEL_IMPL="py" # EMEWS Queues for R # EQR=$WORKFLOWS_ROOT/common/ext/EQ-R diff --git a/workflows/common/sh/env-pascal.sh b/workflows/common/sh/env-pascal.sh index 11f1d710..22583292 100644 --- a/workflows/common/sh/env-pascal.sh +++ b/workflows/common/sh/env-pascal.sh @@ -5,7 +5,7 @@ if [ -z "$SUPERVISOR_HOME" ]; then echo "SUPERVISOR_HOME is blank"; else echo "SUPERVISOR_HOME is set to '$SUPERVISOR_HOME'"; fi source ${SUPERVISOR_HOME}/spack/loads -SWIFT_IMPL=app +CANDLE_MODEL_IMPL=app # EMEWS Queues for R EQR=$(spack location -i eqr) diff --git a/workflows/common/sh/env-polaris.sh b/workflows/common/sh/env-polaris.sh index 95da0749..2eb56929 100644 --- a/workflows/common/sh/env-polaris.sh +++ b/workflows/common/sh/env-polaris.sh @@ -1,12 +1,12 @@ # ENV Polaris -# SWIFT_IMPL=echo -SWIFT_IMPL=app +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=app CSC249=/lus/grand/projects/CSC249ADOA01 ROOT=$CSC249/public/sfw/polaris -SWIFT=$ROOT/swift-t/2022-12-16 +SWIFT=$ROOT/swift-t/2023-04-19 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH diff --git a/workflows/common/sh/env-spack.sh b/workflows/common/sh/env-spack.sh index 1ac6d916..c70f9972 100644 --- a/workflows/common/sh/env-spack.sh +++ b/workflows/common/sh/env-spack.sh @@ -34,7 +34,7 @@ fi TURBINE_PY=$( readlink --canonicalize $TURBINE_PY_LIB/.. 
) PATH=$TURBINE_PY/bin:$PATH -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="app" # # EMEWS Queues for R # EQR=$( spack find -p eqr | sed -n 's/eqr@[.0-9]*//p' ) diff --git a/workflows/common/sh/env-spock.sh b/workflows/common/sh/env-spock.sh index d9d247c8..e9136d4a 100644 --- a/workflows/common/sh/env-spock.sh +++ b/workflows/common/sh/env-spock.sh @@ -1,8 +1,8 @@ # ENV Spock -# SWIFT_IMPL=echo -SWIFT_IMPL=py +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py # CANDLE software installation root: MED106=/gpfs/alpine/world-shared/med106 diff --git a/workflows/common/sh/env-summit-i.sh b/workflows/common/sh/env-summit-i.sh index 34afbbbb..3b604156 100644 --- a/workflows/common/sh/env-summit-i.sh +++ b/workflows/common/sh/env-summit-i.sh @@ -3,8 +3,8 @@ # Environment settings for Summit (Swift, Python, R, Tcl, etc.) # GCC 6.4.0, TensorFlow 1, condaenv-200408, R 3.6.1 -# SWIFT_IMPL=echo -SWIFT_IMPL=py +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py # Let modules initialize LD_LIBRARY_PATH before changing it: set +eu # modules create errors outside our control diff --git a/workflows/common/sh/env-summit-login.sh b/workflows/common/sh/env-summit-login.sh index 0c58119c..83ad938a 100644 --- a/workflows/common/sh/env-summit-login.sh +++ b/workflows/common/sh/env-summit-login.sh @@ -2,9 +2,9 @@ # ENV SUMMIT LOGIN # Environment settings for Summit login node (Swift, Python, R, Tcl, etc.) -SWIFT_IMPL=echo -# SWIFT_IMPL=app -# SWIFT_IMPL=py +CANDLE_MODEL_IMPL=echo +# CANDLE_MODEL_IMPL=app +# CANDLE_MODEL_IMPL=py # Load basic LD_LIBRARY_PATH before changing it: # module load gcc/7.4.0 diff --git a/workflows/common/sh/env-summit-tf-2.4.1.sh b/workflows/common/sh/env-summit-tf-2.4.1.sh index a30e7b14..c93df7fa 100644 --- a/workflows/common/sh/env-summit-tf-2.4.1.sh +++ b/workflows/common/sh/env-summit-tf-2.4.1.sh @@ -2,8 +2,8 @@ # ENV Summit - TF 2.4.1 # Environment settings for Summit (Swift, Python, R, Tcl, etc.) -# SWIFT_IMPL=echo -SWIFT_IMPL=py +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py # Let modules initialize LD_LIBRARY_PATH before changing it: set +eu # modules create errors outside our control diff --git a/workflows/common/sh/env-summit-tf1.sh b/workflows/common/sh/env-summit-tf1.sh index 8427dcac..39468958 100644 --- a/workflows/common/sh/env-summit-tf1.sh +++ b/workflows/common/sh/env-summit-tf1.sh @@ -4,8 +4,8 @@ # Environment settings for Summit (Swift, Python, R, Tcl, etc.) # GCC 6.4.0, TensorFlow 1, condaenv-200408, R 3.6.1 -# SWIFT_IMPL=echo -SWIFT_IMPL=py +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py # Let modules initialize LD_LIBRARY_PATH before changing it: set +eu # modules create errors outside our control diff --git a/workflows/common/sh/env-summit-tf2.sh b/workflows/common/sh/env-summit-tf2.sh index d5607c17..f469053d 100644 --- a/workflows/common/sh/env-summit-tf2.sh +++ b/workflows/common/sh/env-summit-tf2.sh @@ -4,7 +4,7 @@ # Environment settings for Summit (Swift, Python, R, Tcl, etc.) 
# GCC 8.3.1, TensorFlow 2.4.1, opence 1.2.0-py38-0, R 3.6.1 -SWIFT_IMPL=py +CANDLE_MODEL_IMPL=py # Let modules initialize LD_LIBRARY_PATH before changing it: set +eu # modules create errors outside our control diff --git a/workflows/common/sh/env-summit.sh b/workflows/common/sh/env-summit.sh index a31dcf79..bb9ee199 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -1,8 +1,8 @@ # ENV Summit -# SWIFT_IMPL=echo -SWIFT_IMPL=py +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py # Let modules initialize LD_LIBRARY_PATH before changing it: set +eu # modules create errors outside our control diff --git a/workflows/common/sh/env-theta.sh b/workflows/common/sh/env-theta.sh index 79c7078d..e3120a4d 100644 --- a/workflows/common/sh/env-theta.sh +++ b/workflows/common/sh/env-theta.sh @@ -58,7 +58,7 @@ fi # Selects the *.swift files to include # If "app", use app functions # If "py", use in-memory Python functions -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="app" # Log settings to output echo "Programs:" diff --git a/workflows/common/sh/env-titan.sh b/workflows/common/sh/env-titan.sh index 2ac5f1b0..caa1047a 100644 --- a/workflows/common/sh/env-titan.sh +++ b/workflows/common/sh/env-titan.sh @@ -1,4 +1,4 @@ -SWIFT_IMPL=app +CANDLE_MODEL_IMPL=app export R=/ccs/proj/med106/gounley1/titan/R-3.2.1/lib64/R export PY=/sw/xk6/deeplearning/1.0/sles11.3_gnu4.9.3 export LD_LIBRARY_PATH=$PY/lib:$R/lib:$LD_LIBRARY_PATH diff --git a/workflows/common/sh/env-washington.sh b/workflows/common/sh/env-washington.sh index 56d87ace..02dcc2e9 100644 --- a/workflows/common/sh/env-washington.sh +++ b/workflows/common/sh/env-washington.sh @@ -19,8 +19,8 @@ PATH=$R/bin:$PATH # Swift/T export PATH=/homes/wozniak/Public/sfw/swift-t/2019-05-23/stc/bin:$PATH -# SWIFT_IMPL="app" # use this one for real runs -SWIFT_IMPL="echo" # use this one to debug the model.sh command line +# CANDLE_MODEL_IMPL="app" # use this one for real runs +CANDLE_MODEL_IMPL="echo" # use this one to debug the model.sh command line # EMEWS Queues for R # EQR=/opt/EQ-R diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 81832093..1f816251 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -3,7 +3,8 @@ set -eu # MODEL.SH -# Shell wrapper around Keras model +# Supervisor shell wrapper around CANDLE model +# Used for CANDLE_MODEL_IMPL types: "app" and "container" # Note that APP_PYTHONPATH is used by models here and # not just PYTHONPATH @@ -16,45 +17,47 @@ set -eu usage() { - echo "Usage: model.sh FRAMEWORK PARAMS EXPID RUNID" + echo "Usage: model.sh FRAMEWORK PARAMS EXPID RUNID MODEL_TYPE MODEL_NAME MODEL_ACTION" + echo "MODEL_TYPE is BENCHMARK or SINGULARITY" + echo "MODEL_NAME is the CANDLE Benchmark name (e.g., 'uno')" + echo " or a /path/to/image.sif" + echo "MODEL_ACTION is unused for a Benchmark," + echo " for Singularity it is a script (e.g., 'ACTION.sh')" echo "The environment should have:" - echo " EMEWS_PROJECT_ROOT|WORKFLOWS_ROOT TURBINE_OUTPUT" - echo " SITE OBJ_RETURN BENCHMARK_TIMEOUT" - echo " and MODEL_NAME EXPID for model_runner.py" + echo " EMEWS_PROJECT_ROOT|WORKFLOWS_ROOT TURBINE_OUTPUT" + echo " SITE OBJ_RETURN BENCHMARK_TIMEOUT" + echo " CANDLE_DATA_DIR" echo "If SH_TIMEOUT is set, we run under the shell command timeout" } -if (( ${#} < 4 )) +set -x +if (( ${#} != 7 )) then - echo "Wrong number of arguments: received ${#} , required: at least 4" + echo + echo "model.sh: Wrong number of arguments: received ${#} , required: 7" + echo usage exit 1 fi -if (( 
${#} >= 4 )) - then - FRAMEWORK=$1 # Usually "keras" or "pytorch" - # JSON string of parameters: - PARAMS="$2" - EXPID=$3 - RUNID=$4 -fi - -if (( ${#} == 7 )) - then - export MODEL_NAME=$5 - export CANDLE_IMAGE=$6 - export MODEL_SCRIPT=$7 -fi +FRAMEWORK=$1 # Usually "keras" or "pytorch" +# JSON string of parameters: +PARAMS="$2" +export EXPID=$3 +export RUNID=$4 +export MODEL_TYPE=$5 +export MODEL_NAME=$6 +export MODEL_ACTION=$7 # Each model run, runs in its own "instance" directory # Set instance_directory to that and cd into it. # # TODO: rename INSTANCE_DIRECTORY to OUTPUT_DIR #set -x -if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] +if [[ $MODEL_TYPE = "SINGULARITY" ]] then # TODO: Rename "instance" to "run" - INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/$MODEL_NAME/Output/$EXPID/$RUNID + MODEL_TOKEN=$( basename $MODEL_NAME .sif ) + INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/$MODEL_TOKEN/Output/$EXPID/$RUNID INTERNAL_DIRECTORY=$MODEL_NAME/Output/$EXPID/$RUNID else # "BENCHMARKS" INSTANCE_DIRECTORY=$TURBINE_OUTPUT/$RUNID @@ -86,7 +89,7 @@ log "MODEL_NAME: $MODEL_NAME" log "RUNID: $RUNID" log "HOST: $( hostname )" log "ADLB_RANK_OFFSET: $ADLB_RANK_OFFSET" -# log "CANDLE_MODEL_TYPE: $CANDLE_MODEL_TYPE" +log "MODEL_TYPE: $MODEL_TYPE" # Source langs-app-{SITE} from workflow/common/sh/ (cf. utils.sh) if [[ ${WORKFLOWS_ROOT:-} == "" ]] @@ -113,8 +116,8 @@ show PYTHONHOME # Set up PYTHONPATH for app tasks export PYTHONPATH=${APP_PYTHONPATH:-}:${PYTHONPATH:-} -# Construct the desired model command MODEL_CMD based on CANDLE_MODEL_TYPE: -if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] +# Construct the desired model command MODEL_CMD based on MODEL_TYPE: +if [[ ${MODEL_TYPE:-} == "SINGULARITY" ]] then # No model_runner, need to write parameters.txt explicitly: @@ -129,11 +132,13 @@ then # The Singularity command line arguments: MODEL_CMD=( singularity exec --nv --bind $CANDLE_DATA_DIR:/candle_data_dir - $CANDLE_IMAGE $MODEL_SCRIPT $ADLB_RANK_OFFSET + $MODEL_NAME ${MODEL_ACTION}.sh $ADLB_RANK_OFFSET /candle_data_dir $FLAGS # $INTERNAL_DIRECTORY/parameters.txt --experiment_id $EXPID - --run_id $RUNID) + --run_id $RUNID + ) + else # "BENCHMARKS" # The Python command line arguments: @@ -154,17 +159,17 @@ log "MODEL_CMD: ${MODEL_CMD[@]}" $TIMEOUT_CMD "${MODEL_CMD[@]}" & PID=$! 
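# For reference, with MODEL_TYPE=SINGULARITY the MODEL_CMD assembled above
# expands to a container invocation roughly like the following sketch
# (the image path is one used in the csg test UPFs; the rank offset, IDs,
# and flag values are illustrative only, and the usual Supervisor
# environment such as CANDLE_DATA_DIR and ADLB_RANK_OFFSET is assumed):
#
#   singularity exec --nv --bind $CANDLE_DATA_DIR:/candle_data_dir \
#       /software/improve/images/GraphDRP.sif train.sh 0 /candle_data_dir \
#       --epochs 4 --experiment_id EXP000 --run_id RUN001
#
# where train.sh comes from MODEL_ACTION, 0 is the worker's ADLB_RANK_OFFSET,
# and the trailing flags are the hyperparameters plus the experiment/run IDs.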
-if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] +if [[ ${MODEL_TYPE:-} == "SINGULARITY" ]] then wait $PID ls -ltrh sleep 1 # Wait for initial output - # Get last results of the format "IMPROVE_RESULT xxx" in model.log - # NOTE: Enabling set -x will break the following - RES=$(awk -v FS="IMPROVE_RESULT" 'NF>1 {x=$2} END {print x}' model.log) - echo $RES + # Get last results of the format "CANDLE_RESULT xxx" in model.log + # NOTE: Enabling set -x will break the following (token CANDLE_RESULT) + RES=$( awk -v FS="CANDLE_RESULT" 'NF>1 {x=$2} END {print x}' \ + $INSTANCE_DIRECTORY/model.log ) RESULT="$(echo $RES | grep -Eo '[+-]?[0-9]+([.][0-9]+)?')" || true - echo $RESULT, ": Result" + echo "CANDLE RESULT: '$RESULT'" echo $RESULT > $INSTANCE_DIRECTORY/result.txt else wait $PID diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/model_app.swift similarity index 68% rename from workflows/common/swift/obj_app.swift rename to workflows/common/swift/model_app.swift index 2adb4788..5dce30be 100644 --- a/workflows/common/swift/obj_app.swift +++ b/workflows/common/swift/model_app.swift @@ -1,19 +1,24 @@ -// OBJ APP +/** + CANDLE MODEL: APP + Runs CANDLE models as Swift/T app functions +*/ /** The main objective function used by the CANDLE/Supervisor model exploration (optimization) loop. params : The JSON string of params to be passed to the Benchmark - run_id : A string run ID that will be the output directory name + expid : A string experiment ID that will be in the output directory name + runid : A string run ID that will be in the output directory name + model_name : Benchmark (e.g., "uno") */ -(string obj_result) obj(string params, - string expid, - string runid) +(string obj_result) candle_model_train(string params, + string expid, + string runid, + string model_name) { string model_sh = getenv("MODEL_SH"); string turbine_output = getenv("TURBINE_OUTPUT"); - string model_name = getenv("MODEL_NAME"); string outdir; @@ -38,8 +43,8 @@ app (void o) run_model (string model_sh, string params, string expid, string runid) { - // 1 2 3 4 - "bash" model_sh FRAMEWORK params expid runid; + // 1 2 3 4 5 6 7 + "bash" model_sh FRAMEWORK params expid runid "BENCHMARK" model_name "train"; } /** diff --git a/workflows/common/swift/model_container.swift b/workflows/common/swift/model_container.swift new file mode 100644 index 00000000..c017c955 --- /dev/null +++ b/workflows/common/swift/model_container.swift @@ -0,0 +1,62 @@ + +/** + CANDLE MODEL: CONTAINER + Runs CANDLE models as Swift/T app functions + under a Singularity container +*/ + +/** + The main objective function used by the CANDLE/Supervisor + model exploration (optimization) loop. 
+ params : The JSON string of params to be passed to the Benchmark + run_id : A string run ID that will be the output directory name + model_name : A path to a SIF +*/ +(string obj_result) candle_model_train(string params, + string expid, + string runid, + string model_name) +{ + CDD = getenv("CANDLE_DATA_DIR"); + model_sh = getenv("MODEL_SH"); + + model_token = rootname_string(basename_string(model_name)); + outdir = "%s/%s/Output/%s/%s" % (CDD, model_token, expid, runid); + printf("obj_container(): running in: %s", outdir); + + // We do not use a file type here because this file may not be created, + // which is handled by get_results() + result_file = outdir/"result.txt"; + wait (run_model_train(model_sh, params, expid, runid, model_name)) + { + obj_result = get_results(result_file); + } + printf("model_train_container(): result(%s): '%s'", + runid, obj_result); +} + +/** + Swift/T app function that runs the Benchmark +*/ +app (void o) run_model_train(string model_sh, string params, + string expid, string runid, + string model_name) +{ + // 1 2 3 4 5 6 7 + "bash" model_sh FRAMEWORK params expid runid "SINGULARITY" model_name "train"; +} + +/** + Extracts the Benchmark output if it exists, + else, provides a NaN so the workflow can keep running +*/ +(string model_result) get_results(string result_file) { + if (file_exists(result_file)) { + file line = input(result_file); + model_result = trim(read(line)); + } else { + printf("File not found: %s", result_file); + // return with a large value + model_result = "1e7"; + } +} diff --git a/workflows/common/swift/obj_echo.swift b/workflows/common/swift/model_echo.swift similarity index 64% rename from workflows/common/swift/obj_echo.swift rename to workflows/common/swift/model_echo.swift index d72c492e..7a8bb50e 100644 --- a/workflows/common/swift/obj_echo.swift +++ b/workflows/common/swift/model_echo.swift @@ -1,5 +1,8 @@ -// OBJ ECHO +/** + CANDLE MODEL: CONTAINER + Pretends to run CANDLE models, actually just echoes its arguments +*/ /** This has the same signature as the main objective function @@ -8,8 +11,11 @@ params : The JSON string of params to be passed to the Benchmark run_id : A string run ID that will be the output directory name */ -(string obj_result) obj(string params_in, - string run_id) { +(string model_result) candle_model_train(string params, + string expid, + string runid, + string model_name) +{ string model_sh = getenv("MODEL_SH"); string turbine_output = getenv("TURBINE_OUTPUT"); @@ -18,5 +24,5 @@ // 1 2 3 printf("bash model.sh %s %s %s in: %s", FRAMEWORK, params, run_id, turbine_output) => - obj_result = "ECHO SUCCESS"; + model_result = "ECHO SUCCESS"; } diff --git a/workflows/common/swift/obj_py.swift b/workflows/common/swift/model_py.swift similarity index 82% rename from workflows/common/swift/obj_py.swift rename to workflows/common/swift/model_py.swift index 3229350a..c131055f 100644 --- a/workflows/common/swift/obj_py.swift +++ b/workflows/common/swift/model_py.swift @@ -1,6 +1,7 @@ /** - OBJ PY SWIFT + CANDLE MODEL: PY + Runs CANDLE models as Swift/T python() functions */ string code_template = @@ -45,9 +46,11 @@ except Exception as e: obj_result = 'EXCEPTION' ----; -(string obj_result) obj(string params, - string expid, - string runid) { +(string obj_result) candle_model_train(string params, + string expid, + string runid, + string model_name) +{ string outdir = "%s/run/%s" % (turbine_output, runid); string code = code_template % (outdir, params, model_name, expid, runid, benchmark_timeout); diff --git 
a/workflows/common/swift/obj_container.swift b/workflows/common/swift/obj_container.swift deleted file mode 100644 index bfeee236..00000000 --- a/workflows/common/swift/obj_container.swift +++ /dev/null @@ -1,61 +0,0 @@ - -// OBJ APP - -/** - The main objective function used by the CANDLE/Supervisor - model exploration (optimization) loop. - params : The JSON string of params to be passed to the Benchmark - run_id : A string run ID that will be the output directory name -*/ -(string obj_result) obj_container(string params, - string expid, - string runid, - string model_name, - string candle_image, - string model_script) -{ - string model_sh = getenv("MODEL_SH"); - string turbine_output = getenv("TURBINE_OUTPUT"); - // string model_name = getenv("MODEL_NAME"); - - string outdir; - - outdir = "%s/%s" % (turbine_output, runid); - // outdir = "%s/%s/Output/%s/%s" % (turbine_output, model_name, expid, runid); - - printf("obj_app: running model shell script in: %s", outdir); - - // We do not use a file type here because this file may not be created, - // which is handled by get_results() - string result_file = outdir/"result.txt"; - wait (run_model(model_sh, params, expid, runid, model_name, candle_image, model_script)) - { - obj_result = get_results(result_file); - } - printf("obj_app: result(%s): '%s'", runid, obj_result); -} - -/** - Swift/T app function that runs the Benchmark -*/ -app (void o) run_model (string model_sh, string params, - string expid, string runid, string model_name, string candle_image, string model_script) -{ - // 1 2 3 4 5 6 - "bash" model_sh FRAMEWORK params expid runid model_name candle_image model_script; -} - -/** - Extracts the Benchmark output if it exists, - else, provides a NaN so the workflow can keep running -*/ -(string obj_result) get_results(string result_file) { - if (file_exists(result_file)) { - file line = input(result_file); - obj_result = trim(read(line)); - } else { - printf("File not found: %s", result_file); - // return with a large value - obj_result = "1e7"; - } -} diff --git a/workflows/cp-leaveout/swift/baseline-error.sh b/workflows/cp-leaveout/swift/baseline-error.sh index 66023e12..52287243 100755 --- a/workflows/cp-leaveout/swift/baseline-error.sh +++ b/workflows/cp-leaveout/swift/baseline-error.sh @@ -125,7 +125,7 @@ mkdir -p $TURBINE_OUTPUT/run # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp-leaveout/swift/cpl-upf-workflow.sh b/workflows/cp-leaveout/swift/cpl-upf-workflow.sh index 8eced6d6..ac475a29 100755 --- a/workflows/cp-leaveout/swift/cpl-upf-workflow.sh +++ b/workflows/cp-leaveout/swift/cpl-upf-workflow.sh @@ -126,7 +126,7 @@ mkdir -p $TURBINE_OUTPUT/run # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp-leaveout/swift/workflow-tic.sh b/workflows/cp-leaveout/swift/workflow-tic.sh index 8206830c..f1229f86 100755 --- a/workflows/cp-leaveout/swift/workflow-tic.sh +++ b/workflows/cp-leaveout/swift/workflow-tic.sh @@ -117,7 +117,7 @@ mkdir -p $TURBINE_OUTPUT/run # Allow the user to set an objective function 
OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index c86c0a8f..92b6923b 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -150,7 +150,7 @@ mkdir -p $TURBINE_OUTPUT/run # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp1/swift/infer_workflow.sh b/workflows/cp1/swift/infer_workflow.sh index be860652..3f5a05bc 100755 --- a/workflows/cp1/swift/infer_workflow.sh +++ b/workflows/cp1/swift/infer_workflow.sh @@ -102,7 +102,7 @@ mkdir -pv $TURBINE_OUTPUT/data # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp1/swift/upf_workflow.sh b/workflows/cp1/swift/upf_workflow.sh index db3e3e95..929dfada 100755 --- a/workflows/cp1/swift/upf_workflow.sh +++ b/workflows/cp1/swift/upf_workflow.sh @@ -108,7 +108,7 @@ mkdir -pv $XCORR_DATA_DIR # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp1/swift/workflow.sh b/workflows/cp1/swift/workflow.sh index e487c621..ce5052ec 100755 --- a/workflows/cp1/swift/workflow.sh +++ b/workflows/cp1/swift/workflow.sh @@ -152,7 +152,7 @@ mkdir -pv $TURBINE_OUTPUT/hpo_log # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/csg/swift/workflow.sh b/workflows/csg/swift/workflow.sh index 37fc69f0..bffb2625 100755 --- a/workflows/csg/swift/workflow.sh +++ b/workflows/csg/swift/workflow.sh @@ -84,7 +84,7 @@ fi export CANDLE_IMAGE=${CANDLE_IMAGE:-} -export SWIFT_IMPL=container +export CANDLE_MODEL_IMPL=container which swift-t @@ -93,7 +93,7 @@ swift-t -n $PROCS \ ${MACHINE:-} \ -p \ -I $WORKFLOWS_ROOT/common/swift \ - -i obj_$SWIFT_IMPL \ + -i model_$CANDLE_MODEL_IMPL \ -e BENCHMARKS_ROOT \ -e CANDLE_PROJECT_ROOT \ -e MODEL_SH \ diff --git a/workflows/dense-noise/swift/workflow.sh b/workflows/dense-noise/swift/workflow.sh index 7bd2a163..ba67cb6d 100755 --- a/workflows/dense-noise/swift/workflow.sh +++ b/workflows/dense-noise/swift/workflow.sh @@ -47,7 +47,7 @@ then exit 1 fi -echo "Running "$MODEL_NAME "workflow" +echo "workflow.sh start: MODEL_NAME=$MODEL_NAME" source_site env $SITE source_site sched $SITE @@ -55,10 +55,14 @@ source_site sched $SITE # Set PYTHONPATH for BENCHMARK related stuff PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner -export 
APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common - export TURBINE_JOBNAME="${EXPID}" +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + echo "workflow.sh: CANDLE_DATA_DIR is not set!" + exit 1 +fi + if [ -z ${GPU_STRING+x} ]; then GPU_ARG="" @@ -86,8 +90,8 @@ mkdir -pv $TURBINE_OUTPUT/data # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -SWIFT_IMPL="py" -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +CANDLE_MODEL_IMPL="container" +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh @@ -103,18 +107,8 @@ then echo "Turbine will wait for job completion." fi -# Handle %-escapes in TURBINE_STDOUT -if [ $SITE == "summit" ] || \ - [ $SITE == "biowulf" ] || \ - [ $SITE == "polaris" ] -then - export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" -else - export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" -fi - +# Output handline mkdir -pv $TURBINE_OUTPUT/out - if [[ ${MACHINE:-} == "" ]] then STDOUT=$TURBINE_OUTPUT/output.txt @@ -125,20 +119,22 @@ then ln -s $TURBINE_OUTPUT turbine-output else # When running on a scheduled system, Swift/T automatically redirects - # stdout to the turbine-output directory. This will just be for - # warnings or unusual messages - # use for summit (slurm needs two %) - export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" - - #export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" - mkdir -pv $TURBINE_OUTPUT/out + # stdout to the turbine-output directory. + # Some systems do % interpretation in environment variables, + # we escape them in TURBINE_STDOUT here: + if [[ $SITE == "summit" ]] || \ + [[ $SITE == "biowulf" ]] || \ + [[ $SITE == "polaris" ]] + then + : # export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + else + : # export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + fi STDOUT="" fi -#echo ${CMD_LINE_ARGS[@]} - cd $TURBINE_OUTPUT -cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow.swift $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $TURBINE_OUTPUT swift-t -n $PROCS \ ${MACHINE:-} \ @@ -153,6 +149,7 @@ swift-t -n $PROCS \ $( python_envs ) \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ -e OBJ_RETURN \ + -e CANDLE_DATA_DIR \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ -e MODEL_SH \ @@ -163,14 +160,13 @@ swift-t -n $PROCS \ -e SH_TIMEOUT \ -e IGNORE_ERRORS \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} 2>&1 \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} 2>&1 | \ tee $STDOUT - if (( ${PIPESTATUS[0]} )) then echo "workflow.sh: swift-t exited with error!" 
exit 1 fi -# echo "EXIT CODE: 0" | tee -a $STDOUT +echo "JOB OK" | tee -a $STDOUT diff --git a/workflows/dense-noise/swift/workflow.swift b/workflows/dense-noise/swift/workflow.swift index ea528a95..cb281e38 100644 --- a/workflows/dense-noise/swift/workflow.swift +++ b/workflows/dense-noise/swift/workflow.swift @@ -24,20 +24,35 @@ string exp_id = argv("exp_id"); int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); string model_name = getenv("MODEL_NAME"); -printf("UQ NOISE WORKFLOW.SWIFT"); +printf("DENSE NOISE WORKFLOW.SWIFT"); printf("TURBINE_OUTPUT: " + turbine_output); +printf("model_name: " + model_name); -int epochs = 1; +int epochs = 10; -int neurons[] = [500:1000:250]; +int neurons[] = [500:1000:50]; -float y_num_noises = 1; // Number of noise levels to try +float y_num_noises = 11; // Number of noise levels to try float y_noise_levels[] = [0:y_num_noises]; -float noise_step = 10; // Difference between noises +float noise_step = 5; // Difference between noises -int num_trials = 1; +int num_trials = 5; int trials[] = [0:num_trials-1]; +config = "/usr/local/Benchmarks/Pilot1/Uno/uno_auc_model.txt"; + +json_template = """ +{ "layer_force": %4i, + "noise" : %5.2f, + "epochs" : %2i, + "config" : "%s", + "experiment_id": "%s", + "run_id": "%s", + "candle_result": "r2", + "ckpt_save_interval": 1 +} +"""; + foreach neuron in neurons { foreach levely, j in y_noise_levels @@ -45,13 +60,11 @@ foreach neuron in neurons foreach trial, k in trials { y_noise_level = levely * noise_step; - run_id = "%04i-%0.0f-%02i" % (neuron, y_noise_level, trial); - params = ("{ \"layer_force\" : %i , " + - " \"noise\" : %f , " + - " \"epochs\" : %i } ") % - (neuron, y_noise_level, epochs); - printf("running: %s", params); - result = obj(params, exp_id, run_id); + run_id = "%04i-%05.2f-%02i" % (neuron, y_noise_level, trial); + params = json_template % + (neuron, y_noise_level, epochs, config, exp_id, run_id); + printf("running: %s: %s", run_id, params); + result = candle_model_train(params, exp_id, run_id, model_name); printf("result %s : neuron %i y_noise %0.3f : %s", run_id, neuron, y_noise_level, result); } diff --git a/workflows/dense-noise/test/cfg-sys-small.sh b/workflows/dense-noise/test/cfg-sys-small.sh index b4f45c39..f2d02805 100644 --- a/workflows/dense-noise/test/cfg-sys-small.sh +++ b/workflows/dense-noise/test/cfg-sys-small.sh @@ -1,18 +1,19 @@ -# CFG SYS 1 +# CFG SYS SMALL # The number of MPI processes -# Note that 2 processes are reserved for Swift/EMEMS +# Note that 2 processes are reserved for Swift/EMEWS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs export PROCS=${PROCS:-2} # MPI processes per node -export PPN=${PPN:-1} +export PPN=${PPN:-2} -export WALLTIME=${WALLTIME:-00:05:00} +export WALLTIME=${WALLTIME:-00:60:00} # CANDLE@ALCF: export PROJECT=CSC249ADOA01 +export QUEUE="debug-scaling" # Benchmark run timeout: benchmark run will timeout # after the specified number of seconds. diff --git a/workflows/dense-noise/test/test-1.sh b/workflows/dense-noise/test/test-1.sh index 34fcdc5a..ad762730 100755 --- a/workflows/dense-noise/test/test-1.sh +++ b/workflows/dense-noise/test/test-1.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -# UQ NOISE TEST 1 +# DENSE NOISE TEST 1 usage() { @@ -26,6 +26,7 @@ EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) export EMEWS_PROJECT_ROOT WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) source $WORKFLOWS_ROOT/common/sh/utils.sh +SCRIPT=$( basename $0 .sh ) # Select configurations export CFG_SYS=$THIS/cfg-sys-small.sh @@ -46,17 +47,7 @@ fi # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME -# Wait for job -TURBINE_OUTPUT=$( readlink turbine-output ) -queue_wait - -# Check job output -OUTPUT=$TURBINE_OUTPUT/output.txt -WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) - -SCRIPT=$( basename $0 .sh ) - -echo "$SCRIPT: SUCCESS" +echo "$SCRIPT: OK" # Local Variables: # c-basic-offset: 2; diff --git a/workflows/grid/swift/workflow.sh b/workflows/grid/swift/workflow.sh index 3eabacbb..3d153a7a 100755 --- a/workflows/grid/swift/workflow.sh +++ b/workflows/grid/swift/workflow.sh @@ -115,7 +115,7 @@ mkdir -pv $TURBINE_OUTPUT/run # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 28fdeeaf..db3f033c 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -114,7 +114,7 @@ mkdir -pv $TURBINE_OUTPUT/run # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function # Andrew: Allows for custom model.sh file, if that's desired export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 3362fc96..9af3f6b4 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -91,7 +91,7 @@ swift-t -n $PROCS \ ${MACHINE:-} \ -p \ -I $WORKFLOWS_ROOT/common/swift \ - -i obj_$SWIFT_IMPL \ + -i model_$CANDLE_MODEL_IMPL \ -e BENCHMARKS_ROOT \ -e EMEWS_PROJECT_ROOT \ -e MODEL_SH \ diff --git a/workflows/uq-noise/swift/workflow-abstention.sh b/workflows/uq-noise/swift/workflow-abstention.sh index fe79d41b..b31c4205 100755 --- a/workflows/uq-noise/swift/workflow-abstention.sh +++ b/workflows/uq-noise/swift/workflow-abstention.sh @@ -108,8 +108,8 @@ mkdir -pv $TURBINE_OUTPUT/hpo_log # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -SWIFT_IMPL="py" -OBJ_MODULE=${OBJ_MODULE:-obj_abstention_$SWIFT_IMPL} +CANDLE_MODEL_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-obj_abstention_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model_abstention.sh diff --git a/workflows/uq-noise/swift/workflow-gauss-abs.sh b/workflows/uq-noise/swift/workflow-gauss-abs.sh index 9a015ee1..9d1b51df 100755 --- a/workflows/uq-noise/swift/workflow-gauss-abs.sh +++ b/workflows/uq-noise/swift/workflow-gauss-abs.sh @@ -108,8 +108,8 @@ mkdir -pv $TURBINE_OUTPUT/hpo_log # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -SWIFT_IMPL="py" -OBJ_MODULE=${OBJ_MODULE:-obj_abstention_$SWIFT_IMPL} +CANDLE_MODEL_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-obj_abstention_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model_abstention.sh diff --git a/workflows/uq-noise/swift/workflow-gnoise.sh b/workflows/uq-noise/swift/workflow-gnoise.sh index 38fc9460..98d173ff 
100755 --- a/workflows/uq-noise/swift/workflow-gnoise.sh +++ b/workflows/uq-noise/swift/workflow-gnoise.sh @@ -108,8 +108,8 @@ mkdir -pv $TURBINE_OUTPUT/hpo_log # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -SWIFT_IMPL="py" -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +CANDLE_MODEL_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/uq-noise/swift/workflow-noise.sh b/workflows/uq-noise/swift/workflow-noise.sh index fef7067a..ea50e09c 100755 --- a/workflows/uq-noise/swift/workflow-noise.sh +++ b/workflows/uq-noise/swift/workflow-noise.sh @@ -108,8 +108,8 @@ mkdir -pv $TURBINE_OUTPUT/hpo_log # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -SWIFT_IMPL="py" -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +CANDLE_MODEL_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/uq-noise/swift/workflow.sh b/workflows/uq-noise/swift/workflow.sh index 2dc3e65a..926ea292 100755 --- a/workflows/uq-noise/swift/workflow.sh +++ b/workflows/uq-noise/swift/workflow.sh @@ -114,8 +114,8 @@ mkdir -pv $TURBINE_OUTPUT/hpo_log # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -SWIFT_IMPL="py" -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +CANDLE_MODEL_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh From c64b70b7a55bae2c19aa476c9bee8c75a462cb2a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Apr 2023 11:06:10 -0500 Subject: [PATCH 514/601] Update Swift/T/Frontier --- workflows/common/sh/env-frontier.sh | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/workflows/common/sh/env-frontier.sh b/workflows/common/sh/env-frontier.sh index c9e0283d..b5d9b329 100644 --- a/workflows/common/sh/env-frontier.sh +++ b/workflows/common/sh/env-frontier.sh @@ -4,32 +4,16 @@ # CANDLE_MODEL_IMPL=echo CANDLE_MODEL_IMPL=py -# From Wozniak -# MED106=/gpfs/alpine/world-shared/med106 -# ROOT=$MED106/sw/summit/gcc-7.5.0 ROOT=/autofs/nccs-svm1_home1/wozniak/Public/sfw/frontier -# SWIFT=$ROOT/swift-t/2022-07-25 # Works -SWIFT=$ROOT/swift-t/2023-02-23 +SWIFT=$ROOT/swift-t/2023-04-26 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH PATH=$SWIFT/turbine/bin:$PATH -# R=$ROOT/R/4.1.3/lib64/R -# LD_LIBRARY_PATH+=:$R/lib - -PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 -# PY=/gpfs/alpine/world-shared/med106/sw/conda/2021-10-06/envs/CANDLE-2021-10-06 -# PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0 -# PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37/ -# # PY=/gpfs/alpine/world-shared/med106/sw/conda/m-39-2022-09-15 -# LD_LIBRARY_PATH+=:$PY/lib -# export PYTHONHOME=$PY +PY=/lustre/orion/world-shared/med106/gounley1/conda543 PATH=$PY/bin:$PATH -# /gpfs/alpine/world-shared/med106/sw/condaenv-200408 -# export LD_LIBRARY_PATH=$PY/lib:$LD_LIBRARY_PATH - # EMEWS Queues for R # EQR=$ROOT/EQ-R From beb84f257bd484dc21e44c415e81ecb9c0022d20 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Apr 2023 13:55:10 -0500 Subject: [PATCH 515/601] Better messaging on Frontier --- workflows/common/python/model_runner.py | 38 ++++++++++++++----------- 1 file changed, 21 
insertions(+), 17 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 13c01566..bcdfe7dd 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -168,41 +168,37 @@ def run(hyper_parameter_map, obj_return): history = None exception = False - # Run the model! - log("PKG RUN START") - - # check for epochs if not present set to 1, used for checking early stopping in function get_results + # check for epochs if not present set to 1, + # used for checking early stopping in function get_results if "epochs" in hyper_parameter_map: epochs = hyper_parameter_map["epochs"] else: epochs = 1 - if framework == 'keras': + log("PKG RUN START") + if framework == "keras": try: + # Run the model! history = pkg.run(params) except Exception as e: logger.warn("RUN EXCEPTION: " + str(e)) print("RUN EXCEPTION: " + str(e)) info = sys.exc_info() s = traceback.format_tb(info[2]) - sys.stdout.write('\\n\\nEXCEPTION in model run(): \\n' + repr(e) + - ' ... \\n' + ''.join(s)) - sys.stdout.write('\\n') + # This produces backslashes in output like "\n\n" + # on Frontier 2023-02-26 + # sys.stdout.write('\\n\\nEXCEPTION in model run(): \\n' + + # repr(e) + ' ... \\n' + ''.join(s)) + # sys.stdout.write('\\n') + sys.stdout.write('\n\nEXCEPTION in model run(): \n' + + repr(e) + ' ... \n' + ''.join(s)) + sys.stdout.write('\n') sys.stdout.flush() - - # logger.warn("Caught InvalidArgumentError") exception = True exit(1) - log("PKG RUN STOP") - - # if framework == "keras": runner_utils.keras_clear_session(framework) - stop_perf(Ps) - finish = time.time() - duration = finish - start - # Default result if there is no val_loss (as in infer.py) result = 0 history_result = {} @@ -229,6 +225,12 @@ def __init__(self, val_scores): history = history(val_scores) result, history_result = get_results(history, obj_return, epochs) + log("PKG RUN STOP") + + stop_perf(Ps) + finish = time.time() + duration = finish - start + return (result, history_result) @@ -291,7 +293,9 @@ def run_model(hyper_parameter_map): exit(1) elif result == ModelResult.SKIP: logger.info("run_pre() returned SKIP ...") + logger.info("model_runner: EXIT") sys.stdout.flush() + time.sleep(10) return ("SKIP", "HISTORY_EMPTY") else: assert result == ModelResult.SUCCESS # proceed... From b675a35ca8dd877b36999763d514da913e9e0123 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Apr 2023 13:55:20 -0500 Subject: [PATCH 516/601] Fix error message --- workflows/common/swift/model_py.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/swift/model_py.swift b/workflows/common/swift/model_py.swift index c131055f..c5cea46b 100644 --- a/workflows/common/swift/model_py.swift +++ b/workflows/common/swift/model_py.swift @@ -39,7 +39,7 @@ try: except Exception as e: info = sys.exc_info() s = traceback.format_tb(info[2]) - sys.stdout.write('\\n\\nEXCEPTION in obj() code: \\n' + + sys.stdout.write('\\n\\nEXCEPTION in candle_model_train(): \\n' + repr(e) + ' ... 
\\n' + ''.join(s)) sys.stdout.write('\\n') sys.stdout.flush() From 1c80ec1de1dff605af0fdaad7681a5f1c43252c4 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Apr 2023 13:55:58 -0500 Subject: [PATCH 517/601] More checks --- workflows/cp-leaveout/scripts/check-run.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/workflows/cp-leaveout/scripts/check-run.sh b/workflows/cp-leaveout/scripts/check-run.sh index 8e6e6895..62200d3b 100755 --- a/workflows/cp-leaveout/scripts/check-run.sh +++ b/workflows/cp-leaveout/scripts/check-run.sh @@ -37,13 +37,21 @@ then echo "Job timed out normally." SUCCESS=1 -elif grep -q "TURBINE: EXIT CODE: 0" $DIR/output.txt +elif grep -q "EXIT CODE: 0" $DIR/output.txt then echo "Job completed normally." - grep "TURBINE: MPIEXEC TIME: " $DIR/output.txt + grep "MPIEXEC TIME: " $DIR/output.txt SUCCESS=1 fi +if (( ! SUCCESS )) +then + # Find MPI Aborts on Frontier + grep "START:" $DIR/output.txt + grep "MPICH .* Abort" $DIR/output.txt | \ + cut --delimiter ' ' --fields=1-12 +fi + if (( ! SUCCESS )) then echo "Job failed!" From 9214ee79a95cb7f91c2941d5cf2b73b814dadf18 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Apr 2023 13:56:07 -0500 Subject: [PATCH 518/601] Shorten job name on Frontier --- workflows/upf/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index 9af3f6b4..7d7d4c1b 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -46,7 +46,7 @@ source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh log_path PYTHONPATH -export TURBINE_JOBNAME="UPF_${EXPID}" +export TURBINE_JOBNAME="${EXPID}" OBJ_PARAM_ARG="" if [[ ${OBJ_PARAM:-} != "" ]] From 4edc456061ef19838fb34b5786f4e82f970931cd Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Apr 2023 13:56:17 -0500 Subject: [PATCH 519/601] WS --- workflows/upf/test/upf-1.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/upf/test/upf-1.sh b/workflows/upf/test/upf-1.sh index 80d53808..23e05541 100755 --- a/workflows/upf/test/upf-1.sh +++ b/workflows/upf/test/upf-1.sh @@ -21,6 +21,5 @@ export EMEWS_PROJECT_ROOT export OBJ_RETURN="val_loss" CFG_SYS=$THIS/cfg-sys-1.sh - export CANDLE_MODEL_TYPE="BENCHMARKS" $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt From f1a4960b33f8e8ffd6eda59b0d4f831851147426 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 26 Apr 2023 13:56:27 -0500 Subject: [PATCH 520/601] Update function name --- workflows/upf/swift/workflow.swift | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/upf/swift/workflow.swift b/workflows/upf/swift/workflow.swift index 09b36ef9..1b57d605 100644 --- a/workflows/upf/swift/workflow.swift +++ b/workflows/upf/swift/workflow.swift @@ -17,8 +17,8 @@ report_env(); string FRAMEWORK = "keras"; // Scan command line -file upf = input(argv("f")); -int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); +file upf = input(argv("f")); +int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); string model_name = getenv("MODEL_NAME"); string expid = getenv("EXPID"); @@ -39,8 +39,8 @@ foreach params,i in upf_lines { printf("params: %s", params); runid = json_get(params, "id"); - results[i] = obj(params, expid, runid); - assert(results[i] != "EXCEPTION", "exception in obj()!"); + results[i] = candle_model_train(params, expid, runid, model_name); + assert(results[i] != "EXCEPTION", "exception in 
candle_model_train()!"); } // Join all result values into one big semicolon-delimited string From 8f23034dd8679e381323056c8fecb02af39abf4b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 3 May 2023 11:53:14 -0500 Subject: [PATCH 521/601] Support for model.log --- scripts/shrink-log.mk | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/shrink-log.mk b/scripts/shrink-log.mk index b2294d21..f218bc03 100644 --- a/scripts/shrink-log.mk +++ b/scripts/shrink-log.mk @@ -1,11 +1,17 @@ .DELETE_ON_ERROR: +# Logs on stdout from Python runs OUTS = $(wildcard out-*.txt) +# Logs in model.log from containers +RUNS = $(shell find . -name model.log) -SUMMARIES = $(subst out-,summary-,$(OUTS)) +SUMMARIES = $(subst out-,summary-,$(OUTS)) $(subst model,summary,$(RUNS)) all: $(SUMMARIES) summary-%.txt: out-%.txt @ ${THIS}/shrink-log-single.sh $(<) $(@) + +%/summary.log: %/model.log + @ ${THIS}/shrink-log-single.sh $(<) $(@) From 0d9efa8402577a258d14d485e0d15b206600f21f Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 3 May 2023 13:10:15 -0500 Subject: [PATCH 522/601] o obj is not called candle_model_train with an extra argument model_name --- workflows/mlrMBO/swift/workflow.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/swift/workflow.swift b/workflows/mlrMBO/swift/workflow.swift index f50d0d75..d859cca9 100644 --- a/workflows/mlrMBO/swift/workflow.swift +++ b/workflows/mlrMBO/swift/workflow.swift @@ -94,7 +94,7 @@ string FRAMEWORK = "keras"; foreach param, j in param_array { run_id = "%02i_%03i_%04i" % (restart_number,i,j); - results[j] = obj(param, exp_id, run_id); + results[j] = candle_model_train(param, exp_id, run_id, model_name); } string result = join(results, ";"); // printf(result); From 713dbb3e80d296f8b3a4c6672f54f81154714861 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 3 May 2023 13:39:40 -0500 Subject: [PATCH 523/601] o fix duration variable not declared error --- workflows/common/python/model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index bcdfe7dd..828ec775 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -203,8 +203,6 @@ def run(hyper_parameter_map, obj_return): result = 0 history_result = {} if not exception: - logger.info("DONE: run_id %s in %0.2f seconds." % - (hyper_parameter_map["run_id"], duration)) if history is not None: if history == "EPOCHS_COMPLETED_ALREADY": result, history_result = "EPOCHS_COMPLETED_ALREADY", None @@ -230,6 +228,10 @@ def __init__(self, val_scores): stop_perf(Ps) finish = time.time() duration = finish - start + + # print the run_id and duration + logger.info("DONE: run_id %s in %0.2f seconds." 
% + (hyper_parameter_map["run_id"], duration)) return (result, history_result) From 495fbf584ea49eeb11bc37dee3ca6430df913d62 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 3 May 2023 14:32:26 -0500 Subject: [PATCH 524/601] o Change workflow for GA with new args for candle_model_train --- workflows/GA/swift/workflow.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/GA/swift/workflow.swift b/workflows/GA/swift/workflow.swift index 8b31faa9..a0ca3930 100644 --- a/workflows/GA/swift/workflow.swift +++ b/workflows/GA/swift/workflow.swift @@ -82,7 +82,7 @@ string FRAMEWORK = "keras"; string results[]; foreach param, j in param_array { - results[j] = obj(param, exp_id, "%00i_%000i_%0000i" % (restart_number,i,j)); + results[j] = candle_model_train(param, exp_id, "%00i_%000i_%0000i" % (restart_number,i,j), model_name); } string res = join(results, ";"); // printf(res); From d7b01fae1ac472b9dee398ea53ea854d020c8625 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 8 May 2023 15:17:10 -0500 Subject: [PATCH 525/601] Fixes for Frontier --- workflows/dense-noise/swift/workflow.sh | 2 +- workflows/dense-noise/swift/workflow.swift | 2 +- workflows/dense-noise/test/cfg-sys-small.sh | 8 ++++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/workflows/dense-noise/swift/workflow.sh b/workflows/dense-noise/swift/workflow.sh index ba67cb6d..19702543 100755 --- a/workflows/dense-noise/swift/workflow.sh +++ b/workflows/dense-noise/swift/workflow.sh @@ -90,7 +90,7 @@ mkdir -pv $TURBINE_OUTPUT/data # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -CANDLE_MODEL_IMPL="container" +CANDLE_MODEL_IMPL="py" OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/dense-noise/swift/workflow.swift b/workflows/dense-noise/swift/workflow.swift index cb281e38..9be7e372 100644 --- a/workflows/dense-noise/swift/workflow.swift +++ b/workflows/dense-noise/swift/workflow.swift @@ -1,5 +1,5 @@ /* - UQ NOISE SWIFT + DENSE NOISE WORKFLOW SWIFT Main workflow */ diff --git a/workflows/dense-noise/test/cfg-sys-small.sh b/workflows/dense-noise/test/cfg-sys-small.sh index f2d02805..874c7a8d 100644 --- a/workflows/dense-noise/test/cfg-sys-small.sh +++ b/workflows/dense-noise/test/cfg-sys-small.sh @@ -12,8 +12,12 @@ export PPN=${PPN:-2} export WALLTIME=${WALLTIME:-00:60:00} # CANDLE@ALCF: -export PROJECT=CSC249ADOA01 -export QUEUE="debug-scaling" +# export PROJECT=CSC249ADOA01 +# export QUEUE="debug-scaling" + +# CANDLE@OLCF: +export PROJECT=MED106 +export QUEUE=batch # Benchmark run timeout: benchmark run will timeout # after the specified number of seconds. 
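The change that recurs through PATCH 520, 522, 524, and the patches below is the rename of the Swift/T objective entry point obj() to candle_model_train(), which now takes the model name as a fourth argument. The following is a minimal sketch of the calling pattern after the rename, not a complete workflow program: it follows the upf workflow's loop, and assumes MODEL_NAME and EXPID are exported by workflow.sh and that upf_lines and candle_model_train() are provided by the surrounding workflow and the imported model module, as in the existing scripts.

    string model_name = getenv("MODEL_NAME");
    string expid      = getenv("EXPID");

    string results[];
    foreach params, i in upf_lines
    {
      // Each params entry is a JSON string; its "id" field becomes the run ID
      string runid = json_get(params, "id");
      // Previously: results[i] = obj(params, expid, runid);
      // The model name is now passed explicitly as the fourth argument:
      results[i] = candle_model_train(params, expid, runid, model_name);
      assert(results[i] != "EXCEPTION",
             "exception in candle_model_train()!");
    }

The concrete implementation behind candle_model_train() is still selected at workflow launch: workflow.sh now imports model_$CANDLE_MODEL_IMPL (for example model_py.swift or model_app.swift) where the older scripts imported obj_$SWIFT_IMPL.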
From a90343b8f9767570ca80221ef4fe65d41c4ec847 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 8 May 2023 15:21:54 -0500 Subject: [PATCH 526/601] Merge --- scripts/shrink-log.py | 4 +- workflows/cmp-cv/swift/workflow.sh | 2 +- workflows/cmp-cv/swift/workflow.swift | 7 +- workflows/common/python/model_runner.py | 56 +++++++-------- workflows/common/sh/sched-frontier.sh | 7 +- workflows/common/sh/utils.sh | 17 ++--- workflows/common/swift/model_py.swift | 18 ++--- workflows/cp-leaveout/py/data_setup.py | 41 ++++++----- workflows/cp-leaveout/py/plangen.py | 7 +- workflows/cp-leaveout/scripts/touch-exps.zsh | 11 --- workflows/cp-leaveout/swift/workflow.sh | 74 +++++++++++++------- workflows/cp-leaveout/swift/workflow.swift | 20 +++--- workflows/cp-leaveout/test/cfg-sys-512.sh | 4 +- workflows/cp-leaveout/test/test-512.sh | 16 +++-- 14 files changed, 159 insertions(+), 125 deletions(-) diff --git a/scripts/shrink-log.py b/scripts/shrink-log.py index e535f558..8e94bbe8 100644 --- a/scripts/shrink-log.py +++ b/scripts/shrink-log.py @@ -88,7 +88,7 @@ def hsize(size, decimal_places=2): s0 = os.stat(file_in) z0 = s0[stat.ST_SIZE] h0 = hsize(z0) -print("shrink: %11s %s" % (h0, file_in)) +print("shrink: %11s %s" % (h0, file_out)) with open(file_in, "r") as fp_in: with open(file_out, "w") as fp_out: @@ -102,4 +102,4 @@ def hsize(size, decimal_places=2): rate = hsize(z0 / t) print("shrank: %0.2fs %11s/s %11s -> %11s %s" % - (t, rate, hsize(z0), hsize(z1), file_in)) + (t, rate, hsize(z0), hsize(z1), file_out)) diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh index bffb2625..b0a25e0a 100755 --- a/workflows/cmp-cv/swift/workflow.sh +++ b/workflows/cmp-cv/swift/workflow.sh @@ -84,7 +84,7 @@ fi export CANDLE_IMAGE=${CANDLE_IMAGE:-} -export CANDLE_MODEL_IMPL=container +export CANDLE_MODEL_IMPL=py which swift-t diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift index 1739bc49..2bb29bff 100644 --- a/workflows/cmp-cv/swift/workflow.swift +++ b/workflows/cmp-cv/swift/workflow.swift @@ -75,7 +75,12 @@ foreach gparam, j in gparams_lines // printf(gparams); // results[runid] = obj(gparam, expid, repr(runid) ); model_script = "train.sh"; - results[runid] = obj_container(gparam, expid, repr(runid), model_name, candle_image, model_script) => compare(model_name, expid, repr(runid)); + results[runid] = // obj_container(gparam, expid, repr(runid), + // model_name, candle_image, model_script) => + candle_model_run(gparam, expid, repr(runid), + model_name) => + + compare(model_name, expid, repr(runid)); // results[runid] = obj(gparam, expid, repr(runid)); // => compare(expid, repr(runid) ); diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 828ec775..084cfb60 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -17,6 +17,7 @@ logger = None print("MODEL RUNNER...") +sys.stdout.flush() # Set PYTHONPATH: # Let MODEL_PYTHON_DIR override default Benchmarks model locations @@ -127,7 +128,7 @@ def stop_perf(Ps): Ps[s].terminate() -def run(hyper_parameter_map, obj_return): +def run(hyper_parameter_map, model_return): start = time.time() global logger logger = get_logger(logger, "MODEL RUNNER") @@ -208,7 +209,7 @@ def run(hyper_parameter_map, obj_return): result, history_result = "EPOCHS_COMPLETED_ALREADY", None else: result, history_result = get_results( - history, obj_return, epochs) + history, model_return, epochs) else: result, history_result = 
"RUN_EXCEPTION", None @@ -221,30 +222,29 @@ def __init__(self, val_scores): self.history = {'val_loss': [val_scores['val_loss']]} history = history(val_scores) - result, history_result = get_results(history, obj_return, epochs) - - log("PKG RUN STOP") + result, history_result = get_results(history, model_return, epochs) stop_perf(Ps) finish = time.time() duration = finish - start - + # print the run_id and duration logger.info("DONE: run_id %s in %0.2f seconds." % (hyper_parameter_map["run_id"], duration)) + log("PKG RUN STOP") return (result, history_result) -def get_obj_return(): - obj_return = os.getenv("OBJ_RETURN") - valid_obj_returns = ["loss", "val_loss", "val_corr", "val_acc"] - if obj_return is None: - raise Exception("No OBJ_RETURN was in the environment!") - if obj_return not in valid_obj_returns: - raise Exception("Invalid value for OBJ_RETURN: use: " + - str(valid_obj_returns)) - return obj_return +def get_model_return(): + model_return = os.getenv("MODEL_RETURN") + valid_model_returns = ["loss", "val_loss", "val_corr", "val_acc"] + if model_return is None: + raise Exception("No MODEL_RETURN was in the environment!") + if model_return not in valid_model_returns: + raise Exception("Invalid value for MODEL_RETURN: use: " + + str(valid_model_returns)) + return model_return def load_pre_post(hyper_parameter_map, key): @@ -282,7 +282,7 @@ def run_model(hyper_parameter_map): os.chdir(instance_directory) global logger logger = get_logger(logger, "MODEL RUNNER") - obj_return = get_obj_return() + model_return = get_model_return() # logger.info("run_model: node: " + hyper_parameter_map['node']) directory = hyper_parameter_map["instance_directory"] os.chdir(directory) @@ -302,7 +302,7 @@ def run_model(hyper_parameter_map): else: assert result == ModelResult.SUCCESS # proceed... - result, history = run(hyper_parameter_map, obj_return) + result, history = run(hyper_parameter_map, model_return) runner_utils.write_output(result, directory) runner_utils.write_output( json.dumps(history, cls=runner_utils.FromNPEncoder), directory, @@ -334,25 +334,25 @@ def setup_params(pkg, hyper_parameter_map, params_arg): return params -def get_results(history, obj_return, epochs_expected): +def get_results(history, model_return, epochs_expected): """Return the history entry that the user requested. Also checks for early stopping and if so marks the directory. 
- history: The Keras history object + history: The Keras history modelect """ - logger.debug('get_results(): "%s"' % obj_return) + logger.debug('get_results(): "%s"' % model_return) known_params = ["loss", "val_loss", "val_corr", "val_dice_coef"] - if obj_return not in known_params: + if model_return not in known_params: raise ValueError("Unsupported objective function return " + 'key: "' + - obj_return + '" - ' + - "use obj_param to specify one of " + str(known_params)) + model_return + '" - ' + + "use model_param to specify one of " + str(known_params)) - if obj_return in history.history: + if model_return in history.history: # Good value - values = history.history[obj_return] + values = history.history[model_return] if len(values) < epochs_expected: msg = "early stopping: %i/%i" % (len(values), epochs_expected) logger.info("get_results(): " + msg) @@ -363,21 +363,21 @@ def get_results(history, obj_return, epochs_expected): result = float(values[-1]) else: logger.warning("get_results(): objective function return key " + - "not found: " + 'key: "' + obj_return + '" - ' + + "not found: " + 'key: "' + model_return + '" - ' + "history: " + str(history.history.keys())) logger.warning("get_results(): returning NaN") result = math.nan # Fix NaNs: if math.isnan(result): - if obj_return == "val_corr" or obj_return == "val_dice_coef": + if model_return == "val_corr" or model_return == "val_dice_coef": # Return the negative result result = -result else: # Just return a large number result = 999999999 - print("result: " + obj_return + ": " + str(result)) + print("result: " + model_return + ": " + str(result)) history_result = history.history.copy() return result, history_result diff --git a/workflows/common/sh/sched-frontier.sh b/workflows/common/sh/sched-frontier.sh index 391eb3d7..cea9d7a9 100644 --- a/workflows/common/sh/sched-frontier.sh +++ b/workflows/common/sh/sched-frontier.sh @@ -9,7 +9,10 @@ MACHINE="-m slurm" #export QUEUE=${QUEUE:-batch} export PROJECT=${PROJECT:-MED106} -PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 -export TURBINE_PRELAUNCH="source activate $PY ; ${TURBINE_PRELAUNCH:-}" +# PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 +PY=/lustre/orion/world-shared/med106/gounley1/conda543 +export TURBINE_PRELAUNCH="source activate $PY" + +export TURBINE_DIRECTIVE="#SBATCH -C nvme" export TURBINE_LAUNCH_OPTIONS="--gpus-per-task=1 --gpus-per-node=$PPN" diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 0ed88569..9e21b61f 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -161,24 +161,21 @@ get_expid() # EXPID will have that suffix. 
# CANDLE_MODEL_TYPE: "BENCHMARKS" or "SINGULARITY" # Defaults to "BENCHMARKS" +# This variable affects the experiment directory structure # RETURN VALUES: EXPID and TURBINE_OUTPUT are exported into the environment # TURBINE_OUTPUT is canonicalized, because it may be soft-linked # to another filesystem (e.g., on Summit), and must be accessible # from the compute nodes without accessing the soft-links { - if (( ${#} < 1 )) + if (( ${#} != 1 )) then - echo "get_expid(): provide EXPID [CANDLE_MODEL_TYPE?]" + echo "get_expid(): provide EXPID" return 1 fi export EXPID=$1 - if [[ -z "${CANDLE_MODEL_TYPE}" ]]; then - CANDLE_MODEL_TYPE="BENCHMARKS" - fi - - echo "CANDLE_MODEL_TYPE is set to: ${CANDLE_MODEL_TYPE}" + echo "CANDLE_MODEL_TYPE is set to: ${CANDLE_MODEL_TYPE:=BENCHMARKS}" MODEL_NAME=${MODEL_NAME:-cmp} @@ -218,10 +215,10 @@ get_expid() done fi EXPID=$( printf "EXP%03i" $i )${EXP_SUFFIX:-} - export TURBINE_OUTPUT=$EXPERIMENTS/$EXPID + TURBINE_OUTPUT=$EXPERIMENTS/$EXPID check_experiment else - export TURBINE_OUTPUT=$EXPERIMENTS/$EXPID + TURBINE_OUTPUT=$EXPERIMENTS/$EXPID fi mkdir -pv $TURBINE_OUTPUT TO=$( readlink --canonicalize $TURBINE_OUTPUT ) @@ -230,7 +227,7 @@ get_expid() echo "Could not canonicalize: $TURBINE_OUTPUT" exit 1 fi - TURBINE_OUTPUT=$TO + export TURBINE_OUTPUT=$TO } next() diff --git a/workflows/common/swift/model_py.swift b/workflows/common/swift/model_py.swift index c5cea46b..6af0cd47 100644 --- a/workflows/common/swift/model_py.swift +++ b/workflows/common/swift/model_py.swift @@ -17,7 +17,7 @@ try: import tensorflow from tensorflow import keras - obj_result = '-100' + model_result = '-100' outdir = '%s' if not os.path.exists(outdir): @@ -34,7 +34,7 @@ try: hyper_parameter_map['run_id'] = '%s' hyper_parameter_map['timeout'] = %d - obj_result, history = model_runner.run_model(hyper_parameter_map) + model_result, history = model_runner.run_model(hyper_parameter_map) except Exception as e: info = sys.exc_info() @@ -43,17 +43,17 @@ except Exception as e: repr(e) + ' ... 
\\n' + ''.join(s)) sys.stdout.write('\\n') sys.stdout.flush() - obj_result = 'EXCEPTION' + model_result = 'EXCEPTION' ----; -(string obj_result) candle_model_train(string params, - string expid, - string runid, - string model_name) +(string model_result) candle_model_train(string params, + string expid, + string runid, + string model_name) { string outdir = "%s/run/%s" % (turbine_output, runid); string code = code_template % (outdir, params, model_name, expid, runid, benchmark_timeout); - obj_result = python_persist(code, "str(obj_result)"); - printf("obj_py:obj(): obj_result: '%s'", obj_result); + model_result = python_persist(code, "str(model_result)"); + printf("model_py:candle_model_train(): model_result: '%s'", model_result); } diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index d5644766..cfa79783 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -24,37 +24,45 @@ def __init__(self, dataframe_from, node, plan, output): self.output = output -def setup_nvm(params): +def setup_local_fs(params): # username = os.environ['USER'] # No longer works on Summit 2021-10-13 username = params["user"] - nvme_enabled = Path("/mnt/bb/{}".format(username)).exists() - # nvme_enabled = True + userdir = Path("/mnt/bb/%s" % username) + nvme_enabled = userdir.exists() print("NVMe: %r" % nvme_enabled) if not nvme_enabled: return params - # copy original datafrom to NVMe disk space + # The training data directory for this workflow node: + nodedir = userdir / params["node"] + os.makedirs(nodedir, exist_ok=True) + # copy original datafrom to NVMe try: src = Path(params["dataframe_from"]) - dest = Path("/mnt/bb/{}/{}".format(username, src.name)) + local_orig = userdir / src.name + local_train = nodedir / Path("topN.uno.h5") + dest = Path(local_orig) if not dest.exists(): start = time.time() count = dest.write_bytes(src.read_bytes()) stop = time.time() duration = stop - start rate = count / duration / (1024 * 1024) - print("File copy completed. Original dataframe " + - "copied to NVM in %0.1f seconds (%0.1f MB/s)." % - (duration, rate)) + print("Original dataframe copied to NVM in " + + "%0.1f seconds (%0.1f MB/s)." % (duration, rate)) else: + # Report file size: + stats = os.stat(local_orig) print("File copy skipped. " + - "Original dataframe already exists in NVM.") + "Original dataframe already exists in NVM: size=%i" % + stats.st_size) except Exception as e: print("Error occurred in copying original dataframe\n" + str(e)) traceback.print_exc() return ModelResult.ERROR params["dataframe_from"] = dest.resolve() - # Do not do this: it changes the location of the training data - # params["use_exported_data"] = "/mnt/bb/{}/{}".format(username, params["use_exported_data"]) + # WARNING: this changes the location of the training data: + params["dataframe_from"] = local_orig + params["use_exported_data"] = local_train return params @@ -68,13 +76,14 @@ def pre_run(params): # softlink to cache & config file # build node specific training/validation dataset + params = setup_local_fs(params) + args = TopN_Args( params["dataframe_from"], params["node"], params["plan"], - params["use_exported_data"], + output=params["use_exported_data"], ) - print("TopN_Args: " + str(args)) data = params["benchmark_data"] try: @@ -96,9 +105,8 @@ def pre_run(params): print("data_setup: build_dataframe(output=%s) ..." 
% args.output) sys.stdout.flush() if not os.path.exists(args.output): - params = setup_nvm(params) out_orig = args.output - args.output = out_orig + ".part" + args.output = Path(str(out_orig) + ".part") start = time.time() topN_to_uno.build_dataframe(args) stop = time.time() @@ -144,7 +152,8 @@ def post_run(params, output_dict): sys.stdout.flush() if "use_exported_data" in params: try: - os.remove(params["use_exported_data"]) + # os.remove(params["use_exported_data"]) + pass except OSError as e: print("Error: %s - %s." % (e.filename, e.strerror)) else: diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index 66f5990f..1a537831 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -1052,10 +1052,12 @@ def get_subplan(plan_dict, subplan_id=None): A (content, subplan_id) pair is returned. The returned name is useful when using default arguments to retrieve the root plan. """ - if subplan_id is None: subplan_id, content = _get_first_entry(plan_dict) else: + # print("get_subplan dump:") + # json.dump(plan_dict, sys.stdout, indent=2) + # print("keys: %i" % len(plan_dict.keys())) content = plan_dict.get(subplan_id) return content, subplan_id @@ -1173,8 +1175,9 @@ def get_subplan_features(plan_dict, subplan_id, parent_features=False): """ # acquire feature_set names populated in the plan + print("get_subplan_features(): " + subplan_id) content, _ = get_subplan(plan_dict, subplan_id) - if not content: + if content is None: print("get_subplan() found no content!") return None, None, None, None diff --git a/workflows/cp-leaveout/scripts/touch-exps.zsh b/workflows/cp-leaveout/scripts/touch-exps.zsh index 8cda3d84..497c727c 100755 --- a/workflows/cp-leaveout/scripts/touch-exps.zsh +++ b/workflows/cp-leaveout/scripts/touch-exps.zsh @@ -3,16 +3,6 @@ set -eu which python -<<<<<<< Updated upstream -A=( 750 - 746 - 757 - 771 - 743 - 744 - 759 - 763 -======= A=( # 750 # 746 # 757 @@ -22,7 +12,6 @@ A=( # 750 # 759 # 763 # 828 ->>>>>>> Stashed changes 838 839 ) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 92b6923b..5678bb05 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -73,6 +73,7 @@ PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools, model_runner APP_PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup APP_PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools APP_PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common # For Benchmarks +export APP_PYTHONPATH # Job name limit on Frontier: 8 export TURBINE_JOBNAME=$EXPID @@ -149,8 +150,8 @@ log_script mkdir -p $TURBINE_OUTPUT/run # Allow the user to set an objective function -OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} +CANDLE_MODEL_DIR=${CANDLE_MODEL_DIR:-$WORKFLOWS_ROOT/common/swift} +CANDLE_MODEL_MODULE=${CANDLE_MODEL_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh @@ -197,7 +198,7 @@ else STDOUT="" fi -# TURBINE_STDOUT="" +TURBINE_STDOUT="" if [[ $SITE == "summit" || $SITE == "frontier" ]] then export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" @@ -208,40 +209,63 @@ mkdir -pv $TURBINE_OUTPUT/out LD_LIBRARY_PATH=/opt/cray/libfabric/1.15.2.0/lib64 -# set -x +export MODEL_RETURN="val_loss" + +export TURBINE_LEADER_HOOK_STARTUP="$( sed 's/#.*//;s/$/;/' $EMEWS_PROJECT_ROOT/swift/hook-1.tcl )" + +# Environment 
variables KEY=VALUE passed into workflow. +# If exported, a VALUE does not need to be provided. +ENVS=( + # Where the Benchmarks are: + BENCHMARKS_ROOT + # The top-level directory for this workflow: + EMEWS_PROJECT_ROOT + # This will be pre-pended into PYTHONPATH if model.sh is used: + APP_PYTHONPATH + # Tell Python to auto-flush stdout: + PYTHONUNBUFFERED=1 + # Other site-specific Python settings: + # $( python_envs ) + # The CANDLE model: + MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} + MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} + # Location of model.sh: + MODEL_SH + # The CANDLE model name: + MODEL_NAME + # The statistic to return from each model: + MODEL_RETURN + # The computing site we are running on: + SITE + # A timeout in seconds for each model: + BENCHMARK_TIMEOUT + SH_TIMEOUT + # If 1, do not crash workflow on model errors: + IGNORE_ERRORS +) + +# Number of ranks to allocate for the DB: +export TURBINE_DB_WORKERS=1 + +# Insert -e flags for Swift/T command line: +ENV_ARG="-e $( echo ${ENVS[@]} | sed 's/ */ -e /g' )" + swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p \ - -I $OBJ_DIR \ - -i $OBJ_MODULE \ + -I $CANDLE_MODEL_DIR \ + -i $CANDLE_MODEL_MODULE \ -I $EMEWS_PROJECT_ROOT/swift \ -i $EPOCH_MODE_MODULE \ - -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ - -e BENCHMARKS_ROOT \ - -e EMEWS_PROJECT_ROOT \ - -e APP_PYTHONPATH=$APP_PYTHONPATH \ - $( python_envs ) \ - -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ - -e TURBINE_STDOUT=$TURBINE_STDOUT \ - -e OBJ_RETURN \ - -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ - -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ - -e MODEL_SH \ - -e MODEL_NAME \ - -e SITE \ - -e BENCHMARK_TIMEOUT \ - -e BENCHMARKS_ROOT \ - -e SH_TIMEOUT \ - -e IGNORE_ERRORS \ - -e TURBINE_DB_WORKERS=1 \ + ${ENV_ARG} \ $WAIT_ARG \ $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} # | \ # tee $STDOUT +# -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ # -e USER # Needed on Summit to find NVME # -j /usr/bin/java # Give this to Swift/T if needed for Java -# -e PYTHONUNBUFFERED=1 # May be needed if error output is being lost # -e PYTHONVERBOSE=1 # Debugs module load confusion diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index 4193357c..8d19f639 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -107,7 +107,7 @@ global const string FRAMEWORK = "keras"; run_stage(string db_file, string plan_id, string runtype, void token_parent, int stage, string parent, string this) { - printf("stage: %i parent: %s this: %s", stage, parent, this); + // printf("stage: %i parent: %s this: %s", stage, parent, this); // Run the model void token_this = run_single(plan_id, token_parent, stage, parent, this); @@ -147,19 +147,19 @@ run_stage(string db_file, string plan_id, string runtype, json_fragment = make_json_fragment(parent, this, stage); json = "{\"node\": \"%s\", %s}" % (this, json_fragment); token => - printf("run_single(): running obj(%s)", this) => + printf("run_single(): running candle_model_train(%s)", this) => // Insert the model run into the DB result1 = plangen_start(this, plan_id); assert(result1 != "EXCEPTION", "Exception in plangen_start()!"); if (result1 == "0") { // Run the model - obj_result = obj(json, exp_id, this); + model_result = candle_model_train(json, exp_id, this, model_name); printf("run_single(): completed: node: '%s' result: '%s'", - this, obj_result); + this, model_result); // Update the DB to complete the model run string result2; - if (obj_result != "RUN_EXCEPTION") + 
if (model_result != "RUN_EXCEPTION") { result2 = plangen_stop(this, plan_id); } @@ -167,11 +167,11 @@ run_stage(string db_file, string plan_id, string runtype, { result2 = "RETRY"; } - assert(obj_result != "", "Error in obj(): result is empty!"); - assert(obj_result != "EXCEPTION", "Exception in obj()!"); + assert(model_result != "", "Error in obj(): result is empty!"); + assert(model_result != "EXCEPTION", "Exception in obj()!"); assert(result2 != "EXCEPTION", "Exception in plangen_stop()!"); printf("run_single(): stop_subplan result: '%s'", result2); - v = propagate(obj_result); + v = propagate(model_result); } else // result1 != 0 { @@ -199,10 +199,12 @@ run_stage(string db_file, string plan_id, string runtype, "epochs": %i, "es": "True", "early_stopping": %i, +"experiment_id": "%s", +"run_id": "%s", "use_exported_data": "topN.uno.h5", "benchmark_data": "%s" ---- % -(plan_json, user, dataframe_csv, epochs, early_stopping, benchmark_data); +(plan_json, user, dataframe_csv, epochs, early_stopping, exp_id, this, benchmark_data); if (stage > 1) { result = json_fragment + ---- diff --git a/workflows/cp-leaveout/test/cfg-sys-512.sh b/workflows/cp-leaveout/test/cfg-sys-512.sh index cb329a19..43699e1c 100644 --- a/workflows/cp-leaveout/test/cfg-sys-512.sh +++ b/workflows/cp-leaveout/test/cfg-sys-512.sh @@ -12,13 +12,13 @@ # 3 92 921 12.0 0 # 4 46 91 6.0 0 # 5 1 45 2.0 -export PROCS=${PROCS:-8} +export PROCS=${PROCS:-2048} # MPI processes per node # Cori has 32 cores per node, 128GB per node export PPN=${PPN:-8} -export WALLTIME=${WALLTIME:-01:00:00} +export WALLTIME=${WALLTIME:-12:00:00} # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} diff --git a/workflows/cp-leaveout/test/test-512.sh b/workflows/cp-leaveout/test/test-512.sh index 27d976a4..3dff6707 100755 --- a/workflows/cp-leaveout/test/test-512.sh +++ b/workflows/cp-leaveout/test/test-512.sh @@ -47,18 +47,20 @@ SCRATCH=/gpfs/alpine/med106/scratch/wozniak # SCRATCH=/usb2/wozniak # CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem/top21_2020Jul +CANDLE_DATA=$CANDLE_DATA_DIR/ChallengeProblem/top21_2020Jul # CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem/old # PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather # DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.hdf5 -if (( ! ${BIG_PLAN:-0} )) -then -PLAN_JSON=$CANDLE_DATA/plangen_cell703-p4_drug1492-p1-u.json # 2022-07 -# PLAN_JSON=$CANDLE_DATA/plangen_CELL2917-p4_DRUG2148-p4.json # 2023-02 -else -PLAN_JSON=/gpfs/alpine/med106/proj-shared/brettin/Supervisor/workflows/cp-leaveout/plangen_CELL2917-p4_DRUG2148-p4.json -fi +# if (( ! 
${BIG_PLAN:-0} )) +# then +# PLAN_JSON=$CANDLE_DATA/plangen_cell703-p4_drug1492-p1-u.json # 2022-07 +# # PLAN_JSON=$CANDLE_DATA/plangen_CELL2917-p4_DRUG2148-p4.json # 2023-02 +# else +# PLAN_JSON=/gpfs/alpine/med106/proj-shared/brettin/Supervisor/workflows/cp-leaveout/plangen_CELL2917-p4_DRUG2148-p4.json +# fi +PLAN_JSON=$CANDLE_DATA/plangen_CELL703-p4_DRUG1492-p4.json # DATAFRAME_CSV=$CANDLE_DATA/top21.h5 # 2022-07 DATAFRAME_CSV=$CANDLE_DATA/top21-cleaned-dd.h5 # NEW 2022-10 # BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno From ec7752cc64de2ef42b46ca9b5ef0150df74dcdca Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 11 May 2023 10:18:08 -0500 Subject: [PATCH 527/601] Use new Swift/T --- workflows/common/sh/env-frontier.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/workflows/common/sh/env-frontier.sh b/workflows/common/sh/env-frontier.sh index b5d9b329..59c30d3e 100644 --- a/workflows/common/sh/env-frontier.sh +++ b/workflows/common/sh/env-frontier.sh @@ -4,8 +4,11 @@ # CANDLE_MODEL_IMPL=echo CANDLE_MODEL_IMPL=py -ROOT=/autofs/nccs-svm1_home1/wozniak/Public/sfw/frontier -SWIFT=$ROOT/swift-t/2023-04-26 +# ROOT=/autofs/nccs-svm1_home1/wozniak/Public/sfw/frontier +# SWIFT=$ROOT/swift-t/2023-04-26 # Good + +ROOT=/lustre/orion/med106/world-shared/sfw +SWIFT=$ROOT/swift-t/2023-05-08 # MPI-IO fix export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH From 615d6cfa4333d80d44f712411f289ef81d8bb1d5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 11 May 2023 10:19:47 -0500 Subject: [PATCH 528/601] Use plan file from NVMe --- workflows/cp-leaveout/py/data_setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index cfa79783..8276ac96 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -63,6 +63,8 @@ def setup_local_fs(params): # WARNING: this changes the location of the training data: params["dataframe_from"] = local_orig params["use_exported_data"] = local_train + params["plan"] = str(userdir / Path(params["plan"]).name) + print("Using plan file: " + params["plan"]) return params From 193a34c5d8a4a921bc9bbe1b58247c6b5ae8692c Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 12 May 2023 12:22:39 -0700 Subject: [PATCH 529/601] o Fix things as per new defs --- workflows/cmp-cv/swift/workflow.swift | 2 +- workflows/cmp-cv/test/test-small-1.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift index 2bb29bff..f0a50b16 100644 --- a/workflows/cmp-cv/swift/workflow.swift +++ b/workflows/cmp-cv/swift/workflow.swift @@ -77,7 +77,7 @@ foreach gparam, j in gparams_lines model_script = "train.sh"; results[runid] = // obj_container(gparam, expid, repr(runid), // model_name, candle_image, model_script) => - candle_model_run(gparam, expid, repr(runid), + candle_model_train(gparam, expid, repr(runid), model_name) => compare(model_name, expid, repr(runid)); diff --git a/workflows/cmp-cv/test/test-small-1.sh b/workflows/cmp-cv/test/test-small-1.sh index 22fc8222..0929c4ee 100755 --- a/workflows/cmp-cv/test/test-small-1.sh +++ b/workflows/cmp-cv/test/test-small-1.sh @@ -18,7 +18,7 @@ EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) WORKFLOWS_ROOT=$( cd $THIS/../.. 
; /bin/pwd ) export EMEWS_PROJECT_ROOT -export OBJ_RETURN="val_loss" +export MODEL_RETURN="val_loss" CFG_SYS=$THIS/cfg-sys-1.sh # export MODEL_NAME="DrugCell" From b9bd396ac3198c6b31fc0fa9e1417fed14008eda Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Fri, 12 May 2023 13:05:32 -0700 Subject: [PATCH 530/601] o Fix cmp-cv as per candle_model.. --- workflows/cmp-cv/py/compare.py | 6 +++--- workflows/cmp-cv/swift/workflow.sh | 6 ++---- workflows/cmp-cv/swift/workflow.swift | 2 +- workflows/cmp-cv/test/upf-1.txt | 10 +++++----- workflows/common/python/model_runner.py | 7 ++++--- workflows/common/sh/langs-app-polaris.sh | 2 +- workflows/common/swift/model_app.swift | 3 ++- workflows/csg/swift/workflow.swift | 5 +++-- workflows/csg/test/upf-1.txt | 2 +- workflows/csg/test/upf-graphdrp-polaris.txt | 2 +- 10 files changed, 23 insertions(+), 22 deletions(-) diff --git a/workflows/cmp-cv/py/compare.py b/workflows/cmp-cv/py/compare.py index e9916af7..5f051ade 100644 --- a/workflows/cmp-cv/py/compare.py +++ b/workflows/cmp-cv/py/compare.py @@ -32,10 +32,10 @@ def compare(model_name, exp_id, run_id): # model = gParams("model_name") # model = "DrugCell" # TODO: Hardcoded. have to get this from output dir? - # turbine_output = os.getenv("TURBINE_OUTPUT") - + # turbine_output = os.getenv("TURBINE_OUTPUT") + CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") - outdir = os.path.join(CANDLE_DATA_DIR, model_name, "Output", exp_id, run_id) + outdir = os.path.join(CANDLE_DATA_DIR, model_name, "Output", exp_id, run_id) directory = outdir # directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" print("reading the predictions....") diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh index b0a25e0a..15342bb2 100755 --- a/workflows/cmp-cv/swift/workflow.sh +++ b/workflows/cmp-cv/swift/workflow.sh @@ -45,8 +45,6 @@ source_site sched $SITE # Set up PYTHONPATH for model source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh -export PYTHONPATH="${PYTHONPATH}:/homes/ac.gpanapitiya/ccmg-mtg/models/to_Candle/DrugCell" -export PYTHONPATH="${PYTHONPATH}:/homes/ac.gpanapitiya/ccmg-mtg/models/to_Candle/SWnet" export PYTHONPATH="${PYTHONPATH}:$WORKFLOWS_ROOT/cmp-cv/py" log_path PYTHONPATH @@ -84,7 +82,7 @@ fi export CANDLE_IMAGE=${CANDLE_IMAGE:-} -export CANDLE_MODEL_IMPL=py +export CANDLE_MODEL_IMPL=container which swift-t @@ -101,7 +99,7 @@ swift-t -n $PROCS \ -e SITE \ -e BENCHMARK_TIMEOUT \ -e MODEL_NAME=${MODEL_NAME:-MODEL_NULL} \ - -e OBJ_RETURN \ + -e MODEL_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ -e TURBINE_MPI_THREAD=${TURBINE_MPI_THREAD:-1} \ $( python_envs ) \ diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift index f0a50b16..6eec2dd1 100644 --- a/workflows/cmp-cv/swift/workflow.swift +++ b/workflows/cmp-cv/swift/workflow.swift @@ -78,7 +78,7 @@ foreach gparam, j in gparams_lines results[runid] = // obj_container(gparam, expid, repr(runid), // model_name, candle_image, model_script) => candle_model_train(gparam, expid, repr(runid), - model_name) => + candle_image) => compare(model_name, expid, repr(runid)); // results[runid] = obj(gparam, expid, repr(runid)); diff --git a/workflows/cmp-cv/test/upf-1.txt b/workflows/cmp-cv/test/upf-1.txt index 86d087d2..07190f70 100644 --- a/workflows/cmp-cv/test/upf-1.txt +++ b/workflows/cmp-cv/test/upf-1.txt @@ -3,16 +3,16 @@ {"id": "RUN003", "epochs": 1, "model_name": "SWnet", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} {"id": "RUN004", "epochs": 2, 
"model_name": "SWnet", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} -#GraphDRP -#{"id": "RUN004", "training_data" : "path/to/dir, +#GraphDRP +#{"id": "RUN004", "training_data" : "path/to/dir, # "testing_data": "path/to/dir", # "infer_data" : [ path/to/dir ], # "model_params": name_of_model_params_output_of_training, -# "epochs": 50, -# "model_name": "GraphDRP", +# "epochs": 50, +# "model_name": "GraphDRP", # "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} #{"id": "RUN004", "epochs": 50, "model_name": "GraphDRP", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} #{"id": "RUN004", "epochs": 50, "model_name": "GraphDRP", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} #{"id": "RUN004", "epochs": 50, "model_name": "GraphDRP", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} #{"id": "RUN004", "epochs": 50, "model_name": "DeepTTC", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} -#DeepTTC \ No newline at end of file +#DeepTTC diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 084cfb60..fb379467 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -192,8 +192,8 @@ def run(hyper_parameter_map, model_return): # sys.stdout.write('\\n\\nEXCEPTION in model run(): \\n' + # repr(e) + ' ... \\n' + ''.join(s)) # sys.stdout.write('\\n') - sys.stdout.write('\n\nEXCEPTION in model run(): \n' + - repr(e) + ' ... \n' + ''.join(s)) + sys.stdout.write('\n\nEXCEPTION in model run(): \n' + repr(e) + + ' ... \n' + ''.join(s)) sys.stdout.write('\n') sys.stdout.flush() exception = True @@ -348,7 +348,8 @@ def get_results(history, model_return, epochs_expected): if model_return not in known_params: raise ValueError("Unsupported objective function return " + 'key: "' + model_return + '" - ' + - "use model_param to specify one of " + str(known_params)) + "use model_param to specify one of " + + str(known_params)) if model_return in history.history: # Good value diff --git a/workflows/common/sh/langs-app-polaris.sh b/workflows/common/sh/langs-app-polaris.sh index 0f6e7a72..f83741b4 100644 --- a/workflows/common/sh/langs-app-polaris.sh +++ b/workflows/common/sh/langs-app-polaris.sh @@ -8,4 +8,4 @@ module load singularity export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128 -export https_proxy=http://proxy.alcf.anl.gov:3128 \ No newline at end of file +export https_proxy=http://proxy.alcf.anl.gov:3128 diff --git a/workflows/common/swift/model_app.swift b/workflows/common/swift/model_app.swift index 5dce30be..49cb955f 100644 --- a/workflows/common/swift/model_app.swift +++ b/workflows/common/swift/model_app.swift @@ -10,13 +10,14 @@ params : The JSON string of params to be passed to the Benchmark expid : A string experiment ID that will be in the output directory name runid : A string run ID that will be in the output directory name - model_name : Benchmark (e.g., "uno") + model_name : Benchmark (e.g., "uno") */ (string obj_result) candle_model_train(string params, string expid, string runid, string model_name) { + string model_sh = getenv("MODEL_SH"); string turbine_output = getenv("TURBINE_OUTPUT"); diff --git a/workflows/csg/swift/workflow.swift b/workflows/csg/swift/workflow.swift index f0ac2a3e..555cc798 
100644 --- a/workflows/csg/swift/workflow.swift +++ b/workflows/csg/swift/workflow.swift @@ -52,7 +52,8 @@ foreach gparam, j in gparams_lines printf("MODEL: %s", model_name); // TODO: Add preprocessing script - results[runid] = obj_container(gparam, expid, repr(runid), model_name, candle_image, model_script); + // results[runid] = + model_script_preprocess = "preprocess.sh"; + obj_container(gparam, expid, repr(runid), model_name, candle_image, model_script_preprocess)=>obj_container(gparam, expid, repr(runid), model_name, candle_image, model_script); // TODO: Add inference script or loop to do multiple inferences on a trained model } - diff --git a/workflows/csg/test/upf-1.txt b/workflows/csg/test/upf-1.txt index bb88828e..6cbf5a43 100644 --- a/workflows/csg/test/upf-1.txt +++ b/workflows/csg/test/upf-1.txt @@ -1,4 +1,4 @@ {"id": "RUN001", "batch_size": 16, "epochs": 4, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} {"id": "RUN002", "batch_size": 32, "epochs": 3, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} {"id": "RUN003", "batch_size": 64, "epochs": 2, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} -{"id": "RUN004", "batch_size": 128, "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} \ No newline at end of file +{"id": "RUN004", "batch_size": 128, "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} diff --git a/workflows/csg/test/upf-graphdrp-polaris.txt b/workflows/csg/test/upf-graphdrp-polaris.txt index 71224424..59113d20 100644 --- a/workflows/csg/test/upf-graphdrp-polaris.txt +++ b/workflows/csg/test/upf-graphdrp-polaris.txt @@ -1,4 +1,4 @@ {"id": "RUN001", "batch_size": 16, "epochs": 4, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} {"id": "RUN002", "batch_size": 32, "epochs": 3, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} {"id": "RUN003", "batch_size": 64, "epochs": 2, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} -{"id": "RUN004", "batch_size": 128, "epochs": 1, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} \ No newline at end of file +{"id": "RUN004", "batch_size": 128, "epochs": 1, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} From eaab019b2a0f289469771f37069a15cc450447c5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 17 May 2023 10:57:34 -0500 Subject: [PATCH 531/601] Handle more errors --- workflows/cp-leaveout/swift/workflow.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 5678bb05..8cff4e16 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -115,7 +115,12 @@ then echo "Restart requested ..." if [[ ! -d $TURBINE_OUTPUT ]] then - echo "No prior run found! (tried $TURBINE_OUTPUT/output.txt)" + echo "ERROR: No prior run found! (tried $TURBINE_OUTPUT)" + exit 1 + fi + if [[ ! -f $TURBINE_OUTPUT/cplo.db ]] + then + echo "ERROR: No DB found! (tried $TURBINE_OUTPUT/cplo.db)" exit 1 fi if [[ ! 
-f $TURBINE_OUTPUT/output.txt ]] From e6182010fbe04646e23a86c4b7aada2701ac4c9f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 17 May 2023 10:57:41 -0500 Subject: [PATCH 532/601] Try new Swift/T --- workflows/common/sh/env-frontier.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/env-frontier.sh b/workflows/common/sh/env-frontier.sh index 59c30d3e..47974aab 100644 --- a/workflows/common/sh/env-frontier.sh +++ b/workflows/common/sh/env-frontier.sh @@ -8,7 +8,8 @@ CANDLE_MODEL_IMPL=py # SWIFT=$ROOT/swift-t/2023-04-26 # Good ROOT=/lustre/orion/med106/world-shared/sfw -SWIFT=$ROOT/swift-t/2023-05-08 # MPI-IO fix +# SWIFT=$ROOT/swift-t/2023-05-08 # MPI-IO fix +SWIFT=$ROOT/swift-t/2023-05-10 # PMI SYNC export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH From 07a750aca86a5db78fe2ad59e00b291d52f290db Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 17 May 2023 11:15:00 -0500 Subject: [PATCH 533/601] Backup DB and its log --- workflows/cp-leaveout/swift/workflow.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 8cff4e16..63684a0f 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -1,5 +1,6 @@ #! /usr/bin/env bash set -eu +shopt -s nullglob # CP-LEAVEOUT WORKFLOW # Main entry point for CP-LEAVEOUT workflow @@ -135,8 +136,10 @@ then PRIORS=( $TURBINE_OUTPUT/output.txt $TURBINE_OUTPUT/out $TURBINE_OUTPUT/turbine* - $TURBINE_OUTPUT/jobid.txt ) + $TURBINE_OUTPUT/jobid.txt + $TURBINE_OUTPUT/plangen_db.log* ) mv ${PRIORS[@]} $PRIOR_RUN + cp $TURBINE_OUTPUT/cplo.db $PRIOR_RUN fi else # Not a restart if [[ -f $TURBINE_OUTPUT/output.txt ]] From 39451c30ace7456759c5345170080e3bf2acd1cb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 17 May 2023 11:16:16 -0500 Subject: [PATCH 534/601] Better output and logging --- workflows/cp-leaveout/py/data_setup.py | 58 +++++++++++-------- workflows/cp-leaveout/py/plangen.py | 79 +++++++++++++++++--------- 2 files changed, 85 insertions(+), 52 deletions(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 8276ac96..0c7d530c 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -9,6 +9,13 @@ import topN_to_uno from runner_utils import ModelResult +from log_tools import * + + +logger = get_logger(logger, "DATA SETUP") + +logger.info("LOAD:") +sys.stdout.flush() class TopN_Args: @@ -25,11 +32,12 @@ def __init__(self, dataframe_from, node, plan, output): def setup_local_fs(params): + global logger # username = os.environ['USER'] # No longer works on Summit 2021-10-13 username = params["user"] userdir = Path("/mnt/bb/%s" % username) nvme_enabled = userdir.exists() - print("NVMe: %r" % nvme_enabled) + logger.info("NVMe: %r" % nvme_enabled) if not nvme_enabled: return params # The training data directory for this workflow node: @@ -47,33 +55,32 @@ def setup_local_fs(params): stop = time.time() duration = stop - start rate = count / duration / (1024 * 1024) - print("Original dataframe copied to NVM in " + + logger.info("Original dataframe copied to NVM in " + "%0.1f seconds (%0.1f MB/s)." % (duration, rate)) else: # Report file size: stats = os.stat(local_orig) - print("File copy skipped. 
" + - "Original dataframe already exists in NVM: size=%i" % - stats.st_size) + logger.info("Original dataframe already exists in NVM: size=%i" % + stats.st_size) except Exception as e: print("Error occurred in copying original dataframe\n" + str(e)) + sys.stdout.flush() traceback.print_exc() + sys.stdout.flush() return ModelResult.ERROR params["dataframe_from"] = dest.resolve() # WARNING: this changes the location of the training data: params["dataframe_from"] = local_orig params["use_exported_data"] = local_train params["plan"] = str(userdir / Path(params["plan"]).name) - print("Using plan file: " + params["plan"]) + logger.info("Using plan file: " + params["plan"]) return params def pre_run(params): - import sys - import time + global logger - print("data_setup.pre_run(): node: '%s' ..." % params["node"]) - sys.stdout.flush() + logger.info("pre_run(): node: '%s' ..." % params["node"]) # softlink to cache & config file # build node specific training/validation dataset @@ -92,8 +99,8 @@ def pre_run(params): for filename in ["uno_auc_model.txt"]: # "cache", if not os.path.islink(filename): src = f"{data}/{filename}" - print("data_setup: src: (%s)" % src) - print("data_setup: dest: (%s)" % filename) + logger.info("data_setup: src: (%s)" % src) + logger.info("data_setup: dest: (%s)" % filename) os.symlink(src, filename) except Exception as e: print("data_setup: error making symlink:") @@ -101,10 +108,11 @@ def pre_run(params): print("data_setup: src: (%s)" % src) print("data_setup: dest: (%s)" % filename) print(str(e)) + sys.stdout.flush() return ModelResult.ERROR try: - print("data_setup: build_dataframe(output=%s) ..." % args.output) + logger.info("build_dataframe(output=%s) ..." % args.output) sys.stdout.flush() if not os.path.exists(args.output): out_orig = args.output @@ -113,12 +121,12 @@ def pre_run(params): topN_to_uno.build_dataframe(args) stop = time.time() duration = stop - start - print("data_setup: build_dataframe() OK : " + - "%0.1f seconds." % duration) - sys.stdout.flush() + logger.info("build_dataframe() OK : " + + "%0.1f seconds." % duration) + # sys.stdout.flush() os.rename(args.output, out_orig) - print("data_setup: rename() OK") - sys.stdout.flush() + logger.info("rename() OK") + # sys.stdout.flush() args.output = out_orig else: print("data_setup: dataframe exists: %s" % @@ -126,7 +134,7 @@ def pre_run(params): except topN_to_uno.topN_NoDataException: print("data_setup: caught topN_NoDataException: SKIP " + "for node: '%s'" % params["node"]) - sys.stdout.flush() + # sys.stdout.flush() directory = params["instance_directory"] with open(directory + "/NO-DATA.txt", "a") as fp: ts = datetime.datetime.now() @@ -144,14 +152,15 @@ def pre_run(params): traceback.print_exc(file=sys.stdout) sys.stdout.flush() return ModelResult.ERROR - print("data_setup.pre_run() done.") - sys.stdout.flush() + logger.info("data_setup.pre_run() done.") + # sys.stdout.flush() return ModelResult.SUCCESS def post_run(params, output_dict): - print("data_setup(): post_run") - sys.stdout.flush() + global logger + # logger.info("post_run") + # sys.stdout.flush() if "use_exported_data" in params: try: # os.remove(params["use_exported_data"]) @@ -159,5 +168,6 @@ def post_run(params, output_dict): except OSError as e: print("Error: %s - %s." 
% (e.filename, e.strerror)) else: - print("use_exported_data not in params") + # print("use_exported_data not in params") + pass return ModelResult.SUCCESS diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index 1a537831..7352da2d 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -4,6 +4,7 @@ import os import sqlite3 import sys +import time import traceback from abc import ABC, abstractmethod # abstract class support from collections import OrderedDict, deque, namedtuple @@ -18,7 +19,11 @@ ISO_TIMESTAMP = "seconds" # timestamp to ISO string ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp -DEBUG_SQL = True +DEBUG_SQL = False # True + + +conn = None +csr = None def isempty(path): @@ -497,11 +502,20 @@ class RunStat(Enum): # subplan execution status """ +# def log(msg): +# if DEBUG_SQL: +# with open("plangen_db.log", "a") as fp: +# fp.write(msg + "\n") +# fp.flush() + +from log_tools import * + + +logger = get_logger(logger, "PLANGEN", milliseconds=True) + + def log(msg): - if DEBUG_SQL: - with open("plangen_db.log", "a") as fp: - fp.write(msg + "\n") - fp.flush() + logger.debug(msg) # ------------------------------------------------------------------------------ @@ -546,7 +560,7 @@ def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): lclcsr = conn.cursor() try: db_exception = False - log("STMT: " + stmt) + # log("STMT: " + stmt) lclcsr.execute(stmt) except db_Error as e: @@ -776,36 +790,40 @@ def start_subplan(db_path, the subplan. -1 is returned from a RESTART call if the a RunhistRow already exists for the plan/subplan and is marked COMPLETE. """ - - print("plangen: start_subplan: subplan_id=%s" % subplan_id) - sys.stdout.flush() - conn = db_connect(db_path) - csr = conn.cursor() + global conn, csr + start = time.time() + log("start_subplan: subplan_id=%s" % subplan_id) + # sys.stdout.flush() + if conn is None: + conn = db_connect(db_path) + csr = conn.cursor() + # conn.execute('PRAGMA journal_mode = WAL') + conn.execute('PRAGMA synchronous = OFF') skip = False - print("plangen: start_subplan: run_type: '%s'" % str(run_type)) - # print("plangen: start_subplan: run_type type: %s" % str(type(run_type))) - print("plangen: start_subplan: base: '%s'" % str(RunType.RESTART)) - sys.stdout.flush() + log("start_subplan: run_type: '%s'" % str(run_type)) + # log("plangen: start_subplan: run_type type: %s" % str(type(run_type))) + log("start_subplan: base: '%s'" % str(RunType.RESTART)) + # sys.stdout.flush() # skip previously completed work if RESTART if "RESTART" in str(run_type): - print("plangen: start_subplan: checking restart: %i" % plan_id) - sys.stdout.flush() + log("start_subplan: checking restart: %i" % plan_id) + # sys.stdout.flush() stmt = _select_row_from_runhist.format(plan_id, subplan_id) execute_sql_stmt(conn, stmt, cursor=csr) row = csr.fetchone() if row: - print("plangen: start_subplan: found row.") + log("start_subplan: found row.") runhist_rec = RunhistRow._make(row) - print("plangen: start_subplan: found '%s'" % runhist_rec.status) + log("start_subplan: found '%s'" % runhist_rec.status) if runhist_rec.status == RunStat.COMPLETE.name: skip = True - print("plangen: start_subplan: skip %r" % skip) + # log("start_subplan: skip %r" % skip) else: - print("plangen: start_subplan: not checking restart") - sys.stdout.flush() + log("start_subplan: not checking restart") + # sys.stdout.flush() # construct/reinit a new runhist record if not skip: @@ -818,16 +836,21 @@ def 
start_subplan(db_path, execute_sql_stmt(conn, stmt, cursor=csr) - csr.close() + # csr.close() conn.commit() - conn.close() + # conn.close() if skip: - print("plangen: start_subplan: subplan_id=%s: SKIP" % subplan_id) - return -1 + token = "SKIP" + result = -1 else: - print("plangen: start_subplan: subplan_id=%s: RUN" % subplan_id) - return 0 + token = "RUN" + result = 0 + + log("start_subplan: subplan_id=%s: %s" % (subplan_id, result)) + stop = time.time() + log("start_subplan: time: %0.3f" % (stop - start)) + return result def stop_subplan(db_path, plan_id=None, subplan_id=None, comp_info_dict={}): From b9adec3eef8d15fdce387805f37216d1c36854d5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 17 May 2023 11:38:45 -0500 Subject: [PATCH 535/601] Better output and logging --- workflows/common/python/log_tools.py | 11 ++++++++--- workflows/common/python/model_runner.py | 5 +++-- workflows/cp-leaveout/swift/workflow.sh | 2 ++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/workflows/common/python/log_tools.py b/workflows/common/python/log_tools.py index f4648413..0a3a5bf0 100644 --- a/workflows/common/python/log_tools.py +++ b/workflows/common/python/log_tools.py @@ -7,7 +7,7 @@ logger = None -def get_logger(logger, name, stream=sys.stdout): +def get_logger(logger, name, stream=sys.stdout, milliseconds=False): """Set up logging.""" if logger is not None: return logger @@ -16,8 +16,13 @@ def get_logger(logger, name, stream=sys.stdout): logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) h = logging.StreamHandler(stream=stream) - fmtr = logging.Formatter("%(asctime)s %(name)s %(levelname)-5s %(message)s", - datefmt="%Y-%m-%d %H:%M:%S") + if not milliseconds: + fmtr = logging.Formatter("%(asctime)s %(name)s %(levelname)-5s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S") + else: + fmtr = logging.Formatter("%(asctime)s.%(msecs)03d %(name)s %(levelname)-5s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S") + h.setFormatter(fmtr) logger.addHandler(h) return logger diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index fb379467..47ded8fd 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -134,7 +134,7 @@ def run(hyper_parameter_map, model_return): logger = get_logger(logger, "MODEL RUNNER") logger.info("run(): START:") - sys.stdout.flush() + # sys.stdout.flush() directory = hyper_parameter_map[ "instance_directory"] # should be output_dir @@ -145,7 +145,7 @@ def run(hyper_parameter_map, model_return): framework = hyper_parameter_map['framework'] print("framework: " + str(framework)) - sys.stdout.flush() + # sys.stdout.flush() model_name = hyper_parameter_map['model_name'] pkg = import_pkg(framework, model_name) @@ -232,6 +232,7 @@ def __init__(self, val_scores): logger.info("DONE: run_id %s in %0.2f seconds." 
% (hyper_parameter_map["run_id"], duration)) log("PKG RUN STOP") + sys.stdout.flush() return (result, history_result) diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index 63684a0f..5fd30a2a 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -258,6 +258,8 @@ export TURBINE_DB_WORKERS=1 # Insert -e flags for Swift/T command line: ENV_ARG="-e $( echo ${ENVS[@]} | sed 's/ */ -e /g' )" +export TURBINE_LOG=0 + swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p \ From 8e59501facb89ee330dc69e282c43fe2cdf35271 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 17 May 2023 11:45:25 -0500 Subject: [PATCH 536/601] Minor changes --- workflows/common/python/model_runner.py | 2 +- workflows/cp-leaveout/py/plangen.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index 47ded8fd..d997653b 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -16,7 +16,7 @@ logger = None -print("MODEL RUNNER...") +print("MODEL RUNNER MODULE") sys.stdout.flush() # Set PYTHONPATH: diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index 7352da2d..50e1a346 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -21,7 +21,6 @@ ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp DEBUG_SQL = False # True - conn = None csr = None From 2c4f78623338a4a7ec365c1ad2fdf795645db8d5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 17 May 2023 16:13:27 -0500 Subject: [PATCH 537/601] Update for new log format --- workflows/cp-leaveout/scripts/Node.py | 35 +++++---- .../cp-leaveout/scripts/extract-node-info.py | 73 ++++++++++--------- .../cp-leaveout/scripts/print-node-info.py | 7 ++ 3 files changed, 68 insertions(+), 47 deletions(-) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index ce8fb70c..0daabc0d 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -58,16 +58,18 @@ def __init__(self, id=None, logger=None): self.ckpt_writes = {} # Did EarlyStopping stop this node? self.stopped_early = False + # Did the topN module find data for this Node? + self.has_data = True # Did training complete for this node? self.complete = False # Can disable logging here: self.verbose = True - self.debug(logger, "START: " + str(self)) + # self.debug(logger, "START: " + str(self)) def set_id(self, id, logger=None): self.id = id self.stage = (len(self.id) - 1) // 2 - self.debug(logger, "SET ID: " + id) + # self.debug(logger, "SET ID: " + id) def new_segment(self): self.segment += 1 @@ -82,10 +84,13 @@ def __repr__(self): def __str__(self): special = "" - if not self.complete: - special = " INCOMPLETE!" - if self.stopped_early: - special = " EARLY STOP!" + if not self.has_data: + special = "NO DATA!" + else: + if not self.complete: + special = " INCOMPLETE!" + if self.stopped_early: + special = " EARLY STOP!" return "Node [%s]: %s (epochs=%i/%s, loss=%s, val_loss=%s)%s" % ( Node.maybe_str_integer(self.stage), self.id, @@ -99,17 +104,19 @@ def __str__(self): def str_table(self): """Like str() but uses fixed-width fields.""" special = "" + if not self.has_data: + return "%-13s : %i NO-DATA" % (self.id, self.stage) if not self.complete: special = " INCOMPLETE!" if self.stopped_early: special = " EARLY STOP!" 
- return "%-13s : %i : %2i / %2i : %s - %s : %s : %s" % ( + return "%-13s : %i : %2s / %2s : %s - %s : %s : %s" % ( self.id, self.stage, - self.epochs_actual, - self.epochs_planned, - self.date_start, - self.date_stop, + Node.maybe_str_integer(self.epochs_actual), + Node.maybe_str_integer(self.epochs_planned), + str(self.date_start), + str(self.date_stop), self.str_errors(), special, ) @@ -176,7 +183,7 @@ def parse_model_write(self, line, logger=None): def stop_early(self, logger=None): self.stopped_early = True - self.debug(logger, "STOP EARLY") + self.trace(logger, "STOP EARLY") def parse_date_start(self, line): tokens = line.split() @@ -186,11 +193,11 @@ def parse_date_stop(self, line, logger=None): tokens = line.split() self.date_stop = tokens[0] + " " + tokens[1] if self.epochs_planned is None: - self.debug(logger, "STOP : epochs_planned=None") + self.trace(logger, "STOP : epochs_planned=None") return if self.epochs_actual == self.epochs_planned or self.stopped_early: self.complete = True - self.debug(logger, "COMPLETE") + self.trace(logger, "COMPLETE") def parse_training_done(self, line, logger=None): # The current epoch should already be set diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index ebd58f7e..55c1fc0f 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -67,8 +67,6 @@ def parse_logs(log_files): def parse_log(log_fp, nodes): nodes_found = 0 node_current = None - # Temporary way to capture build DF time, which happens before - # node_current is defined. Fixing log format to address this. 2021-11-28 build_df = None while True: @@ -76,42 +74,29 @@ def parse_log(log_fp, nodes): # print(line) if line == "": break - if "DONE: run_id" in line: + if line.startswith("data_setup.pre_run"): + if "node:" in line: + tokens = line.split() + node_id = tokens[-2].strip() + node_current = get_node(nodes, node_id, logger) + elif "DONE: run_id" in line: # This is also a MODEL RUNNER line, # but could be DEBUG or INFO # (should be INFO in future) if node_current is None: # Restarted node with no epochs remaining: continue - logger.info("RUN DONE.") + trace("RUN DONE.") node_current.parse_date_stop(line, logger) elif "MODEL RUNNER" in line: # print(line.strip()) - if "DEBUG" in line: - if "PARAM UPDATE START" in line: - logger.debug("New Node ...") - node_current = Node(logger=logger) - node_current.parse_date_start(line) - elif " node =" in line: - logger.info("start: " + line) - tokens = line.split() - node_id = tokens[-1].strip() - if node_id not in nodes: - node_current.set_id(node_id, logger) - nodes[node_id] = node_current - if build_df is not None: - node_current.build_df = build_df - build_df = None - else: - logger.debug("lookup: " + node_id) - node_current = nodes[node_id] - node_current.new_segment() - node_current.complete = False - elif " epochs =" in line: - if node_current is None: - # Restarted node with no epochs remaining: - continue - node_current.parse_epochs(line, logger) + if "PARAM UPDATE START" in line: + node_current.parse_date_start(line) + if " epochs =" in line: + if node_current is None: + # Restarted node with no epochs remaining: + continue + node_current.parse_epochs(line, logger) elif line.startswith("data_setup: build_dataframe() OK"): build_df = parse_build_df(line, logger) elif line.startswith("Loaded from initial_weights"): @@ -124,21 +109,43 @@ def parse_log(log_fp, nodes): node_current.parse_training_done(line, 
logger) elif line.startswith("model wrote:"): node_current.parse_model_write(line, logger) + elif "topN_NoDataException" in line: + node_current.has_data = False elif "early stopping" in line: - if node_current is not None: - # TensorFlow may report early stopping even if at max epochs: - node_current.stop_early() + if not "setting early stopping patience" in line: + if node_current is not None: + # TensorFlow may report early stopping even if at max epochs: + node_current.stop_early() if node_current is not None and node_current.complete: # Store a complete Node in global dict nodes - logger.info("node done.") + trace("node done.") # find_val_data(node_current) # old format? parse_python_log(node_current) + # print(Node.str_table(node_current)) nodes_found += 1 node_current = None + # exit(0) logger.info("Found %i nodes in log." % nodes_found) +def get_node(nodes, node_id, logger): + + if "'" in node_id: + node_id = node_id.replace("'", "") + if node_id not in nodes: + trace("NEW: " + node_id) + result = Node(logger=logger) + result.set_id(node_id, logger) + nodes[node_id] = result + else: + trace("lookup: " + node_id) + result = nodes[node_id] + result.new_segment() + result.complete = False + return result + + def parse_build_df(line, logger=None): tokens = line.split() assert len(tokens) == 6 diff --git a/workflows/cp-leaveout/scripts/print-node-info.py b/workflows/cp-leaveout/scripts/print-node-info.py index 95c4cb4f..54f3c2ac 100644 --- a/workflows/cp-leaveout/scripts/print-node-info.py +++ b/workflows/cp-leaveout/scripts/print-node-info.py @@ -9,6 +9,8 @@ from utils import fail parser = argparse.ArgumentParser(description="Print Node info stats") +parser.add_argument("--count", "-c", action="store_true", + help="Simply count the nodes") parser.add_argument("directory", help="The experiment directory (EXPID)") parser.add_argument("nodes", default="", @@ -37,6 +39,7 @@ def print_all(data): count = 0 earlies = 0 for node in data.values(): + # print(node.id) print(node.str_table()) count += 1 if node.stopped_early: @@ -54,6 +57,10 @@ def print_selected(data, nodes): print(node.str_table()) +if args.count: + print(len(data)) + exit(0) + if args.nodes == "": print_all(data) else: From 424d085ce87f37e239c512b2a9270e1edc707a4d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 22 May 2023 11:54:20 -0500 Subject: [PATCH 538/601] Better logging --- workflows/cp-leaveout/py/data_setup.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 0c7d530c..9e14c659 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -80,7 +80,7 @@ def setup_local_fs(params): def pre_run(params): global logger - logger.info("pre_run(): node: '%s' ..." % params["node"]) + logger.info("PRE_RUN node: %s ..." 
% params["node"]) # softlink to cache & config file # build node specific training/validation dataset @@ -129,11 +129,11 @@ def pre_run(params): # sys.stdout.flush() args.output = out_orig else: - print("data_setup: dataframe exists: %s" % - os.path.realpath(args.output)) + logger.info("data_setup: dataframe exists: %s" % + os.path.realpath(args.output)) except topN_to_uno.topN_NoDataException: - print("data_setup: caught topN_NoDataException: SKIP " + - "for node: '%s'" % params["node"]) + logger.info("data_setup: topN_NoDataException: SKIP " + + "node: %s" % params["node"]) # sys.stdout.flush() directory = params["instance_directory"] with open(directory + "/NO-DATA.txt", "a") as fp: @@ -152,7 +152,7 @@ def pre_run(params): traceback.print_exc(file=sys.stdout) sys.stdout.flush() return ModelResult.ERROR - logger.info("data_setup.pre_run() done.") + logger.info("PRE_RUN done.") # sys.stdout.flush() return ModelResult.SUCCESS From b2ada6eb4a4c41b48413789214eaaf2eacbdd001 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 22 May 2023 11:55:49 -0500 Subject: [PATCH 539/601] Merge --- workflows/cp-leaveout/scripts/extract-node-info.sh | 6 +++--- workflows/dense-noise/test/cfg-sys-small.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.sh b/workflows/cp-leaveout/scripts/extract-node-info.sh index afd7c06f..c4243865 100755 --- a/workflows/cp-leaveout/scripts/extract-node-info.sh +++ b/workflows/cp-leaveout/scripts/extract-node-info.sh @@ -36,10 +36,10 @@ $SUPERVISOR/scripts/shrink-logs.sh $DIR { for RESTART in ${RESTARTS[@]} do - echo $RESTART/out/summary-*.txt + find $RESTART/out -name summary- done - echo $DIR/out/summary-*.txt -} | fmt -w 1 > $LOG_LIST + find $DIR/out -name summary- +} > $LOG_LIST # | fmt -w 1 export PYTHONPATH+=:$SUPERVISOR/workflows/common/python diff --git a/workflows/dense-noise/test/cfg-sys-small.sh b/workflows/dense-noise/test/cfg-sys-small.sh index 874c7a8d..12948386 100644 --- a/workflows/dense-noise/test/cfg-sys-small.sh +++ b/workflows/dense-noise/test/cfg-sys-small.sh @@ -9,7 +9,7 @@ export PROCS=${PROCS:-2} # MPI processes per node export PPN=${PPN:-2} -export WALLTIME=${WALLTIME:-00:60:00} +export WALLTIME=${WALLTIME:-00:05:00} # CANDLE@ALCF: # export PROJECT=CSC249ADOA01 From 03810575b1c964270e3c550f73fd25de6ea6aa14 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 23 May 2023 15:17:15 -0500 Subject: [PATCH 540/601] Set default timeout --- workflows/cmp-cv/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh index 15342bb2..e235378b 100755 --- a/workflows/cmp-cv/swift/workflow.sh +++ b/workflows/cmp-cv/swift/workflow.sh @@ -51,7 +51,7 @@ log_path PYTHONPATH export TURBINE_JOBNAME="CMP_${EXPID}" export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} -export BENCHMARK_TIMEOUT +export BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-21600} # 6 hours PLAN="PLAN_NOT_DEFINED" CMD_LINE_ARGS=( -expid=$EXPID -benchmark_timeout=$BENCHMARK_TIMEOUT From bf7432d6e5f678b7274c85e97f55ac234f44a7a5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 23 May 2023 22:10:02 -0500 Subject: [PATCH 541/601] New Swift/T --- workflows/common/sh/env-polaris.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-polaris.sh b/workflows/common/sh/env-polaris.sh index 2eb56929..14222f73 100644 --- a/workflows/common/sh/env-polaris.sh +++ 
b/workflows/common/sh/env-polaris.sh @@ -6,7 +6,7 @@ CANDLE_MODEL_IMPL=app CSC249=/lus/grand/projects/CSC249ADOA01 ROOT=$CSC249/public/sfw/polaris -SWIFT=$ROOT/swift-t/2023-04-19 +SWIFT=$ROOT/swift-t/2023-05-23 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH From 1e986ec84fe98cabb537fb9acabf955396f80b96 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 23 May 2023 22:10:08 -0500 Subject: [PATCH 542/601] Update header --- workflows/common/sh/sched-polaris.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/sched-polaris.sh b/workflows/common/sh/sched-polaris.sh index 721a490d..557a6015 100644 --- a/workflows/common/sh/sched-polaris.sh +++ b/workflows/common/sh/sched-polaris.sh @@ -1,5 +1,5 @@ -# SCHED Summit +# SCHED Polaris # Scheduler settings for Swift/T/PBS/Polaris From 89e744d5857abf9ec7e4721a096f8e10300243a5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 23 May 2023 22:10:19 -0500 Subject: [PATCH 543/601] Update to new names --- workflows/common/swift/model_container.swift | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/common/swift/model_container.swift b/workflows/common/swift/model_container.swift index c017c955..fe3e228e 100644 --- a/workflows/common/swift/model_container.swift +++ b/workflows/common/swift/model_container.swift @@ -12,7 +12,7 @@ run_id : A string run ID that will be the output directory name model_name : A path to a SIF */ -(string obj_result) candle_model_train(string params, +(string model_result) candle_model_train(string params, string expid, string runid, string model_name) @@ -22,17 +22,17 @@ model_token = rootname_string(basename_string(model_name)); outdir = "%s/%s/Output/%s/%s" % (CDD, model_token, expid, runid); - printf("obj_container(): running in: %s", outdir); + printf("candle_model_train_container(): running in: %s", outdir); // We do not use a file type here because this file may not be created, // which is handled by get_results() result_file = outdir/"result.txt"; wait (run_model_train(model_sh, params, expid, runid, model_name)) { - obj_result = get_results(result_file); + model_result = get_results(result_file); } - printf("model_train_container(): result(%s): '%s'", - runid, obj_result); + printf("candle_model_train_container(): result(%s): '%s'", + runid, model_result); } /** From f8c95f47ce94863ef7adee2a931b22d559a495ea Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 23 May 2023 22:11:32 -0500 Subject: [PATCH 544/601] Back to CANDLE_MODEL_IMPL="container" --- workflows/dense-noise/swift/workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/dense-noise/swift/workflow.sh b/workflows/dense-noise/swift/workflow.sh index 19702543..f8d80b64 100755 --- a/workflows/dense-noise/swift/workflow.sh +++ b/workflows/dense-noise/swift/workflow.sh @@ -90,7 +90,8 @@ mkdir -pv $TURBINE_OUTPUT/data # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -CANDLE_MODEL_IMPL="py" +# CANDLE_MODEL_IMPL: "container" on Polaris, "py" on Summit/Frontier +CANDLE_MODEL_IMPL="container" OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh From 36ba170b9d2096ea05e023239fa8938c7e47cf0a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 14:21:02 -0500 Subject: [PATCH 545/601] Merge --- workflows/cmp-cv/py/compare.py | 13 +++++++++++++ workflows/cmp-cv/test/upf-1.txt | 9 
+++++++++ 2 files changed, 22 insertions(+) diff --git a/workflows/cmp-cv/py/compare.py b/workflows/cmp-cv/py/compare.py index 5f051ade..b735fc2b 100644 --- a/workflows/cmp-cv/py/compare.py +++ b/workflows/cmp-cv/py/compare.py @@ -12,11 +12,14 @@ """ import os +import sys import pandas as pd import pandas as pd import numpy as np from sklearn.metrics import mean_squared_error +CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") + conditions = pd.DataFrame( [['nAromAtom', 5, 10], ['nAtom', 20, 50], ['BertzCT', 800, 1000]], columns=['prop', 'low', 'high']) @@ -39,6 +42,16 @@ def compare(model_name, exp_id, run_id): directory = outdir # directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" print("reading the predictions....") + + gParams = read_params(exp_id, run_id) + model_name = gParams("model_name") + print(f"compare: model_name={model_name} exp_id={exp_id} run_id={run_id}") + + sys.stdout.flush() + + return + + directory = f"{CANDLE_DATA_DIR}/{model_name}/Output/{exp_id}/{run_id}" df_res = pd.read_csv(f"{directory}/test_predictions.csv") # a class to calculate errors for subsets of the validation/test set diff --git a/workflows/cmp-cv/test/upf-1.txt b/workflows/cmp-cv/test/upf-1.txt index 07190f70..24e024bb 100644 --- a/workflows/cmp-cv/test/upf-1.txt +++ b/workflows/cmp-cv/test/upf-1.txt @@ -1,3 +1,4 @@ + {"id": "RUN001", "epochs": 1, "model_name": "DrugCell", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/DrugCell.sif"} {"id": "RUN002", "epochs": 2, "model_name": "DrugCell", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/DrugCell.sif"} {"id": "RUN003", "epochs": 1, "model_name": "SWnet", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} @@ -16,3 +17,11 @@ #{"id": "RUN004", "epochs": 50, "model_name": "GraphDRP", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} #{"id": "RUN004", "epochs": 50, "model_name": "DeepTTC", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} #DeepTTC + +# # NOTE: Everything after "candle_image" is stripped! 
+# # Insert new parameters before "candle_image" +# {"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} +# {"id": "RUN001", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} +# {"id": "RUN002", "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +# {"id": "RUN003", "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +# {"id": "RUN004", "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} From a9ecbc7d4857990b64d475ad71ddcff091dfd300 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 14:21:25 -0500 Subject: [PATCH 546/601] Set CANDLE_MODEL_IMPL --- workflows/mlrMBO/swift/workflow.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index db3f033c..957da268 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -113,6 +113,7 @@ cp $WORKFLOWS_ROOT/common/R/$R_FILE $PARAM_SET_FILE $CFG_SYS $CFG_PRM $TURBINE_O mkdir -pv $TURBINE_OUTPUT/run # Allow the user to set an objective function +CANDLE_MODEL_IMPL=${CANDLE_MODEL_IMPL:-container} OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function From 2c4e0cb4df8048ad18daf8334d8f18fce9ca5eea Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 14:36:57 -0500 Subject: [PATCH 547/601] Reduce get_expid arguments --- workflows/mlrMBO/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 957da268..8f53b6b5 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -49,7 +49,7 @@ then fi get_site $1 # Sets SITE -get_expid $2 $CANDLE_MODEL_TYPE # Sets EXPID +get_expid $2 # Sets EXPID get_cfg_sys $3 get_cfg_prm $4 MODEL_NAME=$5 From f4471096ccd506702c3ef5a2815e80b5adc54f4e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 14:37:38 -0500 Subject: [PATCH 548/601] Improve error message --- workflows/mlrMBO/test/cfg-prm-1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/test/cfg-prm-1.sh b/workflows/mlrMBO/test/cfg-prm-1.sh index fd46deb7..d8209561 100644 --- a/workflows/mlrMBO/test/cfg-prm-1.sh +++ b/workflows/mlrMBO/test/cfg-prm-1.sh @@ -28,7 +28,7 @@ fi if [[ "${PARAM_SET_FILE:-}" == "" ]]; then # PARAM_SET_FILE must be set before this script returns! 
- echo "Invalid model-" "'${MODEL_NAME:-}'" + echo "Cannot set PARAM_SET_FILE: unknown model: '${MODEL_NAME:-}'" exit 1 fi set +x From 95b98ca60ac57f0b087cd48869d3e410a6bc429f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 14:37:52 -0500 Subject: [PATCH 549/601] Fix usage message --- workflows/mlrMBO/test/test-1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/test/test-1.sh b/workflows/mlrMBO/test/test-1.sh index 029c54f9..abbce404 100755 --- a/workflows/mlrMBO/test/test-1.sh +++ b/workflows/mlrMBO/test/test-1.sh @@ -5,7 +5,7 @@ set -eu usage() { - echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo "Usage: test MODEL_NAME SITE RUN_DIR(optional)" echo " RUN_DIR is optional, use -a for automatic" } From 48417b8fe9d3174cee8ba0d45d1deb1696d5a426 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 14:38:55 -0500 Subject: [PATCH 550/601] Set PARAM_SET_FILE for now --- workflows/mlrMBO/test/test-1.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/mlrMBO/test/test-1.sh b/workflows/mlrMBO/test/test-1.sh index abbce404..49038e1e 100755 --- a/workflows/mlrMBO/test/test-1.sh +++ b/workflows/mlrMBO/test/test-1.sh @@ -32,6 +32,8 @@ WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) source $WORKFLOWS_ROOT/common/sh/utils.sh # Select configurations +# Temporarily hard-coding to graphdrp: +export PARAM_SET_FILE=$THIS/../data/graphdrp_small.R export CFG_SYS=$THIS/cfg-sys-1.sh export CFG_PRM=$THIS/cfg-prm-1.sh From feff4e2bac96e1fd879133aec2270d31149c6feb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 14:40:47 -0500 Subject: [PATCH 551/601] Rename to MODEL_RETURN --- workflows/mlrMBO/test/test-1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/test/test-1.sh b/workflows/mlrMBO/test/test-1.sh index 49038e1e..22170c6f 100755 --- a/workflows/mlrMBO/test/test-1.sh +++ b/workflows/mlrMBO/test/test-1.sh @@ -45,7 +45,7 @@ export R_FILE=mlrMBO-ils.R # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported -export OBJ_RETURN="val_loss" +export MODEL_RETURN="val_loss" if [[ $SITE == "theta" ]] then From fac09e42da7840d98267ebe078dcb6c1e0f49ed3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 14:53:12 -0500 Subject: [PATCH 552/601] Provide PARAM_SET_FILE=graphdrp_small.R --- workflows/mlrMBO/test/test-graphdrp-lambda0.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh index 00d55539..8ae89eee 100755 --- a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh +++ b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh @@ -32,6 +32,7 @@ WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) source $WORKFLOWS_ROOT/common/sh/utils.sh # Select configurations +export PARAM_SET_FILE=graphdrp_small.R export CFG_SYS=$THIS/cfg-sys-nightly.sh export CFG_PRM=$THIS/cfg-prm-nightly.sh @@ -52,7 +53,9 @@ export CANDLE_IMAGE="/software/improve/images/GraphDRP.sif" export INIT_PARAMS_FILE="/software/improve/graphdrp_default_model.txt" # Submit job -$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE # Check job output TURBINE_OUTPUT=$( readlink turbine-output ) From 1369a4fcb13c68d66913241aaa1acba1081451ee Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 15:07:28 -0500 Subject: [PATCH 553/601] Rename to MODEL_RETURN --- workflows/mlrMBO/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 8f53b6b5..8e8e69ad 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -179,7 +179,7 @@ swift-t -O 0 -n $PROCS \ -e EMEWS_PROJECT_ROOT \ $( python_envs ) \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ - -e OBJ_RETURN \ + -e MODEL_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ -e MODEL_SH \ From 39c9dc2c331b4e5a1626106cac2e6d5a7648a273 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 15:09:37 -0500 Subject: [PATCH 554/601] Set MODEL_RETURN --- workflows/mlrMBO/test/test-graphdrp-lambda0.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh index 8ae89eee..9ac8ccb8 100755 --- a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh +++ b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh @@ -41,7 +41,7 @@ export R_FILE=mlrMBO-mbo.R # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported -export OBJ_RETURN="val_loss" +export MODEL_RETURN="val_loss" if [[ $SITE == "theta" ]] then From b7c8428777da3857703965ec4d5563bcfca0f921 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 15:24:17 -0500 Subject: [PATCH 555/601] Auto-set CANDLE_MODEL_IMPL="container" when CANDLE_MODEL_TYPE==SINGULARITY --- workflows/mlrMBO/swift/workflow.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 8e8e69ad..b38adf00 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -113,6 +113,10 @@ cp $WORKFLOWS_ROOT/common/R/$R_FILE $PARAM_SET_FILE $CFG_SYS $CFG_PRM $TURBINE_O mkdir -pv $TURBINE_OUTPUT/run # Allow the user to set an objective function +if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] +then + CANDLE_MODEL_IMPL="container" +fi CANDLE_MODEL_IMPL=${CANDLE_MODEL_IMPL:-container} OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} From dc4abf1bd2f7bf9be69d1abdabd29da00454bd4e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 24 May 2023 15:28:28 -0500 Subject: [PATCH 556/601] Readability improvement; drop TURBINE_STDOUT for now --- workflows/mlrMBO/swift/workflow.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index b38adf00..8e63108c 100755 --- 
a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -118,8 +118,8 @@ then CANDLE_MODEL_IMPL="container" fi CANDLE_MODEL_IMPL=${CANDLE_MODEL_IMPL:-container} -OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} +SWIFT_LIBS_DIR=${SWIFT_LIBS_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_MODULE=${SWIFT_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function # Andrew: Allows for custom model.sh file, if that's desired export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} @@ -168,20 +168,21 @@ fi # on Biowulf. Reported by ALW 2021-01-21 ( + PY_ENVS=$( python_envs ) set -x swift-t -O 0 -n $PROCS \ -o $TURBINE_OUTPUT/workflow.tic \ ${MACHINE:-} \ -p -I $EQR -r $EQR \ - -I $OBJ_DIR \ - -i $OBJ_MODULE \ + -I $SWIFT_LIBS_DIR \ + -i $SWIFT_MODULE \ -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ -e APP_PYTHONPATH \ -e BENCHMARKS_ROOT \ -e EMEWS_PROJECT_ROOT \ - $( python_envs ) \ + $PY_ENVS \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ -e MODEL_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ @@ -197,8 +198,9 @@ swift-t -O 0 -n $PROCS \ -e CANDLE_MODEL_TYPE \ -e CANDLE_IMAGE \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} ) 2>&1 | \ - tee $STDOUT + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} ) +# 2>&1 | \ +# tee $STDOUT if (( ${PIPESTATUS[0]} )) then From aa09e78442a40a8e6b7c948202e1a1f5475ba417 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 25 May 2023 12:54:38 -0500 Subject: [PATCH 557/601] Change back to IMPROVE_RESULT for now --- workflows/common/sh/model.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 1f816251..93f4baf8 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -30,7 +30,6 @@ usage() echo "If SH_TIMEOUT is set, we run under the shell command timeout" } -set -x if (( ${#} != 7 )) then echo @@ -166,7 +165,7 @@ then sleep 1 # Wait for initial output # Get last results of the format "CANDLE_RESULT xxx" in model.log # NOTE: Enabling set -x will break the following (token CANDLE_RESULT) - RES=$( awk -v FS="CANDLE_RESULT" 'NF>1 {x=$2} END {print x}' \ + RES=$( awk -v FS="IMPROVE_RESULT" 'NF>1 {x=$2} END {print x}' \ $INSTANCE_DIRECTORY/model.log ) RESULT="$(echo $RES | grep -Eo '[+-]?[0-9]+([.][0-9]+)?')" || true echo "CANDLE RESULT: '$RESULT'" From ddbbdacaae8a2c7e55fa5f464ddce357a8bb4688 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 26 May 2023 13:09:19 -0500 Subject: [PATCH 558/601] Fix experiments directory for Singularity runs --- workflows/common/sh/utils.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 9e21b61f..d10ef7b6 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -185,7 +185,9 @@ get_expid() if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] then - EXPERIMENTS=${EXPERIMENTS:-$CANDLE_DATA_DIR/$MODEL_NAME/Output} + # Keep this directory in sync with model.sh RUN_DIRECTORY + MODEL_TOKEN=$( basename $MODEL_NAME .sif ) + EXPERIMENTS=$CANDLE_DATA_DIR/$MODEL_TOKEN/Output else # "BENCHMARKS" EXPERIMENTS=${EXPERIMENTS:-$EMEWS_PROJECT_ROOT/experiments} fi From 6d62bddc3475095ddbf32dfd7f2716e40affff36 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 26 May 2023 13:15:17 -0500 
Subject: [PATCH 559/601] Settings for Lambda7 --- workflows/common/sh/env-lambda7.sh | 29 ++++++++++++++++++++++++++++ workflows/common/sh/sched-lambda7.sh | 4 ++++ 2 files changed, 33 insertions(+) create mode 100644 workflows/common/sh/env-lambda7.sh create mode 100644 workflows/common/sh/sched-lambda7.sh diff --git a/workflows/common/sh/env-lambda7.sh b/workflows/common/sh/env-lambda7.sh new file mode 100644 index 00000000..ba249349 --- /dev/null +++ b/workflows/common/sh/env-lambda7.sh @@ -0,0 +1,29 @@ + +# ENV Lambda +# Environment settings for Lambda (Swift, Python, R, Tcl, etc.) + +# Everything is installed in here: +SFW=/homes/woz/Public/sfw + +SWIFT=$SFW/swift-t/2023-05-26 +PY=$SFW/Miniconda +# EQPY=$SFW/EQ-Py +EQR=$SFW/EQ-R +R=$SFW/R-4.1.0 + +PATH=$SWIFT/stc/bin:$PATH +PATH=$PY/bin:$PATH + +export LD_LIBRARY_PATH=$R/lib/R/lib:${LD_LIBRARY_PATH:-} + +# How to run CANDLE models: +CANDLE_MODEL_IMPL="app" + +# PYTHONPATH=$EQPY/src:${PYTHONPATH:-} + +# Log settings to output +echo "Programs:" +which python swift-t | nl +# Cf. utils.sh +show PYTHONHOME +log_path LD_LIBRARY_PATH diff --git a/workflows/common/sh/sched-lambda7.sh b/workflows/common/sh/sched-lambda7.sh new file mode 100644 index 00000000..d3a21667 --- /dev/null +++ b/workflows/common/sh/sched-lambda7.sh @@ -0,0 +1,4 @@ + +# SCHED LAMBDA7 + +# Empty- Lambda uses normal unscheduled mpiexec execution in Swift/T From 044ecb2eb9d629fc2fe121540cdaf724ab137797 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 26 May 2023 14:35:10 -0500 Subject: [PATCH 560/601] Fix syntax --- workflows/common/sh/utils.sh | 2 +- workflows/mlrMBO/swift/workflow.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index d10ef7b6..54f6d202 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -194,7 +194,7 @@ get_expid() local i=0 EXPS E TO - if [ $EXPID = "-a" ] + if [[ $EXPID == "-a" ]] then shift # Search for free experiment number diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index 8e63108c..50b749fe 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -41,7 +41,7 @@ else fi TURBINE_OUTPUT="" -if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] +if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] then TURBINE_OUTPUT=$CANDLE_DATA_DIR/output printf "Running mlrMBO workflow with model %s and image %s:%s\n" \ From 83b80d62747ec1f9e4a06d52e4c370fc96cf7ad2 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 26 May 2023 14:48:03 -0500 Subject: [PATCH 561/601] New test from Wilke --- workflows/mlrMBO/data/graphdrp.R | 13 ++++ workflows/mlrMBO/test/test-model-lambda.sh | 86 ++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 workflows/mlrMBO/data/graphdrp.R create mode 100755 workflows/mlrMBO/test/test-model-lambda.sh diff --git a/workflows/mlrMBO/data/graphdrp.R b/workflows/mlrMBO/data/graphdrp.R new file mode 100644 index 00000000..e496a74c --- /dev/null +++ b/workflows/mlrMBO/data/graphdrp.R @@ -0,0 +1,13 @@ + +# GraphDRP Hyperparameter Search - Test "small" +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + # makeIntegerParam("epochs", lower = 3, upper = 4), + makeIntegerParam("batch_size" , 
lower = 32 , upper = 2048 ), + makeNumericParam("learning_rate", lower = 0.000001, upper = 0.1) +) diff --git a/workflows/mlrMBO/test/test-model-lambda.sh b/workflows/mlrMBO/test/test-model-lambda.sh new file mode 100755 index 00000000..2f12516f --- /dev/null +++ b/workflows/mlrMBO/test/test-model-lambda.sh @@ -0,0 +1,86 @@ +#!/bin/bash +set -eu + +# MLRMBO TEST NIGHTLY + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR EXPERIMENT_PARAMATER_FILE" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 4 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + + + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-nightly.sh +export CFG_PRM=$THIS/cfg-prm-nightly.sh +export PARAM_SET_FILE=$4 + +# Move experiment config in place - is R file wtf +if [ -f $PARAM_SET_FILE ] +then + echo $WORKFLOWS_ROOT + echo $EMEWS_PROJECT_ROOT + FNAME=$( basename $PARAM_SET_FILE ) + cp $PARAM_SET_FILE $EMEWS_PROJECT_ROOT/data/$FNAME + PARAM_SET_FILE=$FNAME +fi + + +# Specify the R file for This file must be present in the $EMEWS_PROJECT_ROOT/R +export R_FILE=mlrMBO-mbo.R + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE="/software/improve/images/GraphDRP.sif" +export INIT_PARAMS_FILE="/software/improve/graphdrp_default_model.txt" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From 0fdd2ccb2769f50b6727be0954e8b439b3c1d1fc Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 11:34:39 -0500 Subject: [PATCH 562/601] Adding random_baseline_keras2.py --- models/Random/random_baseline_keras2.py | 91 +++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 models/Random/random_baseline_keras2.py diff --git a/models/Random/random_baseline_keras2.py b/models/Random/random_baseline_keras2.py new file mode 100644 index 00000000..1b6a555a --- /dev/null +++ b/models/Random/random_baseline_keras2.py @@ -0,0 +1,91 @@ +""" +SUPERVISOR MODEL RANDOM +Simply returns a random number in [0,10) as val_loss +""" + +import os + +import tensorflow as tf +import numpy as np + +import candle + +# file_path becomes the default location of the oned_default_model.txt file +file_path = os.path.dirname(os.path.realpath(__file__)) + + +class BenchmarkRandom(candle.Benchmark): + """ Our subclass implementation of a CANDLE Benchmark """ + def set_locals(self): + pass + +# In the initialize_parameters() method, we will instantiate the base +# class, and finally build an argument parser to recognize your customized +# parameters in addition to the default parameters.The initialize_parameters() +# method should 
return a python dictionary, which will be passed to the run() +# method. +def initialize_parameters(): + bmk = BenchmarkRandom( + # The path to this file needed to find default_model.txt: + file_path, + # The name of the default_model.txt file: + 'random_default_model.txt', + 'keras', # framework, choice is keras or pytorch + prog='random_baseline', # basename of the model + desc='Supervisor Benchmark Random') + + # Get the parameter dictionary built from + # random_default_model.txt and modified by any + # matching command line parameters: + gParameters = candle.finalize_parameters(bmk) + + return gParameters + + +def model_implementation(): + """ The implementation of the model w/o CANDLE conventions """ + import random + result = random.random() * 10 + return result + + +def run(params): + + result = model_implementation() + + print("IMPROVE_RESULT: " + str(result)) + + h = tf.keras.callbacks.History() + h.history.setdefault('val_loss') + + y_array = np.ndarray(2) + y_array.fill(result) + h.history['val_loss'] = y_array + return h + return { + "val_loss": result, + } # metrics is used by the supervisor when running + # HPO workflows (and possible future non HPO workflows) + + # Dumping results into file, workflow requirement + val_scores = { + 'key': 'val_loss', + 'value': metrics['val_loss'], + 'val_loss': metrics['val_loss'], + } + + with open(params['output_dir'] + "/scores.json", "w", + encoding="utf-8") as f: + json.dump(val_scores, f, ensure_ascii=False, indent=4) + + return metrics # metrics is used by the supervisor when running + # HPO workflows (and possible future non HPO workflows) + + +def main(): + params = initialize_parameters() + scores = run(params) + + +if __name__ == "__main__": + main() From 8fc43d74266a622a01db77a39c3d57c429b399f5 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 11:59:32 -0500 Subject: [PATCH 563/601] get_expid() takes 1 argument --- workflows/GA/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 4619ef0f..a0d50e4b 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -62,7 +62,7 @@ then fi get_site $1 # Sets SITE -get_expid $2 $CANDLE_MODEL_TYPE # Sets EXPID +get_expid $2 # Sets EXPID get_cfg_sys $3 get_cfg_prm $4 MODEL_NAME=$5 From 35622c63f32eb03dff92c5129346e1941787e667 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 15:37:54 -0500 Subject: [PATCH 564/601] Clean up --- workflows/GA/swift/workflow.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index a0d50e4b..1ad35201 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -8,12 +8,6 @@ set -eu # Autodetect this workflow directory export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) -if [[ ! 
-d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] -then - echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" - exit 1 -fi - export BENCHMARK_TIMEOUT SCRIPT_NAME=$(basename $0) @@ -92,7 +86,6 @@ then RESTART_NUMBER_ARG="--restart_number=$RESTART_NUMBER" fi - CMD_LINE_ARGS=( -ga_params=$PARAM_SET_FILE -seed=$SEED -ni=$NUM_ITERATIONS @@ -118,7 +111,6 @@ log_script #copy the configuration files to TURBINE_OUTPUT cp $WORKFLOWS_ROOT/common/python/$GA_FILE $PARAM_SET_FILE $INIT_PARAMS_FILE $CFG_SYS $CFG_PRM $TURBINE_OUTPUT - # Make run directory in advance to reduce contention mkdir -pv $TURBINE_OUTPUT/run From 92e45fe8db85193bb23f21831c632bb7134bc49a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 15:38:02 -0500 Subject: [PATCH 565/601] Allow this to be unset - user can set it --- workflows/GA/test/cfg-prm-1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/GA/test/cfg-prm-1.sh b/workflows/GA/test/cfg-prm-1.sh index f902d438..ce8c174c 100644 --- a/workflows/GA/test/cfg-prm-1.sh +++ b/workflows/GA/test/cfg-prm-1.sh @@ -36,7 +36,7 @@ elif [ "$MODEL_NAME" = "oned" ]; then # PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_param_space_ga.json} # elif [ "$MODEL_NAME" = "p2b1" ]; then # PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_param_space_ga.json} -elif [ "$PARAM_SET_FILE" != "" ]; then +elif [ "${PARAM_SET_FILE:-}" != "" ]; then PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} else echo "Invalid model-" $MODEL_NAME From df4f3acb6d753dfef31e51a87708c65618a0da1a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 15:45:34 -0500 Subject: [PATCH 566/601] Settings for Lambda7 --- workflows/common/sh/env-lambda7.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/common/sh/env-lambda7.sh b/workflows/common/sh/env-lambda7.sh index ba249349..b779666c 100644 --- a/workflows/common/sh/env-lambda7.sh +++ b/workflows/common/sh/env-lambda7.sh @@ -1,5 +1,5 @@ -# ENV Lambda +# ENV Lambda7 # Environment settings for Lambda (Swift, Python, R, Tcl, etc.) # Everything is installed in here: @@ -8,7 +8,7 @@ SFW=/homes/woz/Public/sfw SWIFT=$SFW/swift-t/2023-05-26 PY=$SFW/Miniconda # EQPY=$SFW/EQ-Py -EQR=$SFW/EQ-R +export EQR=$SFW/EQ-R R=$SFW/R-4.1.0 PATH=$SWIFT/stc/bin:$PATH From 8e40608aa96150f25367415f3964a689a5b31b34 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 15:47:08 -0500 Subject: [PATCH 567/601] Better structure when not using a Benchmark; add Random --- workflows/common/sh/set-pythonpath.sh | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/workflows/common/sh/set-pythonpath.sh b/workflows/common/sh/set-pythonpath.sh index fcacdd88..b2a028b8 100644 --- a/workflows/common/sh/set-pythonpath.sh +++ b/workflows/common/sh/set-pythonpath.sh @@ -11,6 +11,21 @@ # If MODEL_PYTHON_DIR is set, that is added to PYTHONPATH SUPERVISOR=$( cd $EMEWS_PROJECT_ROOT/../.. ; /bin/pwd ) + +# Set up Supervisor +export PYTHONPATH +PYTHONPATH+=:$SUPERVISOR/workflows/common/python +PYTHONPATH+=:$SUPERVISOR/models/OneD +PYTHONPATH+=:$SUPERVISOR/models/Random +PYTHONPATH+=:$SUPERVISOR/workflows/common/ext/EQ-Py + +# The remainder of this script sets up PYTHONPATHs +# for the CANDLE Benchmarks if they are found +if ! 
[[ -d $SUPERVISOR/../Benchmarks ]] +then + # The user must be running an external model or container + return +fi BENCHMARKS_DEFAULT=$( cd $SUPERVISOR/../Benchmarks ; /bin/pwd ) export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} @@ -25,12 +40,6 @@ fi # APP_PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common # PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common -# Set up Supervisor -export PYTHONPATH -PYTHONPATH+=:$SUPERVISOR/workflows/common/python -PYTHONPATH+=:$SUPERVISOR/models/OneD -PYTHONPATH+=:$SUPERVISOR/workflows/common/ext/EQ-Py - # Add known CANDLE Benchmarks to PYTHONPATH PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/P1B1 PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/Attn1 From 90ed6027804ca5047544f04e80f83fb6952440e6 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 15:47:26 -0500 Subject: [PATCH 568/601] Better messaging and comments --- workflows/common/sh/utils.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 54f6d202..d61a48c0 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -169,17 +169,15 @@ get_expid() { if (( ${#} != 1 )) then - echo "get_expid(): provide EXPID" + echo "get_expid(): provide EXPID or '-a'" return 1 fi export EXPID=$1 - echo "CANDLE_MODEL_TYPE is set to: ${CANDLE_MODEL_TYPE:=BENCHMARKS}" - - MODEL_NAME=${MODEL_NAME:-cmp} - - echo "MODEL_NAME is set to: ${MODEL_NAME}" + : ${CANDLE_MODEL_TYPE:=BENCHMARKS} ${MODEL_NAME:=cmp} + echo "get_expid(): CANDLE_MODEL_TYPE=$CANDLE_MODEL_TYPE" + echo "get_expid(): MODEL_NAME=$MODEL_NAME" export EXPERIMENTS="" @@ -226,7 +224,7 @@ get_expid() TO=$( readlink --canonicalize $TURBINE_OUTPUT ) if [[ $TO == "" ]] then - echo "Could not canonicalize: $TURBINE_OUTPUT" + echo "get_expid(): could not canonicalize: $TURBINE_OUTPUT" exit 1 fi export TURBINE_OUTPUT=$TO @@ -234,6 +232,7 @@ get_expid() next() # Obtain next available numbered file name matching pattern +# in global variable REPLY # E.g., 'next out-%02i' returns 'out-02' if out-00 and out-01 exist. 
{ local PATTERN=$1 FILE="" i=0 From 13d7d7235941cf857e1a95f7c7f07b24cc3eee61 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 15:47:39 -0500 Subject: [PATCH 569/601] Update names --- workflows/common/swift/model_app.swift | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/workflows/common/swift/model_app.swift b/workflows/common/swift/model_app.swift index 49cb955f..6e6473d1 100644 --- a/workflows/common/swift/model_app.swift +++ b/workflows/common/swift/model_app.swift @@ -12,10 +12,10 @@ runid : A string run ID that will be in the output directory name model_name : Benchmark (e.g., "uno") */ -(string obj_result) candle_model_train(string params, - string expid, - string runid, - string model_name) +(string model_result) candle_model_train(string params, + string expid, + string runid, + string model_name) { string model_sh = getenv("MODEL_SH"); @@ -26,16 +26,17 @@ outdir = "%s/%s" % (turbine_output, runid); // outdir = "%s/%s/Output/%s/%s" % (turbine_output, model_name, expid, runid); - printf("obj_app: running model shell script in: %s", outdir); + printf("candle_model_train_app(): running model shell in: %s", + outdir); // We do not use a file type here because this file may not be created, // which is handled by get_results() string result_file = outdir/"result.txt"; wait (run_model(model_sh, params, expid, runid)) { - obj_result = get_results(result_file); + model_result = get_results(result_file); } - printf("obj_app: result(%s): '%s'", runid, obj_result); + printf("candle_model_train_app: result(%s): '%s'", runid, model_result); } /** @@ -52,13 +53,13 @@ app (void o) run_model (string model_sh, string params, Extracts the Benchmark output if it exists, else, provides a NaN so the workflow can keep running */ -(string obj_result) get_results(string result_file) { +(string model_result) get_results(string result_file) { if (file_exists(result_file)) { file line = input(result_file); - obj_result = trim(read(line)); + model_result = trim(read(line)); } else { printf("File not found: %s", result_file); // return with a large value - obj_result = "1e7"; + model_result = "1e7"; } } From 8d7b0f2cf9c6877d7348f07b669b38ecc54ab320 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 15:47:57 -0500 Subject: [PATCH 570/601] New generic test for GA --- workflows/GA/test/test-bmk.sh | 69 +++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 workflows/GA/test/test-bmk.sh diff --git a/workflows/GA/test/test-bmk.sh b/workflows/GA/test/test-bmk.sh new file mode 100644 index 00000000..9e461a88 --- /dev/null +++ b/workflows/GA/test/test-bmk.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -eu + +# GA TEST BMK +# Runs any CANDLE Benchmark in using MODEL_IMPL="app" + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh +export PARAM_SET_FILE=graphdrp_param_space_ga.json + +# The python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="BENCHMARKS" +export CANDLE_IMAGE=NONE +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From 8a9dff04cb52c969fbd44a295df9daeaf25f12be Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 16:22:55 -0500 Subject: [PATCH 571/601] Clean up --- workflows/GA/swift/workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 1ad35201..51c8a2cd 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -163,7 +163,7 @@ then fi ( -set -x +# set -x which python swift-t swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ From 558af6ea3811d9305d131b02db3951bcbc5e1cce Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 16:23:21 -0500 Subject: [PATCH 572/601] Remove PARAM_SET_FILE --- workflows/GA/test/test-bmk.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 workflows/GA/test/test-bmk.sh diff --git a/workflows/GA/test/test-bmk.sh b/workflows/GA/test/test-bmk.sh old mode 100644 new mode 100755 index 9e461a88..e7fc81d6 --- a/workflows/GA/test/test-bmk.sh +++ b/workflows/GA/test/test-bmk.sh @@ -35,7 +35,7 @@ source $WORKFLOWS_ROOT/common/sh/utils.sh # Select configurations export CFG_SYS=$THIS/cfg-sys-1.sh export CFG_PRM=$THIS/cfg-prm-1.sh -export PARAM_SET_FILE=graphdrp_param_space_ga.json +# export PARAM_SET_FILE=graphdrp_param_space_ga.json # The python GA model exploration algorithm export GA_FILE=deap_ga.py From ae05b5b91c5742b949a42e31a8f654e6844c281f Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 16:25:01 -0500 Subject: [PATCH 573/601] Add nice output log --- models/OneD/oned_baseline_keras2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/models/OneD/oned_baseline_keras2.py b/models/OneD/oned_baseline_keras2.py index 66b755bb..7e0666d2 100644 --- a/models/OneD/oned_baseline_keras2.py +++ b/models/OneD/oned_baseline_keras2.py @@ -50,6 +50,8 @@ def func(x, n=1): # remove random part y = 0.02 * x + 0.5 * np.sin(1 * x + 0.1) + 0.75 * np.cos(0.25 * x - 0.3) + print("oned: f(x=%0.3f) => y=%0.3f" % (x, y)) + return y From 4a3ba556cd8652d5e4aec03cbcf74b649f8ca93c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 16:25:16 -0500 Subject: [PATCH 574/601] Add human-readable report at end --- workflows/common/python/deap_ga.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/workflows/common/python/deap_ga.py b/workflows/common/python/deap_ga.py index 550dd318..6240d275 100644 --- a/workflows/common/python/deap_ga.py +++ b/workflows/common/python/deap_ga.py 
@@ -1,7 +1,9 @@ import csv +from datetime import datetime import json import math import random +import sys import threading import time @@ -141,6 +143,12 @@ def run(): :param ga parameters file name: ga parameters file name (e.g., "ga_params.json") :param param_file: name of file containing initial parameters """ + start_time = time.time() + time_string = datetime.fromtimestamp(start_time) \ + .strftime("%Y-%m-%d %H:%M:%S") + print("deap_ga: START: " + time_string) + sys.stdout.flush() + eqpy.OUT_put("Params") params = eqpy.IN_get() @@ -179,7 +187,7 @@ def run(): # num_iter-1 generations since the initial population is evaluated once first mutpb = mut_prob - start_time = time.time() + if strategy == "simple": pop, log = algorithms.eaSimple( pop, @@ -216,6 +224,21 @@ def run(): fitnesses = [str(p.fitness.values[0]) for p in pop] + time_string = datetime.fromtimestamp(end_time) \ + .strftime("%Y-%m-%d %H:%M:%S") + print("deap_ga: STOP: " + time_string) + sys.stdout.flush() + + best_i = -1 + best_fitness = sys.float_info.max + for i in range(0, len(fitnesses)): + f = float(fitnesses[i]) + if f < best_fitness: + best_i = i + best_fitness = f + print("deap_ga: BEST: %f == %s" % (best_fitness, str(pop[i]))) + sys.stdout.flush() + eqpy.OUT_put("DONE") # return the final population eqpy.OUT_put("{}\n{}\n{}\n{}\n{}".format( From 17416256e1a04430fa9bb79a68b8a52f556362eb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 16:52:10 -0500 Subject: [PATCH 575/601] Prevent "SUCCESS" on job failure --- workflows/GA/swift/workflow.sh | 60 ++++++++++++++++------------------ 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 51c8a2cd..89df4d70 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -163,37 +163,35 @@ then fi ( -# set -x -which python swift-t -swift-t -O 0 -n $PROCS \ - ${MACHINE:-} \ - -p -I $EQPY -r $EQPY \ - -I $OBJ_DIR \ - -i $OBJ_MODULE \ - -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ - -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ - -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ - -e BENCHMARKS_ROOT \ - -e EMEWS_PROJECT_ROOT \ - $( python_envs ) \ - -e APP_PYTHONPATH \ - -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ - -e OBJ_RETURN \ - -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ - -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ - -e MODEL_SH \ - -e MODEL_NAME \ - -e SITE \ - -e BENCHMARK_TIMEOUT \ - -e SH_TIMEOUT \ - -e TURBINE_STDOUT \ - -e IGNORE_ERRORS \ - -e CANDLE_DATA_DIR \ - -e CANDLE_MODEL_TYPE \ - -e CANDLE_IMAGE \ - $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} | 2>&1 \ - tee $STDOUT + which python swift-t + swift-t -O 0 -n $PROCS \ + ${MACHINE:-} \ + -p -I $EQPY -r $EQPY \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + $( python_envs ) \ + -e APP_PYTHONPATH \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e MODEL_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e SH_TIMEOUT \ + -e TURBINE_STDOUT \ + -e IGNORE_ERRORS \ + -e CANDLE_DATA_DIR \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} ) if (( ${PIPESTATUS[0]} )) From 
68a8d0df89204a937d75fb21954f3845edbbff25 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 16:52:21 -0500 Subject: [PATCH 576/601] Clean up --- workflows/GA/swift/workflow.swift | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/GA/swift/workflow.swift b/workflows/GA/swift/workflow.swift index a0ca3930..2d0f8d01 100644 --- a/workflows/GA/swift/workflow.swift +++ b/workflows/GA/swift/workflow.swift @@ -16,8 +16,6 @@ import python; import candle_utils; report_env(); -python("import sys ; import csv ; import _csv ; print('HELLO') ; sys.stdout.flush()"); - string emews_root = getenv("EMEWS_PROJECT_ROOT"); string turbine_output = getenv("TURBINE_OUTPUT"); string resident_work_ranks = getenv("RESIDENT_WORK_RANKS"); From 5695aa6c5d7269e0f0f50edf2a38af5f72554535 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 17:10:47 -0500 Subject: [PATCH 577/601] Update names --- workflows/GA/swift/workflow.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 89df4d70..79b8df49 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -114,10 +114,13 @@ cp $WORKFLOWS_ROOT/common/python/$GA_FILE $PARAM_SET_FILE $INIT_PARAMS_FILE $CF # Make run directory in advance to reduce contention mkdir -pv $TURBINE_OUTPUT/run -# Allow the user to set an objective function -OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} -# This is used by the obj_app objective function +if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] +then + CANDLE_MODEL_IMPL="container" +fi +SWIFT_LIBS_DIR=${SWIFT_LIBS_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_MODULE=${SWIFT_MODULE:-model_$CANDLE_MODEL_IMPL} +# This is used by the candle_model_train_app function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh WAIT_ARG="" @@ -167,8 +170,8 @@ fi swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ -p -I $EQPY -r $EQPY \ - -I $OBJ_DIR \ - -i $OBJ_MODULE \ + -I $SWIFT_LIBS_DIR \ + -i $SWIFT_MODULE \ -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ From 04a1b7ff0640d394e730d03a8649b8af423acc63 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 17:12:14 -0500 Subject: [PATCH 578/601] Fix typo --- workflows/GA/test/test-bmk.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/GA/test/test-bmk.sh b/workflows/GA/test/test-bmk.sh index e7fc81d6..59499ec4 100755 --- a/workflows/GA/test/test-bmk.sh +++ b/workflows/GA/test/test-bmk.sh @@ -2,7 +2,7 @@ set -eu # GA TEST BMK -# Runs any CANDLE Benchmark in using MODEL_IMPL="app" +# Runs any CANDLE Benchmark using MODEL_IMPL="app" usage() { From d17d6471b39ee4a52c5b001993e458f511965915 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 30 May 2023 17:12:25 -0500 Subject: [PATCH 579/601] New GA test for SIFs --- workflows/GA/test/test-sif.sh | 68 +++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100755 workflows/GA/test/test-sif.sh diff --git a/workflows/GA/test/test-sif.sh b/workflows/GA/test/test-sif.sh new file mode 100755 index 00000000..ae1de8de --- /dev/null +++ b/workflows/GA/test/test-sif.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -eu + +# GA TEST SIF +# Runs any IMPROVE container + +usage() +{ + echo "Usage: test SIF SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 
)) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh +export PARAM_SET_FILE=graphdrp_param_space_ga.json + +# The Python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE=$MODEL_NAME +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From 0e8e840cab24429185194e2e3113301142af0de8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 31 May 2023 11:19:34 -0500 Subject: [PATCH 580/601] New param space for HiDRA --- workflows/GA/data/hidra_param_space.json | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 workflows/GA/data/hidra_param_space.json diff --git a/workflows/GA/data/hidra_param_space.json b/workflows/GA/data/hidra_param_space.json new file mode 100644 index 00000000..5dbd942e --- /dev/null +++ b/workflows/GA/data/hidra_param_space.json @@ -0,0 +1,23 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [16, 32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + } +] From 9cf04f2df9d6aa6f15aad48ab468e4d00b2d5b0b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 31 May 2023 14:08:34 -0500 Subject: [PATCH 581/601] Probably better error handling --- workflows/GA/swift/workflow.swift | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/GA/swift/workflow.swift b/workflows/GA/swift/workflow.swift index 2d0f8d01..ebbf5a08 100644 --- a/workflows/GA/swift/workflow.swift +++ b/workflows/GA/swift/workflow.swift @@ -3,6 +3,7 @@ * WORKFLOW.SWIFT */ +import assert; import io; import sys; import files; @@ -66,13 +67,14 @@ string FRAMEWORK = "keras"; } else if (params == "EQPY_ABORT") { - printf("EQPy Aborted"); + printf("EQPy aborted..."); string why = EQPy_get(ME); // TODO handle the abort if necessary // e.g. write intermediate results ... 
printf("%s", why) => v = propagate() => - c = false; + c = false => + assert(false, "EQPY aborted!"); } else { From ba12653405100c157ecbc94f95eb58be54aae3ac Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 31 May 2023 14:09:13 -0500 Subject: [PATCH 582/601] Better final output --- workflows/common/python/deap_ga.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/workflows/common/python/deap_ga.py b/workflows/common/python/deap_ga.py index 6240d275..c89d5be0 100644 --- a/workflows/common/python/deap_ga.py +++ b/workflows/common/python/deap_ga.py @@ -20,22 +20,26 @@ def obj_func(x): return 0 +# Produces something like: # {"batch_size":512,"epochs":51,"activation":"softsign", # "dense":"2000 1000 1000 500 100 50","optimizer":"adagrad","drop":0.1378, # "learning_rate":0.0301,"conv":"25 25 25 25 25 1"} -def create_list_of_json_strings(list_of_lists, super_delim=";"): +def create_list_of_json_strings(list_of_lists, super_delimiter=";"): # create string of ; separated jsonified maps - res = [] + result = [] global ga_params - for l in list_of_lists: - jmap = {} - for i, p in enumerate(ga_params): - jmap[p.name] = l[i] + for L in list_of_lists: + json_string = create_json_string(L) + result.append(json_string) + return super_delimiter.join(result) - jstring = json.dumps(jmap) - res.append(jstring) - return super_delim.join(res) +def create_json_string(L): + json_dict = {} + for i, p in enumerate(ga_params): + json_dict[p.name] = L[i] + result = json.dumps(json_dict) + return result def create_fitnesses(params_string): @@ -236,7 +240,7 @@ def run(): if f < best_fitness: best_i = i best_fitness = f - print("deap_ga: BEST: %f == %s" % (best_fitness, str(pop[i]))) + print("deap_ga: BEST: %s == %s" % (best_fitness, create_json_string(pop[i]))) sys.stdout.flush() eqpy.OUT_put("DONE") From e85c949214cc54f2f62a2fcf73287f009acf9dcb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 1 Jun 2023 11:09:07 -0500 Subject: [PATCH 583/601] Update header --- workflows/GA/test/cfg-sys-1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/GA/test/cfg-sys-1.sh b/workflows/GA/test/cfg-sys-1.sh index d9380d47..2bb49dee 100644 --- a/workflows/GA/test/cfg-sys-1.sh +++ b/workflows/GA/test/cfg-sys-1.sh @@ -1,5 +1,5 @@ # -# COMBO CFG SYS 1 +# GA CFG SYS 1 # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS From 16ffa52c7f941bc42058e7f72a6f3bad005122ee Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 1 Jun 2023 11:59:19 -0500 Subject: [PATCH 584/601] Add an iteration report --- workflows/common/python/deap_ga.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflows/common/python/deap_ga.py b/workflows/common/python/deap_ga.py index c89d5be0..c606ff3b 100644 --- a/workflows/common/python/deap_ga.py +++ b/workflows/common/python/deap_ga.py @@ -14,7 +14,7 @@ # list of ga_utils parameter objects ga_params = None - +iteration = 1 def obj_func(x): return 0 @@ -59,6 +59,9 @@ def queue_map(obj_func, pops): # [[a,b,c,d],[e,f,g,h],...] 
if not pops: return [] + global iteration + print("deap_ga: ITERATION: %i" % iteration) + sys.stdout.flush() eqpy.OUT_put(create_list_of_json_strings(pops)) result = eqpy.IN_get() split_result = result.split(";") From c1df72c1374db927312a794ce4cd98b56554fd7b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 1 Jun 2023 14:29:05 -0500 Subject: [PATCH 585/601] Merge --- models/OneD/oned_baseline_keras2.py | 2 + .../GA/data/graphdrp_param_space_ga.json | 2 +- workflows/GA/test/cfg-prm-1.sh | 10 ++- workflows/GA/test/cfg-sys-1.sh | 2 +- workflows/GA/test/test-graphdrp-lambda0.sh | 15 ++-- workflows/common/R/mlrMBO-mbo.R | 74 ++++++++++++++----- workflows/common/python/deap_ga.py | 50 +++++++++---- workflows/common/python/log_tools.py | 5 +- workflows/common/sh/model.sh | 2 +- workflows/mlrMBO/data/graphdrp_small.R | 6 +- workflows/mlrMBO/swift/workflow.swift | 16 ++-- .../mlrMBO/test/test-graphdrp-lambda0.sh | 13 ++-- 12 files changed, 129 insertions(+), 68 deletions(-) diff --git a/models/OneD/oned_baseline_keras2.py b/models/OneD/oned_baseline_keras2.py index 7e0666d2..e869029b 100644 --- a/models/OneD/oned_baseline_keras2.py +++ b/models/OneD/oned_baseline_keras2.py @@ -69,6 +69,8 @@ def run(params): x = params['x'] y = func(x) + print("IMPROVE_RESULT: " + str(y)) + print("returning training metrics: ", y) h = tf.keras.callbacks.History() diff --git a/workflows/GA/data/graphdrp_param_space_ga.json b/workflows/GA/data/graphdrp_param_space_ga.json index 3421cd63..ca3531a1 100644 --- a/workflows/GA/data/graphdrp_param_space_ga.json +++ b/workflows/GA/data/graphdrp_param_space_ga.json @@ -44,6 +44,6 @@ { "name": "epochs", "type": "constant", - "value": 20 + "value": 5 } ] diff --git a/workflows/GA/test/cfg-prm-1.sh b/workflows/GA/test/cfg-prm-1.sh index ce8c174c..3f216caa 100644 --- a/workflows/GA/test/cfg-prm-1.sh +++ b/workflows/GA/test/cfg-prm-1.sh @@ -4,14 +4,18 @@ SEED=${SEED:-1} # Total iterations -NUM_ITERATIONS=${NUM_ITERATIONS:-7} -# Size of GA population (i.e. the number of parameter sets to evaluate) -POPULATION_SIZE=${POPULATION_SIZE:-4} +NUM_ITERATIONS=${NUM_ITERATIONS:-5} +# Size of GA population +# (i.e. the number of parameter sets to evaluate per iteration) +POPULATION_SIZE=${POPULATION_SIZE:-8} # the GA strategy: one of 'simple' or 'mu_plus_lambda'. See # https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms # for more info. GA_STRATEGY=${STRATEGY:-simple} +# Polaris: +# Run HiDRA on 10 nodes for 3 hours for 20 epochs + INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} # TODO: move the following code to a utility library- diff --git a/workflows/GA/test/cfg-sys-1.sh b/workflows/GA/test/cfg-sys-1.sh index 2bb49dee..d11826e7 100644 --- a/workflows/GA/test/cfg-sys-1.sh +++ b/workflows/GA/test/cfg-sys-1.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-3} +export PROCS=${PROCS:-8} # MPI processes per node # Cori has 32 cores per node, 128GB per node diff --git a/workflows/GA/test/test-graphdrp-lambda0.sh b/workflows/GA/test/test-graphdrp-lambda0.sh index d2af52e8..599965a8 100755 --- a/workflows/GA/test/test-graphdrp-lambda0.sh +++ b/workflows/GA/test/test-graphdrp-lambda0.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -# MLRMBO TEST NIGHTLY +# GA TEST GRAPHDRP LAMBDA usage() { @@ -31,28 +31,25 @@ export EMEWS_PROJECT_ROOT WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) source $WORKFLOWS_ROOT/common/sh/utils.sh - # Select configurations export CFG_SYS=$THIS/cfg-sys-1.sh export CFG_PRM=$THIS/cfg-prm-1.sh +export PARAM_SET_FILE=graphdrp_param_space_ga.json # The python GA model exploration algorithm export GA_FILE=deap_ga.py # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported -export OBJ_RETURN="val_loss" - -if [[ $SITE == "theta" ]] -then - export WAIT=1 -fi +export MODEL_RETURN="val_loss" export CANDLE_MODEL_TYPE="SINGULARITY" export CANDLE_IMAGE="/software/improve/images/GraphDRP.sif" # Submit job -$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE # Check job output TURBINE_OUTPUT=$( readlink turbine-output ) diff --git a/workflows/common/R/mlrMBO-mbo.R b/workflows/common/R/mlrMBO-mbo.R index 9748269c..4a5869d7 100644 --- a/workflows/common/R/mlrMBO-mbo.R +++ b/workflows/common/R/mlrMBO-mbo.R @@ -1,6 +1,14 @@ + # mlrMBO EMEWS Algorithm Wrapper + set.seed(12345) + options(warn=2) + options(error=function()traceback(2)) - # mlrMBO EMEWS Algorithm Wrapper +options( + parallelMap.default.mode = "local", + parallelMap.default.cpus = 1, + parallelMap.default.show.info = TRUE +) emews_root <- Sys.getenv("EMEWS_PROJECT_ROOT") if (emews_root == "") { @@ -22,9 +30,12 @@ level = NA_character_, show.info = NA){ st = proc.time() - + print("parallelMap2() ...") + mode <- deparse(substitute(fun)) + print(paste0("mode: ", mode)) #For wrapFun do this: initdesign - if (deparse(substitute(fun)) == "wrapFun"){ + if (mode == "wrapFun"){ + print("wrapFun") dots <- list(...) string_params <- elements_of_lists_to_json(dots[[1L]]) # print(dots) @@ -37,15 +48,34 @@ # Assumes results are in the form a;b;c # Note: can also handle vector returns for each, # i.e., a,b;c,d;e,f + print(paste0("mlrMBO: received result: ", string_results)) res <- string_to_list_of_vectors(string_results) print(paste("mlrMBO: received result count:", length(res))) # using dummy time - return(result_with_extras_if_exist(res,st[3])) + extras = result_with_extras_if_exist(res,st[3]) + print(paste0("mlrMBO: extras: ", extras)) + return(extras) } - # For all other values of deparse(substitute(fun)) eg. proposePointsByInfillOptimization, doBaggingTrainIteration etc. - else{ - return(pm(fun, ..., more.args = more.args, simplify = simplify, use.names = use.names, impute.error = impute.error, - level = level, show.info = show.info)) + # For all other values of deparse(substitute(fun)) eg. + # proposePointsByInfillOptimization, doBaggingTrainIteration etc. + else { + print("pm() ...") + # tryCatch( + ## pm_out <- pm(fun, ..., more.args = more.args, simplify = simplify, + ## use.names = use.names, impute.error = impute.error, + ## level = level, show.info = show.info) + # , + # error=function(e){print(paste("CATCH: ", e))}) + dots <- list(...) + print(paste0("dots: ", dots)) + flush.console() + pm_out <- fun(opt.state=...) 
+## , more.args = more.args, simplify = simplify, +## use.names = use.names, impute.error = impute.error, +## level = level, show.info = show.info) + print(paste0("pm_out ...", pm_out)) + flush.console() + return(pm_out) } } @@ -64,7 +94,7 @@ # dummy objective function simple.obj.fun = function(x){} - main_function <- function(max.budget = 110, + main_function <- function(max.budget = 1000, max.iterations = 10, design.size=10, propose.points=10, @@ -80,9 +110,12 @@ propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax) ctrl = setMBOControlInfill(ctrl, - crit = makeMBOInfillCritCB(), - opt.restarts = 1, - opt.focussearch.points = 1000) + crit = crit.cb +# makeMBOInfillCritCB(), + # opt.restarts = 1 +# , +# opt.focussearch.points = 1000 +) ctrl = setMBOControlTermination(ctrl, max.evals = max.budget, iters = max.iterations) @@ -142,9 +175,12 @@ } # each discrete variable should be represented once, else optimization will fail # this checks if design size is less than max number of discrete values - print(paste0("design size=", design.size, " must be greater or equal to maximum discrete values=", max_val_discrete)) if (design.size < max_val_discrete){ - print("Aborting! design.size is less than the discrete parameters specified") + print(paste0("design size=", design.size, + " must be >= to maximum discrete values=", + max_val_discrete)) + print("Aborting! design.size < the discrete parameters specified") + flush.console() quit() } @@ -166,8 +202,10 @@ design = chkpntResults } # print(paste("design:", design)) - configureMlr(show.info = FALSE, show.learner.output = FALSE, on.learner.warning = "quiet") - res = mbo(obj.fun, design = design, learner = NULL, control = ctrl, show.info = TRUE) + configureMlr() + # show.info = FALSE, show.learner.output = FALSE, on.learner.warning = "quiet") + res = mbo(obj.fun, design = design, learner = NULL, control = ctrl, + show.info = TRUE) return(res) } @@ -178,7 +216,7 @@ # This is a string of R code containing arguments to main_function(), # e.g., "max.budget = 110, max.iterations = 10, design.size = 10, ..." 
msg <- IN_get() - print(paste("Received mlrMBO configuration parameters msg: ", msg)) + cat(paste0("Received mlrMBO configuration parameters msg: ", msg)) # Edit the R code to make a list constructor expression code = paste0("list(",msg,")") @@ -205,8 +243,10 @@ turbine_output <- Sys.getenv("TURBINE_OUTPUT") if (turbine_output != "") { + print(paste0("setwd(): ", turbine_output)) setwd(turbine_output) } + print("saving final_res.Rds ...") # This will be saved to experiment directory saveRDS(final_res,file = "final_res.Rds") diff --git a/workflows/common/python/deap_ga.py b/workflows/common/python/deap_ga.py index c606ff3b..9ca1d168 100644 --- a/workflows/common/python/deap_ga.py +++ b/workflows/common/python/deap_ga.py @@ -1,5 +1,4 @@ import csv -from datetime import datetime import json import math import random @@ -7,6 +6,7 @@ import threading import time +import log_tools import eqpy import ga_utils import numpy as np @@ -15,6 +15,8 @@ # list of ga_utils parameter objects ga_params = None iteration = 1 +logger = log_tools.get_logger(None, "DEAP") + def obj_func(x): return 0 @@ -34,11 +36,11 @@ def create_list_of_json_strings(list_of_lists, super_delimiter=";"): return super_delimiter.join(result) -def create_json_string(L): +def create_json_string(L, indent=None): json_dict = {} for i, p in enumerate(ga_params): json_dict[p.name] = L[i] - result = json.dumps(json_dict) + result = json.dumps(json_dict, indent=indent) return result @@ -60,10 +62,16 @@ def queue_map(obj_func, pops): if not pops: return [] global iteration - print("deap_ga: ITERATION: %i" % iteration) + iteration_start = time.time() + logger.info("ITERATION: %i START" % iteration) sys.stdout.flush() eqpy.OUT_put(create_list_of_json_strings(pops)) result = eqpy.IN_get() + duration = time.time() - iteration_start + logger.info("ITERATION: %i STOP. 
duration: %0.3f" % + (iteration, duration)) + sys.stdout.flush() + iteration += 1 split_result = result.split(";") # TODO determine if max'ing or min'ing and use -9999999 or 99999999 return [(float(x),) if not math.isnan(float(x)) else (float(99999999),) @@ -93,8 +101,9 @@ def parse_init_params(params_file): def update_init_pop(pop, params_file): - global ga_params - print("Reading initial population from {}".format(params_file)) + global ga_params, logger + logger.info("Reading initial population from {}".format(params_file)) + sys.stdout.flush() init_params = parse_init_params(params_file) if len(pop) > len(init_params): raise ValueError( @@ -147,21 +156,23 @@ def run(): :param num_pop: size of population :param seed: random seed :param strategy: one of 'simple', 'mu_plus_lambda' - :param ga parameters file name: ga parameters file name (e.g., "ga_params.json") + :param ga parameters file name: ga parameters file name + (e.g., "ga_params.json") :param param_file: name of file containing initial parameters """ + global logger start_time = time.time() - time_string = datetime.fromtimestamp(start_time) \ - .strftime("%Y-%m-%d %H:%M:%S") - print("deap_ga: START: " + time_string) + logger.info("OPTIMIZATION START") sys.stdout.flush() eqpy.OUT_put("Params") params = eqpy.IN_get() - # parse params + # Evaluate and log the params given by the workflow level: (num_iter, num_pop, seed, strategy, mut_prob, ga_params_file, param_file) = eval("{}".format(params)) + log_params(logger, num_iter, num_pop, seed) + random.seed(seed) global ga_params ga_params = ga_utils.create_parameters(ga_params_file) @@ -231,9 +242,7 @@ def run(): fitnesses = [str(p.fitness.values[0]) for p in pop] - time_string = datetime.fromtimestamp(end_time) \ - .strftime("%Y-%m-%d %H:%M:%S") - print("deap_ga: STOP: " + time_string) + logger.info("OPTIMIZATION STOP") sys.stdout.flush() best_i = -1 @@ -243,15 +252,24 @@ def run(): if f < best_fitness: best_i = i best_fitness = f - print("deap_ga: BEST: %s == %s" % (best_fitness, create_json_string(pop[i]))) + logger.info("BEST: %s == ...\n%s" % + (best_fitness, create_json_string(pop[i], indent=2))) sys.stdout.flush() eqpy.OUT_put("DONE") # return the final population - eqpy.OUT_put("{}\n{}\n{}\n{}\n{}".format( + eqpy.OUT_put("{}\n{}\n{}\n{}\n{}\n".format( create_list_of_json_strings(pop), ";".join(fitnesses), start_time, log, end_time, )) + + +def log_params(logger, num_iter, num_pop, seed): + logger.info("HPO PARAMS START") + logger.info("num_iter: %4i" % num_iter) + logger.info("num_pop: %4i" % num_pop) + logger.info("seed: %4i" % seed) + logger.info("HPO PARAMS STOP") diff --git a/workflows/common/python/log_tools.py b/workflows/common/python/log_tools.py index 0a3a5bf0..aa28e8eb 100644 --- a/workflows/common/python/log_tools.py +++ b/workflows/common/python/log_tools.py @@ -8,7 +8,10 @@ def get_logger(logger, name, stream=sys.stdout, milliseconds=False): - """Set up logging.""" + """ + Set up logging if necessary + If the caller's logger already exists, just return it + """ if logger is not None: return logger import logging diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index 93f4baf8..e0be51ba 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -25,7 +25,7 @@ usage() echo " for Singularity it is a script (e.g., 'ACTION.sh')" echo "The environment should have:" echo " EMEWS_PROJECT_ROOT|WORKFLOWS_ROOT TURBINE_OUTPUT" - echo " SITE OBJ_RETURN BENCHMARK_TIMEOUT" + echo " SITE MODEL_RETURN BENCHMARK_TIMEOUT" echo " 
CANDLE_DATA_DIR" echo "If SH_TIMEOUT is set, we run under the shell command timeout" } diff --git a/workflows/mlrMBO/data/graphdrp_small.R b/workflows/mlrMBO/data/graphdrp_small.R index 785d2bf1..b3e01509 100644 --- a/workflows/mlrMBO/data/graphdrp_small.R +++ b/workflows/mlrMBO/data/graphdrp_small.R @@ -8,8 +8,8 @@ param.set <- makeParamSet( # makeDiscreteParam("test_batch", values = c(8, 16)), - makeIntegerParam("epochs", lower = 3, upper = 4), + makeIntegerParam("epochs", lower = 1, upper = 6), # makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), - # makeNumericParam("dropout", lower = 0.1, upper = 0.5), - makeNumericParam("learning_rate", lower = 0.001, upper = 0.1) + makeNumericParam("dropout", lower = 0.1, upper = 0.5), + makeNumericParam("learning_rate", lower = 0.001, upper = 0.5) ) diff --git a/workflows/mlrMBO/swift/workflow.swift b/workflows/mlrMBO/swift/workflow.swift index d859cca9..161a1bdf 100644 --- a/workflows/mlrMBO/swift/workflow.swift +++ b/workflows/mlrMBO/swift/workflow.swift @@ -26,7 +26,7 @@ string turbine_output = getenv("TURBINE_OUTPUT"); string resident_work_ranks = getenv("RESIDENT_WORK_RANKS"); string r_ranks[] = split(resident_work_ranks,","); int propose_points = toint(argv("pp", "3")); -int max_budget = toint(argv("mb", "110")); +int max_budget = toint(argv("mb", "1000")); int max_iterations = toint(argv("it", "5")); int design_size = toint(argv("ds", "10")); string param_set = argv("param_set_file"); @@ -34,7 +34,6 @@ string exp_id = argv("exp_id"); int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); string restart_file = argv("restart_file", "DISABLED"); string r_file = argv("r_file", "mlrMBO1.R"); - string model_name = getenv("MODEL_NAME"); string candle_model_type = getenv("CANDLE_MODEL_TYPE"); string candle_image = getenv("CANDLE_IMAGE"); @@ -44,8 +43,6 @@ string init_params_file = getenv("INIT_PARAMS_FILE"); printf("CANDLE mlrMBO Workflow"); printf("TURBINE_OUTPUT: " + turbine_output); - - string restart_number = argv("restart_number", "1"); string site = argv("site"); @@ -81,11 +78,11 @@ string FRAMEWORK = "keras"; } else if (params == "EQR_ABORT") { - printf("EQR aborted: see output for R error") => - string why = EQR_get(ME); - printf("%s", why) => + printf("EQR_ABORT: see output for R error") => + string why = EQR_get(ME); + printf("EQR_ABORT: R exception: %s", why) => // v = propagate(why) => - c = false; + c = false; } else { @@ -104,7 +101,8 @@ string FRAMEWORK = "keras"; } // These must agree with the arguments to the objective function in mlrMBO.R, -// except param.set.file is removed and processed by the mlrMBO.R algorithm wrapper. +// except param.set.file is removed and processed by the mlrMBO.R +// algorithm wrapper. 
string algo_params_template = """ param.set.file='%s', diff --git a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh index 9ac8ccb8..35cbbb50 100755 --- a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh +++ b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh @@ -43,14 +43,13 @@ export R_FILE=mlrMBO-mbo.R # val_loss (default) and val_corr are supported export MODEL_RETURN="val_loss" -if [[ $SITE == "theta" ]] -then - export WAIT=1 -fi +# export CANDLE_MODEL_TYPE="SINGULARITY" +# export CANDLE_IMAGE="/software/improve/images/GraphDRP.sif" +# export INIT_PARAMS_FILE="/software/improve/graphdrp_default_model.txt" -export CANDLE_MODEL_TYPE="SINGULARITY" -export CANDLE_IMAGE="/software/improve/images/GraphDRP.sif" -export INIT_PARAMS_FILE="/software/improve/graphdrp_default_model.txt" +export CANDLE_MODEL_TYPE="BENCHMARKS" +export CANDLE_IMAGE="NONE" +export CANDLE_MODEL_IMPL="app" # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ From c2ac94cc70bf9a7e0304fafae96b28b7c7bc2547 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 1 Jun 2023 14:30:18 -0500 Subject: [PATCH 586/601] Merge --- workflows/GA/swift/workflow.sh | 2 +- workflows/GA/test/cfg-sys-1.sh | 7 ++++++- workflows/dense-noise/swift/workflow.sh | 13 +++++++------ workflows/dense-noise/test/cfg-sys-small.sh | 14 +++++++++----- workflows/dense-noise/test/test-1.sh | 10 +++++----- 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index 79b8df49..69f18d2f 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -73,7 +73,7 @@ source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh PYTHONPATH+=:$EQPY PYTHONPATH+=:$WORKFLOWS_ROOT/common/python -export TURBINE_JOBNAME="GA_${EXPID}" +export TURBINE_JOBNAME=$EXPID RESTART_FILE_ARG="" if [[ ${RESTART_FILE:-} != "" ]] then diff --git a/workflows/GA/test/cfg-sys-1.sh b/workflows/GA/test/cfg-sys-1.sh index d11826e7..6192ca61 100644 --- a/workflows/GA/test/cfg-sys-1.sh +++ b/workflows/GA/test/cfg-sys-1.sh @@ -8,12 +8,17 @@ export PROCS=${PROCS:-8} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} +export PPN=${PPN:-8} export WALLTIME=${WALLTIME:-01:00:00} #export PROJECT=Candle_ECP +export PROJECT=candle_aesp +# export QUEUE="debug" # Up to 2 nodes +# export QUEUE="debug-scaling" # Up to 10 nodes +export QUEUE="prod" # At least 10 nodes + # Benchmark run timeout: benchmark run will timeout # after the specified number of seconds. # If set to -1 there is no timeout. 
diff --git a/workflows/dense-noise/swift/workflow.sh b/workflows/dense-noise/swift/workflow.sh index f8d80b64..48b646c3 100755 --- a/workflows/dense-noise/swift/workflow.sh +++ b/workflows/dense-noise/swift/workflow.sh @@ -88,11 +88,12 @@ log_script mkdir -pv $TURBINE_OUTPUT/run mkdir -pv $TURBINE_OUTPUT/data -# Allow the user to set an objective function -OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} # CANDLE_MODEL_IMPL: "container" on Polaris, "py" on Summit/Frontier CANDLE_MODEL_IMPL="container" -OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} + +# Allow the user to set an objective function +SWIFT_LIBS_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh @@ -140,8 +141,8 @@ cp $CFG_SYS $CFG_PRM $TURBINE_OUTPUT swift-t -n $PROCS \ ${MACHINE:-} \ -p \ - -I $OBJ_DIR \ - -i $OBJ_MODULE \ + -I $SWIFT_LIBS_DIR \ + -i $SWIFT_MODULE \ -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ -e TURBINE_STDOUT \ -e BENCHMARKS_ROOT \ @@ -149,7 +150,7 @@ swift-t -n $PROCS \ -e APP_PYTHONPATH=$APP_PYTHONPATH \ $( python_envs ) \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ - -e OBJ_RETURN \ + -e MODEL_RETURN \ -e CANDLE_DATA_DIR \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ diff --git a/workflows/dense-noise/test/cfg-sys-small.sh b/workflows/dense-noise/test/cfg-sys-small.sh index 12948386..38189a98 100644 --- a/workflows/dense-noise/test/cfg-sys-small.sh +++ b/workflows/dense-noise/test/cfg-sys-small.sh @@ -2,8 +2,8 @@ # CFG SYS SMALL # The number of MPI processes -# Note that 2 processes are reserved for Swift/EMEWS -# The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs +# Note that 1 processes is reserved for Swift +# The default of 2 gives you 1 worker, i.e., 1 concurrent Python export PROCS=${PROCS:-2} # MPI processes per node @@ -13,11 +13,15 @@ export WALLTIME=${WALLTIME:-00:05:00} # CANDLE@ALCF: # export PROJECT=CSC249ADOA01 -# export QUEUE="debug-scaling" +export PROJECT=swift-t-polaris +# export QUEUE="debug" # Up to 2 nodes +# export QUEUE="debug-scaling" # Up to 10 nodes +export QUEUE="prod" # At least 10 nodes + # CANDLE@OLCF: -export PROJECT=MED106 -export QUEUE=batch +# export PROJECT=MED106 +# export QUEUE=batch # Benchmark run timeout: benchmark run will timeout # after the specified number of seconds. 
diff --git a/workflows/dense-noise/test/test-1.sh b/workflows/dense-noise/test/test-1.sh index ad762730..fb6314ae 100755 --- a/workflows/dense-noise/test/test-1.sh +++ b/workflows/dense-noise/test/test-1.sh @@ -5,16 +5,16 @@ set -eu usage() { - echo "Usage: test SITE RUN_DIR MODEL_NAME" + echo "Usage: test MODEL_NAME SITE RUN_DIR" echo " RUN_DIR: use -a for automatic" } RUN_DIR="" if (( ${#} == 3 )) then - SITE=$1 - RUN_DIR=$2 - export MODEL_NAME=$3 + export MODEL_NAME=$1 + SITE=$2 + RUN_DIR=$3 else usage exit 1 @@ -35,7 +35,7 @@ export CFG_PRM=$THIS/cfg-prm-1.sh # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported -export OBJ_RETURN="val_loss" +export MODEL_RETURN="val_loss" export CANDLE_MODEL_TYPE="BENCHMARKS" From 2db6133b05e14c352981fbabc76e5238b196b6cf Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 2 Jun 2023 13:28:23 -0500 Subject: [PATCH 587/601] Adding data/paccman_param_space.json data/graphdrp_param_space.json --- workflows/GA/data/graphdrp_param_space.json | 23 +++++++++++++++++++++ workflows/GA/data/paccman_param_space.json | 23 +++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 workflows/GA/data/graphdrp_param_space.json create mode 100644 workflows/GA/data/paccman_param_space.json diff --git a/workflows/GA/data/graphdrp_param_space.json b/workflows/GA/data/graphdrp_param_space.json new file mode 100644 index 00000000..0548c081 --- /dev/null +++ b/workflows/GA/data/graphdrp_param_space.json @@ -0,0 +1,23 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + } +] diff --git a/workflows/GA/data/paccman_param_space.json b/workflows/GA/data/paccman_param_space.json new file mode 100644 index 00000000..0548c081 --- /dev/null +++ b/workflows/GA/data/paccman_param_space.json @@ -0,0 +1,23 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + } +] From 326fe313c445f74f8be27cb41a27ee21e4ef1053 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 2 Jun 2023 13:42:16 -0500 Subject: [PATCH 588/601] Adding data/paccmann_param_space.json --- workflows/GA/data/paccmann_param_space.json | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 workflows/GA/data/paccmann_param_space.json diff --git a/workflows/GA/data/paccmann_param_space.json b/workflows/GA/data/paccmann_param_space.json new file mode 100644 index 00000000..0548c081 --- /dev/null +++ b/workflows/GA/data/paccmann_param_space.json @@ -0,0 +1,23 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + } +] From 1d6d108ab1d144b50aa0b597a616451d03891845 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Sat, 3 Jun 2023 14:50:15 -0700 Subject: [PATCH 589/601] o Add IGTD for lambda --- workflows/GA/data/igtd_param_space_ga.json | 22 +++++++ 
workflows/GA/test/test-igtd-lambda0.sh | 67 ++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 workflows/GA/data/igtd_param_space_ga.json create mode 100755 workflows/GA/test/test-igtd-lambda0.sh diff --git a/workflows/GA/data/igtd_param_space_ga.json b/workflows/GA/data/igtd_param_space_ga.json new file mode 100644 index 00000000..2df67c3c --- /dev/null +++ b/workflows/GA/data/igtd_param_space_ga.json @@ -0,0 +1,22 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.00001, + "upper": 0.1, + "sigma": 0.0049995 + }, + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [16, 32, 64, 128, 256], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 50 + } +] diff --git a/workflows/GA/test/test-igtd-lambda0.sh b/workflows/GA/test/test-igtd-lambda0.sh new file mode 100755 index 00000000..314f1acb --- /dev/null +++ b/workflows/GA/test/test-igtd-lambda0.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# GA TEST GRAPHDRP LAMBDA + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh +export PARAM_SET_FILE=../data/igtd_param_space_ga.json + +# The python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE="/software/improve/images/IGTD.sif" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From 1fede6f306d6f7ca0933c61eae3019f5abe54331 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 5 Jun 2023 16:36:24 -0500 Subject: [PATCH 590/601] Drop- misspelled --- workflows/GA/data/paccman_param_space.json | 23 ---------------------- 1 file changed, 23 deletions(-) delete mode 100644 workflows/GA/data/paccman_param_space.json diff --git a/workflows/GA/data/paccman_param_space.json b/workflows/GA/data/paccman_param_space.json deleted file mode 100644 index 0548c081..00000000 --- a/workflows/GA/data/paccman_param_space.json +++ /dev/null @@ -1,23 +0,0 @@ -[ - { - "name": "learning_rate", - "type": "float", - "lower": 0.000001, - "upper": 0.1, - "sigma": 0.01 - }, - - { - "name": "batch_size", - "type": "ordered", - "element_type": "int", - "values": [32, 64, 128, 256, 512, 1024, 2048], - "sigma": 1 - }, - - { - "name": "epochs", - "type": "constant", - "value": 5 - } -] From 3859ef4711acdae16cad388b8f7827823252983e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 5 Jun 2023 16:36:50 -0500 Subject: 
[PATCH 591/601] Use modern Swift --- workflows/GA/swift/workflow.swift | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/workflows/GA/swift/workflow.swift b/workflows/GA/swift/workflow.swift index ebbf5a08..a64d7f1f 100644 --- a/workflows/GA/swift/workflow.swift +++ b/workflows/GA/swift/workflow.swift @@ -105,22 +105,17 @@ string FRAMEWORK = "keras"; } } -main() { +main { assert(strlen(emews_root) > 0, "Set EMEWS_PROJECT_ROOT!"); - int random_seed = toint(argv("seed", "0")); - int num_iter = toint(argv("ni","100")); // -ni=100 - int num_pop = toint(argv("np","100")); // -np=100; - - //printf("NI: %i # num_iter", num_iter); - //printf("NV: %i # num_variations", num_variations); - //printf("NP: %i # num_pop", num_pop); - //printf("MUTPB: %f # mut_prob", mut_prob); + int random_seed = string2int(argv("seed", "0")); + int num_iter = string2int(argv("ni","100")); + int num_pop = string2int(argv("np","100")); int ME_ranks[]; foreach r_rank, i in r_ranks{ - ME_ranks[i] = toint(r_rank); + ME_ranks[i] = string2int(r_rank); } foreach ME_rank, i in ME_ranks { From d6a122c462cb390b3803bc70c0fc22411e8cdf89 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 5 Jun 2023 16:37:15 -0500 Subject: [PATCH 592/601] Initial tests for Paccmann and tCNNS --- workflows/GA/test/cfg-prm-paccmann-1.sh | 48 +++++++++++++++++ workflows/GA/test/cfg-prm-tcnns-1.sh | 48 +++++++++++++++++ workflows/GA/test/cfg-sys-paccmann-1.sh | 46 ++++++++++++++++ workflows/GA/test/cfg-sys-tcnns-1.sh | 46 ++++++++++++++++ workflows/GA/test/test-sif-paccmann-1.sh | 67 ++++++++++++++++++++++++ workflows/GA/test/test-sif-tcnns-1.sh | 67 ++++++++++++++++++++++++ 6 files changed, 322 insertions(+) create mode 100644 workflows/GA/test/cfg-prm-paccmann-1.sh create mode 100644 workflows/GA/test/cfg-prm-tcnns-1.sh create mode 100644 workflows/GA/test/cfg-sys-paccmann-1.sh create mode 100644 workflows/GA/test/cfg-sys-tcnns-1.sh create mode 100755 workflows/GA/test/test-sif-paccmann-1.sh create mode 100755 workflows/GA/test/test-sif-tcnns-1.sh diff --git a/workflows/GA/test/cfg-prm-paccmann-1.sh b/workflows/GA/test/cfg-prm-paccmann-1.sh new file mode 100644 index 00000000..0bb76e92 --- /dev/null +++ b/workflows/GA/test/cfg-prm-paccmann-1.sh @@ -0,0 +1,48 @@ +# CFG PRM PACCMAN 1 +# GA settings + +SEED=${SEED:-1} +# Total iterations +NUM_ITERATIONS=${NUM_ITERATIONS:-3} +# Size of GA population +# (i.e. the number of parameter sets to evaluate per iteration) +POP_DEFAULT=$(( PROCS - 2 )) +POPULATION_SIZE=${POPULATION_SIZE:-${POP_DEFAULT}} +# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See +# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms +# for more info. 
+GA_STRATEGY=${STRATEGY:-simple}
+
+# Polaris:
+# Run HiDRA on 10 nodes for 3 hours for 20 epochs
+
+INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-}
+
+# TODO: move the following code to a utility library-
+# this is a configuration file
+# Set the ga parameter space definition file for running
+if [ "$MODEL_NAME" = "combo" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_param_space_ga.json}
+elif [ "$MODEL_NAME" = "p1b1" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json}
+elif [ "$MODEL_NAME" = "nt3" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json}
+elif [ "$MODEL_NAME" = "graphdrp" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_param_space_ga.json}
+elif [ "$MODEL_NAME" = "tc1" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json}
+elif [ "$MODEL_NAME" = "oned" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/oned_param_space_ga.json}
+# TODO: Uncomment when parameter files are available
+# elif [ "$MODEL_NAME" = "p1b3" ]; then
+# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_param_space_ga.json}
+# elif [ "$MODEL_NAME" = "p1b2" ]; then
+# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_param_space_ga.json}
+# elif [ "$MODEL_NAME" = "p2b1" ]; then
+# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_param_space_ga.json}
+elif [ "${PARAM_SET_FILE:-}" != "" ]; then
+  PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE}
+else
+  echo "Invalid model-" $MODEL_NAME
+  exit 1
+fi
diff --git a/workflows/GA/test/cfg-prm-tcnns-1.sh b/workflows/GA/test/cfg-prm-tcnns-1.sh
new file mode 100644
index 00000000..0bb76e92
--- /dev/null
+++ b/workflows/GA/test/cfg-prm-tcnns-1.sh
@@ -0,0 +1,48 @@
+# CFG PRM TCNNS 1
+# GA settings
+
+SEED=${SEED:-1}
+# Total iterations
+NUM_ITERATIONS=${NUM_ITERATIONS:-3}
+# Size of GA population
+# (i.e. the number of parameter sets to evaluate per iteration)
+POP_DEFAULT=$(( PROCS - 2 ))
+POPULATION_SIZE=${POPULATION_SIZE:-${POP_DEFAULT}}
+# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See
+# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms
+# for more info.
+GA_STRATEGY=${STRATEGY:-simple}
+
+# Polaris:
+# Run HiDRA on 10 nodes for 3 hours for 20 epochs
+
+INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-}
+
+# TODO: move the following code to a utility library-
+# this is a configuration file
+# Set the ga parameter space definition file for running
+if [ "$MODEL_NAME" = "combo" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_param_space_ga.json}
+elif [ "$MODEL_NAME" = "p1b1" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json}
+elif [ "$MODEL_NAME" = "nt3" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json}
+elif [ "$MODEL_NAME" = "graphdrp" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_param_space_ga.json}
+elif [ "$MODEL_NAME" = "tc1" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json}
+elif [ "$MODEL_NAME" = "oned" ]; then
+  PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/oned_param_space_ga.json}
+# TODO: Uncomment when parameter files are available
+# elif [ "$MODEL_NAME" = "p1b3" ]; then
+# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_param_space_ga.json}
+# elif [ "$MODEL_NAME" = "p1b2" ]; then
+# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_param_space_ga.json}
+# elif [ "$MODEL_NAME" = "p2b1" ]; then
+# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_param_space_ga.json}
+elif [ "${PARAM_SET_FILE:-}" != "" ]; then
+  PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE}
+else
+  echo "Invalid model-" $MODEL_NAME
+  exit 1
+fi
diff --git a/workflows/GA/test/cfg-sys-paccmann-1.sh b/workflows/GA/test/cfg-sys-paccmann-1.sh
new file mode 100644
index 00000000..ee2c9023
--- /dev/null
+++ b/workflows/GA/test/cfg-sys-paccmann-1.sh
@@ -0,0 +1,46 @@
+# CFG SYS PACCMANN 1
+
+# The number of MPI processes
+# Note that 2 processes are reserved for Swift/EMEWS
+# The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs
+export PROCS=${PROCS:-10}
+
+# MPI processes per node
+# Cori has 32 cores per node, 128GB per node
+export PPN=${PPN:-1}
+
+export WALLTIME=${WALLTIME:-01:00:00}
+
+#export PROJECT=Candle_ECP
+
+# Benchmark run timeout: benchmark run will timeout
+# after the specified number of seconds.
+# If set to -1 there is no timeout.
+# This timeout is implemented with Keras callbacks
+BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600}
+
+# Uncomment below to use custom python script to run
+# Use file name without .py (e.g., my_script.py)
+# MODEL_PYTHON_SCRIPT=my_script
+
+# Shell timeout: benchmark run will be killed
+# after the specified number of seconds.
+# If set to -1 or empty there is no timeout.
+# This timeout is implemented with the shell command 'timeout'
+export SH_TIMEOUT=${SH_TIMEOUT:-}
+
+# Ignore errors: If 1, unknown errors will be reported to model.log
+# but will not bring down the Swift workflow. See model.sh .
+export IGNORE_ERRORS=0 + +# if the deap python package is not installed with swift-t's embedded python +# it may be ncessary to include its location in the PYTHONPATH +# export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages + +export TURBINE_RESIDENT_WORK_WORKERS=1 +export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + +# for running locally, edit as necessary +# export PYTHONHOME=$HOME/anaconda3 +# export PYTHON=python3.6 +# export SWIFT_T=$HOME/sfw/swift-t-4c8f0afd diff --git a/workflows/GA/test/cfg-sys-tcnns-1.sh b/workflows/GA/test/cfg-sys-tcnns-1.sh new file mode 100644 index 00000000..ee2c9023 --- /dev/null +++ b/workflows/GA/test/cfg-sys-tcnns-1.sh @@ -0,0 +1,46 @@ +# CFG SYS PACCMAN 1 + +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEMS +# The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs +export PROCS=${PROCS:-10} + +# MPI processes per node +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} + +export WALLTIME=${WALLTIME:-01:00:00} + +#export PROJECT=Candle_ECP + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. +# If set to -1 there is no timeout. +# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g, my_script.py) +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . +export IGNORE_ERRORS=0 + +# if the deap python package is not installed with swift-t's embedded python +# it may be ncessary to include its location in the PYTHONPATH +# export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages + +export TURBINE_RESIDENT_WORK_WORKERS=1 +export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + +# for running locally, edit as necessary +# export PYTHONHOME=$HOME/anaconda3 +# export PYTHON=python3.6 +# export SWIFT_T=$HOME/sfw/swift-t-4c8f0afd diff --git a/workflows/GA/test/test-sif-paccmann-1.sh b/workflows/GA/test/test-sif-paccmann-1.sh new file mode 100755 index 00000000..9d56f8b9 --- /dev/null +++ b/workflows/GA/test/test-sif-paccmann-1.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# GA TEST SIF Paccman + +usage() +{ + echo "Usage: test SIF SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-paccmann-1.sh +export CFG_PRM=$THIS/cfg-prm-paccmann-1.sh +export PARAM_SET_FILE=paccmann_param_space.json + +# The Python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE=$MODEL_NAME +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/GA/test/test-sif-tcnns-1.sh b/workflows/GA/test/test-sif-tcnns-1.sh new file mode 100755 index 00000000..507a32cc --- /dev/null +++ b/workflows/GA/test/test-sif-tcnns-1.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# GA TEST SIF tCNNS + +usage() +{ + echo "Usage: test SIF SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-tcnns-1.sh +export CFG_PRM=$THIS/cfg-prm-tcnns-1.sh +export PARAM_SET_FILE=tcnns_param_space.json + +# The Python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE=$MODEL_NAME +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: From 7d5f299a12fd998bf7fcdbd27bea82dbc1e78c87 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 5 Jun 2023 16:37:52 -0500 Subject: [PATCH 593/601] Adding data/tcnns_param_space.json --- workflows/GA/data/tcnns_param_space.json | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 workflows/GA/data/tcnns_param_space.json diff --git a/workflows/GA/data/tcnns_param_space.json b/workflows/GA/data/tcnns_param_space.json new file mode 100644 index 00000000..30db57db --- /dev/null +++ b/workflows/GA/data/tcnns_param_space.json @@ -0,0 +1,23 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 3 + } +] 
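The parameter-space files above (tcnns_param_space.json and the *_param_space_ga.json files selected through PARAM_SET_FILE in the cfg-prm scripts) all follow the same schema: each entry names one hyperparameter and declares it as a "float" range, an "ordered" list of discrete values, or a "constant". The snippet below is a minimal, self-contained sketch of how such a file can be sampled to build one random individual for the GA; it is only an illustration, not the ga_utils code that deap_ga.py actually uses, and the sample_param/sample_individual helper names are hypothetical.

import json
import random

def sample_param(spec):
    # One hyperparameter spec, e.g. {"name": "batch_size", "type": "ordered", ...}
    t = spec["type"]
    if t == "constant":
        return spec["value"]                                  # e.g. epochs
    if t == "float":
        return random.uniform(spec["lower"], spec["upper"])   # e.g. learning_rate
    if t == "ordered":
        return random.choice(spec["values"])                  # e.g. batch_size
    raise ValueError("unknown parameter type: " + t)

def sample_individual(param_space_file):
    # Read the JSON list of specs and draw one value per hyperparameter
    with open(param_space_file) as fp:
        specs = json.load(fp)
    return {spec["name"]: sample_param(spec) for spec in specs}

if __name__ == "__main__":
    # Hypothetical relative path; in the workflow the file lives under
    # $EMEWS_PROJECT_ROOT/data and is chosen via PARAM_SET_FILE.
    print(json.dumps(sample_individual("tcnns_param_space.json")))

The sigma fields are ignored in this sketch; in the GA itself they appear to control the mutation step size for float and ordered parameters.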
From bc7ba094c00be044963e6b3f223ba188615d9263 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 6 Jun 2023 16:05:25 -0500 Subject: [PATCH 594/601] Enable error handling in DEAP --- workflows/GA/data/random_param_space.json | 30 ++++++ workflows/GA/test/cfg-prm-1.sh | 5 +- workflows/GA/test/test-random-lambda7.sh | 68 ++++++++++++ workflows/common/python/deap_ga.py | 107 ++++++++++++++----- workflows/common/python/model_runner.py | 24 ++--- workflows/common/sh/model.sh | 56 +++++----- workflows/common/swift/model_app.swift | 3 +- workflows/common/swift/model_container.swift | 5 +- workflows/common/swift/model_py.swift | 2 +- 9 files changed, 222 insertions(+), 78 deletions(-) create mode 100644 workflows/GA/data/random_param_space.json create mode 100755 workflows/GA/test/test-random-lambda7.sh diff --git a/workflows/GA/data/random_param_space.json b/workflows/GA/data/random_param_space.json new file mode 100644 index 00000000..79125122 --- /dev/null +++ b/workflows/GA/data/random_param_space.json @@ -0,0 +1,30 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + }, + + { + "name": "crash_probability", + "type": "constant", + "value": 0.25 + } + +] diff --git a/workflows/GA/test/cfg-prm-1.sh b/workflows/GA/test/cfg-prm-1.sh index 3f216caa..52e1bdb5 100644 --- a/workflows/GA/test/cfg-prm-1.sh +++ b/workflows/GA/test/cfg-prm-1.sh @@ -13,8 +13,9 @@ POPULATION_SIZE=${POPULATION_SIZE:-8} # for more info. GA_STRATEGY=${STRATEGY:-simple} -# Polaris: -# Run HiDRA on 10 nodes for 3 hours for 20 epochs +# Set IGNORE_ERRORS=1 to ignore model errors and +# allow NaNs in model results: +# export IGNORE_ERRORS=1 INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} diff --git a/workflows/GA/test/test-random-lambda7.sh b/workflows/GA/test/test-random-lambda7.sh new file mode 100755 index 00000000..9ca8a27c --- /dev/null +++ b/workflows/GA/test/test-random-lambda7.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -eu + +# GA TEST GRAPHDRP LAMBDA + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh +export PARAM_SET_FILE=random_param_space.json + +# The python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="BENCHMARKS" +export CANDLE_IMAGE=NONE +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/common/python/deap_ga.py b/workflows/common/python/deap_ga.py index 9ca1d168..24280619 100644 --- a/workflows/common/python/deap_ga.py +++ b/workflows/common/python/deap_ga.py @@ -1,3 +1,9 @@ +""" +DEAP GA PY + +EMEWS interface module for DEAP +""" + import csv import json import math @@ -12,22 +18,31 @@ import numpy as np from deap import algorithms, base, creator, tools -# list of ga_utils parameter objects +# List of ga_utils parameter objects: ga_params = None -iteration = 1 + +# Last mean value (used if there are no new values): +mean_last = None + +generation = 1 logger = log_tools.get_logger(None, "DEAP") def obj_func(x): + """ + Just a stub for the DEAP framework + """ return 0 -# Produces something like: -# {"batch_size":512,"epochs":51,"activation":"softsign", -# "dense":"2000 1000 1000 500 100 50","optimizer":"adagrad","drop":0.1378, -# "learning_rate":0.0301,"conv":"25 25 25 25 25 1"} def create_list_of_json_strings(list_of_lists, super_delimiter=";"): - # create string of ; separated jsonified maps + """ + create string of semicolon-separated jsonified maps + Produces something like: + {"batch_size":512,"epochs":51,"activation":"softsign", + "dense":"2000 1000 1000 500 100 50","optimizer":"adagrad","drop":0.1378, + "learning_rate":0.0301,"conv":"25 25 25 25 25 1"} + """ result = [] global ga_params for L in list_of_lists: @@ -45,8 +60,8 @@ def create_json_string(L, indent=None): def create_fitnesses(params_string): - """return equivalent length tuple list. - + """ + return equivalent length tuple list. :type params_string: str """ params = params_string.split(";") @@ -55,28 +70,68 @@ def create_fitnesses(params_string): return res -def queue_map(obj_func, pops): - # Note that the obj_func is not used - # sending data that looks like: - # [[a,b,c,d],[e,f,g,h],...] +def make_floats(results): + """ + results: String of data from workflow + return: List of singleton-tuples, each a float + This function converts the workflow strings to the DEAP format, + and replaces any string NaNs in the results with + the mean of the current generation or + the mean of the prior generation. 
+ """ + global mean_last + tokens = results.split(";") + NaNs = [] + values = [] + output = {} + floats = [] + for i, token in enumerate(tokens): + if token.lower() == "nan": + output[i] = "nan" + NaNs.append(i) + else: + f = float(token) + output[i] = f + values.append(f) + logger.info("RESULTS: values: %i NaNs: %i" % + (len(values), len(NaNs))) + if len(values) > 0: + mean = sum(values) / len(values) + mean_last = mean + else: + assert mean_last is not None, \ + "all generation=1 results are NaN!" + mean = mean_last + + for i in NaNs: + output[i] = mean + for i in range(0, len(tokens)): + floats.append((output[i],)) + return floats + + +def queue_map(_f, pops): + """ + Note that _f is not used, but is part of the DEAP framework + Formats model parameters that look like: + [[a,b,c,d],[e,f,g,h],...] + """ if not pops: return [] - global iteration - iteration_start = time.time() - logger.info("ITERATION: %i START" % iteration) + global generation + generation_start = time.time() + logger.info("GENERATION: %i START: pop: %i" % + (generation, len(pops))) sys.stdout.flush() eqpy.OUT_put(create_list_of_json_strings(pops)) - result = eqpy.IN_get() - duration = time.time() - iteration_start - logger.info("ITERATION: %i STOP. duration: %0.3f" % - (iteration, duration)) + results = eqpy.IN_get() + duration = time.time() - generation_start + logger.info("GENERATION: %i STOP. duration: %0.3f" % + (generation, duration)) sys.stdout.flush() - iteration += 1 - split_result = result.split(";") - # TODO determine if max'ing or min'ing and use -9999999 or 99999999 - return [(float(x),) if not math.isnan(float(x)) else (float(99999999),) - for x in split_result] - # return [(float(x),) for x in split_result] + generation += 1 + floats = make_floats(results) + return floats def make_random_params(): diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index d997653b..ae7fb83f 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -183,7 +183,7 @@ def run(hyper_parameter_map, model_return): # Run the model! history = pkg.run(params) except Exception as e: - logger.warn("RUN EXCEPTION: " + str(e)) + logger.info("RUN EXCEPTION: " + str(e)) print("RUN EXCEPTION: " + str(e)) info = sys.exc_info() s = traceback.format_tb(info[2]) @@ -336,15 +336,18 @@ def setup_params(pkg, hyper_parameter_map, params_arg): def get_results(history, model_return, epochs_expected): - """Return the history entry that the user requested. + """ + Return the history entry that the user requested via MODEL_RETURN, + which may be math.nan in case of error. - Also checks for early stopping and if so marks the directory. 
- history: The Keras history modelect + Also checks for early stopping and if so marks the directory + with a 0-byte file named "stop.marker" + history: The TensorFlow history """ logger.debug('get_results(): "%s"' % model_return) - known_params = ["loss", "val_loss", "val_corr", "val_dice_coef"] + known_params = ["loss", "val_loss"] if model_return not in known_params: raise ValueError("Unsupported objective function return " + 'key: "' + @@ -364,21 +367,12 @@ def get_results(history, model_return, epochs_expected): # Default: the last value in the history result = float(values[-1]) else: - logger.warning("get_results(): objective function return key " + + logger.warning("get_results(): model return key " + "not found: " + 'key: "' + model_return + '" - ' + "history: " + str(history.history.keys())) logger.warning("get_results(): returning NaN") result = math.nan - # Fix NaNs: - if math.isnan(result): - if model_return == "val_corr" or model_return == "val_dice_coef": - # Return the negative result - result = -result - else: - # Just return a large number - result = 999999999 - print("result: " + model_return + ": " + str(result)) history_result = history.history.copy() return result, history_result diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index e0be51ba..26744761 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -152,52 +152,50 @@ else # "BENCHMARKS" # model_runner/runner_utils writes result.txt fi +echo log "MODEL_CMD: ${MODEL_CMD[@]}" +echo # Run Python! $TIMEOUT_CMD "${MODEL_CMD[@]}" & PID=$! -if [[ ${MODEL_TYPE:-} == "SINGULARITY" ]] +# Use if block to suppress errors: +if wait $PID +then + CODE=0 +else + CODE=$? +fi + +log "$MODEL_TYPE: EXIT CODE: $CODE" +if (( CODE == 0 )) then - wait $PID ls -ltrh sleep 1 # Wait for initial output - # Get last results of the format "CANDLE_RESULT xxx" in model.log + # Get last results of the format "IMPROVE RESULT xxx" in model.log # NOTE: Enabling set -x will break the following (token CANDLE_RESULT) RES=$( awk -v FS="IMPROVE_RESULT" 'NF>1 {x=$2} END {print x}' \ $INSTANCE_DIRECTORY/model.log ) RESULT="$(echo $RES | grep -Eo '[+-]?[0-9]+([.][0-9]+)?')" || true - echo "CANDLE RESULT: '$RESULT'" + echo "IMPROVE RESULT: '$RESULT'" echo $RESULT > $INSTANCE_DIRECTORY/result.txt else - wait $PID - CODE=$? - if (( CODE )) + echo # spacer + if (( $CODE == 124 )) then - echo # spacer - if (( $CODE == 124 )) - then - log "TIMEOUT ERROR! (timeout=$SH_TIMEOUT)" - # This will trigger a NaN (the result file does not exist) - exit 0 - else - log "MODEL ERROR! (CODE=$CODE)" - if (( ${IGNORE_ERRORS:-0} )) - then - log "IGNORING ERROR." - # This will trigger a NaN (the result file does not exist) - exit 0 - fi - log "ABORTING WORKFLOW (exit 1)" - exit 1 # Unknown error in Python: abort the workflow - fi + log "TIMEOUT ERROR! (timeout=$SH_TIMEOUT)" + else + log "MODEL ERROR! (CODE=$CODE)" fi - - # Get results from model.log: last occurrence of "loss: xxx" - RESULT=$(awk -v FS="loss:" 'NF>1{print $2}' model.log | tail -1) - log "RESULT: $RESULT" - echo $RESULT > $INSTANCE_DIRECTORY/result.txt + if (( ${IGNORE_ERRORS:-0} == 0 )) + then + # Unknown error in Python: abort the workflow + log "ABORTING WORKFLOW (exit 1)" + exit 1 + fi + # This will trigger a NaN (the result file does not exist) + log "IGNORING ERROR." 
fi log "END: SUCCESS" diff --git a/workflows/common/swift/model_app.swift b/workflows/common/swift/model_app.swift index 6e6473d1..896ec626 100644 --- a/workflows/common/swift/model_app.swift +++ b/workflows/common/swift/model_app.swift @@ -59,7 +59,6 @@ app (void o) run_model (string model_sh, string params, model_result = trim(read(line)); } else { printf("File not found: %s", result_file); - // return with a large value - model_result = "1e7"; + model_result = "NaN"; } } diff --git a/workflows/common/swift/model_container.swift b/workflows/common/swift/model_container.swift index fe3e228e..73ec9b55 100644 --- a/workflows/common/swift/model_container.swift +++ b/workflows/common/swift/model_container.swift @@ -47,7 +47,7 @@ app (void o) run_model_train(string model_sh, string params, } /** - Extracts the Benchmark output if it exists, + Extracts the model output if it exists, else, provides a NaN so the workflow can keep running */ (string model_result) get_results(string result_file) { @@ -56,7 +56,6 @@ app (void o) run_model_train(string model_sh, string params, model_result = trim(read(line)); } else { printf("File not found: %s", result_file); - // return with a large value - model_result = "1e7"; + model_result = "NaN"; } } diff --git a/workflows/common/swift/model_py.swift b/workflows/common/swift/model_py.swift index 6af0cd47..5d798e7b 100644 --- a/workflows/common/swift/model_py.swift +++ b/workflows/common/swift/model_py.swift @@ -17,7 +17,7 @@ try: import tensorflow from tensorflow import keras - model_result = '-100' + model_result = 'NaN' outdir = '%s' if not os.path.exists(outdir): From 7cd829fd65b6f062efa76430944c690776627ed3 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 6 Jun 2023 16:56:34 -0500 Subject: [PATCH 595/601] Merge --- workflows/common/swift/model_container.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/swift/model_container.swift b/workflows/common/swift/model_container.swift index 73ec9b55..57f41103 100644 --- a/workflows/common/swift/model_container.swift +++ b/workflows/common/swift/model_container.swift @@ -47,7 +47,7 @@ app (void o) run_model_train(string model_sh, string params, } /** - Extracts the model output if it exists, + Extracts the model result if it exists, else, provides a NaN so the workflow can keep running */ (string model_result) get_results(string result_file) { From 5be9fb467dc8498ce452ceaec7e6adacb18995ec Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 6 Jun 2023 16:57:19 -0500 Subject: [PATCH 596/601] New Swift/T for Polaris --- workflows/common/sh/env-polaris.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/common/sh/env-polaris.sh b/workflows/common/sh/env-polaris.sh index 14222f73..98eac55a 100644 --- a/workflows/common/sh/env-polaris.sh +++ b/workflows/common/sh/env-polaris.sh @@ -6,7 +6,7 @@ CANDLE_MODEL_IMPL=app CSC249=/lus/grand/projects/CSC249ADOA01 ROOT=$CSC249/public/sfw/polaris -SWIFT=$ROOT/swift-t/2023-05-23 +SWIFT=$ROOT/swift-t/2023-06-05 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH From ab525e18e2b450086dc67fbe068d1554ec80037b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 6 Jun 2023 17:00:02 -0500 Subject: [PATCH 597/601] Clean up --- workflows/GA/test/cfg-prm-tcnns-1.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/workflows/GA/test/cfg-prm-tcnns-1.sh b/workflows/GA/test/cfg-prm-tcnns-1.sh index 0bb76e92..4f179e80 100644 --- a/workflows/GA/test/cfg-prm-tcnns-1.sh +++ 
b/workflows/GA/test/cfg-prm-tcnns-1.sh @@ -1,5 +1,4 @@ -# CFG PRM PACCMAN 1 -# GA settings +# CFG PRM tCNNS 1 SEED=${SEED:-1} # Total iterations @@ -13,9 +12,6 @@ POPULATION_SIZE=${POPULATION_SIZE:-${POP_DEFAULT}} # for more info. GA_STRATEGY=${STRATEGY:-simple} -# Polaris: -# Run HiDRA on 10 nodes for 3 hours for 20 epochs - INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} # TODO: move the following code to a utility library- From 0f07c1f4dc533bc597ae2fc09092a6041dab8813 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 6 Jun 2023 17:01:49 -0500 Subject: [PATCH 598/601] Ignore errors in tCNNS --- workflows/GA/test/cfg-prm-tcnns-1.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/GA/test/cfg-prm-tcnns-1.sh b/workflows/GA/test/cfg-prm-tcnns-1.sh index 4f179e80..caaaf197 100644 --- a/workflows/GA/test/cfg-prm-tcnns-1.sh +++ b/workflows/GA/test/cfg-prm-tcnns-1.sh @@ -11,6 +11,7 @@ POPULATION_SIZE=${POPULATION_SIZE:-${POP_DEFAULT}} # https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms # for more info. GA_STRATEGY=${STRATEGY:-simple} +export IGNORE_ERRORS=1 INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} From 20455ab7de24ea695fa5fce5f88f700e2ea93b54 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 6 Jun 2023 17:02:24 -0500 Subject: [PATCH 599/601] Support random fake crashes --- models/Random/random_baseline_keras2.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/models/Random/random_baseline_keras2.py b/models/Random/random_baseline_keras2.py index 1b6a555a..4f9eff34 100644 --- a/models/Random/random_baseline_keras2.py +++ b/models/Random/random_baseline_keras2.py @@ -42,16 +42,29 @@ def initialize_parameters(): return gParameters -def model_implementation(): +def model_implementation(params): """ The implementation of the model w/o CANDLE conventions """ - import random - result = random.random() * 10 + + from random import random + if "crash_probability" in params: + crash_probability = float(params["crash_probability"]) + if random() < crash_probability: + raise FakeCrashException() + + result = random() * 10 return result +class FakeCrashException(Exception): + """ + A dummy uncaught Exception to test error handling in Supervisor + """ + pass + + def run(params): - result = model_implementation() + result = model_implementation(params) print("IMPROVE_RESULT: " + str(result)) From a410b6cba72c71a17924646eadc3b802d0ec5576 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Fri, 9 Jun 2023 11:59:32 -0500 Subject: [PATCH 600/601] Enable TURBINE_STDOUT for cmp-cv --- workflows/cmp-cv/swift/workflow.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh index e235378b..9a2b32d9 100755 --- a/workflows/cmp-cv/swift/workflow.sh +++ b/workflows/cmp-cv/swift/workflow.sh @@ -72,8 +72,8 @@ mkdir -pv $TURBINE_OUTPUT/run cp -v $UPF $TURBINE_OUTPUT -# TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" -TURBINE_STDOUT= +TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" +# TURBINE_STDOUT= if [[ ${CANDLE_DATA_DIR:-} == "" ]] then From 522e0dc4a191c2bd7a298180edf2e9117e5e5cfa Mon Sep 17 00:00:00 2001 From: Gihan Panapitiya Date: Fri, 9 Jun 2023 10:09:15 -0700 Subject: [PATCH 601/601] Update compare.py --- workflows/cmp-cv/py/compare.py | 79 +++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/workflows/cmp-cv/py/compare.py b/workflows/cmp-cv/py/compare.py index b735fc2b..a9098aa9 100644 --- 
a/workflows/cmp-cv/py/compare.py +++ b/workflows/cmp-cv/py/compare.py @@ -12,17 +12,40 @@ """ import os -import sys import pandas as pd import pandas as pd import numpy as np from sklearn.metrics import mean_squared_error -CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") - +# conditions = pd.DataFrame( +# [['nAromAtom', 5, 10], ['nAtom', 20, 50], ['BertzCT', 800, 1000]], +# columns=['prop', 'low', 'high']) +# case 2 conditions = pd.DataFrame( - [['nAromAtom', 5, 10], ['nAtom', 20, 50], ['BertzCT', 800, 1000]], + [ + ['nAtom', 8, 28],['nAtom', 28, 48],['nAtom', 48, 67],['nAtom', 67, 87],['nAtom', 87, 106],['nAtom', 106, 125], + ['nAtom', 125, 145],['nAtom', 145, 164],['nAtom', 164, 184],['nAtom', 184, 203],['nAtom', 203, 222], + ['nAtom', 222, 242],['nAtom', 242, 261],['nAtom', 261, 281],['nAtom', 281, 300],['nAtom', 300, 319], + ['nAtom', 319, 339],['nAtom', 339, 358],['nAtom', 358, 378],['nAtom', 378, 397],['nAtom', 397, 416], + ['nAtom', 416, 436],['nAtom', 436, 455],['nAtom', 455, 494], + ['nAromAtom', 0, 3],['nAromAtom', 3, 6],['nAromAtom', 6, 10],['nAromAtom', 10, 13], + ['nAromAtom', 13, 16],['nAromAtom', 16, 19],['nAromAtom', 19, 22],['nAromAtom', 22, 26], + ['nAromAtom', 26, 29],['nAromAtom', 29, 32],['nAromAtom', 32, 35],['nAromAtom', 35, 38], + ['nAromAtom', 38, 42],['nAromAtom', 42, 45],['nAromAtom', 45, 48], + ['nRing', 0, 2],['nRing', 2, 3],['nRing', 3, 5],['nRing', 5, 6], + ['nRing', 6, 8],['nRing', 8, 10],['nRing', 10, 11],['nRing', 11, 13], + ['nRing', 38, 42],['nRing', 42, 45],['nRing', 45, 48], + ['nAcid', 0, 1],['nAcid', 1, 2],['nAcid', 2, 3],['nAcid', 3, 4], + ['BertzCT', 7.50964047e+00, 9.80918522e+02], ['BertzCT', 9.80918522e+02, 1.95422740e+03], + ['BertzCT', 1.95422740e+03, 2.92753628e+03],['BertzCT', 2.92753628e+03, 3.90084517e+03], + ['BertzCT', 3.90084517e+03, 4.87415405e+03],['BertzCT', 4.87415405e+03, 5.84746293e+03], + ['BertzCT', 5.84746293e+03, 6.82077181e+03],['BertzCT', 6.82077181e+03, 7.79408069e+03], + ['BertzCT', 7.79408069e+03, 8.76738957e+03],['BertzCT', 8.76738957e+03, 9.74069845e+03], + ['nRot', 0, 10],['nRot', 10, 19],['nRot', 19, 29],['nRot', 29, 38],['nRot', 38, 48], + ['nRot', 48, 58],['nRot', 58, 67],['nRot', 67, 77],['nRot', 77, 86],['nRot', 86, 96] + ], columns=['prop', 'low', 'high']) + # from cmp_utils import conditions, Benchmark CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") @@ -42,16 +65,6 @@ def compare(model_name, exp_id, run_id): directory = outdir # directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" print("reading the predictions....") - - gParams = read_params(exp_id, run_id) - model_name = gParams("model_name") - print(f"compare: model_name={model_name} exp_id={exp_id} run_id={run_id}") - - sys.stdout.flush() - - return - - directory = f"{CANDLE_DATA_DIR}/{model_name}/Output/{exp_id}/{run_id}" df_res = pd.read_csv(f"{directory}/test_predictions.csv") # a class to calculate errors for subsets of the validation/test set @@ -59,26 +72,29 @@ def compare(model_name, exp_id, run_id): # TODO: Should have to save the above file in this file # copy and place the following in your CANDLE_DATA_DIR # cp /lambda_stor/homes/ac.gpanapitiya/ccmg-mtg/benchmark/drug_features.csv . 
- bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' - ) # TODO: have to have a drug features for a common test set - subset_err, final_domain_err = bmk.error_by_feature_domains_model( - df_res, conditions) + # bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + # ) # TODO: have to have a drug features for a common test set + # subset_err, final_domain_err = bmk.error_by_feature_domains_model( + # df_res, conditions) # # or this - # fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' - # subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) + fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) # collect results for comparison - cmp_prop = 'nAtom' # TODO: Get this from gParameters - subset_err.set_index( - 'prop', inplace=True - ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt - cmp_results[run_id] = subset_err.loc[ - cmp_prop, - 'error'] # this is the property based on which we want to do the comparison - # cmp_results[run_id] = -1 # set to -1 for now as we don't have the drug features file - with open(f"{directory}/subset_err.txt", "w") as fp: - fp.write(str(cmp_results[run_id])) + # cmp_prop = 'nAtom' # TODO: Get this from gParameters + # subset_err.set_index( + # 'prop', inplace=True + # ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt + # cmp_results[run_id] = subset_err.loc[ + # cmp_prop, + # 'error'] # this is the property based on which we want to do the comparison + cmp_results[run_id] = -1 # for case 2, this is not defined + # # cmp_results[run_id] = -1 # set to -1 for now as we don't have the drug features file + # with open(f"{directory}/subset_err.txt", "w") as fp: + # fp.write(str(cmp_results[run_id])) + + subset_err.to_csv(f"{directory}/domain_err.csv", index=False) return str(cmp_results[run_id]) @@ -102,7 +118,8 @@ def error_by_feature_domains_model(fp_path, preds, conditions): report.append([prop, low, high, mean_err]) - keep = keep[keep.smiles.isin(smiles)] + keep = keep[keep.smiles.isin(smiles)] # this is in case we want to progressively + # consider domains. A domain composed of multiple domains final_domain_err = keep.err.mean() # return this report = pd.DataFrame(report, columns=['prop', 'low', 'high', 'error'])