Upgrade to tensorflow 2.4.0 #635

Open. Wants to merge 6 commits into master.
4 changes: 2 additions & 2 deletions Compiling.md
@@ -39,10 +39,10 @@ As also mentioned in the instructions below but repeated here for visibility, if

## Windows
* Requirements
-* CMake with a minimum version of 3.10.2, GUI version strongly recommended (https://cmake.org/download/)
+* CMake with a minimum version of 3.18.2, GUI version strongly recommended (https://cmake.org/download/)
* Microsoft Visual Studio for C++. Version 15 (2017) has been tested and should work, other versions might work as well.
* If using the OpenCL backend, a modern GPU that supports OpenCL 1.2 or greater, or else something like [this](https://software.intel.com/en-us/opencl-sdk) for CPU. But if using CPU, Eigen should be better.
-* If using the CUDA backend, CUDA 10.2 with CUDNN 7.6.5, or CUDA 11.1 with CUDNN 8.0.4 (https://developer.nvidia.com/cuda-toolkit) (https://developer.nvidia.com/cudnn) and a GPU capable of supporting them. I'm unsure how version compatibility works with CUDA, there's a good chance that later versions than these work just as well, but they have not been tested.
+* If using the CUDA backend, CUDA 10.2 with CUDNN 7.6.5, or CUDA 11.0.2 with CUDNN 8.0.4 (https://developer.nvidia.com/cuda-toolkit) (https://developer.nvidia.com/cudnn) and a GPU capable of supporting them. I'm unsure how version compatibility works with CUDA, there's a good chance that later versions than these work just as well, but they have not been tested. (If you run selfplay and training together on a single machine, choose CUDA 11.0.2 + CUDNN 8.0.4 for compatibility with tensorflow_gpu 2.4.0.)
* If using the TensorRT backend, in addition to the dependencies for the CUDA backend, you also need TensorRT (https://developer.nvidia.com/tensorrt) on a version compatible with your CUDA and CUDNN versions.
* If using the Eigen backend, Eigen3, version 3.3.x. (http://eigen.tuxfamily.org/index.php?title=Main_Page#Download).
* zlib. The following package might work, https://www.nuget.org/packages/zlib-vc140-static-64/, or alternatively you can build it yourself via something like: https://github.com/kiyolee/zlib-win-build
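Not part of the diff: a quick way to confirm the installed CUDA/CUDNN stack is one that tensorflow_gpu 2.4.0 actually accepts (a minimal sketch, assuming tensorflow-gpu 2.4.0 is already installed):

```python
import tensorflow as tf

# TF 2.4.0 was built against CUDA 11.0 / CUDNN 8.0; if those are installed
# correctly, at least one GPU should be listed here.
print(tf.__version__)                          # expect "2.4.0"
print(tf.config.list_physical_devices("GPU"))  # expect a non-empty list
```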
5 changes: 5 additions & 0 deletions python/README.md
@@ -39,4 +39,9 @@ These are a separate set of scripts that have nothing to do with any of the above
* `genboard_run.py`
* `genboard_train.py`

+### Dependencies
+- scipy
+- tf-slim
+- python-dateutil
+- requests-toolbelt

2 changes: 1 addition & 1 deletion python/export_model.py
@@ -89,7 +89,7 @@ def log(s):
sys.stderr.flush()

if not for_cuda:
-tf.train.write_graph(session.graph_def,export_dir,filename_prefix + ".graph.pb")
+tf.io.write_graph(session.graph_def,export_dir,filename_prefix + ".graph.pb")
savepath = export_dir + "/" + filename_prefix
saver.save(session, savepath + ".weights")
with open(savepath + ".config.json","w") as f:
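For context: `tf.train.write_graph` became `tf.io.write_graph` in TF2 with an unchanged signature. A minimal sketch of the call in isolation (hypothetical paths and graph content, using the v1 compat shim as this script does):

```python
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.Session() as session:
    _ = tf.constant(1.0, name="x")  # hypothetical graph content
    # Writes the session's GraphDef as a text protobuf to /tmp/export/model.graph.pb.
    tf.io.write_graph(session.graph_def, "/tmp/export", "model.graph.pb")
```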
175 changes: 90 additions & 85 deletions python/model.py

Large diffs are not rendered by default.

(File name not rendered in this view.)
@@ -32,6 +32,11 @@ shift
RATING_ONLY="$1"
shift

+PYTHON_BIN=python3
+if [ "${OS}" = "Windows_NT" ] && [ -n "${CONDA_PYTHON_EXE}" ]; then
+  PYTHON_BIN=python
+fi

#We're not really using gating, but the upload script expects them to be where gating would put them
#and using gating disables the export script from making extraneous selfplay data dirs.
USEGATING=1
@@ -69,7 +74,7 @@ cp -r "$GITROOTDIR"/python/selfplay "$DATED_ARCHIVE"
cd "$basedir"/scripts
while true
do
-time python3 ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \
+time ${PYTHON_BIN} ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \
-old-summary-file-to-assume-correct "$basedir"/selfplay.summary.json \
-new-summary-file "$basedir"/selfplay.summary.json.tmp
mv "$basedir"/selfplay.summary.json.tmp "$basedir"/selfplay.summary.json
8 changes: 7 additions & 1 deletion python/selfplay/distributed/upload_model_for_selfplay.sh
@@ -23,6 +23,12 @@ shift
RATING_ONLY="$1"
shift


+PYTHON_BIN=python3
+if [ "${OS}" = "Windows_NT" ] && [ -n "${CONDA_PYTHON_EXE}" ]; then
+  PYTHON_BIN=python
+fi

#------------------------------------------------------------------------------

mkdir -p "$BASEDIR"/modelstobetested
@@ -84,7 +90,7 @@ function uploadStuff() {
do
set +e
set -x
-python3 ./upload_model.py \
+${PYTHON_BIN} ./upload_model.py \
-run-name "$RUNNAME" \
-model-name "$RUNNAME"-"$NAME" \
-model-file "$TMPDST"/"$RUNNAME"-"$NAME".bin.gz \
7 changes: 6 additions & 1 deletion python/selfplay/export_model_for_selfplay.sh
@@ -21,6 +21,11 @@ shift
USEGATING="$1"
shift

+PYTHON_BIN=python3
+if [ "${OS}" = "Windows_NT" ] && [ -n "${CONDA_PYTHON_EXE}" ]; then
+  PYTHON_BIN=python
+fi

#------------------------------------------------------------------------------

mkdir -p "$BASEDIR"/tfsavedmodels_toexport
@@ -64,7 +69,7 @@ function exportStuff() {
mkdir "$TMPDST"

set -x
-python3 ./export_model.py \
+${PYTHON_BIN} ./export_model.py \
-saved-model-dir "$SRC"/saved_model \
-export-dir "$TMPDST" \
-model-name "$NAMEPREFIX""-""$NAME" \
7 changes: 6 additions & 1 deletion python/selfplay/shuffle.sh
@@ -23,6 +23,11 @@ shift
BATCHSIZE="$1"
shift

+PYTHON_BIN=python3
+if [ "${OS}" = "Windows_NT" ] && [ -n "${CONDA_PYTHON_EXE}" ]; then
+  PYTHON_BIN=python
+fi

#------------------------------------------------------------------------------

OUTDIR=$(date "+%Y%m%d-%H%M%S")
@@ -37,7 +42,7 @@ echo "Beginning shuffle at" $(date "+%Y-%m-%d %H:%M:%S")

#set -x
(
-time python3 ./shuffle.py \
+time ${PYTHON_BIN} ./shuffle.py \
"$BASEDIR"/selfplay/ \
-expand-window-per-row 0.4 \
-taper-window-exponent 0.65 \
7 changes: 6 additions & 1 deletion python/selfplay/shuffle_loop.sh
@@ -20,6 +20,11 @@ shift
BATCHSIZE="$1"
shift

+PYTHON_BIN=python3
+if [ "${OS}" = "Windows_NT" ] && [ -n "${CONDA_PYTHON_EXE}" ]; then
+  PYTHON_BIN=python
+fi

GITROOTDIR="$(git rev-parse --show-toplevel)"

basedir="$(realpath "$BASEDIRRAW")"
@@ -42,7 +47,7 @@ cp -r "$GITROOTDIR"/python/selfplay "$DATED_ARCHIVE"
while true
do
rm -f "$basedir"/selfplay.summary.json.tmp
-time python3 ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \
+time ${PYTHON_BIN} ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \
-old-summary-file-to-assume-correct "$basedir"/selfplay.summary.json \
-new-summary-file "$basedir"/selfplay.summary.json.tmp
mv "$basedir"/selfplay.summary.json.tmp "$basedir"/selfplay.summary.json
7 changes: 6 additions & 1 deletion python/selfplay/train.sh
@@ -28,6 +28,11 @@ shift
EXPORTMODE="$1"
shift

+PYTHON_BIN=python3
+if [ "${OS}" = "Windows_NT" ] && [ -n "${CONDA_PYTHON_EXE}" ]; then
+  PYTHON_BIN=python
+fi

GITROOTDIR="$(git rev-parse --show-toplevel)"

#------------------------------------------------------------------------------
@@ -65,7 +70,7 @@ else
exit 1
fi

time python3 "$GITROOTDIR"/python/train.py \
time ${PYTHON_BIN} "$GITROOTDIR"/python/train.py \
-traindir "$BASEDIR"/train/"$TRAININGNAME" \
-datadir "$BASEDIR"/shuffleddata/current/ \
-exportdir "$BASEDIR"/"$EXPORT_SUBDIR" \
2 changes: 1 addition & 1 deletion python/shuffle.py
@@ -16,7 +16,7 @@

import numpy as np
import tensorflow as tf
-from tensorflow.python_io import TFRecordOptions,TFRecordCompressionType,TFRecordWriter
+from tensorflow.compat.v1.python_io import TFRecordOptions,TFRecordCompressionType,TFRecordWriter

import tfrecordio

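The old `tf.python_io` module survives only under `tf.compat.v1`, which is what the new import path reaches. The TF2-native replacement (a sketch of the equivalent, not what this PR uses) is the `tf.io` writer API:

```python
import tensorflow as tf

# ZLIB-compressed TFRecord writing with the TF2-native API.
options = tf.io.TFRecordOptions(compression_type="ZLIB")
with tf.io.TFRecordWriter("/tmp/example.tfrecord", options) as writer:  # hypothetical path
    writer.write(b"serialized-example-bytes")  # placeholder payload
```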
2 changes: 1 addition & 1 deletion python/test.py
@@ -65,7 +65,7 @@ def log(s):
dataset = dataset.flat_map(lambda fname: tf.data.TFRecordDataset(fname,compression_type="ZLIB"))
parse_input = tfrecordio.make_tf_record_parser(model_config,pos_len,batch_size)
dataset = dataset.map(parse_input)
-iterator = dataset.make_one_shot_iterator()
+iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
features = iterator.get_next()
elif using_npz:
features = tfrecordio.make_raw_input_feature_placeholders(model_config,pos_len,batch_size)
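`Dataset.make_one_shot_iterator` was removed as a method in TF2; the `tf.compat.v1.data` wrapper keeps this graph-mode flow working. Under eager execution the dataset itself is iterable (a TF2-native sketch, not part of this PR):

```python
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])  # hypothetical data
for features in dataset:  # no explicit iterator needed in eager mode
    print(features.numpy())
```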
5 changes: 3 additions & 2 deletions python/tfrecordio.py
@@ -22,6 +22,7 @@ def make_raw_input_feature_placeholders(model_config,pos_len,batch_size):
num_bin_input_features = Model.get_num_bin_input_features(model_config)
num_global_input_features = Model.get_num_global_input_features(model_config)

+tf.compat.v1.disable_v2_behavior()
return {
"binchwp": tf.compat.v1.placeholder(tf.uint8,[batch_size,num_bin_input_features,(pos_len*pos_len+7)//8]),
"ginc": tf.compat.v1.placeholder(tf.float32,[batch_size,num_global_input_features]),
@@ -40,8 +41,8 @@ def make_tf_record_parser(model_config,pos_len,batch_size,multi_num_gpus=None):
raw_input_features = make_raw_input_features(model_config,pos_len,batch_size)

def parse_input(serialized_example):
-example = tf.io.parse_single_example(serialized_example,raw_input_features)
-binchwp = tf.decode_raw(example["binchwp"],tf.uint8)
+example = tf.io.parse_single_example(serialized=serialized_example,features=raw_input_features)
+binchwp = tf.io.decode_raw(example["binchwp"],tf.uint8)
ginc = example["ginc"]
ptncm = example["ptncm"]
gtnc = example["gtnc"]
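Both renames here are mechanical: `tf.decode_raw` moved to `tf.io.decode_raw`, and `parse_single_example` is called with explicit keywords (the form the automatic v1-to-v2 upgrade script emits). The pattern, reduced to a single hypothetical feature:

```python
import tensorflow as tf

features = {"binchwp": tf.io.FixedLenFeature([], tf.string)}  # hypothetical schema

def parse(serialized_example):
    example = tf.io.parse_single_example(serialized=serialized_example, features=features)
    # Reinterpret the raw bytes of the feature as a uint8 tensor.
    return tf.io.decode_raw(example["binchwp"], tf.uint8)
```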
14 changes: 10 additions & 4 deletions python/train.py
@@ -17,6 +17,7 @@
import numpy as np
import itertools
import copy
+import tf_slim

import data
from board import Board
@@ -151,6 +152,9 @@ def trainlog(s):
multi_gpu_device_ids.append("/GPU:" + str(int(piece)))
num_gpus_used = len(multi_gpu_device_ids)

+# Fix for the TF 2.4 startup message "Not creating XLA devices, tf_xla_enable_xla_devices not set"
+os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'


# MODEL ----------------------------------------------------------------
printed_model_yet = False
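A note on this workaround: TensorFlow reads `TF_XLA_FLAGS` when it initializes its devices, not at import time, which is why assigning it here, after `import tensorflow` but before any session is created, can still take effect. A sketch of the ordering constraint (an observation about the flag, not part of the PR):

```python
import os

# The assignment must happen before TensorFlow creates its devices
# (e.g. before the first Session or eager op), or the flag is ignored.
os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_xla_devices"

import tensorflow as tf  # safest of all: set the flag before the import
```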
Expand All @@ -172,6 +176,7 @@ def trainlog(s):
assign_ops = []
for variable in itertools.chain(tf.compat.v1.model_variables(), tf.compat.v1.trainable_variables()):
if variable.name.startswith("swa_model/"):
+tf.compat.v1.disable_v2_behavior()
placeholder = tf.compat.v1.placeholder(variable.dtype,variable.shape)
assign_ops.append(tf.compat.v1.assign(variable,placeholder))
swa_assign_placeholders[variable.name] = placeholder
@@ -261,7 +266,8 @@ def model_fn(features,labels,mode,params):
synchronization=tf.VariableSynchronization.ON_READ,
aggregation=tf.VariableAggregation.SUM
)
-wsum_op = tf.assign_add(wsum,target_vars.weight_sum)
+#wsum_op = tf.assign_add(wsum,target_vars.weight_sum)
+wsum_op = wsum.assign_add(target_vars.weight_sum)
eval_metric_ops={
#"wsum": (wsum.read_value(),wsum_op),
"p0loss": tf.compat.v1.metrics.mean(target_vars.policy_loss_unreduced, weights=target_vars.target_weight_used),
@@ -300,8 +306,8 @@ def model_fn(features,labels,mode,params):
printed_model_yet = True

def moving_mean(name,x,weights):
-sumwx = tf.reduce_sum(x*weights,name="printstats/wx/"+name)
-sumw = tf.reduce_sum(weights,name="printstats/w/"+name)
+sumwx = tf.reduce_sum(input_tensor=x*weights,name="printstats/wx/"+name)
+sumw = tf.reduce_sum(input_tensor=weights,name="printstats/w/"+name)
moving_wx = tf.compat.v1.get_variable(initializer=tf.zeros([]),name=(name+"/moving_wx"),trainable=False)
moving_w = tf.compat.v1.get_variable(initializer=tf.zeros([]),name=(name+"/moving_w"),trainable=False)

@@ -413,7 +419,7 @@ def moving_mean(name,x,weights):
break
if checkpoint_path is not None:
print("Initial weights checkpoint to use found at: " + checkpoint_path)
-vars_in_checkpoint = tf.contrib.framework.list_variables(checkpoint_path)
+vars_in_checkpoint = tf_slim.list_variables(checkpoint_path)
varname_in_checkpoint = {}
print("Checkpoint contains:")
for varandshape in vars_in_checkpoint:
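`tf.contrib` is gone entirely in TF2, and `tf_slim` carries `list_variables` forward unchanged. Core TF2 has an equivalent that would avoid the extra dependency (an alternative, not what this PR uses):

```python
import tensorflow as tf

# Yields (name, shape) pairs for every variable stored in the checkpoint.
for name, shape in tf.train.list_variables("/path/to/checkpoint"):  # hypothetical path
    print(name, shape)
```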
6 changes: 3 additions & 3 deletions python/visualize.py
@@ -46,7 +46,7 @@ def log(s):

pos_len = 19 # shouldn't matter, all we're doing is exporting weights that don't depend on this
if name_scope is not None:
-with tf.name_scope(name_scope):
+with tf.compat.v1.name_scope(name_scope):
model = Model(model_config,pos_len,{})
else:
model = Model(model_config,pos_len,{})
@@ -59,7 +59,7 @@ def volume(variable):
return variable_parameters

total_parameters = 0
-for variable in tf.global_variables():
+for variable in tf.compat.v1.global_variables():
variable_parameters = volume(variable)
total_parameters += variable_parameters
log("Model variable %s, %d parameters" % (variable.name,variable_parameters))
@@ -120,7 +120,7 @@ def run(fetches):

if show_all_weight_magnitudes:
print("name,sumsq,l2regstrength,meansq,rms")
-for variable in tf.trainable_variables():
+for variable in tf.compat.v1.trainable_variables():
values = np.array(variable.eval())
sq = np.square(values)
reg = np.sum(sq) if any(v.name == variable.name for v in model.reg_variables) else 0