meson: do not enable AVX in non-x64 #2629

Closed · wants to merge 25 commits

Changes from all commits

Commits (25)
1ecac99
[ Weight ] Add Var32 Tensor in Weight.
jijoongmoon May 2, 2024
a5491f1
[ Mixed ] Create weight with var32 tensor
jijoongmoon May 7, 2024
8dab24c
[ Layers ] Update Layers to support FP16
jijoongmoon May 7, 2024
69bda1a
[ Test ] Mixed Precision Test Case
jijoongmoon May 7, 2024
82e74a9
[ Optimizer ] Update Optimizer / Adam to support Mixed training
jijoongmoon May 9, 2024
727a810
[ Tensor ] add is_NaN check in Tensor
jijoongmoon May 8, 2024
e0c6d98
[ Context ] Add loss scale in Context & using mse loss
jijoongmoon May 11, 2024
0792b84
[ Mixed Precision ] Enable Mixed Precision
jijoongmoon May 13, 2024
88908bd
[ Tensor ] Add infinity check in Tensor
jijoongmoon May 14, 2024
9351134
[ MSE ] Fix for better MSE loss precision
jijoongmoon May 17, 2024
931879b
[ TEST ] Add Torch Mixed Precision Model Test
jijoongmoon May 17, 2024
c333096
[ TEST ] add torch input and output test data for mixed precision
jijoongmoon May 20, 2024
9c51f23
[ TEST ] Add more unittests and fixes for mixed precision
jijoongmoon May 24, 2024
91850a4
[ Layer ] Update Conv2D to support Mixed Precision
jijoongmoon May 29, 2024
4ac4588
[ Layer ] enable Mixed Precision in LSTM Layer
jijoongmoon May 30, 2024
4c376fd
[ Model ] Add Execution Mode in Compile
jijoongmoon May 31, 2024
6088bd2
[ Layer ] Mixed Precision support for BN Layer
jijoongmoon Jun 3, 2024
d15aef8
[layer] enable mixed precision - reshape_layer
DonghakPark May 30, 2024
ab6124d
[Layer] Enable mixed precision - pooling2d_layer
DonghakPark Jun 3, 2024
ce6650e
[ Model ] Fix the gradient clipping for the FP16 or Low bit Gradient
jijoongmoon Jun 9, 2024
15794ac
[ Layer ] Add mu and var backup tensor.
jijoongmoon Jun 9, 2024
9faaaad
[ Layer ] prevent randomization when restoring the data
jijoongmoon Jun 9, 2024
1f80b65
[ Context ] add check if it needs to restore previous data
jijoongmoon Jun 9, 2024
527becc
[ Tensor ] remove sscal to set zero.
jijoongmoon Jun 9, 2024
13bce59
meson: do not enable AVX in non-x64
myungjoo Jun 10, 2024
Applications/KNN/jni/meson.build (2 changes: 1 addition & 1 deletion)

@@ -15,4 +15,4 @@ e = executable('knn_sample',
install_dir: application_install_dir
)

test('app_knn', e, args: [nntr_app_resdir / 'KNN'])
test('app_knn', e, args: [nntr_app_resdir / 'KNN/'])
api/ccapi/include/model.h (2 changes: 1 addition & 1 deletion)

@@ -136,7 +136,7 @@ class Model {
* @retval #ML_ERROR_NONE Successful.
* @retval #ML_ERROR_INVALID_PARAMETER invalid parameter.
*/
virtual int compile() = 0;
virtual int compile(ExecutionMode exec_mode_ = ExecutionMode::TRAIN) = 0;

/**
* @brief Initialize Network. This should be called after setting the
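The new `compile()` signature defaults to `ExecutionMode::TRAIN`, so existing callers are unaffected while inference-only users can skip training-specific graph setup. A minimal usage sketch follows; it assumes `ExecutionMode` and the model factory are exposed through the `ml::train` ccapi namespace, which is not shown in this diff.

```cpp
// Sketch only: assumes ml::train::createModel() and ml::train::ExecutionMode
// are reachable through the installed ccapi headers; neither appears in this
// diff, so treat the includes and namespaces as assumptions.
#include <memory>
#include <model.h>

void compile_models(std::unique_ptr<ml::train::Model> &train_model,
                    std::unique_ptr<ml::train::Model> &infer_model) {
  // Unchanged call sites keep working: exec_mode_ defaults to TRAIN.
  train_model->compile();

  // New: request an inference-mode graph at compile time.
  infer_model->compile(ml::train::ExecutionMode::INFERENCE);
}
```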
debian/nntrainer-dev.install (1 change: 1 addition & 0 deletions)

@@ -16,6 +16,7 @@
/usr/include/nntrainer/blas_interface.h
/usr/include/nntrainer/var_grad.h
/usr/include/nntrainer/weight.h
/usr/include/nntrainer/blas_avx.h
# todo: update dataset headers
/usr/include/nntrainer/databuffer.h
/usr/include/nntrainer/databuffer_factory.h
meson.build (19 changes: 13 additions & 6 deletions)

@@ -64,9 +64,21 @@ warning_c_flags = [
'-Wno-error=varargs'
]

arch = host_machine.cpu_family()

if get_option('enable-avx') and arch == 'x86_64'
extra_defines += '-DUSE_AVX=1'
if get_option('platform') == 'tizen'
add_project_arguments(['-mavx2'], language: ['c','cpp'])
else
add_project_arguments(['-march=native'], language: ['c','cpp'])
endif
message('-march=native added for AVX hardware acceleration.')
elif get_option('enable-avx')
warning('AVX enabled for non x86_64 build target. The enable-avx option is ignored.')
endif

if get_option('enable-fp16')
arch = host_machine.cpu_family()
if get_option('platform') == 'android'
add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp'])
extra_defines += '-DENABLE_FP16=1'
@@ -105,11 +117,6 @@ if get_option('enable-fp16')
if cc.version().version_compare('>=12.1.0')
message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
extra_defines += '-DENABLE_FP16=1'
if get_option('enable-avx')
extra_defines += '-DUSE_AVX=1'
add_project_arguments(['-march=native'], language: ['c','cpp'])
message('-march=native added for AVX hardware acceleration.')
endif
else
warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
endif
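With this change, `-DUSE_AVX=1` (plus `-mavx2` on Tizen or `-march=native` elsewhere) is only added when `enable-avx` is set and the build targets x86_64; on other CPU families the option is ignored with a warning. Below is a minimal sketch of how consuming C++ code typically keys off that define; the function and its scalar fallback are illustrative, not the contents of nntrainer's `blas_avx.h`.

```cpp
// Hypothetical illustration: compile AVX code only when meson defined USE_AVX.
// The function name and the scalar fallback are assumptions for this sketch,
// not the actual contents of nntrainer/blas_avx.h.
#include <cstddef>

#ifdef USE_AVX
#include <immintrin.h>
#endif

// Element-wise addition: AVX path on x86_64 builds, plain loop elsewhere.
void add_vectors(const float *a, const float *b, float *out, std::size_t n) {
  std::size_t i = 0;
#ifdef USE_AVX
  for (; i + 8 <= n; i += 8) {
    __m256 va = _mm256_loadu_ps(a + i); // load 8 unaligned floats
    __m256 vb = _mm256_loadu_ps(b + i);
    _mm256_storeu_ps(out + i, _mm256_add_ps(va, vb));
  }
#endif
  for (; i < n; ++i) { // scalar tail, and the whole loop on non-x86_64
    out[i] = a[i] + b[i];
  }
}
```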
meson_options.txt (2 changes: 1 addition & 1 deletion)

@@ -40,7 +40,7 @@ option('enable-fp16', type: 'boolean', value: false)
option('enable-cublas', type: 'boolean', value: false)
option('enable-openmp', type: 'boolean', value: true)
option('enable-neon', type: 'boolean', value: false)
option('enable-avx', type: 'boolean', value: false)
option('enable-avx', type: 'boolean', value: true)
option('enable-opencl', type: 'boolean', value: false)

# ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api )
nntrainer/graph/graph_core.cpp (9 changes: 9 additions & 0 deletions)

@@ -35,6 +35,10 @@ GraphCore::getSortedNode(unsigned int ith) const {
return Sorted.at(ith);
}

const unsigned int GraphCore::getSortedNodeIdx(const std::string &name) const {
return sorted_node_map.at(name);
}

void GraphCore::makeAdjacencyList(
std::vector<std::list<std::shared_ptr<GraphNode>>> &adj) {
/** initialize the adj list */
@@ -93,6 +97,11 @@ void GraphCore::topologicalSort() {

if (Sorted.size() != node_list.size())
throw std::runtime_error("Internal error in topologicalSort");
unsigned int idx = 0;
for (auto n : Sorted) {
sorted_node_map[n->getName()] = idx;
idx++;
}
}

const std::shared_ptr<GraphNode> &
nntrainer/graph/graph_core.h (8 changes: 8 additions & 0 deletions)

@@ -91,6 +91,13 @@ class GraphCore {
*/
const std::shared_ptr<GraphNode> &getSortedNode(unsigned int ith) const;

/**
* @brief getter of Sorted GraphNode index with name
* @param[in] layer name
* @ret index
*/
const unsigned int getSortedNodeIdx(const std::string &name) const;

/**
* @brief getter of GraphNode with node name
* @param[in] node name
@@ -252,6 +259,7 @@ class GraphCore {
std::vector<std::shared_ptr<GraphNode>>
node_list; /**< Unordered Node List */
std::unordered_map<std::string, int> node_map; /**< Unordered Node map */
std::unordered_map<std::string, int> sorted_node_map; /**< Unordered Node map */
std::vector<std::shared_ptr<GraphNode>> Sorted; /**< Ordered Node List */
bool sorted; /** if the node_list is sorted */

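`topologicalSort()` now records each node's position in `sorted_node_map`, and `getSortedNodeIdx()` exposes that position by layer name; the mixed-precision path in `network_graph.cpp` below uses it to locate the layer whose gradient overflowed and restart forwarding from there. A small hedged sketch of the lookup pattern; `graph` and the layer name are placeholders, not code from this PR.

```cpp
// Illustrative use of the new accessor pair; not taken from the PR itself.
#include <string>

#include <graph_core.h> // nntrainer in-tree header; include path assumed

// Given a layer name, find where to resume iteration in topological order.
unsigned int resume_index(const nntrainer::GraphCore &graph,
                          const std::string &layer_name) {
  // O(1) name -> sorted-position lookup instead of scanning Sorted linearly.
  unsigned int idx = graph.getSortedNodeIdx(layer_name);

  // The node itself is still reachable through the existing accessor.
  auto &node = graph.getSortedNode(idx);
  (void)node; // e.g. forwarding could be re-run from this node onward

  return idx;
}
```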
nntrainer/graph/network_graph.cpp (161 changes: 126 additions & 35 deletions)

@@ -337,7 +337,7 @@ void NetworkGraph::applyGradients(
continue;
}

if (rc.isGradientClipByGlobalNorm(i)) {
if (rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) {
/**
* @note the weights whose gradient are to be clipped by global norm will
* be clipped at once at the end of iteration and applied then.
@@ -393,56 +393,118 @@ sharedConstTensors NetworkGraph::incremental_forwarding(
return out;
}

void NetworkGraph::backwarding(
bool NetworkGraph::backwarding(
int iteration,
std::function<void(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &apply_grad_clip_op,
std::function<bool(void *userdata)> stop_cb, void *userdata) const {
std::function<void(std::shared_ptr<LayerNode>, bool)> &forwarding_op,
std::function<bool(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &lazy_apply_grad_op,
std::function<bool(void *userdata)> stop_cb, void *userdata) {
/**
* last layer backwarding is run out of this loop
*/
auto iter_begin = getBackwardingBeginIter();
auto iter_end = getBackwardingEndIter();
bool is_valid = true;

/// there is no layer to train, so backwarding is essentially noop
if (iter_begin == iter_end) {
return;
return true;
}

auto const &lptr_begin = (*iter_begin);
// graph_const_reverse_iterator
auto iter_ = iter_begin;

if (lptr_begin->requireLabel() == false)
throw std::runtime_error(
"Error: last layer does not accept label, we can't train");

for (auto iter = iter_begin; iter != iter_end && !stop_cb(userdata); iter++) {
auto &ln = *iter;
for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) {
auto &ln = *iter_;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
backwarding_op(ln, iteration);
is_valid = backwarding_op(ln, iteration);
PROFILE_TIME_END(profile_keys.at(ln->getType()));

if (!is_valid) {
std::cout << ln->getName() << " : Gradient has NaN --> "
<< ln->getRunContext().getLossScale() << std::endl;
break;
}
}

/** perform clipping of the gradients by global norm if any */
if (clip_weights.empty())
return;
if (!is_valid) {
/** if has NaN
* 1. reset the loss scale. : @todo Backoff_factor : default --> 0.5
* 2. run forwarding from cur_iter to cend() && !stop_cb(userdata);
* 3. return false --> run backwarding again;
*/
float scale = (*iter_)->getRunContext().getLossScale();

NNTR_THROW_IF(scale == 1.0f, std::invalid_argument)
<< "Loss Scale Factor is 1.0f";

float s = scale > 1.5f ? scale * 0.5f : 1.0f;

resetLossScale(s);

/** calculate the global norm */
Tensor global_norm_t(
TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()}));
float *global_norm_data = global_norm_t.getData();
for (unsigned int idx = 0; idx < clip_weights.size(); idx++) {
auto const &w = clip_weights[idx];
global_norm_data[idx] = w->getGradientNorm();
auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName());

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
ln->reStoreData(true);
}

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
forwarding_op(*iter, true);
PROFILE_TIME_END(profile_keys.at(ln->getType()));
}

return false;
}
float global_norm = global_norm_t.l2norm();
/** apply the gradient with the above global norm */
for (auto w : clip_weights) {
w->clipGradientByGlobalNorm(global_norm);

/** perform clipping of the gradients by global norm if any */
if (lazy_weights.empty())
return true;

if (is_clip_grad) {
/** calculate the global norm */
Tensor global_norm_t(
TensorDim({1u, 1u, 1u, (unsigned int)lazy_weights.size()}));
float *global_norm_data = global_norm_t.getData();
for (unsigned int idx = 0; idx < lazy_weights.size(); idx++) {
auto const &w = lazy_weights[idx];
if (w->getGradientRef().getDataType() != TensorDim::DataType::FP32) {
Tensor grad_32 = w->getGradientRef().clone(TensorDim::DataType::FP32);
global_norm_data[idx] = grad_32.l2norm();
} else {
global_norm_data[idx] = w->getGradientNorm();
}
}
float global_norm = global_norm_t.l2norm();
/** apply the gradient with the above global norm */
for (auto w : lazy_weights) {
w->clipGradientByGlobalNorm(global_norm);
}
}
/** apply the gradient with the above global norm */
for (auto w : clip_weights) {
apply_grad_clip_op(*w, iteration);
for (auto w : lazy_weights) {
lazy_apply_grad_op(*w, iteration);
}
nan_count++;

/** @todo : handle as property : growth_interval : default --> 2000 */

if (nan_count > 2000) {
float scale = (*iter_)->getRunContext().getLossScale();
/** @todo growth_factor : default --> 2.0 */
float s = scale * 2.0f;
resetLossScale(s);
nan_count = 0;
}

return true;
}

LayerNode *NetworkGraph::computeBackwardEnd() {
@@ -580,8 +642,15 @@ void NetworkGraph::addLayer(std::shared_ptr<LayerNode> layer) {

InPlace
NetworkGraph::canExecuteInPlace(const std::shared_ptr<LayerNode> &lnode) {
if (!lnode->supportInPlace())

if (!lnode->supportInPlace()) {
return InPlace::NONE;
}

if (lnode->getType() == InputLayer::type &&
!istrequal(getTensorType()[2], "FP32")) {
return InPlace::NONE;
}

/** layers which behave as a no-op - flatten */
auto no_op = [](const std::shared_ptr<LayerNode> &lnode) {
@@ -746,7 +815,7 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
[](const Var_Grad *vg) { return vg->getDim(); });

/** finalize the layer and get the final context */
auto init_context = lnode->finalize(input_dims, getTensorType());
auto init_context = lnode->finalize(input_dims, getTensorType(), exec_mode);

/**
* Request manager for either a pre-allocated output as input or a newly
@@ -768,9 +837,10 @@
* node is going to be used with in-place optimizations.
*/
auto out_specs = init_context.getOutSpecs();

/// @note try move inplace control to finalize
bool shared_var = false, shared_grad = false;
if (lnode->executeInPlace() != InPlace::NONE) {
if (lnode->executeInPlace() != InPlace::NONE && lnode->supportInPlace()) {
setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad);
for (unsigned int i = 0; i < out_specs.size(); ++i) {
auto &s = out_specs.at(i);
@@ -873,13 +943,17 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
}
}

lnode->setDataType(init_context.getWeightDataType(),
init_context.getActivationDataType());

lnode->configureRunContext(
// TODO: update weights spec for trainable based on layer trainable prop
tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
lnode->getTrainable(), shared_weight_names),
inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names));
lnode->getTrainable(), shared_tensor_names),
init_context.getLossScale());

return outputs;
}
@@ -1027,7 +1101,8 @@ NetworkGraph::refinalizeContext(const std::shared_ptr<LayerNode> &lnode,
// TODO: update weights spec for trainable based on layer trainable prop
weights, inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names));
lnode->getTrainable(), shared_tensor_names),
init_context.getLossScale());

return outputs;
}
@@ -1197,7 +1272,7 @@ int NetworkGraph::initialize(ExecutionMode mode,
*/
if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access) ||
(rc.isGradientClipByGlobalNorm(i) &&
((rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) &&
tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access))) {
rc.getWeightObject(i).setAsGradientLastAccess();
@@ -1287,11 +1362,19 @@ int NetworkGraph::initialize(ExecutionMode mode,

/** select weights which would require clipping of the gradients by global
* norm if any */
clip_weights = tensor_manager->getWeights([](const Weight *w) {
lazy_weights = tensor_manager->getWeights([](const Weight *w) {
return w->hasGradient() && w->isGradientLastAccess() &&
w->isGradientClipByGlobalNorm();
(w->isGradientClipByGlobalNorm() || w->isMixedPrecision());
});

is_clip_grad = false;
for (auto w : lazy_weights) {
if (w->isGradientClipByGlobalNorm()) {
is_clip_grad = true;
break;
}
}

return ML_ERROR_NONE;
}

Expand Down Expand Up @@ -1556,10 +1639,18 @@ void NetworkGraph::requestOptimizerVariable(
const TensorDim &dim = w->getDim();
std::vector<TensorDim> dims = cb(dim);
w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables(
dims, w->getName(), TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS));
dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), w->isMixedPrecision(),
Tensor::Initializer::ZEROS));
}
}
}

void NetworkGraph::resetLossScale(float scale) {
for (auto iter = cbegin(); iter != cend(); iter++) {
auto &ln = *iter;
ln->getRunContext().setLossScale(scale);
}
}

} /* namespace nntrainer */
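Taken together, the `backwarding()` changes implement dynamic loss scaling: when a layer reports a non-finite gradient, activations are restored, forwarding is re-run from the offending layer, the scale is backed off, and `false` is returned so the caller can retry the iteration, while `nan_count` and the `@todo` defaults (backoff factor 0.5, growth factor 2.0, growth interval 2000) govern when the scale is grown again. The stand-alone sketch below captures a common form of that policy in isolation; `LossScaler` and its constants are illustrative names following the TODO comments, not an nntrainer API.

```cpp
// Stand-alone sketch of the loss-scaling policy implied by the new
// backwarding() contract. LossScaler is an invented name; the constants
// mirror the @todo defaults in the diff rather than finalized properties.
#include <functional>

class LossScaler {
public:
  // run_iteration performs forward/backward/update at the given scale and
  // returns false when a non-finite gradient is detected (as the new
  // NetworkGraph::backwarding() now does).
  void step(const std::function<bool(float)> &run_iteration) {
    while (!run_iteration(scale_)) {
      // Overflow: back the scale off and redo the whole iteration,
      // mirroring "return false --> run backwarding again" in the diff.
      scale_ = scale_ > 1.5f ? scale_ * backoff_factor_ : 1.0f;
      good_iters_ = 0;
    }
    // Enough clean iterations in a row: grow the scale again.
    if (++good_iters_ > growth_interval_) {
      scale_ *= growth_factor_;
      good_iters_ = 0;
    }
  }

  float scale() const { return scale_; }

private:
  float scale_ = 65536.0f;              // initial scale (assumption)
  float backoff_factor_ = 0.5f;         // @todo default in the PR
  float growth_factor_ = 2.0f;          // @todo default in the PR
  unsigned int growth_interval_ = 2000; // @todo default in the PR
  unsigned int good_iters_ = 0;
};
```

In the PR itself this bookkeeping lives inside `NetworkGraph` (`nan_count`, `resetLossScale()`), and the retry loop is simply the trainer calling `backwarding()` again whenever it returns `false`.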