Skip to content

Commit

Permalink
[ FSU ] Enables Asynchronous FSU for forwarding
Browse files Browse the repository at this point in the history
This PR enables asynchronous mode for FSU (flash storage utilization)
for better performance.

It splits the tensor load and unload operations, which were previously
combined and difficult to handle. It also fixes the inference execution
order when running in INFERENCE mode, and sets the trainable option to
false when requesting weights and tensors.

Add new functions to load and unload tensors, as well as to check for
load completion.

It also treats the weight pool and the tensor pool differently according
to the ExecutionMode: FSU mode is not used for the tensor pool in
INFERENCE mode.

Resolves:

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <[email protected]>
  • Loading branch information
jijoongmoon committed Dec 3, 2024
1 parent 9a4375e commit 6434d16
Show file tree
Hide file tree
Showing 20 changed files with 452 additions and 141 deletions.
30 changes: 0 additions & 30 deletions Applications/SimpleFC/README.md

This file was deleted.

64 changes: 32 additions & 32 deletions Applications/SimpleFC/jni/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,17 @@ std::vector<LayerHandle> createGraph() {
std::vector<LayerHandle> layers;

layers.push_back(createLayer(
"input", {withKey("name", "input0"), withKey("input_shape", "1:1:32")}));
"input", {withKey("name", "input0"), withKey("input_shape", "1:1:320")}));

layers.push_back(
createLayer("fully_connected",
{withKey("unit", 10)}));
layers.push_back(createLayer("fully_connected",
{withKey("unit", 100),
withKey("weight_initializer", "xavier_uniform"),
withKey("bias_initializer", "zeros")}));

layers.push_back(
createLayer("fully_connected",
{withKey("unit", 10)}));
layers.push_back(createLayer("fully_connected",
{withKey("unit", 100),
withKey("weight_initializer", "xavier_uniform"),
withKey("bias_initializer", "zeros")}));

return layers;
}
Expand Down Expand Up @@ -133,10 +135,13 @@ void createAndRun(unsigned int epochs, unsigned int batch_size,

// setup model
ModelHandle model = create();
model->setProperty({withKey("batch_size", batch_size),
withKey("epochs", epochs),
withKey("save_path", "model_full.bin"),
withKey("memory_swap","true")});
model->setProperty(
{withKey("batch_size", batch_size), withKey("epochs", epochs),
// withKey("save_path", "model_full.bin")});
// withKey("save_path", "model_full.bin"), withKey("memory_swap",
// "true")});
withKey("memory_swap", "true"), withKey("memory_swap_lookahead", "1"),
withKey("model_tensor_type", "FP16-FP16")});

auto optimizer = ml::train::createOptimizer("sgd", {"learning_rate=0.001"});
model->setOptimizer(std::move(optimizer));
Expand All @@ -156,28 +161,24 @@ void createAndRun(unsigned int epochs, unsigned int batch_size,
auto dataset_valid = ml::train::createDataset(
ml::train::DatasetType::GENERATOR, validData_cb, valid_user_data.get());

// model->setDataset(ml::train::DatasetModeType::MODE_TRAIN,
// std::move(dataset_train));
// model->setDataset(ml::train::DatasetModeType::MODE_VALID,
// std::move(dataset_valid));

// if (transfer_learning)
// model->load(pretrained_bin_path);
// model->train();
model->save("simplefc_weight_fp16_fp16_100.bin",ml::train::ModelFormat::MODEL_FORMAT_BIN);
// exit(0);
// model->load("./simplefc_weight100.bin");
model->load("./simplefc_weight_fp16_fp16_100.bin");

model->summarize(std::cout, ML_TRAIN_SUMMARY_MODEL);

uint feature_size = 32;
uint feature_size = 320;

float input [32];
float label [1];
float input[320];
float label[1];

for(uint j=0;j<feature_size;++j)
for (uint j = 0; j < feature_size; ++j)
input[j] = j;

std::vector<float*> in;
std::vector<float*> l;
std::vector<float*> answer;
std::vector<float *> in;
std::vector<float *> l;
std::vector<float *> answer;

in.push_back(input);
l.push_back(label);
Expand All @@ -187,19 +188,18 @@ void createAndRun(unsigned int epochs, unsigned int batch_size,
in.clear();
l.clear();

std::cout << "done"<<std::endl;

std::cout << "done" << std::endl;
}

std::array<UserDataType, 2>
createFakeDataGenerator(unsigned int batch_size,
unsigned int simulated_data_size,
unsigned int data_split) {
UserDataType train_data(new nntrainer::util::RandomDataLoader(
{{batch_size, 1, 1, 32}}, {{batch_size, 1, 1, 10}},
{{batch_size, 1, 1, 320}}, {{batch_size, 1, 1, 100}},
simulated_data_size / data_split));
UserDataType valid_data(new nntrainer::util::RandomDataLoader(
{{batch_size, 1, 1, 32}}, {{batch_size, 1, 1, 10}},
{{batch_size, 1, 1, 320}}, {{batch_size, 1, 1, 100}},
simulated_data_size / data_split));

return {std::move(train_data), std::move(valid_data)};
Expand Down Expand Up @@ -231,9 +231,9 @@ int main(int argc, char *argv[]) {

std::string data_dir = "fake";
uint batch_size = 1;
uint data_split =1;
uint data_split = 1;
uint epoch = 1;

std::array<UserDataType, 2> user_datas;

try {
Expand Down
1 change: 0 additions & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ if get_option('enable-fp16')
extra_defines += '-DENABLE_FP16=1'
extra_defines += '-DUSE__FP16=1'
extra_defines += '-DUSE_NEON=1'
extra_defines += '-DUSE_MMAP=1'
elif arch == 'aarch64'
## About FP16 in GCC (from GCC-9.1 manual)
# https://gcc.gnu.org/onlinedocs/gcc-9.1.0/gcc/Half-Precision.html
Expand Down
32 changes: 29 additions & 3 deletions nntrainer/graph/network_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,14 @@ int NetworkGraph::checkCompiledGraph() {
void NetworkGraph::markNodesForBackwarding() {
/** accumulate all the nodes which must support backwarding */
std::unordered_set<std::string> must_support_backwarding;
if (exec_mode == ExecutionMode::INFERENCE) {
for (auto iter = cbegin(); iter != cend(); iter++) {
auto lnode = (*iter);
lnode->needsCalcGradient(false);
lnode->needsCalcDerivative(false);
}
return;
}

/**
* if a node is trainable, then all the nodes ahead of it must support
Expand Down Expand Up @@ -867,14 +875,16 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
}
lnode->setDataType(init_context.getWeightDataType(),
init_context.getActivationDataType());

bool trainable = lnode->getTrainable();
if (exec_mode == ExecutionMode::INFERENCE)
trainable = false;
lnode->configureRunContext(
// TODO: update weights spec for trainable based on layer trainable prop
tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
lnode->getTrainable(), shared_weight_names),
trainable, shared_weight_names),
inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names),
trainable, shared_tensor_names),
init_context.getLossScale());

return outputs;
Expand Down Expand Up @@ -1552,6 +1562,22 @@ void NetworkGraph::flushCacheExcept(unsigned int order) {
tensor_manager->flushCacheExcept(order);
}

/**
 * @brief Load the tensors needed at the given execution order
 *        (delegates to the tensor manager).
 *
 * @param order execution order whose tensors should be loaded
 */
void NetworkGraph::LoadTensors(unsigned int order) {
tensor_manager->LoadTensors(order);
}

/**
 * @brief Check whether loading of the tensors for the given execution
 *        order has completed (delegates to the tensor manager).
 *
 * @param order execution order to query
 * @return true when the load for @a order is complete
 */
bool NetworkGraph::checkLoadComplete(unsigned int order) {
return tensor_manager->checkLoadComplete(order);
}

/**
 * @brief Check whether unloading of the tensors for the given execution
 *        order has completed (delegates to the tensor manager).
 *
 * @param order execution order to query
 * @return true when the unload for @a order is complete
 */
bool NetworkGraph::checkUnloadComplete(unsigned int order) {
return tensor_manager->checkUnloadComplete(order);
}

/**
 * @brief Unload the tensors of the given execution order
 *        (delegates to the tensor manager).
 *
 * @param order execution order whose tensors should be unloaded
 */
void NetworkGraph::UnloadTensors(unsigned int order) {
tensor_manager->UnloadTensors(order);
}

void NetworkGraph::requestOptimizerVariable(
std::function<std::vector<TensorDim>(const TensorDim &)> cb,
bool request_only_trainable) {
Expand Down
36 changes: 34 additions & 2 deletions nntrainer/graph/network_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,12 @@ class NetworkGraph {
* @brief Allocate memory for all the managed weights
*/
void allocateWeights(bool init = true) {
  // In INFERENCE mode the end of the forward iteration bounds the maximum
  // execution order; otherwise the end of the backward iteration does.
  const unsigned int max_exec_order =
    exec_mode == ExecutionMode::INFERENCE
      ? std::get<0>(forward_iter_end->getExecutionOrder())
      : std::get<3>(backward_iter_end->getExecutionOrder());

  tensor_manager->allocateWeights(max_exec_order, init);
}

/**
Expand Down Expand Up @@ -447,6 +451,34 @@ class NetworkGraph {
*/
void flushCacheExcept(const unsigned int order);

/**
* @brief Load data of order to the device
*
* @param order execution order
*/
void LoadTensors(const unsigned int order);

/**
* @brief check data of order is loaded
*
* @param order execution order
*/
bool checkLoadComplete(const unsigned int order);

/**
* @brief check data of order is Unloaded
*
* @param order execution order
*/
bool checkUnloadComplete(const unsigned int order);

/**
* @brief Unload data of order from the device
*
* @param order execution order
*/
void UnloadTensors(const unsigned int order);

#ifdef ENABLE_TEST
/**
* @brief Get layer node's tenexecution orders
Expand Down
7 changes: 4 additions & 3 deletions nntrainer/layers/layer_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,8 @@ void LayerNode::read(std::ifstream &file, bool opt_var,

for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
/// @note shared weights are only be read at the first acecss
if (run_context->isGradientLastAccess(i)) {
// if (run_context->isGradientLastAccess(i)) {
if (run_context->isGradientFirstAccess(i)) {
if (layer->getType() == BatchNormalizationLayer::type) {
if ((mode == ml::train::ExecutionMode::TRAIN) &&
(this->getWeightDataType() != TensorDim::DataType::FP32)) {
Expand Down Expand Up @@ -526,7 +527,7 @@ void LayerNode::save(std::ofstream &file, bool opt_var,

if (opt_var) {
for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
if (run_context->isGradientLastAccess(i) && getTrainable()) {
if (run_context->isGradientFirstAccess(i) && getTrainable()) {
// @note save optimizer variables
if (run_context->weightHasGradient(i)) {
for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i);
Expand All @@ -539,7 +540,7 @@ void LayerNode::save(std::ofstream &file, bool opt_var,
} else {
// @note shared weights are only be saved at the first access
for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
if (run_context->isGradientLastAccess(i)) {
if (run_context->isGradientFirstAccess(i)) {

/** @note For batch normalization layer, we do need full precision for
* training and the data type of weight is full precision. But for
Expand Down
2 changes: 0 additions & 2 deletions nntrainer/models/model_common_properties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ MemorySwap::MemorySwap(bool value) { set(value); }

MemorySwapPath::MemorySwapPath(const std::string &value) { set(value); }

MemorySwapMode::MemorySwapMode(const std::string &value) { set(value); }

MemorySwapLookahead::MemorySwapLookahead(const unsigned int &value) {
set(value);
}
Expand Down
18 changes: 0 additions & 18 deletions nntrainer/models/model_common_properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,24 +179,6 @@ class MemorySwapLookahead : public Property<unsigned int> {
MemorySwapLookahead(const unsigned int &value = 0);
};

/**
* @brief cache file path property
*
*/
class MemorySwapMode : public Property<std::string> {
public:
static constexpr const char *key =
"memory_swap_mode"; /**< unique key to access */
using prop_tag = str_prop_tag; /**< property type */

/**
* @brief Constructor
*
* @param value value to set, defaults to current directory
*/
MemorySwapMode(const std::string &value = "train");
};

/**
* @brief Enumeration of Data Type for model & layer
*/
Expand Down
Loading

0 comments on commit 6434d16

Please sign in to comment.