Skip to content

Commit

Permalink
[ FSU ] Enables Asynchronous FSU for forwarding
Browse files Browse the repository at this point in the history
This PR enables asynchronous mode for FSU (flash storage utilization)
for better performance.

It splits the tensor load and unload operations, which were previously
combined and difficult to handle. It also fixes the inference execution
order when running in INFERENCE mode, and sets the trainable option to
false when requesting weights and tensors.

Add new functions to load and unload tensors, as well as to check for
load completion.

It also treats the weight pool and the tensor pool differently according
to the ExecutionMode: FSU mode is not used for the tensor pool in
INFERENCE mode.

Resolves:

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <[email protected]>
  • Loading branch information
jijoongmoon committed Dec 3, 2024
1 parent 9a4375e commit 6434d16
Show file tree
Hide file tree
Showing 20 changed files with 452 additions and 141 deletions.
30 changes: 0 additions & 30 deletions Applications/SimpleFC/README.md

This file was deleted.

64 changes: 32 additions & 32 deletions Applications/SimpleFC/jni/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,17 @@ std::vector<LayerHandle> createGraph() {
std::vector<LayerHandle> layers;

layers.push_back(createLayer(
"input", {withKey("name", "input0"), withKey("input_shape", "1:1:32")}));
"input", {withKey("name", "input0"), withKey("input_shape", "1:1:320")}));

layers.push_back(
createLayer("fully_connected",
{withKey("unit", 10)}));
layers.push_back(createLayer("fully_connected",
{withKey("unit", 100),
withKey("weight_initializer", "xavier_uniform"),
withKey("bias_initializer", "zeros")}));

layers.push_back(
createLayer("fully_connected",
{withKey("unit", 10)}));
layers.push_back(createLayer("fully_connected",
{withKey("unit", 100),
withKey("weight_initializer", "xavier_uniform"),
withKey("bias_initializer", "zeros")}));

return layers;
}
Expand Down Expand Up @@ -133,10 +135,13 @@ void createAndRun(unsigned int epochs, unsigned int batch_size,

// setup model
ModelHandle model = create();
model->setProperty({withKey("batch_size", batch_size),
withKey("epochs", epochs),
withKey("save_path", "model_full.bin"),
withKey("memory_swap","true")});
model->setProperty(
{withKey("batch_size", batch_size), withKey("epochs", epochs),
// withKey("save_path", "model_full.bin")});
// withKey("save_path", "model_full.bin"), withKey("memory_swap",
// "true")});
withKey("memory_swap", "true"), withKey("memory_swap_lookahead", "1"),
withKey("model_tensor_type", "FP16-FP16")});

auto optimizer = ml::train::createOptimizer("sgd", {"learning_rate=0.001"});
model->setOptimizer(std::move(optimizer));
Expand All @@ -156,28 +161,24 @@ void createAndRun(unsigned int epochs, unsigned int batch_size,
auto dataset_valid = ml::train::createDataset(
ml::train::DatasetType::GENERATOR, validData_cb, valid_user_data.get());

// model->setDataset(ml::train::DatasetModeType::MODE_TRAIN,
// std::move(dataset_train));
// model->setDataset(ml::train::DatasetModeType::MODE_VALID,
// std::move(dataset_valid));

// if (transfer_learning)
// model->load(pretrained_bin_path);
// model->train();
model->save("simplefc_weight_fp16_fp16_100.bin",ml::train::ModelFormat::MODEL_FORMAT_BIN);
// exit(0);
// model->load("./simplefc_weight100.bin");
model->load("./simplefc_weight_fp16_fp16_100.bin");

model->summarize(std::cout, ML_TRAIN_SUMMARY_MODEL);

uint feature_size = 32;
uint feature_size = 320;

float input [32];
float label [1];
float input[320];
float label[1];

for(uint j=0;j<feature_size;++j)
for (uint j = 0; j < feature_size; ++j)
input[j] = j;

std::vector<float*> in;
std::vector<float*> l;
std::vector<float*> answer;
std::vector<float *> in;
std::vector<float *> l;
std::vector<float *> answer;

in.push_back(input);
l.push_back(label);
Expand All @@ -187,19 +188,18 @@ void createAndRun(unsigned int epochs, unsigned int batch_size,
in.clear();
l.clear();

std::cout << "done"<<std::endl;

std::cout << "done" << std::endl;
}

std::array<UserDataType, 2>
createFakeDataGenerator(unsigned int batch_size,
unsigned int simulated_data_size,
unsigned int data_split) {
UserDataType train_data(new nntrainer::util::RandomDataLoader(
{{batch_size, 1, 1, 32}}, {{batch_size, 1, 1, 10}},
{{batch_size, 1, 1, 320}}, {{batch_size, 1, 1, 100}},
simulated_data_size / data_split));
UserDataType valid_data(new nntrainer::util::RandomDataLoader(
{{batch_size, 1, 1, 32}}, {{batch_size, 1, 1, 10}},
{{batch_size, 1, 1, 320}}, {{batch_size, 1, 1, 100}},
simulated_data_size / data_split));

return {std::move(train_data), std::move(valid_data)};
Expand Down Expand Up @@ -231,9 +231,9 @@ int main(int argc, char *argv[]) {

std::string data_dir = "fake";
uint batch_size = 1;
uint data_split =1;
uint data_split = 1;
uint epoch = 1;

std::array<UserDataType, 2> user_datas;

try {
Expand Down
1 change: 0 additions & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ if get_option('enable-fp16')
extra_defines += '-DENABLE_FP16=1'
extra_defines += '-DUSE__FP16=1'
extra_defines += '-DUSE_NEON=1'
extra_defines += '-DUSE_MMAP=1'
elif arch == 'aarch64'
## About FP16 in GCC (from GCC-9.1 manual)
# https://gcc.gnu.org/onlinedocs/gcc-9.1.0/gcc/Half-Precision.html
Expand Down
32 changes: 29 additions & 3 deletions nntrainer/graph/network_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,14 @@ int NetworkGraph::checkCompiledGraph() {
void NetworkGraph::markNodesForBackwarding() {
/** accumulate all the nodes which must support backwarding */
std::unordered_set<std::string> must_support_backwarding;
if (exec_mode == ExecutionMode::INFERENCE) {
for (auto iter = cbegin(); iter != cend(); iter++) {
auto lnode = (*iter);
lnode->needsCalcGradient(false);
lnode->needsCalcDerivative(false);
}
return;
}

/**
* if a node is trainable, then all the nodes ahead of it must support
Expand Down Expand Up @@ -867,14 +875,16 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
}
lnode->setDataType(init_context.getWeightDataType(),
init_context.getActivationDataType());

bool trainable = lnode->getTrainable();
if (exec_mode == ExecutionMode::INFERENCE)
trainable = false;
lnode->configureRunContext(
// TODO: update weights spec for trainable based on layer trainable prop
tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
lnode->getTrainable(), shared_weight_names),
trainable, shared_weight_names),
inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names),
trainable, shared_tensor_names),
init_context.getLossScale());

return outputs;
Expand Down Expand Up @@ -1552,6 +1562,22 @@ void NetworkGraph::flushCacheExcept(unsigned int order) {
tensor_manager->flushCacheExcept(order);
}

/**
 * @brief Load the tensors needed at the given execution order
 *        (delegates to the tensor manager).
 *
 * @param order execution order whose tensors should be loaded
 */
void NetworkGraph::LoadTensors(unsigned int order) {
tensor_manager->LoadTensors(order);
}

/**
 * @brief Check whether loading of the tensors for the given execution
 *        order has completed (delegates to the tensor manager).
 *
 * @param order execution order to query
 * @return true when the load for @a order is complete
 */
bool NetworkGraph::checkLoadComplete(unsigned int order) {
return tensor_manager->checkLoadComplete(order);
}

/**
 * @brief Check whether unloading of the tensors for the given execution
 *        order has completed (delegates to the tensor manager).
 *
 * @param order execution order to query
 * @return true when the unload for @a order is complete
 */
bool NetworkGraph::checkUnloadComplete(unsigned int order) {
return tensor_manager->checkUnloadComplete(order);
}

/**
 * @brief Unload the tensors of the given execution order
 *        (delegates to the tensor manager).
 *
 * @param order execution order whose tensors should be unloaded
 */
void NetworkGraph::UnloadTensors(unsigned int order) {
tensor_manager->UnloadTensors(order);
}

void NetworkGraph::requestOptimizerVariable(
std::function<std::vector<TensorDim>(const TensorDim &)> cb,
bool request_only_trainable) {
Expand Down
36 changes: 34 additions & 2 deletions nntrainer/graph/network_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,12 @@ class NetworkGraph {
* @brief Allocate memory for all the managed weights
*/
void allocateWeights(bool init = true) {
  // In INFERENCE mode the end of the forward iteration bounds the maximum
  // execution order; otherwise the end of the backward iteration does.
  const unsigned int max_exec_order =
    exec_mode == ExecutionMode::INFERENCE
      ? std::get<0>(forward_iter_end->getExecutionOrder())
      : std::get<3>(backward_iter_end->getExecutionOrder());

  tensor_manager->allocateWeights(max_exec_order, init);
}

/**
Expand Down Expand Up @@ -447,6 +451,34 @@ class NetworkGraph {
*/
void flushCacheExcept(const unsigned int order);

/**
* @brief Load data of order to the device
*
* @param order execution order
*/
void LoadTensors(const unsigned int order);

/**
* @brief check data of order is loaded
*
* @param order execution order
*/
bool checkLoadComplete(const unsigned int order);

/**
* @brief check data of order is Unloaded
*
* @param order execution order
*/
bool checkUnloadComplete(const unsigned int order);

/**
* @brief Unload data of order from the device
*
* @param order execution order
*/
void UnloadTensors(const unsigned int order);

#ifdef ENABLE_TEST
/**
* @brief Get layer node's tenexecution orders
Expand Down
7 changes: 4 additions & 3 deletions nntrainer/layers/layer_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,8 @@ void LayerNode::read(std::ifstream &file, bool opt_var,

for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
/// @note shared weights are only be read at the first acecss
if (run_context->isGradientLastAccess(i)) {
// if (run_context->isGradientLastAccess(i)) {
if (run_context->isGradientFirstAccess(i)) {
if (layer->getType() == BatchNormalizationLayer::type) {
if ((mode == ml::train::ExecutionMode::TRAIN) &&
(this->getWeightDataType() != TensorDim::DataType::FP32)) {
Expand Down Expand Up @@ -526,7 +527,7 @@ void LayerNode::save(std::ofstream &file, bool opt_var,

if (opt_var) {
for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
if (run_context->isGradientLastAccess(i) && getTrainable()) {
if (run_context->isGradientFirstAccess(i) && getTrainable()) {
// @note save optimizer variables
if (run_context->weightHasGradient(i)) {
for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i);
Expand All @@ -539,7 +540,7 @@ void LayerNode::save(std::ofstream &file, bool opt_var,
} else {
// @note shared weights are only be saved at the first access
for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
if (run_context->isGradientLastAccess(i)) {
if (run_context->isGradientFirstAccess(i)) {

/** @note For batch normalization layer, we do need full precision for
* training and the data type of weight is full precision. But for
Expand Down
2 changes: 0 additions & 2 deletions nntrainer/models/model_common_properties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ MemorySwap::MemorySwap(bool value) { set(value); }

MemorySwapPath::MemorySwapPath(const std::string &value) { set(value); }

MemorySwapMode::MemorySwapMode(const std::string &value) { set(value); }

MemorySwapLookahead::MemorySwapLookahead(const unsigned int &value) {
set(value);
}
Expand Down
18 changes: 0 additions & 18 deletions nntrainer/models/model_common_properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,24 +179,6 @@ class MemorySwapLookahead : public Property<unsigned int> {
MemorySwapLookahead(const unsigned int &value = 0);
};

/**
* @brief cache file path property
*
*/
class MemorySwapMode : public Property<std::string> {
public:
static constexpr const char *key =
"memory_swap_mode"; /**< unique key to access */
using prop_tag = str_prop_tag; /**< property type */

/**
* @brief Constructor
*
* @param value value to set, defaults to current directory
*/
MemorySwapMode(const std::string &value = "train");
};

/**
* @brief Enumeration of Data Type for model & layer
*/
Expand Down
Loading

0 comments on commit 6434d16

Please sign in to comment.