meson: do not enable AVX in non-x64 #2629

Closed · wants to merge 25 commits

Changes from all commits

Commits (25)
1ecac99
[ Weight ] Add Var32 Tensor in Weight.
jijoongmoon May 2, 2024
a5491f1
[ Mixed ] Create weight with var32 tensor
jijoongmoon May 7, 2024
8dab24c
[ Layers ] Update Layers to support FP16
jijoongmoon May 7, 2024
69bda1a
[ Test ] Mixed Precision Test Case
jijoongmoon May 7, 2024
82e74a9
[ Optimizer ] Update Optimizer / Adam to support Mixed training
jijoongmoon May 9, 2024
727a810
[ Tensor ] add is_NaN check in Tensor
jijoongmoon May 8, 2024
e0c6d98
[ Context ] Add loss scale in Context & using mse loss
jijoongmoon May 11, 2024
0792b84
[ Mixed Precision ] Enable Mixed Precision
jijoongmoon May 13, 2024
88908bd
[ Tensor ] Add infinity check in Tensor
jijoongmoon May 14, 2024
9351134
[ MSE ] Fix for better MSE loss precision
jijoongmoon May 17, 2024
931879b
[ TEST ] Add Torch Mixed Precision Model Test
jijoongmoon May 17, 2024
c333096
[ TEST ] add torch input and output test data for mixed precision
jijoongmoon May 20, 2024
9c51f23
[ TEST ] Add more unittests and fixes for mixed precision
jijoongmoon May 24, 2024
91850a4
[ Layer ] Update Conv2D to support Mixed Precision
jijoongmoon May 29, 2024
4ac4588
[ Layer ] enable Mixed Precision in LSTM Layer
jijoongmoon May 30, 2024
4c376fd
[ Model ] Add Execution Mode in Compile
jijoongmoon May 31, 2024
6088bd2
[ Layer ] Mixed Precision support for BN Layer
jijoongmoon Jun 3, 2024
d15aef8
[layer] enable mixed precision - reshape_layer
DonghakPark May 30, 2024
ab6124d
[Layer] Enable mixed precision - pooling2d_layer
DonghakPark Jun 3, 2024
ce6650e
[ Model ] Fix the gradient clipping for the FP16 or Low bit Gradient
jijoongmoon Jun 9, 2024
15794ac
[ Layer ] Add mu and var backup tensor.
jijoongmoon Jun 9, 2024
9faaaad
[ Layer ] prevent randomization when restoring the data
jijoongmoon Jun 9, 2024
1f80b65
[ Context ] add check if it needs to restore previous data
jijoongmoon Jun 9, 2024
527becc
[ Tensor ] remove sscal to set zero.
jijoongmoon Jun 9, 2024
13bce59
meson: do not enable AVX in non-x64
myungjoo Jun 10, 2024
Applications/KNN/jni/meson.build (2 changes: 1 addition & 1 deletion)

@@ -15,4 +15,4 @@ e = executable('knn_sample',
install_dir: application_install_dir
)

test('app_knn', e, args: [nntr_app_resdir / 'KNN'])
test('app_knn', e, args: [nntr_app_resdir / 'KNN/'])
api/ccapi/include/model.h (2 changes: 1 addition & 1 deletion)

@@ -136,7 +136,7 @@ class Model {
* @retval #ML_ERROR_NONE Successful.
* @retval #ML_ERROR_INVALID_PARAMETER invalid parameter.
*/
virtual int compile() = 0;
virtual int compile(ExecutionMode exec_mode_ = ExecutionMode::TRAIN) = 0;

/**
* @brief Initialize Network. This should be called after setting the
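The new `compile()` signature defaults to `ExecutionMode::TRAIN`, so existing callers are unaffected while inference-only users can skip training-specific graph setup. A minimal usage sketch follows; it assumes `ExecutionMode` and the model factory are exposed through the `ml::train` ccapi namespace, which is not shown in this diff.

```cpp
// Sketch only: assumes ml::train::createModel() and ml::train::ExecutionMode
// are reachable through the installed ccapi headers; neither appears in this
// diff, so treat the includes and namespaces as assumptions.
#include <memory>
#include <model.h>

void compile_models(std::unique_ptr<ml::train::Model> &train_model,
                    std::unique_ptr<ml::train::Model> &infer_model) {
  // Unchanged call sites keep working: exec_mode_ defaults to TRAIN.
  train_model->compile();

  // New: request an inference-mode graph at compile time.
  infer_model->compile(ml::train::ExecutionMode::INFERENCE);
}
```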
debian/nntrainer-dev.install (1 change: 1 addition & 0 deletions)

@@ -16,6 +16,7 @@
/usr/include/nntrainer/blas_interface.h
/usr/include/nntrainer/var_grad.h
/usr/include/nntrainer/weight.h
/usr/include/nntrainer/blas_avx.h
# todo: update dataset headers
/usr/include/nntrainer/databuffer.h
/usr/include/nntrainer/databuffer_factory.h
meson.build (19 changes: 13 additions & 6 deletions)

@@ -64,9 +64,21 @@ warning_c_flags = [
'-Wno-error=varargs'
]

arch = host_machine.cpu_family()

if get_option('enable-avx') and arch == 'x86_64'
extra_defines += '-DUSE_AVX=1'
if get_option('platform') == 'tizen'
add_project_arguments(['-mavx2'], language: ['c','cpp'])
else
add_project_arguments(['-march=native'], language: ['c','cpp'])
endif
message('-march=native added for AVX hardware acceleration.')
elif get_option('enable-avx')
warning('AVX enabled for non x86_64 build target. The enable-avx option is ignored.')
endif

if get_option('enable-fp16')
arch = host_machine.cpu_family()
if get_option('platform') == 'android'
add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp'])
extra_defines += '-DENABLE_FP16=1'
@@ -105,11 +117,6 @@ if get_option('enable-fp16')
if cc.version().version_compare('>=12.1.0')
message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
extra_defines += '-DENABLE_FP16=1'
if get_option('enable-avx')
extra_defines += '-DUSE_AVX=1'
add_project_arguments(['-march=native'], language: ['c','cpp'])
message('-march=native added for AVX hardware acceleration.')
endif
else
warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
endif
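With this change, `-DUSE_AVX=1` (plus `-mavx2` on Tizen or `-march=native` elsewhere) is only added when `enable-avx` is set and the build targets x86_64; on other CPU families the option is ignored with a warning. Below is a minimal sketch of how consuming C++ code typically keys off that define; the function and its scalar fallback are illustrative, not the contents of nntrainer's `blas_avx.h`.

```cpp
// Hypothetical illustration: compile AVX code only when meson defined USE_AVX.
// The function name and the scalar fallback are assumptions for this sketch,
// not the actual contents of nntrainer/blas_avx.h.
#include <cstddef>

#ifdef USE_AVX
#include <immintrin.h>
#endif

// Element-wise addition: AVX path on x86_64 builds, plain loop elsewhere.
void add_vectors(const float *a, const float *b, float *out, std::size_t n) {
  std::size_t i = 0;
#ifdef USE_AVX
  for (; i + 8 <= n; i += 8) {
    __m256 va = _mm256_loadu_ps(a + i); // load 8 unaligned floats
    __m256 vb = _mm256_loadu_ps(b + i);
    _mm256_storeu_ps(out + i, _mm256_add_ps(va, vb));
  }
#endif
  for (; i < n; ++i) { // scalar tail, and the whole loop on non-x86_64
    out[i] = a[i] + b[i];
  }
}
```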
meson_options.txt (2 changes: 1 addition & 1 deletion)

@@ -40,7 +40,7 @@ option('enable-fp16', type: 'boolean', value: false)
option('enable-cublas', type: 'boolean', value: false)
option('enable-openmp', type: 'boolean', value: true)
option('enable-neon', type: 'boolean', value: false)
option('enable-avx', type: 'boolean', value: false)
option('enable-avx', type: 'boolean', value: true)
option('enable-opencl', type: 'boolean', value: false)

# ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api )
nntrainer/graph/graph_core.cpp (9 changes: 9 additions & 0 deletions)

@@ -35,6 +35,10 @@ GraphCore::getSortedNode(unsigned int ith) const {
return Sorted.at(ith);
}

const unsigned int GraphCore::getSortedNodeIdx(const std::string &name) const {
return sorted_node_map.at(name);
}

void GraphCore::makeAdjacencyList(
std::vector<std::list<std::shared_ptr<GraphNode>>> &adj) {
/** initialize the adj list */
@@ -93,6 +97,11 @@ void GraphCore::topologicalSort() {

if (Sorted.size() != node_list.size())
throw std::runtime_error("Internal error in topologicalSort");
unsigned int idx = 0;
for (auto n : Sorted) {
sorted_node_map[n->getName()] = idx;
idx++;
}
}

const std::shared_ptr<GraphNode> &
nntrainer/graph/graph_core.h (8 changes: 8 additions & 0 deletions)

@@ -91,6 +91,13 @@ class GraphCore {
*/
const std::shared_ptr<GraphNode> &getSortedNode(unsigned int ith) const;

/**
* @brief getter of Sorted GraphNode index with name
* @param[in] layer name
* @ret index
*/
const unsigned int getSortedNodeIdx(const std::string &name) const;

/**
* @brief getter of GraphNode with node name
* @param[in] node name
@@ -252,6 +259,7 @@ class GraphCore {
std::vector<std::shared_ptr<GraphNode>>
node_list; /**< Unordered Node List */
std::unordered_map<std::string, int> node_map; /**< Unordered Node map */
std::unordered_map<std::string, int> sorted_node_map; /**< Unordered Node map */
std::vector<std::shared_ptr<GraphNode>> Sorted; /**< Ordered Node List */
bool sorted; /** if the node_list is sorted */

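`topologicalSort()` now records each node's position in `sorted_node_map`, and `getSortedNodeIdx()` exposes that position by layer name; the mixed-precision path in `network_graph.cpp` below uses it to locate the layer whose gradient overflowed and restart forwarding from there. A small hedged sketch of the lookup pattern; `graph` and the layer name are placeholders, not code from this PR.

```cpp
// Illustrative use of the new accessor pair; not taken from the PR itself.
#include <string>

#include <graph_core.h> // nntrainer in-tree header; include path assumed

// Given a layer name, find where to resume iteration in topological order.
unsigned int resume_index(const nntrainer::GraphCore &graph,
                          const std::string &layer_name) {
  // O(1) name -> sorted-position lookup instead of scanning Sorted linearly.
  unsigned int idx = graph.getSortedNodeIdx(layer_name);

  // The node itself is still reachable through the existing accessor.
  auto &node = graph.getSortedNode(idx);
  (void)node; // e.g. forwarding could be re-run from this node onward

  return idx;
}
```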
nntrainer/graph/network_graph.cpp (161 changes: 126 additions & 35 deletions)

@@ -337,7 +337,7 @@ void NetworkGraph::applyGradients(
continue;
}

if (rc.isGradientClipByGlobalNorm(i)) {
if (rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) {
/**
* @note the weights whose gradient are to be clipped by global norm will
* be clipped at once at the end of iteration and applied then.
@@ -393,56 +393,118 @@ sharedConstTensors NetworkGraph::incremental_forwarding(
return out;
}

void NetworkGraph::backwarding(
bool NetworkGraph::backwarding(
int iteration,
std::function<void(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &apply_grad_clip_op,
std::function<bool(void *userdata)> stop_cb, void *userdata) const {
std::function<void(std::shared_ptr<LayerNode>, bool)> &forwarding_op,
std::function<bool(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &lazy_apply_grad_op,
std::function<bool(void *userdata)> stop_cb, void *userdata) {
/**
* last layer backwarding is run out of this loop
*/
auto iter_begin = getBackwardingBeginIter();
auto iter_end = getBackwardingEndIter();
bool is_valid = true;

/// there is no layer to train, so backwarding is essentially noop
if (iter_begin == iter_end) {
return;
return true;
}

auto const &lptr_begin = (*iter_begin);
// graph_const_reverse_iterator
auto iter_ = iter_begin;

if (lptr_begin->requireLabel() == false)
throw std::runtime_error(
"Error: last layer does not accept label, we can't train");

for (auto iter = iter_begin; iter != iter_end && !stop_cb(userdata); iter++) {
auto &ln = *iter;
for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) {
auto &ln = *iter_;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
backwarding_op(ln, iteration);
is_valid = backwarding_op(ln, iteration);
PROFILE_TIME_END(profile_keys.at(ln->getType()));

if (!is_valid) {
std::cout << ln->getName() << " : Gradient has NaN --> "
<< ln->getRunContext().getLossScale() << std::endl;
break;
}
}

/** perform clipping of the gradients by global norm if any */
if (clip_weights.empty())
return;
if (!is_valid) {
/** if has NaN
* 1. reset the loss scale. : @todo Backoff_factor : default --> 0.5
* 2. run forwarding from cur_iter to cend() && !stop_cb(userdata);
* 3. return false --> run backwarding again;
*/
float scale = (*iter_)->getRunContext().getLossScale();

NNTR_THROW_IF(scale == 1.0f, std::invalid_argument)
<< "Loss Scale Factor is 1.0f";

float s = scale > 1.5f ? scale * 0.5f : 1.0f;

resetLossScale(s);

/** calculate the global norm */
Tensor global_norm_t(
TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()}));
float *global_norm_data = global_norm_t.getData();
for (unsigned int idx = 0; idx < clip_weights.size(); idx++) {
auto const &w = clip_weights[idx];
global_norm_data[idx] = w->getGradientNorm();
auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName());

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
ln->reStoreData(true);
}

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
forwarding_op(*iter, true);
PROFILE_TIME_END(profile_keys.at(ln->getType()));
}

return false;
}
float global_norm = global_norm_t.l2norm();
/** apply the gradient with the above global norm */
for (auto w : clip_weights) {
w->clipGradientByGlobalNorm(global_norm);

/** perform clipping of the gradients by global norm if any */
if (lazy_weights.empty())
return true;

if (is_clip_grad) {
/** calculate the global norm */
Tensor global_norm_t(
TensorDim({1u, 1u, 1u, (unsigned int)lazy_weights.size()}));
float *global_norm_data = global_norm_t.getData();
for (unsigned int idx = 0; idx < lazy_weights.size(); idx++) {
auto const &w = lazy_weights[idx];
if (w->getGradientRef().getDataType() != TensorDim::DataType::FP32) {
Tensor grad_32 = w->getGradientRef().clone(TensorDim::DataType::FP32);
global_norm_data[idx] = grad_32.l2norm();
} else {
global_norm_data[idx] = w->getGradientNorm();
}
}
float global_norm = global_norm_t.l2norm();
/** apply the gradient with the above global norm */
for (auto w : lazy_weights) {
w->clipGradientByGlobalNorm(global_norm);
}
}
/** apply the gradient with the above global norm */
for (auto w : clip_weights) {
apply_grad_clip_op(*w, iteration);
for (auto w : lazy_weights) {
lazy_apply_grad_op(*w, iteration);
}
nan_count++;

/** @todo : handle as property : growth_interval : default --> 2000 */

if (nan_count > 2000) {
float scale = (*iter_)->getRunContext().getLossScale();
/** @todo growth_factor : default --> 2.0 */
float s = scale * 2.0f;
resetLossScale(s);
nan_count = 0;
}

return true;
}

LayerNode *NetworkGraph::computeBackwardEnd() {
@@ -580,8 +642,15 @@ void NetworkGraph::addLayer(std::shared_ptr<LayerNode> layer) {

InPlace
NetworkGraph::canExecuteInPlace(const std::shared_ptr<LayerNode> &lnode) {
if (!lnode->supportInPlace())

if (!lnode->supportInPlace()) {
return InPlace::NONE;
}

if (lnode->getType() == InputLayer::type &&
!istrequal(getTensorType()[2], "FP32")) {
return InPlace::NONE;
}

/** layers which behave as a no-op - flatten */
auto no_op = [](const std::shared_ptr<LayerNode> &lnode) {
@@ -746,7 +815,7 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
[](const Var_Grad *vg) { return vg->getDim(); });

/** finalize the layer and get the final context */
auto init_context = lnode->finalize(input_dims, getTensorType());
auto init_context = lnode->finalize(input_dims, getTensorType(), exec_mode);

/**
* Request manager for either a pre-allocated output as input or a newly
@@ -768,9 +837,10 @@
* node is going to be used with in-place optimizations.
*/
auto out_specs = init_context.getOutSpecs();

/// @note try move inplace control to finalize
bool shared_var = false, shared_grad = false;
if (lnode->executeInPlace() != InPlace::NONE) {
if (lnode->executeInPlace() != InPlace::NONE && lnode->supportInPlace()) {
setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad);
for (unsigned int i = 0; i < out_specs.size(); ++i) {
auto &s = out_specs.at(i);
@@ -873,13 +943,17 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
}
}

lnode->setDataType(init_context.getWeightDataType(),
init_context.getActivationDataType());

lnode->configureRunContext(
// TODO: update weights spec for trainable based on layer trainable prop
tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
lnode->getTrainable(), shared_weight_names),
inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names));
lnode->getTrainable(), shared_tensor_names),
init_context.getLossScale());

return outputs;
}
@@ -1027,7 +1101,8 @@ NetworkGraph::refinalizeContext(const std::shared_ptr<LayerNode> &lnode,
// TODO: update weights spec for trainable based on layer trainable prop
weights, inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names));
lnode->getTrainable(), shared_tensor_names),
init_context.getLossScale());

return outputs;
}
@@ -1197,7 +1272,7 @@ int NetworkGraph::initialize(ExecutionMode mode,
*/
if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access) ||
(rc.isGradientClipByGlobalNorm(i) &&
((rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) &&
tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access))) {
rc.getWeightObject(i).setAsGradientLastAccess();
@@ -1287,11 +1362,19 @@ int NetworkGraph::initialize(ExecutionMode mode,

/** select weights which would require clipping of the gradients by global
* norm if any */
clip_weights = tensor_manager->getWeights([](const Weight *w) {
lazy_weights = tensor_manager->getWeights([](const Weight *w) {
return w->hasGradient() && w->isGradientLastAccess() &&
w->isGradientClipByGlobalNorm();
(w->isGradientClipByGlobalNorm() || w->isMixedPrecision());
});

is_clip_grad = false;
for (auto w : lazy_weights) {
if (w->isGradientClipByGlobalNorm()) {
is_clip_grad = true;
break;
}
}

return ML_ERROR_NONE;
}

Expand Down Expand Up @@ -1556,10 +1639,18 @@ void NetworkGraph::requestOptimizerVariable(
const TensorDim &dim = w->getDim();
std::vector<TensorDim> dims = cb(dim);
w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables(
dims, w->getName(), TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS));
dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), w->isMixedPrecision(),
Tensor::Initializer::ZEROS));
}
}
}

void NetworkGraph::resetLossScale(float scale) {
for (auto iter = cbegin(); iter != cend(); iter++) {
auto &ln = *iter;
ln->getRunContext().setLossScale(scale);
}
}

} /* namespace nntrainer */
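Taken together, the `backwarding()` changes implement dynamic loss scaling: when a layer reports a non-finite gradient, activations are restored, forwarding is re-run from the offending layer, the scale is backed off, and `false` is returned so the caller can retry the iteration, while `nan_count` and the `@todo` defaults (backoff factor 0.5, growth factor 2.0, growth interval 2000) govern when the scale is grown again. The stand-alone sketch below captures a common form of that policy in isolation; `LossScaler` and its constants are illustrative names following the TODO comments, not an nntrainer API.

```cpp
// Stand-alone sketch of the loss-scaling policy implied by the new
// backwarding() contract. LossScaler is an invented name; the constants
// mirror the @todo defaults in the diff rather than finalized properties.
#include <functional>

class LossScaler {
public:
  // run_iteration performs forward/backward/update at the given scale and
  // returns false when a non-finite gradient is detected (as the new
  // NetworkGraph::backwarding() now does).
  void step(const std::function<bool(float)> &run_iteration) {
    while (!run_iteration(scale_)) {
      // Overflow: back the scale off and redo the whole iteration,
      // mirroring "return false --> run backwarding again" in the diff.
      scale_ = scale_ > 1.5f ? scale_ * backoff_factor_ : 1.0f;
      good_iters_ = 0;
    }
    // Enough clean iterations in a row: grow the scale again.
    if (++good_iters_ > growth_interval_) {
      scale_ *= growth_factor_;
      good_iters_ = 0;
    }
  }

  float scale() const { return scale_; }

private:
  float scale_ = 65536.0f;              // initial scale (assumption)
  float backoff_factor_ = 0.5f;         // @todo default in the PR
  float growth_factor_ = 2.0f;          // @todo default in the PR
  unsigned int growth_interval_ = 2000; // @todo default in the PR
  unsigned int good_iters_ = 0;
};
```

In the PR itself this bookkeeping lives inside `NetworkGraph` (`nan_count`, `resetLossScale()`), and the retry loop is simply the trainer calling `backwarding()` again whenever it returns `false`.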