Skip to content

Commit

Permalink
[onert] Export tensor and optimizer data to the checkpoint file (#13815)
Browse files Browse the repository at this point in the history
This commit exports tensor and optimizer data to the checkpoint file.
It introduces DataBuffer to manage checkpoint data buffer.

ONE-DCO-1.0-Signed-off-by: Jiyoung Yun <[email protected]>
  • Loading branch information
jyoungyun authored Sep 4, 2024
1 parent 949661a commit 8a22e18
Showing 1 changed file with 172 additions and 3 deletions.
175 changes: 172 additions & 3 deletions runtime/onert/core/src/exporter/train/CheckpointExporter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,51 @@ using namespace train;
using namespace checkpoint;
using namespace exec;

// Accumulates raw tensor payloads preceded by a per-entry offset table.
//
// Layout produced by write():
//   [uint32_t offset[0..n-1]] [concatenated data bytes]
// Each offset is an absolute position within the checkpoint file; the base
// position is supplied via setOffset(). Required call order:
//   setSize() -> setOffset() -> setData() once per entry, in order.
struct DataBuffer
{
  // Reserves room for the offset table and the concatenated payload.
  // @param sizes byte size of each entry, in the order setData() will be called
  void setSize(const std::vector<uint32_t> &sizes)
  {
    _offset.resize(sizes.size());
    // Accumulate with an unsigned init value so the sum is computed in
    // uint32_t, not int (a signed int accumulator could overflow, which is UB,
    // and would narrow on assignment).
    const uint32_t total = std::accumulate(sizes.begin(), sizes.end(), uint32_t{0});
    _data.resize(total);

    _offset_it = _offset.begin();
    _data_ptr = _data.data();
  }

  // Sets the absolute file position of the first payload byte.
  // The offset table itself precedes the payload, so the first entry starts
  // after _offset.size() uint32_t slots.
  void setOffset(uint32_t offset) { _start_offset = offset + _offset.size() * sizeof(uint32_t); }

  // Appends one entry's bytes and records its absolute offset.
  // This function should be called after executing the setSize() and
  // setOffset() functions. A null/empty entry still records an offset and
  // consumes no payload bytes.
  void setData(const char *data, uint32_t size)
  {
    assert(_offset_it != _offset.end());
    // Cast the pointer difference to size_t to avoid a signed/unsigned
    // comparison in the bounds check.
    assert(static_cast<size_t>(_data_ptr - _data.data()) + size <= _data.size());

    *_offset_it++ = _start_offset;
    if (data && size > 0)
      std::memcpy(_data_ptr, data, size);
    _data_ptr += size;
    _start_offset += size;
  }

  // Total number of bytes write() will emit (offset table + payload).
  uint32_t size() const { return sizeof(uint32_t) * _offset.size() + _data.size(); }

  // Serializes the offset table followed by the payload.
  // Uses data() rather than &v[0]: operator[] on an empty vector is
  // undefined behavior, while data() is valid (possibly null) for size 0.
  void write(std::ofstream &ostream)
  {
    ostream.write(reinterpret_cast<const char *>(_offset.data()),
                  static_cast<std::streamsize>(sizeof(uint32_t) * _offset.size()));
    ostream.write(_data.data(), static_cast<std::streamsize>(_data.size()));
  }

private:
  std::vector<uint32_t> _offset;
  std::vector<char> _data;
  uint32_t _start_offset = 0; // absolute file offset of the next entry's payload
  std::vector<uint32_t>::iterator _offset_it;
  char *_data_ptr = nullptr;
};

class CheckpointExporter
{
public:
Expand All @@ -44,8 +89,29 @@ class CheckpointExporter
_header.schema = checkpoint::SCHEMA_VERSION;

uint32_t offset = sizeof(_header);
// TODO Store tensor and optimizer data
UNUSED_RELEASE(exec);

auto length = 0;
exec->iterateTrainableTensors(
[&](const ir::OperandIndex &, const backend::train::ITrainableTensor *) { length++; });
_header.length = length;

setTensorData(offset, exec);
offset += _tensor_data.size();

_header.opt1_offset = offset;
setOptimizerData(offset, train_info, exec);
if (_optimizer_data.size() > 2)
throw std::runtime_error{"Do not support optimizer data more than 2."};
if (_optimizer_data.size() > 0)
{
offset += _optimizer_data[0].size();
if (_optimizer_data.size() > 1)
{
_header.opt2_offset = offset;
offset += _optimizer_data[1].size();
}
}

_header.other_offset = offset;

std::memset(&_footer, 0, sizeof(_footer));
Expand All @@ -59,14 +125,117 @@ class CheckpointExporter
throw std::runtime_error{"Failed to save checkpoint: " + path};

dst.write(reinterpret_cast<const char *>(&_header), sizeof(_header));
// TODO Write tensor and optimizer data
_tensor_data.write(dst);
for (auto &opt : _optimizer_data)
opt.write(dst);
dst.write(reinterpret_cast<const char *>(&_footer), sizeof(_footer));
dst.close();
}

private:
// Collects every trainable tensor's raw bytes into _tensor_data.
// Two passes over the tensors: the first gathers per-tensor byte counts so
// the buffer can be sized up front, the second copies the payloads.
void setTensorData(uint32_t start_offset, const exec::Execution *const exec)
{
  std::vector<uint32_t> tensor_sizes;
  exec->iterateTrainableTensors(
    [&](const ir::OperandIndex &, const backend::train::ITrainableTensor *tensor) {
      assert(tensor);
      // Offsets are stored as uint32_t, so a tensor this large cannot be addressed.
      if (tensor->total_size() >= std::numeric_limits<uint32_t>::max())
      {
        throw std::runtime_error{"Tensor size exceeds the uint32_t max value. This model does not "
                                 "support saving as a checkpoint file."};
      }
      tensor_sizes.emplace_back(tensor->total_size());
    });

  assert(_header.length == tensor_sizes.size());

  _tensor_data.setSize(tensor_sizes);
  _tensor_data.setOffset(start_offset);

  // idx is only read inside asserts; [[maybe_unused]] silences release builds.
  [[maybe_unused]] auto idx = 0;
  exec->iterateTrainableTensors(
    [&](const ir::OperandIndex &, const backend::train::ITrainableTensor *tensor) {
      assert(tensor);
      assert(tensor_sizes[idx++] == tensor->total_size());
      _tensor_data.setData(reinterpret_cast<const char *>(tensor->buffer()), tensor->total_size());
    });
}

// Dispatches to the optimizer-specific serializer based on the training
// configuration. Optimizer codes without a handler export no data.
void setOptimizerData(uint32_t start_offset, const ir::train::TrainingInfo *const train_info,
                      const exec::Execution *const exec)
{
  // TODO Support multiple optimizer
  const auto optim_code = train_info->optimizerInfo().optim_code;
  if (optim_code == onert::ir::train::OptimizerCode::Adam)
    setAdamOptimizerData(start_offset, exec);
}

// Serializes the Adam optimizer state into _optimizer_data.
// One DataBuffer per Adam variable, laid out back-to-back in the file;
// tensors without optimizer state contribute zero-sized entries so every
// buffer keeps one slot per trainable tensor.
void setAdamOptimizerData(uint32_t start_offset, const exec::Execution *const exec)
{
  // Adam optimizer has two optimizer variables. (mean, variance)
  constexpr auto ADAM_VARIABLE_COUNT = 2;

  // Pass 1: byte size of each tensor's optimizer variables (0 when absent).
  std::vector<uint32_t> var_sizes;
  exec->iterateTrainableTensors(
    [&](const ir::OperandIndex &, const backend::train::ITrainableTensor *tensor) {
      assert(tensor);
      // optVars() is only callable on a non-const tensor; cast away constness
      // to read the variable list.
      auto mutable_tensor = const_cast<backend::train::ITrainableTensor *>(tensor);
      const auto &opt_vars = mutable_tensor->optVars();

      // Untrainable tensor should not have any optimizer variables.
      assert(opt_vars.size() == ADAM_VARIABLE_COUNT || opt_vars.size() == 0);

      uint32_t entry_size = 0;
      if (opt_vars.size() == ADAM_VARIABLE_COUNT)
      {
        assert(opt_vars[0]->total_size() == opt_vars[1]->total_size());
        entry_size = opt_vars[0]->total_size();
      }
      var_sizes.emplace_back(entry_size);
    });

  assert(_header.length == var_sizes.size());

  // Size each variable's buffer identically and place them consecutively.
  _optimizer_data.resize(ADAM_VARIABLE_COUNT);
  for (auto &buffer : _optimizer_data)
  {
    buffer.setSize(var_sizes);
    buffer.setOffset(start_offset);
    start_offset += buffer.size();
  }

  // Pass 2: copy the variable payloads (or record empty entries).
  [[maybe_unused]] auto idx = 0;
  exec->iterateTrainableTensors(
    [&](const ir::OperandIndex &, const backend::train::ITrainableTensor *tensor) {
      assert(tensor);
      auto mutable_tensor = const_cast<backend::train::ITrainableTensor *>(tensor);
      const auto &opt_vars = mutable_tensor->optVars();

      // Loop-invariant: whether this tensor carries Adam state at all.
      const bool has_vars = (opt_vars.size() == ADAM_VARIABLE_COUNT);
      for (auto i = 0; i < ADAM_VARIABLE_COUNT; ++i)
      {
        if (has_vars)
        {
          assert(opt_vars[i]->total_size() == var_sizes[idx]);
          _optimizer_data[i].setData(reinterpret_cast<const char *>(opt_vars[i]->buffer()),
                                     opt_vars[i]->total_size());
        }
        else
          _optimizer_data[i].setData(nullptr, 0);
      }
      idx++;
    });
}

private:
checkpoint::Header _header;
checkpoint::Footer _footer;
DataBuffer _tensor_data;
std::vector<DataBuffer> _optimizer_data;
};

} // namespace
Expand Down

0 comments on commit 8a22e18

Please sign in to comment.