Skip to content

Commit

Permalink
EAMxx: fix how we find filename in rpointer
Browse files Browse the repository at this point in the history
* We must consider the avg type and output freq specs, to avoid name clashes
* Since the rhist filename contains the output control specs, there is no need
  to check that freq/freq_units/avg_type are unchanged upon restart: if the
  rhist file is found, they necessarily match.
  • Loading branch information
bartgol committed Sep 30, 2024
1 parent cefaf92 commit e540ae9
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 62 deletions.
77 changes: 46 additions & 31 deletions components/eamxx/src/share/io/scream_io_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,44 +4,54 @@
#include "share/util/scream_utils.hpp"

#include <fstream>
#include <regex>

namespace scream {

std::string find_filename_in_rpointer (
const std::string& filename_prefix,
const bool model_restart,
const ekat::Comm& comm,
const util::TimeStamp& run_t0)
const util::TimeStamp& run_t0,
const OutputAvgType avg_type,
const IOControl& control)
{
std::string filename;
bool found = false;
std::string content;
std::string suffix = model_restart ? ".r." : ".rhist.";
std::string pattern_str = filename_prefix + suffix;

// The AD will pass a default constructed control, since it doesn't know the values
// of REST_N/REST_OPTION used in the previous run. Also, model restart is *always* INSTANT.
if (model_restart) {
EKAT_REQUIRE_MSG (avg_type==OutputAvgType::Instant,
"Error! Model restart output should have INSTANT avg type.\n"
" - input avg_type: " + e2str(avg_type) + "\n");
pattern_str += e2str(OutputAvgType::Instant) + R"(.n(step|sec|min|hour|day|month|year)s_x\d+)";
} else {
EKAT_REQUIRE_MSG (control.output_enabled(),
"Error! When restarting an output stream, we need a valid IOControl structure.\n"
" - filename prefix: " + filename_prefix + "\n");
pattern_str += e2str(avg_type) + "." + control.frequency_units + "_x" + std::to_string(control.frequency);
}
pattern_str += "." + run_t0.to_string() + ".nc";
std::regex pattern (pattern_str);

if (comm.am_i_root()) {
std::ifstream rpointer_file;

std::string line;
rpointer_file.open("rpointer.atm");

// If the timestamp is in the filename, then the filename ends with "S.nc",
// with S being the string representation of the timestamp
auto ts_len = run_t0.to_string().size();
auto extract_ts = [&] (const std::string& line) -> util::TimeStamp {
auto min_size = ts_len+3;
if (line.size()>=min_size) {
auto ts_str = line.substr(line.size()-min_size,ts_len);
auto ts = util::str_to_time_stamp(ts_str);
return ts;
} else {
return util::TimeStamp();
}
};

while ((rpointer_file >> line) and not found) {
while (std::getline(rpointer_file,line)) {
content += line + "\n";

found = line.find(filename_prefix+suffix) != std::string::npos &&
extract_ts(line)==run_t0;
filename = line;
if (std::regex_match(line,pattern)) {
filename = line;
found = true;
break;
}
}
}

Expand All @@ -52,18 +62,23 @@ std::string find_filename_in_rpointer (
if (not found) {
broadcast_string(content,comm,comm.root_rank());

// If the history restart file is not found, it must be because the last
// model restart step coincided with a model output step, in which case
// a restart history file is not written.
// If that's the case, *disable* output restart, by setting
// 'Restart'->'Perform Restart' = false
// in the input parameter list
EKAT_ERROR_MSG (
"Error! Restart requested, but no restart file found in 'rpointer.atm'.\n"
" restart filename prefix: " + filename_prefix + "\n"
" restart file type: " + std::string(model_restart ? "model restart" : "history restart") + "\n"
" run t0 : " + run_t0.to_string() + "\n"
" rpointer content:\n" + content);
if (model_restart) {
EKAT_ERROR_MSG (
"Error! Restart requested, but no model restart file found in 'rpointer.atm'.\n"
" model restart filename prefix: " + filename_prefix + "\n"
" run t0 : " + run_t0.to_string() + "\n"
" rpointer content:\n" + content + "\n\n");
} else {
EKAT_ERROR_MSG (
"Error! Restart requested, but no history restart file found in 'rpointer.atm'.\n"
" hist restart filename prefix: " + filename_prefix + "\n"
" run t0 : " + run_t0.to_string() + "\n"
" avg_type : " + e2str(avg_type) + "\n"
" output freq : " + std::to_string(control.frequency) + "\n"
" output freq units: " + control.frequency_units + "\n"
" rpointer content:\n" + content + "\n\n"
" Did you change output specs (avg type, freq, or freq units) across restart? If so, please, remember that it is not allowed.\n");
}
}

// Have the root rank communicate the nc filename
Expand Down
11 changes: 9 additions & 2 deletions components/eamxx/src/share/io/scream_io_utils.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef SCREAM_IO_UTILS_HPP
#define SCREAM_IO_UTILS_HPP

#include "scream_io_control.hpp"
#include "share/util/scream_time_stamp.hpp"

#include <ekat/util/ekat_string_utils.hpp>
Expand Down Expand Up @@ -59,11 +60,17 @@ inline OutputAvgType str2avg (const std::string& s) {
return OAT::Invalid;
}

// The AD will pass a default constructed control, since it doesn't know the values
// of REST_N/REST_OPTION used in the previous run.
// Output streams MUST pass a valid control structure, because we need to differentiate
// between, e.g., streams with the same filename prefix but different output freq specs.
std::string find_filename_in_rpointer (
const std::string& casename,
const std::string& filename_prefix,
const bool model_restart,
const ekat::Comm& comm,
const util::TimeStamp& run_t0);
const util::TimeStamp& run_t0,
const OutputAvgType avg_type = OutputAvgType::Instant,
const IOControl& control = {});

struct LongNames {

Expand Down
22 changes: 5 additions & 17 deletions components/eamxx/src/share/io/scream_output_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ setup (const ekat::Comm& io_comm, const ekat::ParameterList& params,

if (perform_history_restart) {
using namespace scorpio;
auto rhist_file = find_filename_in_rpointer(hist_restart_filename_prefix,false,m_io_comm,m_run_t0);
IOFileSpecs hist_restart_specs;
hist_restart_specs.ftype = FileType::HistoryRestart;
auto rhist_file = find_filename_in_rpointer(hist_restart_filename_prefix,false,m_io_comm,m_run_t0,m_avg_type,m_output_control);

scorpio::register_file(rhist_file,scorpio::Read);
// From restart file, get the time of last write, as well as the current size of the avg sample
Expand All @@ -196,22 +198,8 @@ setup (const ekat::Comm& io_comm, const ekat::ParameterList& params,

// We do NOT allow changing output specs across restart. If you do want to change
// any of these, you MUST start a new output stream (e.g., setting 'Perform Restart: false')
auto old_freq = scorpio::get_attribute<int>(rhist_file,"GLOBAL","averaging_frequency");
EKAT_REQUIRE_MSG (old_freq == m_output_control.frequency,
"Error! Cannot change frequency when performing history restart.\n"
" - old freq: " << old_freq << "\n"
" - new freq: " << m_output_control.frequency << "\n");
auto old_freq_units = scorpio::get_attribute<std::string>(rhist_file,"GLOBAL","averaging_frequency_units");
EKAT_REQUIRE_MSG (old_freq_units == m_output_control.frequency_units,
"Error! Cannot change frequency units when performing history restart.\n"
" - old freq units: " << old_freq_units << "\n"
" - new freq units: " << m_output_control.frequency_units << "\n");
auto old_avg_type = scorpio::get_attribute<std::string>(rhist_file,"GLOBAL","averaging_type");
EKAT_REQUIRE_MSG (old_avg_type == e2str(m_avg_type),
"Error! Cannot change avg type when performing history restart.\n"
" - old avg type: " << old_avg_type + "\n"
" - new avg type: " << e2str(m_avg_type) << "\n");

// NOTE: we do not check that freq/freq_units/avg_type are not changed: since we used
// that info to find the correct rhist file, we already know that they match!
auto old_storage_type = scorpio::get_attribute<std::string>(rhist_file,"GLOBAL","file_max_storage_type");
EKAT_REQUIRE_MSG (old_storage_type == e2str(m_output_file_specs.storage.type),
"Error! Cannot change file storage type when performing history restart.\n"
Expand Down
39 changes: 27 additions & 12 deletions components/eamxx/src/share/io/tests/io_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
TEST_CASE ("find_filename_in_rpointer") {
using namespace scream;

constexpr auto AVG = OutputAvgType::Average;
constexpr auto INST = OutputAvgType::Instant;

ekat::Comm comm(MPI_COMM_WORLD);

util::TimeStamp t0({2023,9,7},{12,0,0});
Expand All @@ -17,21 +20,33 @@ TEST_CASE ("find_filename_in_rpointer") {
// Create a dummy rpointer
std::ofstream rpointer ("rpointer.atm");

rpointer << "foo.r." + t0.to_string() + ".nc\n";
rpointer << "bar2.rhist." + t0.to_string() + ".nc\n";
rpointer << "bar.rhist." + t0.to_string() + ".nc\n";
rpointer.close();
IOControl foo_c, bar_c, bar2_c;
foo_c.frequency = 3; foo_c.frequency_units = "nsteps";
bar_c.frequency = 1; bar_c.frequency_units = "ndays";
bar2_c.frequency = 6; bar2_c.frequency_units = "nhours";

// Now test find_filename_in_rpointer with different inputs
std::string foo_fname = "foo.r.INSTANT.nsteps_x3." + t0.to_string() + ".nc";
std::string bar_fname = "bar.rhist.AVERAGE.ndays_x1." + t0.to_string() + ".nc";
std::string bar2_fname = "bar.rhist.AVERAGE.nhours_x6." + t0.to_string() + ".nc";

REQUIRE_THROWS (find_filename_in_rpointer("baz",false,comm,t0)); // wrong prefix
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t1)); // wrong timestamp
REQUIRE_THROWS (find_filename_in_rpointer("bar",true, comm,t0)); // bar is not model restart
REQUIRE_THROWS (find_filename_in_rpointer("foo",false,comm,t0)); // foo is model restart
rpointer << foo_fname<< "\n";
rpointer << bar_fname<< "\n";
rpointer << bar2_fname << "\n";
rpointer.close();

REQUIRE (find_filename_in_rpointer("bar", false,comm,t0)==("bar.rhist."+t0.to_string()+".nc"));
REQUIRE (find_filename_in_rpointer("bar2",false,comm,t0)==("bar2.rhist."+t0.to_string()+".nc"));
REQUIRE (find_filename_in_rpointer("foo", true, comm,t0)==("foo.r."+t0.to_string()+".nc"));
// Now test find_filename_in_rpointer with different inputs
REQUIRE_THROWS (find_filename_in_rpointer("baz",false,comm,t0,AVG)); // missing control (needed for rhist files)
REQUIRE_THROWS (find_filename_in_rpointer("baz",false,comm,t0,AVG,bar_c)); // wrong prefix
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t1,AVG,bar_c)); // wrong timestamp
REQUIRE_THROWS (find_filename_in_rpointer("bar",true, comm,t0,AVG,bar_c)); // bar is not model restart
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t0,INST,bar_c)); // wrong avg type
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t0,INST,bar2_c)); // wrong freq specs
REQUIRE_THROWS (find_filename_in_rpointer("foo",false,comm,t0,INST,foo_c)); // foo is model restart
REQUIRE_THROWS (find_filename_in_rpointer("foo",true,comm,t0,AVG)); // model restart MUST be INSTANT

REQUIRE (find_filename_in_rpointer("bar",false,comm,t0,AVG,bar_c)==bar_fname);
REQUIRE (find_filename_in_rpointer("bar",false,comm,t0,AVG,bar2_c)==bar2_fname);
REQUIRE (find_filename_in_rpointer("foo",true, comm,t0)==foo_fname);
}

TEST_CASE ("io_control") {
Expand Down

0 comments on commit e540ae9

Please sign in to comment.