Skip to content

Commit

Permalink
Merge Pull Request #2986 from E3SM-Project/scream/bartgol/eamxx/fix-i…
Browse files Browse the repository at this point in the history
…o-bug

Automatically Merged using E3SM Pull Request AutoTester
PR Title: Fix bug related to name clashing in output streams
PR Author: bartgol
PR LABELS: I/O, AT: AUTOMERGE, bugfix, code robustness
  • Loading branch information
E3SM-Autotester authored Oct 4, 2024
2 parents 2763997 + 48930bf commit d981484
Show file tree
Hide file tree
Showing 14 changed files with 109 additions and 86 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ TEST_CASE("nudging_tests") {
const int nlevs_fine = 2*nlevs_data -1;

// Files names
auto postfix = ".INSTANT.nsteps_x1." + get_t0().to_string() + ".nc";
auto postfix = ".INSTANT.nsteps_x1.np*." + get_t0().to_string() + ".nc";
auto nudging_data = "nudging_data" + postfix;
auto nudging_data_filled = "nudging_data_filled" + postfix;
auto map_file = "map_ncol" + std::to_string(ngcols_data)
Expand Down
87 changes: 56 additions & 31 deletions components/eamxx/src/share/io/scream_io_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,46 +2,60 @@

#include "share/io/scream_scorpio_interface.hpp"
#include "share/util/scream_utils.hpp"
#include "share/scream_config.hpp"

#include <fstream>
#include <regex>

namespace scream {

std::string find_filename_in_rpointer (
const std::string& filename_prefix,
const bool model_restart,
const ekat::Comm& comm,
const util::TimeStamp& run_t0)
const util::TimeStamp& run_t0,
const OutputAvgType avg_type,
const IOControl& control)
{
std::string filename;
bool found = false;
std::string content;
std::string suffix = model_restart ? ".r." : ".rhist.";
std::string pattern_str = filename_prefix + suffix;

// The AD will pass a default constructed control, since it doesn't know the values
// of REST_N/REST_OPTION used in the previous run. Also, model restart is *always* INSTANT.
if (model_restart) {
EKAT_REQUIRE_MSG (avg_type==OutputAvgType::Instant,
"Error! Model restart output should have INSTANT avg type.\n"
" - input avg_type: " + e2str(avg_type) + "\n");
pattern_str += e2str(OutputAvgType::Instant) + R"(.n(step|sec|min|hour|day|month|year)s_x\d+)";
} else {
EKAT_REQUIRE_MSG (control.output_enabled(),
"Error! When restarting an output stream, we need a valid IOControl structure.\n"
" - filename prefix: " + filename_prefix + "\n");
pattern_str += e2str(avg_type) + "." + control.frequency_units + "_x" + std::to_string(control.frequency);
}
if (is_scream_standalone()) {
pattern_str += ".np" + std::to_string(comm.size());
}
pattern_str += "." + run_t0.to_string() + ".nc";
std::regex pattern (pattern_str);

if (comm.am_i_root()) {
std::ifstream rpointer_file;

std::string line;
rpointer_file.open("rpointer.atm");

// If the timestamp is in the filename, then the filename ends with "S.nc",
// with S being the string representation of the timestamp
auto ts_len = run_t0.to_string().size();
auto extract_ts = [&] (const std::string& line) -> util::TimeStamp {
auto min_size = ts_len+3;
if (line.size()>=min_size) {
auto ts_str = line.substr(line.size()-min_size,ts_len);
auto ts = util::str_to_time_stamp(ts_str);
return ts;
} else {
return util::TimeStamp();
}
};

while ((rpointer_file >> line) and not found) {
while (std::getline(rpointer_file,line)) {
content += line + "\n";

found = line.find(filename_prefix+suffix) != std::string::npos &&
extract_ts(line)==run_t0;
filename = line;
if (std::regex_search(line,pattern)) {
filename = line;
found = true;
break;
}
}
}

Expand All @@ -52,18 +66,29 @@ std::string find_filename_in_rpointer (
if (not found) {
broadcast_string(content,comm,comm.root_rank());

// If the history restart file is not found, it must be because the last
// model restart step coincided with a model output step, in which case
// a restart history file is not written.
// If that's the case, *disable* output restart, by setting
// 'Restart'->'Perform Restart' = false
// in the input parameter list
EKAT_ERROR_MSG (
"Error! Restart requested, but no restart file found in 'rpointer.atm'.\n"
" restart filename prefix: " + filename_prefix + "\n"
" restart file type: " + std::string(model_restart ? "model restart" : "history restart") + "\n"
" run t0 : " + run_t0.to_string() + "\n"
" rpointer content:\n" + content);
if (model_restart) {
EKAT_ERROR_MSG (
"Error! Restart requested, but no model restart file found in 'rpointer.atm'.\n"
" model restart filename prefix: " + filename_prefix + "\n"
" model restart filename pattern: " + pattern_str + "\n"
" run t0 : " + run_t0.to_string() + "\n"
" rpointer content:\n" + content + "\n\n");
} else {
EKAT_ERROR_MSG (
"Error! Restart requested, but no history restart file found in 'rpointer.atm'.\n"
" hist restart filename prefix: " + filename_prefix + "\n"
" hist restart filename pattern: " + pattern_str + "\n"
" run t0 : " + run_t0.to_string() + "\n"
" avg_type : " + e2str(avg_type) + "\n"
" output freq : " + std::to_string(control.frequency) + "\n"
" output freq units: " + control.frequency_units + "\n"
" rpointer content:\n" + content + "\n\n"
" Did you change output specs (avg type, freq, or freq units) across restart? If so, please, remember that it is not allowed.\n"
" It is also possible you are using a rhist file create before commit 6b7d441330d. That commit changed how rhist file names\n"
" are formed. In particular, we no longer use INSTANT.${REST_OPTION}_x${REST_N}, but we use the avg type, and freq/freq_option\n"
" of the output stream (to avoid name clashes if 2 streams only differ for one of those). If you want to use your rhist file,\n"
" please rename it, so that the avg-type, freq, and freq_option reflect those of the output stream.\n");
}
}

// Have the root rank communicate the nc filename
Expand Down
11 changes: 9 additions & 2 deletions components/eamxx/src/share/io/scream_io_utils.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef SCREAM_IO_UTILS_HPP
#define SCREAM_IO_UTILS_HPP

#include "scream_io_control.hpp"
#include "share/util/scream_time_stamp.hpp"

#include <ekat/util/ekat_string_utils.hpp>
Expand Down Expand Up @@ -59,11 +60,17 @@ inline OutputAvgType str2avg (const std::string& s) {
return OAT::Invalid;
}

// The AD will pass a default constructed control, since it doesn't know the values
// of REST_N/REST_OPTION used in the previous run
// Output streams MUST pass a valid control structure, cause we need to differentiate
// between, e.g., streams with same filename prefix, but different output freq specs
std::string find_filename_in_rpointer (
const std::string& casename,
const std::string& filename_prefix,
const bool model_restart,
const ekat::Comm& comm,
const util::TimeStamp& run_t0);
const util::TimeStamp& run_t0,
const OutputAvgType avg_type = OutputAvgType::Instant,
const IOControl& control = {});

struct LongNames {

Expand Down
41 changes: 12 additions & 29 deletions components/eamxx/src/share/io/scream_output_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ setup (const ekat::Comm& io_comm, const ekat::ParameterList& params,

if (perform_history_restart) {
using namespace scorpio;
auto rhist_file = find_filename_in_rpointer(hist_restart_filename_prefix,false,m_io_comm,m_run_t0);
IOFileSpecs hist_restart_specs;
hist_restart_specs.ftype = FileType::HistoryRestart;
auto rhist_file = find_filename_in_rpointer(hist_restart_filename_prefix,false,m_io_comm,m_run_t0,m_avg_type,m_output_control);

scorpio::register_file(rhist_file,scorpio::Read);
// From restart file, get the time of last write, as well as the current size of the avg sample
Expand All @@ -196,22 +198,8 @@ setup (const ekat::Comm& io_comm, const ekat::ParameterList& params,

// We do NOT allow changing output specs across restart. If you do want to change
// any of these, you MUST start a new output stream (e.g., setting 'Perform Restart: false')
auto old_freq = scorpio::get_attribute<int>(rhist_file,"GLOBAL","averaging_frequency");
EKAT_REQUIRE_MSG (old_freq == m_output_control.frequency,
"Error! Cannot change frequency when performing history restart.\n"
" - old freq: " << old_freq << "\n"
" - new freq: " << m_output_control.frequency << "\n");
auto old_freq_units = scorpio::get_attribute<std::string>(rhist_file,"GLOBAL","averaging_frequency_units");
EKAT_REQUIRE_MSG (old_freq_units == m_output_control.frequency_units,
"Error! Cannot change frequency units when performing history restart.\n"
" - old freq units: " << old_freq_units << "\n"
" - new freq units: " << m_output_control.frequency_units << "\n");
auto old_avg_type = scorpio::get_attribute<std::string>(rhist_file,"GLOBAL","averaging_type");
EKAT_REQUIRE_MSG (old_avg_type == e2str(m_avg_type),
"Error! Cannot change avg type when performing history restart.\n"
" - old avg type: " << old_avg_type + "\n"
" - new avg type: " << e2str(m_avg_type) << "\n");

// NOTE: we do not check that freq/freq_units/avg_type are not changed: since we used
// that info to find the correct rhist file, we already know that they match!
auto old_storage_type = scorpio::get_attribute<std::string>(rhist_file,"GLOBAL","file_max_storage_type");
EKAT_REQUIRE_MSG (old_storage_type == e2str(m_output_file_specs.storage.type),
"Error! Cannot change file storage type when performing history restart.\n"
Expand Down Expand Up @@ -400,7 +388,7 @@ void OutputManager::run(const util::TimeStamp& timestamp)

// Check if we need to open a new file
if (not filespecs.is_open) {
filespecs.filename = compute_filename (control,filespecs,timestamp);
filespecs.filename = compute_filename (filespecs,timestamp);
// Register all dims/vars, write geometry data (e.g. lat/lon/hyam/hybm)
setup_file(filespecs,control);
}
Expand Down Expand Up @@ -613,18 +601,20 @@ long long OutputManager::res_dep_memory_footprint () const {
}

std::string OutputManager::
compute_filename (const IOControl& control,
const IOFileSpecs& file_specs,
compute_filename (const IOFileSpecs& file_specs,
const util::TimeStamp& timestamp) const
{
auto filename = m_filename_prefix + file_specs.suffix();
const auto& control = m_output_control;

// Always add avg type and frequency info
filename += "." + e2str(m_avg_type);
filename += "." + control.frequency_units+ "_x" + std::to_string(control.frequency);

// Optionally, add number of mpi ranks (useful mostly in unit tests, to run multiple MPI configs in parallel)
if (m_params.get<bool>("MPI Ranks in Filename")) {
// For standalone EAMxx, we may have 2+ versions of the same test running with two
// different choices of ranks. To avoid name clashing for the output files,
// add the comm size to the output file name.
if (is_scream_standalone()) {
filename += ".np" + std::to_string(m_io_comm.size());
}

Expand Down Expand Up @@ -679,7 +669,6 @@ set_params (const ekat::ParameterList& params,
m_filename_prefix = m_params.get<std::string>("filename_prefix");

// Hard code some parameters in case we access them later
m_params.set("MPI Ranks in Filename",false);
m_params.set<std::string>("Floating Point Precision","real");
} else {
auto avg_type = m_params.get<std::string>("Averaging Type");
Expand Down Expand Up @@ -715,12 +704,6 @@ set_params (const ekat::ParameterList& params,
"Error! Invalid/unsupported value for 'Floating Point Precision'.\n"
" - input value: " + prec + "\n"
" - supported values: float, single, double, real\n");

// If not set, hard code to false for CIME cases, and true for standalone,
// since standalone may be running multiple versions of the same test at once
if (not m_params.isParameter("MPI Ranks in Filename")) {
m_params.set("MPI Ranks in Filename",is_scream_standalone());
}
}

// Output control
Expand Down
3 changes: 1 addition & 2 deletions components/eamxx/src/share/io/scream_output_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,7 @@ class OutputManager
long long res_dep_memory_footprint () const;
protected:

std::string compute_filename (const IOControl& control,
const IOFileSpecs& file_specs,
std::string compute_filename (const IOFileSpecs& file_specs,
const util::TimeStamp& timestamp) const;

void set_file_header(const IOFileSpecs& file_specs);
Expand Down
41 changes: 29 additions & 12 deletions components/eamxx/src/share/io/tests/io_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
TEST_CASE ("find_filename_in_rpointer") {
using namespace scream;

constexpr auto AVG = OutputAvgType::Average;
constexpr auto INST = OutputAvgType::Instant;

ekat::Comm comm(MPI_COMM_WORLD);

util::TimeStamp t0({2023,9,7},{12,0,0});
Expand All @@ -17,21 +20,35 @@ TEST_CASE ("find_filename_in_rpointer") {
// Create a dummy rpointer
std::ofstream rpointer ("rpointer.atm");

rpointer << "foo.r." + t0.to_string() + ".nc\n";
rpointer << "bar2.rhist." + t0.to_string() + ".nc\n";
rpointer << "bar.rhist." + t0.to_string() + ".nc\n";
rpointer.close();
IOControl foo_c, bar_c, bar2_c;
foo_c.frequency = 3; foo_c.frequency_units = "nsteps";
bar_c.frequency = 1; bar_c.frequency_units = "ndays";
bar2_c.frequency = 6; bar2_c.frequency_units = "nhours";

// Now test find_filename_in_rpointer with different inputs
std::string suffix = ".np" + std::to_string(comm.size()) + "." + t0.to_string() + ".nc";
std::string foo_fname = "foo.r.INSTANT.nsteps_x3" + suffix;
std::string bar_fname = "bar.rhist.AVERAGE.ndays_x1" + suffix;
std::string bar2_fname = "bar.rhist.AVERAGE.nhours_x6" + suffix;

REQUIRE_THROWS (find_filename_in_rpointer("baz",false,comm,t0)); // wrong prefix
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t1)); // wrong timestamp
REQUIRE_THROWS (find_filename_in_rpointer("bar",true, comm,t0)); // bar is not model restart
REQUIRE_THROWS (find_filename_in_rpointer("foo",false,comm,t0)); // foo is model restart
rpointer << foo_fname<< "\n";
rpointer << bar_fname<< "\n";
rpointer << bar2_fname << "\n";
rpointer.close();

REQUIRE (find_filename_in_rpointer("bar", false,comm,t0)==("bar.rhist."+t0.to_string()+".nc"));
REQUIRE (find_filename_in_rpointer("bar2",false,comm,t0)==("bar2.rhist."+t0.to_string()+".nc"));
REQUIRE (find_filename_in_rpointer("foo", true, comm,t0)==("foo.r."+t0.to_string()+".nc"));
// Now test find_filename_in_rpointer with different inputs
REQUIRE_THROWS (find_filename_in_rpointer("baz",false,comm,t0,AVG)); // missing control (needed for rhist files)
REQUIRE_THROWS (find_filename_in_rpointer("baz",false,comm,t0,AVG,bar_c)); // wrong prefix
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t1,AVG,bar_c)); // wrong timestamp
REQUIRE_THROWS (find_filename_in_rpointer("bar",true, comm,t0,AVG,bar_c)); // bar is not model restart
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t0,INST,bar_c)); // wrong avg type
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t0,INST,bar2_c)); // wrong freq specs
REQUIRE_THROWS (find_filename_in_rpointer("foo",false,comm,t0,INST,foo_c)); // foo is model restart
REQUIRE_THROWS (find_filename_in_rpointer("foo",true,comm,t0,AVG)); // model restart MUST be INSTANT

auto test = find_filename_in_rpointer("bar",false,comm,t0,AVG,bar_c);
REQUIRE (find_filename_in_rpointer("bar",false,comm,t0,AVG,bar_c)==bar_fname);
REQUIRE (find_filename_in_rpointer("bar",false,comm,t0,AVG,bar2_c)==bar2_fname);
REQUIRE (find_filename_in_rpointer("foo",true, comm,t0)==foo_fname);
}

TEST_CASE ("io_control") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,5 +120,4 @@ Fields:
output_control:
Frequency: ${NUM_STEPS}
frequency_units: nsteps
MPI Ranks in Filename: true
...
Original file line number Diff line number Diff line change
Expand Up @@ -117,5 +117,4 @@ Fields:
output_control:
Frequency: ${NUM_STEPS}
frequency_units: nsteps
MPI Ranks in Filename: true
...
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,4 @@ Field Names:
output_control:
Frequency: ${NUM_STEPS}
frequency_units: nsteps
MPI Ranks in Filename: true
...
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ CreateUnitTest(create_vert_remap_and_weights "create_vert_remap_and_weights.cpp"
# Run a test to setup nudging source data:
set (NUM_STEPS 5)
set (POSTFIX source_data)
set (ADD_RANKS false)
set (ATM_TIME_STEP 300)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/input_source_data.yaml
${CMAKE_CURRENT_BINARY_DIR}/input_source_data.yaml)
Expand All @@ -36,7 +35,6 @@ CreateUnitTestFromExec (shoc_p3_source shoc_p3_nudging
set (NUM_STEPS 5)
set (ATM_TIME_STEP 300)
set (POSTFIX nudged)
set (ADD_RANKS true)
set (VERT_TYPE TIME_DEPENDENT_3D_PROFILE)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/input_nudging.yaml
${CMAKE_CURRENT_BINARY_DIR}/input_nudging.yaml)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ atmosphere_processes:
p3:
max_total_ni: 720.0e3
nudging:
nudging_filenames_patterns: [shoc_p3_source_data_${POSTFIX}.INSTANT.nsteps_x${NUM_STEPS}.${RUN_T0}.nc]
nudging_filenames_patterns: [shoc_p3_source_data_${POSTFIX}.INSTANT.nsteps_x${NUM_STEPS}.np1.${RUN_T0}.nc]
nudging_fields: ["T_mid", "qv"]
nudging_timescale: 1000
use_nudging_weights: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
filename_prefix: shoc_p3_${POSTFIX}_nudged
Averaging Type: Instant
Max Snapshots Per File: 2
MPI Ranks in Filename: ${ADD_RANKS}
Field Names:
- p_mid
- T_mid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ filename_prefix: shoc_p3_${POSTFIX}_nudged_remapped
Averaging Type: Instant
Max Snapshots Per File: 2
vertical_remap_file: vertical_remap.nc
MPI Ranks in Filename: ${ADD_RANKS}
Field Names:
- T_mid
- qv
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,4 @@ Fields:
output_control:
Frequency: 2
frequency_units: nsteps
MPI Ranks in Filename: true
...

0 comments on commit d981484

Please sign in to comment.