Skip to content

Commit

Permalink
Remove dependence on timing for checkpoint-restart
Browse files Browse the repository at this point in the history
Not an issue for real runs, but could cause test failures when the
code to restore from a checkpoint took longer to run than the
executable that generated the checkpoint.
  • Loading branch information
wthrowe committed Feb 6, 2025
1 parent 9af80d6 commit 81bf640
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 74 deletions.
9 changes: 0 additions & 9 deletions src/Parallel/PhaseControl/CheckpointAndExitAfterWallclock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,6 @@ std::optional<Parallel::Phase> RestartPhase::combine_method::operator()(
"arbitration in the Main chare, so no reduction data should be "
"provided.");
}

std::optional<double> WallclockHoursAtCheckpoint::combine_method::operator()(
const std::optional<double> /*first_time*/,
const std::optional<double>& /*second_time*/) {
ERROR(
"The wallclock time at which a checkpoint was requested should "
"only be altered by the phase change arbitration in the Main "
"chare, so no reduction data should be provided.");
}
} // namespace Tags

CheckpointAndExitAfterWallclock::CheckpointAndExitAfterWallclock(
Expand Down
42 changes: 9 additions & 33 deletions src/Parallel/PhaseControl/CheckpointAndExitAfterWallclock.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,6 @@ struct RestartPhase {
using main_combine_method = combine_method;
};

/// Storage in the phase change decision tuple so that the Main chare can record
/// the elapsed wallclock time since the start of the run.
///
/// \note This tag is not intended to participate in any of the reduction
/// procedures, so will error if the combine method is called.
struct WallclockHoursAtCheckpoint {
using type = std::optional<double>;

struct combine_method {
[[noreturn]] std::optional<double> operator()(
const std::optional<double> /*first_time*/,
const std::optional<double>& /*second_time*/);
};
using main_combine_method = combine_method;
};

/// Stores whether the checkpoint and exit has been requested.
///
/// Combinations are performed via `funcl::Or`, as the phase in question should
Expand Down Expand Up @@ -146,8 +130,7 @@ struct CheckpointAndExitAfterWallclock : public PhaseChange {
using return_tags = tmpl::list<>;

using phase_change_tags_and_combines =
tmpl::list<Tags::RestartPhase, Tags::WallclockHoursAtCheckpoint,
Tags::CheckpointAndExitRequested>;
tmpl::list<Tags::RestartPhase, Tags::CheckpointAndExitRequested>;

template <typename Metavariables>
using participating_components = typename Metavariables::component_list;
Expand All @@ -174,15 +157,17 @@ struct CheckpointAndExitAfterWallclock : public PhaseChange {

private:
std::optional<double> wallclock_hours_for_checkpoint_and_exit_ = std::nullopt;
// Phase arbitration is only run from Main, so there are no
// threading issues here.
// NOLINTNEXTLINE(spectre-mutable)
mutable bool halting_ = false;
};

template <typename... DecisionTags>
void CheckpointAndExitAfterWallclock::initialize_phase_data_impl(
const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
phase_change_decision_data) const {
tuples::get<Tags::RestartPhase>(*phase_change_decision_data) = std::nullopt;
tuples::get<Tags::WallclockHoursAtCheckpoint>(*phase_change_decision_data) =
std::nullopt;
tuples::get<Tags::CheckpointAndExitRequested>(*phase_change_decision_data) =
false;
}
Expand Down Expand Up @@ -214,21 +199,12 @@ CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(

auto& restart_phase =
tuples::get<Tags::RestartPhase>(*phase_change_decision_data);
auto& wallclock_hours_at_checkpoint =
tuples::get<Tags::WallclockHoursAtCheckpoint>(
*phase_change_decision_data);
auto& exit_code =
tuples::get<Parallel::Tags::ExitCode>(*phase_change_decision_data);
if (restart_phase.has_value()) {
ASSERT(wallclock_hours_at_checkpoint.has_value(),
"Consistency error: Should have recorded the Wallclock time "
"while recording a phase to restart from.");
// This `if` branch, where restart_phase has a value, is the
// post-checkpoint call to arbitrate_phase_change. Depending on the time
// elapsed so far in this run, next phase is...
// - Exit, if the time is large
// - restart_phase, if the time is small
if (elapsed_hours >= wallclock_hours_at_checkpoint.value()) {
// post-checkpoint call to arbitrate_phase_change.
if (halting_) {
// Preserve restart_phase for use after restarting from the checkpoint
exit_code = Parallel::ExitCode::ContinueFromCheckpoint;
return std::make_pair(Parallel::Phase::Exit,
Expand All @@ -245,7 +221,6 @@ CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(
// Reset restart_phase until it is needed for the next checkpoint
const auto result = restart_phase;
restart_phase.reset();
wallclock_hours_at_checkpoint.reset();
return std::make_pair(result.value(),
ArbitrationStrategy::PermitAdditionalJumps);
}
Expand All @@ -260,7 +235,8 @@ CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(
std::numeric_limits<double>::infinity())) {
// Record phase and actual elapsed time for determining following phase
restart_phase = current_phase;
wallclock_hours_at_checkpoint = elapsed_hours;
ASSERT(not halting_, "Halting for checkpoint recursively");
halting_ = true;
return std::make_pair(Parallel::Phase::WriteCheckpoint,
ArbitrationStrategy::RunPhaseImmediately);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,11 @@ SPECTRE_TEST_CASE("Unit.Parallel.PhaseControl.CheckpointAndExitAfterWallclock",
{
INFO("Test initialize phase change decision data");
PhaseChangeDecisionData phase_change_decision_data{
Parallel::Phase::Execute, true, 1.0, true,
Parallel::ExitCode::Complete};
Parallel::Phase::Execute, true, true, Parallel::ExitCode::Complete};
phase_change0.initialize_phase_data<Metavariables>(
make_not_null(&phase_change_decision_data));
CHECK(phase_change_decision_data ==
PhaseChangeDecisionData{std::nullopt, std::nullopt, false, true,
PhaseChangeDecisionData{std::nullopt, false, true,
Parallel::ExitCode::Complete});
}
{
Expand All @@ -71,13 +70,13 @@ SPECTRE_TEST_CASE("Unit.Parallel.PhaseControl.CheckpointAndExitAfterWallclock",
// the PhaseChange with a big trigger time.
// (this assumes the test doesn't take 1h to get here)
PhaseChangeDecisionData phase_change_decision_data{
std::nullopt, std::nullopt, true, true, Parallel::ExitCode::Complete};
std::nullopt, true, true, Parallel::ExitCode::Complete};
const auto decision_result = phase_change1.arbitrate_phase_change(
make_not_null(&phase_change_decision_data), Parallel::Phase::Execute,
cache);
CHECK(decision_result == std::nullopt);
CHECK(phase_change_decision_data ==
PhaseChangeDecisionData{std::nullopt, std::nullopt, false, true,
PhaseChangeDecisionData{std::nullopt, false, true,
Parallel::ExitCode::Complete});
}
{
Expand All @@ -86,67 +85,52 @@ SPECTRE_TEST_CASE("Unit.Parallel.PhaseControl.CheckpointAndExitAfterWallclock",
// the PhaseChange with a tiny trigger time.
// (this assumes the test takes at least a few cycles to get here)
PhaseChangeDecisionData phase_change_decision_data{
std::nullopt, std::nullopt, true, true, Parallel::ExitCode::Complete};
std::nullopt, true, true, Parallel::ExitCode::Complete};
const auto decision_result = phase_change0.arbitrate_phase_change(
make_not_null(&phase_change_decision_data), Parallel::Phase::Execute,
cache);
CHECK(
decision_result ==
std::make_pair(Parallel::Phase::WriteCheckpoint,
PhaseControl::ArbitrationStrategy::RunPhaseImmediately));
// Check recorded time in range: 0 second < time < 1 second
// (this assumes test run duration falls in this time window)
const double recorded_time =
tuples::get<PhaseControl::Tags::WallclockHoursAtCheckpoint>(
phase_change_decision_data)
.value();
CHECK(recorded_time > 0.0);
const double one_second = 1.0 / 3600.0;
CHECK(recorded_time < one_second);
CHECK(phase_change_decision_data ==
PhaseChangeDecisionData{Parallel::Phase::Execute, recorded_time,
false, true, Parallel::ExitCode::Complete});
PhaseChangeDecisionData{Parallel::Phase::Execute, false, true,
Parallel::ExitCode::Complete});
}
{
INFO("Restarting from checkpoint");
// Check behavior following the checkpoint phase
// First check case where wallclock time < recorded time, which corresponds
// to restarting from a checkpoint. Should update options next.
// (this assumes the test doesn't take 1h to get here)
const PhaseControl::CheckpointAndExitAfterWallclock phase_change_restart(
0.0);
PhaseChangeDecisionData phase_change_decision_data{
Parallel::Phase::Execute, 1.0, false, true,
Parallel::ExitCode::Complete};
auto decision_result = phase_change0.arbitrate_phase_change(
Parallel::Phase::Execute, false, true, Parallel::ExitCode::Complete};
auto decision_result = phase_change_restart.arbitrate_phase_change(
make_not_null(&phase_change_decision_data),
Parallel::Phase::WriteCheckpoint, cache);
CHECK(decision_result ==
std::make_pair(
Parallel::Phase::UpdateOptionsAtRestartFromCheckpoint,
PhaseControl::ArbitrationStrategy::PermitAdditionalJumps));
CHECK(phase_change_decision_data ==
PhaseChangeDecisionData{Parallel::Phase::Execute, 1.0, false, true,
PhaseChangeDecisionData{Parallel::Phase::Execute, false, true,
Parallel::ExitCode::Complete});

// Now, from update phase, go back to Execute
decision_result = phase_change0.arbitrate_phase_change(
decision_result = phase_change_restart.arbitrate_phase_change(
make_not_null(&phase_change_decision_data),
Parallel::Phase::UpdateOptionsAtRestartFromCheckpoint, cache);
CHECK(decision_result ==
std::make_pair(
Parallel::Phase::Execute,
PhaseControl::ArbitrationStrategy::PermitAdditionalJumps));
CHECK(phase_change_decision_data ==
PhaseChangeDecisionData{std::nullopt, std::nullopt, false, true,
PhaseChangeDecisionData{std::nullopt, false, true,
Parallel::ExitCode::Complete});
}
{
INFO("Exiting after checkpoint");
// Now check case where wallclock time > recorded time, which corresponds to
// having just written a checkpoint. We want to exit with exit code 2 now.
// (this assumes the test takes at least a few cycles to get here)
PhaseChangeDecisionData phase_change_decision_data{
Parallel::Phase::Execute, 1e-15, false, true,
Parallel::ExitCode::Complete};
Parallel::Phase::Execute, false, true, Parallel::ExitCode::Complete};
const auto decision_result = phase_change0.arbitrate_phase_change(
make_not_null(&phase_change_decision_data),
Parallel::Phase::WriteCheckpoint, cache);
Expand All @@ -155,7 +139,7 @@ SPECTRE_TEST_CASE("Unit.Parallel.PhaseControl.CheckpointAndExitAfterWallclock",
std::make_pair(Parallel::Phase::Exit,
PhaseControl::ArbitrationStrategy::RunPhaseImmediately));
CHECK(phase_change_decision_data ==
PhaseChangeDecisionData{Parallel::Phase::Execute, 1e-15, false, true,
PhaseChangeDecisionData{Parallel::Phase::Execute, false, true,
Parallel::ExitCode::ContinueFromCheckpoint});
}
}

0 comments on commit 81bf640

Please sign in to comment.