Kill tasks during job prep #6535

Open · wants to merge 8 commits into base: 8.4.x
Changes from 1 commit
32 changes: 25 additions & 7 deletions cylc/flow/scheduler.py
@@ -1537,13 +1537,7 @@ def start_job_submission(self, itasks: 'Iterable[TaskProxy]') -> bool:
         self.task_job_mgr.task_remote_mgr.rsync_includes = (
             self.config.get_validated_rsync_includes())

-        submitted = self.task_job_mgr.submit_task_jobs(
-            self.workflow,
-            itasks,
-            self.server.curve_auth,
-            self.server.client_pub_key_dir,
-            run_mode=self.get_run_mode()
-        )
+        submitted = self.submit_task_jobs(itasks)
         if not submitted:
             return False

@@ -1562,6 +1556,30 @@ def start_job_submission(self, itasks: 'Iterable[TaskProxy]') -> bool:
         # one or more tasks were passed through the submission pipeline
         return True

+    def submit_task_jobs(
+        self, itasks: 'Iterable[TaskProxy]'
+    ) -> 'List[TaskProxy]':
+        """Prepare for job submission and submit task jobs.
+
+        Return: tasks that attempted submission.
+        """
+        # submit "simulation/skip" mode tasks, modify "dummy" task configs:
+        itasks, submitted_nonlive_tasks = (
+            self.task_job_mgr.submit_nonlive_task_jobs(
+                self.workflow, itasks, self.get_run_mode()
+            )
+        )
+
+        # submit "live" mode tasks (and "dummy" mode tasks)
+        submitted_live_tasks = self.task_job_mgr.submit_livelike_task_jobs(
+            self.workflow,
+            itasks,
+            self.server.curve_auth,
+            self.server.client_pub_key_dir,
+        )
Comment on lines +1567 to +1579 (Member):
I'm not a big fan of moving this task submission logic into the scheduler.

I'm guessing you did this to avoid having to populate the extra args in integration tests, fair enough.

Maybe just leave a shim in scheduler.py to do this; I'd rather keep submission logic in one place, as Cylc logic has a habit of sprawling.
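
A minimal sketch of the suggested shim (illustrative only, not part of this diff; it assumes `TaskJobManager.submit_task_jobs` keeps the signature this PR removes from task_job_mgr.py):

```python
# Hypothetical shim on the Scheduler class: the submission pipeline stays
# in TaskJobManager; the scheduler only forwards the context it owns.
def submit_task_jobs(
    self, itasks: 'Iterable[TaskProxy]'
) -> 'List[TaskProxy]':
    """Single patch point for tests; real logic lives in task_job_mgr."""
    return self.task_job_mgr.submit_task_jobs(
        self.workflow,
        itasks,
        self.server.curve_auth,
        self.server.client_pub_key_dir,
        run_mode=self.get_run_mode(),
    )
```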


+        return submitted_nonlive_tasks + submitted_live_tasks
+
     def process_workflow_db_queue(self):
         """Update workflow DB."""
         self.workflow_db_mgr.process_queued_ops()
28 changes: 4 additions & 24 deletions cylc/flow/task_job_mgr.py
@@ -265,15 +265,10 @@ def prep_submit_task_jobs(
             bad_tasks.append(itask)
         return (prepared_tasks, bad_tasks)

-    def submit_task_jobs(
-        self,
-        workflow,
-        itasks: 'Iterable[TaskProxy]',
-        curve_auth,
-        client_pub_key_dir,
-        run_mode: RunMode = RunMode.LIVE,
+    def submit_livelike_task_jobs(
+        self, workflow, itasks, curve_auth, client_pub_key_dir
     ) -> 'List[TaskProxy]':
-        """Prepare for job submission and submit task jobs.
+        """Submission for live tasks and dummy tasks.

         Preparation (host selection, remote host init, and remote install)
         is done asynchronously. Newly released tasks may be sent here several
@@ -285,22 +280,7 @@ def submit_task_jobs(

         This method uses prep_submit_task_jobs() as helper.

-        Return (list): list of tasks that attempted submission.
-        """
-        # submit "simulation/skip" mode tasks, modify "dummy" task configs:
-        itasks, submitted_nonlive_tasks = self.submit_nonlive_task_jobs(
-            workflow, itasks, run_mode)
-
-        # submit "live" mode tasks (and "dummy" mode tasks)
-        submitted_live_tasks = self.submit_livelike_task_jobs(
-            workflow, itasks, curve_auth, client_pub_key_dir)
-
-        return submitted_nonlive_tasks + submitted_live_tasks
-
-    def submit_livelike_task_jobs(
-        self, workflow, itasks, curve_auth, client_pub_key_dir
-    ) -> 'List[TaskProxy]':
-        """Submission for live tasks and dummy tasks.
+        Return: tasks that attempted submission.
         """
         done_tasks: 'List[TaskProxy]' = []
         # Mapping of platforms to task proxies:
8 changes: 4 additions & 4 deletions tests/integration/conftest.py
@@ -424,14 +424,14 @@ def capture_submission():
     def _disable_submission(schd: 'Scheduler') -> 'Set[TaskProxy]':
         submitted_tasks: 'Set[TaskProxy]' = set()

-        def _submit_task_jobs(_, itasks, *args, **kwargs):
+        def _submit_task_jobs(itasks):
             nonlocal submitted_tasks
             for itask in itasks:
                 itask.state_reset(TASK_STATUS_SUBMITTED)
             submitted_tasks.update(itasks)
             return itasks

-        schd.task_job_mgr.submit_task_jobs = _submit_task_jobs  # type: ignore
+        schd.submit_task_jobs = _submit_task_jobs  # type: ignore[method-assign]
         return submitted_tasks

     return _disable_submission
@@ -564,7 +564,7 @@ def reflog():
     """

     def _reflog(schd: 'Scheduler', flow_nums: bool = False) -> Set[tuple]:
-        submit_task_jobs = schd.task_job_mgr.submit_task_jobs
+        submit_task_jobs = schd.submit_task_jobs
         triggers = set()

         def _submit_task_jobs(*args, **kwargs):
@@ -580,7 +580,7 @@ def _submit_task_jobs(*args, **kwargs):
                 triggers.add((itask.identity, deps or None))
             return itasks

-        schd.task_job_mgr.submit_task_jobs = _submit_task_jobs
+        schd.submit_task_jobs = _submit_task_jobs

         return triggers
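
For orientation, a hypothetical test (not from this diff) using the updated `capture_submission` fixture; it assumes the usual integration-test fixtures (`flow`, `scheduler`, `start`) shown elsewhere on this page:

```python
from cylc.flow.task_state import TASK_STATUS_SUBMITTED


async def test_no_real_submission(flow, scheduler, start, capture_submission):
    """Sketch: drive the scheduler without submitting real jobs."""
    schd = scheduler(flow('one'))
    async with start(schd):
        # Patches Scheduler.submit_task_jobs and returns the recording set:
        submitted = capture_submission(schd)
        schd.submit_task_jobs(schd.pool.get_tasks())
        # The stub marks tasks as submitted instead of running jobs:
        assert all(itask.state(TASK_STATUS_SUBMITTED) for itask in submitted)
```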

12 changes: 2 additions & 10 deletions tests/integration/run_modes/test_mode_overrides.py
@@ -95,11 +95,7 @@ async def test_force_trigger_does_not_override_run_mode(
     schd.pool.force_trigger_tasks('1/foo', [1])

     # ... but job submission will always change this to the correct mode:
-    schd.task_job_mgr.submit_task_jobs(
-        schd.workflow,
-        [foo],
-        schd.server.curve_auth,
-        schd.server.client_pub_key_dir)
+    schd.submit_task_jobs([foo])

     assert foo.run_mode.value == 'skip'

@@ -157,10 +153,6 @@ async def test_run_mode_override_from_broadcast(
     foo_1000 = schd.pool.get_task(ISO8601Point('1000'), 'foo')
     foo_1001 = schd.pool.get_task(ISO8601Point('1001'), 'foo')

-    schd.task_job_mgr.submit_task_jobs(
-        schd.workflow,
-        [foo_1000, foo_1001],
-        schd.server.curve_auth,
-        schd.server.client_pub_key_dir)
+    schd.submit_task_jobs([foo_1000, foo_1001])
     assert foo_1000.run_mode.value == 'skip'
     assert capture_live_submissions() == {'1001/foo'}
52 changes: 24 additions & 28 deletions tests/integration/run_modes/test_nonlive.py
@@ -19,6 +19,7 @@

 from cylc.flow.cycling.integer import IntegerPoint
 from cylc.flow.cycling.iso8601 import ISO8601Point
+from cylc.flow.scheduler import Scheduler


 # Define here to ensure test doesn't just mirror code:
@@ -30,9 +31,11 @@
         'submit_status': 0,
         'run_signal': None,
         'run_status': 0,
+        # capture_live_submissions fixture submits jobs in sim mode
         'platform_name': 'simulation',
         'job_runner_name': 'simulation',
-        'job_id': None},
+        'job_id': None,
+    },
     'skip': {
         'flow_nums': '[1]',
         'is_manual_submit': 0,
@@ -42,7 +45,8 @@
         'run_status': 0,
         'platform_name': 'skip',
         'job_runner_name': 'skip',
-        'job_id': None},
+        'job_id': None,
+    },
 }


@@ -59,12 +63,7 @@ def submit_and_check_db():
     """
     def _inner(schd):
         # Submit task jobs:
-        schd.task_job_mgr.submit_task_jobs(
-            schd.workflow,
-            schd.pool.get_tasks(),
-            schd.server.curve_auth,
-            schd.server.client_pub_key_dir
-        )
+        schd.submit_task_jobs(schd.pool.get_tasks())
         # Make sure that db changes are enacted:
         schd.workflow_db_mgr.process_queued_ops()

@@ -77,7 +76,7 @@ def _inner(schd):

         # Check that timestamps have been created:
         for timestamp in [
-            'time_submit', 'time_submit_exit', 'time_run', 'time_run_exit'
+                'time_submit', 'time_submit_exit', 'time_run', 'time_run_exit'
         ]:
             assert task_jobs[timestamp] is not None
     return _inner
@@ -90,26 +89,33 @@ async def test_db_task_jobs(
    """Ensure that task job data is added to the database correctly
    for each run mode.
    """
-    schd = scheduler(flow({
-        'scheduling': {'graph': {
-            'R1': '&'.join(KGO)}},
-        'runtime': {
-            mode: {'run mode': mode} for mode in KGO}
-    }))
+    schd: Scheduler = scheduler(
+        flow({
+            'scheduling': {
+                'graph': {
+                    'R1': ' & '.join(KGO)
+                }
+            },
+            'runtime': {
+                mode: {'run mode': mode} for mode in KGO
+            },
+        }),
+        run_mode='live'
+    )
     async with start(schd):
         # Reference all task proxies so we can examine them
         # at the end of the test:
         itask_skip = schd.pool.get_task(IntegerPoint('1'), 'skip')
         itask_live = schd.pool.get_task(IntegerPoint('1'), 'live')

         submit_and_check_db(schd)

         # Set outputs to failed:
         schd.pool.set_prereqs_and_outputs('*', ['failed'], [], [])

         submit_and_check_db(schd)

+        # capture_live_submissions fixture submits jobs in sim mode
         assert itask_live.run_mode.value == 'simulation'
         assert itask_skip.run_mode.value == 'skip'

@@ -124,12 +130,7 @@ async def test_db_task_states(
    conf['runtime'] = {'one': {'run mode': 'skip'}}
    schd = scheduler(flow(conf))
    async with start(schd):
-        schd.task_job_mgr.submit_task_jobs(
-            schd.workflow,
-            schd.pool.get_tasks(),
-            schd.server.curve_auth,
-            schd.server.client_pub_key_dir
-        )
+        schd.submit_task_jobs(schd.pool.get_tasks())
         schd.workflow_db_mgr.process_queued_ops()
         result = schd.workflow_db_mgr.pri_dao.connect().execute(
             'SELECT * FROM task_states').fetchone()
@@ -165,12 +166,7 @@ async def test_mean_task_time(
         itask.tdef.elapsed_times.extend([133.0, 132.4])

         # Submit two tasks:
-        schd.task_job_mgr.submit_task_jobs(
-            schd.workflow,
-            [itask],
-            schd.server.curve_auth,
-            schd.server.client_pub_key_dir
-        )
+        schd.submit_task_jobs([itask])

         # Ensure that the skipped task has succeeded, and that the
         # number of items in the elapsed_times has not changed.
29 changes: 4 additions & 25 deletions tests/integration/run_modes/test_skip.py
@@ -51,12 +51,7 @@ async def test_settings_override_from_broadcast(

     foo, = schd.pool.get_tasks()

-    schd.task_job_mgr.submit_task_jobs(
-        schd.workflow,
-        schd.pool.get_tasks(),
-        schd.server.curve_auth,
-        schd.server.client_pub_key_dir
-    )
+    schd.submit_task_jobs(schd.pool.get_tasks())
     # Run mode has changed:
     assert foo.platform['name'] == 'skip'
     # Output failed emitted:
@@ -215,13 +210,7 @@ async def test_prereqs_marked_satisfied_by_skip_mode(

     async with start(schd):
         foo = schd.pool.get_task(IntegerPoint(1), 'foo')
-        schd.task_job_mgr.submit_task_jobs(
-            schd.workflow,
-            [foo],
-            schd.server.curve_auth,
-            schd.server.client_pub_key_dir,
-            run_mode=schd.get_run_mode()
-        )
+        schd.submit_task_jobs([foo])
         bar = schd.pool.get_task(IntegerPoint(1), 'bar')
         satisfied_message, = bar.state.prerequisites[0]._satisfied.values()
         assert satisfied_message == 'satisfied by skip mode'
@@ -240,20 +229,10 @@ async def test_outputs_can_be_changed(one_conf, flow, start, scheduler, validate
             {"skip": {"outputs": "failed"}},
         ],
     )
-    schd.task_job_mgr.submit_task_jobs(
-        schd.workflow,
-        schd.pool.get_tasks(),
-        None,
-        None
-    )
+    schd.submit_task_jobs(schd.pool.get_tasks())

     # Broadcast the task into skip mode, output succeeded and submit it:
     schd.broadcast_mgr.put_broadcast(
         ['1'], ['one'], [{'skip': {'outputs': 'succeeded'}}]
     )
-    schd.task_job_mgr.submit_task_jobs(
-        schd.workflow,
-        schd.pool.get_tasks(),
-        None,
-        None
-    )
+    schd.submit_task_jobs(schd.pool.get_tasks())
24 changes: 14 additions & 10 deletions tests/integration/test_kill.py
@@ -105,7 +105,7 @@ async def test_kill_preparing(


 async def test_kill_preparing_pipeline(
-    flow, scheduler, run, monkeypatch: pytest.MonkeyPatch
+    flow, scheduler, start, monkeypatch: pytest.MonkeyPatch
 ):
     """Test killing a preparing task through various stages of the preparing
     pipeline that involve submitting subprocesses and waiting for them to
@@ -120,7 +120,7 @@ async def test_kill_preparing_pipeline(
     schd: Scheduler = scheduler(
         flow('one'), run_mode='live', paused_start=False
     )
-    async with run(schd):
+    async with start(schd):
         remote_mgr = schd.task_job_mgr.task_remote_mgr
         mock_eval_platform = Mock(return_value=None)
         monkeypatch.setattr(remote_mgr, 'eval_platform', mock_eval_platform)
@@ -131,22 +131,25 @@ async def test_kill_preparing_pipeline(
         itask = schd.pool.get_tasks()[0]

         # Platform eval:
-        await task_state(itask, TASK_STATUS_PREPARING)
+        schd.submit_task_jobs([itask])
+        assert itask.state(TASK_STATUS_PREPARING)
+        assert schd.release_tasks_to_run() is False
         await run_cmd(kill_tasks(schd, [itask.identity]))
-        await task_state(itask, TASK_STATUS_SUBMIT_FAILED)
+        assert itask.state(TASK_STATUS_SUBMIT_FAILED)
         assert schd.release_tasks_to_run() is False
         # Set to finished:
         mock_eval_platform.return_value = LOCALHOST
         # Should not submit after finish because it was killed:
         assert schd.release_tasks_to_run() is False

         # Remote init:
         patch_remote_init(schd, REMOTE_INIT_IN_PROGRESS)
         schd.pool._force_trigger(itask)
-        await task_state(itask, TASK_STATUS_PREPARING)
+        schd.submit_task_jobs([itask])
+        assert itask.state(TASK_STATUS_PREPARING)
         assert schd.release_tasks_to_run() is False
         await run_cmd(kill_tasks(schd, [itask.identity]))
-        await task_state(itask, TASK_STATUS_SUBMIT_FAILED)
+        assert itask.state(TASK_STATUS_SUBMIT_FAILED)
         assert schd.release_tasks_to_run() is False
         # Set to finished:
         patch_remote_init(schd, REMOTE_INIT_DONE)
         # Should not submit after finish because it was killed:
@@ -155,11 +158,12 @@ async def test_kill_preparing_pipeline(

         # Remote file install:
         patch_remote_init(schd, REMOTE_FILE_INSTALL_IN_PROGRESS)
         schd.pool._force_trigger(itask)
-        await task_state(itask, TASK_STATUS_PREPARING)
+        schd.submit_task_jobs([itask])
+        assert itask.state(TASK_STATUS_PREPARING)
         assert schd.release_tasks_to_run() is False
         await run_cmd(kill_tasks(schd, [itask.identity]))
-        await task_state(itask, TASK_STATUS_SUBMIT_FAILED)
+        assert itask.state(TASK_STATUS_SUBMIT_FAILED)
         assert schd.release_tasks_to_run() is False
         # Set to finished:
         patch_remote_init(schd, REMOTE_FILE_INSTALL_DONE)
         # Should not submit after finish because it was killed: