
Commit 1605833

Use a long-running MPI job to test job cancellation without the need for special timing conditions. (#5665)
Signed-off-by: Eddy Mwiti <eddmwiti@amazon.com>
1 parent d7c44f5 commit 1605833

File tree: 2 files changed (+22, -10 lines)


tests/integration-tests/tests/schedulers/test_slurm.py

Lines changed: 21 additions & 9 deletions
@@ -1539,19 +1539,31 @@ def _wait_compute_cloudinit_done(remote_command_executor, compute_node):
     assert_that(compute_cloudinit_status_output).contains("status: done")
 
 
-@retry(wait_fixed=seconds(10), stop_max_attempt_number=4)
-def _check_mpi_process(remote_command_executor, slurm_commands, num_nodes, after_completion):
-    """Submit script and check for MPI processes."""
-    # Clean up old datafiles
-    remote_command_executor.run_remote_command("rm -f /shared/check_proc.out")
-    result = slurm_commands.submit_command("ps aux | grep IMB | grep MPI >> /shared/check_proc.out", nodes=num_nodes)
+def _assert_mpi_process_completion(
+    remote_command_executor, slurm_commands, num_nodes, after_completion, check_proc_file
+):
+    result = slurm_commands.submit_command(
+        f'ps aux | grep "mpiexec.hydra.*sleep" | grep -v "grep" >> {check_proc_file}', nodes=num_nodes
+    )
     job_id = slurm_commands.assert_job_submitted(result.stdout)
     slurm_commands.wait_job_completed(job_id)
-    proc_track_result = remote_command_executor.run_remote_command("cat /shared/check_proc.out")
+    proc_track_result = remote_command_executor.run_remote_command(f"cat {check_proc_file}")
     if after_completion:
-        assert_that(proc_track_result.stdout).does_not_contain("IMB-MPI1")
+        assert_that(proc_track_result.stdout).does_not_match(".*mpiexec.hydra.*sleep")
     else:
-        assert_that(proc_track_result.stdout).contains("IMB-MPI1")
+        assert_that(proc_track_result.stdout).matches(".*mpiexec.hydra.*sleep")
+
+
+def _check_mpi_process(remote_command_executor, slurm_commands, num_nodes, after_completion):
+    """Submit script and check for MPI processes."""
+    # Clean up old datafiles
+    check_proc_file = "/shared/check_proc.out"
+
+    # Check completion status of MPI process using the shared datafile
+    remote_command_executor.run_remote_command(f"rm -f {check_proc_file}")
+    retry(wait_fixed=seconds(10), stop_max_attempt_number=4)(_assert_mpi_process_completion)(
+        remote_command_executor, slurm_commands, num_nodes, after_completion, check_proc_file
+    )
 
 
 def _test_cluster_gpu_limits(slurm_commands, partition, instance_type, max_count, gpu_per_instance, gpu_type):
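
A note on the refactor above: the retry decorator moves from the definition of _check_mpi_process to the call site, via the retry(...)(func)(args) form, so the one-time cleanup of the datafile runs exactly once while only the submit-and-assert step is retried. Below is a minimal sketch of that pattern, assuming the retrying library and a seconds() helper that converts seconds to the milliseconds retrying expects; neither import is shown in this diff, so treat both as assumptions about the surrounding test suite.

    from retrying import retry


    def seconds(n):
        # Assumed helper: retrying's wait_fixed is given in milliseconds.
        return n * 1000


    def _assert_ready(state={"tries": 0}):
        # Stand-in for _assert_mpi_process_completion: fails until the
        # third attempt, like a process listing that is slow to settle.
        state["tries"] += 1
        if state["tries"] < 3:
            raise AssertionError("process list not settled yet")


    # One-time setup stays outside the retried body (runs exactly once) ...
    print("cleanup: rm -f /shared/check_proc.out")
    # ... while retry() wraps only the assertion: up to 4 attempts,
    # 10 seconds apart, retrying on any exception.
    retry(wait_fixed=seconds(10), stop_max_attempt_number=4)(_assert_ready)()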

tests/integration-tests/tests/schedulers/test_slurm/test_slurm/mpi_job.sh

Lines changed: 1 addition & 1 deletion
@@ -7,4 +7,4 @@
 #SBATCH --output=runscript.out
 
 module load intelmpi
-mpirun -n 6 IMB-MPI1 Alltoall -npmin 2
+mpirun -n 6 bash -c 'sleep 300' -npmin 2
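
Two details worth calling out in the new process check and job script. In the ps pipeline, grep -v "grep" filters out the grep process itself, which would otherwise always match its own command line; and the assertions switch from substring checks on "IMB-MPI1" to regex checks because the marker now spans two tokens (mpiexec.hydra ... sleep) in the ps output. The trailing -npmin 2 on the new mpirun line was an IMB-MPI1 benchmark option; bash -c now receives it as positional parameters ($0 and $1), where it is harmless but vestigial. A quick illustration of the regex semantics, assuming the assertpy-style matches()/does_not_match() assertions behave like re.search (the ps lines below are invented for illustration):

    import re

    PATTERN = ".*mpiexec.hydra.*sleep"

    # Hypothetical ps output lines for a running and a cancelled job.
    running = "ec2-user  1234  mpiexec.hydra -n 6 bash -c sleep 300"
    cancelled = "ec2-user  5678  /usr/sbin/sshd -D"

    assert re.search(PATTERN, running)        # job running: pattern matches
    assert not re.search(PATTERN, cancelled)  # after cancellation: no match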
