@@ -1539,19 +1539,34 @@ def _wait_compute_cloudinit_done(remote_command_executor, compute_node):
    assert_that(compute_cloudinit_status_output).contains("status: done")


-@retry(wait_fixed=seconds(10), stop_max_attempt_number=4)
-def _check_mpi_process(remote_command_executor, slurm_commands, num_nodes, after_completion):
-    """Submit script and check for MPI processes."""
-    # Clean up old datafiles
-    remote_command_executor.run_remote_command("rm -f /shared/check_proc.out")
-    result = slurm_commands.submit_command("ps aux | grep IMB | grep MPI >> /shared/check_proc.out", nodes=num_nodes)
+def _assert_mpi_process_completion(
+    remote_command_executor, slurm_commands, num_nodes, after_completion, check_proc_file
+):
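+    """Assert that the tracked MPI processes are running on the compute nodes, or absent after completion."""
+    # "grep -v grep" excludes the grep command itself from the recorded process list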
+    result = slurm_commands.submit_command(
+        f'ps aux | grep "mpiexec.hydra.*sleep" | grep -v "grep" >> {check_proc_file}', nodes=num_nodes
+    )
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
-    proc_track_result = remote_command_executor.run_remote_command("cat /shared/check_proc.out")
+    proc_track_result = remote_command_executor.run_remote_command(f"cat {check_proc_file}")
    if after_completion:
-        assert_that(proc_track_result.stdout).does_not_contain("IMB-MPI1")
+        assert_that(proc_track_result.stdout).does_not_match(".*mpiexec.hydra.*sleep")
    else:
-        assert_that(proc_track_result.stdout).contains("IMB-MPI1")
+        assert_that(proc_track_result.stdout).matches(".*mpiexec.hydra.*sleep")
+
+
+def _check_mpi_process(remote_command_executor, slurm_commands, num_nodes, after_completion):
+    """Submit script and check for MPI processes."""
+    check_proc_file = "/shared/check_proc.out"
+
+    # Clean up old datafiles
+    remote_command_executor.run_remote_command(f"rm -f {check_proc_file}")
+    # Check completion status of the MPI processes using the shared datafile
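+    # The assertion is retried to give the processes time to start (or to terminate) on every node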
+    retry(wait_fixed=seconds(10), stop_max_attempt_number=4)(_assert_mpi_process_completion)(
+        remote_command_executor, slurm_commands, num_nodes, after_completion, check_proc_file
+    )


def _test_cluster_gpu_limits(slurm_commands, partition, instance_type, max_count, gpu_per_instance, gpu_type):