Skip to content

Commit b2af25d

Browse files
authored
Add auto-resume flag automatically when job is executed on HyperPod (HP) (#595)
1 parent bfe39ba commit b2af25d

File tree

1 file changed

+5
-0
lines changed
  • 3.test_cases/22.nemo-run/slurm

1 file changed

+5
-0
lines changed

3.test_cases/22.nemo-run/slurm/run.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ def slurm_executor(
7777

7878

7979
local_tunnel = run.LocalTunnel(job_dir="")
80+
81+
if os.path.isdir("/opt/sagemaker_cluster"):
82+
print("Detected Hyperpod cluster.. enabling --auto-resume=1")
83+
srun_args = ["--auto-resume=1"]
8084

8185
# This defines the slurm executor.
8286
# We connect to the executor via the tunnel defined by user, host and remote_job_dir.
@@ -91,6 +95,7 @@ def slurm_executor(
9195
mem="0",
9296
exclusive=True,
9397
packager=packager,
98+
srun_args=srun_args,
9499
)
95100

96101

0 commit comments

Comments
 (0)