Skip to content

Commit 83a875b

Browse files
committed
Merge branch 'main' into jax-config-common-func
2 parents c7783cc + a4fac20 commit 83a875b

File tree

124 files changed

+8433
-886
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

124 files changed

+8433
-886
lines changed

axlearn/cloud/common/bastion.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,15 +1037,30 @@ def _update_single_job(self, job: Job) -> Job:
10371037
serialized_jobspec = io.StringIO()
10381038
serialize_jobspec(job.spec, serialized_jobspec)
10391039
env_vars |= {_BASTION_SERIALIZED_JOBSPEC_ENV_VAR: serialized_jobspec.getvalue()}
1040-
_start_command(
1041-
job,
1042-
remote_log_dir=self._log_dir,
1043-
env_vars=env_vars,
1044-
)
1045-
assert job.command_proc is not None
10461040

1047-
# If command is completed, move to CLEANING. Otherwise, it's still RUNNING.
1048-
if _is_proc_complete(job.command_proc):
1041+
try:
1042+
_start_command(
1043+
job,
1044+
remote_log_dir=self._log_dir,
1045+
env_vars=env_vars,
1046+
)
1047+
except Exception as e: # pylint: disable=broad-exception-caught
1048+
job.command_proc = None
1049+
logging.warning(
1050+
"Failed to start command for the job %s: %s with error: %s",
1051+
job.spec.name,
1052+
job.spec.command,
1053+
e,
1054+
)
1055+
1056+
if job.command_proc is None:
1057+
# If the command failed to start, move the job directly to CLEANING.
1058+
self._append_to_job_history(
1059+
job, msg="Failed to start job command", state=JobLifecycleState.CLEANING
1060+
)
1061+
job.state.status = JobStatus.CLEANING
1062+
elif _is_proc_complete(job.command_proc):
1063+
# If command is completed, move to CLEANING. Otherwise, it's still RUNNING.
10491064
self._append_to_job_history(
10501065
job, msg="CLEANING: process finished", state=JobLifecycleState.CLEANING
10511066
)

axlearn/cloud/common/bastion_test.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1112,6 +1112,46 @@ def mock_tfio_exists(f):
11121112
updated_job.state, JobState(status=JobStatus.CLEANING, metadata={"tier": 1})
11131113
)
11141114

1115+
@mock.patch("subprocess.Popen")
1116+
def test_active_command_malformed(self, mock_popen):
1117+
"""If the command is malformed, bastion should gracefully move the job to CLEANING state."""
1118+
job = Job(
1119+
spec=new_jobspec(
1120+
name="test_job",
1121+
command="command",
1122+
cleanup_command="cleanup",
1123+
metadata=JobMetadata(
1124+
user_id="test_user",
1125+
project_id="test_job",
1126+
creation_time=datetime.now(),
1127+
resources={"v4": 8},
1128+
),
1129+
),
1130+
state=JobState(status=JobStatus.ACTIVE, metadata={"tier": 1}),
1131+
command_proc=None, # Initially, command is None.
1132+
cleanup_proc=None,
1133+
)
1134+
patch_fns = mock.patch.multiple(
1135+
bastion.__name__,
1136+
_upload_job_state=mock.DEFAULT,
1137+
)
1138+
mock_popen.side_effect = ValueError("Command malformed")
1139+
1140+
with patch_fns, self._patch_bastion(None) as mock_bastion:
1141+
# Initially, job should have no command.
1142+
self.assertIsNone(job.command_proc)
1143+
1144+
# Run single update step to start the job.
1145+
updated_job = mock_bastion._update_single_job(job)
1146+
1147+
# Command failed to be started.
1148+
self.assertIsNone(updated_job.command_proc)
1149+
1150+
# Job state should be CLEANING.
1151+
self.assertEqual(
1152+
updated_job.state, JobState(status=JobStatus.CLEANING, metadata={"tier": 1})
1153+
)
1154+
11151155
# pylint: disable-next=too-many-branches
11161156
def test_update_jobs(self):
11171157
"""Tests the global update step."""

axlearn/cloud/common/git_summary.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,17 +104,20 @@ class GitSummaryMembers(enum.Enum):
104104
commit = GitSummaryMember(
105105
label="git-commit", file=".git.commit", cmd=("git", "rev-parse", "HEAD")
106106
)
107+
# For retrieving a git branch, use rev-parse instead of ``git branch --show-current``,
108+
# since the latter is not supported on older git versions (it was introduced in git 2.22.0).
107109
branch = GitSummaryMember(
108110
label="git-branch",
109111
file=".git.branch",
110-
cmd=("git", "branch", "--show-current"),
112+
cmd=("git", "rev-parse", "--abbrev-ref", "HEAD"),
111113
indicator_label=False,
112114
)
113115
remote = GitSummaryMember(
114116
label="git-remote",
115117
file=".git.remote",
116118
# get remote url where branch is tracked (if any)
117-
cmd="git remote get-url $(git config --get branch.$(git branch --show-current).remote) "
119+
cmd="git remote get-url $(git config --get"
120+
" branch.$(git rev-parse --abbrev-ref HEAD).remote) "
118121
"|| git remote get-url origin " # fallback to origin
119122
"|| git config --get-regexp 'remote.*.url' | head -n 1 | cut -d= -f2", # or 1st remote
120123
required=False, # remote is not always available

axlearn/cloud/common/git_summary_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,9 @@ def test_valid_summary(self, remote):
7070
+ (
7171
(
7272
f"&& git remote add origin {remote} "
73-
"&& git config branch.$(git branch --show-current).remote origin "
74-
"&& git config branch.$(git branch --show-current).merge refs/heads/"
75-
"$(git branch --show-current)"
73+
"&& git config branch.$(git rev-parse --abbrev-ref HEAD).remote origin "
74+
"&& git config branch.$(git rev-parse --abbrev-ref HEAD).merge refs/heads/"
75+
"$(git rev-parse --abbrev-ref HEAD)"
7676
)
7777
if remote
7878
else ""

axlearn/cloud/gcp/jobs/cpu_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def define_flags(cls, fv: flags.FlagValues):
117117
"https://cloud.google.com/compute/docs/general-purpose-machines",
118118
**common_kwargs,
119119
)
120-
flags.DEFINE_integer("disk_size", 64, "Disk size of the VM in GB.", **common_kwargs)
120+
flags.DEFINE_integer("disk_size", 128, "Disk size of the VM in GB.", **common_kwargs)
121121
flags.DEFINE_bool(
122122
"retain_vm",
123123
False,

axlearn/cloud/gcp/jobset_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,7 @@ def _build_pod(self) -> Nested[Any]:
579579
bucketName=parsed.netloc,
580580
# pylint: disable=line-too-long
581581
mountOptions=f"only-dir={parsed.path.lstrip('/')},implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,kernel-list-cache-ttl-secs=-1,gcs-connection:http-client-timeout:{cfg.gcsfuse_mount.http_client_timeout}",
582-
gcsfuseMetadataPrefetchOnMount="true", # Improves first-time read.
582+
gcsfuseMetadataPrefetchOnMount="false", # Prefetch disabled; "true" improves first-time read.
583583
disableMetrics="false", # Enables GCSFuse metrics by default.
584584
),
585585
),

axlearn/cloud/gcp/vm.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -365,19 +365,28 @@ def list_disk_images(creds: Credentials) -> list[str]:
365365
"""
366366
resource = _compute_resource(creds)
367367
image_project = gcp_settings("image_project")
368-
images = (
369-
resource.images()
370-
.list(
371-
project=image_project,
372-
orderBy="creationTimestamp desc",
373-
maxResults=50,
374-
)
375-
.execute()
376-
)
377368
image_names = []
378-
for el in images["items"]:
379-
if "ubuntu-2004" in el["name"] and "arm" not in el["name"]:
380-
image_names.append(el["name"])
369+
next_page_token = None
370+
371+
while True:
372+
images = (
373+
resource.images()
374+
.list(
375+
project=image_project,
376+
orderBy="creationTimestamp desc",
377+
maxResults=100,
378+
pageToken=next_page_token,
379+
)
380+
.execute()
381+
)
382+
for el in images["items"]:
383+
if "ubuntu-2204" in el["name"] and "arm" not in el["name"]:
384+
image_names.append(el["name"])
385+
386+
next_page_token = images.get("nextPageToken")
387+
if next_page_token is None:
388+
break
389+
381390
return [f"projects/{image_project}/global/images/{name}" for name in image_names]
382391

383392

0 commit comments

Comments
 (0)