Skip to content

Commit a62db5b

Browse files
committed
add multi-container support
1 parent 9b35b42 commit a62db5b

File tree

4 files changed

+82
-1
lines changed

4 files changed

+82
-1
lines changed

pathways-job

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Subproject commit 2880c34e7d71596664bafa1c3cecb5754a9991e7

src/xpk/commands/workload.py

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -489,7 +489,23 @@ def workload_create(args) -> None:
489489
- PodFailurePolicy"""
490490
restart_on_exit_codes_list = get_restart_exit_codes(args)
491491
restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes_list))
492-
pod_failure_policy = f"""
492+
if args.multi_container:
493+
pod_failure_policy = f"""
494+
podFailurePolicy:
495+
rules:
496+
- action: FailJob
497+
onExitCodes:
498+
containerName: {get_main_container_docker_image(args, system)}-1
499+
operator: NotIn
500+
values: [{restart_on_exit_codes}]
501+
- action: FailJob
502+
onExitCodes:
503+
containerName: {get_main_container_docker_image(args, system)}-2
504+
operator: NotIn
505+
values: [{restart_on_exit_codes}]"""
506+
507+
else:
508+
pod_failure_policy = f"""
493509
podFailurePolicy:
494510
rules:
495511
- action: FailJob

src/xpk/core/docker_container.py

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -112,6 +112,65 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
112112
'touch /shared-volume/stacktrace_signal; '
113113
)
114114

115+
if args.multi_container:
116+
containers = []
117+
for i in range(2):
118+
container_yaml = """
119+
- name: {docker_name}
120+
image: {docker_image}
121+
{image_pull_policy}
122+
env: {env}
123+
securityContext:
124+
privileged: true
125+
command:
126+
- bash
127+
- -c
128+
- |
129+
echo XPK Start: $(date);
130+
_sigterm() (kill -SIGTERM $! 2>/dev/null;);
131+
trap _sigterm SIGTERM;
132+
{gsutil_test_command}
133+
({command}) & PID=$!;
134+
while kill -0 $PID 2>/dev/null;
135+
do sleep 5;
136+
done;
137+
wait $PID;
138+
EXIT_CODE=$?;
139+
{xpk_internal_commands}
140+
echo XPK End: $(date);
141+
echo EXIT_CODE=$EXIT_CODE;
142+
{tpu_stacktrace_terminate_command}
143+
{gpu_workload_terminate_command}
144+
exit $EXIT_CODE
145+
resources:
146+
limits:
147+
{resources}
148+
"""
149+
volume_mounts = get_volume_mounts(args, system)
150+
if volume_mounts != '':
151+
container_yaml += """
152+
volumeMounts:
153+
{volume_mounts}
154+
"""
155+
containers.append(
156+
container_yaml.format(
157+
args=args,
158+
system=system,
159+
image_pull_policy=add_image_pull_policy_for_pw_or_gpu(args, system),
160+
env=get_env_container(args, system),
161+
docker_name=f'jax-tpu-{i+1}',
162+
docker_image=docker_image,
163+
gsutil_test_command=gsutil_test_command,
164+
command=command,
165+
tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command,
166+
gpu_workload_terminate_command=gpu_workload_terminate_command,
167+
xpk_internal_commands=xpk_internal_commands,
168+
resources=f'{resource_type}: {int(system.chips_per_vm / 2)}',
169+
volume_mounts=volume_mounts,
170+
)
171+
)
172+
return ''.join(containers)
173+
115174
yaml = """- name: {docker_name}
116175
image: {docker_image}
117176
{image_pull_policy}

src/xpk/parser/workload.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -131,6 +131,11 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
131131
default=1,
132132
help='The number of nodes to use, default=1.',
133133
)
134+
workload_create_parser_optional_arguments.add_argument(
135+
'--multi-container',
136+
action='store_true',
137+
help='Enable multi-container workload.',
138+
)
134139
workload_create_parser_optional_arguments.add_argument(
135140
'--scheduler',
136141
type=str,

0 commit comments

Comments
 (0)