Skip to content

Commit 48599a1

Browse files
authored
Super-slicing cluster create arguments validation (#911)
* Super-slicing cluster create arguments validation
* lint and goldens fixes
* fix f-string-without-interpolation
1 parent c7a2049 commit 48599a1

File tree

7 files changed

+299
-86
lines changed

7 files changed

+299
-86
lines changed

goldens.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ goldens:
1818
"Cluster create sub-slicing":
1919
command: SUB_SLICING_ENABLED=true xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=v6e-4x4 --reservation=golden-reservation --sub-slicing --dry-run
2020
"Cluster create super-slicing":
21-
command: SUPER_SLICING_ENABLED=true xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-4x4x4 --reservation=golden-reservation --super-slicing --num-slices=5 --dry-run
21+
command: SUPER_SLICING_ENABLED=true xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-4x4x4 --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/subblock --super-slicing --num-slices=5 --dry-run
2222
"Cluster create private":
2323
command: xpk cluster create-pathways --project=golden-project --zone=us-central1-a --cluster=golden-cluster-private --private --tpu-type=v5p-8 --num-slices=1 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation=golden-reservation --dry-run
2424
"Cluster create with Managed Lustre driver":

goldens/Cluster_create_super-slicing.txt

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
$ SUPER_SLICING_ENABLED=true xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-4x4x4 --reservation=golden-reservation --super-slicing --num-slices=5 --dry-run
1+
$ SUPER_SLICING_ENABLED=true xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-4x4x4 --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/subblock --super-slicing --num-slices=5 --dry-run
22
[XPK] Starting xpk v0.0.0
33
[XPK] Starting cluster create for cluster golden-cluster:
44
[XPK] Working on golden-project and us-central1-a
5+
[XPK] Task: `Get reservation deployment type` is implemented by the following command not running since it is a dry run.
6+
gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="value(deploymentType)"
57
[XPK] Task: `Determine server supported GKE versions for default gke version` is implemented by the following command not running since it is a dry run.
68
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
79
[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.
@@ -51,11 +53,11 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf
5153
[XPK] Existing node pool names ['0']
5254
[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
5355
gcloud compute resource-policies describe tpu7x-128-4x4x4-ss-placement-policy --project=golden-project --region=us-central1
54-
[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
55-
[XPK] To complete NodepoolCreate-golden-cluster-np-1 we are executing gcloud beta container node-pools create golden-cluster-np-1 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
56-
[XPK] To complete NodepoolCreate-golden-cluster-np-2 we are executing gcloud beta container node-pools create golden-cluster-np-2 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
57-
[XPK] To complete NodepoolCreate-golden-cluster-np-3 we are executing gcloud beta container node-pools create golden-cluster-np-3 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
58-
[XPK] To complete NodepoolCreate-golden-cluster-np-4 we are executing gcloud beta container node-pools create golden-cluster-np-4 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
56+
[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/subblock --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
57+
[XPK] To complete NodepoolCreate-golden-cluster-np-1 we are executing gcloud beta container node-pools create golden-cluster-np-1 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/subblock --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
58+
[XPK] To complete NodepoolCreate-golden-cluster-np-2 we are executing gcloud beta container node-pools create golden-cluster-np-2 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/subblock --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
59+
[XPK] To complete NodepoolCreate-golden-cluster-np-3 we are executing gcloud beta container node-pools create golden-cluster-np-3 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/subblock --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
60+
[XPK] To complete NodepoolCreate-golden-cluster-np-4 we are executing gcloud beta container node-pools create golden-cluster-np-4 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/subblock --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
5961
[XPK] Breaking up a total of 5 commands into 1 batches
6062
[XPK] Pretending all the jobs succeeded
6163
[XPK] Create or delete node pool request complete.
@@ -70,15 +72,15 @@ metadata:
7072
data:
7173
tpu7x-128: "80"
7274

73-
[XPK] Temp file (a3dd06b296e1eb6792c99ad309e6eb714888c53e8b8fb8adc3beb8f250ef163c) content:
75+
[XPK] Temp file (cd1d35a150fd66bdee3f462567a1cf4ed56b8af9ab774e15dcdac83b6b82e48e) content:
7476
kind: ConfigMap
7577
apiVersion: v1
7678
metadata:
7779
name: golden-cluster-metadata-configmap
7880
data:
7981
xpk_version: v0.0.0
8082
capacity_type: RESERVATION
81-
reservation_id: golden-reservation
83+
reservation_id: golden-reservation/reservationBlocks/block/reservationSubBlocks/subblock
8284

8385
[XPK] Breaking up a total of 2 commands into 1 batches
8486
[XPK] Pretending all the jobs succeeded

src/xpk/commands/cluster.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from ..utils.feature_flags import FeatureFlags
2020
from ..utils.versions import ReleaseChannel
2121
from ..core.pathways import get_pathways_machine_types
22-
from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type
22+
from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type, parse_reservation
2323
from ..core.cluster import (
2424
get_all_clusters_programmatic,
2525
get_cluster_credentials,
@@ -79,7 +79,7 @@
7979
from ..utils.execution_context import is_dry_run, is_quiet
8080
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
8181
from . import cluster_gcluster
82-
from .common import set_cluster_command, validate_sub_slicing_system
82+
from .common import set_cluster_command, validate_sub_slicing_system, validate_super_slicing_system
8383
from jinja2 import Environment, FileSystemLoader
8484
from ..utils.templates import get_templates_absolute_path
8585
import shutil
@@ -211,6 +211,9 @@ def _validate_cluster_create_args(args, system: SystemCharacteristics):
211211
if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
212212
validate_sub_slicing_system(system)
213213
_validate_sub_slicing_reservation(args)
214+
if FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing:
215+
validate_super_slicing_system(system)
216+
_validate_super_slicing_reservation(args)
214217
if args.enable_pathways:
215218
_validate_pathways_machine(args)
216219

@@ -233,15 +236,30 @@ def _validate_pathways_machine(args):
233236

234237

235238
def _validate_sub_slicing_reservation(args):
239+
_validate_gsc_reservation(args, 'Sub-slicing')
240+
241+
242+
def _validate_super_slicing_reservation(args):
243+
_validate_gsc_reservation(args, 'Super-slicing')
244+
reservation = parse_reservation(args.reservation, args.project)
245+
if reservation.block_name is None:
246+
xpk_print(
247+
'Error: Validation failed: Super-slicing cluster creation'
248+
' requires a block or sub-block reservation.'
249+
)
250+
xpk_exit(1)
251+
252+
253+
def _validate_gsc_reservation(args, creation_description: str):
236254
if args.reservation is None:
237255
xpk_print(
238-
'Error: Validation failed: Sub-slicing cluster creation requires'
239-
' Cluster Director reservation to be specified.'
256+
f'Error: Validation failed: {creation_description} cluster creation'
257+
' requires Cluster Director reservation to be specified.'
240258
)
241259
xpk_exit(1)
242260

243261
deployment_type = get_reservation_deployment_type(
244-
reservation=args.reservation, project=args.project, zone=args.zone
262+
reservation_path=args.reservation, project=args.project, zone=args.zone
245263
)
246264
if deployment_type != 'DENSE':
247265
xpk_print(

src/xpk/commands/cluster_test.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,3 +629,103 @@ def test_run_gke_cluster_create_command_with_super_slicing_enables_slice_control
629629
mocks.commands_tester.assert_command_run(
630630
'clusters create', '--enable-slice-controller'
631631
)
632+
633+
634+
def test_validate_cluster_create_args_for_correct_super_slicing_args_pass(
635+
mocks: _Mocks,
636+
):
637+
FeatureFlags.SUPER_SLICING_ENABLED = True
638+
args = construct_args(
639+
super_slicing=True,
640+
reservation='test-reservation/reservationBlocks/block',
641+
)
642+
643+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
644+
args = construct_args(
645+
super_slicing=True,
646+
reservation='test-reservation/reservationBlocks/block/reservationSubBlocks/subblock',
647+
)
648+
_validate_cluster_create_args(
649+
args, UserFacingNameToSystemCharacteristics['tpu7x-128']
650+
)
651+
652+
assert mocks.common_print_mock.call_count == 0
653+
654+
655+
def test_validate_cluster_create_args_for_super_slicing_system_not_supported_throws(
656+
mocks: _Mocks,
657+
):
658+
FeatureFlags.SUPER_SLICING_ENABLED = True
659+
args = construct_args(
660+
super_slicing=True,
661+
reservation='test-reservation/reservationBlocks/block',
662+
)
663+
664+
with pytest.raises(SystemExit):
665+
_validate_cluster_create_args(
666+
args, UserFacingNameToSystemCharacteristics['tpu7x-4x4x8']
667+
)
668+
669+
assert mocks.common_print_mock.call_count == 1
670+
assert (
671+
mocks.common_print_mock.call_args[0][0]
672+
== 'Error: tpu7x-256 does not support Super-slicing.'
673+
)
674+
675+
676+
def test_validate_cluster_create_args_for_super_slicing_missing_reservation(
677+
mocks: _Mocks,
678+
):
679+
FeatureFlags.SUPER_SLICING_ENABLED = True
680+
args = construct_args(
681+
super_slicing=True,
682+
reservation=None,
683+
)
684+
685+
with pytest.raises(SystemExit):
686+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
687+
688+
assert mocks.commands_print_mock.call_count == 1
689+
assert (
690+
'Validation failed: Super-slicing cluster creation requires'
691+
in mocks.commands_print_mock.call_args[0][0]
692+
)
693+
694+
695+
def test_validate_cluster_create_args_for_super_slicing_reservation_no_blocks(
696+
mocks: _Mocks,
697+
):
698+
FeatureFlags.SUPER_SLICING_ENABLED = True
699+
args = construct_args(
700+
super_slicing=True,
701+
reservation='reservation',
702+
)
703+
704+
with pytest.raises(SystemExit):
705+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
706+
707+
assert mocks.commands_print_mock.call_count == 1
708+
assert (
709+
'requires a block or sub-block reservation'
710+
in mocks.commands_print_mock.call_args[0][0]
711+
)
712+
713+
714+
def test_validate_cluster_create_args_for_super_slicing_sparse_deployment_type_reservation(
715+
mocks: _Mocks,
716+
):
717+
FeatureFlags.SUPER_SLICING_ENABLED = True
718+
args = construct_args(
719+
super_slicing=True,
720+
reservation='test-reservation/reservationBlocks/block',
721+
)
722+
mocks.commands_get_reservation_deployment_type.return_value = 'SPARSE'
723+
724+
with pytest.raises(SystemExit):
725+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
726+
727+
assert mocks.commands_print_mock.call_count == 5
728+
assert (
729+
'Refer to the documentation for more information on creating Cluster'
730+
in mocks.commands_print_mock.call_args[0][0]
731+
)

src/xpk/commands/common.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,9 @@ def validate_sub_slicing_system(system: SystemCharacteristics):
7373
if not system.supports_sub_slicing:
7474
xpk_print(f'Error: {system.device_type} does not support Sub-slicing.')
7575
xpk_exit(1)
76+
77+
78+
def validate_super_slicing_system(system: SystemCharacteristics):
79+
if not system.supports_super_slicing:
80+
xpk_print(f'Error: {system.device_type} does not support Super-slicing.')
81+
xpk_exit(1)

0 commit comments

Comments
 (0)