Skip to content

Commit

Permalink
Making CloudDNS optional for Pathways-enabled clusters.
Browse files Browse the repository at this point in the history
  • Loading branch information
RoshaniN committed Oct 29, 2024
1 parent d88b092 commit b850df0
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 9 deletions.
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,14 @@ all zones.
--num-slices=4 --on-demand \
--tpu-type=v5litepod-16
```
Please specify `--enable-clouddns` if you would like CloudDNS to be the
DNS provider for the Pathways cluster. For example,
```shell
python3 xpk.py cluster create-pathways \
--cluster xpk-pw-test-clouddns \
--num-slices=4 --on-demand \
--tpu-type=v5litepod-16 \
--enable-clouddns=True
```

* Cluster Create can be called again with the same `--cluster name` to modify
the number of slices or retry failed steps.
Expand Down Expand Up @@ -370,8 +378,8 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
--tpu-type=v5litepod-16 \
--cluster xpk-pw-test
```
Executing the command above would provide the address of the proxy that the user job should connect to.
Specify `JAX_PLATFORMS=proxy` and `JAX_BACKEND_TARGET=<proxy address from above>` and `import previewutilies` to establish this connection between the user's JAX code and the Pathways proxy. Execute Pathways workloads interactively on Vertex AI notebooks!
Executing the command above provides the address of the proxy that the user job should connect to. Users need to use `kubectl port-forward` to establish a connection from the notebook/VM to the proxy.
Set `JAX_PLATFORMS=proxy` and `JAX_BACKEND_TARGET=<proxy address from above>`, and `import pathwaysutils` in the JAX code, to establish this connection between the user's JAX code and the Pathways proxy. Execute Pathways workloads interactively on Vertex AI notebooks!
### Set `max-restarts` for production jobs
Expand Down
13 changes: 9 additions & 4 deletions src/xpk/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def cluster_create(args) -> None:
xpk_exit(create_cluster_command_code)

# Update Pathways clusters with CloudDNS if requested and not enabled already.
if args.enable_pathways:
if args.enable_pathways and args.enable_clouddns:
update_cluster_command_code = update_cluster_with_clouddns_if_necessary(
args
)
Expand Down Expand Up @@ -468,10 +468,15 @@ def run_gke_cluster_create_command(
command += (
' --enable-ip-alias'
f' --create-subnetwork name={args.cluster}-subnetwork'
' --cluster-dns=clouddns'
' --cluster-dns-scope=vpc'
f' --cluster-dns-domain={args.cluster}-domain'
)
if args.enable_clouddns:
# Enables CloudDNS as the default provider of the Pathways cluster,
# useful for Pathways headless mode workloads.
command += (
' --cluster-dns=clouddns'
' --cluster-dns-scope=vpc'
f' --cluster-dns-domain={args.cluster}-domain'
)

return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
if return_code != 0:
Expand Down
5 changes: 2 additions & 3 deletions src/xpk/commands/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,10 +326,9 @@ def workload_create(args) -> None:

if args.headless and not is_cluster_using_clouddns(args):
xpk_print(
'Please run xpk cluster create-pathways first, to upgrade and enable'
' CloudDNS on your cluster.'
'Cluster is not using CloudDNS, connect to the proxy server'
' using kubectl port forwarding. '
)
xpk_exit(1)

set_cluster_command_code = set_cluster_command(args)
if set_cluster_command_code != 0:
Expand Down
6 changes: 6 additions & 0 deletions src/xpk/parser/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,12 @@ def set_cluster_parser(cluster_parser):
default=None,
help='The tpu type to use, v5litepod-16, etc.',
)
cluster_create_pathways_optional_arguments.add_argument(
'--enable-clouddns',
type=bool,
default=False,
help='Enables CloudDNS on the Pathways cluster.',
)

add_shared_cluster_create_required_arguments([
cluster_create_required_arguments,
Expand Down
2 changes: 2 additions & 0 deletions src/xpk/parser/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,8 @@ def add_shared_workload_create_optional_arguments(args_parsers):
' headless mode. This arg can only be used in `xpk workload'
' create-pathways`(preferred) or `xpk workload create'
' --use-pathways.` (--use-pathways will be deprecated soon).'
' Headless workloads may be created on clusters with/without '
' CloudDNS.'
),
)
custom_parser.add_argument(
Expand Down

0 comments on commit b850df0

Please sign in to comment.