Update v6e-256 KubeRay Sample #2466

Merged: 14 commits, Nov 7, 2024
@@ -38,8 +38,6 @@ spec:
     rayStartParams: {}
     template:
       spec:
-        securityContext:
-          runAsUser: 0
         containers:
         - name: ray-worker
           image: rayproject/ray:2.37.0-py310
@@ -38,8 +38,6 @@ spec:
     rayStartParams: {}
     template:
       spec:
-        securityContext:
-          runAsUser: 0
         containers:
         - name: ray-worker
           image: rayproject/ray:2.37.0-py310
@@ -40,8 +40,6 @@ spec:
     rayStartParams: {}
     template:
       spec:
-        securityContext:
-          runAsUser: 0
         containers:
         - name: ray-worker
           image: rayproject/ray:2.37.0-py310
16 changes: 2 additions & 14 deletions ray-operator/config/samples/ray-job.tpu-v6e-256-multihost.yaml
@@ -40,11 +40,10 @@ spec:
     maxReplicas: 1
     numOfHosts: 64
     groupName: tpu-group
-    rayStartParams: {}
+    rayStartParams:
+      resources: '"{\"TPU\": 4}"'
     template:
       spec:
-        securityContext:
-          runAsUser: 0
         containers:
         - name: ray-worker
           image: rayproject/ray:2.37.0-py310
@@ -58,19 +57,8 @@ spec:
             google.com/tpu: "4"
             memory: 200G
           env:
-          - name: NODE_IP
-            valueFrom:
-              fieldRef:
-                fieldPath: status.hostIP
-          - name: VBAR_CONTROL_SERVICE_URL
-            value: $(NODE_IP):8353
           - name: JAX_PLATFORMS
             value: tpu,cpu
-          - name: ENABLE_PJRT_COMPATIBILITY
-            value: "true"
-          ports:
-          - containerPort: 8081
-            name: mxla
           nodeSelector:
             cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
             cloud.google.com/gke-tpu-topology: 16x16
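The doubly quoted resources value introduced above can look opaque. The sketch below is a rough model of why the escaping is shaped that way: the YAML single-quoted scalar yields the string "{\"TPU\": 4}", which is forwarded to the ray start --resources flag, and once the shell-style outer quotes and backslash escapes are stripped, what remains is plain JSON. The unquoting step here is a simplified illustration, not KubeRay's actual code path.

```python
import json

# The YAML scalar '"{\"TPU\": 4}"' is the string "{\"TPU\": 4}".
yaml_value = '"{\\"TPU\\": 4}"'

# Crude model of the shell stripping the outer double quotes and escapes
# before the value reaches `ray start --resources=...`.
shell_stripped = yaml_value[1:-1].replace('\\"', '"')

# What Ray ultimately parses is ordinary JSON: a custom resource name
# mapped to a capacity per worker.
parsed = json.loads(shell_stripped)
print(parsed)  # {'TPU': 4}
```

This is why tasks can later request the resource with @ray.remote(resources={"TPU": 4}), as the sample script in this PR does.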
9 changes: 8 additions & 1 deletion ray-operator/config/samples/tpu/tpu_list_devices.py
@@ -1,12 +1,19 @@
+import os
 import ray
 import jax
+import time
 
+from jax.experimental import multihost_utils
Collaborator: remove this import as it's no longer used

Contributor Author: I think we should leave that import in, along with multihost_utils.sync_global_devices("sync"). I wasn't able to schedule a v6e-256, but I tested just now with a multi-host v6e-16 slice, and adding that line ensures the JAX code runs once on each TPU host with Ray. I added it back in 372a081. Output of my manual test:

-------------------------------------------------------
Job 'raysubmit_EKeMpf1wY3pYYTzf' submitted successfully
-------------------------------------------------------

Next steps
  Query the logs of the job:
    ray job logs raysubmit_EKeMpf1wY3pYYTzf
  Query the status of the job:
    ray job status raysubmit_EKeMpf1wY3pYYTzf
  Request the job to be stopped:
    ray job stop raysubmit_EKeMpf1wY3pYYTzf

Tailing logs until the job exits (disable with --no-wait):
2024-11-06 22:52:41,758 INFO job_manager.py:528 -- Runtime env is setting up.
2024-11-06 22:52:54,414 INFO worker.py:1461 -- Using address 10.48.3.43:6379 set in the environment variable RAY_ADDRESS
2024-11-06 22:52:54,414 INFO worker.py:1601 -- Connecting to existing Ray cluster at address: 10.48.3.43:6379...
2024-11-06 22:52:54,420 INFO worker.py:1777 -- Connected to Ray cluster. View the dashboard at 10.48.3.43:8265 
Number of TPU Workers: 4
(tpu_cores pid=503, ip=10.48.8.7) TPU Worker: 1
['TPU cores:16', 'TPU cores:16', 'TPU cores:16', 'TPU cores:16']
(tpu_cores pid=487, ip=10.48.1.7) TPU Worker: 3 [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)

------------------------------------------
Job 'raysubmit_EKeMpf1wY3pYYTzf' succeeded
------------------------------------------


 ray.init()
 
 @ray.remote(resources={"TPU": 4})
 def tpu_cores():
-    return "TPU cores:" + str(jax.device_count())
+    cores = "TPU cores:" + str(jax.device_count())
+    print("TPU Worker: " + os.environ.get("TPU_WORKER_ID"))
+    return cores
 
 num_workers = int(ray.available_resources()["TPU"]) // 4
 print(f"Number of TPU Workers: {num_workers}")
 result = [tpu_cores.remote() for _ in range(num_workers)]
 print(ray.get(result))
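The numbers in this PR hang together arithmetically, assuming 4 v6e chips per host (as the container's google.com/tpu: "4" limit implies). A small sanity-check sketch, illustrative only:

```python
# Relates the 16x16 topology, numOfHosts: 64, and the script's
# num_workers computation for a v6e-256 slice.
CHIPS_PER_HOST = 4                        # google.com/tpu: "4" per worker
topology = (16, 16)                       # gke-tpu-topology: 16x16
total_chips = topology[0] * topology[1]   # 256 chips in a v6e-256 slice
num_hosts = total_chips // CHIPS_PER_HOST # matches numOfHosts: 64

# With resources='{"TPU": 4}' advertised per worker, Ray sees one "TPU"
# unit per chip, so num_workers = total TPU // 4 schedules exactly one
# task per host -- the behavior the author verified on a v6e-16 slice
# (16 chips -> "Number of TPU Workers: 4" in the log above).
num_workers = total_chips // 4
print(total_chips, num_hosts, num_workers)  # 256 64 64
```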