milaboratory · mike-ainsel · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/infrastructure/aws/cloudformation-eks-1-35.yaml b/infrastructure/aws/cloudformation-eks-1-35.yaml
@@ -168,8 +168,13 @@ Parameters:
   ClusterName:
     Type: String
     Default: platforma-cluster
-    AllowedPattern: '^[a-zA-Z0-9][a-zA-Z0-9_-]{0,99}$'
-    ConstraintDescription: 'Alphanumeric, hyphens, underscores, 1-100 characters'
+    # Max 25 characters keeps the derived ECR pull-through cache prefix
+    # (`quay-${ClusterName}`) within AWS's 30-char limit; ECR also rejects a
+    # prefix that ends in a hyphen, so the last character must be alphanumeric.
+    # Lowercase alphanumeric and hyphens only: the auto-generated S3 bucket
+    # name (`platforma-${ClusterName}-...`) cannot contain underscores or uppercase.
+    AllowedPattern: '^[a-z0-9]([a-z0-9-]{0,23}[a-z0-9])?$'
+    ConstraintDescription: 'Lowercase alphanumeric and hyphens, 1-25 characters, no trailing hyphen'
     Description: EKS cluster name
 
   PlatformaNamespace:
@@ -456,7 +461,7 @@ Parameters:
     AllowedValues: [small, medium, large, xlarge]
     Description: >
       Cluster sizing profile. Controls node group scaling limits and Kueue quotas.
-      All deployment sizes support the same maximum single-job size (62 vCPU / 500Gi).
+      All deployment sizes support the same maximum single-job size (62 vCPU / 484Gi).
       Larger sizes allow more jobs to run simultaneously.
       GPU nodes (g6f/g6/g6e) scale from 0 — no cost when idle. Disable entirely with EnableGpu=false.
       6 GPU tiers: 3GB (g6f.xl, $0.24/hr), 6GB (g6f.2xl, $0.49/hr), 12GB (g6f.4xl, $0.98/hr),
@@ -539,13 +544,18 @@ Mappings:
 
   # Kueue ClusterQueue quotas and node group MaxSize per deployment size.
   # Quotas are derived from pool sizes (worst case: all jobs are max size).
-  # CPU: (64-vCPU node count) x 63 allocatable.  Memory: (r7i.16xl count) x 500Gi.
+  # CPU: (64-vCPU node count) x 63 allocatable.  Memory: (r7i.16xl count) x 484Gi.
+  # 484Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus ~1 GiB EKS DaemonSet
+  # overhead (aws-node, kube-proxy, ebs-csi) minus 1 GiB safety margin for
+  # customer-installed DaemonSets (Datadog, Falco, Wiz, etc.). Matches the GCP
+  # per-job ceiling on n2d-highmem-64 so the same deployment_size label means
+  # the same workload capacity on both clouds.
   # UI quota is fixed at 64 vCPU / 256 GiB regardless of size.
   DeploymentSize:
     small:
-      # Kueue quotas (derived: 2x63=126 CPU, 1x500=500Gi)
+      # Kueue quotas (derived: 2x63=126 CPU, 1x484=484Gi)
       BatchCpu: '126'
-      BatchMemoryGi: '500'
+      BatchMemoryGi: '484'
       UiCpu: '64'
       UiMemoryGi: '256'
       # Node group MaxSize per pool
@@ -562,9 +572,9 @@ Mappings:
       MaxGpu96g: '1'
       MaxUi: '4'
     medium:
-      # Kueue quotas (derived: 4x63=252 CPU, 2x500=1000Gi)
+      # Kueue quotas (derived: 4x63=252 CPU, 2x484=968Gi)
       BatchCpu: '252'
-      BatchMemoryGi: '1000'
+      BatchMemoryGi: '968'
       UiCpu: '64'
       UiMemoryGi: '256'
       MaxBatch16c64g: '8'
@@ -580,9 +590,9 @@ Mappings:
       MaxGpu96g: '1'
       MaxUi: '8'
     large:
-      # Kueue quotas (derived: 8x63=504 CPU, 4x500=2000Gi)
+      # Kueue quotas (derived: 8x63=504 CPU, 4x484=1936Gi)
       BatchCpu: '504'
-      BatchMemoryGi: '2000'
+      BatchMemoryGi: '1936'
       UiCpu: '64'
       UiMemoryGi: '256'
       MaxBatch16c64g: '16'
@@ -598,9 +608,9 @@ Mappings:
       MaxGpu96g: '2'
       MaxUi: '16'
     xlarge:
-      # Kueue quotas (derived: 16x63=1008 CPU, 8x500=4000Gi)
+      # Kueue quotas (derived: 16x63=1008 CPU, 8x484=3872Gi)
       BatchCpu: '1008'
-      BatchMemoryGi: '4000'
+      BatchMemoryGi: '3872'
       UiCpu: '64'
       UiMemoryGi: '256'
       MaxBatch16c64g: '32'
@@ -2540,7 +2550,7 @@ Resources:
                     --set extraArgs.max-node-group-backoff-duration=5m \
                     --set extraArgs.enable-provisioning-requests=true \
                     --set extraArgs.kube-api-content-type=application/json \
-                    --set extraArgs.startup-taints=nvidia.com/gpu-not-ready \
+                    --set extraArgs.startup-taint=nvidia.com/gpu-not-ready \
                     --atomic --timeout 5m
 
                   # Grant CA access to ProvisioningRequest and PodTemplate resources.
@@ -3074,7 +3084,7 @@ Resources:
                   kueue:
                     maxJobResources:
                       cpu: 62
-                      memory: 500Gi
+                      memory: 484Gi
                     mode: dedicated
                     pools:
                       ui:
@@ -3118,7 +3128,7 @@ Resources:
                         memory: 16Gi
                       limits:
                         cpu: 8
-                        memory: 16Gi
+                        memory: 32Gi
                     nodeSelector:
                       node.kubernetes.io/pool: system
                     extraArgs: ${APP_EXTRA_ARGS:-[]}
@@ -3327,7 +3337,7 @@ Resources:
       ExternalDnsVersion: '1.20.0'
       AlbControllerVersion: '3.0.0'
       # Bump for non-version buildspec changes (new flags, RBAC, install order)
-      BuildSpecRevision: '2'
+      BuildSpecRevision: '3'
 
   TriggerPlatformaDeploy:
     Type: AWS::CloudFormation::CustomResource

diff --git a/infrastructure/aws/values-aws-s3.yaml b/infrastructure/aws/values-aws-s3.yaml
@@ -63,9 +63,13 @@ ingress:
 
 kueue:
   # Max resources for a single job (rejects jobs that exceed this).
+  # 484Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus ~1 GiB EKS DaemonSet
+  # overhead minus 1 GiB safety margin for customer-installed DaemonSets.
+  # Matches GCP per-job ceiling on n2d-highmem-64 so deployment_size means the
+  # same workload capacity on both clouds. Reduce further if you run heavier DaemonSets.
   maxJobResources:
     cpu: 62
-    memory: 500Gi
+    memory: 484Gi
   mode: dedicated
   # Node pools — controls where jobs run.
   # Must match your EKS node group labels and taints.
@@ -101,7 +105,7 @@ kueue:
         memory: 256Gi
       batch:
         cpu: 126
-        memory: 500Gi
+        memory: 484Gi
       gpu:
         gpu: 8
         cpu: 32
@@ -115,6 +119,6 @@ app:
       memory: 16Gi
     limits:
       cpu: 8
-      memory: 16Gi
+      memory: 32Gi
   nodeSelector:
     node.kubernetes.io/pool: system