From 4b9acbc0dc074730f4a8547b44ac26cd74ff93da Mon Sep 17 00:00:00 2001
From: Vladimir Antropov <vladimir.antropov@milaboratories.com>
Date: Tue, 26 May 2026 13:53:00 +0200
Subject: [PATCH 1/4] fix(aws): align Kueue batch memory with EKS allocatable
 plus safety margin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Batch jobs requesting the full r7i.16xlarge nominal capacity (500Gi) cannot
schedule on a real node — EKS reserves ~14 GiB for kubelet/OS, leaving
allocatable at ~486 GiB. Subtract an additional 1 GiB safety margin for
customer-installed DaemonSets (Datadog, Falco, Wiz, etc.) and the effective
per-node ceiling is 485 GiB.

Changes:
- CloudFormation Mappings BatchMemoryGi (per-node × max-batch-nodes):
    small   500 -> 485   (1 node)
    medium  1000 -> 970  (2 nodes)
    large   2000 -> 1940 (4 nodes)
    xlarge  4000 -> 3880 (8 nodes)
- kueue.maxJobResources.memory: 500Gi -> 485Gi (both CFN-rendered values
  and values-aws-s3.yaml)
- kueue.dedicated.resources.batch.memory: 500Gi -> 485Gi (values-aws-s3.yaml)
- Parameter description and Mappings header comment updated.

Also bumps platforma controller memory limit from 16Gi to 32Gi (matches
the request:limit ratio used on GCP) so the controller has burst headroom
under heavy workflow scheduling. Memory request stays at 16Gi.

This combines two related fixes: aligning the per-job ceiling with what
EKS can actually schedule, and reserving headroom for customer DaemonSets
so the chart's defaults work out of the box even with extra agents.
---
 .../aws/cloudformation-eks-1-35.yaml          | 27 ++++++++++---------
 infrastructure/aws/values-aws-s3.yaml         |  8 +++---
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/infrastructure/aws/cloudformation-eks-1-35.yaml b/infrastructure/aws/cloudformation-eks-1-35.yaml
index 6c26b11..2528401 100644
--- a/infrastructure/aws/cloudformation-eks-1-35.yaml
+++ b/infrastructure/aws/cloudformation-eks-1-35.yaml
@@ -456,7 +456,7 @@ Parameters:
     AllowedValues: [small, medium, large, xlarge]
     Description: >
       Cluster sizing profile. Controls node group scaling limits and Kueue quotas.
-      All deployment sizes support the same maximum single-job size (62 vCPU / 500Gi).
+      All deployment sizes support the same maximum single-job size (62 vCPU / 485Gi).
       Larger sizes allow more jobs to run simultaneously.
       GPU nodes (g6f/g6/g6e) scale from 0 — no cost when idle. Disable entirely with EnableGpu=false.
       6 GPU tiers: 3GB (g6f.xl, $0.24/hr), 6GB (g6f.2xl, $0.49/hr), 12GB (g6f.4xl, $0.98/hr),
@@ -539,13 +539,16 @@ Mappings:
 
   # Kueue ClusterQueue quotas and node group MaxSize per deployment size.
   # Quotas are derived from pool sizes (worst case: all jobs are max size).
-  # CPU: (64-vCPU node count) x 63 allocatable.  Memory: (r7i.16xl count) x 500Gi.
+  # CPU: (64-vCPU node count) x 63 allocatable.  Memory: (r7i.16xl count) x 485Gi.
+  # 485Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus 1 GiB safety margin
+  # for customer-installed DaemonSets (Datadog, Falco, Wiz, etc.). Customers
+  # with heavier DS footprints should lower kueue.maxJobResources.memory.
   # UI quota is fixed at 64 vCPU / 256 GiB regardless of size.
   DeploymentSize:
     small:
-      # Kueue quotas (derived: 2x63=126 CPU, 1x500=500Gi)
+      # Kueue quotas (derived: 2x63=126 CPU, 1x485=485Gi)
       BatchCpu: '126'
-      BatchMemoryGi: '500'
+      BatchMemoryGi: '485'
       UiCpu: '64'
       UiMemoryGi: '256'
       # Node group MaxSize per pool
@@ -562,9 +565,9 @@ Mappings:
       MaxGpu96g: '1'
       MaxUi: '4'
     medium:
-      # Kueue quotas (derived: 4x63=252 CPU, 2x500=1000Gi)
+      # Kueue quotas (derived: 4x63=252 CPU, 2x485=970Gi)
       BatchCpu: '252'
-      BatchMemoryGi: '1000'
+      BatchMemoryGi: '970'
       UiCpu: '64'
       UiMemoryGi: '256'
       MaxBatch16c64g: '8'
@@ -580,9 +583,9 @@ Mappings:
       MaxGpu96g: '1'
       MaxUi: '8'
     large:
-      # Kueue quotas (derived: 8x63=504 CPU, 4x500=2000Gi)
+      # Kueue quotas (derived: 8x63=504 CPU, 4x485=1940Gi)
       BatchCpu: '504'
-      BatchMemoryGi: '2000'
+      BatchMemoryGi: '1940'
       UiCpu: '64'
       UiMemoryGi: '256'
       MaxBatch16c64g: '16'
@@ -598,9 +601,9 @@ Mappings:
       MaxGpu96g: '2'
       MaxUi: '16'
     xlarge:
-      # Kueue quotas (derived: 16x63=1008 CPU, 8x500=4000Gi)
+      # Kueue quotas (derived: 16x63=1008 CPU, 8x485=3880Gi)
       BatchCpu: '1008'
-      BatchMemoryGi: '4000'
+      BatchMemoryGi: '3880'
       UiCpu: '64'
       UiMemoryGi: '256'
       MaxBatch16c64g: '32'
@@ -3074,7 +3077,7 @@ Resources:
                   kueue:
                     maxJobResources:
                       cpu: 62
-                      memory: 500Gi
+                      memory: 485Gi
                     mode: dedicated
                     pools:
                       ui:
@@ -3118,7 +3121,7 @@ Resources:
                         memory: 16Gi
                       limits:
                         cpu: 8
-                        memory: 16Gi
+                        memory: 32Gi
                     nodeSelector:
                       node.kubernetes.io/pool: system
                     extraArgs: ${APP_EXTRA_ARGS:-[]}
diff --git a/infrastructure/aws/values-aws-s3.yaml b/infrastructure/aws/values-aws-s3.yaml
index b79f2d1..9796126 100644
--- a/infrastructure/aws/values-aws-s3.yaml
+++ b/infrastructure/aws/values-aws-s3.yaml
@@ -63,9 +63,11 @@ ingress:
 
 kueue:
   # Max resources for a single job (rejects jobs that exceed this).
+  # 485Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus 1 GiB safety margin
+  # for customer-installed DaemonSets. Reduce further if you run heavier DS.
   maxJobResources:
     cpu: 62
-    memory: 500Gi
+    memory: 485Gi
   mode: dedicated
   # Node pools — controls where jobs run.
   # Must match your EKS node group labels and taints.
@@ -101,7 +103,7 @@ kueue:
         memory: 256Gi
       batch:
         cpu: 126
-        memory: 500Gi
+        memory: 485Gi
       gpu:
         gpu: 8
         cpu: 32
@@ -115,6 +117,6 @@ app:
       memory: 16Gi
     limits:
       cpu: 8
-      memory: 16Gi
+      memory: 32Gi
   nodeSelector:
     node.kubernetes.io/pool: system

From 98cfa523cdb801eae39cfd99cc70be7e06651403 Mon Sep 17 00:00:00 2001
From: Vladimir Antropov <vladimir.antropov@milaboratories.com>
Date: Tue, 26 May 2026 17:54:25 +0200
Subject: [PATCH 2/4] fix(aws): subtract EKS DaemonSet overhead, align per-job
 ceiling with GCP (484 GiB)

Reduces the per-node batch memory ceiling from 485 GiB to 484 GiB to subtract
~1 GiB of EKS-managed DaemonSet overhead (aws-node, kube-proxy, ebs-csi)
explicitly, matching the GCP per-job ceiling on n2d-highmem-64. Same
deployment_size label now means the same workload capacity on both clouds.

CloudFormation Mappings BatchMemoryGi:
  small   485 -> 484   (1 node)
  medium  970 -> 968   (2 nodes)
  large   1940 -> 1936 (4 nodes)
  xlarge  3880 -> 3872 (8 nodes)

kueue.maxJobResources.memory: 485Gi -> 484Gi (CFN-rendered + values-aws-s3.yaml)
kueue.dedicated.resources.batch.memory: 485Gi -> 484Gi (values-aws-s3.yaml)
---
 .../aws/cloudformation-eks-1-35.yaml          | 30 ++++++++++---------
 infrastructure/aws/values-aws-s3.yaml         | 10 ++++---
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/infrastructure/aws/cloudformation-eks-1-35.yaml b/infrastructure/aws/cloudformation-eks-1-35.yaml
index 2528401..7b72f23 100644
--- a/infrastructure/aws/cloudformation-eks-1-35.yaml
+++ b/infrastructure/aws/cloudformation-eks-1-35.yaml
@@ -456,7 +456,7 @@ Parameters:
     AllowedValues: [small, medium, large, xlarge]
     Description: >
       Cluster sizing profile. Controls node group scaling limits and Kueue quotas.
-      All deployment sizes support the same maximum single-job size (62 vCPU / 485Gi).
+      All deployment sizes support the same maximum single-job size (62 vCPU / 484Gi).
       Larger sizes allow more jobs to run simultaneously.
       GPU nodes (g6f/g6/g6e) scale from 0 — no cost when idle. Disable entirely with EnableGpu=false.
       6 GPU tiers: 3GB (g6f.xl, $0.24/hr), 6GB (g6f.2xl, $0.49/hr), 12GB (g6f.4xl, $0.98/hr),
@@ -539,16 +539,18 @@ Mappings:
 
   # Kueue ClusterQueue quotas and node group MaxSize per deployment size.
   # Quotas are derived from pool sizes (worst case: all jobs are max size).
-  # CPU: (64-vCPU node count) x 63 allocatable.  Memory: (r7i.16xl count) x 485Gi.
-  # 485Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus 1 GiB safety margin
-  # for customer-installed DaemonSets (Datadog, Falco, Wiz, etc.). Customers
-  # with heavier DS footprints should lower kueue.maxJobResources.memory.
+  # CPU: (64-vCPU node count) x 63 allocatable.  Memory: (r7i.16xl count) x 484Gi.
+  # 484Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus ~1 GiB EKS DaemonSet
+  # overhead (aws-node, kube-proxy, ebs-csi) minus 1 GiB safety margin for
+  # customer-installed DaemonSets (Datadog, Falco, Wiz, etc.). Matches the GCP
+  # per-job ceiling on n2d-highmem-64 so the same deployment_size label means
+  # the same workload capacity on both clouds.
   # UI quota is fixed at 64 vCPU / 256 GiB regardless of size.
   DeploymentSize:
     small:
-      # Kueue quotas (derived: 2x63=126 CPU, 1x485=485Gi)
+      # Kueue quotas (derived: 2x63=126 CPU, 1x484=484Gi)
       BatchCpu: '126'
-      BatchMemoryGi: '485'
+      BatchMemoryGi: '484'
       UiCpu: '64'
       UiMemoryGi: '256'
       # Node group MaxSize per pool
@@ -565,9 +567,9 @@ Mappings:
       MaxGpu96g: '1'
       MaxUi: '4'
     medium:
-      # Kueue quotas (derived: 4x63=252 CPU, 2x485=970Gi)
+      # Kueue quotas (derived: 4x63=252 CPU, 2x484=968Gi)
       BatchCpu: '252'
-      BatchMemoryGi: '970'
+      BatchMemoryGi: '968'
       UiCpu: '64'
       UiMemoryGi: '256'
       MaxBatch16c64g: '8'
@@ -583,9 +585,9 @@ Mappings:
       MaxGpu96g: '1'
       MaxUi: '8'
     large:
-      # Kueue quotas (derived: 8x63=504 CPU, 4x485=1940Gi)
+      # Kueue quotas (derived: 8x63=504 CPU, 4x484=1936Gi)
       BatchCpu: '504'
-      BatchMemoryGi: '1940'
+      BatchMemoryGi: '1936'
       UiCpu: '64'
       UiMemoryGi: '256'
       MaxBatch16c64g: '16'
@@ -601,9 +603,9 @@ Mappings:
       MaxGpu96g: '2'
       MaxUi: '16'
     xlarge:
-      # Kueue quotas (derived: 16x63=1008 CPU, 8x485=3880Gi)
+      # Kueue quotas (derived: 16x63=1008 CPU, 8x484=3872Gi)
       BatchCpu: '1008'
-      BatchMemoryGi: '3880'
+      BatchMemoryGi: '3872'
       UiCpu: '64'
       UiMemoryGi: '256'
       MaxBatch16c64g: '32'
@@ -3077,7 +3079,7 @@ Resources:
                   kueue:
                     maxJobResources:
                       cpu: 62
-                      memory: 485Gi
+                      memory: 484Gi
                     mode: dedicated
                     pools:
                       ui:
diff --git a/infrastructure/aws/values-aws-s3.yaml b/infrastructure/aws/values-aws-s3.yaml
index 9796126..63ea2b9 100644
--- a/infrastructure/aws/values-aws-s3.yaml
+++ b/infrastructure/aws/values-aws-s3.yaml
@@ -63,11 +63,13 @@ ingress:
 
 kueue:
   # Max resources for a single job (rejects jobs that exceed this).
-  # 485Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus 1 GiB safety margin
-  # for customer-installed DaemonSets. Reduce further if you run heavier DS.
+  # 484Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus ~1 GiB EKS DaemonSet
+  # overhead minus 1 GiB safety margin for customer-installed DaemonSets.
+  # Matches GCP per-job ceiling on n2d-highmem-64 so deployment_size means the
+  # same workload capacity on both clouds. Reduce further if you run heavier DS.
   maxJobResources:
     cpu: 62
-    memory: 485Gi
+    memory: 484Gi
   mode: dedicated
   # Node pools — controls where jobs run.
   # Must match your EKS node group labels and taints.
@@ -103,7 +105,7 @@ kueue:
         memory: 256Gi
       batch:
         cpu: 126
-        memory: 485Gi
+        memory: 484Gi
       gpu:
         gpu: 8
         cpu: 32

From c308988d81602a26c610fd6c742138d615a31373 Mon Sep 17 00:00:00 2001
From: Vladimir Antropov <vladimir.antropov@milaboratories.com>
Date: Tue, 26 May 2026 20:11:37 +0200
Subject: [PATCH 3/4] fix(aws): bump BuildSpecRevision to 3 and correct
 extraArgs.startup-taint (singular)

- BuildSpecRevision: 2 -> 3 forces CodeBuild to re-run on stack update,
  picking up the 484Gi Kueue values from the earlier commit on this
  branch (without this bump, existing stacks would keep the cached
  buildspec and never apply the new Kueue ceiling).
- Cluster Autoscaler --set arg: startup-taints -> startup-taint (CA
  flag is singular; the plural form was silently ignored, leaving GPU
  nodes without the intended startup taint).
---
 infrastructure/aws/cloudformation-eks-1-35.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/infrastructure/aws/cloudformation-eks-1-35.yaml b/infrastructure/aws/cloudformation-eks-1-35.yaml
index 7b72f23..6ba7be2 100644
--- a/infrastructure/aws/cloudformation-eks-1-35.yaml
+++ b/infrastructure/aws/cloudformation-eks-1-35.yaml
@@ -2545,7 +2545,7 @@ Resources:
                     --set extraArgs.max-node-group-backoff-duration=5m \
                     --set extraArgs.enable-provisioning-requests=true \
                     --set extraArgs.kube-api-content-type=application/json \
-                    --set extraArgs.startup-taints=nvidia.com/gpu-not-ready \
+                    --set extraArgs.startup-taint=nvidia.com/gpu-not-ready \
                     --atomic --timeout 5m
 
                   # Grant CA access to ProvisioningRequest and PodTemplate resources.
@@ -3332,7 +3332,7 @@ Resources:
       ExternalDnsVersion: '1.20.0'
       AlbControllerVersion: '3.0.0'
       # Bump for non-version buildspec changes (new flags, RBAC, install order)
-      BuildSpecRevision: '2'
+      BuildSpecRevision: '3'
 
   TriggerPlatformaDeploy:
     Type: AWS::CloudFormation::CustomResource

From a4a71e181bb860132f3d66bf1c419b2d5ef0bb40 Mon Sep 17 00:00:00 2001
From: Vladimir Antropov <vladimir.antropov@milaboratories.com>
Date: Tue, 26 May 2026 20:47:24 +0200
Subject: [PATCH 4/4] fix(aws): restore strict ClusterName pattern (lowercase,
 no underscores, max 25 chars)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sync from pl main commit 1f25f0cdd (review feedback): tighten the
ClusterName parameter to '^[a-z0-9][a-z0-9-]{0,24}$' to match the
constraints of derived resource names:

- ECR pull-through cache prefix (quay-${ClusterName}) must stay within
  AWS's 30-char limit (5 + 25 = 30) → 25-char ceiling on ClusterName.
- S3 bucket name (platforma-${ClusterName}-...) is S3-naming-rules
  bound → no underscores, no uppercase.

Regression on this branch: the loose pattern (alphanumeric +
underscores + uppercase, 1-100 chars) silently allowed names that
then break ECR/S3 downstream.
---
 infrastructure/aws/cloudformation-eks-1-35.yaml | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/infrastructure/aws/cloudformation-eks-1-35.yaml b/infrastructure/aws/cloudformation-eks-1-35.yaml
index 6ba7be2..9a76d8a 100644
--- a/infrastructure/aws/cloudformation-eks-1-35.yaml
+++ b/infrastructure/aws/cloudformation-eks-1-35.yaml
@@ -168,8 +168,13 @@ Parameters:
   ClusterName:
     Type: String
     Default: platforma-cluster
-    AllowedPattern: '^[a-zA-Z0-9][a-zA-Z0-9_-]{0,99}$'
-    ConstraintDescription: 'Alphanumeric, hyphens, underscores, 1-100 characters'
+    # Upper bound of 25 characters keeps the derived ECR pull-through cache
+    # prefix (`quay-${ClusterName}`) within AWS's 30-char limit. Lowercase
+    # alphanumeric and hyphens only — the auto-generated S3 bucket name
+    # (`platforma-${ClusterName}-...`) cannot contain underscores or
+    # uppercase, and S3 is the tightest naming constraint downstream.
+    AllowedPattern: '^[a-z0-9][a-z0-9-]{0,24}$'
+    ConstraintDescription: 'Lowercase alphanumeric, hyphens, 1-25 characters'
     Description: EKS cluster name
 
   PlatformaNamespace: