From 4b9acbc0dc074730f4a8547b44ac26cd74ff93da Mon Sep 17 00:00:00 2001 From: Vladimir Antropov Date: Tue, 26 May 2026 13:53:00 +0200 Subject: [PATCH 1/4] fix(aws): align Kueue batch memory with EKS allocatable plus safety margin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Batch jobs requesting the previous 500Gi per-job ceiling cannot schedule on a real r7i.16xlarge node — after EKS kubelet/OS reservations the node's allocatable memory is only ~486 GiB, ~14 GiB below that ceiling. Subtracting an additional 1 GiB safety margin for customer-installed DaemonSets (Datadog, Falco, Wiz, etc.) gives an effective per-node ceiling of 485 GiB. Changes: - CloudFormation Mappings BatchMemoryGi (per-node × max-batch-nodes): small 500 -> 485 (1 node) medium 1000 -> 970 (2 nodes) large 2000 -> 1940 (4 nodes) xlarge 4000 -> 3880 (8 nodes) - kueue.maxJobResources.memory: 500Gi -> 485Gi (both CFN-rendered values and values-aws-s3.yaml) - kueue.dedicated.resources.batch.memory: 500Gi -> 485Gi (values-aws-s3.yaml) - Parameter description and Mappings header comment updated. Also bumps platforma controller memory limit from 16Gi to 32Gi (matches the request:limit ratio used on GCP) so the controller has burst headroom under heavy workflow scheduling. Memory request stays at 16Gi. This combines two related fixes: aligning the per-job ceiling with what EKS can actually schedule, and reserving headroom for customer DaemonSets so the chart's defaults work out of the box even with extra agents. 
--- .../aws/cloudformation-eks-1-35.yaml | 27 ++++++++++--------- infrastructure/aws/values-aws-s3.yaml | 8 +++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/infrastructure/aws/cloudformation-eks-1-35.yaml b/infrastructure/aws/cloudformation-eks-1-35.yaml index 6c26b11..2528401 100644 --- a/infrastructure/aws/cloudformation-eks-1-35.yaml +++ b/infrastructure/aws/cloudformation-eks-1-35.yaml @@ -456,7 +456,7 @@ Parameters: AllowedValues: [small, medium, large, xlarge] Description: > Cluster sizing profile. Controls node group scaling limits and Kueue quotas. - All deployment sizes support the same maximum single-job size (62 vCPU / 500Gi). + All deployment sizes support the same maximum single-job size (62 vCPU / 485Gi). Larger sizes allow more jobs to run simultaneously. GPU nodes (g6f/g6/g6e) scale from 0 — no cost when idle. Disable entirely with EnableGpu=false. 6 GPU tiers: 3GB (g6f.xl, $0.24/hr), 6GB (g6f.2xl, $0.49/hr), 12GB (g6f.4xl, $0.98/hr), @@ -539,13 +539,16 @@ Mappings: # Kueue ClusterQueue quotas and node group MaxSize per deployment size. # Quotas are derived from pool sizes (worst case: all jobs are max size). - # CPU: (64-vCPU node count) x 63 allocatable. Memory: (r7i.16xl count) x 500Gi. + # CPU: (64-vCPU node count) x 63 allocatable. Memory: (r7i.16xl count) x 485Gi. + # 485Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus 1 GiB safety margin + # for customer-installed DaemonSets (Datadog, Falco, Wiz, etc.). Customers + # with heavier DS footprints should lower kueue.maxJobResources.memory. # UI quota is fixed at 64 vCPU / 256 GiB regardless of size. 
DeploymentSize: small: - # Kueue quotas (derived: 2x63=126 CPU, 1x500=500Gi) + # Kueue quotas (derived: 2x63=126 CPU, 1x485=485Gi) BatchCpu: '126' - BatchMemoryGi: '500' + BatchMemoryGi: '485' UiCpu: '64' UiMemoryGi: '256' # Node group MaxSize per pool @@ -562,9 +565,9 @@ Mappings: MaxGpu96g: '1' MaxUi: '4' medium: - # Kueue quotas (derived: 4x63=252 CPU, 2x500=1000Gi) + # Kueue quotas (derived: 4x63=252 CPU, 2x485=970Gi) BatchCpu: '252' - BatchMemoryGi: '1000' + BatchMemoryGi: '970' UiCpu: '64' UiMemoryGi: '256' MaxBatch16c64g: '8' @@ -580,9 +583,9 @@ Mappings: MaxGpu96g: '1' MaxUi: '8' large: - # Kueue quotas (derived: 8x63=504 CPU, 4x500=2000Gi) + # Kueue quotas (derived: 8x63=504 CPU, 4x485=1940Gi) BatchCpu: '504' - BatchMemoryGi: '2000' + BatchMemoryGi: '1940' UiCpu: '64' UiMemoryGi: '256' MaxBatch16c64g: '16' @@ -598,9 +601,9 @@ Mappings: MaxGpu96g: '2' MaxUi: '16' xlarge: - # Kueue quotas (derived: 16x63=1008 CPU, 8x500=4000Gi) + # Kueue quotas (derived: 16x63=1008 CPU, 8x485=3880Gi) BatchCpu: '1008' - BatchMemoryGi: '4000' + BatchMemoryGi: '3880' UiCpu: '64' UiMemoryGi: '256' MaxBatch16c64g: '32' @@ -3074,7 +3077,7 @@ Resources: kueue: maxJobResources: cpu: 62 - memory: 500Gi + memory: 485Gi mode: dedicated pools: ui: @@ -3118,7 +3121,7 @@ Resources: memory: 16Gi limits: cpu: 8 - memory: 16Gi + memory: 32Gi nodeSelector: node.kubernetes.io/pool: system extraArgs: ${APP_EXTRA_ARGS:-[]} diff --git a/infrastructure/aws/values-aws-s3.yaml b/infrastructure/aws/values-aws-s3.yaml index b79f2d1..9796126 100644 --- a/infrastructure/aws/values-aws-s3.yaml +++ b/infrastructure/aws/values-aws-s3.yaml @@ -63,9 +63,11 @@ ingress: kueue: # Max resources for a single job (rejects jobs that exceed this). + # 485Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus 1 GiB safety margin + # for customer-installed DaemonSets. Reduce further if you run heavier DS. maxJobResources: cpu: 62 - memory: 500Gi + memory: 485Gi mode: dedicated # Node pools — controls where jobs run. 
# Must match your EKS node group labels and taints. @@ -101,7 +103,7 @@ kueue: memory: 256Gi batch: cpu: 126 - memory: 500Gi + memory: 485Gi gpu: gpu: 8 cpu: 32 @@ -115,6 +117,6 @@ app: memory: 16Gi limits: cpu: 8 - memory: 16Gi + memory: 32Gi nodeSelector: node.kubernetes.io/pool: system From 98cfa523cdb801eae39cfd99cc70be7e06651403 Mon Sep 17 00:00:00 2001 From: Vladimir Antropov Date: Tue, 26 May 2026 17:54:25 +0200 Subject: [PATCH 2/4] fix(aws): subtract EKS DaemonSet overhead, align per-job ceiling with GCP (484 GiB) Reduces the per-node batch memory ceiling from 485 GiB to 484 GiB to subtract ~1 GiB of EKS-managed DaemonSet overhead (aws-node, kube-proxy, ebs-csi) explicitly, matching the GCP per-job ceiling on n2d-highmem-64. Same deployment_size label now means the same workload capacity on both clouds. CloudFormation Mappings BatchMemoryGi: small 485 -> 484 (1 node) medium 970 -> 968 (2 nodes) large 1940 -> 1936 (4 nodes) xlarge 3880 -> 3872 (8 nodes) kueue.maxJobResources.memory: 485Gi -> 484Gi (CFN-rendered + values-aws-s3.yaml) kueue.dedicated.resources.batch.memory: 485Gi -> 484Gi (values-aws-s3.yaml) --- .../aws/cloudformation-eks-1-35.yaml | 30 ++++++++++--------- infrastructure/aws/values-aws-s3.yaml | 10 ++++--- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/infrastructure/aws/cloudformation-eks-1-35.yaml b/infrastructure/aws/cloudformation-eks-1-35.yaml index 2528401..7b72f23 100644 --- a/infrastructure/aws/cloudformation-eks-1-35.yaml +++ b/infrastructure/aws/cloudformation-eks-1-35.yaml @@ -456,7 +456,7 @@ Parameters: AllowedValues: [small, medium, large, xlarge] Description: > Cluster sizing profile. Controls node group scaling limits and Kueue quotas. - All deployment sizes support the same maximum single-job size (62 vCPU / 485Gi). + All deployment sizes support the same maximum single-job size (62 vCPU / 484Gi). Larger sizes allow more jobs to run simultaneously. GPU nodes (g6f/g6/g6e) scale from 0 — no cost when idle. 
Disable entirely with EnableGpu=false. 6 GPU tiers: 3GB (g6f.xl, $0.24/hr), 6GB (g6f.2xl, $0.49/hr), 12GB (g6f.4xl, $0.98/hr), @@ -539,16 +539,18 @@ Mappings: # Kueue ClusterQueue quotas and node group MaxSize per deployment size. # Quotas are derived from pool sizes (worst case: all jobs are max size). - # CPU: (64-vCPU node count) x 63 allocatable. Memory: (r7i.16xl count) x 485Gi. - # 485Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus 1 GiB safety margin - # for customer-installed DaemonSets (Datadog, Falco, Wiz, etc.). Customers - # with heavier DS footprints should lower kueue.maxJobResources.memory. + # CPU: (64-vCPU node count) x 63 allocatable. Memory: (r7i.16xl count) x 484Gi. + # 484Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus ~1 GiB EKS DaemonSet + # overhead (aws-node, kube-proxy, ebs-csi) minus 1 GiB safety margin for + # customer-installed DaemonSets (Datadog, Falco, Wiz, etc.). Matches the GCP + # per-job ceiling on n2d-highmem-64 so the same deployment_size label means + # the same workload capacity on both clouds. # UI quota is fixed at 64 vCPU / 256 GiB regardless of size. 
DeploymentSize: small: - # Kueue quotas (derived: 2x63=126 CPU, 1x485=485Gi) + # Kueue quotas (derived: 2x63=126 CPU, 1x484=484Gi) BatchCpu: '126' - BatchMemoryGi: '485' + BatchMemoryGi: '484' UiCpu: '64' UiMemoryGi: '256' # Node group MaxSize per pool @@ -565,9 +567,9 @@ Mappings: MaxGpu96g: '1' MaxUi: '4' medium: - # Kueue quotas (derived: 4x63=252 CPU, 2x485=970Gi) + # Kueue quotas (derived: 4x63=252 CPU, 2x484=968Gi) BatchCpu: '252' - BatchMemoryGi: '970' + BatchMemoryGi: '968' UiCpu: '64' UiMemoryGi: '256' MaxBatch16c64g: '8' @@ -583,9 +585,9 @@ Mappings: MaxGpu96g: '1' MaxUi: '8' large: - # Kueue quotas (derived: 8x63=504 CPU, 4x485=1940Gi) + # Kueue quotas (derived: 8x63=504 CPU, 4x484=1936Gi) BatchCpu: '504' - BatchMemoryGi: '1940' + BatchMemoryGi: '1936' UiCpu: '64' UiMemoryGi: '256' MaxBatch16c64g: '16' @@ -601,9 +603,9 @@ Mappings: MaxGpu96g: '2' MaxUi: '16' xlarge: - # Kueue quotas (derived: 16x63=1008 CPU, 8x485=3880Gi) + # Kueue quotas (derived: 16x63=1008 CPU, 8x484=3872Gi) BatchCpu: '1008' - BatchMemoryGi: '3880' + BatchMemoryGi: '3872' UiCpu: '64' UiMemoryGi: '256' MaxBatch16c64g: '32' @@ -3077,7 +3079,7 @@ Resources: kueue: maxJobResources: cpu: 62 - memory: 485Gi + memory: 484Gi mode: dedicated pools: ui: diff --git a/infrastructure/aws/values-aws-s3.yaml b/infrastructure/aws/values-aws-s3.yaml index 9796126..63ea2b9 100644 --- a/infrastructure/aws/values-aws-s3.yaml +++ b/infrastructure/aws/values-aws-s3.yaml @@ -63,11 +63,13 @@ ingress: kueue: # Max resources for a single job (rejects jobs that exceed this). - # 485Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus 1 GiB safety margin - # for customer-installed DaemonSets. Reduce further if you run heavier DS. + # 484Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus ~1 GiB EKS DaemonSet + # overhead minus 1 GiB safety margin for customer-installed DaemonSets. + # Matches GCP per-job ceiling on n2d-highmem-64 so deployment_size means the + # same workload capacity on both clouds. 
Reduce further if you run heavier DS. maxJobResources: cpu: 62 - memory: 485Gi + memory: 484Gi mode: dedicated # Node pools — controls where jobs run. # Must match your EKS node group labels and taints. @@ -103,7 +105,7 @@ kueue: memory: 256Gi batch: cpu: 126 - memory: 485Gi + memory: 484Gi gpu: gpu: 8 cpu: 32 From c308988d81602a26c610fd6c742138d615a31373 Mon Sep 17 00:00:00 2001 From: Vladimir Antropov Date: Tue, 26 May 2026 20:11:37 +0200 Subject: [PATCH 3/4] fix(aws): bump BuildSpecRevision to 3 and correct extraArgs.startup-taint (singular) - BuildSpecRevision: 2 -> 3 forces CodeBuild to re-run on stack update, picking up the 484Gi Kueue values from the earlier commit on this branch (without this bump, existing stacks would keep the cached buildspec and never apply the new Kueue ceiling). - Cluster Autoscaler --set arg: startup-taints -> startup-taint (the CA flag is singular; the plural form was silently ignored, so Cluster Autoscaler was never told to treat nvidia.com/gpu-not-ready as a startup taint on GPU nodes). --- infrastructure/aws/cloudformation-eks-1-35.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infrastructure/aws/cloudformation-eks-1-35.yaml b/infrastructure/aws/cloudformation-eks-1-35.yaml index 7b72f23..6ba7be2 100644 --- a/infrastructure/aws/cloudformation-eks-1-35.yaml +++ b/infrastructure/aws/cloudformation-eks-1-35.yaml @@ -2545,7 +2545,7 @@ Resources: --set extraArgs.max-node-group-backoff-duration=5m \ --set extraArgs.enable-provisioning-requests=true \ --set extraArgs.kube-api-content-type=application/json \ - --set extraArgs.startup-taints=nvidia.com/gpu-not-ready \ + --set extraArgs.startup-taint=nvidia.com/gpu-not-ready \ --atomic --timeout 5m # Grant CA access to ProvisioningRequest and PodTemplate resources. 
@@ -3332,7 +3332,7 @@ Resources: ExternalDnsVersion: '1.20.0' AlbControllerVersion: '3.0.0' # Bump for non-version buildspec changes (new flags, RBAC, install order) - BuildSpecRevision: '2' + BuildSpecRevision: '3' TriggerPlatformaDeploy: Type: AWS::CloudFormation::CustomResource From a4a71e181bb860132f3d66bf1c419b2d5ef0bb40 Mon Sep 17 00:00:00 2001 From: Vladimir Antropov Date: Tue, 26 May 2026 20:47:24 +0200 Subject: [PATCH 4/4] fix(aws): restore strict ClusterName pattern (lowercase, no underscores, max 25 chars) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sync from pl main commit 1f25f0cdd (review feedback): tighten the ClusterName parameter to '^[a-z0-9][a-z0-9-]{0,24}$' to match the constraints of derived resource names: - ECR pull-through cache prefix (quay-${ClusterName}) must stay within AWS's 30-char limit → 25-char ceiling on ClusterName. - S3 bucket name (platforma-${ClusterName}-...) must follow S3 bucket naming rules → no underscores, no uppercase. Regression on this branch: the loose pattern (mixed-case alphanumeric, hyphens, underscores, 1-100 chars) silently allowed names that then break ECR/S3 downstream. --- infrastructure/aws/cloudformation-eks-1-35.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/infrastructure/aws/cloudformation-eks-1-35.yaml b/infrastructure/aws/cloudformation-eks-1-35.yaml index 6ba7be2..9a76d8a 100644 --- a/infrastructure/aws/cloudformation-eks-1-35.yaml +++ b/infrastructure/aws/cloudformation-eks-1-35.yaml @@ -168,8 +168,13 @@ Parameters: ClusterName: Type: String Default: platforma-cluster - AllowedPattern: '^[a-zA-Z0-9][a-zA-Z0-9_-]{0,99}$' - ConstraintDescription: 'Alphanumeric, hyphens, underscores, 1-100 characters' + # Upper bound of 25 characters keeps the derived ECR pull-through cache + # prefix (`quay-${ClusterName}`) within AWS's 30-char limit. 
Lowercase + # alphanumeric and hyphens only — the auto-generated S3 bucket name + # (`platforma-${ClusterName}-...`) cannot contain underscores or + # uppercase, and S3 is the tightest character-set constraint downstream. + AllowedPattern: '^[a-z0-9][a-z0-9-]{0,24}$' + ConstraintDescription: 'Lowercase alphanumeric, hyphens, 1-25 characters' Description: EKS cluster name PlatformaNamespace: