diff --git a/infrastructure/aws/cloudformation-eks-1-35.yaml b/infrastructure/aws/cloudformation-eks-1-35.yaml index 6c26b11..9a76d8a 100644 --- a/infrastructure/aws/cloudformation-eks-1-35.yaml +++ b/infrastructure/aws/cloudformation-eks-1-35.yaml @@ -168,8 +168,13 @@ Parameters: ClusterName: Type: String Default: platforma-cluster - AllowedPattern: '^[a-zA-Z0-9][a-zA-Z0-9_-]{0,99}$' - ConstraintDescription: 'Alphanumeric, hyphens, underscores, 1-100 characters' + # Upper bound of 25 characters keeps the derived ECR pull-through cache + # prefix (`quay-${ClusterName}`) within AWS's 30-char limit. Lowercase + # alphanumeric and hyphens only — the auto-generated S3 bucket name + # (`platforma-${ClusterName}-...`) cannot contain underscores or + # uppercase, and S3 is the tightest character-set constraint downstream. + AllowedPattern: '^[a-z0-9][a-z0-9-]{0,24}$' + ConstraintDescription: 'Lowercase alphanumeric, hyphens, 1-25 characters' Description: EKS cluster name PlatformaNamespace: @@ -456,7 +461,7 @@ Parameters: AllowedValues: [small, medium, large, xlarge] Description: > Cluster sizing profile. Controls node group scaling limits and Kueue quotas. - All deployment sizes support the same maximum single-job size (62 vCPU / 500Gi). + All deployment sizes support the same maximum single-job size (62 vCPU / 484Gi). Larger sizes allow more jobs to run simultaneously. GPU nodes (g6f/g6/g6e) scale from 0 — no cost when idle. Disable entirely with EnableGpu=false. 6 GPU tiers: 3GB (g6f.xl, $0.24/hr), 6GB (g6f.2xl, $0.49/hr), 12GB (g6f.4xl, $0.98/hr), @@ -539,13 +544,18 @@ Mappings: # Kueue ClusterQueue quotas and node group MaxSize per deployment size. # Quotas are derived from pool sizes (worst case: all jobs are max size). - # CPU: (64-vCPU node count) x 63 allocatable. Memory: (r7i.16xl count) x 500Gi. + # CPU: (64-vCPU node count) x 63 allocatable. Memory: (r7i.16xl count) x 484Gi. 
+ # 484Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus ~1 GiB EKS DaemonSet + # overhead (aws-node, kube-proxy, ebs-csi) minus 1 GiB safety margin for + # customer-installed DaemonSets (Datadog, Falco, Wiz, etc.). Matches the GCP + # per-job ceiling on n2d-highmem-64 so the same deployment_size label means + # the same workload capacity on both clouds. # UI quota is fixed at 64 vCPU / 256 GiB regardless of size. DeploymentSize: small: - # Kueue quotas (derived: 2x63=126 CPU, 1x500=500Gi) + # Kueue quotas (derived: 2x63=126 CPU, 1x484=484Gi) BatchCpu: '126' - BatchMemoryGi: '500' + BatchMemoryGi: '484' UiCpu: '64' UiMemoryGi: '256' # Node group MaxSize per pool @@ -562,9 +572,9 @@ Mappings: MaxGpu96g: '1' MaxUi: '4' medium: - # Kueue quotas (derived: 4x63=252 CPU, 2x500=1000Gi) + # Kueue quotas (derived: 4x63=252 CPU, 2x484=968Gi) BatchCpu: '252' - BatchMemoryGi: '1000' + BatchMemoryGi: '968' UiCpu: '64' UiMemoryGi: '256' MaxBatch16c64g: '8' @@ -580,9 +590,9 @@ Mappings: MaxGpu96g: '1' MaxUi: '8' large: - # Kueue quotas (derived: 8x63=504 CPU, 4x500=2000Gi) + # Kueue quotas (derived: 8x63=504 CPU, 4x484=1936Gi) BatchCpu: '504' - BatchMemoryGi: '2000' + BatchMemoryGi: '1936' UiCpu: '64' UiMemoryGi: '256' MaxBatch16c64g: '16' @@ -598,9 +608,9 @@ Mappings: MaxGpu96g: '2' MaxUi: '16' xlarge: - # Kueue quotas (derived: 16x63=1008 CPU, 8x500=4000Gi) + # Kueue quotas (derived: 16x63=1008 CPU, 8x484=3872Gi) BatchCpu: '1008' - BatchMemoryGi: '4000' + BatchMemoryGi: '3872' UiCpu: '64' UiMemoryGi: '256' MaxBatch16c64g: '32' @@ -2540,7 +2550,7 @@ Resources: --set extraArgs.max-node-group-backoff-duration=5m \ --set extraArgs.enable-provisioning-requests=true \ --set extraArgs.kube-api-content-type=application/json \ - --set extraArgs.startup-taints=nvidia.com/gpu-not-ready \ + --set extraArgs.startup-taint=nvidia.com/gpu-not-ready \ --atomic --timeout 5m # Grant CA access to ProvisioningRequest and PodTemplate resources. 
@@ -3074,7 +3084,7 @@ Resources: kueue: maxJobResources: cpu: 62 - memory: 500Gi + memory: 484Gi mode: dedicated pools: ui: @@ -3118,7 +3128,7 @@ Resources: memory: 16Gi limits: cpu: 8 - memory: 16Gi + memory: 32Gi nodeSelector: node.kubernetes.io/pool: system extraArgs: ${APP_EXTRA_ARGS:-[]} @@ -3327,7 +3337,7 @@ Resources: ExternalDnsVersion: '1.20.0' AlbControllerVersion: '3.0.0' # Bump for non-version buildspec changes (new flags, RBAC, install order) - BuildSpecRevision: '2' + BuildSpecRevision: '3' TriggerPlatformaDeploy: Type: AWS::CloudFormation::CustomResource diff --git a/infrastructure/aws/values-aws-s3.yaml b/infrastructure/aws/values-aws-s3.yaml index b79f2d1..63ea2b9 100644 --- a/infrastructure/aws/values-aws-s3.yaml +++ b/infrastructure/aws/values-aws-s3.yaml @@ -63,9 +63,13 @@ ingress: kueue: # Max resources for a single job (rejects jobs that exceed this). + # 484Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus ~1 GiB EKS DaemonSet + # overhead minus 1 GiB safety margin for customer-installed DaemonSets. + # Matches GCP per-job ceiling on n2d-highmem-64 so deployment_size means the + # same workload capacity on both clouds. Reduce further if you run heavier DS. maxJobResources: cpu: 62 - memory: 500Gi + memory: 484Gi mode: dedicated # Node pools — controls where jobs run. # Must match your EKS node group labels and taints. @@ -101,7 +105,7 @@ kueue: memory: 256Gi batch: cpu: 126 - memory: 500Gi + memory: 484Gi gpu: gpu: 8 cpu: 32 @@ -115,6 +119,6 @@ app: memory: 16Gi limits: cpu: 8 - memory: 16Gi + memory: 32Gi nodeSelector: node.kubernetes.io/pool: system