Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 26 additions & 16 deletions infrastructure/aws/cloudformation-eks-1-35.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,13 @@ Parameters:
ClusterName:
Type: String
Default: platforma-cluster
AllowedPattern: '^[a-zA-Z0-9][a-zA-Z0-9_-]{0,99}$'
ConstraintDescription: 'Alphanumeric, hyphens, underscores, 1-100 characters'
# Upper bound of 25 characters keeps the derived ECR pull-through cache
# prefix (`quay-${ClusterName}`, i.e. 5 + 25 = 30 characters at most) within
# AWS's 30-char limit. Lowercase alphanumeric and hyphens only — the
# auto-generated S3 bucket name (`platforma-${ClusterName}-...`) cannot
# contain underscores or uppercase, and S3 is the tightest naming
# constraint downstream.
AllowedPattern: '^[a-z0-9][a-z0-9-]{0,24}$'
ConstraintDescription: 'Lowercase alphanumeric, hyphens, 1-25 characters'
Description: EKS cluster name

PlatformaNamespace:
Expand Down Expand Up @@ -456,7 +461,7 @@ Parameters:
AllowedValues: [small, medium, large, xlarge]
Description: >
Cluster sizing profile. Controls node group scaling limits and Kueue quotas.
All deployment sizes support the same maximum single-job size (62 vCPU / 500Gi).
All deployment sizes support the same maximum single-job size (62 vCPU / 484Gi).
Larger sizes allow more jobs to run simultaneously.
GPU nodes (g6f/g6/g6e) scale from 0 — no cost when idle. Disable entirely with EnableGpu=false.
6 GPU tiers: 3GB (g6f.xl, $0.24/hr), 6GB (g6f.2xl, $0.49/hr), 12GB (g6f.4xl, $0.98/hr),
Expand Down Expand Up @@ -539,13 +544,18 @@ Mappings:

# Kueue ClusterQueue quotas and node group MaxSize per deployment size.
# Quotas are derived from pool sizes (worst case: all jobs are max size).
# CPU: (64-vCPU node count) x 63 allocatable. Memory: (r7i.16xl count) x 500Gi.
# CPU: (64-vCPU node count) x 63 allocatable. Memory: (r7i.16xl count) x 484Gi.
# 484Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus ~1 GiB EKS DaemonSet
# overhead (aws-node, kube-proxy, ebs-csi) minus 1 GiB safety margin for
# customer-installed DaemonSets (Datadog, Falco, Wiz, etc.). Matches the GCP
# per-job ceiling on n2d-highmem-64 so the same deployment_size label means
# the same workload capacity on both clouds.
# UI quota is fixed at 64 vCPU / 256 GiB regardless of size.
DeploymentSize:
small:
# Kueue quotas (derived: 2x63=126 CPU, 1x500=500Gi)
# Kueue quotas (derived: 2x63=126 CPU, 1x484=484Gi)
BatchCpu: '126'
BatchMemoryGi: '500'
BatchMemoryGi: '484'
UiCpu: '64'
UiMemoryGi: '256'
# Node group MaxSize per pool
Expand All @@ -562,9 +572,9 @@ Mappings:
MaxGpu96g: '1'
MaxUi: '4'
medium:
# Kueue quotas (derived: 4x63=252 CPU, 2x500=1000Gi)
# Kueue quotas (derived: 4x63=252 CPU, 2x484=968Gi)
BatchCpu: '252'
BatchMemoryGi: '1000'
BatchMemoryGi: '968'
UiCpu: '64'
UiMemoryGi: '256'
MaxBatch16c64g: '8'
Expand All @@ -580,9 +590,9 @@ Mappings:
MaxGpu96g: '1'
MaxUi: '8'
large:
# Kueue quotas (derived: 8x63=504 CPU, 4x500=2000Gi)
# Kueue quotas (derived: 8x63=504 CPU, 4x484=1936Gi)
BatchCpu: '504'
BatchMemoryGi: '2000'
BatchMemoryGi: '1936'
UiCpu: '64'
UiMemoryGi: '256'
MaxBatch16c64g: '16'
Expand All @@ -598,9 +608,9 @@ Mappings:
MaxGpu96g: '2'
MaxUi: '16'
xlarge:
# Kueue quotas (derived: 16x63=1008 CPU, 8x500=4000Gi)
# Kueue quotas (derived: 16x63=1008 CPU, 8x484=3872Gi)
BatchCpu: '1008'
BatchMemoryGi: '4000'
BatchMemoryGi: '3872'
UiCpu: '64'
UiMemoryGi: '256'
MaxBatch16c64g: '32'
Expand Down Expand Up @@ -2540,7 +2550,7 @@ Resources:
--set extraArgs.max-node-group-backoff-duration=5m \
--set extraArgs.enable-provisioning-requests=true \
--set extraArgs.kube-api-content-type=application/json \
--set extraArgs.startup-taints=nvidia.com/gpu-not-ready \
--set extraArgs.startup-taint=nvidia.com/gpu-not-ready \
--atomic --timeout 5m

# Grant CA access to ProvisioningRequest and PodTemplate resources.
Expand Down Expand Up @@ -3074,7 +3084,7 @@ Resources:
kueue:
maxJobResources:
cpu: 62
memory: 500Gi
memory: 484Gi
mode: dedicated
pools:
ui:
Expand Down Expand Up @@ -3118,7 +3128,7 @@ Resources:
memory: 16Gi
limits:
cpu: 8
memory: 16Gi
memory: 32Gi
nodeSelector:
node.kubernetes.io/pool: system
extraArgs: ${APP_EXTRA_ARGS:-[]}
Expand Down Expand Up @@ -3327,7 +3337,7 @@ Resources:
ExternalDnsVersion: '1.20.0'
AlbControllerVersion: '3.0.0'
# Bump for non-version buildspec changes (new flags, RBAC, install order)
BuildSpecRevision: '2'
BuildSpecRevision: '3'

TriggerPlatformaDeploy:
Type: AWS::CloudFormation::CustomResource
Expand Down
10 changes: 7 additions & 3 deletions infrastructure/aws/values-aws-s3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,13 @@ ingress:

kueue:
# Max resources for a single job (rejects jobs that exceed this).
# 484Gi = r7i.16xlarge EKS allocatable (~486 GiB) minus ~1 GiB EKS DaemonSet
# overhead minus 1 GiB safety margin for customer-installed DaemonSets.
# Matches GCP per-job ceiling on n2d-highmem-64 so deployment_size means the
# same workload capacity on both clouds. Reduce further if you run heavier
# DaemonSets.
maxJobResources:
cpu: 62
memory: 500Gi
memory: 484Gi
mode: dedicated
# Node pools — controls where jobs run.
# Must match your EKS node group labels and taints.
Expand Down Expand Up @@ -101,7 +105,7 @@ kueue:
memory: 256Gi
batch:
cpu: 126
memory: 500Gi
memory: 484Gi
gpu:
gpu: 8
cpu: 32
Expand All @@ -115,6 +119,6 @@ app:
memory: 16Gi
limits:
cpu: 8
memory: 16Gi
memory: 32Gi
nodeSelector:
node.kubernetes.io/pool: system
Loading