Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 63 additions & 37 deletions pkg/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,65 @@ var HypershiftSupportedVersions = HypershiftSupportedVersionsType{}
var reBranchVersion = regexp.MustCompile(`^(openshift-|release-)(\d+\.\d+)$`)
var reMajorMinorVersion = regexp.MustCompile(`^(\d+)\.(\d+)$`)

// platformQuotaSlices maps each cloud platform to its available quota-slice
// accounts. The first entry is the primary (default) account. Subsequent entries
// are alternates that can be selected when they have more free resources.
var platformQuotaSlices = map[string][]CloudAccountProfile{
"aws": {
{QuotaSlice: "aws-quota-slice"},
{
QuotaSlice: "aws-2-quota-slice",
ProfileName: "aws-2",
ProfileSecret: "cluster-secrets-aws-2",
AccountDomain: "aws-2.ci.openshift.org",
},
},
"azure": {
{QuotaSlice: "azure4-quota-slice"},
{
QuotaSlice: "azure-2-quota-slice",
ProfileName: "azure-2",
ProfileSecret: "cluster-secrets-azure-2",
AccountDomain: "ci2.azure.devcluster.openshift.com",
},
},
"gcp": {
{QuotaSlice: "gcp-quota-slice"},
{
QuotaSlice: "gcp-openshift-gce-devel-ci-2-quota-slice",
ProfileName: "gcp-openshift-gce-devel-ci-2",
ProfileSecret: "cluster-secrets-gcp-openshift-gce-devel-ci-2",
},
},
}

// selectCloudAccountProfile queries Boskos metrics for each quota-slice
// candidate for the given platform and returns the profile with the most free
// resources. Returns nil if the platform has no configured accounts or if the
// primary (index 0) has the most free resources (no conversion needed).
func selectCloudAccountProfile(platform string, lClient LeaseClient) (*CloudAccountProfile, error) {
accounts, ok := platformQuotaSlices[platform]
if !ok || len(accounts) < 2 {
return nil, nil
}
bestIdx := 0
bestFree := -1
for i := range accounts {
metrics, err := lClient.Metrics(accounts[i].QuotaSlice)
if err != nil {
return nil, fmt.Errorf("failed to get metrics for %q leases: %v", accounts[i].QuotaSlice, err)
Comment on lines +154 to +156
Copy link
Copy Markdown

@coderabbitai coderabbitai Bot Mar 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Fall back to the primary account when Boskos metrics fail.

This is a best-effort balancing check. Returning an error here means a transient Boskos issue blocks every amd64 aws / azure / gcp launch instead of just using the default account.

Suggested fix
 		metrics, err := lClient.Metrics(accounts[i].QuotaSlice)
 		if err != nil {
-			return nil, fmt.Errorf("failed to get metrics for %q leases: %v", accounts[i].QuotaSlice, err)
+			klog.Warningf("failed to get metrics for %q leases, using primary account: %v", accounts[i].QuotaSlice, err)
+			return nil, nil
 		}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
metrics, err := lClient.Metrics(accounts[i].QuotaSlice)
if err != nil {
return nil, fmt.Errorf("failed to get metrics for %q leases: %v", accounts[i].QuotaSlice, err)
metrics, err := lClient.Metrics(accounts[i].QuotaSlice)
if err != nil {
klog.Warningf("failed to get metrics for %q leases, using primary account: %v", accounts[i].QuotaSlice, err)
return nil, nil
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/manager/manager.go` around lines 154 - 156, The current code returns an
error when lClient.Metrics(accounts[i].QuotaSlice) fails, which aborts
scheduling; change this to log the error and fall back to using the
primary/default account instead of returning nil. Specifically, in the block
that calls lClient.Metrics for accounts[i].QuotaSlice, catch the error, emit a
warning (including the error) and set metrics to the primary account's metrics
(or mark this candidate as using the primary quota) so the caller can continue;
do this change around the lClient.Metrics call and its error handling to avoid
returning fmt.Errorf and instead continue with a fallback metrics value.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@thiagoalessio I like the idea of falling back to the "default" account (i.e. the zero [0] item) instead of failing outright... We might as well make an attempt at getting a cluster through if/when Boskos isn't available.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like the humans are having a chat. I'll hop back into my burrow for now. If you need me again, just tag @coderabbitai in a new comment, and I'll come hopping out!

}
if metrics.Free > bestFree {
bestIdx = i
bestFree = metrics.Free
}
}
if bestIdx == 0 {
return nil, nil
}
return &accounts[bestIdx], nil
}

func (j Job) IsComplete() bool {
return j.Complete || len(j.Credentials) > 0 || (len(j.State) > 0 && j.State != prowapiv1.PendingState)
}
Expand Down Expand Up @@ -2125,44 +2184,11 @@ func (m *jobManager) LaunchJobForUser(req *JobRequest) (string, error) {

// check what leases are available for platform
if req.Architecture == "amd64" && m.lClient != nil {
switch req.Platform {
case "aws":
metrics1, err := m.lClient.Metrics("aws-quota-slice")
if err != nil {
return "", fmt.Errorf("failed to get metrics for `aws` leases: %v", err)
}
metrics2, err := m.lClient.Metrics("aws-2-quota-slice")
if err != nil {
return "", fmt.Errorf("failed to get metrics for `aws-2` leases: %v", err)
}
if metrics2.Free > metrics1.Free {
job.UseSecondaryAccount = true
}
case "azure":
metrics1, err := m.lClient.Metrics("azure4-quota-slice")
if err != nil {
return "", fmt.Errorf("failed to get metrics for `azure` leases: %v", err)
}
metrics2, err := m.lClient.Metrics("azure-2-quota-slice")
if err != nil {
return "", fmt.Errorf("failed to get metrics for `azure-2` leases: %v", err)
}
if metrics2.Free > metrics1.Free {
job.UseSecondaryAccount = true
}
case "gcp":
metrics1, err := m.lClient.Metrics("gcp-quota-slice")
if err != nil {
return "", fmt.Errorf("failed to get metrics for `gcp` leases: %v", err)
}
metrics2, err := m.lClient.Metrics("gcp-openshift-gce-devel-ci-2-quota-slice")
if err != nil {
return "", fmt.Errorf("failed to get metrics for `gcp-openshift-gce-devel-ci-2` leases: %v", err)
}
if metrics2.Free > metrics1.Free {
job.UseSecondaryAccount = true
}
profile, err := selectCloudAccountProfile(req.Platform, m.lClient)
if err != nil {
return "", err
}
job.CloudAccountProfile = profile
}

msg, err := func() (string, error) {
Expand Down
120 changes: 119 additions & 1 deletion pkg/manager/manager_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,124 @@
package manager

import "testing"
import (
"fmt"
"testing"

"github.com/openshift/ci-tools/pkg/lease"
)

type mockLeaseClient struct {
metrics map[string]lease.Metrics
}

func (m *mockLeaseClient) Metrics(rtype string) (lease.Metrics, error) {
if metrics, ok := m.metrics[rtype]; ok {
return metrics, nil
}
return lease.Metrics{}, fmt.Errorf("resource type %q not found", rtype)
}

func Test_selectCloudAccountProfile(t *testing.T) {
tests := []struct {
name string
platform string
metrics map[string]lease.Metrics
wantNil bool
wantProfile string
wantErr bool
}{
{
name: "platform not in map returns nil",
platform: "metal",
metrics: map[string]lease.Metrics{},
wantNil: true,
},
{
name: "primary has more free resources returns nil",
platform: "aws",
metrics: map[string]lease.Metrics{
"aws-quota-slice": {Free: 10, Leased: 5},
"aws-2-quota-slice": {Free: 3, Leased: 12},
},
wantNil: true,
},
{
name: "secondary has more free resources returns that profile",
platform: "aws",
metrics: map[string]lease.Metrics{
"aws-quota-slice": {Free: 2, Leased: 13},
"aws-2-quota-slice": {Free: 8, Leased: 7},
},
wantProfile: "aws-2",
},
{
name: "equal free counts returns nil (primary wins)",
platform: "aws",
metrics: map[string]lease.Metrics{
"aws-quota-slice": {Free: 5, Leased: 10},
"aws-2-quota-slice": {Free: 5, Leased: 10},
},
wantNil: true,
},
{
name: "all zero free returns nil",
platform: "aws",
metrics: map[string]lease.Metrics{
"aws-quota-slice": {Free: 0, Leased: 15},
"aws-2-quota-slice": {Free: 0, Leased: 15},
},
wantNil: true,
},
{
name: "gcp secondary wins",
platform: "gcp",
metrics: map[string]lease.Metrics{
"gcp-quota-slice": {Free: 1, Leased: 14},
"gcp-openshift-gce-devel-ci-2-quota-slice": {Free: 7, Leased: 8},
},
wantProfile: "gcp-openshift-gce-devel-ci-2",
},
{
name: "azure secondary wins",
platform: "azure",
metrics: map[string]lease.Metrics{
"azure4-quota-slice": {Free: 3, Leased: 12},
"azure-2-quota-slice": {Free: 9, Leased: 6},
},
wantProfile: "azure-2",
},
{
name: "metrics error returns error",
platform: "aws",
metrics: map[string]lease.Metrics{},
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
client := &mockLeaseClient{metrics: tt.metrics}
got, err := selectCloudAccountProfile(tt.platform, client)
if (err != nil) != tt.wantErr {
t.Errorf("selectCloudAccountProfile() error = %v, wantErr %v", err, tt.wantErr)
return
}
if tt.wantErr {
return
}
if tt.wantNil && got != nil {
t.Errorf("selectCloudAccountProfile() = %+v, want nil", got)
return
}
if !tt.wantNil && got == nil {
t.Errorf("selectCloudAccountProfile() = nil, want profile %q", tt.wantProfile)
return
}
if !tt.wantNil && got.ProfileName != tt.wantProfile {
t.Errorf("selectCloudAccountProfile() ProfileName = %q, want %q", got.ProfileName, tt.wantProfile)
}
})
}
}

func Test_containsValidVersion(t *testing.T) {
type args struct {
Expand Down
34 changes: 6 additions & 28 deletions pkg/manager/prow.go
Original file line number Diff line number Diff line change
Expand Up @@ -562,21 +562,11 @@ func (m *jobManager) newJob(job *Job) (string, error) {
}
}

// if a step based config, launch should now be the test config we will run; time to update the config for lease balancing
if job.UseSecondaryAccount {
switch job.Platform {
case "aws":
if err := convertAWSToAWS2(pj, sourceConfig); err != nil {
return "", fmt.Errorf("failed updating aws job to aws-2: %w", err)
}
case "gcp":
if err := convertGCPToGCP2(pj, sourceConfig); err != nil {
return "", fmt.Errorf("failed updating gcp job to gcp-openshift-gce-devel-ci-2: %w", err)
}
case "azure":
if err := convertAzureToAzure2(pj, sourceConfig); err != nil {
return "", fmt.Errorf("failed updating azure job to azure-2: %w", err)
}
// if an alternate cloud account was selected for lease balancing, apply it
if job.CloudAccountProfile != nil {
p := job.CloudAccountProfile
if err := applyClusterProfile(pj, sourceConfig, p.ProfileName, p.ProfileSecret, p.AccountDomain); err != nil {
return "", fmt.Errorf("failed applying cluster profile %q: %w", p.ProfileName, err)
}
}

Expand Down Expand Up @@ -1713,7 +1703,7 @@ func (e *resolvedEnvironment) Lookup(name string) string {
return ""
}

func convertToAccount2(job *prowapiv1.ProwJob, sourceConfig *citools.ReleaseBuildConfiguration, profileName, profileSecret, accountDomain string) error {
func applyClusterProfile(job *prowapiv1.ProwJob, sourceConfig *citools.ReleaseBuildConfiguration, profileName, profileSecret, accountDomain string) error {
job.Labels["ci-operator.openshift.io/cloud-cluster-profile"] = profileName
for index, volume := range job.Spec.PodSpec.Volumes {
// TODO: only some ci-chat-bot jobs have this; check if they can all be removed
Expand Down Expand Up @@ -1744,15 +1734,3 @@ func convertToAccount2(job *prowapiv1.ProwJob, sourceConfig *citools.ReleaseBuil
}
return nil
}

func convertAWSToAWS2(job *prowapiv1.ProwJob, sourceConfig *citools.ReleaseBuildConfiguration) error {
return convertToAccount2(job, sourceConfig, "aws-2", "cluster-secrets-aws-2", "aws-2.ci.openshift.org")
}

func convertAzureToAzure2(job *prowapiv1.ProwJob, sourceConfig *citools.ReleaseBuildConfiguration) error {
return convertToAccount2(job, sourceConfig, "azure-2", "cluster-secrets-azure-2", "ci2.azure.devcluster.openshift.com")
}

func convertGCPToGCP2(job *prowapiv1.ProwJob, sourceConfig *citools.ReleaseBuildConfiguration) error {
return convertToAccount2(job, sourceConfig, "gcp-openshift-gce-devel-ci-2", "cluster-secrets-gcp-openshift-gce-devel-ci-2", "")
}
11 changes: 10 additions & 1 deletion pkg/manager/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,15 @@ type JobInput struct {
Refs []prowapiv1.Refs
}

// CloudAccountProfile holds the parameters needed to redirect a ProwJob to an
// alternate cloud account (e.g. aws-2 instead of the default aws account).
type CloudAccountProfile struct {
QuotaSlice string // boskos resource type, e.g. "aws-2-quota-slice"
ProfileName string // cluster profile name, e.g. "aws-2"
ProfileSecret string // k8s secret name, e.g. "cluster-secrets-aws-2"
AccountDomain string // base domain override (optional), e.g. "aws-2.ci.openshift.org"
}

// Job responds to user requests and tracks the state of the launched
// jobs. This object must be recreatable from a ProwJob, but the RequestedChannel
// field may be empty to indicate the user has already been notified.
Expand Down Expand Up @@ -486,7 +495,7 @@ type Job struct {

WorkflowName string

UseSecondaryAccount bool
CloudAccountProfile *CloudAccountProfile

Operator OperatorInfo
CatalogComplete bool
Expand Down