diff --git a/.gitignore b/.gitignore index 619d645..0d93063 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ *.tgz /coverprofile common.mk +/.secrets/ diff --git a/Makefile b/Makefile index a963d34..8c40a0a 100644 --- a/Makefile +++ b/Makefile @@ -87,9 +87,16 @@ test: $(GINKGO) $(KCP) generate ## Run all tests (excludes e2e). TEST_KCP_ASSETS=$(LOCALBIN) $(GINKGO) -r -cover --fail-fast --require-suite -covermode count --output-dir=$(BUILD_PATH) -coverprofile=coverprofile --skip-package=test/e2e $(testargs) .PHONY: test-e2e -test-e2e: $(GINKGO) ## Run e2e tests (kind + kcp + helm). +test-e2e: $(GINKGO) ## Run e2e tests (kind + kcp + helm). Set E2E_SHARD_CONFIG=single-shard|multi-shard (default: multi-shard). $(GINKGO) -r --fail-fast -v --timeout 30m ./test/e2e/ $(testargs) +.PHONY: test-e2e-matrix +test-e2e-matrix: ## Run e2e tests against both shard configs (single-shard, multi-shard). + $(MAKE) clean-e2e + E2E_SHARD_CONFIG=single-shard $(MAKE) test-e2e + $(MAKE) clean-e2e + E2E_SHARD_CONFIG=multi-shard $(MAKE) test-e2e + .PHONY: e2e-cleanup clean-e2e: ## Remove kind cluster from e2e tests. -$(KIND) delete cluster --name dep-ctrl-e2e 2>/dev/null diff --git a/README.md b/README.md index 9628d2c..29c6b55 100644 --- a/README.md +++ b/README.md @@ -201,12 +201,12 @@ workspace paths to logical cluster names. for VW URL discovery and full CRUD on `apiexports/content` to manage webhooks in binding workspaces via the VW. -**system:admin** (shard-local) -- the webhook SA gets shard-wide read access to -`apiexports/content` and `apiexportendpointslices`. This is evaluated by the Bootstrap -Policy Authorizer for every request on the shard, giving the webhook access to all -provider APIExport virtual workspaces without per-workspace RBAC. +No shard-wide RBAC is needed. The webhook watches dependent resources through the +dep-ctrl APIExport's virtual workspace, authorized by dynamically managed +permissionClaims. Providers accept these claims in their APIBinding. -See [docs/getting-started.md](docs/getting-started.md) for the full bootstrap procedure. +See [docs/getting-started.md](docs/getting-started.md) for the full deployment guide +using [kcp-operator](https://github.com/kcp-dev/helm-charts). ## Development @@ -228,6 +228,7 @@ make build make test # E2E tests (requires kind, helm, docker) +# Deploys a multi-shard kcp via kcp-operator (root + shard1) make test-e2e ``` diff --git a/api/v1alpha1/types.go b/api/v1alpha1/types.go index f664b5b..b1ec7d9 100644 --- a/api/v1alpha1/types.go +++ b/api/v1alpha1/types.go @@ -57,6 +57,13 @@ type DependentRef struct { // +kubebuilder:validation:MaxLength=63 // +kubebuilder:validation:Pattern=`^[A-Z][A-Za-z0-9]*$` Kind string `json:"kind"` + + // Resource is the plural resource name of the dependent (e.g., "virtualmachines"). + // Used by the webhook to construct the GVR for listing dependent resources. + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=63 + // +kubebuilder:validation:Pattern=`^[a-z][a-z0-9]*$` + Resource string `json:"resource"` } // APIExportReference identifies an APIExport by workspace path and name. 
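For illustration, a minimal Go sketch of why the plural `resource` is carried on `DependentRef` alongside `kind`: the webhook lists dependents with a dynamic client keyed by GroupVersionResource, and the plural name cannot be derived from the Kind without discovery or a RESTMapper. The helper name below is hypothetical and not taken from the codebase.

```go
package webhook

import (
	"k8s.io/apimachinery/pkg/runtime/schema"

	v1alpha1 "go.opendefense.cloud/dependency-controller/api/v1alpha1"
)

// dependentGVR is an illustrative helper (not from the codebase) that builds
// the GroupVersionResource the webhook needs for dynamic list calls from the
// fields on DependentRef, including the new plural Resource field.
func dependentGVR(ref v1alpha1.DependentRef) schema.GroupVersionResource {
	return schema.GroupVersionResource{
		Group:    ref.Group,
		Version:  ref.Version,
		Resource: ref.Resource, // plural resource name, e.g. "virtualmachines"
	}
}
```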
diff --git a/api/v1alpha1/validation_test.go b/api/v1alpha1/validation_test.go index 3fab949..bfd2a9b 100644 --- a/api/v1alpha1/validation_test.go +++ b/api/v1alpha1/validation_test.go @@ -68,6 +68,14 @@ func TestCRDFieldValidation(t *testing.T) { valid: []string{"A", "VirtualMachine", "VPC", "Foo123"}, invalid: []string{"", "virtualMachine", "1Foo", "Foo-Bar", "Foo.Bar", "Foo_Bar"}, }, + { + path: "spec.dependent.resource", + pattern: `^[a-z][a-z0-9]*$`, + minLength: 1, + maxLength: 63, + valid: []string{"virtualmachines", "vpcs", "subnets"}, + invalid: []string{"", "VirtualMachines", "virtual-machines", "virtual_machines"}, + }, { path: "spec.dependencies.items.apiExportRef.path", pattern: `^[a-z][a-z0-9-]*(:[a-z][a-z0-9-]*)*$`, diff --git a/charts/dependency-controller/files/apiexport-dependencies.opendefense.cloud.yaml b/charts/dependency-controller/files/apiexport-dependencies.opendefense.cloud.yaml index f46e8aa..4c244bf 100644 --- a/charts/dependency-controller/files/apiexport-dependencies.opendefense.cloud.yaml +++ b/charts/dependency-controller/files/apiexport-dependencies.opendefense.cloud.yaml @@ -16,7 +16,7 @@ spec: resources: - group: dependencies.opendefense.cloud name: dependencyrules - schema: v260428-3564c91.dependencyrules.dependencies.opendefense.cloud + schema: v260504-f130a11.dependencyrules.dependencies.opendefense.cloud storage: crd: {} status: {} diff --git a/charts/dependency-controller/files/apiresourceschema-dependencyrules.dependencies.opendefense.cloud.yaml b/charts/dependency-controller/files/apiresourceschema-dependencyrules.dependencies.opendefense.cloud.yaml index 4fbac37..ec8e4e9 100644 --- a/charts/dependency-controller/files/apiresourceschema-dependencyrules.dependencies.opendefense.cloud.yaml +++ b/charts/dependency-controller/files/apiresourceschema-dependencyrules.dependencies.opendefense.cloud.yaml @@ -1,7 +1,7 @@ apiVersion: apis.kcp.io/v1alpha1 kind: APIResourceSchema metadata: - name: v260428-3564c91.dependencyrules.dependencies.opendefense.cloud + name: v260504-f130a11.dependencyrules.dependencies.opendefense.cloud spec: group: dependencies.opendefense.cloud names: @@ -134,6 +134,14 @@ spec: minLength: 1 pattern: ^[A-Z][A-Za-z0-9]*$ type: string + resource: + description: |- + Resource is the plural resource name of the dependent (e.g., "virtualmachines"). + Used by the webhook to construct the GVR for listing dependent resources. + maxLength: 63 + minLength: 1 + pattern: ^[a-z][a-z0-9]*$ + type: string version: description: Version is the API version of the dependent resource. maxLength: 63 @@ -144,6 +152,7 @@ spec: - apiExportName - group - kind + - resource - version type: object required: diff --git a/charts/dependency-controller/files/dependencies.opendefense.cloud_dependencyrules.yaml b/charts/dependency-controller/files/dependencies.opendefense.cloud_dependencyrules.yaml index 85546cc..549fa36 100644 --- a/charts/dependency-controller/files/dependencies.opendefense.cloud_dependencyrules.yaml +++ b/charts/dependency-controller/files/dependencies.opendefense.cloud_dependencyrules.yaml @@ -140,6 +140,14 @@ spec: minLength: 1 pattern: ^[A-Z][A-Za-z0-9]*$ type: string + resource: + description: |- + Resource is the plural resource name of the dependent (e.g., "virtualmachines"). + Used by the webhook to construct the GVR for listing dependent resources. + maxLength: 63 + minLength: 1 + pattern: ^[a-z][a-z0-9]*$ + type: string version: description: Version is the API version of the dependent resource. 
maxLength: 63 @@ -150,6 +158,7 @@ spec: - apiExportName - group - kind + - resource - version type: object required: diff --git a/charts/dependency-controller/templates/webhook-deployment.yaml b/charts/dependency-controller/templates/webhook-deployment.yaml index 43bac1c..bc0aa90 100644 --- a/charts/dependency-controller/templates/webhook-deployment.yaml +++ b/charts/dependency-controller/templates/webhook-deployment.yaml @@ -27,9 +27,6 @@ spec: - /dependency-webhook args: - --api-export-name={{ .Values.apiExportName }} - {{- if .Values.kcpBaseHost }} - - --kcp-base-host={{ .Values.kcpBaseHost }} - {{- end }} - --webhook-port={{ .Values.webhook.port }} - --tls-cert-dir={{ .Values.webhook.tlsCertDir }} - --health-probe-bind-address={{ .Values.webhook.healthProbeBindAddress }} diff --git a/charts/dependency-controller/values.yaml b/charts/dependency-controller/values.yaml index 7b94918..f8af3fe 100644 --- a/charts/dependency-controller/values.yaml +++ b/charts/dependency-controller/values.yaml @@ -7,8 +7,9 @@ imagePullSecrets: [] apiExportName: dependencies.opendefense.cloud -# Base kcp host URL (without workspace path). Used to resolve workspace paths -# to logical cluster names. If empty, derived from kubeconfig. +# Base kcp host URL (without workspace path). Used by the controller to resolve +# workspace paths to logical cluster names. +# If empty, derived from kubeconfig. kcpBaseHost: "" # Controller configuration. diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 44c2362..d52ece3 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -4,6 +4,7 @@ package main import ( + "context" "flag" "os" @@ -14,8 +15,8 @@ import ( "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" - "k8s.io/client-go/rest" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -25,6 +26,7 @@ import ( v1alpha1 "go.opendefense.cloud/dependency-controller/api/v1alpha1" "go.opendefense.cloud/dependency-controller/internal/controller" + "go.opendefense.cloud/dependency-controller/internal/kcp" ) var scheme = runtime.NewScheme() @@ -58,14 +60,37 @@ func main() { cfg := ctrl.GetConfigOrDie() + if err := kcp.ValidateKubeconfig(cfg); err != nil { + setupLog.Error(err, "invalid kubeconfig") + os.Exit(1) + } + // Derive base config (root kcp URL without workspace path). - baseCfg := rest.CopyConfig(cfg) + baseCfg, err := kcp.BaseConfig(cfg) + if err != nil { + setupLog.Error(err, "unable to derive front-proxy base URL from kubeconfig") + os.Exit(1) + } if kcpBaseHost != "" { baseCfg.Host = kcpBaseHost } + // Resolve the APIExportEndpointSlice name for the dep-ctrl's APIExport. + // The slice name is not necessarily the same as the APIExport name. + directClient, err := client.New(cfg, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "unable to create client for endpoint slice discovery") + os.Exit(1) + } + + ess, err := kcp.FindEndpointSlice(context.Background(), directClient, apiExportName) + if err != nil { + setupLog.Error(err, "unable to find APIExportEndpointSlice", "apiExport", apiExportName) + os.Exit(1) + } + // Create apiexport provider for the dependency-controller's own APIExport. 
- depCtrlProvider, err := apiexport.New(cfg, apiExportName, apiexport.Options{ + depCtrlProvider, err := apiexport.New(cfg, ess.Name, apiexport.Options{ Scheme: scheme, }) if err != nil { @@ -97,7 +122,6 @@ func main() { // Register the multicluster DependencyRule reconciler. reconciler := controller.NewDependencyRuleReconciler(mgr) - reconciler.APIExportName = apiExportName reconciler.BaseConfig = baseCfg // Wire up webhook installer if configured. @@ -110,7 +134,7 @@ func main() { os.Exit(1) } } - reconciler.WebhookInstaller = controller.NewWebhookInstaller(nil, webhookURL, caBundle) + reconciler.WebhookInstaller = controller.NewWebhookInstaller(mgr, webhookURL, caBundle) } if err := mcbuilder.ControllerManagedBy(mgr). diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index c6cb707..8c66ecb 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -15,8 +15,8 @@ import ( "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" - "k8s.io/client-go/rest" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -26,6 +26,7 @@ import ( mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" v1alpha1 "go.opendefense.cloud/dependency-controller/api/v1alpha1" + "go.opendefense.cloud/dependency-controller/internal/kcp" "go.opendefense.cloud/dependency-controller/internal/webhook" ) @@ -41,12 +42,10 @@ func init() { func main() { var apiExportName string - var kcpBaseHost string var webhookPort int var tlsCertDir string var healthProbeBindAddress string flag.StringVar(&apiExportName, "api-export-name", "dependencies.opendefense.cloud", "Name of the dependency-controller's APIExport") - flag.StringVar(&kcpBaseHost, "kcp-base-host", "", "Base kcp host URL (without workspace path). If empty, derived from kubeconfig.") flag.IntVar(&webhookPort, "webhook-port", 9443, "Port for the webhook server") flag.StringVar(&tlsCertDir, "tls-cert-dir", "/etc/webhook-tls", "Directory containing tls.crt and tls.key for the webhook server") flag.StringVar(&healthProbeBindAddress, "health-probe-bind-address", ":8081", "Address to bind the health probe endpoint") @@ -60,14 +59,35 @@ func main() { cfg := ctrl.GetConfigOrDie() - // Derive base config (root kcp URL without workspace path). - baseCfg := rest.CopyConfig(cfg) - if kcpBaseHost != "" { - baseCfg.Host = kcpBaseHost + if err := kcp.ValidateKubeconfig(cfg); err != nil { + setupLog.Error(err, "invalid kubeconfig") + os.Exit(1) + } + + // Derive front-proxy base config by stripping the /clusters/... workspace + // path from the kubeconfig host. This base URL is used by the webhook to + // construct per-request clients targeting specific consumer workspaces. + baseCfg, err := kcp.BaseConfig(cfg) + if err != nil { + setupLog.Error(err, "unable to derive front-proxy base URL from kubeconfig") + os.Exit(1) + } + + // Resolve the APIExportEndpointSlice name for the dep-ctrl's APIExport. 
+ directClient, err := client.New(cfg, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "unable to create client for endpoint slice discovery") + os.Exit(1) + } + + ess, err := kcp.FindEndpointSlice(context.Background(), directClient, apiExportName) + if err != nil { + setupLog.Error(err, "unable to find APIExportEndpointSlice", "apiExport", apiExportName) + os.Exit(1) } // Create apiexport provider for the dependency-controller's own APIExport. - depCtrlProvider, err := apiexport.New(cfg, apiExportName, apiexport.Options{ + depCtrlProvider, err := apiexport.New(cfg, ess.Name, apiexport.Options{ Scheme: scheme, }) if err != nil { @@ -95,7 +115,6 @@ func main() { cacheMgr := &webhook.RuleCacheManager{ DepCtrlManager: mgr, - BaseConfig: baseCfg, Scheme: scheme, APIExportName: apiExportName, Registry: registry, @@ -139,6 +158,7 @@ func main() { validator := &webhook.DeletionValidator{ Registry: registry, Initialized: initialized, + BaseConfig: baseCfg, } mgr.GetWebhookServer().Register("/validate", &ctrlwebhook.Admission{Handler: validator}) diff --git a/config/crds/dependencies.opendefense.cloud_dependencyrules.yaml b/config/crds/dependencies.opendefense.cloud_dependencyrules.yaml index 85546cc..549fa36 100644 --- a/config/crds/dependencies.opendefense.cloud_dependencyrules.yaml +++ b/config/crds/dependencies.opendefense.cloud_dependencyrules.yaml @@ -140,6 +140,14 @@ spec: minLength: 1 pattern: ^[A-Z][A-Za-z0-9]*$ type: string + resource: + description: |- + Resource is the plural resource name of the dependent (e.g., "virtualmachines"). + Used by the webhook to construct the GVR for listing dependent resources. + maxLength: 63 + minLength: 1 + pattern: ^[a-z][a-z0-9]*$ + type: string version: description: Version is the API version of the dependent resource. maxLength: 63 @@ -150,6 +158,7 @@ spec: - apiExportName - group - kind + - resource - version type: object required: diff --git a/config/kcp/apiexport-dependencies.opendefense.cloud.yaml b/config/kcp/apiexport-dependencies.opendefense.cloud.yaml index f46e8aa..4c244bf 100644 --- a/config/kcp/apiexport-dependencies.opendefense.cloud.yaml +++ b/config/kcp/apiexport-dependencies.opendefense.cloud.yaml @@ -16,7 +16,7 @@ spec: resources: - group: dependencies.opendefense.cloud name: dependencyrules - schema: v260428-3564c91.dependencyrules.dependencies.opendefense.cloud + schema: v260504-f130a11.dependencyrules.dependencies.opendefense.cloud storage: crd: {} status: {} diff --git a/config/kcp/apiresourceschema-dependencyrules.dependencies.opendefense.cloud.yaml b/config/kcp/apiresourceschema-dependencyrules.dependencies.opendefense.cloud.yaml index 4fbac37..ec8e4e9 100644 --- a/config/kcp/apiresourceschema-dependencyrules.dependencies.opendefense.cloud.yaml +++ b/config/kcp/apiresourceschema-dependencyrules.dependencies.opendefense.cloud.yaml @@ -1,7 +1,7 @@ apiVersion: apis.kcp.io/v1alpha1 kind: APIResourceSchema metadata: - name: v260428-3564c91.dependencyrules.dependencies.opendefense.cloud + name: v260504-f130a11.dependencyrules.dependencies.opendefense.cloud spec: group: dependencies.opendefense.cloud names: @@ -134,6 +134,14 @@ spec: minLength: 1 pattern: ^[A-Z][A-Za-z0-9]*$ type: string + resource: + description: |- + Resource is the plural resource name of the dependent (e.g., "virtualmachines"). + Used by the webhook to construct the GVR for listing dependent resources. 
+ maxLength: 63 + minLength: 1 + pattern: ^[a-z][a-z0-9]*$ + type: string version: description: Version is the API version of the dependent resource. maxLength: 63 @@ -144,6 +152,7 @@ spec: - apiExportName - group - kind + - resource - version type: object required: diff --git a/docs/architecture.md b/docs/architecture.md index 180af68..fac8c48 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -17,31 +17,36 @@ The system solves this with two cooperating binaries: - **Controller** -- watches `DependencyRule` objects and installs admission webhooks in provider workspaces -- **Webhook** -- maintains indexed caches of dependent resources and serves - admission requests that block deletion of still-referenced resources +- **Webhook** -- maintains a metadata registry of dependency rules and serves + admission requests that block deletion of still-referenced resources by querying + consumer workspaces directly ## Workspace Topology ```mermaid graph LR subgraph DC["Dep-Ctrl Workspace"] - DCExport["APIExport:
DependencyRule
+ permissionClaims"] + DCExport["APIExport:
DependencyRule
+ VWC permissionClaim"] end subgraph CP["Compute Provider WS"] - CPBinding["APIBinding: dep-ctrl
(claims accepted)"] + CPBinding["APIBinding: dep-ctrl
(VWC claim accepted)"] CPExport["APIExport: compute"] CPRule["DependencyRule:
VM → VPC"] end subgraph NP["Network Provider WS"] - NPBinding["APIBinding: dep-ctrl
(claims accepted)"] + NPBinding["APIBinding: dep-ctrl
(VWC claim accepted)"] NPExport["APIExport: VPCs"] NPWebhook["ValidatingWebhook"] end subgraph ROOT["Root Workspace"] - ROOTROLE["ClusterRoles:
workspaces/content +
workspace resolution
"] + ROOTROLE["ClusterRoles:
controller: workspaces/content
+ workspaces read
"] + end + + subgraph SYSADMIN["system:admin (per shard)"] + SACR["ClusterRoles:
webhook: wildcard read
(applied directly to each shard)"] end subgraph CW["Consumer WS"] @@ -58,6 +63,7 @@ graph LR style CP fill:#e1f0da,color:#1a3e12 style NP fill:#e1f0da,color:#1a3e12 style ROOT fill:#f3e8ff,color:#4a1d7a + style SYSADMIN fill:#f3e8ff,color:#4a1d7a style CW fill:#fef3c7,color:#664d03 ``` @@ -70,17 +76,30 @@ the permissionClaim). **Provider workspaces** -- each provider (compute, network, ...) exports its own resource types and binds to the dep-ctrl APIExport to create `DependencyRule` -objects. The APIBinding must **accept** the dep-ctrl's permissionClaim, which +objects. The APIBinding must **accept** the dep-ctrl's VWC permissionClaim, which grants the controller access to manage `ValidatingWebhookConfigurations` in those workspaces through the virtual workspace. **Consumer workspaces** -- bind to provider exports and create the actual -resources (VPCs, VMs). Consumers don't interact with the dependency system directly. - -**Root workspace** -- hosts static `ClusterRoles` granting both components -`workspaces/content` access (needed to enter child workspaces). The controller -additionally gets `workspaces` read access to resolve workspace paths to logical -cluster names. This is a deployment prerequisite. +resources (VPCs, VMs). Consumers don't interact with the dependency system +directly. The webhook queries dependent resources in consumer workspaces via +the front-proxy using broad read RBAC. + +**Root workspace** -- hosts static `ClusterRoles` for the **controller**: +`workspaces/content` access (to enter child workspaces) and `workspaces` read +(to resolve workspace paths to logical cluster names). This is a deployment +prerequisite. + +**`system:admin` workspace, per shard** -- hosts a `ClusterRole` + +`ClusterRoleBinding` granting the **webhook** wildcard read access (`get`, +`list` on all resources). The webhook queries dependent resources directly +in consumer workspaces, and consumer workspaces can live on any shard. kcp's +`BootstrapPolicyAuthorizer` reads RBAC from the local shard's `system:admin` +only — bindings do not propagate across shards — so this binding must be +applied to **every shard** that hosts consumer workspaces. `system:admin` is +not reachable through the front-proxy; the binding is applied via direct +(port-forwarded) shard access using a `system:masters` identity issued from +a kcp-operator `Kubeconfig` CR with `rootShardRef` / `shardRef`. ## Component Overview @@ -92,11 +111,11 @@ flowchart TD end subgraph Webhook["Webhook Server Binary (cmd/webhook)"] - RCM["Rule Cache Manager"] - RCM -->|"creates per rule"| IC["Indexed Cache
(mcmanager per provider APIExport)"] - IC -->|"stored in"| RR["Rule Registry"] + RCM["Rule Registry Manager"] + RCM -->|"populates"| RR["Rule Registry
(metadata only)"] DV["Deletion Validator"] - DV -->|"queries"| RR + DV -->|"queries rules"| RR + DV -->|"queries dependents via
front-proxy per request"| CW["Consumer Workspaces"] end WI -->|"installs via dep-ctrl VW"| PW["Provider Workspaces"] @@ -113,10 +132,8 @@ flowchart TD **Entry point:** [`cmd/controller/main.go`](../cmd/controller/main.go) -The controller watches `DependencyRule` objects and ensures admission webhooks -exist in the right provider workspaces. The webhook's read access to dependent -resources is granted shard-wide via `system:admin` bootstrap RBAC, so the -controller only handles webhook installation. +The controller watches `DependencyRule` objects and installs +`ValidatingWebhookConfiguration` objects in the right provider workspaces. All operations in provider workspaces are routed through the dep-ctrl APIExport's virtual workspace, authorized by `permissionClaims`. The controller never connects @@ -135,16 +152,14 @@ On first reconcile, the controller lazily initializes two components The VW only accepts logical cluster names in its `/clusters/` path, not workspace paths. The resolver caches mappings and is consulted before every -webhook or RBAC operation. +webhook operation. ### How a DependencyRule becomes a webhook When a provider creates a `DependencyRule` ([`api/v1alpha1/types.go`](../api/v1alpha1/types.go)), the controller's reconciler ([`internal/controller/dependencyrule_controller.go:Reconcile`](../internal/controller/dependencyrule_controller.go)) -picks it up via the dep-ctrl APIExport's virtual workspace and performs two actions: - -#### 1. Webhook Installation +picks it up via the dep-ctrl APIExport's virtual workspace and installs webhooks. The [`WebhookInstaller`](../internal/controller/webhook_installer.go) creates or updates a `ValidatingWebhookConfiguration` named `dependency-controller` in each @@ -181,17 +196,18 @@ workspace, the webhook is deleted entirely. **Entry point:** [`cmd/webhook/main.go`](../cmd/webhook/main.go) The webhook server watches the same `DependencyRule` objects as the controller, -but its job is different: it builds indexed caches of dependent resources and uses -them to answer admission requests. +but its job is different: it maintains a metadata registry of active rules and +uses per-request direct queries to check for active dependents when a deletion +is attempted. -### Startup and Prewarming +### Startup and Registry Population On startup, the webhook creates an `mcmanager` backed by the dep-ctrl APIExport provider, then registers the [`RuleCacheManager`](../internal/webhook/rule_cache_manager.go) as a controller watching `DependencyRule` objects. -Before the webhook can serve requests safely, it must populate its caches from +Before the webhook can serve requests safely, it must populate its registry with all existing rules. This happens in a [`manager.RunnableFunc`](../cmd/webhook/main.go) that runs after the manager starts: @@ -199,9 +215,8 @@ starts: 1. [`PopulateRegistry`](../internal/webhook/rule_cache_manager.go) resolves the dep-ctrl APIExport's virtual workspace URL from its `APIExportEndpointSlice` - ([`virtualWorkspaceClient`](../internal/webhook/rule_cache_manager.go)) 2. Lists all existing `DependencyRule` objects across all bound workspaces -3. Calls [`ensureCache`](../internal/webhook/rule_cache_manager.go) for each rule +3. Registers each rule's metadata (GVK, GVR, field paths) in the registry 4. 
Closes the `initialized` channel Until `initialized` is closed: @@ -209,45 +224,22 @@ Until `initialized` is closed: returns unhealthy - The `DeletionValidator` denies all DELETE requests with "not yet initialized" -### Per-Rule Indexed Caches - -Each `DependencyRule` gets a dedicated `mcmanager.Manager` connected to the -dependent resource's provider APIExport virtual workspace. This 1:1 mapping -([`ensureCache`](../internal/webhook/rule_cache_manager.go)) exists because: - -- **Different rules reference different APIExports** -- each provider's virtual - workspace is a distinct endpoint, so each needs its own manager -- **Clean lifecycle** -- cancelling one rule's context tears down only its - informers, without affecting other rules - -For each dependency target in the rule, -[`ensureCache`](../internal/webhook/rule_cache_manager.go) registers a field -index on the dependent resource's informer: - -```go -mgr.GetFieldIndexer().IndexField(ctx, watchObj, fieldPath, func(obj client.Object) []string { - val := fieldpath.Resolve(u.Object, fieldPath) // e.g., ".spec.vpcRef.name" -> "my-vpc" - return []string{val} -}) -``` +### Per-Request Direct Queries -This enables O(1) lookups: "find all VMs whose `.spec.vpcRef.name` equals `my-vpc`". +Unlike a cache-based approach, the webhook does not maintain persistent informers +for dependent resources. Instead, on each DELETE admission request, it constructs +a temporary dynamic client scoped to the consumer workspace via the kcp front-proxy +and lists dependent resources directly. -A minimal controller is registered on the same GVK to activate the informer -(controller-runtime won't start an informer without a controller watching it) -and to track which logical clusters have been discovered and mark the cache -as ready. +The webhook derives the front-proxy base URL from its kubeconfig at startup by +stripping the `/clusters/...` workspace path suffix. For each admission request, +it builds a workspace-scoped URL: `{frontProxyBase}/clusters/{logicalClusterName}`. -The resulting state is stored in the -[`RuleRegistry`](../internal/webhook/rule_registry.go) as a `RuleState` keyed by -`clusterName/ruleName`. The registry also maintains a reverse index -(`byTarget map[GVR][]string`) so the webhook can quickly find which rules -protect a given resource type. - -On `DependencyRule` deletion, `Reconcile` calls -[`Registry.Unregister(key)`](../internal/webhook/rule_registry.go) which removes -the state from the map and calls `state.Cancel()` to stop the background manager -goroutine and tear down all informers. +This approach is routing-transparent across shards -- no webhook configuration +change is required when shards are added, as the front-proxy handles routing +based on the logical cluster name. Each new shard does, however, need the +`system:admin` RBAC binding (see [Workspace Topology](#workspace-topology)) +applied so the webhook is authorized to list dependents on it. ### Admission Request Flow @@ -272,24 +264,25 @@ DELETE vpcs/my-vpc (from consumer workspace) 6. Registry.FindByTargetGVR(vpcs GVR) | returns []RuleEntry with matched IndexedFields | -7. For each matching rule: - | a. Cache not ready? --> Deny ("cache warming up, retry later") - | b. Get cluster from rule's manager - | c. Query: cache.List(ctx, &list, MatchingFields{".spec.vpcRef.name": "my-vpc"}) - | d. Each result is a blocker: "VirtualMachine/my-vm" +7. Create dynamic client for {frontProxy}/clusters/{clusterName} + | +8. For each matching rule: + | a. 
List dependent resources in namespace + | b. Filter by field path (fieldpath.Resolve == deleted resource name) + | c. Each match is a blocker: "VirtualMachine/my-vm" | -8. Blockers found? --> Deny ("still referenced by VirtualMachine/my-vm") +9. Blockers found? --> Deny ("still referenced by VirtualMachine/my-vm") | -9. No blockers --> Allow +10. No blockers --> Allow ``` The validator is rule-agnostic -- it doesn't need to know the structure of each -`DependencyRule`, only how to query the indexed caches via the field paths stored -in `RuleEntry.MatchedField`. +`DependencyRule`, only how to query the dependent resources via the GVR and field +paths stored in `RuleEntry`. ### Force-Deleting Protected Resources -If the dependency lifecycle has broken down (stale caches, crashed webhook), +If the dependency lifecycle has broken down (stale rules, crashed webhook), operators can bypass protection: ```sh @@ -318,6 +311,7 @@ spec: group: compute.test.io version: v1 kind: VirtualMachine + resource: virtualmachines dependencies: - apiExportRef: path: "root:network-provider" @@ -329,14 +323,15 @@ spec: path: ".spec.vpcRef.name" ``` -`spec.dependent` -- the resource type that holds references (the one being cached). +`spec.dependent` -- the resource type that holds references (the one queried on deletion). `spec.dependent.apiExportName` -- the APIExport in the same workspace that provides this type. +`spec.dependent.resource` -- the plural resource name, used to construct the GVR for dynamic client queries. `spec.dependencies[]` -- the resource types being referenced (the ones being protected). `spec.dependencies[].fieldRef.path` -- where in the dependent resource the reference lives. ### RuleRegistry (`internal/webhook/rule_registry.go`) -Thread-safe store shared between the `RuleCacheManager` (writer) and +Thread-safe metadata store shared between the `RuleCacheManager` (writer) and `DeletionValidator` (reader). - `rules map[string]*RuleState` -- keyed by `clusterName/ruleName` @@ -344,36 +339,70 @@ Thread-safe store shared between the `RuleCacheManager` (writer) and rebuilt on every `Register`/`Unregister` Key operations: -- `Register(key, state) *RuleState` -- adds/replaces a rule, returns old state for cleanup -- `Unregister(key)` -- removes and cancels the rule's manager +- `Register(key, state) *RuleState` -- adds/replaces a rule, returns old state +- `Unregister(key)` -- removes the rule - `FindByTargetGVR(gvr) []RuleEntry` -- O(1) lookup used by the admission handler ### Field Path Resolution (`internal/fieldpath/fieldpath.go`) [`fieldpath.Resolve`](../internal/fieldpath/fieldpath.go) extracts a string value from an unstructured object given a dot-notation path (e.g., `.spec.vpcRef.name`). -Used by the index functions registered on per-rule informers. +Used by the webhook to filter dependent resources by matching field values against +the deleted resource name. --- ## Known Limitations +### Dependency-provider workspaces must be direct children of root + +The path named in `DependencyRule.spec.dependencies[].apiExportRef.path` +must currently be a direct child of `root` (e.g., `root:network-provider`). +Nested paths such as `root:org:network-provider` will fail to resolve. + +The cause is in +[`workspaceResolver.ensureResolved`](../internal/controller/dependencyrule_controller.go): +the resolver splits the supplied path on `:`, takes only the last segment as +the `Workspace` name, and looks that name up in the **root** workspace. A +nested workspace will not be found there. 
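A simplified sketch of the resolution behavior described above, assuming the kcp SDK tenancy types; the function name and exact client calls are illustrative, not the verbatim `ensureResolved` code:

```go
package controller

import (
	"context"
	"strings"

	tenancyv1alpha1 "github.com/kcp-dev/kcp/sdk/apis/tenancy/v1alpha1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// resolveAgainstRoot illustrates the limitation: only the last path segment
// is kept and looked up as a Workspace in root, so a nested path such as
// "root:org:network-provider" is never found there.
func resolveAgainstRoot(ctx context.Context, rootClient client.Client, path string) (string, error) {
	segments := strings.Split(path, ":")
	name := segments[len(segments)-1] // e.g. "network-provider"

	var ws tenancyv1alpha1.Workspace
	if err := rootClient.Get(ctx, client.ObjectKey{Name: name}, &ws); err != nil {
		return "", err // nested workspaces are not children of root
	}
	return ws.Spec.Cluster, nil // logical cluster name used for VW requests
}
```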
+ +This restriction applies **only** to the workspace hosting the protected +APIExport. It does **not** apply to: + +- Consumer workspaces — the webhook identifies them by the `kcp.io/cluster` + annotation on the admission request (a logical cluster name, not a path) + and queries them directly through the front-proxy. +- The dependent-provider workspace (where the `DependencyRule` itself + lives) — the controller and webhook discover rules through the dep-ctrl + APIExport's multicluster watch, which delivers the logical cluster name + with each event. +- The dep-ctrl workspace — its location is pinned at deploy time via the + controller's kubeconfig. + +Lifting this limitation requires the resolver to walk the path segment by +segment (or call kcp's path-aware Workspace lookup) instead of bottoming +out at root. + ### Circular dependencies The system does not detect cycles. If rule A says VM depends on VPC and rule B says VPC depends on VM, neither can be deleted normally. Use `skip-protection` to break the cycle. -### Informer cache lag +### Per-request query latency -Between a dependent being created and the informer cache syncing (typically -sub-second), the referenced resource can be deleted without the webhook blocking -it. Similarly, after a dependent is deleted, there's a brief window where the -webhook may still block deletion of the referenced resource. +Each DELETE admission request triggers a live API call to list dependent resources +in the consumer workspace. This adds latency compared to a cache-based approach, +but DELETE operations are infrequent and the namespace-scoped listings are +typically small. -### Rule updates are not detected +### Rule updates -[`ensureCache`](../internal/webhook/rule_cache_manager.go) is a no-op if a -cache already exists for the rule key. If a `DependencyRule`'s spec changes -(e.g., a new dependency target is added), the existing cache won't be updated. -The workaround is to delete and recreate the rule. +In-place `DependencyRule` spec edits are picked up automatically: the +`RuleCacheManager` reconciles on each update event and overwrites the +existing registry entry, and `WebhookInstaller.reconcileWorkspaceWebhook` +recomputes the desired webhook config from scratch on every change rather +than maintaining incremental state. This is covered by the +*"should propagate in-place DependencyRule updates to the webhook"* e2e +scenario, which patches `fieldRef.path` on a live rule and verifies the +webhook re-evaluates without a recreate cycle. diff --git a/docs/development.md b/docs/development.md index 1de74ee..1dbf002 100644 --- a/docs/development.md +++ b/docs/development.md @@ -69,11 +69,21 @@ make run-webhook # Run the webhook server from source ### Test ```sh -make test # Unit + integration tests (requires kcp binary, excludes e2e) -make test-e2e # E2E tests (requires kind, helm, docker) -make clean-e2e # Remove kind cluster from e2e tests +make test # Unit + integration tests (requires kcp binary, excludes e2e) +make test-e2e # E2E tests (requires kind, helm, docker) — uses the active shard config +make test-e2e-matrix # Run e2e tests against both shard configs sequentially +make clean-e2e # Remove kind cluster from e2e tests ``` +`make test-e2e` honors the `E2E_SHARD_CONFIG` environment variable +(`single-shard` or `multi-shard`, default `multi-shard`); see the [E2E Tests](#e2e-tests) +section for what each config exercises. `make test-e2e-matrix` runs both back-to-back +with a kind cleanup between runs. 
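For example:

```sh
# Single-shard configuration only
E2E_SHARD_CONFIG=single-shard make test-e2e

# Both configurations, with a kind cleanup between runs
make test-e2e-matrix
```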
+ +Tool paths can be overridden via `KIND`, `KUBECTL`, `HELM`, `DOCKER` env vars +(fallback: PATH lookup). Set `E2E_SKIP_CLEANUP=1` to keep the kind cluster +running after the suite for inspection. + ### Lint & Format ```sh @@ -105,14 +115,79 @@ webhook server with a self-signed CA. It verifies the full lifecycle: ## E2E Tests -The e2e tests (`test/e2e/`) run against a real kind cluster with kcp and -cert-manager deployed via Helm. They build the Docker image, load it into -kind, deploy the Helm chart (which includes both controller and webhook), and -exercise the full system including TLS webhook dispatch through kcp's admission -pipeline. - -Tool paths can be configured via environment variables (`KIND`, `KUBECTL`, -`HELM`, `DOCKER`) with fallback to PATH lookup. +The e2e tests (`test/e2e/`) run against a real kind cluster with a multi-shard +kcp instance deployed via the +[kcp-operator](https://github.com/kcp-dev/helm-charts). The test suite: + +1. Creates a kind cluster with a NodePort for the kcp front-proxy +2. Installs cert-manager and the kcp-operator Helm chart +3. Deploys two etcd instances and creates a `RootShard`, `Shard` (`shard1`), and + `FrontProxy` via kcp-operator CRs +4. Generates admin and component kubeconfigs via kcp-operator `Kubeconfig` CRs + (using `rootShardRef` so certs are trusted by both front-proxy and shards) +5. Bootstraps RBAC (see below) +6. Builds the Docker image, loads it into kind, and deploys via Helm +7. Exercises the full system including TLS webhook dispatch through kcp's + admission pipeline + +### Workspace topology and shard placement + +The five test workspaces (`dep-ctrl`, `network-provider`, `compute-provider`, +`consumer1`, `consumer2`) are pinned deterministically to either `root` or +`shard1` via `spec.location.selector`. Both shards carry an `e2e-target=` +label. After workspace readiness, `verifyShardPlacements` reads each +workspace's `internal.tenancy.kcp.io/shard` annotation and asserts that +selectors weren't ignored. + +Selection is driven by the `E2E_SHARD_CONFIG` env var: + +| Config | Purpose | Placements | +|------------------|------------------------------------------|-------------------------------------------------------------------------------------------------------------| +| `single-shard` | sanity / single-shard fast paths | all five workspaces on `root` | +| `multi-shard` (default) | exercises every cross-shard path | `compute-provider` and `consumer2` on `shard1`; `dep-ctrl`, `network-provider`, `consumer1` on `root` | + +Together, the two configs exercise: same-shard fast paths (single-shard); +controller-VW cross-shard webhook installation, provider ↔ consumer +cross-shard binding, and webhook ↔ consumer cross-shard query (multi-shard). +Run `make test-e2e-matrix` to execute both sequentially. + +### Bootstrap RBAC + +Three RBAC bundles are applied during setup: + +1. **`system:admin` per shard** (`test/fixtures/system-admin-rbac-bootstrap.yaml`): + ClusterRole + ClusterRoleBinding granting the webhook SA `*/*` get/list. + Applied via direct (port-forwarded) connection to each shard, using a + `system:masters` kubeconfig generated from a kcp-operator `Kubeconfig` CR + with `rootShardRef` / `shardRef`. kcp's `BootstrapPolicyAuthorizer` reads + this binding from the local shard only — bindings do not propagate across + shards — so the fixture is applied once per shard. +2. 
**Root workspace** (`test/fixtures/root-rbac-bootstrap.yaml`): controller-only + rules (workspaces/content access + tenancy.kcp.io/workspaces read), applied + via the front-proxy. +3. **Dep-ctrl workspace** (`test/fixtures/depctrl-rbac-bootstrap.yaml`): + apiexportendpointslices read + dep-ctrl APIExport content access for both + the controller and webhook SAs. + +The webhook installation in provider workspaces is authorized by the +`validatingwebhookconfigurations` permissionClaim on the dep-ctrl APIExport, +not by RBAC. + +### Test scenarios + +`test/e2e/dependency_test.go` is an `Ordered` `Describe` covering: + +- Initial install (DependencyRule → ValidatingWebhookConfiguration appears) +- Block / unblock cycles with a single dependent and with multiple dependents +- Cross-shard isolation (`consumer2` with no VMs) +- Cross-shard protection (`consumer2` with a VM, on `shard1` under multi-shard) +- `skip-protection` annotation +- DependencyRule deletion → webhook removal +- DependencyRule recreation → protection restored +- **In-place rule update**: patches `fieldRef.path` on a live rule and + verifies the webhook re-evaluates without a recreate cycle (proves + `WebhookInstaller.reconcileWorkspaceWebhook`'s update branch and the + webhook's `RuleCacheManager` re-indexing both work end-to-end). ### Fixtures @@ -120,95 +195,22 @@ Test fixtures are loaded from YAML files rather than constructed inline: - `config/kcp/` -- the generated dep-ctrl APIResourceSchemas and APIExport (same files used for real deployment) -- `test/fixtures/` -- test provider schemas (VPC, VirtualMachine) and their - APIExports +- `test/fixtures/` -- test provider schemas (VPC, VirtualMachine), APIExports, + RBAC bundles, and per-test VPC/VM resources ## Deploying to kcp -### 1. Create the dep-ctrl workspace and apply schemas - -```sh -kubectl ws create dep-ctrl --enter -kubectl apply -k config/kcp/ -``` - -This creates the `APIResourceSchema` objects and the -`dependencies.opendefense.cloud` APIExport. The APIExport includes a -`permissionClaim` for `validatingwebhookconfigurations` -- this authorizes the -controller to manage webhooks in provider workspaces through the virtual workspace. - -### 2. Apply bootstrap RBAC - -Bootstrap RBAC is required in three locations, applied with a privileged identity: - -**Root workspace** -- both components need `workspaces/content` access to enter -child workspaces. The controller also needs `workspaces` read access for -workspace path resolution: - -```sh -kubectl ws root -kubectl apply -f test/fixtures/root-rbac-bootstrap.yaml -``` - -**Dep-ctrl workspace** -- the controller needs full CRUD on `apiexports/content` -(to manage claimed resources through the VW) and `apiexportendpointslices` read -access for VW URL discovery: - -```sh -kubectl ws root:dep-ctrl -kubectl apply -f test/fixtures/depctrl-rbac-bootstrap.yaml -``` +See [docs/getting-started.md](getting-started.md) for the full step-by-step +deployment guide using kcp-operator. The guide covers kcp-operator setup, +multi-shard configuration, bootstrap RBAC, kubeconfig generation via +kcp-operator Kubeconfig CRs, and Helm deployment. -**system:admin** (shard-local) -- the webhook SA gets shard-wide read access to -`apiexports/content` and `apiexportendpointslices`, evaluated by the Bootstrap -Policy Authorizer for every request on the shard. 
Must be applied via the kcp -server (not front-proxy): - -```sh -kubectl --server=https://:6443/clusters/system:admin \ - apply -f test/fixtures/shard-admin-rbac-bootstrap.yaml -``` - -Adjust the service account names in the `ClusterRoleBinding` subjects to match -your deployment (defaults: `system:serviceaccount:dependency-system:dependency-controller` -and `system:serviceaccount:dependency-system:dependency-webhook`). - -### 3. Deploy with Helm - -The chart deploys both the controller and webhook server as separate Deployments: - -```sh -helm install dep-ctrl charts/dependency-controller \ - --namespace dependency-system --create-namespace \ - --set kcpBaseHost=https://kcp.example.com:6443 \ - --set controller.kubeconfig.secretName=kcp-controller-kubeconfig \ - --set webhook.kubeconfig.secretName=kcp-webhook-kubeconfig \ - --set webhook.tls.certManager.issuerRef.name=my-issuer -``` +### Quick reference -The controller automatically discovers the webhook service URL and CA bundle -from the co-deployed webhook resources. It routes all provider workspace -operations through the dep-ctrl APIExport's virtual workspace, resolving -workspace paths to logical cluster names via root Workspace objects. - -### 4. Provider setup - -Each API provider that wants deletion protection: - -1. Binds to the dep-ctrl APIExport in their provider workspace, **accepting the - permissionClaim** (required for the controller to install webhooks): - ```yaml - spec: - permissionClaims: - - group: "admissionregistration.k8s.io" - resource: "validatingwebhookconfigurations" - selector: { matchAll: true } - state: Accepted - ``` -2. Creates a `DependencyRule` declaring which of their resources depend on which - other resources - -The controller then automatically installs a `ValidatingWebhookConfiguration` -in each dependency provider's workspace (via the VW). The webhook server picks -up the rule, starts an indexed cache for the dependent resource type (authorized -by the shard-wide system:admin RBAC), and begins serving admission requests. +1. Deploy kcp via kcp-operator (RootShard, FrontProxy, optional additional Shards) +2. Create the dep-ctrl workspace and apply `config/kcp/` schemas +3. Apply bootstrap RBAC: per-shard `system:admin` (webhook get/list), root + workspace (controller), dep-ctrl workspace (APIExport access for both) +4. Generate component kubeconfigs via kcp-operator Kubeconfig CRs (use `rootShardRef`) +5. Deploy with Helm +6. Providers bind to the dep-ctrl APIExport (accepting permissionClaims) and create DependencyRules diff --git a/docs/getting-started.md b/docs/getting-started.md index 85a5561..83168b3 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -1,10 +1,11 @@ # Getting Started -This guide walks through deploying the dependency-controller from scratch. It -assumes you have a running kcp instance and a Kubernetes cluster where the -controller and webhook will run (they connect to kcp remotely via kubeconfigs). +This guide walks through deploying the dependency-controller on a Kubernetes +cluster with kcp managed by the +[kcp-operator](https://github.com/kcp-dev/helm-charts). 
By the end you will +have: -By the end, you'll have: +- A multi-shard kcp instance (root shard + one additional shard) - The `DependencyRule` API available in kcp - The controller and webhook running in your Kubernetes cluster - A working example where deleting a VPC is blocked while a VirtualMachine @@ -12,20 +13,25 @@ By the end, you'll have: ## Prerequisites -- A running [kcp](https://github.com/kcp-dev/kcp) instance -- A Kubernetes cluster (the "management cluster") with: - - [cert-manager](https://cert-manager.io/) installed (for webhook TLS) - - Network connectivity to kcp +- A Kubernetes cluster (the "management cluster") -- [kind](https://kind.sigs.k8s.io/) works well for dev +- [cert-manager](https://cert-manager.io/) installed in the cluster - `kubectl` with the [kcp plugin](https://github.com/kcp-dev/kcp/tree/main/cli) - `helm` v3 -### Quick setup with kind and kcp - -If you don't have a cluster yet: +### Quick setup with kind ```sh -# Create a kind cluster -kind create cluster --name dep-ctrl +# Create a kind cluster (expose a NodePort for the kcp front-proxy) +cat <<'EOF' | kind create cluster --name dep-ctrl --config - +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + extraPortMappings: + - containerPort: 31443 + hostPort: 31443 + protocol: TCP +EOF # Install cert-manager kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.17.2/cert-manager.yaml @@ -42,54 +48,214 @@ spec: EOF ``` -For kcp, follow the [kcp installation guide](https://docs.kcp.io/kcp/main/setup/install/) -or deploy it via its [Helm chart](https://github.com/kcp-dev/helm-charts). - ## Overview -The deployment has four phases: - -1. **kcp setup** -- create the dep-ctrl workspace, apply schemas and the - APIExport -2. **Bootstrap RBAC** -- grant the controller and webhook identities the - minimum permissions they need in kcp -3. **Helm install** -- deploy the controller and webhook into your Kubernetes +The deployment has six phases: + +1. **Deploy kcp** -- install kcp-operator, etcd, and create the kcp shards and + front-proxy +2. **kcp workspace setup** -- create the dep-ctrl workspace, apply schemas and + the APIExport +3. **Bootstrap RBAC** -- grant the controller and webhook identities the minimum + permissions they need in kcp +4. **Create kubeconfigs** -- generate client certificates for the controller and + webhook via kcp-operator Kubeconfig CRs +5. **Helm install** -- deploy the controller and webhook into your Kubernetes cluster -4. **Provider onboarding** -- providers bind to the dep-ctrl APIExport and +6. 
**Provider onboarding** -- providers bind to the dep-ctrl APIExport and create DependencyRules -``` -Management Cluster (kind) kcp +```text +Management Cluster (kind) kcp (via kcp-operator) +-------------------------------+ +----------------------------------+ -| dependency-controller pod | | root workspace | -| reads Workspace objects |--->| ClusterRoles for both SAs | -| manages webhooks via VW | | | -| | | root:dep-ctrl workspace | -| dependency-webhook pod | | APIExport: DependencyRule | -| watches rules via VW |--->| ClusterRoles for controller SA | -| serves admission requests | | | -+-------------------------------+ | system:admin (shard-local) | - | ClusterRole for webhook SA | - | (shard-wide apiexports/content)| - | | - | root:network-provider | - | APIExport: VPCs | - | ValidatingWebhook (installed | - | by controller via VW) | +| kcp-operator | | root shard | +| etcd (one per shard) | | system:admin (per-shard) | +| root-kcp (root shard pod) | | ClusterRoleBinding for | +| shard1-kcp (shard1 pod) | | webhook SA (wildcard read) | +| kcp-front-proxy | | root workspace | +| | | ClusterRole for controller | +| dependency-controller pod | | root:dep-ctrl workspace | +| reads Workspace objects |--->| APIExport: DependencyRule | +| manages webhooks via VW | | ClusterRoles for both SAs | +| | | root:compute-provider | +| dependency-webhook pod | | APIExport: VMs | +| watches rules via VW | | DependencyRule: VM -> VPC | +| serves admission requests |--->| root:network-provider | ++-------------------------------+ | APIExport: VPCs | + | ValidatingWebhook (installed | + | by controller via VW) | | | - | root:compute-provider | - | APIExport: VMs | - | DependencyRule: VM -> VPC | + | shard1 | + | system:admin (per-shard) | + | same webhook binding as root | + | (consumer workspaces may live | + | on any shard) | +----------------------------------+ ``` -## Step 1: Create the dep-ctrl workspace and apply schemas +## Step 1: Deploy kcp via kcp-operator + +### 1a. Install the kcp-operator + +```sh +helm repo add kcp https://kcp-dev.github.io/helm-charts +helm repo update kcp + +helm install kcp-operator kcp/kcp-operator \ + --namespace kcp-system --create-namespace \ + --wait --timeout 300s +``` + +### 1b. Deploy etcd + +Each kcp shard needs its own etcd instance. For production, use a proper etcd +cluster; for dev/testing a single-replica StatefulSet per shard is sufficient. +You need a `Service` (client port 2379) and a `StatefulSet` per shard. + +For a working example, see the `applyEtcd()` function in +[`test/e2e/suite_test.go`](../test/e2e/suite_test.go) which creates minimal +single-node etcd instances. + +```sh +# Create etcd for the root shard (Service + StatefulSet named "etcd-root") +# Create etcd for the secondary shard ("etcd-shard1", optional for multi-shard) + +# Wait for etcd to be ready +kubectl -n kcp-system wait statefulset etcd-root \ + --for=jsonpath='{.status.readyReplicas}'=1 --timeout=120s +``` + +### 1c. Create a cert-manager Issuer + +kcp-operator uses cert-manager for PKI (server certs, client CAs, etc.): + +```sh +kubectl -n kcp-system apply -f - < /tmp/kcp-admin.kubeconfig +``` + +If you're using kind with a NodePort, rewrite the server URL in the kubeconfig +to `https://localhost:`. The kcp-operator generates two contexts: +`base` (bare URL) and `default` (URL + `/clusters/root`). Use `base` if you +want to specify workspace paths manually via `--server`. -The dependency-controller manages its own API type (`DependencyRule`) via a kcp -APIExport. 
You need to create a workspace for it and apply the schema and export -definitions. +## Step 2: Create the dep-ctrl workspace and apply schemas ```sh +export KUBECONFIG=/tmp/kcp-admin.kubeconfig + # Create the dep-ctrl workspace kubectl ws root kubectl ws create dep-ctrl --enter @@ -118,52 +284,33 @@ access another workspace's resources -- instead, the APIExport's acts as a proxy. `permissionClaims` tell kcp which resource types the APIExport provider is allowed to manage in binding workspaces via that proxy. Provider workspaces must explicitly accept these claims when creating their APIBinding -(covered in [Step 5](#step-5-onboard-a-provider)). +(covered in [Step 6](#step-6-onboard-a-provider)). -## Step 2: Bootstrap RBAC in kcp +## Step 3: Bootstrap RBAC in kcp Both components run with dedicated service account identities. They need -specific permissions in several kcp locations, applied once using a privileged -identity (e.g., `system:masters` via a bootstrap certificate). +specific permissions in several kcp locations, applied once using the admin +kubeconfig. ### Root workspace RBAC -Both components need `workspaces/content` access to enter child workspaces -(this is how kcp authorizes traversing the workspace hierarchy). The controller -additionally needs to read `Workspace` objects to resolve workspace paths -(like `root:network-provider`) to the logical cluster names that the virtual -workspace requires. +The **controller** needs `workspaces/content` access to enter child workspaces +(this is how kcp authorizes traversing the workspace hierarchy) and +`workspaces` read access to resolve paths like `root:network-provider` to the +logical cluster names the virtual workspace requires. ```sh kubectl ws root kubectl apply -f test/fixtures/root-rbac-bootstrap.yaml ``` -The file -([`test/fixtures/root-rbac-bootstrap.yaml`](../test/fixtures/root-rbac-bootstrap.yaml)) -creates: - -| Resource | Name | Purpose | -|---|---|---| -| ClusterRole | `dependency-controller` | `workspaces/content` access + `workspaces` read (get/list/watch) | -| ClusterRoleBinding | `dependency-controller` | Binds to controller SA | -| ClusterRole | `dependency-controller-webhook` | `workspaces/content` access only | -| ClusterRoleBinding | `dependency-controller-webhook` | Binds to webhook SA | - -**Why `workspaces/content`?** Without it, kcp rejects any request to a child -workspace's API, including virtual workspace endpoints. Both components access -the dep-ctrl workspace where their kubeconfig points. - -**Why `workspaces` read for the controller?** The virtual workspace accepts -logical cluster names (opaque IDs like `qh6707jkfsen31z9`), not workspace paths -(`root:network-provider`). The controller reads `Workspace` objects from root -to map paths to cluster names via `workspace.spec.cluster`. +(The webhook does **not** get RBAC here — its broad read is granted in +`system:admin` per shard, see below.) ### Dep-ctrl workspace RBAC -The controller needs access to the dep-ctrl APIExport's virtual workspace. -This is authorized by the `apiexports/content` subresource in the workspace -where the APIExport is defined. 
+Both components need access to the dep-ctrl APIExport's virtual workspace and +`apiexportendpointslices` for VW URL discovery: ```sh kubectl ws root:dep-ctrl @@ -172,58 +319,83 @@ kubectl apply -f test/fixtures/depctrl-rbac-bootstrap.yaml The file ([`test/fixtures/depctrl-rbac-bootstrap.yaml`](../test/fixtures/depctrl-rbac-bootstrap.yaml)) -creates: +grants both the controller and webhook SAs: -| Resource | Name | Permissions | -|---|---|---| -| ClusterRole | `dependency-controller` | `apiexportendpointslices` read + `apiexports/content` full CRUD | -| ClusterRoleBinding | `dependency-controller` | Binds to controller SA | +- `apiexportendpointslices` read access (for VW URL discovery at startup) +- `apiexports/content` full CRUD (for managing resources through the VW) -**Why `apiexportendpointslices`?** The controller discovers the virtual -workspace URL by reading the `APIExportEndpointSlice` for the -`dependencies.opendefense.cloud` APIExport. This URL is the entry point for -all operations through the virtual workspace. +### Webhook RBAC in `system:admin` (per shard) -**Why `apiexports/content` with full CRUD?** The controller writes resources -(webhooks) in binding workspaces through the virtual workspace. The verb on -`apiexports/content` controls what operations are allowed through the VW -- -read-only access would block the controller from creating anything. +The webhook queries dependent resources directly in each consumer workspace, +and consumer workspaces can live on any shard. kcp's +`BootstrapPolicyAuthorizer` reads RBAC from the local shard's `system:admin` +logical cluster only — bindings do **not** propagate across shards — so the +webhook needs a `ClusterRole` + `ClusterRoleBinding` in `system:admin` of +**every shard** that hosts consumer workspaces. -### Shard-wide RBAC via system:admin +`system:admin` is intentionally not reachable through the front-proxy. To +apply RBAC there you need: -The webhook needs read access to `apiexports/content` and -`apiexportendpointslices` across **all** workspaces on the shard. Rather than -creating per-workspace RBAC, this is granted once in kcp's `system:admin` -logical cluster. The Bootstrap Policy Authorizer evaluates `system:admin` RBAC -for every request on the shard. +1. A kubeconfig issued via a kcp-operator `Kubeconfig` CR with + `target.rootShardRef` (or `target.shardRef` for secondary shards) and + `groups: ["system:masters"]` — `system:masters` is honored by the shard + directly but not by the front-proxy. +2. Direct shard access. The shard's ClusterIP service is normally not exposed; + `kubectl port-forward svc/` to localhost is the simplest path. + Make sure the shard's server certificate has `localhost` / `127.0.0.1` in + its SANs (kcp-operator's `RootShard` / `Shard` `certificateTemplates`). -This must be applied via the **kcp server** (not the front-proxy), using a -`system:masters` identity: +Example (root shard): ```sh -kubectl --server=https://:6443/clusters/system:admin \ - apply -f test/fixtures/shard-admin-rbac-bootstrap.yaml -``` +# 1. 
Generate a system:masters kubeconfig for the root shard +kubectl apply -f - <<'EOF' +apiVersion: operator.kcp.io/v1alpha1 +kind: Kubeconfig +metadata: + name: root-system-masters + namespace: kcp-system +spec: + username: bootstrap-system-masters + groups: + - "system:masters" + validity: 8766h + secretRef: + name: root-system-masters-kubeconfig + target: + rootShardRef: + name: root +EOF -The file -([`test/fixtures/shard-admin-rbac-bootstrap.yaml`](../test/fixtures/shard-admin-rbac-bootstrap.yaml)) -creates: +kubectl -n kcp-system wait secret root-system-masters-kubeconfig \ + --for=jsonpath='{.data.kubeconfig}' --timeout=120s -| Resource | Name | Permissions | -|---|---|---| -| ClusterRole | `dependency-controller-webhook-apiexport-reader` | `apiexports/content` + `apiexportendpointslices` read | -| ClusterRoleBinding | `dependency-controller-webhook-apiexport-reader` | Binds to webhook SA | +kubectl -n kcp-system get secret root-system-masters-kubeconfig \ + -o jsonpath='{.data.kubeconfig}' | base64 -d > /tmp/root-masters.kubeconfig +# Rewrite the server URL in /tmp/root-masters.kubeconfig to https://localhost:6443 +# (or any free port you pick for the port-forward). -**Why shard-wide?** The webhook watches DependencyRules via the dep-ctrl -APIExport VW, and reads dependent resources (e.g., VirtualMachines) via other -providers' APIExport VWs. These VWs span many workspaces. Granting read access -at the shard level avoids the need for per-workspace RBAC management and -removes the need for `clusterroles`/`clusterrolebindings` permissionClaims on -the APIExport. +# 2. Port-forward the root shard service +kubectl -n kcp-system port-forward svc/root-kcp 6443:6443 & +PF_PID=$! -**Why `system:admin`?** This is a shard-local logical cluster (not a workspace) -where the Bootstrap Policy Authorizer evaluates RBAC for every request on the -shard. It is accessed via the kcp server directly, not the front-proxy. +# 3. Apply the binding in /clusters/system:admin +kubectl --kubeconfig /tmp/root-masters.kubeconfig \ + --server https://localhost:6443/clusters/system:admin \ + apply --validate=false -f test/fixtures/system-admin-rbac-bootstrap.yaml + +# 4. Stop the port-forward +kill $PF_PID +``` + +Repeat for every additional shard, swapping `rootShardRef` for +`shardRef: {name: }` in the `Kubeconfig` CR and pointing the +port-forward at that shard's service (e.g., `svc/shard1-shard-kcp`). + +The fixture +([`test/fixtures/system-admin-rbac-bootstrap.yaml`](../test/fixtures/system-admin-rbac-bootstrap.yaml)) +grants the webhook SA `*/*` `get`/`list`. Within a shard, this binding +applies to every workspace on that shard — no per-consumer RBAC is needed. ### Adjusting service account names @@ -233,37 +405,95 @@ The bootstrap RBAC files bind to these default identities: If your deployment uses different service account names or namespaces, edit the `subjects` in the `ClusterRoleBinding` resources before applying. The names must -match what you configure in the Helm values (Step 3). +match what you configure in the Helm values (Step 5). -## Step 3: Create kubeconfigs for the components +## Step 4: Create kubeconfigs for the components Each component needs a kubeconfig that authenticates as its service account -identity and targets the dep-ctrl workspace. 
How you create these depends on -your kcp setup: - -- **Client certificates**: create certificates with the appropriate CN - (e.g., `system:serviceaccount:dependency-system:dependency-controller`) - signed by kcp's CA -- **Service account tokens**: if kcp supports token-based auth, create tokens - bound to the service accounts +identity. Use kcp-operator `Kubeconfig` CRs to generate client certificates: -The kubeconfig server URL should point to kcp with the dep-ctrl workspace path: +```sh +# Controller kubeconfig +kubectl apply -f - <<'EOF' +apiVersion: operator.kcp.io/v1alpha1 +kind: Kubeconfig +metadata: + name: controller-kubeconfig + namespace: kcp-system +spec: + username: "system:serviceaccount:dependency-system:dependency-controller" + groups: + - "system:authenticated" + - "system:serviceaccounts" + - "system:serviceaccounts:dependency-system" + validity: 8766h + secretRef: + name: controller-kubeconfig + target: + rootShardRef: + name: root +EOF +# Webhook kubeconfig +kubectl apply -f - <<'EOF' +apiVersion: operator.kcp.io/v1alpha1 +kind: Kubeconfig +metadata: + name: webhook-kubeconfig + namespace: kcp-system +spec: + username: "system:serviceaccount:dependency-system:dependency-webhook" + groups: + - "system:authenticated" + - "system:serviceaccounts" + - "system:serviceaccounts:dependency-system" + validity: 8766h + secretRef: + name: webhook-kubeconfig + target: + rootShardRef: + name: root +EOF ``` -https://:6443/clusters/root:dep-ctrl -``` -Store the kubeconfigs as Kubernetes secrets in the management cluster: +**Important notes about kcp-operator Kubeconfig CRs:** + +- **`rootShardRef` (not `frontProxyRef`)**: This generates client certificates + signed by `root-client-ca`, which is trusted by both the front-proxy (via + `kcp-merged-client-ca`) and all shards directly. This is required because the + multicluster-provider connects to APIExport virtual workspace URLs that point + directly at shards, not through the front-proxy. + +- **`system:authenticated` group**: kcp's front-proxy uses request header + impersonation. Unlike direct Kubernetes authentication, impersonated + identities do not automatically get the `system:authenticated` group -- it + must be listed explicitly. + +- **Two contexts**: kcp-operator generates kubeconfigs with two contexts: + `base` (bare shard URL) and `default` (shard URL + `/clusters/root`). The + component kubeconfigs should be rewritten to point at the front-proxy with + the dep-ctrl workspace path (`/clusters/root:dep-ctrl`) before mounting + them as secrets. + +Extract and rewrite the kubeconfigs: ```sh +# Extract the controller kubeconfig +kubectl -n kcp-system get secret controller-kubeconfig \ + -o jsonpath='{.data.kubeconfig}' | base64 -d > /tmp/controller.kubeconfig + +# Rewrite the server URL from the shard URL to the front-proxy + workspace path +# (replace the shard URL with https://:/clusters/root:dep-ctrl) + +# Store as Kubernetes secrets kubectl -n dependency-system create secret generic kcp-controller-kubeconfig \ - --from-file=kubeconfig=/path/to/controller.kubeconfig + --from-file=kubeconfig=/tmp/controller.kubeconfig kubectl -n dependency-system create secret generic kcp-webhook-kubeconfig \ - --from-file=kubeconfig=/path/to/webhook.kubeconfig + --from-file=kubeconfig=/tmp/webhook.kubeconfig ``` -## Step 4: Deploy with Helm +## Step 5: Deploy with Helm The Helm chart deploys both the controller and webhook as separate Deployments in a single release. 
The controller automatically discovers the webhook's @@ -272,7 +502,6 @@ service URL and TLS CA from the co-deployed resources. ```sh helm install dep-ctrl charts/dependency-controller \ --namespace dependency-system --create-namespace \ - --set kcpBaseHost=https://:6443 \ --set controller.kubeconfig.secretName=kcp-controller-kubeconfig \ --set webhook.kubeconfig.secretName=kcp-webhook-kubeconfig \ --set webhook.tls.certManager.issuerRef.name=selfsigned @@ -282,11 +511,15 @@ Key values: | Value | Purpose | |---|---| -| `kcpBaseHost` | Root kcp URL (no workspace path). Used by the controller to resolve workspace paths and by the webhook to discover APIExport VW URLs. | -| `controller.kubeconfig.secretName` | Secret containing the controller's kubeconfig (from Step 3). | -| `webhook.kubeconfig.secretName` | Secret containing the webhook's kubeconfig (from Step 3). | +| `controller.kubeconfig.secretName` | Secret containing the controller's kubeconfig (from Step 4). | +| `webhook.kubeconfig.secretName` | Secret containing the webhook's kubeconfig (from Step 4). | | `webhook.tls.certManager.issuerRef.name` | cert-manager issuer for the webhook's TLS certificate. | +Both components automatically derive the kcp front-proxy base URL from their +kubeconfig by stripping the `/clusters/...` workspace path suffix. No +additional base URL flag is needed (though `kcpBaseHost` can be set to +override this if your kubeconfig host doesn't follow the standard pattern). + See [`charts/dependency-controller/values.yaml`](../charts/dependency-controller/values.yaml) for all available options. @@ -303,14 +536,14 @@ The webhook pod's readiness probe only passes once it has populated its rule registry (listed all existing DependencyRules). On first deploy with no rules, this is near-instant. -## Step 5: Onboard a provider +## Step 6: Onboard a provider This example uses two providers -- a network provider (exports VPCs) and a compute provider (exports VirtualMachines). The compute provider will create a DependencyRule saying "VMs depend on VPCs", which blocks VPC deletion while any VM references it. -### 5a. Create provider workspaces and APIExports +### 6a. Create provider workspaces and APIExports ```sh # Network provider @@ -326,7 +559,7 @@ kubectl apply -f test/fixtures/apiresourceschema-virtualmachines.yaml kubectl apply -f test/fixtures/apiexport-compute.test.io.yaml ``` -### 5b. Bind providers to the dep-ctrl APIExport +### 6b. Bind providers to the dep-ctrl APIExport **Every provider workspace referenced in a DependencyRule must bind to the dep-ctrl APIExport and accept the permissionClaims.** This applies to both @@ -397,10 +630,8 @@ A reference fixture is available at The accepted permissionClaim grants the controller permission to create `ValidatingWebhookConfigurations` in the workspace (for deletion protection). -The webhook's read access to provider APIExports is handled shard-wide via the -`system:admin` bootstrap RBAC (Step 2) -- no per-workspace RBAC is needed. -### 5c. Create a DependencyRule +### 6c. Create a DependencyRule The compute provider declares that VirtualMachines depend on VPCs: @@ -417,6 +648,7 @@ spec: group: compute.test.io version: v1 kind: VirtualMachine + resource: virtualmachines dependencies: - apiExportRef: path: root:network-provider @@ -433,14 +665,11 @@ A reference fixture is available at [`test/fixtures/dependencyrule-vm-dependencies.yaml`](../test/fixtures/dependencyrule-vm-dependencies.yaml). 
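To make the rule's runtime effect concrete, here is a minimal sketch of the check the webhook performs when a VPC deletion comes in. It assumes `frontProxyCfg` is a `rest.Config` for the kcp front-proxy without a `/clusters/...` path and that the rule above matches on `.spec.vpcRef.name`; the function name `findBlockingVMs` is illustrative, not the webhook's actual API:

```go
package example

import (
	"context"
	"fmt"
	"strings"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/rest"
)

// findBlockingVMs lists VirtualMachines in one consumer workspace and returns
// the names of those whose .spec.vpcRef.name references the VPC being
// deleted. clusterName is the consumer workspace's logical cluster name,
// which the webhook reads from the kcp.io/cluster annotation on the object.
func findBlockingVMs(ctx context.Context, frontProxyCfg *rest.Config, clusterName, namespace, vpcName string) ([]string, error) {
	// Scope a client to the consumer workspace via the front-proxy.
	cfg := rest.CopyConfig(frontProxyCfg)
	cfg.Host = fmt.Sprintf("%s/clusters/%s", strings.TrimRight(cfg.Host, "/"), clusterName)

	dyn, err := dynamic.NewForConfig(cfg)
	if err != nil {
		return nil, err
	}

	// The GVR comes from the rule's dependent group/version/resource fields.
	vmGVR := schema.GroupVersionResource{Group: "compute.test.io", Version: "v1", Resource: "virtualmachines"}

	vms, err := dyn.Resource(vmGVR).Namespace(namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}

	var blockers []string
	for _, vm := range vms.Items {
		// Walk the rule's fieldRef.path and compare against the deleted name.
		if name, _, _ := unstructured.NestedString(vm.Object, "spec", "vpcRef", "name"); name == vpcName {
			blockers = append(blockers, vm.GetName())
		}
	}

	return blockers, nil
}
```

Any non-empty result causes the admission request to be denied; deleting the blocking VM first (as in Step 6e) clears the list.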
Once applied, the controller will install a `ValidatingWebhookConfiguration` -in `root:network-provider` (protecting VPC deletions). +in `root:network-provider` (protecting VPC deletions). The webhook registers +the rule's metadata in its registry and begins serving admission requests +for VPC deletions. -The webhook will: -1. Start an indexed cache watching VirtualMachines via the compute APIExport VW - (authorized by the shard-wide `system:admin` RBAC from Step 2) -2. Begin serving admission requests for VPC deletions - -### 5d. Create a consumer workspace and test resources +### 6d. Create a consumer workspace and test resources ```sh kubectl ws root @@ -497,7 +726,7 @@ spec: EOF ``` -### 5e. Verify deletion protection +### 6e. Verify deletion protection ```sh # This should be denied: @@ -513,7 +742,7 @@ kubectl delete vpc my-vpc -n default ## How the pieces fit together -Here's the flow that makes Step 5e work: +Here's the flow that makes Step 6e work: 1. The **controller** watches DependencyRules via the dep-ctrl APIExport's virtual workspace @@ -523,13 +752,44 @@ Here's the flow that makes Step 5e work: `/clusters/` and creates a `ValidatingWebhookConfiguration` in the network-provider workspace (authorized by the accepted `validatingwebhookconfigurations` permissionClaim) -4. The **webhook** also watches the same DependencyRule and starts an indexed - cache watching VirtualMachines via compute.test.io's VW (authorized by the - shard-wide `system:admin` RBAC from Step 2) +4. The **webhook** also watches the same DependencyRule and registers the rule's + metadata (dependent GVR, field paths, target GVR) in its in-memory registry 5. When a consumer deletes a VPC, kcp dispatches the DELETE to the webhook (via the installed `ValidatingWebhookConfiguration`) -6. The webhook queries its indexed cache: "any VMs where `.spec.vpcRef.name` - equals `my-vpc`?" -- finds `my-vm` and denies the deletion +6. The webhook extracts the logical cluster name from the `kcp.io/cluster` + annotation on the object, constructs a temporary dynamic client scoped to + `{frontProxy}/clusters/{clusterName}`, and lists VirtualMachines in that + namespace +7. It filters by field path: "any VMs where `.spec.vpcRef.name` equals + `my-vpc`?" -- finds `my-vm` and denies the deletion + +## Multi-shard considerations + +The dependency-controller is multi-shard aware. The runtime data path needs +no per-shard configuration — but the **bootstrap RBAC does**: + +- **Webhook installation**: The controller installs webhooks via the dep-ctrl + APIExport's virtual workspace, which automatically routes to the correct + shard for each provider workspace. No changes per shard. + +- **Per-request queries**: The webhook queries dependent resources via the + kcp front-proxy, which transparently routes requests to the correct shard + based on the logical cluster name. No webhook-side configuration changes + per shard. + +- **`system:admin` RBAC must be applied per shard.** kcp's + `BootstrapPolicyAuthorizer` reads RBAC from the local shard's `system:admin` + logical cluster only; bindings do not propagate across shards. Each new + shard that hosts consumer workspaces needs the webhook's wildcard read + binding applied via the procedure in [Step 3](#webhook-rbac-in-systemadmin-per-shard). + +- **Root and dep-ctrl workspace RBAC are applied once.** Those workspaces + live on the root shard and the bindings there are not duplicated. 
+ +- **Kubeconfigs**: Component kubeconfigs must use certificates signed by + `root-client-ca` (via `rootShardRef` in the Kubeconfig CR). This CA is + trusted by both the front-proxy and all shards, which is required because + VW URLs point directly at shards. ## Troubleshooting @@ -543,11 +803,19 @@ kubectl -n dependency-system logs -l app.kubernetes.io/component=webhook ``` Common issues: + - **Kubeconfig invalid** -- the webhook can't reach kcp -- **Missing system:admin RBAC** -- the webhook SA needs shard-wide - `apiexports/content` and `apiexportendpointslices` read access (Step 2) -- **Missing root RBAC** -- the webhook SA needs `workspaces/content` in root - (Step 2) +- **Missing dep-ctrl workspace RBAC** -- ensure + [Step 3](#dep-ctrl-workspace-rbac) was applied +- **Missing `system:admin` RBAC on a shard** -- if the webhook can list on some + consumer workspaces but not others, the failing ones are likely on a shard + where the `system:admin` binding from + [Step 3](#webhook-rbac-in-systemadmin-per-shard) was never applied. + Verify with: `kubectl auth can-i list + --as=system:serviceaccount:dependency-system:dependency-webhook` + against the consumer workspace path. +- **Certificate not trusted by shards** -- ensure kubeconfigs use `rootShardRef` + (not `frontProxyRef`) so certs are signed by `root-client-ca` ### Webhook not blocking deletions @@ -567,8 +835,8 @@ kubectl get apibinding dependencies.opendefense.cloud -o jsonpath='{.status.phas ### Controller can't create webhooks Check that the provider workspace's APIBinding has accepted the -permissionClaims (Step 5b). Without acceptance, the VW rejects write -operations: +`validatingwebhookconfigurations` permissionClaim (Step 6b). Without +acceptance, the VW rejects write operations: ```sh kubectl ws root:network-provider @@ -578,7 +846,7 @@ kubectl get apibinding dependencies.opendefense.cloud -o yaml ### Force-deleting a protected resource -If the webhook is down or caches are stale, annotate the resource to bypass +If the webhook is down or rules are stale, annotate the resource to bypass protection: ```sh diff --git a/internal/controller/dependencyrule_controller.go b/internal/controller/dependencyrule_controller.go index 41699fa..8e948ed 100644 --- a/internal/controller/dependencyrule_controller.go +++ b/internal/controller/dependencyrule_controller.go @@ -10,7 +10,6 @@ import ( "sync" "github.com/kcp-dev/logicalcluster/v3" - apisv1alpha1 "github.com/kcp-dev/sdk/apis/apis/v1alpha1" tenancyv1alpha1 "github.com/kcp-dev/sdk/apis/tenancy/v1alpha1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/rest" @@ -35,18 +34,13 @@ type DependencyRuleReconciler struct { // workspaces. Nil if webhook installation is not configured. WebhookInstaller *WebhookInstaller - // APIExportName is the name of the dep-ctrl APIExport, used to discover - // the virtual workspace URL from the APIExportEndpointSlice. - APIExportName string - - // BaseConfig is the root kcp REST config (no workspace path suffix). - // Used to resolve workspace paths to logical cluster names by listing - // Workspace objects in the root workspace. + // BaseConfig is the kcp front-proxy REST config (no workspace path suffix). + // Used for webhook installation (routed to the correct shard per workspace) + // and to resolve workspace paths to logical cluster names. BaseConfig *rest.Config - // mu protects lazy initialization of vwBaseConfig and wsResolver. + // mu protects lazy initialization of wsResolver. 
mu sync.Mutex - vwBaseCfg *rest.Config wsResolver *workspaceResolver } @@ -56,44 +50,23 @@ func NewDependencyRuleReconciler(depCtrlMgr mcmanager.Manager) *DependencyRuleRe } } -// ensureInitialized lazily discovers the VW base URL and creates a workspace -// resolver. Must be called before using WebhookInstaller. -func (r *DependencyRuleReconciler) ensureInitialized(ctx context.Context) error { +// ensureInitialized lazily creates the workspace resolver used to map +// workspace paths to logical cluster names for webhook installation. +func (r *DependencyRuleReconciler) ensureInitialized() error { r.mu.Lock() defer r.mu.Unlock() - if r.vwBaseCfg != nil { + if r.wsResolver != nil { return nil } localMgr := r.DepCtrlManager.GetLocalManager() - localCfg := localMgr.GetConfig() - - directClient, err := client.New(localCfg, client.Options{Scheme: localMgr.GetScheme()}) - if err != nil { - return fmt.Errorf("creating client for VW URL discovery: %w", err) - } - - var ess apisv1alpha1.APIExportEndpointSlice - if err := directClient.Get(ctx, client.ObjectKey{Name: r.APIExportName}, &ess); err != nil { - return fmt.Errorf("getting APIExportEndpointSlice %s: %w", r.APIExportName, err) - } - - if len(ess.Status.APIExportEndpoints) == 0 { - return fmt.Errorf("APIExportEndpointSlice %s has no endpoints", r.APIExportName) - } - vwURL := ess.Status.APIExportEndpoints[0].URL - r.vwBaseCfg = rest.CopyConfig(localCfg) - r.vwBaseCfg.Host = vwURL - - // Create workspace resolver using the base kcp config. + // Create workspace resolver using the base kcp config (front-proxy URL). rootCfg := rest.CopyConfig(r.BaseConfig) rootCfg.Host += logicalcluster.NewPath("root").RequestPath() r.wsResolver = &workspaceResolver{rootCfg: rootCfg, scheme: localMgr.GetScheme()} - log.FromContext(ctx).Info("discovered VW base URL for webhook operations", "url", vwURL) - return nil } @@ -102,8 +75,8 @@ func (r *DependencyRuleReconciler) ensureInitialized(ctx context.Context) error func (r *DependencyRuleReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { logger := log.FromContext(ctx).WithValues("rule", req.Name, "cluster", req.ClusterName) - if err := r.ensureInitialized(ctx); err != nil { - logger.Error(err, "failed to initialize VW config") + if err := r.ensureInitialized(); err != nil { + logger.Error(err, "failed to initialize workspace resolver") return ctrl.Result{}, err } @@ -142,22 +115,25 @@ func (r *DependencyRuleReconciler) Reconcile(ctx context.Context, req mcreconcil return ctrl.Result{}, nil } -// ensureWebhooks installs webhooks in dependency provider workspaces via the VW. +// ensureWebhooks installs webhooks in dependency provider workspaces via the virtual workspace. func (r *DependencyRuleReconciler) ensureWebhooks(ctx context.Context, ruleKey string, rule *v1alpha1.DependencyRule) error { - // Replace workspace paths with logical cluster names for the installer. - resolvedRule := rule.DeepCopy() - for i := range resolvedRule.Spec.Dependencies { - wsPath := resolvedRule.Spec.Dependencies[i].APIExportRef.Path + // Build a mapping from workspace path to logical cluster name. 
+ clusterNames := make(map[string]string, len(rule.Spec.Dependencies)) + for _, dep := range rule.Spec.Dependencies { + wsPath := dep.APIExportRef.Path + if _, ok := clusterNames[wsPath]; ok { + continue + } + clusterName, err := r.wsResolver.resolve(wsPath) if err != nil { return fmt.Errorf("resolving %s: %w", wsPath, err) } - resolvedRule.Spec.Dependencies[i].APIExportRef.Path = clusterName - } - r.WebhookInstaller.BaseConfig = r.vwBaseCfg + clusterNames[wsPath] = clusterName + } - return r.WebhookInstaller.EnsureWebhooks(ctx, ruleKey, resolvedRule) + return r.WebhookInstaller.EnsureWebhooks(ctx, ruleKey, rule, clusterNames) } // collectWorkspacePaths extracts dependency workspace paths referenced in a DependencyRule. diff --git a/internal/controller/integration_test.go b/internal/controller/integration_test.go index f52f6e6..8d6531b 100644 --- a/internal/controller/integration_test.go +++ b/internal/controller/integration_test.go @@ -194,7 +194,6 @@ var _ = Describe("Dependency Controller", Ordered, func() { cacheMgr := &depwebhook.RuleCacheManager{ DepCtrlManager: mgr, - BaseConfig: kcpConfig, Scheme: scheme.Scheme, APIExportName: "dependencies.opendefense.cloud", Registry: registry, @@ -219,10 +218,9 @@ var _ = Describe("Dependency Controller", Ordered, func() { Expect(err).NotTo(HaveOccurred()) // Create the controller-side reconciler (webhook install only, no RBAC in envtest). - webhookInstaller := controller.NewWebhookInstaller(kcpConfig, webhookURL, caBundle) + webhookInstaller := controller.NewWebhookInstaller(mgr, webhookURL, caBundle) ruleReconciler := controller.NewDependencyRuleReconciler(mgr) - ruleReconciler.APIExportName = "dependencies.opendefense.cloud" ruleReconciler.BaseConfig = kcpConfig ruleReconciler.WebhookInstaller = webhookInstaller @@ -233,7 +231,7 @@ var _ = Describe("Dependency Controller", Ordered, func() { Expect(err).NotTo(HaveOccurred()) // Wire up the webhook handler with the rule registry. - webhookHandler = &depwebhook.DeletionValidator{Registry: registry, Initialized: initialized} + webhookHandler = &depwebhook.DeletionValidator{Registry: registry, Initialized: initialized, BaseConfig: kcpConfig} // Start the manager. go func() { diff --git a/internal/controller/webhook_installer.go b/internal/controller/webhook_installer.go index c8f2dc3..cd38551 100644 --- a/internal/controller/webhook_installer.go +++ b/internal/controller/webhook_installer.go @@ -8,15 +8,13 @@ import ( "fmt" "sync" - "github.com/kcp-dev/logicalcluster/v3" registrationv1 "k8s.io/api/admissionregistration/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes/scheme" - "k8s.io/client-go/rest" - "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" v1alpha1 "go.opendefense.cloud/dependency-controller/api/v1alpha1" ) @@ -35,7 +33,11 @@ const webhookName = "dependency-controller" // When a DependencyRule is deleted, its contributions are removed. If no rules // remain for a workspace the webhook is deleted entirely. type WebhookInstaller struct { - BaseConfig *rest.Config + // Manager is the multicluster manager for the dep-ctrl APIExport. 
Used to + // obtain per-workspace clients via the virtual workspace, which scopes + // access to workspaces that have bound the APIExport and accepted the + // validatingwebhookconfigurations permissionClaim. + Manager mcmanager.Manager WebhookURL string CABundle []byte @@ -56,9 +58,9 @@ type webhookRuleKey struct { Resource string } -func NewWebhookInstaller(baseCfg *rest.Config, webhookURL string, caBundle []byte) *WebhookInstaller { +func NewWebhookInstaller(mgr mcmanager.Manager, webhookURL string, caBundle []byte) *WebhookInstaller { return &WebhookInstaller{ - BaseConfig: baseCfg, + Manager: mgr, WebhookURL: webhookURL, CABundle: caBundle, ruleTargets: make(map[string][]ruleTarget), @@ -66,17 +68,19 @@ func NewWebhookInstaller(baseCfg *rest.Config, webhookURL string, caBundle []byt } // EnsureWebhooks installs or updates ValidatingWebhookConfigurations for all -// dependency targets in the given rule. -func (w *WebhookInstaller) EnsureWebhooks(ctx context.Context, ruleKey string, rule *v1alpha1.DependencyRule) error { - // Group targets by provider workspace so we do one update per workspace. - byWorkspace := make(map[string][]v1alpha1.DependencyTarget) +// dependency targets in the given rule. The clusterNames map translates +// workspace paths (from APIExportRef.Path) to logical cluster names used +// to connect via the virtual workspace. +func (w *WebhookInstaller) EnsureWebhooks(ctx context.Context, ruleKey string, rule *v1alpha1.DependencyRule, clusterNames map[string]string) error { + // Group targets by logical cluster name so we do one update per workspace. + byCluster := make(map[string][]v1alpha1.DependencyTarget) for _, dep := range rule.Spec.Dependencies { - wsPath := dep.APIExportRef.Path - byWorkspace[wsPath] = append(byWorkspace[wsPath], dep) + clusterName := clusterNames[dep.APIExportRef.Path] + byCluster[clusterName] = append(byCluster[clusterName], dep) } - for wsPath, deps := range byWorkspace { - if err := w.ensureWebhookForWorkspace(ctx, ruleKey, wsPath, deps); err != nil { + for clusterName, deps := range byCluster { + if err := w.ensureWebhookForWorkspace(ctx, ruleKey, clusterName, deps); err != nil { return err } } @@ -151,13 +155,11 @@ func (w *WebhookInstaller) reconcileWorkspaceWebhook(ctx context.Context, wsPath // Compute desired rules for this workspace across all DependencyRules. desired := w.desiredRulesForWorkspace(wsPath) - cfg := rest.CopyConfig(w.BaseConfig) - cfg.Host += logicalcluster.NewPath(wsPath).RequestPath() - - c, err := client.New(cfg, client.Options{Scheme: scheme.Scheme}) + cl, err := w.Manager.GetCluster(ctx, multicluster.ClusterName(wsPath)) if err != nil { - return fmt.Errorf("creating client for %s: %w", wsPath, err) + return fmt.Errorf("getting cluster %s from manager: %w", wsPath, err) } + c := cl.GetClient() whCfg := ®istrationv1.ValidatingWebhookConfiguration{} err = c.Get(ctx, types.NamespacedName{Name: webhookName}, whCfg) diff --git a/internal/kcp/endpointslice.go b/internal/kcp/endpointslice.go new file mode 100644 index 0000000..b833d36 --- /dev/null +++ b/internal/kcp/endpointslice.go @@ -0,0 +1,31 @@ +// Copyright 2026 BWI GmbH and Dependency Controller contributors +// SPDX-License-Identifier: Apache-2.0 + +package kcp + +import ( + "context" + "fmt" + + apisv1alpha1 "github.com/kcp-dev/sdk/apis/apis/v1alpha1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// FindEndpointSlice finds the APIExportEndpointSlice whose spec.export.name +// matches the given APIExport name. 
The slice name is not guaranteed to match +// the APIExport name — service providers can create custom slices with +// different names. +func FindEndpointSlice(ctx context.Context, c client.Reader, apiExportName string) (*apisv1alpha1.APIExportEndpointSlice, error) { + var list apisv1alpha1.APIExportEndpointSliceList + if err := c.List(ctx, &list); err != nil { + return nil, fmt.Errorf("listing APIExportEndpointSlices: %w", err) + } + + for i := range list.Items { + if list.Items[i].Spec.APIExport.Name == apiExportName { + return &list.Items[i], nil + } + } + + return nil, fmt.Errorf("no APIExportEndpointSlice found for APIExport %q", apiExportName) +} diff --git a/internal/kcp/kubeconfig.go b/internal/kcp/kubeconfig.go new file mode 100644 index 0000000..932d354 --- /dev/null +++ b/internal/kcp/kubeconfig.go @@ -0,0 +1,45 @@ +// Copyright 2026 BWI GmbH and Dependency Controller contributors +// SPDX-License-Identifier: Apache-2.0 + +package kcp + +import ( + "errors" + "strings" + + "k8s.io/client-go/rest" +) + +// ValidateKubeconfig checks that the given rest.Config points to a kcp +// workspace (i.e. the host URL contains /clusters/). This catches the most +// common misconfiguration — pointing at a plain Kubernetes cluster or at kcp +// without a workspace path. +func ValidateKubeconfig(cfg *rest.Config) error { + if cfg == nil { + return errors.New("kubeconfig is nil") + } + + if !strings.Contains(cfg.Host, "/clusters/") { + return errors.New("kubeconfig host does not contain a /clusters/ workspace path — ensure the kubeconfig points to a kcp workspace (e.g. https://host/clusters/root:my-workspace)") + } + + return nil +} + +// BaseConfig returns a copy of the given rest.Config with the /clusters/... +// workspace path stripped from the host URL, leaving only the front-proxy +// base URL. This base URL can be used to construct workspace-scoped clients +// by appending /clusters/. +// +// Returns an error if the host URL does not contain a /clusters/ path. 
+func BaseConfig(cfg *rest.Config) (*rest.Config, error) { + idx := strings.Index(cfg.Host, "/clusters/") + if idx == -1 { + return nil, errors.New("kubeconfig host does not contain a /clusters/ workspace path — cannot derive front-proxy base URL") + } + + baseCfg := rest.CopyConfig(cfg) + baseCfg.Host = cfg.Host[:idx] + + return baseCfg, nil +} diff --git a/internal/kcp/kubeconfig_test.go b/internal/kcp/kubeconfig_test.go new file mode 100644 index 0000000..87d8891 --- /dev/null +++ b/internal/kcp/kubeconfig_test.go @@ -0,0 +1,107 @@ +// Copyright 2026 BWI GmbH and Dependency Controller contributors +// SPDX-License-Identifier: Apache-2.0 + +package kcp + +import ( + "testing" + + "k8s.io/client-go/rest" +) + +func TestValidateKubeconfig(t *testing.T) { + tests := []struct { + name string + cfg *rest.Config + wantErr bool + }{ + { + name: "nil config", + cfg: nil, + wantErr: true, + }, + { + name: "plain kubernetes URL", + cfg: &rest.Config{Host: "https://localhost:6443"}, + wantErr: true, + }, + { + name: "kcp URL without workspace path", + cfg: &rest.Config{Host: "https://kcp.example.com"}, + wantErr: true, + }, + { + name: "valid kcp workspace URL", + cfg: &rest.Config{Host: "https://kcp.example.com/clusters/root:dep-ctrl"}, + wantErr: false, + }, + { + name: "valid kcp root URL", + cfg: &rest.Config{Host: "https://kcp.example.com/clusters/root"}, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := ValidateKubeconfig(tt.cfg) + if (err != nil) != tt.wantErr { + t.Errorf("ValidateKubeconfig() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func TestBaseConfig(t *testing.T) { + tests := []struct { + name string + host string + wantHost string + wantErr bool + }{ + { + name: "no clusters path", + host: "https://localhost:6443", + wantErr: true, + }, + { + name: "strips workspace path", + host: "https://kcp.example.com/clusters/root:dep-ctrl", + wantHost: "https://kcp.example.com", + wantErr: false, + }, + { + name: "strips root path", + host: "https://kcp.example.com/clusters/root", + wantHost: "https://kcp.example.com", + wantErr: false, + }, + { + name: "preserves port", + host: "https://localhost:31443/clusters/root:dep-ctrl", + wantHost: "https://localhost:31443", + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg := &rest.Config{Host: tt.host} + baseCfg, err := BaseConfig(cfg) + if (err != nil) != tt.wantErr { + t.Errorf("BaseConfig() error = %v, wantErr %v", err, tt.wantErr) + return + } + if err != nil { + return + } + if baseCfg.Host != tt.wantHost { + t.Errorf("BaseConfig() host = %q, want %q", baseCfg.Host, tt.wantHost) + } + // Verify original config is not mutated. 
+ if cfg.Host != tt.host { + t.Errorf("original config mutated: host = %q, want %q", cfg.Host, tt.host) + } + }) + } +} diff --git a/internal/webhook/deletion_validator.go b/internal/webhook/deletion_validator.go index c553d14..1f7776a 100644 --- a/internal/webhook/deletion_validator.go +++ b/internal/webhook/deletion_validator.go @@ -11,12 +11,16 @@ import ( "strings" "github.com/kcp-dev/logicalcluster/v3" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" - "sigs.k8s.io/controller-runtime/pkg/client" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" - "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + "go.opendefense.cloud/dependency-controller/internal/fieldpath" ) const ( @@ -41,8 +45,9 @@ func ReadyzCheck(initialized <-chan struct{}) func(*http.Request) error { // DeletionValidator is a validating admission webhook handler that blocks // deletion of resources that have active dependents referencing them. // -// It queries indexed caches maintained by the RuleCacheManager's per-rule -// multicluster managers. No Dependency marker objects are needed. +// On each DELETE request it constructs a temporary dynamic client scoped to +// the consumer workspace (via the front-proxy) and lists dependent resources +// directly, filtering by field path to find references to the deleted resource. type DeletionValidator struct { Registry *RuleRegistry @@ -50,6 +55,11 @@ type DeletionValidator struct { // all existing DependencyRules. Until then, DELETE requests are denied // to prevent deletions slipping through before the registry is ready. Initialized <-chan struct{} + + // BaseConfig is the front-proxy REST config without a workspace path. + // Per-request clients are created by copying this config and setting + // Host to {base}/clusters/{logicalClusterName}. + BaseConfig *rest.Config } func (v *DeletionValidator) Handle(ctx context.Context, req admission.Request) admission.Response { @@ -100,40 +110,23 @@ func (v *DeletionValidator) Handle(ctx context.Context, req admission.Request) a return admission.Allowed("") } - // Query each matching rule's indexed cache for dependents referencing this resource. + // Build a dynamic client scoped to the consumer workspace. + dynClient, err := v.workspaceClient(clusterName.String()) + if err != nil { + logger.Error(err, "failed to create workspace client") + return admission.Errored(http.StatusInternalServerError, fmt.Errorf("creating workspace client: %w", err)) + } + + // Query each matching rule's dependent resources in the consumer workspace. var blockers []string for _, entry := range entries { - if !entry.State.IsReady() { - msg := fmt.Sprintf("dependency check unavailable for rule %s: cache warming up, retry later", entry.Key) - logger.Info(msg) - - return admission.Denied(msg) - } - - cluster, err := entry.State.Manager.GetCluster(ctx, multicluster.ClusterName(clusterName.String())) + dependents, err := v.listDependents(ctx, dynClient, entry, req.Name, req.Namespace) if err != nil { - // Cluster not known to this rule's manager — no dependents here. 
- logger.V(1).Info("cluster not found in rule manager, skipping", "rule", entry.Key, "cluster", clusterName) - continue + logger.Error(err, "failed to query dependents", "rule", entry.Key) + return admission.Errored(http.StatusInternalServerError, fmt.Errorf("checking dependencies for rule %s: %w", entry.Key, err)) } - // Query the field index for dependents referencing the deleted resource name. - // Scoped to the same namespace — cross-namespace references are not supported. - list := &unstructured.UnstructuredList{} - list.SetGroupVersionKind(entry.State.DependentGVK.GroupVersion().WithKind(entry.State.DependentGVK.Kind + "List")) - - err = cluster.GetCache().List(ctx, list, - client.MatchingFields{entry.MatchedField.FieldPath: req.Name}, - client.InNamespace(req.Namespace), - ) - if err != nil { - logger.Error(err, "failed to query indexed cache", "rule", entry.Key) - return admission.Errored(http.StatusInternalServerError, fmt.Errorf("failed to check dependencies for rule %s: %w", entry.Key, err)) - } - - for _, item := range list.Items { - blockers = append(blockers, fmt.Sprintf("%s/%s", entry.State.DependentGVK.Kind, item.GetName())) - } + blockers = append(blockers, dependents...) } if len(blockers) > 0 { @@ -147,6 +140,56 @@ func (v *DeletionValidator) Handle(ctx context.Context, req admission.Request) a return admission.Allowed("") } +// workspaceClient creates a dynamic client targeting a specific workspace +// through the front-proxy. +func (v *DeletionValidator) workspaceClient(clusterName string) (dynamic.Interface, error) { + cfg := rest.CopyConfig(v.BaseConfig) + cfg.Host = fmt.Sprintf("%s/clusters/%s", strings.TrimRight(cfg.Host, "/"), clusterName) + + return dynamic.NewForConfig(cfg) +} + +// listDependents lists dependent resources in the consumer workspace and +// returns the names of those that reference the deleted resource via the +// rule's field path. +func (v *DeletionValidator) listDependents( + ctx context.Context, + dynClient dynamic.Interface, + entry RuleEntry, + targetName, namespace string, +) ([]string, error) { + gvr := entry.State.DependentGVR + + var res dynamic.ResourceInterface + if namespace != "" { + res = dynClient.Resource(gvr).Namespace(namespace) + } else { + res = dynClient.Resource(gvr) + } + + list, err := res.List(ctx, metav1.ListOptions{}) + if err != nil { + // If the dependent resource type doesn't exist in this workspace + // (e.g., consumer hasn't bound the provider APIExport), there are + // no dependents — allow deletion. + if errors.IsNotFound(err) { + return nil, nil + } + + return nil, fmt.Errorf("listing %s: %w", gvr, err) + } + + var blockers []string + for _, item := range list.Items { + val := fieldpath.Resolve(item.Object, entry.MatchedField.FieldPath) + if val == targetName { + blockers = append(blockers, fmt.Sprintf("%s/%s", entry.State.DependentGVK.Kind, item.GetName())) + } + } + + return blockers, nil +} + // objectFromRequest extracts the unstructured object from the admission // request's OldObject (for DELETE) or Object (for other operations). 
func objectFromRequest(req admission.Request) (*unstructured.Unstructured, error) { diff --git a/internal/webhook/rule_cache_manager.go b/internal/webhook/rule_cache_manager.go index 65327c6..100cb22 100644 --- a/internal/webhook/rule_cache_manager.go +++ b/internal/webhook/rule_cache_manager.go @@ -8,39 +8,28 @@ import ( "fmt" "github.com/kcp-dev/logicalcluster/v3" - "github.com/kcp-dev/multicluster-provider/apiexport" - apisv1alpha1 "github.com/kcp-dev/sdk/apis/apis/v1alpha1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/client-go/rest" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/manager" - metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" - mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" - mccontroller "sigs.k8s.io/multicluster-runtime/pkg/controller" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" v1alpha1 "go.opendefense.cloud/dependency-controller/api/v1alpha1" - "go.opendefense.cloud/dependency-controller/internal/fieldpath" + "go.opendefense.cloud/dependency-controller/internal/kcp" ) -// RuleCacheManager reconciles DependencyRule objects and manages a dedicated -// indexed cache per rule. Each rule's dependent resource type (e.g., VirtualMachine) -// is watched through a multicluster manager connected to the provider's APIExport -// virtual workspace. Field indices on the dependent resources allow the -// DeletionValidator to efficiently query "which dependents reference resource X?". +// RuleCacheManager reconciles DependencyRule objects and populates the +// RuleRegistry with rule metadata. The DeletionValidator uses the registry +// to look up which rules protect a given resource type, then queries +// dependent resources directly per admission request. type RuleCacheManager struct { // DepCtrlManager is the multicluster manager for the dep-ctrl's APIExport. DepCtrlManager mcmanager.Manager - // BaseConfig is the root kcp REST config (no workspace path suffix). - BaseConfig *rest.Config - - // Scheme is the runtime scheme used when creating per-rule managers. + // Scheme is the runtime scheme used when creating clients. Scheme *runtime.Scheme // APIExportName is the name of the dep-ctrl APIExport (and its @@ -48,48 +37,49 @@ type RuleCacheManager struct { // during initial registry population. APIExportName string - // Registry holds the per-rule cache state queried by the DeletionValidator. + // Registry holds the rule metadata queried by the DeletionValidator. Registry *RuleRegistry } -// PopulateRegistry lists all existing DependencyRules from the APIExport virtual -// workspace and ensures an indexed cache exists for each one. This must be called -// after the manager has started (e.g., from a manager.Runnable) so that the -// APIExportEndpointSlice is available. +// PopulateRegistry lists all existing DependencyRules from every shard's +// APIExport virtual workspace and registers their metadata in the registry. +// This must be called after the manager has started (e.g., from a +// manager.Runnable) so that the APIExportEndpointSlice is available. 
func (m *RuleCacheManager) PopulateRegistry(ctx context.Context) error { logger := log.FromContext(ctx).WithName("rule-cache-manager") - vwClient, err := m.virtualWorkspaceClient(ctx) + vwClients, err := m.virtualWorkspaceClients(ctx) if err != nil { - return fmt.Errorf("creating virtual workspace client: %w", err) + return fmt.Errorf("creating virtual workspace clients: %w", err) } - var ruleList v1alpha1.DependencyRuleList - if err := vwClient.List(ctx, &ruleList); err != nil { - return fmt.Errorf("listing initial DependencyRules: %w", err) - } + var totalRules int + for _, vwClient := range vwClients { + var ruleList v1alpha1.DependencyRuleList + if err := vwClient.List(ctx, &ruleList); err != nil { + return fmt.Errorf("listing initial DependencyRules: %w", err) + } - logger.Info("populating rule registry", "ruleCount", len(ruleList.Items)) + totalRules += len(ruleList.Items) - for i := range ruleList.Items { - rule := &ruleList.Items[i] - clusterName := logicalcluster.From(rule) - key := ruleStateKey(clusterName.String(), rule.Name) - if err := m.ensureCache(ctx, key, clusterName.String(), rule); err != nil { - return fmt.Errorf("populating rule %s/%s: %w", clusterName, rule.Name, err) + for i := range ruleList.Items { + rule := &ruleList.Items[i] + clusterName := logicalcluster.From(rule) + key := ruleStateKey(clusterName.String(), rule.Name) + m.registerRule(key, rule) } } - logger.Info("rule registry populated") + logger.Info("rule registry populated", "ruleCount", totalRules, "shards", len(vwClients)) return nil } -// virtualWorkspaceClient reads the APIExportEndpointSlice from the dep-ctrl -// workspace to discover the virtual workspace URL, then returns a client -// pointing at {vwURL}/clusters/* so it can list resources across all bound -// workspaces. -func (m *RuleCacheManager) virtualWorkspaceClient(ctx context.Context) (client.Client, error) { +// virtualWorkspaceClients reads the APIExportEndpointSlice from the dep-ctrl +// workspace to discover the virtual workspace URLs (one per kcp shard), then +// returns a client per shard pointing at {vwURL}/clusters/* so it can list +// resources across all bound workspaces on that shard. +func (m *RuleCacheManager) virtualWorkspaceClients(ctx context.Context) ([]client.Client, error) { // Use a direct (non-cached) client to read the APIExportEndpointSlice // from the dep-ctrl workspace. localCfg := m.DepCtrlManager.GetLocalManager().GetConfig() @@ -98,32 +88,34 @@ func (m *RuleCacheManager) virtualWorkspaceClient(ctx context.Context) (client.C return nil, fmt.Errorf("creating direct client: %w", err) } - var ess apisv1alpha1.APIExportEndpointSlice - if err := directClient.Get(ctx, client.ObjectKey{Name: m.APIExportName}, &ess); err != nil { - return nil, fmt.Errorf("getting APIExportEndpointSlice %s: %w", m.APIExportName, err) + ess, err := kcp.FindEndpointSlice(ctx, directClient, m.APIExportName) + if err != nil { + return nil, fmt.Errorf("resolving APIExportEndpointSlice for %s: %w", m.APIExportName, err) } if len(ess.Status.APIExportEndpoints) == 0 { - return nil, fmt.Errorf("APIExportEndpointSlice %s has no endpoints", m.APIExportName) + return nil, fmt.Errorf("APIExportEndpointSlice %s has no endpoints", ess.Name) } - vwURL := ess.Status.APIExportEndpoints[0].URL + // Create a client for each shard's virtual workspace URL. 
+ var clients []client.Client + for _, ep := range ess.Status.APIExportEndpoints { + vwCfg := rest.CopyConfig(localCfg) + vwCfg.Host = ep.URL + "/clusters/*" - // Create a client for the wildcard cluster path to list across all - // logical clusters visible through the virtual workspace. - vwCfg := rest.CopyConfig(localCfg) - vwCfg.Host = vwURL + "/clusters/*" + vwClient, err := client.New(vwCfg, client.Options{Scheme: m.Scheme}) + if err != nil { + return nil, fmt.Errorf("creating VW client for %s: %w", ep.URL, err) + } - vwClient, err := client.New(vwCfg, client.Options{Scheme: m.Scheme}) - if err != nil { - return nil, fmt.Errorf("creating VW client: %w", err) + clients = append(clients, vwClient) } - return vwClient, nil + return clients, nil } -// Reconcile handles DependencyRule events. On creation/update it ensures an -// indexed cache exists for the rule. On deletion it tears down the cache. +// Reconcile handles DependencyRule events. On creation/update it registers +// the rule metadata. On deletion it removes it from the registry. func (m *RuleCacheManager) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { logger := log.FromContext(ctx).WithValues("rule", req.Name, "cluster", req.ClusterName) @@ -139,7 +131,7 @@ func (m *RuleCacheManager) Reconcile(ctx context.Context, req mcreconcile.Reques var rule v1alpha1.DependencyRule if err := cl.GetClient().Get(ctx, client.ObjectKey{Name: req.Name}, &rule); err != nil { if client.IgnoreNotFound(err) == nil { - logger.Info("DependencyRule deleted, tearing down cache") + logger.Info("DependencyRule deleted, removing from registry") m.Registry.Unregister(key) return ctrl.Result{}, nil @@ -148,143 +140,44 @@ func (m *RuleCacheManager) Reconcile(ctx context.Context, req mcreconcile.Reques return ctrl.Result{}, err } - if err := m.ensureCache(ctx, key, clusterName, &rule); err != nil { - logger.Error(err, "failed to ensure cache for rule") - return ctrl.Result{}, err - } + m.registerRule(key, &rule) return ctrl.Result{}, nil } -// ensureCache creates a multicluster manager for the rule's dependent resource -// type, registers field indices for each dependency target, and stores the -// resulting cache state in the registry. If a cache already exists for the -// given key this is a no-op. -func (m *RuleCacheManager) ensureCache(ctx context.Context, key string, clusterName string, rule *v1alpha1.DependencyRule) error { - if m.Registry.Exists(key) { - return nil - } - +// registerRule extracts metadata from a DependencyRule and stores it in the +// registry. +func (m *RuleCacheManager) registerRule(key string, rule *v1alpha1.DependencyRule) { dep := rule.Spec.Dependent - - mgr, mgrCancel, err := m.startProviderManager(ctx, clusterName, dep.APIExportName) - if err != nil { - return fmt.Errorf("creating manager for rule %s: %w", rule.Name, err) - } - gvk := schema.GroupVersionKind{ Group: dep.Group, Version: dep.Version, Kind: dep.Kind, } + gvr := schema.GroupVersionResource{ + Group: dep.Group, + Version: dep.Version, + Resource: dep.Resource, + } - // Build indexed fields for each dependency target. var indexFields []IndexedField - watchObj := &unstructured.Unstructured{} - watchObj.SetGroupVersionKind(gvk) - for _, depTarget := range rule.Spec.Dependencies { - fieldPath := depTarget.FieldRef.Path - targetGVR := schema.GroupVersionResource{ - Group: depTarget.Group, - Version: depTarget.Version, - Resource: depTarget.Resource, - } - - // Register an index on the dependent resource's field path. 
- if err := mgr.GetFieldIndexer().IndexField(ctx, watchObj, fieldPath, func(obj client.Object) []string { - u, ok := obj.(*unstructured.Unstructured) - if !ok { - return nil - } - val := fieldpath.Resolve(u.Object, fieldPath) - if val == "" { - return nil - } - - return []string{val} - }); err != nil { - mgrCancel() - return fmt.Errorf("indexing field %s for rule %s: %w", fieldPath, rule.Name, err) - } - indexFields = append(indexFields, IndexedField{ - FieldPath: fieldPath, - TargetGVR: targetGVR, + FieldPath: depTarget.FieldRef.Path, + TargetGVR: schema.GroupVersionResource{ + Group: depTarget.Group, + Version: depTarget.Version, + Resource: depTarget.Resource, + }, }) } - // Register a controller to activate the informer and track discovered clusters. - registry := m.Registry - ruleKey := key - if err := mcbuilder.ControllerManagedBy(mgr). - Named(fmt.Sprintf("dep-index-%s", key)). - WithOptions(mccontroller.Options{SkipNameValidation: new(true)}). - For(watchObj). - Complete(mcreconcile.Func(func(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { - registry.TrackCluster(ruleKey, string(req.ClusterName)) - if state := registry.Get(ruleKey); state != nil && !state.IsReady() { - registry.MarkReady(ruleKey) - } - - return ctrl.Result{}, nil - })); err != nil { - mgrCancel() - return fmt.Errorf("registering index controller for rule %s: %w", rule.Name, err) - } - - state := &RuleState{ - Manager: mgr, - Cancel: mgrCancel, + m.Registry.Register(key, &RuleState{ Rule: rule.Spec, DependentGVK: gvk, + DependentGVR: gvr, IndexFields: indexFields, - } - - // Register atomically; if a concurrent reconcile raced us, cancel the - // duplicate manager we just created. - if old := m.Registry.Register(key, state); old != nil { - old.Cancel() - } - - return nil -} - -// startProviderManager creates a new multicluster manager backed by the given -// APIExport's virtual workspace. The manager is started in a background -// goroutine and the returned cancel function tears it down. -func (m *RuleCacheManager) startProviderManager(ctx context.Context, clusterName string, apiExportName string) (mcmanager.Manager, context.CancelFunc, error) { - logger := log.FromContext(ctx).WithValues("apiExport", apiExportName, "cluster", clusterName) - - cfg := rest.CopyConfig(m.BaseConfig) - cfg.Host += logicalcluster.NewPath(clusterName).RequestPath() - - provider, err := apiexport.New(cfg, apiExportName, apiexport.Options{ - Scheme: m.Scheme, }) - if err != nil { - return nil, nil, fmt.Errorf("creating apiexport provider: %w", err) - } - - mgr, err := mcmanager.New(cfg, provider, manager.Options{ - Scheme: m.Scheme, - Metrics: metricsserver.Options{BindAddress: "0"}, - HealthProbeBindAddress: "0", - }) - if err != nil { - return nil, nil, fmt.Errorf("creating multicluster manager: %w", err) - } - - mgrCtx, cancel := context.WithCancel(ctx) - - go func() { - logger.Info("starting provider manager for APIExport") - if err := mgr.Start(mgrCtx); err != nil { - logger.Error(err, "provider manager failed") - } - }() - - return mgr, cancel, nil } // ruleStateKey returns a qualified key combining the cluster and rule name. 
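For orientation between the two files above: the cache manager's `registerRule` feeds the registry, and the deletion validator later asks the registry which rules protect the GVR being deleted. A usage sketch, assuming it sits inside the `internal/webhook` package; the function name `demoRegistryLookup` is illustrative:

```go
package webhook

import (
	"fmt"

	"k8s.io/apimachinery/pkg/runtime/schema"
)

// demoRegistryLookup registers the VM->VPC rule from the getting-started
// example and shows the reverse lookup the deletion validator performs when
// a VPC DELETE arrives.
func demoRegistryLookup() {
	vpcGVR := schema.GroupVersionResource{Group: "network.test.io", Version: "v1", Resource: "vpcs"}

	r := NewRuleRegistry()
	r.Register("cluster1/vm-dependencies", &RuleState{
		DependentGVK: schema.GroupVersionKind{Group: "compute.test.io", Version: "v1", Kind: "VirtualMachine"},
		DependentGVR: schema.GroupVersionResource{Group: "compute.test.io", Version: "v1", Resource: "virtualmachines"},
		IndexFields: []IndexedField{
			{FieldPath: ".spec.vpcRef.name", TargetGVR: vpcGVR},
		},
	})

	// Which rules protect vpcs, and which field do their dependents use?
	for _, entry := range r.FindByTargetGVR(vpcGVR) {
		fmt.Println(entry.Key, entry.State.DependentGVR.Resource, entry.MatchedField.FieldPath)
		// Prints: cluster1/vm-dependencies virtualmachines .spec.vpcRef.name
	}
}
```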
diff --git a/internal/webhook/rule_registry.go b/internal/webhook/rule_registry.go index 08ab27e..50128b5 100644 --- a/internal/webhook/rule_registry.go +++ b/internal/webhook/rule_registry.go @@ -7,32 +7,26 @@ import ( "sync" "k8s.io/apimachinery/pkg/runtime/schema" - mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" v1alpha1 "go.opendefense.cloud/dependency-controller/api/v1alpha1" ) // RuleRegistry is a thread-safe registry of active DependencyRules and their -// associated multicluster managers. It maintains a reverse index from dependency -// target GVRs to rule keys, enabling the webhook to quickly find which rules -// protect a given resource type. +// metadata. It maintains a reverse index from dependency target GVRs to rule +// keys, enabling the webhook to quickly find which rules protect a given +// resource type. type RuleRegistry struct { mu sync.RWMutex rules map[string]*RuleState byTarget map[schema.GroupVersionResource][]string // GVR -> rule keys } -// RuleState holds the runtime state for a single DependencyRule. +// RuleState holds the metadata for a single DependencyRule. type RuleState struct { - Manager mcmanager.Manager - Cancel func() Rule v1alpha1.DependencyRuleSpec DependentGVK schema.GroupVersionKind + DependentGVR schema.GroupVersionResource IndexFields []IndexedField - Ready bool - - mu sync.Mutex - knownClusters map[string]struct{} } // IndexedField maps a field path on the dependent resource to the dependency @@ -59,7 +53,7 @@ func NewRuleRegistry() *RuleRegistry { // Register adds a rule to the registry and rebuilds the reverse index. // If a state was already registered under this key, it is replaced and -// the old state is returned so the caller can clean it up. +// the old state is returned so the caller can detect duplicates. func (r *RuleRegistry) Register(key string, state *RuleState) *RuleState { r.mu.Lock() defer r.mu.Unlock() @@ -71,21 +65,17 @@ func (r *RuleRegistry) Register(key string, state *RuleState) *RuleState { return old } -// Unregister removes a rule from the registry, cancels its manager, and -// rebuilds the reverse index. +// Unregister removes a rule from the registry and rebuilds the reverse index. func (r *RuleRegistry) Unregister(key string) { r.mu.Lock() - state, exists := r.rules[key] - if !exists { - r.mu.Unlock() + defer r.mu.Unlock() + + if _, exists := r.rules[key]; !exists { return } + delete(r.rules, key) r.rebuildTargetIndex() - r.mu.Unlock() - - // Cancel outside the lock to avoid holding it during teardown. - state.Cancel() } // Exists returns true if the given rule key is registered. @@ -160,41 +150,3 @@ func (r *RuleRegistry) rebuildTargetIndex() { } } } - -// TrackCluster records that the given rule has seen a resource in the given cluster. -func (r *RuleRegistry) TrackCluster(key, clusterName string) { - r.mu.RLock() - state := r.rules[key] - r.mu.RUnlock() - if state == nil { - return - } - state.mu.Lock() - defer state.mu.Unlock() - if state.knownClusters == nil { - state.knownClusters = make(map[string]struct{}) - } - state.knownClusters[clusterName] = struct{}{} -} - -// IsReady returns true if the rule's cache is synced and ready for queries. -func (s *RuleState) IsReady() bool { - s.mu.Lock() - defer s.mu.Unlock() - - return s.Ready -} - -// MarkReady marks a rule's cache as synced and ready for queries. 
-func (r *RuleRegistry) MarkReady(key string) { - r.mu.RLock() - state := r.rules[key] - r.mu.RUnlock() - if state == nil { - return - } - - state.mu.Lock() - defer state.mu.Unlock() - state.Ready = true -} diff --git a/internal/webhook/rule_registry_test.go b/internal/webhook/rule_registry_test.go index 0f7b02d..0a9550b 100644 --- a/internal/webhook/rule_registry_test.go +++ b/internal/webhook/rule_registry_test.go @@ -20,15 +20,16 @@ func TestRuleRegistry_RegisterAndFind(t *testing.T) { subnetGVR := schema.GroupVersionResource{Group: "network.test.io", Version: "v1", Resource: "subnets"} state := &RuleState{ - Cancel: func() {}, Rule: v1alpha1.DependencyRuleSpec{ Dependent: v1alpha1.DependentRef{ - Group: "compute.test.io", - Version: "v1", - Kind: "VirtualMachine", + Group: "compute.test.io", + Version: "v1", + Kind: "VirtualMachine", + Resource: "virtualmachines", }, }, DependentGVK: schema.GroupVersionKind{Group: "compute.test.io", Version: "v1", Kind: "VirtualMachine"}, + DependentGVR: schema.GroupVersionResource{Group: "compute.test.io", Version: "v1", Resource: "virtualmachines"}, IndexFields: []IndexedField{ {FieldPath: ".spec.vpcRef.name", TargetGVR: vpcGVR}, {FieldPath: ".spec.subnetRef.name", TargetGVR: subnetGVR}, @@ -63,10 +64,8 @@ func TestRuleRegistry_Unregister(t *testing.T) { r := NewRuleRegistry() vpcGVR := schema.GroupVersionResource{Group: "network.test.io", Version: "v1", Resource: "vpcs"} - cancelled := false state := &RuleState{ - Cancel: func() { cancelled = true }, IndexFields: []IndexedField{ {FieldPath: ".spec.vpcRef.name", TargetGVR: vpcGVR}, }, @@ -75,10 +74,6 @@ func TestRuleRegistry_Unregister(t *testing.T) { r.Register("cluster1/vm-deps", state) r.Unregister("cluster1/vm-deps") - if !cancelled { - t.Error("expected cancel to be called") - } - entries := r.FindByTargetGVR(vpcGVR) if len(entries) != 0 { t.Errorf("expected 0 entries after unregister, got %d", len(entries)) @@ -95,11 +90,9 @@ func TestRuleRegistry_MultipleRulesSameTarget(t *testing.T) { vpcGVR := schema.GroupVersionResource{Group: "network.test.io", Version: "v1", Resource: "vpcs"} state1 := &RuleState{ - Cancel: func() {}, IndexFields: []IndexedField{{FieldPath: ".spec.vpcRef.name", TargetGVR: vpcGVR}}, } state2 := &RuleState{ - Cancel: func() {}, IndexFields: []IndexedField{{FieldPath: ".spec.networkRef.vpc", TargetGVR: vpcGVR}}, } @@ -122,44 +115,6 @@ func TestRuleRegistry_MultipleRulesSameTarget(t *testing.T) { } } -func TestRuleRegistry_TrackCluster(t *testing.T) { - r := NewRuleRegistry() - - state := &RuleState{ - Cancel: func() {}, - } - r.Register("cluster1/rule", state) - - r.TrackCluster("cluster1/rule", "consumer-ws-1") - r.TrackCluster("cluster1/rule", "consumer-ws-2") - r.TrackCluster("cluster1/rule", "consumer-ws-1") // duplicate - - state.mu.Lock() - count := len(state.knownClusters) - state.mu.Unlock() - - if count != 2 { - t.Errorf("expected 2 known clusters, got %d", count) - } -} - -func TestRuleRegistry_MarkReady(t *testing.T) { - r := NewRuleRegistry() - - state := &RuleState{Cancel: func() {}} - r.Register("c1/rule", state) - - if state.IsReady() { - t.Error("expected Ready=false initially") - } - - r.MarkReady("c1/rule") - - if !state.IsReady() { - t.Error("expected Ready=true after MarkReady") - } -} - func TestRuleRegistry_UnregisterNonexistent(t *testing.T) { r := NewRuleRegistry() // Should not panic. 
@@ -173,11 +128,9 @@ func TestRuleRegistry_AllTargetGVRs(t *testing.T) { subnetGVR := schema.GroupVersionResource{Group: "net.io", Version: "v1", Resource: "subnets"} r.Register("c1/rule1", &RuleState{ - Cancel: func() {}, IndexFields: []IndexedField{{FieldPath: ".spec.vpcRef.name", TargetGVR: vpcGVR}}, }) r.Register("c1/rule2", &RuleState{ - Cancel: func() {}, IndexFields: []IndexedField{ {FieldPath: ".spec.vpcRef.name", TargetGVR: vpcGVR}, {FieldPath: ".spec.subnetRef.name", TargetGVR: subnetGVR}, @@ -197,18 +150,6 @@ func TestRuleRegistry_GetNil(t *testing.T) { } } -func TestRuleRegistry_MarkReadyNonexistent(t *testing.T) { - r := NewRuleRegistry() - // Should not panic. - r.MarkReady("nonexistent") -} - -func TestRuleRegistry_TrackClusterNonexistent(t *testing.T) { - r := NewRuleRegistry() - // Should not panic. - r.TrackCluster("nonexistent", "cluster1") -} - func TestRuleRegistry_ConcurrentAccess(t *testing.T) { r := NewRuleRegistry() vpcGVR := schema.GroupVersionResource{Group: "net.io", Version: "v1", Resource: "vpcs"} @@ -225,13 +166,12 @@ func TestRuleRegistry_ConcurrentAccess(t *testing.T) { key := fmt.Sprintf("cluster%d/rule%d", id%5, id) for j := range opsPerGoroutine { - switch j % 7 { + switch j % 5 { case 0: r.Register(key, &RuleState{ - Cancel: func() {}, IndexFields: []IndexedField{{FieldPath: ".spec.ref", TargetGVR: vpcGVR}}, Rule: v1alpha1.DependencyRuleSpec{ - Dependent: v1alpha1.DependentRef{Group: "compute.io", Version: "v1", Kind: "VM"}, + Dependent: v1alpha1.DependentRef{Group: "compute.io", Version: "v1", Kind: "VM", Resource: "vms"}, }, }) case 1: @@ -241,10 +181,6 @@ func TestRuleRegistry_ConcurrentAccess(t *testing.T) { case 3: r.Get(key) case 4: - r.TrackCluster(key, fmt.Sprintf("consumer-%d", j)) - case 5: - r.MarkReady(key) - case 6: r.AllTargetGVRs() } } @@ -273,7 +209,6 @@ func TestRuleRegistry_RegisterOverwrite(t *testing.T) { subnetGVR := schema.GroupVersionResource{Group: "net.io", Version: "v1", Resource: "subnets"} oldState := &RuleState{ - Cancel: func() {}, IndexFields: []IndexedField{{FieldPath: ".spec.vpcRef.name", TargetGVR: vpcGVR}}, } old := r.Register("c1/rule", oldState) @@ -283,7 +218,6 @@ func TestRuleRegistry_RegisterOverwrite(t *testing.T) { // Overwrite with new state targeting a different GVR. old = r.Register("c1/rule", &RuleState{ - Cancel: func() {}, IndexFields: []IndexedField{{FieldPath: ".spec.subnetRef.name", TargetGVR: subnetGVR}}, }) if old != oldState { @@ -305,7 +239,7 @@ func TestRuleRegistry_RegisterOverwrite(t *testing.T) { func TestRuleRegistry_RegisterFirstReturnsNil(t *testing.T) { r := NewRuleRegistry() - old := r.Register("c1/rule", &RuleState{Cancel: func() {}}) + old := r.Register("c1/rule", &RuleState{}) if old != nil { t.Error("expected nil when registering a new key") } diff --git a/test/e2e/dependency_test.go b/test/e2e/dependency_test.go index 6072d87..7c6f7b4 100644 --- a/test/e2e/dependency_test.go +++ b/test/e2e/dependency_test.go @@ -40,37 +40,6 @@ var _ = Describe("Dependency Controller E2E", Ordered, func() { }) }) - It("should have shard-wide apiexports/content access via system:admin bootstrap RBAC", func() { - // The webhook SA has shard-wide apiexports/content and apiexportendpointslices - // access granted via system:admin RBAC (Bootstrap Policy Authorizer), instead - // of per-workspace dynamic RBAC managed by the controller. 
- - By("verifying webhook SA has apiexports/content on compute.test.io in compute-provider") - waitFor(time.Minute, "webhook SA has apiexports/content on compute.test.io", func() error { - allowed, err := webhookSACanAccessExport(wsComputeProvider, "compute.test.io") - if err != nil { - return fmt.Errorf("SAR check: %w", err) - } - if !allowed { - return fmt.Errorf("webhook SA does not yet have apiexports/content on compute.test.io") - } - - return nil - }) - - By("verifying webhook SA also has apiexports/content on network.test.io (shard-wide)") - allowed, err := webhookSACanAccessExport(wsNetworkProvider, "network.test.io") - Expect(err).NotTo(HaveOccurred()) - Expect(allowed).To(BeTrue(), - "webhook SA should have shard-wide apiexports/content access via system:admin") - - By("verifying webhook SA cannot access secrets in compute-provider") - allowed, err = webhookSACanAccessResource(wsComputeProvider, "", "secrets", "list") - Expect(err).NotTo(HaveOccurred()) - Expect(allowed).To(BeFalse(), - "webhook SA should NOT be able to list secrets") - }) - It("should block VPC deletion while a VM references it", func() { waitFor(time.Minute, "webhook blocks VPC deletion", func() error { out, err := kcpctlNoFail(wsConsumer1, "delete", "vpc", "my-vpc", "--namespace", "default") @@ -137,6 +106,39 @@ var _ = Describe("Dependency Controller E2E", Ordered, func() { }) }) + It("should block VPC deletion in consumer2 (may be on a different shard)", func() { + // consumer2 may be scheduled on a different shard than consumer1, + // validating that webhook protection works regardless of shard placement. + By("creating a VPC in consumer2") + applyFixtureToWS(wsConsumer2, filepath.Join(fixturesDir, "vpc-shard-vpc.yaml"), nil) + + By("creating a VM referencing the VPC in consumer2") + applyFixtureToWS(wsConsumer2, filepath.Join(fixturesDir, "vm-shard-vm.yaml"), nil) + + By("waiting for the webhook to block shard-vpc deletion") + waitFor(time.Minute, "webhook blocks shard-vpc deletion", func() error { + out, err := kcpctlNoFail(wsConsumer2, "delete", "vpc", "shard-vpc", "--namespace", "default") + if err == nil { + applyFixtureToWS(wsConsumer2, filepath.Join(fixturesDir, "vpc-shard-vpc.yaml"), nil) + return fmt.Errorf("deletion was not blocked, recreated VPC") + } + if strings.Contains(out, "still referenced by") { + return nil + } + + return fmt.Errorf("unexpected output: %s", out) + }) + + By("deleting the VM to release the VPC") + kcpctl(wsConsumer2, "delete", "virtualmachine", "shard-vm", "--namespace", "default") + + By("verifying VPC deletion now succeeds") + waitFor(time.Minute, "shard-vpc deletion allowed after VM removed", func() error { + _, err := kcpctlNoFail(wsConsumer2, "delete", "vpc", "shard-vpc", "--namespace", "default") + return err + }) + }) + It("should allow deletion when skip-protection annotation is set", func() { By("creating a VPC and VM in consumer1") applyFixtureToWS(wsConsumer1, filepath.Join(fixturesDir, "vpc-skip-vpc.yaml"), nil) @@ -208,9 +210,6 @@ var _ = Describe("Dependency Controller E2E", Ordered, func() { return err }()).To(Succeed()) - // RBAC is not affected by rule deletion — webhook SA retains shard-wide - // apiexports/content access via system:admin bootstrap RBAC. - // Clean up remaining resources. 
kcpctlNoFail(wsConsumer1, "delete", "virtualmachine", "cleanup-vm", "--namespace", "default") //nolint:errcheck }) @@ -251,4 +250,45 @@ var _ = Describe("Dependency Controller E2E", Ordered, func() { }) kcpctlNoFail(wsComputeProvider, "delete", "dependencyrule", "vm-dependencies") //nolint:errcheck }) + + It("should propagate in-place DependencyRule updates to the webhook", func() { + // Patching a live rule's fieldRef to point at a non-existent path + // must cause the webhook's rule registry to re-evaluate dependents + // — there are now zero matches, so deletion must be allowed without + // the user touching VMs or recreating the rule. + + By("creating the DependencyRule and a VPC+VM that triggers protection") + applyFixtureToWS(wsComputeProvider, filepath.Join(fixturesDir, "dependencyrule-vm-dependencies.yaml"), subs) + applyFixtureToWS(wsConsumer1, filepath.Join(fixturesDir, "vpc-update-vpc.yaml"), nil) + applyFixtureToWS(wsConsumer1, filepath.Join(fixturesDir, "vm-update-vm.yaml"), nil) + + By("waiting for the webhook to block update-vpc deletion under the original fieldPath") + waitFor(time.Minute, "webhook blocks update-vpc deletion", func() error { + out, err := kcpctlNoFail(wsConsumer1, "delete", "vpc", "update-vpc", "--namespace", "default") + if err == nil { + applyFixtureToWS(wsConsumer1, filepath.Join(fixturesDir, "vpc-update-vpc.yaml"), nil) + return fmt.Errorf("deletion was not blocked, recreated VPC") + } + if strings.Contains(out, "still referenced by") { + return nil + } + + return fmt.Errorf("unexpected output: %s", out) + }) + + By("patching the rule's fieldRef.path to a non-existent field") + kcpctl(wsComputeProvider, "patch", "dependencyrule", "vm-dependencies", + "--type=json", + "-p", `[{"op":"replace","path":"/spec/dependencies/0/fieldRef/path","value":".spec.notarealfield"}]`) + + By("verifying the webhook now allows deletion (rule update propagated, no recreate)") + waitFor(time.Minute, "rule update propagated to webhook registry", func() error { + _, err := kcpctlNoFail(wsConsumer1, "delete", "vpc", "update-vpc", "--namespace", "default") + return err + }) + + By("cleaning up") + kcpctlNoFail(wsConsumer1, "delete", "virtualmachine", "update-vm", "--namespace", "default") //nolint:errcheck + kcpctlNoFail(wsComputeProvider, "delete", "dependencyrule", "vm-dependencies") //nolint:errcheck + }) }) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 1cd485d..634b443 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -7,11 +7,13 @@ import ( "bytes" "context" "encoding/base64" - "encoding/json" "fmt" + "net" + "net/url" "os" "os/exec" "path/filepath" + "regexp" "strings" "testing" "time" @@ -29,14 +31,15 @@ var ( ) const ( - kindClusterName = "dep-ctrl-e2e" - kcpNamespace = "kcp-system" - depNamespace = "dependency-system" - kcpNodePort = "31500" - kcpServerLocalPort = "31501" // NodePort to kcp server (not front-proxy) - certManagerVer = "v1.17.2" - imageName = "dependency-controller:integration-test" - helmTimeout = "300s" + kindClusterName = "dep-ctrl-e2e" + kcpNamespace = "kcp-system" + depNamespace = "dependency-system" + certManagerVer = "v1.17.2" + imageName = "dependency-controller:integration-test" + helmTimeout = "300s" + + // NodePort for the front-proxy service exposed via kind. + frontProxyNodePort = "31443" ) // Workspace names under root. @@ -59,8 +62,44 @@ var ( // Per-component kubeconfigs for in-cluster pods. 
controllerKubeconfigPath string webhookKubeconfigPath string + + // In-cluster front-proxy base URL (extracted from kcp-operator kubeconfig). + inClusterFPURL string ) +// shardPlacement maps each test workspace to the shard it should be pinned to +// ("root" or "shard1"). Selected at suite startup via E2E_SHARD_CONFIG. +type shardPlacement struct { + depCtrl string + networkProvider string + computeProvider string + consumer1 string + consumer2 string +} + +// Two architecturally distinct configurations. Together they exercise: +// - same-shard fast paths (single-shard) +// - cross-shard webhook installation, dep-ctrl ↔ provider, consumer ↔ provider, +// and webhook query (multi-shard) +var shardConfigs = map[string]shardPlacement{ + "single-shard": { + depCtrl: "root", + networkProvider: "root", + computeProvider: "root", + consumer1: "root", + consumer2: "root", + }, + "multi-shard": { + depCtrl: "root", + networkProvider: "root", + computeProvider: "shard1", + consumer1: "root", + consumer2: "shard1", + }, +} + +var activeShardConfig shardPlacement + func TestE2E(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "E2E Suite") @@ -83,6 +122,20 @@ func init() { kubectlBin = lookupTool("KUBECTL", "kubectl") helmBin = lookupTool("HELM", "helm") dockerBin = lookupTool("DOCKER", "docker") + + name := os.Getenv("E2E_SHARD_CONFIG") + if name == "" { + name = "multi-shard" + } + cfg, ok := shardConfigs[name] + if !ok { + valid := make([]string, 0, len(shardConfigs)) + for k := range shardConfigs { + valid = append(valid, k) + } + panic(fmt.Sprintf("unknown E2E_SHARD_CONFIG %q (valid: %v)", name, valid)) + } + activeShardConfig = cfg } // run executes a command and returns combined output. Fails the test on non-zero exit. @@ -126,7 +179,7 @@ func kcpctl(wsPath string, args ...string) { GinkgoHelper() run(kubectlBin, append([]string{ "--kubeconfig", kcpHostKubeconfig, - "--server", fmt.Sprintf("https://localhost:%s/clusters/root:%s", kcpNodePort, wsPath), + "--server", fmt.Sprintf("https://localhost:%s/clusters/root:%s", frontProxyNodePort, wsPath), }, args...)...) } @@ -134,7 +187,7 @@ func kcpctl(wsPath string, args ...string) { func kcpctlNoFail(wsPath string, args ...string) (string, error) { return runNoFail(kubectlBin, append([]string{ "--kubeconfig", kcpHostKubeconfig, - "--server", fmt.Sprintf("https://localhost:%s/clusters/root:%s", kcpNodePort, wsPath), + "--server", fmt.Sprintf("https://localhost:%s/clusters/root:%s", frontProxyNodePort, wsPath), }, args...)...) } @@ -142,98 +195,15 @@ func kcpctlNoFail(wsPath string, args ...string) (string, error) { func kcpctlRootNoFail(args ...string) (string, error) { return runNoFail(kubectlBin, append([]string{ "--kubeconfig", kcpHostKubeconfig, - "--server", fmt.Sprintf("https://localhost:%s/clusters/root", kcpNodePort), + "--server", fmt.Sprintf("https://localhost:%s/clusters/root", frontProxyNodePort), }, args...)...) } -// subjectAccessReview is the JSON structure for a SubjectAccessReview response. -type subjectAccessReview struct { - Status struct { - Allowed bool `json:"allowed"` - Reason string `json:"reason"` - } `json:"status"` -} - -// webhookSACanAccessExport checks if the webhook SA has apiexports/content -// access for the given APIExport in the specified workspace, using a -// SubjectAccessReview evaluated by kcp. 
-func webhookSACanAccessExport(wsPath, exportName string) (bool, error) { - sarJSON := fmt.Sprintf(`{ - "apiVersion": "authorization.k8s.io/v1", - "kind": "SubjectAccessReview", - "spec": { - "user": "system:serviceaccount:%s:dependency-webhook", - "groups": ["system:serviceaccounts", "system:serviceaccounts:%s"], - "resourceAttributes": { - "group": "apis.kcp.io", - "resource": "apiexports", - "subresource": "content", - "name": %q, - "verb": "get" - } - } -}`, depNamespace, depNamespace, exportName) - - cmd := exec.CommandContext(context.Background(), kubectlBin, - "--kubeconfig", kcpHostKubeconfig, - "--server", fmt.Sprintf("https://localhost:%s/clusters/root:%s", kcpNodePort, wsPath), - "create", "-f", "-", "-o", "json", - ) - cmd.Stdin = strings.NewReader(sarJSON) - var buf bytes.Buffer - cmd.Stdout = &buf - cmd.Stderr = &buf - if err := cmd.Run(); err != nil { - return false, fmt.Errorf("SAR failed in %s: %s %v", wsPath, buf.String(), err) - } - - var sar subjectAccessReview - if err := json.Unmarshal(buf.Bytes(), &sar); err != nil { - return false, fmt.Errorf("parsing SAR response: %w", err) - } - - return sar.Status.Allowed, nil -} - -// webhookSACanAccessResource checks if the webhook SA can access a specific -// resource type in a workspace, using a SubjectAccessReview. -func webhookSACanAccessResource(wsPath, group, resource, verb string) (bool, error) { - sarJSON := fmt.Sprintf(`{ - "apiVersion": "authorization.k8s.io/v1", - "kind": "SubjectAccessReview", - "spec": { - "user": "system:serviceaccount:%s:dependency-webhook", - "groups": ["system:serviceaccounts", "system:serviceaccounts:%s"], - "resourceAttributes": { - "group": %q, - "resource": %q, - "verb": %q - } - } -}`, depNamespace, depNamespace, group, resource, verb) - - cmd := exec.CommandContext(context.Background(), kubectlBin, - "--kubeconfig", kcpHostKubeconfig, - "--server", fmt.Sprintf("https://localhost:%s/clusters/root:%s", kcpNodePort, wsPath), - "create", "-f", "-", "-o", "json", - ) - cmd.Stdin = strings.NewReader(sarJSON) - var buf bytes.Buffer - cmd.Stdout = &buf - cmd.Stderr = &buf - if err := cmd.Run(); err != nil { - return false, fmt.Errorf("SAR failed in %s: %s %v", wsPath, buf.String(), err) - } - - var sar subjectAccessReview - if err := json.Unmarshal(buf.Bytes(), &sar); err != nil { - return false, fmt.Errorf("parsing SAR response: %w", err) - } - - return sar.Status.Allowed, nil -} - -// applyFixtureToWS applies a YAML fixture to a kcp workspace with placeholder substitution. +// applyFixtureToWS applies a YAML fixture to a kcp workspace with placeholder +// substitution. Retries on transient kcp authorization errors that surface +// while APIExports are propagating across shards (a fresh consumer workspace +// on a non-root shard cannot bind to a provider's APIExport until kcp has +// finished publishing the APIExport's APIExportEndpointSlice on that shard). 
func applyFixtureToWS(wsPath, file string, substitutions map[string]string) { GinkgoHelper() raw, err := os.ReadFile(file) @@ -244,16 +214,22 @@ func applyFixtureToWS(wsPath, file string, substitutions map[string]string) { content = strings.ReplaceAll(content, "${"+k+"}", v) } - cmd := exec.CommandContext(context.Background(), kubectlBin, - "--kubeconfig", kcpHostKubeconfig, - "--server", fmt.Sprintf("https://localhost:%s/clusters/root:%s", kcpNodePort, wsPath), - "apply", "-f", "-", - ) - cmd.Stdin = strings.NewReader(content) - var buf bytes.Buffer - cmd.Stdout = &buf - cmd.Stderr = &buf - Expect(cmd.Run()).To(Succeed(), "applying %s to %s: %s", file, wsPath, buf.String()) + waitFor(2*time.Minute, fmt.Sprintf("apply %s to %s", file, wsPath), func() error { + cmd := exec.CommandContext(context.Background(), kubectlBin, + "--kubeconfig", kcpHostKubeconfig, + "--server", fmt.Sprintf("https://localhost:%s/clusters/root:%s", frontProxyNodePort, wsPath), + "apply", "-f", "-", + ) + cmd.Stdin = strings.NewReader(content) + var buf bytes.Buffer + cmd.Stdout = &buf + cmd.Stderr = &buf + if err := cmd.Run(); err != nil { + return fmt.Errorf("%w: %s", err, buf.String()) + } + + return nil + }) } // waitFor retries a check function until it succeeds or the timeout is reached. @@ -280,14 +256,10 @@ func waitFor(timeout time.Duration, desc string, check func() error) { } } -// secretField extracts a base64-decoded field from a k8s secret in the kcp namespace. -func secretField(name, jsonpath string) []byte { +// kindctlSecret extracts the kubeconfig from a k8s secret in the kcp-system namespace. +func kindctlSecret(name string) string { GinkgoHelper() - out := kindctl("-n", kcpNamespace, "get", "secret", name, "-o", fmt.Sprintf("jsonpath=%s", jsonpath)) - decoded, err := base64.StdEncoding.DecodeString(strings.TrimSpace(out)) - Expect(err).NotTo(HaveOccurred()) - - return decoded + return kindctl("-n", kcpNamespace, "get", "secret", name, "-o", "jsonpath={.data.kubeconfig}") } var _ = SynchronizedBeforeSuite(func() { @@ -306,14 +278,17 @@ var _ = SynchronizedBeforeSuite(func() { By("installing cert-manager") installCertManager() - By("deploying kcp via helm") - deployKCP() + By("deploying kcp via kcp-operator") + deployKCPOperator() + + By("deploying etcd instances") + deployEtcd() - By("building admin kubeconfigs") - buildAdminKubeconfigs() + By("creating kcp RootShard, Shard, and FrontProxy") + createKCPResources() - By("exposing kcp server via NodePort") - kindctl("apply", "-f", filepath.Join(fixturesDir, "kcp-server-nodeport.yaml")) + By("generating admin kubeconfig") + buildAdminKubeconfig() By("building component kubeconfigs") buildComponentKubeconfigs() @@ -379,202 +354,649 @@ func installCertManager() { }) } -func deployKCP() { +func deployKCPOperator() { _, _ = runNoFail(helmBin, "repo", "add", "kcp", "https://kcp-dev.github.io/helm-charts") run(helmBin, "repo", "update", "kcp") - run(helmBin, "upgrade", "--install", "kcp", "kcp/kcp", + run(helmBin, "upgrade", "--install", "kcp-operator", "kcp/kcp-operator", "--namespace", kcpNamespace, "--create-namespace", - "--values", filepath.Join(fixturesDir, "integration-values-kcp.yaml"), "--wait", "--timeout", helmTimeout, ) } -func buildAdminKubeconfigs() { - kindctl("apply", "-f", filepath.Join(fixturesDir, "kcp-admin-cert.yaml")) +func deployEtcd() { + // Deploy two etcd instances: one for the root shard, one for the secondary shard. 
+ for _, name := range []string{"etcd-root", "etcd-shard"} { + applyEtcd(name) + } - waitFor(time.Minute, "front-proxy admin cert issued", func() error { - _, err := kindctlNoFail("-n", kcpNamespace, "get", "secret", "kcp-admin-front-proxy-cert", - "-o", "jsonpath={.data.tls\\.crt}") + // Wait for etcd pods to be ready. + for _, name := range []string{"etcd-root", "etcd-shard"} { + waitFor(2*time.Minute, fmt.Sprintf("%s ready", name), func() error { + _, err := kindctlNoFail("-n", kcpNamespace, "wait", "statefulset", name, + "--for=jsonpath={.status.readyReplicas}=1", "--timeout=1s") - return err + return err + }) + } +} + +// applyEtcd creates a minimal single-node etcd instance in the kcp namespace. +func applyEtcd(name string) { + GinkgoHelper() + manifest := fmt.Sprintf(`--- +apiVersion: v1 +kind: Service +metadata: + name: %[1]s + namespace: %[2]s +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: etcd + app.kubernetes.io/instance: %[1]s + ports: + - name: client + port: 2379 + targetPort: client +--- +apiVersion: v1 +kind: Service +metadata: + name: %[1]s-headless + namespace: %[2]s + annotations: + service.alpha.kubernetes.io/tolerate-unready-endpoints: "true" +spec: + type: ClusterIP + clusterIP: None + publishNotReadyAddresses: true + selector: + app.kubernetes.io/name: etcd + app.kubernetes.io/instance: %[1]s + ports: + - name: client + port: 2379 + targetPort: client + - name: peer + port: 2380 + targetPort: peer +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: %[1]s + namespace: %[2]s +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: etcd + app.kubernetes.io/instance: %[1]s + serviceName: %[1]s-headless + template: + metadata: + labels: + app.kubernetes.io/name: etcd + app.kubernetes.io/instance: %[1]s + spec: + automountServiceAccountToken: false + containers: + - name: etcd + image: quay.io/coreos/etcd:v3.5.21 + imagePullPolicy: IfNotPresent + command: ["/usr/local/bin/etcd"] + args: + - --name=$(HOSTNAME) + - --data-dir=/data + - --listen-peer-urls=http://0.0.0.0:2380 + - --listen-client-urls=http://0.0.0.0:2379 + - --advertise-client-urls=http://$(HOSTNAME).%[1]s-headless.%[2]s.svc.cluster.local:2379 + - --initial-cluster-state=new + - --initial-cluster-token=$(HOSTNAME) + - --initial-cluster=$(HOSTNAME)=http://$(HOSTNAME).%[1]s-headless.%[2]s.svc.cluster.local:2380 + - --initial-advertise-peer-urls=http://$(HOSTNAME).%[1]s-headless.%[2]s.svc.cluster.local:2380 + - --listen-metrics-urls=http://0.0.0.0:8080 + env: + - name: HOSTNAME + valueFrom: + fieldRef: + fieldPath: metadata.name + ports: + - name: client + containerPort: 2379 + - name: peer + containerPort: 2380 + - name: metrics + containerPort: 8080 + livenessProbe: + httpGet: + path: /livez + port: metrics + initialDelaySeconds: 15 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /readyz + port: metrics + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 30 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + memory: 256Mi + volumeMounts: + - name: data + mountPath: /data + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: [ReadWriteOnce] + resources: + requests: + storage: 1Gi +`, name, kcpNamespace) + + cmd := exec.CommandContext(context.Background(), kubectlBin, + "--context", "kind-"+kindClusterName, "apply", "-f", "-") + cmd.Stdin = strings.NewReader(manifest) + var buf bytes.Buffer + cmd.Stdout = &buf + cmd.Stderr = &buf + Expect(cmd.Run()).To(Succeed(), "applying etcd %s: %s", name, buf.String()) +} + 
+func createKCPResources() { + // The front-proxy hostname used for in-cluster access and via NodePort. + fpHostname := fmt.Sprintf("kcp-front-proxy.%s.svc.cluster.local", kcpNamespace) + + // Create a cert-manager Issuer in the kcp namespace for kcp-operator PKI. + applyToKind(fmt.Sprintf(`apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned + namespace: %s +spec: + selfSigned: {}`, kcpNamespace)) + + // Create RootShard. certificateTemplates adds localhost to the server cert + // so we can port-forward to the shard for direct access during bootstrap. + applyToKind(fmt.Sprintf(`apiVersion: operator.kcp.io/v1alpha1 +kind: RootShard +metadata: + name: root + namespace: %[1]s +spec: + external: + hostname: %[2]s + port: 6443 + certificates: + issuerRef: + group: cert-manager.io + kind: Issuer + name: selfsigned + certificateTemplates: + server: + spec: + dnsNames: + - localhost + ipAddresses: + - "127.0.0.1" + cache: + embedded: + enabled: true + etcd: + endpoints: + - http://etcd-root.%[1]s.svc.cluster.local:2379 + auth: + serviceAccount: + enabled: true + deploymentTemplate: + spec: + template: + spec: + hostAliases: + - ip: "10.96.200.200" + hostnames: + - "%[2]s"`, kcpNamespace, fpHostname)) + + // Create FrontProxy with a fixed ClusterIP and NodePort for host access. + applyToKind(fmt.Sprintf(`apiVersion: operator.kcp.io/v1alpha1 +kind: FrontProxy +metadata: + name: kcp + namespace: %[1]s +spec: + rootShard: + ref: + name: root + auth: + serviceAccount: + enabled: true + serviceTemplate: + spec: + type: NodePort + clusterIP: "10.96.200.200" + certificateTemplates: + server: + spec: + dnsNames: + - localhost + - "%[2]s" + ipAddresses: + - "127.0.0.1"`, kcpNamespace, fpHostname)) + + // Wait for the RootShard to be running. + waitFor(3*time.Minute, "root shard running", func() error { + out, err := kindctlNoFail("-n", kcpNamespace, "get", "rootshard", "root", + "-o", "jsonpath={.status.phase}") + if err != nil { + return err + } + if strings.TrimSpace(out) != "Running" { + return fmt.Errorf("root shard phase: %s", out) + } + + return nil }) - // Extract front-proxy client certs for host kubeconfig. - fpClientCrt := secretField("kcp-admin-front-proxy-cert", "{.data.tls\\.crt}") - fpClientKey := secretField("kcp-admin-front-proxy-cert", "{.data.tls\\.key}") - - fpCrtFile := filepath.Join(tmpDir, "fp-client.crt") - fpKeyFile := filepath.Join(tmpDir, "fp-client.key") - Expect(os.WriteFile(fpCrtFile, fpClientCrt, 0o600)).To(Succeed()) - Expect(os.WriteFile(fpKeyFile, fpClientKey, 0o600)).To(Succeed()) - - run(kubectlBin, "--kubeconfig", kcpHostKubeconfig, "config", "set-cluster", "kcp", - "--server=https://localhost:"+kcpNodePort, - "--insecure-skip-tls-verify=true") - run(kubectlBin, "--kubeconfig", kcpHostKubeconfig, "config", "set-credentials", "kcp-admin", - "--client-certificate="+fpCrtFile, - "--client-key="+fpKeyFile, - "--embed-certs=true") - run(kubectlBin, "--kubeconfig", kcpHostKubeconfig, "config", "set-context", "kcp", - "--cluster=kcp", "--user=kcp-admin") - run(kubectlBin, "--kubeconfig", kcpHostKubeconfig, "config", "use-context", "kcp") - - waitFor(30*time.Second, "kcp API reachable", func() error { - _, err := runNoFail(kubectlBin, "--kubeconfig", kcpHostKubeconfig, "get", "--raw", "/readyz") - return err + // Wait for the FrontProxy to be running. 
+ waitFor(2*time.Minute, "front-proxy running", func() error { + out, err := kindctlNoFail("-n", kcpNamespace, "get", "frontproxy", "kcp", + "-o", "jsonpath={.status.phase}") + if err != nil { + return err + } + if strings.TrimSpace(out) != "Running" { + return fmt.Errorf("front-proxy phase: %s", out) + } + + return nil }) -} -// buildComponentKubeconfigs creates per-component client certificates and -// kubeconfigs for the controller and webhook. Each component gets a -// least-privilege identity instead of shared admin credentials. -func buildComponentKubeconfigs() { - // Apply cert requests for controller and webhook identities. - kindctl("apply", "-f", filepath.Join(fixturesDir, "kcp-controller-cert.yaml")) - kindctl("apply", "-f", filepath.Join(fixturesDir, "kcp-webhook-sa-cert.yaml")) + // Patch the front-proxy Service to use a fixed NodePort. + kindctl("-n", kcpNamespace, "patch", "service", "kcp-front-proxy", "--type=json", + fmt.Sprintf(`-p=[{"op":"replace","path":"/spec/ports/0/nodePort","value":%s}]`, frontProxyNodePort)) - waitFor(time.Minute, "controller cert issued", func() error { - _, err := kindctlNoFail("-n", kcpNamespace, "get", "secret", "kcp-controller-cert", - "-o", "jsonpath={.data.tls\\.crt}") + // Create secondary Shard with localhost in server cert for port-forward access. + applyToKind(fmt.Sprintf(`apiVersion: operator.kcp.io/v1alpha1 +kind: Shard +metadata: + name: shard1 + namespace: %[1]s +spec: + rootShard: + ref: + name: root + etcd: + endpoints: + - http://etcd-shard.%[1]s.svc.cluster.local:2379 + auth: + serviceAccount: + enabled: true + certificateTemplates: + server: + spec: + dnsNames: + - localhost + ipAddresses: + - "127.0.0.1" + deploymentTemplate: + spec: + template: + spec: + hostAliases: + - ip: "10.96.200.200" + hostnames: + - "%[2]s"`, kcpNamespace, fpHostname)) + + // Wait for the secondary shard to be running. + waitFor(3*time.Minute, "shard1 running", func() error { + out, err := kindctlNoFail("-n", kcpNamespace, "get", "shard", "shard1", + "-o", "jsonpath={.status.phase}") + if err != nil { + return err + } + if strings.TrimSpace(out) != "Running" { + return fmt.Errorf("shard1 phase: %s", out) + } + + return nil + }) +} + +func buildAdminKubeconfig() { + // Create a Kubeconfig CR for admin access via front-proxy. + applyToKind(fmt.Sprintf(`apiVersion: operator.kcp.io/v1alpha1 +kind: Kubeconfig +metadata: + name: e2e-admin + namespace: %s +spec: + username: kcp-admin + groups: + - "system:kcp:admin" + validity: 8766h + secretRef: + name: e2e-admin-kubeconfig + target: + frontProxyRef: + name: kcp`, kcpNamespace)) + + // Wait for the kubeconfig secret to be created. + waitFor(2*time.Minute, "admin kubeconfig secret created", func() error { + _, err := kindctlNoFail("-n", kcpNamespace, "get", "secret", "e2e-admin-kubeconfig", + "-o", "jsonpath={.data.kubeconfig}") return err }) - waitFor(time.Minute, "webhook SA cert issued", func() error { - _, err := kindctlNoFail("-n", kcpNamespace, "get", "secret", "kcp-webhook-sa-cert", - "-o", "jsonpath={.data.tls\\.crt}") + + // Extract the kubeconfig and rewrite the server URL to use localhost NodePort. + kcRaw := kindctlSecret("e2e-admin-kubeconfig") + kcBytes, err := decodeBase64(kcRaw) + Expect(err).NotTo(HaveOccurred()) + + // Extract the actual server URL from the kubeconfig rather than hardcoding the port. 
+ adminServerURL := extractServerFromKubeconfig(kcBytes) + rewritten := strings.ReplaceAll(string(kcBytes), + adminServerURL, + fmt.Sprintf("https://localhost:%s", frontProxyNodePort)) + + Expect(os.WriteFile(kcpHostKubeconfig, []byte(rewritten), 0o600)).To(Succeed()) + + waitFor(30*time.Second, "kcp API reachable via front-proxy", func() error { + _, err := runNoFail(kubectlBin, "--kubeconfig", kcpHostKubeconfig, + "--server", fmt.Sprintf("https://localhost:%s/clusters/root", frontProxyNodePort), + "get", "--raw", "/readyz") return err }) +} - kcpServerCA := secretField("kcp-ca", "{.data.tls\\.crt}") - caFile := filepath.Join(tmpDir, "kcp-server-ca.crt") - Expect(os.WriteFile(caFile, kcpServerCA, 0o600)).To(Succeed()) +// buildComponentKubeconfigs creates Kubeconfig CRs for the controller and webhook +// identities, then extracts the generated kubeconfigs pointing at the in-cluster +// front-proxy for use by deployed pods. +func buildComponentKubeconfigs() { + depCtrlPath := "root:" + wsDepCtrl - kcpInternalServer := fmt.Sprintf("https://kcp.%s.svc.cluster.local:6443/clusters/root:%s", kcpNamespace, wsDepCtrl) + // Controller and webhook kubeconfigs target the root shard (not the front-proxy) + // so their client certificates are signed by root-client-ca. This CA is trusted by + // both the front-proxy (via kcp-merged-client-ca) and all shards directly. This is + // required because the multicluster-provider connects to APIExport virtual workspace + // URLs that point directly at shards, not through the front-proxy. + // The server URL is rewritten below to point at the front-proxy. + applyToKind(fmt.Sprintf(`apiVersion: operator.kcp.io/v1alpha1 +kind: Kubeconfig +metadata: + name: e2e-controller + namespace: %[1]s +spec: + username: "system:serviceaccount:%[2]s:dependency-controller" + groups: + - "system:authenticated" + - "system:serviceaccounts" + - "system:serviceaccounts:%[2]s" + validity: 8766h + secretRef: + name: e2e-controller-kubeconfig + target: + rootShardRef: + name: root`, kcpNamespace, depNamespace)) + + applyToKind(fmt.Sprintf(`apiVersion: operator.kcp.io/v1alpha1 +kind: Kubeconfig +metadata: + name: e2e-webhook + namespace: %[1]s +spec: + username: "system:serviceaccount:%[2]s:dependency-webhook" + groups: + - "system:authenticated" + - "system:serviceaccounts" + - "system:serviceaccounts:%[2]s" + validity: 8766h + secretRef: + name: e2e-webhook-kubeconfig + target: + rootShardRef: + name: root`, kcpNamespace, depNamespace)) + + // Wait for both kubeconfig secrets. + for _, name := range []string{"e2e-controller-kubeconfig", "e2e-webhook-kubeconfig"} { + waitFor(2*time.Minute, fmt.Sprintf("%s secret created", name), func() error { + _, err := kindctlNoFail("-n", kcpNamespace, "get", "secret", name, + "-o", "jsonpath={.data.kubeconfig}") + + return err + }) + } - // Build controller kubeconfig. - ctrlCrt := secretField("kcp-controller-cert", "{.data.tls\\.crt}") - ctrlKey := secretField("kcp-controller-cert", "{.data.tls\\.key}") - ctrlCrtFile := filepath.Join(tmpDir, "ctrl-client.crt") - ctrlKeyFile := filepath.Join(tmpDir, "ctrl-client.key") - Expect(os.WriteFile(ctrlCrtFile, ctrlCrt, 0o600)).To(Succeed()) - Expect(os.WriteFile(ctrlKeyFile, ctrlKey, 0o600)).To(Succeed()) + // The kubeconfigs target the root shard. Extract the shard URL and rewrite + // it to the front-proxy URL with the dep-ctrl workspace path. The client cert + // from root-client-ca works for both front-proxy and direct shard access. 
+ fpHostname := fmt.Sprintf("kcp-front-proxy.%s.svc.cluster.local", kcpNamespace) + kcRaw := kindctlSecret("e2e-controller-kubeconfig") + kcBytes, err := decodeBase64(kcRaw) + Expect(err).NotTo(HaveOccurred()) + shardURL := extractServerFromKubeconfig(kcBytes) + // Determine the front-proxy port from the shard URL (both use 6443). + parsed, err := url.Parse(shardURL) + Expect(err).NotTo(HaveOccurred()) + fpPort := parsed.Port() + if fpPort == "" { + fpPort = "6443" + } + inClusterFPURL = "https://" + net.JoinHostPort(fpHostname, fpPort) + depCtrlURL := inClusterFPURL + "/clusters/" + depCtrlPath + + // Rewrite kubeconfigs: shard URL -> front-proxy + workspace path. controllerKubeconfigPath = filepath.Join(tmpDir, "kcp-controller.kubeconfig") - run(kubectlBin, "--kubeconfig", controllerKubeconfigPath, "config", "set-cluster", "kcp", - "--server="+kcpInternalServer, - "--certificate-authority="+caFile, - "--embed-certs=true") - run(kubectlBin, "--kubeconfig", controllerKubeconfigPath, "config", "set-credentials", "controller", - "--client-certificate="+ctrlCrtFile, - "--client-key="+ctrlKeyFile, - "--embed-certs=true") - run(kubectlBin, "--kubeconfig", controllerKubeconfigPath, "config", "set-context", "kcp", - "--cluster=kcp", "--user=controller") - run(kubectlBin, "--kubeconfig", controllerKubeconfigPath, "config", "use-context", "kcp") - - // Build webhook kubeconfig. - whCrt := secretField("kcp-webhook-sa-cert", "{.data.tls\\.crt}") - whKey := secretField("kcp-webhook-sa-cert", "{.data.tls\\.key}") - whCrtFile := filepath.Join(tmpDir, "webhook-client.crt") - whKeyFile := filepath.Join(tmpDir, "webhook-client.key") - Expect(os.WriteFile(whCrtFile, whCrt, 0o600)).To(Succeed()) - Expect(os.WriteFile(whKeyFile, whKey, 0o600)).To(Succeed()) + extractAndRewriteKubeconfig("e2e-controller-kubeconfig", controllerKubeconfigPath, + shardURL, depCtrlURL) webhookKubeconfigPath = filepath.Join(tmpDir, "kcp-webhook.kubeconfig") - run(kubectlBin, "--kubeconfig", webhookKubeconfigPath, "config", "set-cluster", "kcp", - "--server="+kcpInternalServer, - "--certificate-authority="+caFile, - "--embed-certs=true") - run(kubectlBin, "--kubeconfig", webhookKubeconfigPath, "config", "set-credentials", "webhook", - "--client-certificate="+whCrtFile, - "--client-key="+whKeyFile, - "--embed-certs=true") - run(kubectlBin, "--kubeconfig", webhookKubeconfigPath, "config", "set-context", "kcp", - "--cluster=kcp", "--user=webhook") - run(kubectlBin, "--kubeconfig", webhookKubeconfigPath, "config", "use-context", "kcp") -} - -// bootstrapRBAC uses the bootstrap (system:masters) cert to create -// RBAC for the controller and webhook identities. -// - Root workspace: workspace access, webhook/RBAC management, APIExport content -// - Dep-ctrl workspace: APIExportEndpointSlice read for VW URL discovery + extractAndRewriteKubeconfig("e2e-webhook-kubeconfig", webhookKubeconfigPath, + shardURL, depCtrlURL) +} + +// extractAndRewriteKubeconfig extracts a kubeconfig from a secret, rewrites the +// server URL, and writes it to the given path. +func extractAndRewriteKubeconfig(secretName, outputPath, oldURL, newURL string) { + GinkgoHelper() + kcRaw := kindctlSecret(secretName) + kcBytes, err := decodeBase64(kcRaw) + Expect(err).NotTo(HaveOccurred()) + + rewritten := string(kcBytes) + + // kcp-operator generates two contexts: "base" (bare front-proxy URL) and + // "default" (front-proxy URL + /clusters/root). 
We rewrite the base URL to + // include the workspace path, but this corrupts the "default" entry with a + // double /clusters/ path. Switch to the "base" context which has the correct URL. + rewritten = strings.ReplaceAll(rewritten, oldURL, newURL) + rewritten = strings.ReplaceAll(rewritten, "current-context: default", "current-context: base") + + Expect(os.WriteFile(outputPath, []byte(rewritten), 0o600)).To(Succeed()) +} + +// decodeBase64 decodes a base64-encoded string. +func decodeBase64(s string) ([]byte, error) { + return base64.StdEncoding.DecodeString(strings.TrimSpace(s)) +} + +// bootstrapRBAC creates RBAC for the controller and webhook identities. +// The webhook's broad get/list rule must be applied in system:admin on every +// shard hosting consumer workspaces — the BootstrapPolicyAuthorizer reads +// RBAC from the local shard's system:admin only ("the policy defined in +// this workspace applies to every workspace in a kcp shard"; kcp source +// pkg/authorization/bootstrap_policy_authorizer.go), so per-shard application +// is required. Controller rules and dep-ctrl APIExport access live in the +// root and dep-ctrl workspaces and go via the front-proxy. func bootstrapRBAC() { - kindctl("apply", "-f", filepath.Join(fixturesDir, "kcp-bootstrap-cert.yaml")) + // Webhook get/list, applied in system:admin on every shard via direct + // (port-forwarded) shard access. + applySystemAdminRBAC("root", "rootShardRef") + applySystemAdminRBAC("shard1", "shardRef") + + // Controller-only RBAC in the root workspace via front-proxy. + run(kubectlBin, "--kubeconfig", kcpHostKubeconfig, + "--server", fmt.Sprintf("https://localhost:%s/clusters/root", frontProxyNodePort), + "apply", "-f", filepath.Join(fixturesDir, "root-rbac-bootstrap.yaml")) + + // Controller + webhook access to the dep-ctrl APIExport via front-proxy. + run(kubectlBin, "--kubeconfig", kcpHostKubeconfig, + "--server", fmt.Sprintf("https://localhost:%s/clusters/root:%s", frontProxyNodePort, wsDepCtrl), + "apply", "-f", filepath.Join(fixturesDir, "depctrl-rbac-bootstrap.yaml")) +} + +// applySystemAdminRBAC creates a system:masters kubeconfig targeting the +// given shard, port-forwards that shard's service to localhost, applies the +// system:admin RBAC fixture there, then tears down the port-forward. +// +// refField selects the kcp-operator Kubeconfig target field: "rootShardRef" +// for the root shard, "shardRef" for any secondary shard. +func applySystemAdminRBAC(shardName, refField string) { + GinkgoHelper() - waitFor(time.Minute, "bootstrap cert issued", func() error { - _, err := kindctlNoFail("-n", kcpNamespace, "get", "secret", "kcp-bootstrap-cert", - "-o", "jsonpath={.data.tls\\.crt}") + kubeconfigName := "e2e-" + shardName + "-system-masters" + secretName := kubeconfigName + "-kubeconfig" + + // Create a Kubeconfig CR with the appropriate shard target + system:masters + // group. The front-proxy does not honor system:masters, so we must hit the + // shard directly. The shard's server cert already includes localhost / + // 127.0.0.1 (see the certificateTemplates on the RootShard / Shard CRs). 
+ applyToKind(fmt.Sprintf(`apiVersion: operator.kcp.io/v1alpha1 +kind: Kubeconfig +metadata: + name: %[1]s + namespace: %[2]s +spec: + username: e2e-system-masters + groups: + - "system:masters" + validity: 8766h + secretRef: + name: %[3]s + target: + %[4]s: + name: %[5]s`, kubeconfigName, kcpNamespace, secretName, refField, shardName)) + + waitFor(2*time.Minute, fmt.Sprintf("%s secret created", secretName), func() error { + _, err := kindctlNoFail("-n", kcpNamespace, "get", "secret", secretName, + "-o", "jsonpath={.data.kubeconfig}") return err }) - bsCrt := secretField("kcp-bootstrap-cert", "{.data.tls\\.crt}") - bsKey := secretField("kcp-bootstrap-cert", "{.data.tls\\.key}") - kcpServerCA := secretField("kcp-ca", "{.data.tls\\.crt}") - - bsCrtFile := filepath.Join(tmpDir, "bootstrap-client.crt") - bsKeyFile := filepath.Join(tmpDir, "bootstrap-client.key") - caFile := filepath.Join(tmpDir, "kcp-server-ca.crt") - Expect(os.WriteFile(bsCrtFile, bsCrt, 0o600)).To(Succeed()) - Expect(os.WriteFile(bsKeyFile, bsKey, 0o600)).To(Succeed()) - Expect(os.WriteFile(caFile, kcpServerCA, 0o600)).To(Succeed()) - - // Build a temporary bootstrap kubeconfig targeting root via kcp server NodePort. - bsKubeconfig := filepath.Join(tmpDir, "kcp-bootstrap.kubeconfig") - run(kubectlBin, "--kubeconfig", bsKubeconfig, "config", "set-cluster", "kcp", - fmt.Sprintf("--server=https://localhost:%s/clusters/root", kcpServerLocalPort), - "--certificate-authority="+caFile, - "--embed-certs=true") - run(kubectlBin, "--kubeconfig", bsKubeconfig, "config", "set-credentials", "bootstrap", - "--client-certificate="+bsCrtFile, - "--client-key="+bsKeyFile, - "--embed-certs=true") - run(kubectlBin, "--kubeconfig", bsKubeconfig, "config", "set-context", "kcp", - "--cluster=kcp", "--user=bootstrap") - run(kubectlBin, "--kubeconfig", bsKubeconfig, "config", "use-context", "kcp") - - // Apply RBAC in the root workspace. - run(kubectlBin, "--kubeconfig", bsKubeconfig, "apply", "-f", - filepath.Join(fixturesDir, "root-rbac-bootstrap.yaml")) - - // Apply RBAC in the dep-ctrl workspace (apiexportendpointslices read). - run(kubectlBin, "--kubeconfig", bsKubeconfig, - "--server", fmt.Sprintf("https://localhost:%s/clusters/root:%s", kcpServerLocalPort, wsDepCtrl), - "apply", "-f", filepath.Join(fixturesDir, "depctrl-rbac-bootstrap.yaml")) + kcRaw := kindctlSecret(secretName) + kcBytes, err := decodeBase64(kcRaw) + Expect(err).NotTo(HaveOccurred()) + shardURL := extractServerFromKubeconfig(kcBytes) + + parsed, err := url.Parse(shardURL) + Expect(err).NotTo(HaveOccurred()) + shardSvc := strings.SplitN(parsed.Hostname(), ".", 2)[0] + shardPort := parsed.Port() + if shardPort == "" { + shardPort = "6443" + } - // Apply shard-wide RBAC in system:admin (apiexports/content + apiexportendpointslices for webhook SA). - // The Bootstrap Policy Authorizer evaluates system:admin RBAC for every request on the shard. 
- run(kubectlBin, "--kubeconfig", bsKubeconfig, - "--server", fmt.Sprintf("https://localhost:%s/clusters/system:admin", kcpServerLocalPort), - "apply", "-f", filepath.Join(fixturesDir, "shard-admin-rbac-bootstrap.yaml")) + localPort := pickFreePort() + rewritten := strings.ReplaceAll(string(kcBytes), + shardURL, fmt.Sprintf("https://localhost:%d", localPort)) + rewritten = strings.ReplaceAll(rewritten, + "current-context: default", "current-context: base") + sysKubeconfig := filepath.Join(tmpDir, kubeconfigName+".kubeconfig") + Expect(os.WriteFile(sysKubeconfig, []byte(rewritten), 0o600)).To(Succeed()) + + // Start port-forward in the background. kubectl port-forward exits when + // stdin closes; we kill it explicitly via defer. + pfCmd := exec.CommandContext(context.Background(), kubectlBin, + "--context", "kind-"+kindClusterName, + "-n", kcpNamespace, "port-forward", + "svc/"+shardSvc, fmt.Sprintf("%d:%s", localPort, shardPort)) + pfCmd.Stdout = GinkgoWriter + pfCmd.Stderr = GinkgoWriter + Expect(pfCmd.Start()).To(Succeed()) + defer func() { + _ = pfCmd.Process.Kill() + _, _ = pfCmd.Process.Wait() + }() + + waitFor(30*time.Second, fmt.Sprintf("%s reachable via port-forward", shardSvc), func() error { + _, err := runNoFail(kubectlBin, "--kubeconfig", sysKubeconfig, + "--server", fmt.Sprintf("https://localhost:%d/clusters/system:admin", localPort), + "get", "--raw", "/readyz") + + return err + }) + + // --validate=false: system:admin does not serve OpenAPI, so client-side + // schema validation has nothing to compare against. + run(kubectlBin, "--kubeconfig", sysKubeconfig, + "--server", fmt.Sprintf("https://localhost:%d/clusters/system:admin", localPort), + "apply", "--validate=false", + "-f", filepath.Join(fixturesDir, "system-admin-rbac-bootstrap.yaml")) } +// pickFreePort asks the kernel for a free TCP port on localhost. +func pickFreePort() int { + GinkgoHelper() + var lc net.ListenConfig + listener, err := lc.Listen(context.Background(), "tcp", "127.0.0.1:0") + Expect(err).NotTo(HaveOccurred()) + port := listener.Addr().(*net.TCPAddr).Port + Expect(listener.Close()).To(Succeed()) + + return port +} + +// extractServerFromKubeconfig extracts the server URL from a kubeconfig YAML. +func extractServerFromKubeconfig(kubeconfig []byte) string { + // Simple regex extraction — avoids pulling in k8s.io/client-go/tools/clientcmd. + re := regexp.MustCompile(`server:\s*(https?://\S+)`) + m := re.FindSubmatch(kubeconfig) + if len(m) < 2 { + Fail("could not extract server URL from kubeconfig") + } + + return string(m[1]) +} + +// portForwardRe matches the "Forwarding from 127.0.0.1:PORT -> ..." line. func buildAndLoadImage() { run(dockerBin, "build", "-t", imageName, rootDir) run(kindBin, "load", "docker-image", imageName, "--name", kindClusterName) } func setupKCPWorkspaces() { - // Create all workspaces. - for _, ws := range []string{wsDepCtrl, wsNetworkProvider, wsComputeProvider, wsConsumer1, wsConsumer2} { - cmd := exec.CommandContext(context.Background(), kubectlBin, - "--kubeconfig", kcpHostKubeconfig, - "--server", fmt.Sprintf("https://localhost:%s/clusters/root", kcpNodePort), - "apply", "-f", "-", - ) - cmd.Stdin = strings.NewReader(fmt.Sprintf(`apiVersion: tenancy.kcp.io/v1alpha1 -kind: Workspace -metadata: - name: %s`, ws)) - var buf bytes.Buffer - cmd.Stdout = &buf - cmd.Stderr = &buf - cmd.Run() //nolint:errcheck // ignore AlreadyExists + // Label both shards so we can pin workspaces deterministically. The + // kcp-operator registers a kcp Shard object named after each CR. 
Each + // workspace is then pinned via spec.location.selector.matchLabels using + // the active config below. + for _, shard := range []string{"root", "shard1"} { + waitFor(time.Minute, fmt.Sprintf("shard %s kcp object exists", shard), func() error { + _, err := kcpctlRootNoFail("get", "shard", shard) + return err + }) + run(kubectlBin, "--kubeconfig", kcpHostKubeconfig, + "--server", fmt.Sprintf("https://localhost:%s/clusters/root", frontProxyNodePort), + "label", "shard", shard, "e2e-target="+shard, "--overwrite") } + // Create workspaces with the placement dictated by the active shard config. + createWorkspace(wsDepCtrl, activeShardConfig.depCtrl) + createWorkspace(wsNetworkProvider, activeShardConfig.networkProvider) + createWorkspace(wsComputeProvider, activeShardConfig.computeProvider) + createWorkspace(wsConsumer1, activeShardConfig.consumer1) + createWorkspace(wsConsumer2, activeShardConfig.consumer2) + // Wait for workspaces to be ready. for _, ws := range []string{wsDepCtrl, wsNetworkProvider, wsComputeProvider, wsConsumer1, wsConsumer2} { waitFor(time.Minute, fmt.Sprintf("workspace %s ready", ws), func() error { @@ -590,6 +1012,11 @@ metadata: }) } + // Verify each workspace landed on the shard the active config pinned + // it to. Pinning is the assertion — if scheduling ignored a selector, + // later cross-shard tests would silently degrade to single-shard. + verifyShardPlacements() + // Apply schemas and exports. kcpctl(wsDepCtrl, "apply", "-f", filepath.Join(rootDir, "config/kcp/apiresourceschema-dependencyrules.dependencies.opendefense.cloud.yaml")) kcpctl(wsDepCtrl, "apply", "-f", filepath.Join(rootDir, "config/kcp/apiexport-dependencies.opendefense.cloud.yaml")) @@ -643,6 +1070,100 @@ metadata: } } +// verifyShardPlacements asserts that each workspace landed on the shard the +// active config pinned it to. The shard placement is exposed via the +// internal.tenancy.kcp.io/shard annotation; the annotation value is the +// shard's logical-cluster ID (not its name), so we infer the name→ID +// mapping from the placements themselves and then check consistency: +// workspaces sharing a target name must share an ID, and distinct targets +// must resolve to distinct IDs. +func verifyShardPlacements() { + GinkgoHelper() + expected := map[string]string{ + wsDepCtrl: activeShardConfig.depCtrl, + wsNetworkProvider: activeShardConfig.networkProvider, + wsComputeProvider: activeShardConfig.computeProvider, + wsConsumer1: activeShardConfig.consumer1, + wsConsumer2: activeShardConfig.consumer2, + } + + const shardAnnotation = "internal.tenancy.kcp.io/shard" + jsonpath := fmt.Sprintf(`jsonpath={.metadata.annotations.%s}`, + strings.ReplaceAll(shardAnnotation, ".", `\.`)) + + waitFor(time.Minute, "shard placements match active config", func() error { + actual := make(map[string]string, len(expected)) + for ws := range expected { + out, err := kcpctlRootNoFail("get", "workspace", ws, "-o", jsonpath) + if err != nil { + return err + } + id := strings.TrimSpace(out) + if id == "" { + return fmt.Errorf("workspace %s shard annotation not yet set", ws) + } + actual[ws] = id + } + + // Build name→ID mapping from observations; flag mismatches in the + // same group (same target name, different IDs) and across groups + // (different target names, same ID — selector ignored). 
+ nameToID := map[string]string{} + for ws, target := range expected { + id := actual[ws] + if existing, ok := nameToID[target]; ok && existing != id { + return fmt.Errorf("workspaces pinned to %q resolved to multiple shards: %q vs %q (workspace %s)", + target, existing, id, ws) + } + nameToID[target] = id + } + idToName := map[string]string{} + for name, id := range nameToID { + if existing, ok := idToName[id]; ok && existing != name { + return fmt.Errorf("shards %q and %q both resolved to logical cluster %q — selector ignored", + name, existing, id) + } + idToName[id] = name + } + + return nil + }) +} + +// createWorkspace creates a kcp workspace under root. If shardTarget is non-empty, +// the workspace is pinned to that shard via spec.location.selector. +func createWorkspace(name, shardTarget string) { + GinkgoHelper() + var manifest string + if shardTarget != "" { + manifest = fmt.Sprintf(`apiVersion: tenancy.kcp.io/v1alpha1 +kind: Workspace +metadata: + name: %s +spec: + location: + selector: + matchLabels: + e2e-target: %s`, name, shardTarget) + } else { + manifest = fmt.Sprintf(`apiVersion: tenancy.kcp.io/v1alpha1 +kind: Workspace +metadata: + name: %s`, name) + } + + cmd := exec.CommandContext(context.Background(), kubectlBin, + "--kubeconfig", kcpHostKubeconfig, + "--server", fmt.Sprintf("https://localhost:%s/clusters/root", frontProxyNodePort), + "apply", "-f", "-", + ) + cmd.Stdin = strings.NewReader(manifest) + var buf bytes.Buffer + cmd.Stdout = &buf + cmd.Stderr = &buf + cmd.Run() //nolint:errcheck // ignore AlreadyExists +} + // createKubeconfigSecret creates or updates a Secret in the dep-ctrl namespace // containing the given kubeconfig file. func createKubeconfigSecret(secretName, kubeconfigPath string) { @@ -673,6 +1194,19 @@ func deployCharts() { filepath.Join(rootDir, "charts/dependency-controller"), "--namespace", depNamespace, "--values", filepath.Join(fixturesDir, "integration-values.yaml"), + "--set", "kcpBaseHost="+inClusterFPURL, "--wait", "--timeout", "120s", ) } + +// applyToKind applies a YAML manifest to the kind cluster. +func applyToKind(manifest string) { + GinkgoHelper() + cmd := exec.CommandContext(context.Background(), kubectlBin, + "--context", "kind-"+kindClusterName, "apply", "-f", "-") + cmd.Stdin = strings.NewReader(manifest) + var buf bytes.Buffer + cmd.Stdout = &buf + cmd.Stderr = &buf + Expect(cmd.Run()).To(Succeed(), "applying manifest: %s", buf.String()) +} diff --git a/test/fixtures/depctrl-rbac-bootstrap.yaml b/test/fixtures/depctrl-rbac-bootstrap.yaml index 3fb0f32..9868587 100644 --- a/test/fixtures/depctrl-rbac-bootstrap.yaml +++ b/test/fixtures/depctrl-rbac-bootstrap.yaml @@ -1,20 +1,16 @@ # RBAC in the dep-ctrl workspace for the dependency-controller. -# Applied during setup using a bootstrap (system:masters) identity. +# Applied during setup using a privileged identity (e.g., system:masters). # -# The controller connects to the dep-ctrl workspace to: +# Both components connect to the dep-ctrl workspace to: # 1. Discover the VW URL via APIExportEndpointSlice # 2. Access the dep-ctrl APIExport's virtual workspace (apiexports/content) # -# The controller needs full CRUD on apiexports/content because it manages -# webhooks in binding workspaces via the VW (using permissionClaims). -# -# The webhook SA's access is granted shard-wide via system:admin RBAC -# (see shard-admin-rbac-bootstrap.yaml) instead of per-workspace grants. +# The controller manages webhooks in binding workspaces via the VW. 
--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: dependency-controller + name: dependency-controller-depctrl rules: - apiGroups: ["apis.kcp.io"] resources: ["apiexportendpointslices"] @@ -27,12 +23,15 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: dependency-controller + name: dependency-controller-depctrl roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: dependency-controller + name: dependency-controller-depctrl subjects: - apiGroup: rbac.authorization.k8s.io kind: User name: "system:serviceaccount:dependency-system:dependency-controller" + - apiGroup: rbac.authorization.k8s.io + kind: User + name: "system:serviceaccount:dependency-system:dependency-webhook" diff --git a/test/fixtures/dependencyrule-vm-dependencies copy.yaml b/test/fixtures/dependencyrule-vm-dependencies copy.yaml new file mode 100644 index 0000000..74cf344 --- /dev/null +++ b/test/fixtures/dependencyrule-vm-dependencies copy.yaml @@ -0,0 +1,19 @@ +apiVersion: dependencies.opendefense.cloud/v1alpha1 +kind: DependencyRule +metadata: + name: vm-dependencies +spec: + dependent: + apiExportName: compute.test.io + group: compute.test.io + version: v1 + kind: VirtualMachine + dependencies: + - apiExportRef: + path: root:network-provider + name: network.test.io + group: network.test.io + version: v1 + resource: vpcs + fieldRef: + path: ".spec.vpcRef.name" diff --git a/test/fixtures/dependencyrule-vm-dependencies.yaml b/test/fixtures/dependencyrule-vm-dependencies.yaml index 5975178..cc1ec0c 100644 --- a/test/fixtures/dependencyrule-vm-dependencies.yaml +++ b/test/fixtures/dependencyrule-vm-dependencies.yaml @@ -8,6 +8,7 @@ spec: group: compute.test.io version: v1 kind: VirtualMachine + resource: virtualmachines dependencies: - apiExportRef: path: ${NETWORK_PROVIDER_PATH} diff --git a/test/fixtures/integration-values-kcp.yaml b/test/fixtures/integration-values-kcp.yaml deleted file mode 100644 index 5f4ba1d..0000000 --- a/test/fixtures/integration-values-kcp.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Values for integration testing — kcp deployed in kind via helm chart. -externalHostname: "kcp-front-proxy.kcp-system.svc.cluster.local" -externalPort: "8443" - -certificates: - dnsNames: - - localhost - - kcp-front-proxy.kcp-system.svc.cluster.local - - kcp.kcp-system.svc.cluster.local - -etcd: - replicas: 1 - resources: - requests: - memory: 256Mi - cpu: 100m - limits: - memory: 512Mi - volumeSize: 1Gi - -kcp: - volumeClassName: standard - extraFlags: - - "--shard-base-url=https://kcp.kcp-system.svc.cluster.local:6443" - resources: - requests: - memory: 512Mi - cpu: 200m - limits: - memory: 1Gi - -kcpFrontProxy: - service: - type: NodePort - nodePort: 31500 diff --git a/test/fixtures/integration-values.yaml b/test/fixtures/integration-values.yaml index dee598b..9161218 100644 --- a/test/fixtures/integration-values.yaml +++ b/test/fixtures/integration-values.yaml @@ -1,10 +1,12 @@ -# Values for integration testing — both components deployed in kind, connecting to kcp server directly. +# Values for integration testing — both components deployed in kind, +# connecting to kcp via the front-proxy. image: repository: dependency-controller tag: integration-test pullPolicy: Never -kcpBaseHost: "https://kcp.kcp-system.svc.cluster.local:6443" +# kcpBaseHost is set dynamically via --set in the e2e test to match +# the actual front-proxy URL from kcp-operator. 
controller: kubeconfig: diff --git a/test/fixtures/kcp-admin-cert.yaml b/test/fixtures/kcp-admin-cert.yaml deleted file mode 100644 index 1fa9950..0000000 --- a/test/fixtures/kcp-admin-cert.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# Client certificates for kcp admin access. -# -# The front-proxy cert is used by the test script (host) to manage workspaces -# via the front-proxy NodePort. -# The server cert is used by in-cluster pods (controller/webhook) to connect -# directly to the kcp server, including virtual workspace endpoints. ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: kcp-admin-front-proxy-cert - namespace: kcp-system -spec: - commonName: kcp-admin - issuerRef: - name: kcp-front-proxy-client-issuer - privateKey: - algorithm: RSA - size: 2048 - secretName: kcp-admin-front-proxy-cert - subject: - organizations: - - "system:kcp:admin" - usages: - - client auth ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: kcp-admin-server-cert - namespace: kcp-system -spec: - commonName: kcp-admin - issuerRef: - name: kcp-client-issuer - privateKey: - algorithm: RSA - size: 2048 - secretName: kcp-admin-server-cert - subject: - organizations: - - "system:kcp:admin" - usages: - - client auth diff --git a/test/fixtures/kcp-bootstrap-cert.yaml b/test/fixtures/kcp-bootstrap-cert.yaml deleted file mode 100644 index 5066b73..0000000 --- a/test/fixtures/kcp-bootstrap-cert.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Bootstrap cert with system:masters for e2e setup only. -# Used to seed initial RBAC grants in system:master workspace. -# NOT used by any deployed component. -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: kcp-bootstrap-cert - namespace: kcp-system -spec: - commonName: "e2e-bootstrap" - issuerRef: - name: kcp-client-issuer - privateKey: - algorithm: RSA - size: 2048 - secretName: kcp-bootstrap-cert - subject: - organizations: - - "system:masters" - usages: - - client auth diff --git a/test/fixtures/kcp-controller-cert.yaml b/test/fixtures/kcp-controller-cert.yaml deleted file mode 100644 index 1cb3fee..0000000 --- a/test/fixtures/kcp-controller-cert.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Client certificate for the dependency-controller service account. -# Issued by kcp's client CA so the controller can connect to the kcp server -# directly (not via front-proxy). -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: kcp-controller-cert - namespace: kcp-system -spec: - commonName: "system:serviceaccount:dependency-system:dependency-controller" - issuerRef: - name: kcp-client-issuer - privateKey: - algorithm: RSA - size: 2048 - secretName: kcp-controller-cert - subject: - organizations: - - "system:serviceaccounts" - - "system:serviceaccounts:dependency-system" - usages: - - client auth diff --git a/test/fixtures/kcp-server-nodeport.yaml b/test/fixtures/kcp-server-nodeport.yaml deleted file mode 100644 index 6d3b0da..0000000 --- a/test/fixtures/kcp-server-nodeport.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# NodePort service exposing the kcp server directly (not the front-proxy). -# VW paths (/services/apiexport/...) are only served by the kcp server, -# so e2e tests need direct access to verify RBAC via VW requests. 
diff --git a/test/fixtures/kcp-admin-cert.yaml b/test/fixtures/kcp-admin-cert.yaml
deleted file mode 100644
index 1fa9950..0000000
--- a/test/fixtures/kcp-admin-cert.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-# Client certificates for kcp admin access.
-#
-# The front-proxy cert is used by the test script (host) to manage workspaces
-# via the front-proxy NodePort.
-# The server cert is used by in-cluster pods (controller/webhook) to connect
-# directly to the kcp server, including virtual workspace endpoints.
----
-apiVersion: cert-manager.io/v1
-kind: Certificate
-metadata:
-  name: kcp-admin-front-proxy-cert
-  namespace: kcp-system
-spec:
-  commonName: kcp-admin
-  issuerRef:
-    name: kcp-front-proxy-client-issuer
-  privateKey:
-    algorithm: RSA
-    size: 2048
-  secretName: kcp-admin-front-proxy-cert
-  subject:
-    organizations:
-      - "system:kcp:admin"
-  usages:
-    - client auth
----
-apiVersion: cert-manager.io/v1
-kind: Certificate
-metadata:
-  name: kcp-admin-server-cert
-  namespace: kcp-system
-spec:
-  commonName: kcp-admin
-  issuerRef:
-    name: kcp-client-issuer
-  privateKey:
-    algorithm: RSA
-    size: 2048
-  secretName: kcp-admin-server-cert
-  subject:
-    organizations:
-      - "system:kcp:admin"
-  usages:
-    - client auth
diff --git a/test/fixtures/kcp-bootstrap-cert.yaml b/test/fixtures/kcp-bootstrap-cert.yaml
deleted file mode 100644
index 5066b73..0000000
--- a/test/fixtures/kcp-bootstrap-cert.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-# Bootstrap cert with system:masters for e2e setup only.
-# Used to seed initial RBAC grants in system:master workspace.
-# NOT used by any deployed component.
-apiVersion: cert-manager.io/v1
-kind: Certificate
-metadata:
-  name: kcp-bootstrap-cert
-  namespace: kcp-system
-spec:
-  commonName: "e2e-bootstrap"
-  issuerRef:
-    name: kcp-client-issuer
-  privateKey:
-    algorithm: RSA
-    size: 2048
-  secretName: kcp-bootstrap-cert
-  subject:
-    organizations:
-      - "system:masters"
-  usages:
-    - client auth
diff --git a/test/fixtures/kcp-controller-cert.yaml b/test/fixtures/kcp-controller-cert.yaml
deleted file mode 100644
index 1cb3fee..0000000
--- a/test/fixtures/kcp-controller-cert.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-# Client certificate for the dependency-controller service account.
-# Issued by kcp's client CA so the controller can connect to the kcp server
-# directly (not via front-proxy).
-apiVersion: cert-manager.io/v1
-kind: Certificate
-metadata:
-  name: kcp-controller-cert
-  namespace: kcp-system
-spec:
-  commonName: "system:serviceaccount:dependency-system:dependency-controller"
-  issuerRef:
-    name: kcp-client-issuer
-  privateKey:
-    algorithm: RSA
-    size: 2048
-  secretName: kcp-controller-cert
-  subject:
-    organizations:
-      - "system:serviceaccounts"
-      - "system:serviceaccounts:dependency-system"
-  usages:
-    - client auth
diff --git a/test/fixtures/kcp-server-nodeport.yaml b/test/fixtures/kcp-server-nodeport.yaml
deleted file mode 100644
index 6d3b0da..0000000
--- a/test/fixtures/kcp-server-nodeport.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-# NodePort service exposing the kcp server directly (not the front-proxy).
-# VW paths (/services/apiexport/...) are only served by the kcp server,
-# so e2e tests need direct access to verify RBAC via VW requests.
-apiVersion: v1
-kind: Service
-metadata:
-  name: kcp-server-nodeport
-  namespace: kcp-system
-spec:
-  type: NodePort
-  ports:
-    - protocol: TCP
-      name: kcp
-      port: 6443
-      targetPort: 6443
-      nodePort: 31501
-  selector:
-    app.kubernetes.io/name: kcp
-    app.kubernetes.io/instance: kcp
-    app.kubernetes.io/component: server
diff --git a/test/fixtures/kcp-webhook-sa-cert.yaml b/test/fixtures/kcp-webhook-sa-cert.yaml
deleted file mode 100644
index eefbf66..0000000
--- a/test/fixtures/kcp-webhook-sa-cert.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-# Client certificate for the dependency-webhook service account.
-# Issued by kcp's client CA so the webhook can connect to the kcp server
-# directly. The CN and orgs encode the SA identity that kcp uses for
-# authorization (matching the bootstrap RBAC subjects in root, dep-ctrl,
-# and system:admin).
-apiVersion: cert-manager.io/v1
-kind: Certificate
-metadata:
-  name: kcp-webhook-sa-cert
-  namespace: kcp-system
-spec:
-  commonName: "system:serviceaccount:dependency-system:dependency-webhook"
-  issuerRef:
-    name: kcp-client-issuer
-  privateKey:
-    algorithm: RSA
-    size: 2048
-  secretName: kcp-webhook-sa-cert
-  subject:
-    organizations:
-      - "system:serviceaccounts"
-      - "system:serviceaccounts:dependency-system"
-  usages:
-    - client auth
diff --git a/test/fixtures/kind-config.yaml b/test/fixtures/kind-config.yaml
index a28c932..a6692b8 100644
--- a/test/fixtures/kind-config.yaml
+++ b/test/fixtures/kind-config.yaml
@@ -3,9 +3,6 @@ apiVersion: kind.x-k8s.io/v1alpha4
 nodes:
   - role: control-plane
     extraPortMappings:
-      - containerPort: 31500
-        hostPort: 31500
-        protocol: TCP
-      - containerPort: 31501
-        hostPort: 31501
+      - containerPort: 31443
+        hostPort: 31443
         protocol: TCP
diff --git a/test/fixtures/root-rbac-bootstrap.yaml b/test/fixtures/root-rbac-bootstrap.yaml
index f9a27e0..2d381ee 100644
--- a/test/fixtures/root-rbac-bootstrap.yaml
+++ b/test/fixtures/root-rbac-bootstrap.yaml
@@ -1,10 +1,11 @@
-# RBAC in the root workspace for the dependency-controller system.
-# Applied during setup using a bootstrap (system:masters) identity.
+# RBAC in the root workspace for the dependency-controller.
+# Applied during setup using a system:kcp:admin identity via front-proxy.
 #
-# Both components need workspaces/content access to enter the dep-ctrl
-# workspace (where their kubeconfig points). Webhook installation and
-# RBAC management in provider workspaces is done through the dep-ctrl
-# APIExport's virtual workspace, authorized by permissionClaims.
+# Only the controller has rules here. The webhook's broad get/list access
+# lives in each shard's system:admin workspace (see
+# system-admin-rbac-bootstrap.yaml) and applies shard-wide. Webhook
+# installation and RBAC management in provider workspaces is done through
+# the dep-ctrl APIExport's virtual workspace, authorized by permissionClaims.
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
@@ -32,26 +33,3 @@ subjects:
   - apiGroup: rbac.authorization.k8s.io
     kind: User
     name: "system:serviceaccount:dependency-system:dependency-controller"
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: dependency-controller-webhook
-rules:
-  # Enter child workspaces through virtual workspace endpoints.
-  - apiGroups: ["core.kcp.io"]
-    resources: ["workspaces/content"]
-    verbs: ["access"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: dependency-controller-webhook
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: dependency-controller-webhook
-subjects:
-  - apiGroup: rbac.authorization.k8s.io
-    kind: User
-    name: "system:serviceaccount:dependency-system:dependency-webhook"
- - apiGroups: ["core.kcp.io"] - resources: ["workspaces/content"] - verbs: ["access"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: dependency-controller-webhook -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: dependency-controller-webhook -subjects: - - apiGroup: rbac.authorization.k8s.io - kind: User - name: "system:serviceaccount:dependency-system:dependency-webhook" diff --git a/test/fixtures/shard-admin-rbac-bootstrap.yaml b/test/fixtures/shard-admin-rbac-bootstrap.yaml deleted file mode 100644 index 0c01f6f..0000000 --- a/test/fixtures/shard-admin-rbac-bootstrap.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# RBAC in kcp's system:admin logical cluster, granting the webhook SA -# apiexports/content and apiexportendpointslices read access across all -# workspaces on the shard. Applied via the kcp server (not front-proxy) -# using a bootstrap (system:masters) identity. -# -# system:admin RBAC is evaluated by the Bootstrap Policy Authorizer for -# every request on the shard, so this single grant replaces the per-workspace -# ClusterRole/ClusterRoleBinding that the controller previously managed. ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: dependency-controller-webhook-apiexport-reader -rules: - - apiGroups: ["apis.kcp.io"] - resources: ["apiexports/content"] - verbs: ["get", "list", "watch"] - - apiGroups: ["apis.kcp.io"] - resources: ["apiexportendpointslices"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: dependency-controller-webhook-apiexport-reader -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: dependency-controller-webhook-apiexport-reader -subjects: - - apiGroup: rbac.authorization.k8s.io - kind: User - name: "system:serviceaccount:dependency-system:dependency-webhook" diff --git a/test/fixtures/system-admin-rbac-bootstrap.yaml b/test/fixtures/system-admin-rbac-bootstrap.yaml new file mode 100644 index 0000000..62f7992 --- /dev/null +++ b/test/fixtures/system-admin-rbac-bootstrap.yaml @@ -0,0 +1,32 @@ +# RBAC for the dependency-controller webhook in the system:admin workspace. +# Applied during setup using a system:masters identity over a direct +# (port-forwarded) connection to each shard. The system:admin workspace +# is not reachable via the front-proxy. +# +# kcp's BootstrapPolicyAuthorizer reads RBAC from the local shard's +# system:admin only — bindings do not propagate across shards — so this +# fixture is applied once per shard. Within a shard the binding applies +# to every workspace on that shard, so the webhook can get/list dependent +# resources in any consumer workspace without per-workspace RBAC. 
diff --git a/test/fixtures/vm-shard-vm.yaml b/test/fixtures/vm-shard-vm.yaml
new file mode 100644
index 0000000..7b68ea1
--- /dev/null
+++ b/test/fixtures/vm-shard-vm.yaml
@@ -0,0 +1,9 @@
+apiVersion: compute.test.io/v1
+kind: VirtualMachine
+metadata:
+  name: shard-vm
+  namespace: default
+spec:
+  cpu: 2
+  vpcRef:
+    name: shard-vpc
diff --git a/test/fixtures/vm-update-vm.yaml b/test/fixtures/vm-update-vm.yaml
new file mode 100644
index 0000000..c4bce12
--- /dev/null
+++ b/test/fixtures/vm-update-vm.yaml
@@ -0,0 +1,9 @@
+apiVersion: compute.test.io/v1
+kind: VirtualMachine
+metadata:
+  name: update-vm
+  namespace: default
+spec:
+  cpu: 1
+  vpcRef:
+    name: update-vpc
diff --git a/test/fixtures/vpc-shard-vpc.yaml b/test/fixtures/vpc-shard-vpc.yaml
new file mode 100644
index 0000000..9821979
--- /dev/null
+++ b/test/fixtures/vpc-shard-vpc.yaml
@@ -0,0 +1,7 @@
+apiVersion: network.test.io/v1
+kind: VPC
+metadata:
+  name: shard-vpc
+  namespace: default
+spec:
+  cidr: "10.3.0.0/16"
diff --git a/test/fixtures/vpc-update-vpc.yaml b/test/fixtures/vpc-update-vpc.yaml
new file mode 100644
index 0000000..d3ab117
--- /dev/null
+++ b/test/fixtures/vpc-update-vpc.yaml
@@ -0,0 +1,7 @@
+apiVersion: network.test.io/v1
+kind: VPC
+metadata:
+  name: update-vpc
+  namespace: default
+spec:
+  cidr: "10.5.0.0/16"
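These VM/VPC pairs line up with the vm-dependencies rule: each VirtualMachine points at a VPC through .spec.vpcRef.name. A sketch of the flow they presumably exercise; the workspace path is hypothetical and the expectation that the webhook rejects the delete is my reading of the rule semantics rather than a quote from the tests:

kubectl ws use root:consumers:team-a   # hypothetical consumer workspace
kubectl apply -f test/fixtures/vpc-shard-vpc.yaml
kubectl apply -f test/fixtures/vm-shard-vm.yaml
# While shard-vm still references shard-vpc, the dependency webhook should
# refuse the delete; removing the VM first should unblock it.
kubectl delete vpc shard-vpc || echo "delete blocked while shard-vm exists"
kubectl delete virtualmachine shard-vm
kubectl delete vpc shard-vpc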