Skip to content

Commit 850b443

Browse files
authored
fix(linstor): verify resource deletion completes; warn if stuck in DELETING (#13076)
Co-authored-by: jmsperu <jmsperu@users.noreply.github.com>
1 parent 1fe486f commit 850b443

3 files changed

Lines changed: 77 additions & 0 deletions

File tree

plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,17 @@ private boolean deRefOrDeleteResource(DevelopersApi api, String rscName, String
514514
ApiCallRcList answers = api.resourceDefinitionDelete(rd.getName());
515515
checkLinstorAnswersThrow(answers);
516516
deleted = true;
517+
518+
// LINSTOR can return success here while the resource lingers in DELETING state
519+
// on the controller (down peer, lost quorum, etc.). Confirm it's actually gone
520+
// — if not, log a WARN so operators can clear it manually. Don't throw: the
521+
// CloudStack-side accounting has already moved on.
522+
if (!LinstorUtil.waitForResourceDefinitionDeleted(api, rd.getName(),
523+
LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS)) {
524+
logger.warn("Linstor: resource {} still present {}ms after delete returned success — " +
525+
"may be stuck in DELETING. Check the LINSTOR controller (linstor resource list).",
526+
rd.getName(), LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
527+
}
517528
}
518529
}
519530
return deleted;

plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,20 @@ private void deleteResourceDefinition(StoragePoolVO storagePoolVO, String rscDef
232232
throw new CloudRuntimeException("Linstor: Unable to delete resource definition: " + rscDefName);
233233
}
234234
logger.info("Linstor: Deleted resource {}", rscDefName);
235+
236+
// LINSTOR can return success on the delete API call while the resource lingers in
237+
// DELETING state (peer issues, lost quorum, satellite down). Verify the resource is
238+
// actually gone — if not, log a WARN so operators see it. We deliberately do NOT
239+
// throw here: the volume is already considered gone on the CloudStack side, and
240+
// throwing would leave the CS DB and LINSTOR in different states.
241+
if (!LinstorUtil.waitForResourceDefinitionDeleted(linstorApi, rscDefName,
242+
LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS))
243+
{
244+
logger.warn("Linstor: resource {} still present {}ms after delete returned success — " +
245+
"may be stuck in DELETING. Check the LINSTOR controller (linstor resource list) " +
246+
"and clear manually if the resource has no live peers.",
247+
rscDefName, LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
248+
}
235249
} catch (ApiException apiEx)
236250
{
237251
logger.error("Linstor: ApiEx - " + apiEx.getMessage());

plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,58 @@ public static List<ResourceDefinition> getRDListStartingWith(DevelopersApi api,
401401
.collect(Collectors.toList());
402402
}
403403

404+
/**
405+
 * Default per-call timeout for {@link #waitForResourceDefinitionDeleted}. Long enough for a
406+
 * healthy LINSTOR controller to finish a normal delete; short enough not to block the calling
407+
 * thread for too long if the delete is genuinely stuck. Used both from the management server
408+
 * (e.g. {@code LinstorPrimaryDataStoreDriverImpl}) and from KVM agent paths.
409+
 * The value is expressed in milliseconds and currently equals 30 seconds.
 */
410+
public static final long DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS = 30_000L;
411+
412+
/**
413+
* Returns {@code true} if the named resource definition is no longer present on the LINSTOR
414+
* controller. Used after a {@code resourceDefinitionDelete} to verify the delete actually
415+
* completed (LINSTOR can return success on the API call while the resource lingers in
416+
* DELETING state due to peer issues, lost quorum, or down satellites). Uses the
417+
* controller-side name filter rather than scanning every RD on the cluster (cheap even
418+
* when polled once per second from {@link #waitForResourceDefinitionDeleted}).
419+
*/
420+
public static boolean isResourceDefinitionGone(DevelopersApi api, String rscName) throws ApiException {
421+
List<ResourceDefinition> matching =
422+
api.resourceDefinitionList(Collections.singletonList(rscName), false, null, null, null);
423+
return matching == null || matching.isEmpty();
424+
}
425+
426+
/**
427+
* Polls the controller until the named resource definition is gone or the timeout elapses.
428+
* Returns {@code true} if the resource was confirmed gone, {@code false} if it was still
429+
* present (or the controller kept erroring) at the deadline. Callers should NOT throw on a
430+
* {@code false} return — the upstream API call already reported success and the operator
431+
* may need to investigate manually. Log a WARN with the resource name instead.
432+
*/
433+
public static boolean waitForResourceDefinitionDeleted(DevelopersApi api, String rscName, long timeoutMillis) {
434+
final long deadline = System.currentTimeMillis() + timeoutMillis;
435+
while (true) {
436+
try {
437+
if (isResourceDefinitionGone(api, rscName)) {
438+
return true;
439+
}
440+
} catch (ApiException e) {
441+
LOGGER.debug("LINSTOR delete-verify poll failed for {}: {}", rscName, e.getMessage());
442+
// Keep polling — controller may be transiently unavailable.
443+
}
444+
if (System.currentTimeMillis() >= deadline) {
445+
return false;
446+
}
447+
try {
448+
Thread.sleep(1_000L);
449+
} catch (InterruptedException ie) {
450+
Thread.currentThread().interrupt();
451+
return false;
452+
}
453+
}
454+
}
455+
404456
/**
405457
 * Returns a pair list of resource-definitions with their 1:1 mapped resource-group objects that start with the
406458
* resource name `startWith`

0 commit comments

Comments
 (0)