From 1585ff180f67c6abcc11dbacd7d6522c33027e84 Mon Sep 17 00:00:00 2001 From: Levi Jiang Date: Mon, 11 May 2026 11:38:14 -0700 Subject: [PATCH 1/2] Reduce metadata refresh retries from 20 to 3 Iceberg's BaseMetastoreTableOperations defaults META_DATA_REFRESH_RETRIES to 20 with exponential backoff capped at 5s, so a failing refresh stalls for ~90 seconds before surfacing the underlying error. HTS already returns the authoritative metadata pointer, so the high retry budget mostly serves to mask read-after-write windows on object stores - 3 retries is enough. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../catalog/OpenHouseInternalTableOperations.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index d1636bda9..6923505e2 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -90,6 +90,15 @@ public class OpenHouseInternalTableOperations extends BaseMetastoreTableOperatio private static final Cache CACHE = CacheBuilder.newBuilder().expireAfterWrite(5, TimeUnit.MINUTES).maximumSize(1000).build(); + /** + * Overrides Iceberg's {@code BaseMetastoreTableOperations.META_DATA_REFRESH_RETRIES} (20). The + * underlying {@code refreshFromMetadataLocation} retries with exponential backoff capped at 5s, + * so 20 retries can stretch a single failing refresh to ~90 seconds. HTS already returns the + * authoritative metadata pointer, so a high retry budget mostly serves to absorb object-store + * read-after-write races, where a much smaller value suffices. 
+ */ + private static final int METADATA_REFRESH_RETRIES = 3; + @Override protected String tableName() { return this.tableIdentifier.toString(); @@ -133,7 +142,7 @@ protected void doRefresh() { protected void refreshMetadata(final String metadataLoc) { long startTime = System.currentTimeMillis(); boolean needToReload = !Objects.equal(currentMetadataLocation(), metadataLoc); - Runnable r = () -> super.refreshFromMetadataLocation(metadataLoc); + Runnable r = () -> super.refreshFromMetadataLocation(metadataLoc, METADATA_REFRESH_RETRIES); try { if (needToReload) { metricsReporter.executeWithStats( @@ -405,7 +414,7 @@ updatedMtDataRef, io().newOutputFile(newMetadataLocation)), * "forced refresh" in {@link OpenHouseInternalTableOperations#commit(TableMetadata, * TableMetadata)} */ - refreshFromMetadataLocation(newMetadataLocation); + refreshFromMetadataLocation(newMetadataLocation, METADATA_REFRESH_RETRIES); } if (isReplicatedTableCreate(properties)) { updateMetadataFieldForTable(metadata, newMetadataLocation); From 8cd81115eb4cd8af13427e02ccf2a34aa3cf65c0 Mon Sep 17 00:00:00 2001 From: Levi Jiang Date: Mon, 11 May 2026 11:48:22 -0700 Subject: [PATCH 2/2] Drop HTS rationale from METADATA_REFRESH_RETRIES Javadoc --- .../internal/catalog/OpenHouseInternalTableOperations.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index 6923505e2..7651991af 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -93,9 +93,7 @@ public class OpenHouseInternalTableOperations extends BaseMetastoreTableOperatio /** * Overrides Iceberg's {@code 
BaseMetastoreTableOperations.META_DATA_REFRESH_RETRIES} (20). The * underlying {@code refreshFromMetadataLocation} retries with exponential backoff capped at 5s, - * so 20 retries can stretch a single failing refresh to ~90 seconds. HTS already returns the - * authoritative metadata pointer, so a high retry budget mostly serves to absorb object-store - * read-after-write races, where a much smaller value suffices. + * so 20 retries can stretch a single failing refresh to ~90 seconds. */ private static final int METADATA_REFRESH_RETRIES = 3;