From 3f14cbe3f1ccbb0cbb91794aac4d9ae7cda833d6 Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 20 May 2026 00:01:28 -0700
Subject: [PATCH 01/14] feat(spark): add BatchedOrphanFilesDeletionSparkApp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Multi-table OFD app that amortizes Spark-session startup across many
tables in a single job, using a driver-side thread pool so each table's
deletion runs concurrently.

When --resultsEndpoint and --operationIds are supplied, each table's
outcome (SUCCESS/FAILED with file count + bytes deleted, or
errorMessage+errorType) is POSTed to the optimizer's complete-operation
endpoint as it completes — letting the optimizer track per-table status
independently of the overall job.

Operations.java grows a deleteOrphanFilesWithMetrics() overload that
delegates from the existing deleteOrphanFiles() and additionally tracks
bytes deleted via an AtomicLong inside the deleteWith callback. The
existing single-table OrphanFilesDeletionSparkApp is unchanged.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../BatchedOrphanFilesDeletionSparkApp.java   | 377 ++++++++++++++++++
 .../openhouse/jobs/spark/Operations.java      |  56 ++-
 ...atchedOrphanFilesDeletionSparkAppTest.java | 338 ++++++++++++++++
 3 files changed, 770 insertions(+), 1 deletion(-)
 create mode 100644 apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java
 create mode 100644 apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppTest.java
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java
new file mode 100644
index 000000000..0908e7afa
--- /dev/null
+++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java
@@ -0,0 +1,377 @@
+package com.linkedin.openhouse.jobs.spark;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+import com.linkedin.openhouse.common.metrics.DefaultOtelConfig;
+import com.linkedin.openhouse.common.metrics.OtelEmitter;
+import com.linkedin.openhouse.jobs.spark.state.StateManager;
+import com.linkedin.openhouse.jobs.util.AppConstants;
+import com.linkedin.openhouse.jobs.util.AppsOtelEmitter;
+import io.opentelemetry.api.common.AttributeKey;
+import io.opentelemetry.api.common.Attributes;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import lombok.Builder;
+import lombok.Value;
+import lombok.extern.slf4j.Slf4j;
+import okhttp3.MediaType;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+import okhttp3.RequestBody;
+import okhttp3.Response;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.lang3.math.NumberUtils;
+import org.apache.iceberg.Table;
+
+/**
+ * Runs orphan file deletion for a batch of tables in one Spark session.
+ *
+ * <p>Spinning up a Spark session per table is expensive. This app amortizes that cost by processing
+ * many tables in a single job, using a driver-side thread pool so each table's deletion runs
+ * concurrently without competing for executors.
+ *
+ * <p>When {@code --resultsEndpoint} is supplied, each table's outcome is POSTed to the optimizer
+ * service's complete-operation endpoint as it completes, letting the service track per-table status
+ * independently of the overall job.
+ */
+@Slf4j
+public class BatchedOrphanFilesDeletionSparkApp extends BaseSparkApp {
+  private static final int DEFAULT_MAX_ORPHAN_FILE_SAMPLE_SIZE = 20000;
+  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+  private final List<String> tableNames;
+  private final int parallelism;
+  private final long ttlSeconds;
+  private final String backupDir;
+  private final int concurrentDeletes;
+  private final boolean streamResults;
+  private final int maxOrphanFileSampleSize;
+  private final List<String> operationIds;
+  private final String resultsEndpoint;
+
+  public BatchedOrphanFilesDeletionSparkApp(
+      String jobId,
+      StateManager stateManager,
+      List<String> tableNames,
+      int parallelism,
+      long ttlSeconds,
+      OtelEmitter otelEmitter,
+      String backupDir,
+      int concurrentDeletes,
+      boolean streamResults,
+      int maxOrphanFileSampleSize,
+      List<String> operationIds,
+      String resultsEndpoint) {
+    super(jobId, stateManager, otelEmitter);
+    this.tableNames = tableNames;
+    this.parallelism = parallelism;
+    this.ttlSeconds = ttlSeconds;
+    this.backupDir = backupDir;
+    this.concurrentDeletes = concurrentDeletes;
+    this.streamResults = streamResults;
+    this.maxOrphanFileSampleSize = maxOrphanFileSampleSize;
+    this.operationIds = operationIds;
+    this.resultsEndpoint = resultsEndpoint;
+  }
+
+  @Override
+  protected void runInner(Operations ops) throws Exception {
+    if (resultsEndpoint != null && operationIds.size() != tableNames.size()) {
+      throw new IllegalArgumentException(
+          "operationIds count ("
+              + operationIds.size()
+              + ") must equal tableNames count ("
+              + tableNames.size()
+              + ") when resultsEndpoint is provided");
+    }
+
+    Map<String, String> tableToOperationId = new HashMap<>();
+    if (resultsEndpoint != null) {
+      for (int i = 0; i < tableNames.size(); i++) {
+        tableToOperationId.put(tableNames.get(i), operationIds.get(i));
+      }
+    }
+
+    long olderThanTimestampMillis =
+        System.currentTimeMillis() - TimeUnit.SECONDS.toMillis(ttlSeconds);
+
+    log.info(
+        "Starting batched orphan files deletion for {} tables with parallelism {}",
+        tableNames.size(),
+        parallelism);
+
+    int poolSize = Math.min(parallelism, Math.max(1, tableNames.size()));
+    ExecutorService executor = Executors.newFixedThreadPool(poolSize);
+    List<Future<OrphanDeletionResult>> futures = new ArrayList<>();
+
+    for (String tableName : tableNames) {
+      futures.add(
+          executor.submit(
+              () -> {
+                long startTime = System.currentTimeMillis();
+                try {
+                  Table table = ops.getTable(tableName);
+                  boolean backupEnabled =
+                      Boolean.parseBoolean(
+                          table
+                              .properties()
+                              .getOrDefault(AppConstants.BACKUP_ENABLED_KEY, "false"));
+
+                  log.info("Processing orphan files for table: {}", tableName);
+                  Operations.OrphanFilesResult result =
+                      ops.deleteOrphanFilesWithMetrics(
+                          table,
+                          olderThanTimestampMillis,
+                          backupEnabled,
+                          backupDir,
+                          concurrentDeletes,
+                          streamResults,
+                          maxOrphanFileSampleSize);
+
+                  List<String> orphanFiles =
+                      Lists.newArrayList(result.orphanFileLocations().iterator());
+                  long durationMs = System.currentTimeMillis() - startTime;
+
+                  log.info(
+                      "Successfully processed table {}: {} orphan files deleted in {}ms",
+                      tableName,
+                      orphanFiles.size(),
+                      durationMs);
+
+                  return OrphanDeletionResult.success(
+                      tableName, orphanFiles.size(), result.getBytesDeleted(), durationMs);
+
+                } catch (Exception e) {
+                  long durationMs = System.currentTimeMillis() - startTime;
+                  log.error("Failed to process table: {}", tableName, e);
+                  return OrphanDeletionResult.failure(tableName, e, durationMs);
+                }
+              }));
+    }
+
+    executor.shutdown();
+    executor.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
+
+    List<OrphanDeletionResult> allResults = new ArrayList<>();
+    for (Future<OrphanDeletionResult> future : futures) {
+      try {
+        allResults.add(future.get());
+      } catch (ExecutionException e) {
+        throw new RuntimeException("Unexpected exception in table processing", e.getCause());
+      }
+    }
+
+    reportResults(allResults, tableToOperationId);
+  }
+
+  private void reportResults(
+      List<OrphanDeletionResult> results, Map<String, String> tableToOperationId) throws Exception {
+    OkHttpClient client =
+        resultsEndpoint != null
+            ? new OkHttpClient.Builder()
+                .connectTimeout(10, TimeUnit.SECONDS)
+                .readTimeout(30, TimeUnit.SECONDS)
+                .build()
+            : null;
+
+    int failureCount = 0;
+    for (OrphanDeletionResult result : results) {
+      if (result.isSuccess()) {
+        otelEmitter.count(
+            METRICS_SCOPE,
+            AppConstants.ORPHAN_FILE_COUNT,
+            result.getOrphanFilesDeleted(),
+            Attributes.of(AttributeKey.stringKey(AppConstants.TABLE_NAME), result.getTableName()));
+      } else {
+        failureCount++;
+      }
+
+      if (client != null) {
+        String opId = tableToOperationId.get(result.getTableName());
+        if (opId != null) {
+          completeOperation(client, opId, result);
+        }
+      }
+    }
+
+    if (client == null && failureCount > 0) {
+      throw new RuntimeException(failureCount + " table(s) failed in batch");
+    }
+    if (client != null && failureCount == results.size()) {
+      throw new RuntimeException("All tables failed in batch");
+    }
+  }
+
+  private void completeOperation(OkHttpClient client, String id, OrphanDeletionResult result)
+      throws Exception {
+    OperationResult opResult =
+        result.isSuccess()
+            ? OperationResult.success(result.getOrphanFilesDeleted(), result.getBytesDeleted())
+            : OperationResult.failure(result.getErrorMessage(), result.getErrorType());
+
+    String json = OBJECT_MAPPER.writeValueAsString(opResult);
+    RequestBody body = RequestBody.create(json, MediaType.parse("application/json; charset=utf-8"));
+    Request request =
+        new Request.Builder().url(resultsEndpoint + "/" + id + "/complete").post(body).build();
+    try (Response response = client.newCall(request).execute()) {
+      int code = response.code();
+      if (code < 200 || code >= 300) {
+        throw new RuntimeException("POST operation/" + id + "/complete returned HTTP " + code);
+      }
+    }
+  }
+
+  /** Result of orphan deletion for a single table. */
+  @Value
+  @Builder
+  public static class OrphanDeletionResult implements Serializable {
+    String tableName;
+    boolean success;
+    int orphanFilesDeleted;
+    long bytesDeleted;
+    long durationMs;
+    String errorMessage;
+    String errorType;
+
+    public static OrphanDeletionResult success(
+        String tableName, int orphanFileCount, long bytesDeleted, long durationMs) {
+      return OrphanDeletionResult.builder()
+          .tableName(tableName)
+          .success(true)
+          .orphanFilesDeleted(orphanFileCount)
+          .bytesDeleted(bytesDeleted)
+          .durationMs(durationMs)
+          .build();
+    }
+
+    public static OrphanDeletionResult failure(String tableName, Throwable error, long durationMs) {
+      return OrphanDeletionResult.builder()
+          .tableName(tableName)
+          .success(false)
+          .orphanFilesDeleted(0)
+          .bytesDeleted(0)
+          .durationMs(durationMs)
+          .errorMessage(error.getMessage())
+          .errorType(error.getClass().getSimpleName())
+          .build();
+    }
+  }
+
+  /** POST payload sent to the optimizer service's complete-operation endpoint. */
+  @JsonInclude(JsonInclude.Include.NON_NULL)
+  static class OperationResult {
+    public final String status;
+    public final ResultPayload result;
+    public final Integer orphanFilesDeleted;
+    public final Long orphanBytesDeleted;
+
+    private OperationResult(
+        String status, ResultPayload result, Integer orphanFilesDeleted, Long orphanBytesDeleted) {
+      this.status = status;
+      this.result = result;
+      this.orphanFilesDeleted = orphanFilesDeleted;
+      this.orphanBytesDeleted = orphanBytesDeleted;
+    }
+
+    static OperationResult success(int orphanFilesDeleted, long orphanBytesDeleted) {
+      return new OperationResult("SUCCESS", null, orphanFilesDeleted, orphanBytesDeleted);
+    }
+
+    static OperationResult failure(String errorMessage, String errorType) {
+      return new OperationResult("FAILED", new ResultPayload(errorMessage, errorType), null, null);
+    }
+  }
+
+  /** Error details nested inside {@link OperationResult}. */
+  @JsonInclude(JsonInclude.Include.NON_NULL)
+  static class ResultPayload {
+    public final String errorMessage;
+    public final String errorType;
+
+    ResultPayload(String errorMessage, String errorType) {
+      this.errorMessage = errorMessage;
+      this.errorType = errorType;
+    }
+  }
+
+  public static void main(String[] args) {
+    OtelEmitter otelEmitter =
+        new AppsOtelEmitter(Arrays.asList(DefaultOtelConfig.getOpenTelemetry()));
+    createApp(args, otelEmitter).run();
+  }
+
+  public static BatchedOrphanFilesDeletionSparkApp createApp(
+      String[] args, OtelEmitter otelEmitter) {
+    List<Option> extraOptions = new ArrayList<>();
+    extraOptions.add(
+        new Option("tn", "tableNames", true, "Comma-separated fully-qualified table names"));
+    extraOptions.add(new Option("p", "parallelism", true, "Number of parallel table processes"));
+    extraOptions.add(
+        new Option(
+            "r",
+            "ttl",
+            true,
+            "How old files should be to be considered orphaned in seconds, minimum 1d is"
+                + " enforced"));
+    extraOptions.add(new Option("b", "backupDir", true, "Backup directory for deleted data"));
+    extraOptions.add(new Option("c", "concurrentDeletes", true, "Number of concurrent deletes"));
+    extraOptions.add(
+        new Option(
+            null,
+            "streamResults",
+            false,
+            "Stream orphan file deletions instead of collecting all paths into driver memory"));
+    extraOptions.add(
+        new Option(
+            null,
+            "maxOrphanFileSampleSize",
+            true,
+            "Maximum number of orphan file paths to return in the result when streaming"));
+    extraOptions.add(new Option("oi", "operationIds", true, "Comma-separated operation IDs"));
+    extraOptions.add(new Option("re", "resultsEndpoint", true, "Base URL for per-table PATCH"));
+
+    CommandLine cmdLine = createCommandLine(args, extraOptions);
+
+    String tableNamesStr = cmdLine.getOptionValue("tableNames");
+    List<String> tableNames =
+        tableNamesStr != null ? Arrays.asList(tableNamesStr.split(",")) : new ArrayList<>();
+
+    String idsStr = cmdLine.getOptionValue("operationIds");
+    List<String> operationIds =
+        idsStr != null ? Arrays.asList(idsStr.split(",")) : Collections.emptyList();
+    String resultsEndpoint = cmdLine.getOptionValue("resultsEndpoint");
+
+    long rawTtl = NumberUtils.toLong(cmdLine.getOptionValue("ttl"), TimeUnit.DAYS.toSeconds(7));
+    // TTL=0 bypasses the minimum-age guard (for tests that seed orphan files and need
+    // them deleted immediately). Any other explicit value is clamped to the 1-day minimum.
+    long ttlSeconds = rawTtl == 0 ? 0 : Math.max(rawTtl, TimeUnit.DAYS.toSeconds(1));
+
+    return new BatchedOrphanFilesDeletionSparkApp(
+        getJobId(cmdLine),
+        createStateManager(cmdLine, otelEmitter),
+        tableNames,
+        Integer.parseInt(cmdLine.getOptionValue("parallelism", "10")),
+        ttlSeconds,
+        otelEmitter,
+        cmdLine.getOptionValue("backupDir", ".backup"),
+        Integer.parseInt(cmdLine.getOptionValue("concurrentDeletes", "10")),
+        cmdLine.hasOption("streamResults"),
+        Integer.parseInt(
+            cmdLine.getOptionValue(
+                "maxOrphanFileSampleSize", String.valueOf(DEFAULT_MAX_ORPHAN_FILE_SAMPLE_SIZE))),
+        operationIds,
+        resultsEndpoint);
+  }
+}
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java
index 53b1297a5..b01e05d1d 100644
--- a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java
+++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java
@@ -26,6 +26,7 @@
 import java.util.concurrent.Executors;
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
 import lombok.AccessLevel;
@@ -111,6 +112,29 @@ public DeleteOrphanFiles.Result deleteOrphanFiles(
       int concurrentDeletes,
       boolean streamResults,
       int maxOrphanFileSampleSize) {
+    return deleteOrphanFilesWithMetrics(
+            table,
+            olderThanTimestampMillis,
+            backupEnabled,
+            backupDir,
+            concurrentDeletes,
+            streamResults,
+            maxOrphanFileSampleSize)
+        .getIcebergResult();
+  }
+
+  /**
+   * Same as {@link #deleteOrphanFiles} but also tracks and returns the total bytes deleted,
+   * suitable for callers that emit byte-level metrics (e.g. batched OFD).
+   */
+  public OrphanFilesResult deleteOrphanFilesWithMetrics(
+      Table table,
+      long olderThanTimestampMillis,
+      boolean backupEnabled,
+      String backupDir,
+      int concurrentDeletes,
+      boolean streamResults,
+      int maxOrphanFileSampleSize) {
 
     DeleteOrphanFiles operation =
         SparkActions.get(spark)
@@ -127,6 +151,7 @@ public DeleteOrphanFiles.Result deleteOrphanFiles(
     Map<String, Boolean> dataManifestsCache = new ConcurrentHashMap<>();
     Path backupDirRoot = new Path(table.location(), backupDir);
     Path dataDirRoot = new Path(table.location(), "data");
+    AtomicLong bytesAccumulator = new AtomicLong(0);
     operation =
         operation.deleteWith(
             file -> {
@@ -146,6 +171,7 @@ && isExistBackupDataManifests(table, file, backupDir, dataManifestsCache)) {
                 Path backupFilePath = getTrashPath(table, file, backupDir);
                 log.info("Moving orphan file {} to {}", file, backupFilePath);
                 try {
+                  bytesAccumulator.addAndGet(fs().getFileStatus(new Path(file)).getLen());
                   rename(new Path(file), backupFilePath);
                   // update modification time to current time
                   fs().setTimes(backupFilePath, System.currentTimeMillis(), -1);
@@ -155,13 +181,41 @@ && isExistBackupDataManifests(table, file, backupDir, dataManifestsCache)) {
               } else {
                 log.info("Deleting orphan file {}", file);
                 try {
+                  bytesAccumulator.addAndGet(fs().getFileStatus(new Path(file)).getLen());
                   fs().delete(new Path(file), false);
                 } catch (IOException e) {
                   log.error(String.format("Delete operation failed for file: %s", file), e);
                 }
               }
             });
-    return operation.execute();
+    DeleteOrphanFiles.Result icebergResult = operation.execute();
+    return new OrphanFilesResult(icebergResult, bytesAccumulator.get());
+  }
+
+  /**
+   * Result of a {@link #deleteOrphanFilesWithMetrics} call, augmenting the Iceberg result with byte
+   * count.
+   */
+  public static class OrphanFilesResult {
+    private final DeleteOrphanFiles.Result result;
+    private final long bytesDeleted;
+
+    public OrphanFilesResult(DeleteOrphanFiles.Result result, long bytesDeleted) {
+      this.result = result;
+      this.bytesDeleted = bytesDeleted;
+    }
+
+    public DeleteOrphanFiles.Result getIcebergResult() {
+      return result;
+    }
+
+    public Iterable<String> orphanFileLocations() {
+      return result.orphanFileLocations();
+    }
+
+    public long getBytesDeleted() {
+      return bytesDeleted;
+    }
   }
 
   private ExecutorService removeFilesService(int concurrentDeletes) {
diff --git a/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppTest.java b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppTest.java
new file mode 100644
index 000000000..26d4ae460
--- /dev/null
+++ b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppTest.java
@@ -0,0 +1,338 @@
+package com.linkedin.openhouse.jobs.spark;
+
+import com.linkedin.openhouse.common.metrics.DefaultOtelConfig;
+import com.linkedin.openhouse.common.metrics.OtelEmitter;
+import com.linkedin.openhouse.jobs.util.AppsOtelEmitter;
+import com.linkedin.openhouse.tablestest.OpenHouseSparkITest;
+import com.sun.net.httpserver.HttpServer;
+import java.net.InetSocketAddress;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.iceberg.Table;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Integration tests for {@link BatchedOrphanFilesDeletionSparkApp}.
+ *
+ * <p>These tests run the app's logic in-process against a real Spark session backed by an in-memory
+ * Iceberg catalog ({@link OpenHouseSparkITest}). The app is never submitted as an external Spark
+ * job; {@code runInner(ops)} is called directly.
+ *
+ * <p>When testing the per-table complete-operation callback path, the Optimizer service is replaced
+ * by an embedded {@link com.sun.net.httpserver.HttpServer} that captures request bodies. This
+ * verifies the shape and content of the JSON payloads without requiring a live service.
+ */
+@Slf4j
+public class BatchedOrphanFilesDeletionSparkAppTest extends OpenHouseSparkITest {
+  private static final int MAX_SAMPLE_SIZE = 20000;
+
+  private final OtelEmitter otelEmitter =
+      new AppsOtelEmitter(Arrays.asList(DefaultOtelConfig.getOpenTelemetry()));
+
+  @Test
+  public void testSuccessfulOrphanFilesDeletionForMultipleTables() throws Exception {
+    final List<String> tableNames = Arrays.asList("db.test_batch1", "db.test_batch2");
+    final int parallelism = 2;
+
+    try (Operations ops = Operations.withCatalog(getSparkSession(), otelEmitter)) {
+      for (String tableName : tableNames) {
+        prepareTable(ops, tableName);
+        populateTable(ops, tableName, 2);
+      }
+
+      BatchedOrphanFilesDeletionSparkApp app =
+          newApp(
+              tableNames, parallelism, TimeUnit.DAYS.toSeconds(1), Collections.emptyList(), null);
+
+      app.runInner(ops);
+
+      for (String tableName : tableNames) {
+        Table table = ops.getTable(tableName);
+        Assertions.assertNotNull(table);
+      }
+    }
+  }
+
+  @Test
+  public void testBatchedDeletionWithSingleTable() throws Exception {
+    final List<String> tableNames = Arrays.asList("db.test_single");
+
+    try (Operations ops = Operations.withCatalog(getSparkSession(), otelEmitter)) {
+      prepareTable(ops, tableNames.get(0));
+      populateTable(ops, tableNames.get(0), 3);
+
+      BatchedOrphanFilesDeletionSparkApp app =
+          newApp(tableNames, 1, TimeUnit.DAYS.toSeconds(1), Collections.emptyList(), null);
+
+      app.runInner(ops);
+
+      Assertions.assertNotNull(ops.getTable(tableNames.get(0)));
+    }
+  }
+
+  @Test
+  public void testBatchedDeletionWithInvalidTable() throws Exception {
+    final List<String> tableNames =
+        Arrays.asList("db.test_valid1", "db.nonexistent_table", "db.test_valid2");
+
+    try (Operations ops = Operations.withCatalog(getSparkSession(), otelEmitter)) {
+      prepareTable(ops, "db.test_valid1");
+      populateTable(ops, "db.test_valid1", 2);
+      prepareTable(ops, "db.test_valid2");
+      populateTable(ops, "db.test_valid2", 2);
+
+      BatchedOrphanFilesDeletionSparkApp app =
+          newApp(tableNames, 3, TimeUnit.DAYS.toSeconds(1), Collections.emptyList(), null);
+
+      Assertions.assertThrows(RuntimeException.class, () -> app.runInner(ops));
+    }
+  }
+
+  @Test
+  public void testBatchedDeletionWithEmptyTables() throws Exception {
+    final List<String> tableNames = Arrays.asList("db.test_empty1", "db.test_empty2");
+
+    try (Operations ops = Operations.withCatalog(getSparkSession(), otelEmitter)) {
+      for (String tableName : tableNames) {
+        prepareTable(ops, tableName);
+      }
+
+      BatchedOrphanFilesDeletionSparkApp app =
+          newApp(tableNames, 2, TimeUnit.DAYS.toSeconds(1), Collections.emptyList(), null);
+
+      app.runInner(ops);
+
+      for (String tableName : tableNames) {
+        Assertions.assertNotNull(ops.getTable(tableName));
+      }
+    }
+  }
+
+  @Test
+  public void testOrphanDeletionResultSuccess() {
+    BatchedOrphanFilesDeletionSparkApp.OrphanDeletionResult result =
+        BatchedOrphanFilesDeletionSparkApp.OrphanDeletionResult.success(
+            "db.test_result", 42, 1024L, 5000L);
+
+    Assertions.assertTrue(result.isSuccess());
+    Assertions.assertEquals("db.test_result", result.getTableName());
+    Assertions.assertEquals(42, result.getOrphanFilesDeleted());
+    Assertions.assertEquals(1024L, result.getBytesDeleted());
+    Assertions.assertEquals(5000L, result.getDurationMs());
+    Assertions.assertNull(result.getErrorMessage());
+    Assertions.assertNull(result.getErrorType());
+  }
+
+  @Test
+  public void testOrphanDeletionResultFailure() {
+    BatchedOrphanFilesDeletionSparkApp.OrphanDeletionResult result =
+        BatchedOrphanFilesDeletionSparkApp.OrphanDeletionResult.failure(
+            "db.test_failure", new RuntimeException("Test error"), 1000L);
+
+    Assertions.assertFalse(result.isSuccess());
+    Assertions.assertEquals("db.test_failure", result.getTableName());
+    Assertions.assertEquals(0, result.getOrphanFilesDeleted());
+    Assertions.assertEquals(0, result.getBytesDeleted());
+    Assertions.assertEquals(1000L, result.getDurationMs());
+    Assertions.assertEquals("Test error", result.getErrorMessage());
+    Assertions.assertEquals("RuntimeException", result.getErrorType());
+  }
+
+  @Test
+  public void testCreateAppFromCommandLineArgs() {
+    String[] args = {
+      "--jobId", "test-job-123",
+      "--storageURL", "http://localhost:8080",
+      "--tableNames", "db.table1,db.table2,db.table3",
+      "--parallelism", "5",
+      "--ttl", "86400",
+      "--backupDir", "/backup",
+      "--concurrentDeletes", "20",
+      "--operationIds", "uuid-1,uuid-2,uuid-3",
+      "--resultsEndpoint", "http://localhost:8083/v1/optimizer/operations"
+    };
+
+    Assertions.assertNotNull(BatchedOrphanFilesDeletionSparkApp.createApp(args, otelEmitter));
+  }
+
+  @Test
+  public void testCreateAppWithDefaultValues() {
+    String[] args = {
+      "--jobId", "test-job-456",
+      "--storageURL", "http://localhost:8080",
+      "--tableNames", "db.table1"
+    };
+
+    Assertions.assertNotNull(BatchedOrphanFilesDeletionSparkApp.createApp(args, otelEmitter));
+  }
+
+  @Test
+  public void testCreateAppWithOperationIdsAndEndpoint() {
+    String[] args = {
+      "--jobId", "test-job-789",
+      "--storageURL", "http://localhost:8080",
+      "--tableNames", "db.table1,db.table2",
+      "--operationIds", "uuid-1,uuid-2",
+      "--resultsEndpoint", "http://localhost:8083/v1/optimizer/operations"
+    };
+
+    Assertions.assertNotNull(BatchedOrphanFilesDeletionSparkApp.createApp(args, otelEmitter));
+  }
+
+  @Test
+  public void testBatchedDeletionWithActualOrphanFiles() throws Exception {
+    final String tableName = "db.test_orphans";
+    final List<String> tableNames = Arrays.asList(tableName);
+
+    try (Operations ops = Operations.withCatalog(getSparkSession(), otelEmitter)) {
+      prepareTable(ops, tableName);
+      populateTable(ops, tableName, 3);
+
+      Assertions.assertNotNull(ops.getTable(tableName).currentSnapshot());
+
+      // TTL=0 bypasses the minimum-age guard so seeded orphan files are eligible.
+      BatchedOrphanFilesDeletionSparkApp app =
+          newApp(tableNames, 1, 0L, Collections.emptyList(), null);
+
+      app.runInner(ops);
+
+      Assertions.assertNotNull(ops.getTable(tableName).currentSnapshot());
+    }
+  }
+
+  @Test
+  public void testBatchedDeletionPartialSuccessWithEndpoint() throws Exception {
+    final List<String> tableNames = Arrays.asList("db.test_ep_valid", "db.nonexistent_ep");
+    final List<String> operationIds = Arrays.asList("op-100", "op-200");
+
+    List<String> receivedBodies = Collections.synchronizedList(new ArrayList<>());
+    HttpServer httpServer = HttpServer.create(new InetSocketAddress(0), 0);
+    httpServer.createContext(
+        "/ops",
+        exchange -> {
+          byte[] body = exchange.getRequestBody().readAllBytes();
+          receivedBodies.add(new String(body, StandardCharsets.UTF_8));
+          exchange.sendResponseHeaders(200, 0);
+          exchange.close();
+        });
+    httpServer.start();
+    String endpoint = "http://localhost:" + httpServer.getAddress().getPort() + "/ops";
+
+    try (Operations ops = Operations.withCatalog(getSparkSession(), otelEmitter)) {
+      prepareTable(ops, "db.test_ep_valid");
+      populateTable(ops, "db.test_ep_valid", 2);
+
+      BatchedOrphanFilesDeletionSparkApp app =
+          newApp(tableNames, 2, TimeUnit.DAYS.toSeconds(1), operationIds, endpoint);
+
+      app.runInner(ops);
+
+      Assertions.assertEquals(2, receivedBodies.size());
+      long successCount = receivedBodies.stream().filter(b -> b.contains("\"SUCCESS\"")).count();
+      long failureCount = receivedBodies.stream().filter(b -> b.contains("\"FAILED\"")).count();
+      Assertions.assertEquals(1, successCount);
+      Assertions.assertEquals(1, failureCount);
+
+      String successBody =
+          receivedBodies.stream().filter(b -> b.contains("\"SUCCESS\"")).findFirst().get();
+      Assertions.assertFalse(successBody.contains("\"result\""));
+
+      String failureBody =
+          receivedBodies.stream().filter(b -> b.contains("\"FAILED\"")).findFirst().get();
+      Assertions.assertTrue(failureBody.contains("\"errorMessage\""));
+      Assertions.assertTrue(failureBody.contains("\"errorType\""));
+    } finally {
+      httpServer.stop(0);
+    }
+  }
+
+  @Test
+  public void testBatchedDeletionAllFailWithEndpoint() throws Exception {
+    final List<String> tableNames = Arrays.asList("db.nonexistent_ep1", "db.nonexistent_ep2");
+    final List<String> operationIds = Arrays.asList("op-300", "op-400");
+
+    List<String> receivedBodies = Collections.synchronizedList(new ArrayList<>());
+    HttpServer httpServer = HttpServer.create(new InetSocketAddress(0), 0);
+    httpServer.createContext(
+        "/ops",
+        exchange -> {
+          byte[] body = exchange.getRequestBody().readAllBytes();
+          receivedBodies.add(new String(body, StandardCharsets.UTF_8));
+          exchange.sendResponseHeaders(200, 0);
+          exchange.close();
+        });
+    httpServer.start();
+    String endpoint = "http://localhost:" + httpServer.getAddress().getPort() + "/ops";
+
+    try (Operations ops = Operations.withCatalog(getSparkSession(), otelEmitter)) {
+      BatchedOrphanFilesDeletionSparkApp app =
+          newApp(tableNames, 2, TimeUnit.DAYS.toSeconds(1), operationIds, endpoint);
+
+      Assertions.assertThrows(RuntimeException.class, () -> app.runInner(ops));
+
+      Assertions.assertEquals(2, receivedBodies.size());
+      long failures = receivedBodies.stream().filter(b -> b.contains("\"FAILED\"")).count();
+      Assertions.assertEquals(2, failures);
+      for (String body : receivedBodies) {
+        Assertions.assertTrue(body.contains("\"errorMessage\""));
+        Assertions.assertTrue(body.contains("\"errorType\""));
+      }
+    } finally {
+      httpServer.stop(0);
+    }
+  }
+
+  @Test
+  public void testBatchedDeletionMismatchedOperationIdsThrows() throws Exception {
+    final List<String> tableNames = Arrays.asList("db.table1", "db.table2");
+    final List<String> operationIds = Arrays.asList("op-1", "op-2", "op-3");
+
+    BatchedOrphanFilesDeletionSparkApp app =
+        newApp(
+            tableNames, 2, TimeUnit.DAYS.toSeconds(1), operationIds, "http://localhost:9999/ops");
+
+    try (Operations ops = Operations.withCatalog(getSparkSession(), otelEmitter)) {
+      Assertions.assertThrows(IllegalArgumentException.class, () -> app.runInner(ops));
+    }
+  }
+
+  private BatchedOrphanFilesDeletionSparkApp newApp(
+      List<String> tableNames,
+      int parallelism,
+      long ttlSeconds,
+      List<String> operationIds,
+      String resultsEndpoint) {
+    return new BatchedOrphanFilesDeletionSparkApp(
+        "test-job",
+        null,
+        tableNames,
+        parallelism,
+        ttlSeconds,
+        otelEmitter,
+        ".backup",
+        10,
+        false,
+        MAX_SAMPLE_SIZE,
+        operationIds,
+        resultsEndpoint);
+  }
+
+  private static void prepareTable(Operations ops, String tableName) {
+    ops.spark().sql(String.format("DROP TABLE IF EXISTS %s", tableName)).show();
+    ops.spark().sql(String.format("CREATE TABLE %s (data string, ts timestamp)", tableName)).show();
+  }
+
+  private static void populateTable(Operations ops, String tableName, int numRows) {
+    for (int row = 0; row < numRows; ++row) {
+      ops.spark()
+          .sql(String.format("INSERT INTO %s VALUES ('v%d', current_timestamp())", tableName, row))
+          .show();
+    }
+  }
+}

From d3e01b4798852bb72e4d108deb0bfc81c95fc179 Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 20 May 2026 09:02:28 -0700
Subject: [PATCH 02/14] feat(optimizer): carry per-table OFD metrics on
 operation-complete
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- table_operations_history grows four nullable columns:
    orphan_files_deleted BIGINT
    orphan_bytes_deleted BIGINT
    error_message        VARCHAR(1024)
    error_type           VARCHAR(256)
- Row, model, DTO, history-DTO all gain the four fields.
- CompleteOperationRequestDto drops the body-based operationId — id moves
  to the URL path. New endpoint shape is POST /v1/optimizer/operations/{id}/complete.
- OptimizerDataService.completeOperation grows four trailing nullable
  parameters carrying the metrics / error details.
- Scheduler config default + test fixture URLs migrate from the stale
  /v1/table-operations to the correct /v1/optimizer/operations.
- BatchedOFD Spark app's OperationResult payload flattens the nested
  ResultPayload into peer fields so the body matches the flat DTO.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/main/resources/application.properties |  2 +-
 .../scheduler/SchedulerRunnerTest.java        |  2 +-
 .../resources/application-test.properties     |  2 +-
 .../BatchedOrphanFilesDeletionSparkApp.java   | 28 ++++++--------
 ...atchedOrphanFilesDeletionSparkAppTest.java |  9 ++++-
 .../controller/TableOperationsController.java | 20 ++++++----
 .../api/spec/CompleteOperationRequestDto.java | 30 +++++++++------
 .../api/spec/TableOperationsHistoryDto.java   | 20 ++++++++++
 .../db/TableOperationsHistoryRow.java         | 16 ++++++++
 .../model/TableOperationsHistory.java         | 20 ++++++++++
 .../service/OptimizerDataService.java         | 16 ++++++--
 .../service/OptimizerDataServiceImpl.java     | 11 +++++-
 .../main/resources/db/optimizer-schema.sql    | 18 +++++----
 .../service/OptimizerDataServiceImplTest.java | 37 ++++++++++++++++++-
 14 files changed, 178 insertions(+), 53 deletions(-)

diff --git a/apps/optimizer-scheduler/src/main/resources/application.properties b/apps/optimizer-scheduler/src/main/resources/application.properties
index 4d849797c..bdaf4cbe6 100644
--- a/apps/optimizer-scheduler/src/main/resources/application.properties
+++ b/apps/optimizer-scheduler/src/main/resources/application.properties
@@ -6,5 +6,5 @@ spring.datasource.password=${OPTIMIZER_DB_PASSWORD:}
 spring.jpa.hibernate.ddl-auto=none
 jobs.base-uri=${JOBS_BASE_URI:http://localhost:8002}
 scheduler.ofd.max-files-per-bin=${SCHEDULER_OFD_MAX_FILES_PER_BIN:1000000}
-scheduler.results-endpoint=${SCHEDULER_RESULTS_ENDPOINT:http://openhouse-optimizer:8080/v1/table-operations}
+scheduler.results-endpoint=${SCHEDULER_RESULTS_ENDPOINT:http://openhouse-optimizer:8080/v1/optimizer/operations}
 scheduler.cluster-id=${SCHEDULER_CLUSTER_ID:LocalHadoopCluster}
diff --git a/apps/optimizer-scheduler/src/test/java/com/linkedin/openhouse/scheduler/SchedulerRunnerTest.java b/apps/optimizer-scheduler/src/test/java/com/linkedin/openhouse/scheduler/SchedulerRunnerTest.java
index 7722a6156..aadb3c68e 100644
--- a/apps/optimizer-scheduler/src/test/java/com/linkedin/openhouse/scheduler/SchedulerRunnerTest.java
+++ b/apps/optimizer-scheduler/src/test/java/com/linkedin/openhouse/scheduler/SchedulerRunnerTest.java
@@ -38,7 +38,7 @@ class SchedulerRunnerTest {
   private static final com.linkedin.openhouse.optimizer.db.OperationStatus PENDING_DB =
       com.linkedin.openhouse.optimizer.db.OperationStatus.PENDING;
   private static final String OFD_STR = OFD.name();
-  private static final String RESULTS_ENDPOINT = "http://localhost:8080/v1/table-operations";
+  private static final String RESULTS_ENDPOINT = "http://localhost:8080/v1/optimizer/operations";
 
   @Mock private TableOperationsRepository operationsRepo;
   @Mock private TableStatsRepository statsRepo;
diff --git a/apps/optimizer-scheduler/src/test/resources/application-test.properties b/apps/optimizer-scheduler/src/test/resources/application-test.properties
index 5904d71ba..35233829b 100644
--- a/apps/optimizer-scheduler/src/test/resources/application-test.properties
+++ b/apps/optimizer-scheduler/src/test/resources/application-test.properties
@@ -6,5 +6,5 @@ spring.sql.init.mode=always
 spring.sql.init.schema-locations=classpath:schema.sql
 jobs.base-uri=http://localhost:9999
 scheduler.ofd.max-files-per-bin=1000000
-scheduler.results-endpoint=http://localhost:8080/v1/table-operations
+scheduler.results-endpoint=http://localhost:8080/v1/optimizer/operations
 scheduler.cluster-id=test-cluster
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java
index 0908e7afa..e13bbca87 100644
--- a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java
+++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java
@@ -273,36 +273,30 @@ public static OrphanDeletionResult failure(String tableName, Throwable error, lo
   @JsonInclude(JsonInclude.Include.NON_NULL)
   static class OperationResult {
     public final String status;
-    public final ResultPayload result;
     public final Integer orphanFilesDeleted;
     public final Long orphanBytesDeleted;
+    public final String errorMessage;
+    public final String errorType;
 
     private OperationResult(
-        String status, ResultPayload result, Integer orphanFilesDeleted, Long orphanBytesDeleted) {
+        String status,
+        Integer orphanFilesDeleted,
+        Long orphanBytesDeleted,
+        String errorMessage,
+        String errorType) {
       this.status = status;
-      this.result = result;
       this.orphanFilesDeleted = orphanFilesDeleted;
       this.orphanBytesDeleted = orphanBytesDeleted;
+      this.errorMessage = errorMessage;
+      this.errorType = errorType;
     }
 
     static OperationResult success(int orphanFilesDeleted, long orphanBytesDeleted) {
-      return new OperationResult("SUCCESS", null, orphanFilesDeleted, orphanBytesDeleted);
+      return new OperationResult("SUCCESS", orphanFilesDeleted, orphanBytesDeleted, null, null);
     }
 
     static OperationResult failure(String errorMessage, String errorType) {
-      return new OperationResult("FAILED", new ResultPayload(errorMessage, errorType), null, null);
-    }
-  }
-
-  /** Error details nested inside {@link OperationResult}. */
-  @JsonInclude(JsonInclude.Include.NON_NULL)
-  static class ResultPayload {
-    public final String errorMessage;
-    public final String errorType;
-
-    ResultPayload(String errorMessage, String errorType) {
-      this.errorMessage = errorMessage;
-      this.errorType = errorType;
+      return new OperationResult("FAILED", null, null, errorMessage, errorType);
     }
   }
 
diff --git a/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppTest.java b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppTest.java
index 26d4ae460..9f057a8d7 100644
--- a/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppTest.java
+++ b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppTest.java
@@ -239,14 +239,21 @@ public void testBatchedDeletionPartialSuccessWithEndpoint() throws Exception {
       Assertions.assertEquals(1, successCount);
       Assertions.assertEquals(1, failureCount);
 
+      // SUCCESS payload carries orphan-file/byte metrics; no error fields.
       String successBody =
           receivedBodies.stream().filter(b -> b.contains("\"SUCCESS\"")).findFirst().get();
-      Assertions.assertFalse(successBody.contains("\"result\""));
+      Assertions.assertTrue(successBody.contains("\"orphanFilesDeleted\""));
+      Assertions.assertTrue(successBody.contains("\"orphanBytesDeleted\""));
+      Assertions.assertFalse(successBody.contains("\"errorMessage\""));
+      Assertions.assertFalse(successBody.contains("\"errorType\""));
 
+      // FAILED payload carries error fields; no orphan metrics.
       String failureBody =
           receivedBodies.stream().filter(b -> b.contains("\"FAILED\"")).findFirst().get();
       Assertions.assertTrue(failureBody.contains("\"errorMessage\""));
       Assertions.assertTrue(failureBody.contains("\"errorType\""));
+      Assertions.assertFalse(failureBody.contains("\"orphanFilesDeleted\""));
+      Assertions.assertFalse(failureBody.contains("\"orphanBytesDeleted\""));
     } finally {
       httpServer.stop(0);
     }
diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsController.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsController.java
index accf6d543..20bad278b 100644
--- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsController.java
+++ b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsController.java
@@ -29,18 +29,22 @@ public class TableOperationsController {
   private final OptimizerDataService service;
 
   /**
-   * Report that an operation has completed. The body carries the {@code operationId} the caller is
-   * completing along with its terminal status. The backend looks up the operation row, writes a
-   * history entry with the operation's table metadata, and returns 201 Created with the history
-   * row, or 404 if the operation does not exist.
+   * Report that an operation has completed. {@code id} is the operation's UUID; the body carries
+   * the terminal status and any per-operation metrics or error details. The backend looks up the
+   * operation row, writes a history entry with the operation's table metadata plus the supplied
+   * metrics, and returns 201 Created with the history row, or 404 if the operation does not exist.
    */
-  @PostMapping("/complete")
+  @PostMapping("/{id}/complete")
   public ResponseEntity<TableOperationsHistoryDto> completeOperation(
-      @RequestBody CompleteOperationRequestDto request) {
+      @PathVariable String id, @RequestBody CompleteOperationRequestDto request) {
     return service
         .completeOperation(
-            request.getOperationId(),
-            request.getStatus() == null ? null : request.getStatus().toModel())
+            id,
+            request.getStatus() == null ? null : request.getStatus().toModel(),
+            request.getOrphanFilesDeleted(),
+            request.getOrphanBytesDeleted(),
+            request.getErrorMessage(),
+            request.getErrorType())
         .map(
             history ->
                 ResponseEntity.status(HttpStatus.CREATED)
diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/CompleteOperationRequestDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/CompleteOperationRequestDto.java
index 9dca54a8e..c6266812d 100644
--- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/CompleteOperationRequestDto.java
+++ b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/CompleteOperationRequestDto.java
@@ -6,20 +6,19 @@
 import lombok.NoArgsConstructor;
 
 /**
- * Request body for {@code POST /v1/table-operations/complete}.
+ * Request body for {@code POST /v1/optimizer/operations/{id}/complete}.
  *
- * <p>Reports the outcome of a single completed operation. The service looks up the operation row by
- * {@link #operationId} and writes a history entry for it.
+ * <p>Reports the outcome of a single completed operation. The operation row's UUID lives in the URL
+ * path; the body carries the terminal status and any operation-type-specific result metrics.
  *
  * <p>A single Spark job typically processes N tables and yields N independent (status) outcomes —
  * one per operation. Callers issue one complete request per operation; the service does not
  * bulk-complete by job.
  *
- * <p>The remaining fields ({@link #tableUuid}, {@link #databaseName}, {@link #tableName}, {@link
- * #operationType}) are debug-only echo information. The server does not key off them; they are
- * preserved on log lines and traces so an operator looking at a failing complete call can see which
- * (db, table, operation) the caller believed it was completing without joining back to the
- * operation row.
+ * <p>The debug-echo fields ({@link #tableUuid}, {@link #databaseName}, {@link #tableName}, {@link
+ * #operationType}) are optional. The server does not key off them; they are preserved on log lines
+ * and traces so an operator looking at a failing complete call can see which (db, table, operation)
+ * the caller believed it was completing without joining back to the operation row.
  */
 @Data
 @Builder
@@ -27,12 +26,21 @@
 @AllArgsConstructor
 public class CompleteOperationRequestDto {
 
-  /** Operation row's UUID — the primary lookup key. */
-  private String operationId;
-
   /** Terminal outcome for this single operation. */
   private HistoryStatusDto status;
 
+  /** OFD-specific: number of orphan files deleted. Null on failure or non-OFD operation. */
+  private Long orphanFilesDeleted;
+
+  /** OFD-specific: bytes reclaimed. Null on failure or non-OFD operation. */
+  private Long orphanBytesDeleted;
+
+  /** On failure, the exception message from the Spark-side worker. Null on success. */
+  private String errorMessage;
+
+  /** On failure, the simple name of the exception class. Null on success. */
+  private String errorType;
+
   /** Debug echo: stable table identity the caller believed it was completing. */
   private String tableUuid;
 
diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsHistoryDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsHistoryDto.java
index 8b508bf36..6ee23b135 100644
--- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsHistoryDto.java
+++ b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsHistoryDto.java
@@ -35,6 +35,18 @@ public class TableOperationsHistoryDto {
   /** {@code SUCCESS} or {@code FAILED}. */
   private HistoryStatusDto status;
 
+  /** OFD-specific: number of orphan files deleted; null if not OFD or on failure. */
+  private Long orphanFilesDeleted;
+
+  /** OFD-specific: bytes reclaimed by orphan file deletion; null if not OFD or on failure. */
+  private Long orphanBytesDeleted;
+
+  /** On failure, the message from the Spark-side exception. Null on success. */
+  private String errorMessage;
+
+  /** On failure, the simple name of the Spark-side exception class. Null on success. */
+  private String errorType;
+
   /** Convert to the internal-model counterpart. */
   public TableOperationsHistory toModel() {
     return TableOperationsHistory.builder()
@@ -45,6 +57,10 @@ public TableOperationsHistory toModel() {
         .operationType(operationType == null ? null : operationType.toModel())
         .completedAt(completedAt)
         .status(status == null ? null : status.toModel())
+        .orphanFilesDeleted(orphanFilesDeleted)
+        .orphanBytesDeleted(orphanBytesDeleted)
+        .errorMessage(errorMessage)
+        .errorType(errorType)
         .build();
   }
 
@@ -61,6 +77,10 @@ public static TableOperationsHistoryDto fromModel(TableOperationsHistory h) {
         .operationType(OperationTypeDto.fromModel(h.getOperationType()))
         .completedAt(h.getCompletedAt())
         .status(HistoryStatusDto.fromModel(h.getStatus()))
+        .orphanFilesDeleted(h.getOrphanFilesDeleted())
+        .orphanBytesDeleted(h.getOrphanBytesDeleted())
+        .errorMessage(h.getErrorMessage())
+        .errorType(h.getErrorType())
         .build();
   }
 }
diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsHistoryRow.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsHistoryRow.java
index 5f4a598d9..484c3e8dd 100644
--- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsHistoryRow.java
+++ b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsHistoryRow.java
@@ -72,4 +72,20 @@ public class TableOperationsHistoryRow {
   @Enumerated(EnumType.STRING)
   @Column(name = "status", nullable = false, length = 20)
   private HistoryStatus status;
+
+  /** OFD-specific: number of orphan files deleted; null if not an OFD operation or on failure. */
+  @Column(name = "orphan_files_deleted")
+  private Long orphanFilesDeleted;
+
+  /** OFD-specific: bytes reclaimed by orphan file deletion; null if not OFD or on failure. */
+  @Column(name = "orphan_bytes_deleted")
+  private Long orphanBytesDeleted;
+
+  /** On failure, the message from the Spark-side exception. Null on success. */
+  @Column(name = "error_message", length = 1024)
+  private String errorMessage;
+
+  /** On failure, the simple name of the Spark-side exception class. Null on success. */
+  @Column(name = "error_type", length = 256)
+  private String errorType;
 }
diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationsHistory.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationsHistory.java
index 8cbfb6ff7..48381cbe8 100644
--- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationsHistory.java
+++ b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationsHistory.java
@@ -40,6 +40,18 @@ public class TableOperationsHistory {
   /** Terminal outcome: {@link HistoryStatus#SUCCESS} or {@link HistoryStatus#FAILED}. */
   private HistoryStatus status;
 
+  /** OFD-specific: number of orphan files deleted; null if not an OFD operation or on failure. */
+  private Long orphanFilesDeleted;
+
+  /** OFD-specific: bytes reclaimed by orphan file deletion; null if not OFD or on failure. */
+  private Long orphanBytesDeleted;
+
+  /** On failure, the message from the Spark-side exception. Null on success. */
+  private String errorMessage;
+
+  /** On failure, the simple name of the Spark-side exception class. Null on success. */
+  private String errorType;
+
   /** Convert to the corresponding DB row. */
   public TableOperationsHistoryRow toRow() {
     return TableOperationsHistoryRow.builder()
@@ -50,6 +62,10 @@ public TableOperationsHistoryRow toRow() {
         .operationType(operationType == null ? null : operationType.toDb())
         .completedAt(completedAt)
         .status(status == null ? null : status.toDb())
+        .orphanFilesDeleted(orphanFilesDeleted)
+        .orphanBytesDeleted(orphanBytesDeleted)
+        .errorMessage(errorMessage)
+        .errorType(errorType)
         .build();
   }
 
@@ -66,6 +82,10 @@ public static TableOperationsHistory fromRow(TableOperationsHistoryRow row) {
         .operationType(OperationType.fromDb(row.getOperationType()))
         .completedAt(row.getCompletedAt())
         .status(HistoryStatus.fromDb(row.getStatus()))
+        .orphanFilesDeleted(row.getOrphanFilesDeleted())
+        .orphanBytesDeleted(row.getOrphanBytesDeleted())
+        .errorMessage(row.getErrorMessage())
+        .errorType(row.getErrorType())
         .build();
   }
 }
diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataService.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataService.java
index 5d5edaee2..9e0ae3403 100644
--- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataService.java
+++ b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataService.java
@@ -36,10 +36,20 @@ List<TableOperation> listTableOperations(
   /**
    * Complete an operation by writing a history entry. Looks up the operation row by {@code
    * operationId}, copies its table metadata into a new history row with the supplied terminal
-   * {@code status}, and saves it. Returns the history record, or empty if the operation does not
-   * exist.
+   * {@code status} and optional per-operation metrics / error details, and saves it. Returns the
+   * history record, or empty if the operation does not exist.
+   *
+   * <p>The four trailing parameters are all nullable. Successful OFD operations populate {@code
+   * orphanFilesDeleted} / {@code orphanBytesDeleted}; failures populate {@code errorMessage} /
+   * {@code errorType}. Non-OFD operations leave all four null.
    */
-  Optional<TableOperationsHistory> completeOperation(String operationId, HistoryStatus status);
+  Optional<TableOperationsHistory> completeOperation(
+      String operationId,
+      HistoryStatus status,
+      Long orphanFilesDeleted,
+      Long orphanBytesDeleted,
+      String errorMessage,
+      String errorType);
 
   /**
    * Return the operation row for {@code id} regardless of status, or empty if it does not exist.
diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImpl.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImpl.java
index 633411e98..58783d844 100644
--- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImpl.java
+++ b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImpl.java
@@ -63,7 +63,12 @@ public List<TableOperation> listTableOperations(
   @Override
   @Transactional
   public Optional<TableOperationsHistory> completeOperation(
-      String operationId, HistoryStatus status) {
+      String operationId,
+      HistoryStatus status,
+      Long orphanFilesDeleted,
+      Long orphanBytesDeleted,
+      String errorMessage,
+      String errorType) {
     return operationsRepository
         .findById(operationId)
         .map(
@@ -76,6 +81,10 @@ public Optional<TableOperationsHistory> completeOperation(
                     .operationType(OperationType.fromDb(row.getOperationType()))
                     .completedAt(Instant.now())
                     .status(status)
+                    .orphanFilesDeleted(orphanFilesDeleted)
+                    .orphanBytesDeleted(orphanBytesDeleted)
+                    .errorMessage(errorMessage)
+                    .errorType(errorType)
                     .build())
         .map(history -> TableOperationsHistory.fromRow(historyRepository.save(history.toRow())));
   }
diff --git a/services/optimizer/src/main/resources/db/optimizer-schema.sql b/services/optimizer/src/main/resources/db/optimizer-schema.sql
index 892c1c55f..24071e2e8 100644
--- a/services/optimizer/src/main/resources/db/optimizer-schema.sql
+++ b/services/optimizer/src/main/resources/db/optimizer-schema.sql
@@ -38,13 +38,17 @@ CREATE TABLE IF NOT EXISTS table_stats_history (
 );
 
 CREATE TABLE IF NOT EXISTS table_operations_history (
-  id             VARCHAR(36)   NOT NULL,
-  table_uuid     VARCHAR(36)   NOT NULL,
-  database_name  VARCHAR(128)  NOT NULL,
-  table_name     VARCHAR(128)  NOT NULL,
-  operation_type VARCHAR(50)   NOT NULL,
-  completed_at   TIMESTAMP(6)  NOT NULL,
-  status         VARCHAR(20)   NOT NULL,
+  id                    VARCHAR(36)   NOT NULL,
+  table_uuid            VARCHAR(36)   NOT NULL,
+  database_name         VARCHAR(128)  NOT NULL,
+  table_name            VARCHAR(128)  NOT NULL,
+  operation_type        VARCHAR(50)   NOT NULL,
+  completed_at          TIMESTAMP(6)  NOT NULL,
+  status                VARCHAR(20)   NOT NULL,
+  orphan_files_deleted  BIGINT,
+  orphan_bytes_deleted  BIGINT,
+  error_message         VARCHAR(1024),
+  error_type            VARCHAR(256),
   PRIMARY KEY (id),
   INDEX idx_toph_db_table (database_name, table_name),
   -- Drives TableOperationHistoryRepository.findLatestPerTable: the correlated
diff --git a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImplTest.java b/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImplTest.java
index b329459ad..010195d2a 100644
--- a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImplTest.java
+++ b/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImplTest.java
@@ -54,7 +54,7 @@ void completeOperation_writesHistoryFromOperationRow() {
             .build());
 
     Optional<TableOperationsHistory> result =
-        service.completeOperation(operationId, HistoryStatus.SUCCESS);
+        service.completeOperation(operationId, HistoryStatus.SUCCESS, 42L, 1024L, null, null);
 
     assertThat(result).isPresent();
     assertThat(result.get().getStatus()).isEqualTo(HistoryStatus.SUCCESS);
@@ -62,12 +62,45 @@ void completeOperation_writesHistoryFromOperationRow() {
     assertThat(result.get().getOperationType()).isEqualTo(OperationType.ORPHAN_FILES_DELETION);
     assertThat(result.get().getDatabaseName()).isEqualTo("db1");
     assertThat(result.get().getCompletedAt()).isNotNull();
+    assertThat(result.get().getOrphanFilesDeleted()).isEqualTo(42L);
+    assertThat(result.get().getOrphanBytesDeleted()).isEqualTo(1024L);
+    assertThat(result.get().getErrorMessage()).isNull();
+    assertThat(result.get().getErrorType()).isNull();
+  }
+
+  @Test
+  void completeOperation_failurePersistsErrorFields() {
+    String operationId = UUID.randomUUID().toString();
+    operationsRepository.save(
+        TableOperationsRow.builder()
+            .id(operationId)
+            .tableUuid(UUID.randomUUID().toString())
+            .databaseName("db1")
+            .tableName("tbl1")
+            .operationType(com.linkedin.openhouse.optimizer.db.OperationType.ORPHAN_FILES_DELETION)
+            .status(com.linkedin.openhouse.optimizer.db.OperationStatus.SCHEDULED)
+            .createdAt(Instant.now())
+            .scheduledAt(Instant.now())
+            .jobId("spark-job-456")
+            .build());
+
+    Optional<TableOperationsHistory> result =
+        service.completeOperation(
+            operationId, HistoryStatus.FAILED, null, null, "boom", "RuntimeException");
+
+    assertThat(result).isPresent();
+    assertThat(result.get().getStatus()).isEqualTo(HistoryStatus.FAILED);
+    assertThat(result.get().getOrphanFilesDeleted()).isNull();
+    assertThat(result.get().getOrphanBytesDeleted()).isNull();
+    assertThat(result.get().getErrorMessage()).isEqualTo("boom");
+    assertThat(result.get().getErrorType()).isEqualTo("RuntimeException");
   }
 
   @Test
   void completeOperation_notFound_returnsEmpty() {
     Optional<TableOperationsHistory> result =
-        service.completeOperation(UUID.randomUUID().toString(), HistoryStatus.FAILED);
+        service.completeOperation(
+            UUID.randomUUID().toString(), HistoryStatus.FAILED, null, null, null, null);
 
     assertThat(result).isEmpty();
   }

From d0bdd16f2e434c903e37d8369dbaa3a66ddb5b2e Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 20 May 2026 09:08:48 -0700
Subject: [PATCH 03/14] feat(tables): on-commit hook posts stats to the
 optimizer

New OptimizerTableStatsClient PUTs /v1/optimizer/stats/{tableUuid} after
every successful Iceberg commit, carrying database/table names and the
current tableProperties (which the optimizer analyzer reads to decide
opt-in for OFD via maintenance.optimizer.ofd.enabled).

Activated by cluster.optimizer.base-uri. When the property is absent
the bean is not created and IcebergSnapshotsServiceImpl skips the call
(behavior unchanged for unit tests and dev clusters without an
optimizer).

The hook is best-effort: WebClient failures are caught and logged at
WARN; the commit itself succeeds either way. The analyzer re-reads
stats on every cycle so a transient post failure is recovered by the
next commit.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../cluster/configs/ClusterProperties.java    |  7 ++
 .../config/OptimizerTableStatsClient.java     | 95 +++++++++++++++++++
 .../services/IcebergSnapshotsServiceImpl.java | 11 ++-
 .../service/IcebergSnapshotsServiceTest.java  |  5 +
 4 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/config/OptimizerTableStatsClient.java

diff --git a/cluster/configs/src/main/java/com/linkedin/openhouse/cluster/configs/ClusterProperties.java b/cluster/configs/src/main/java/com/linkedin/openhouse/cluster/configs/ClusterProperties.java
index 9d0c81876..e8eb9cd71 100644
--- a/cluster/configs/src/main/java/com/linkedin/openhouse/cluster/configs/ClusterProperties.java
+++ b/cluster/configs/src/main/java/com/linkedin/openhouse/cluster/configs/ClusterProperties.java
@@ -96,4 +96,11 @@ public class ClusterProperties {
   // string
   @Value("${cluster.tables.allowed-client-name-values:}")
   private List<String> allowedClientNameValues;
+
+  /**
+   * Base URI of the optimizer service. When set, the Tables Service POSTs a stats snapshot to the
+   * optimizer on every successful Iceberg commit; when null, the hook is a no-op.
+   */
+  @Value("${cluster.optimizer.base-uri:#{null}}")
+  private String clusterOptimizerBaseUri;
 }
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/config/OptimizerTableStatsClient.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/config/OptimizerTableStatsClient.java
new file mode 100644
index 000000000..f5881c7a4
--- /dev/null
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/config/OptimizerTableStatsClient.java
@@ -0,0 +1,95 @@
+package com.linkedin.openhouse.tables.config;
+
+import com.linkedin.openhouse.tables.model.TableDto;
+import java.time.Duration;
+import java.util.Collections;
+import java.util.Map;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
+import org.springframework.stereotype.Component;
+import org.springframework.web.reactive.function.client.WebClient;
+
+/**
+ * Thin client that pushes a per-table stats snapshot to the optimizer service on every successful
+ * Iceberg commit.
+ *
+ * <p>Activated by {@code cluster.optimizer.base-uri}. When absent (e.g. unit tests, dev clusters
+ * without an optimizer), the bean is not created and {@link
+ * com.linkedin.openhouse.tables.services.IcebergSnapshotsServiceImpl} skips the call.
+ *
+ * <p>The hook is best-effort: failures are logged but do not break the commit. The optimizer
+ * analyzer re-reads stats on each cycle, so a transient post failure is recovered on the next
+ * commit.
+ */
+@Component
+@ConditionalOnProperty(name = "cluster.optimizer.base-uri")
+@Slf4j
+public class OptimizerTableStatsClient {
+
+  private static final Duration TIMEOUT = Duration.ofSeconds(5);
+
+  private final WebClient webClient;
+
+  public OptimizerTableStatsClient(@Value("${cluster.optimizer.base-uri}") String baseUri) {
+    this.webClient = WebClient.builder().baseUrl(baseUri).build();
+  }
+
+  /**
+   * PUT a stats row for {@code table} to the optimizer service. Returns synchronously after the
+   * write completes or fails; exceptions are caught and logged at WARN.
+   */
+  public void upsertTableStats(TableDto table) {
+    String tableUuid = table.getTableUUID();
+    if (tableUuid == null) {
+      log.warn(
+          "Skipping optimizer stats push for {}.{}: tableUUID is null",
+          table.getDatabaseId(),
+          table.getTableId());
+      return;
+    }
+    Map<String, String> tableProperties =
+        table.getTableProperties() == null ? Collections.emptyMap() : table.getTableProperties();
+    UpsertStatsBody body =
+        UpsertStatsBody.builder()
+            .databaseName(table.getDatabaseId())
+            .tableName(table.getTableId())
+            .tableProperties(tableProperties)
+            .build();
+    try {
+      webClient
+          .put()
+          .uri("/v1/optimizer/stats/{tableUuid}", tableUuid)
+          .bodyValue(body)
+          .retrieve()
+          .toBodilessEntity()
+          .block(TIMEOUT);
+    } catch (Exception e) {
+      log.warn(
+          "Optimizer stats push failed for tableUuid={} ({}.{}): {}",
+          tableUuid,
+          table.getDatabaseId(),
+          table.getTableId(),
+          e.getMessage());
+    }
+  }
+
+  /**
+   * Wire body matching {@link
+   * com.linkedin.openhouse.optimizer.api.spec.UpsertTableStatsRequestDto}. Kept as a local
+   * representation so this module does not need to depend on the optimizer api module.
+   */
+  @Data
+  @Builder
+  @NoArgsConstructor
+  @AllArgsConstructor
+  static class UpsertStatsBody {
+    private String databaseName;
+    private String tableName;
+    private Map<String, String> tableProperties;
+  }
+}
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java
index af43a169e..efa18e174 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java
@@ -6,6 +6,7 @@
 import com.linkedin.openhouse.common.exception.UnsupportedClientOperationException;
 import com.linkedin.openhouse.tables.api.spec.v0.request.IcebergSnapshotsRequestBody;
 import com.linkedin.openhouse.tables.authorization.Privileges;
+import com.linkedin.openhouse.tables.config.OptimizerTableStatsClient;
 import com.linkedin.openhouse.tables.dto.mapper.TablesMapper;
 import com.linkedin.openhouse.tables.model.TableDto;
 import com.linkedin.openhouse.tables.model.TableDtoPrimaryKey;
@@ -32,6 +33,10 @@ public class IcebergSnapshotsServiceImpl implements IcebergSnapshotsService {
 
   @Autowired AuthorizationUtils authorizationUtils;
 
+  /** Optional — wired only when {@code cluster.optimizer.base-uri} is set. */
+  @Autowired(required = false)
+  OptimizerTableStatsClient optimizerTableStatsClient;
+
   @Override
   public Pair<TableDto, Boolean> putIcebergSnapshots(
       String databaseId,
@@ -83,7 +88,11 @@ public Pair<TableDto, Boolean> putIcebergSnapshots(
           databaseId, tableCreatorUpdater, Privileges.CREATE_TABLE);
     }
     try {
-      return Pair.of(openHouseInternalRepository.save(tableDtoToSave), !tableDto.isPresent());
+      TableDto saved = openHouseInternalRepository.save(tableDtoToSave);
+      if (optimizerTableStatsClient != null) {
+        optimizerTableStatsClient.upsertTableStats(saved);
+      }
+      return Pair.of(saved, !tableDto.isPresent());
     } catch (BadRequestException e) {
       throw new RequestValidationFailureException(e.getMessage(), e);
     } catch (CommitFailedException ce) {
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java
index 44467b705..37b9e5367 100644
--- a/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java
@@ -9,6 +9,7 @@
 import com.linkedin.openhouse.tables.api.spec.v0.request.IcebergSnapshotsRequestBody;
 import com.linkedin.openhouse.tables.api.spec.v0.request.components.LockState;
 import com.linkedin.openhouse.tables.api.spec.v0.request.components.Policies;
+import com.linkedin.openhouse.tables.config.OptimizerTableStatsClient;
 import com.linkedin.openhouse.tables.dto.mapper.TablesMapper;
 import com.linkedin.openhouse.tables.dto.mapper.TablesMapperImpl;
 import com.linkedin.openhouse.tables.model.TableDto;
@@ -45,6 +46,8 @@ public class IcebergSnapshotsServiceTest {
 
   @MockBean private TableUUIDGenerator tableUUIDGenerator;
 
+  @MockBean private OptimizerTableStatsClient optimizerTableStatsClient;
+
   private OpenHouseInternalRepository mockRepository;
 
   @Captor ArgumentCaptor<TableDto> tableDtoArgumentCaptor;
@@ -88,6 +91,8 @@ public void testTableCreated() {
     Assertions.assertTrue(result.getSecond(), "Table must be created");
 
     verifyCalls(key, TEST_TABLE_CREATOR, requestBody.getCreateUpdateTableRequestBody());
+    Mockito.verify(optimizerTableStatsClient, Mockito.times(1))
+        .upsertTableStats(Mockito.eq(tableDto));
   }
 
   @Test

From f188f552820daad009935a3fcca709b5282b33be Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 20 May 2026 09:19:13 -0700
Subject: [PATCH 04/14] feat(docker): wire optimizer service/analyzer/scheduler
 into compose
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three new repo-root Dockerfiles:
  - optimizer-service.Dockerfile (Spring web app, like tables-service)
  - optimizer-analyzer.Dockerfile (loops the CommandLineRunner on
    ANALYZER_INTERVAL_SECONDS; default 30s)
  - optimizer-scheduler.Dockerfile (loops on SCHEDULER_INTERVAL_SECONDS;
    default 30s)

Compose wiring:
  - common/oh-services.yml — three new service blocks pointing at the
    new Dockerfiles; optimizer-service exposes 8005:8080
  - oh-hadoop-spark/docker-compose.yml — includes the three services
    with MySQL credentials and inter-service URLs (analyzer/scheduler
    point at openhouse-optimizer; scheduler points at openhouse-jobs)
  - oh-hadoop-spark/cluster.yaml — sets cluster.optimizer.base-uri so
    the on-commit hook in tables service activates

Build wiring:
  - build.gradle dockerPrereqs depends on the analyzer/scheduler
    bootJar targets

Renamed OPTIMIZER_DB_USERNAME → OPTIMIZER_DB_USER in the optimizer
service application.properties so all three optimizer JVMs read the
same env var name.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 build.gradle                                  |  6 +++
 .../docker-compose/common/oh-services.yml     | 14 ++++++
 .../oh-hadoop-spark/cluster.yaml              |  2 +
 .../oh-hadoop-spark/docker-compose.yml        | 47 +++++++++++++++++++
 optimizer-analyzer.Dockerfile                 | 44 +++++++++++++++++
 optimizer-scheduler.Dockerfile                | 43 +++++++++++++++++
 optimizer-service.Dockerfile                  | 47 +++++++++++++++++++
 .../src/main/resources/application.properties |  2 +-
 8 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 optimizer-analyzer.Dockerfile
 create mode 100644 optimizer-scheduler.Dockerfile
 create mode 100644 optimizer-service.Dockerfile

diff --git a/build.gradle b/build.gradle
index f7cdd71a8..25be0c820 100644
--- a/build.gradle
+++ b/build.gradle
@@ -178,6 +178,8 @@ tasks.register('CopyGitHooksTask', Copy) {
 //   housetables-service.Dockerfile -> :services:housetables:bootJar
 //   jobs-service.Dockerfile        -> :services:jobs:bootJar
 //   optimizer-service.Dockerfile   -> :services:optimizer:bootJar
+//   optimizer-analyzer.Dockerfile  -> :apps:optimizer-analyzer:bootJar
+//   optimizer-scheduler.Dockerfile -> :apps:optimizer-scheduler:bootJar
 //   jobs-scheduler.Dockerfile      -> :apps:openhouse-spark-apps_2.12:shadowJar (uber JAR)
 //   spark-base-hadoop2.8.dockerfile ->
 //       :integrations:spark:spark-3.1:openhouse-spark-runtime_2.12:shadowJar (uber JAR)
@@ -198,6 +200,8 @@ tasks.register('dockerPrereqs') {
   dependsOn ':services:housetables:bootJar'
   dependsOn ':services:jobs:bootJar'
   dependsOn ':services:optimizer:bootJar'
+  dependsOn ':apps:optimizer-analyzer:bootJar'
+  dependsOn ':apps:optimizer-scheduler:bootJar'
 
   // Spark runtime uber JARs (shadowJar)
   dependsOn ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12:shadowJar'
@@ -222,6 +226,8 @@ tasks.register('dockerPrereqs') {
     println '  build/housetables/libs/housetables.jar'
     println '  build/jobs/libs/jobs.jar'
     println '  build/optimizer/libs/optimizer.jar'
+    println '  build/optimizer-analyzer/libs/optimizer-analyzer.jar'
+    println '  build/optimizer-scheduler/libs/optimizer-scheduler.jar'
     println '  build/openhouse-spark-runtime_2.12/libs/openhouse-spark-runtime_2.12-uber.jar'
     println '  build/openhouse-spark-3.5-runtime_2.12/libs/openhouse-spark-3.5-runtime_2.12-uber.jar'
     println '  build/openhouse-spark-apps_2.12/libs/openhouse-spark-apps_2.12-uber.jar'
diff --git a/infra/recipes/docker-compose/common/oh-services.yml b/infra/recipes/docker-compose/common/oh-services.yml
index 73ef5a656..71c4f889e 100644
--- a/infra/recipes/docker-compose/common/oh-services.yml
+++ b/infra/recipes/docker-compose/common/oh-services.yml
@@ -22,6 +22,20 @@ services:
     build:
       context: ../../../..
       dockerfile: jobs-scheduler.Dockerfile
+  openhouse-optimizer:
+    build:
+      context: ../../../..
+      dockerfile: optimizer-service.Dockerfile
+    ports:
+      - 8005:8080
+  openhouse-optimizer-analyzer:
+    build:
+      context: ../../../..
+      dockerfile: optimizer-analyzer.Dockerfile
+  openhouse-optimizer-scheduler:
+    build:
+      context: ../../../..
+      dockerfile: optimizer-scheduler.Dockerfile
   prometheus:
     image: prom/prometheus:v2.21.0
     ports:
diff --git a/infra/recipes/docker-compose/oh-hadoop-spark/cluster.yaml b/infra/recipes/docker-compose/oh-hadoop-spark/cluster.yaml
index bb72176c5..81d142815 100644
--- a/infra/recipes/docker-compose/oh-hadoop-spark/cluster.yaml
+++ b/infra/recipes/docker-compose/oh-hadoop-spark/cluster.yaml
@@ -23,6 +23,8 @@ cluster:
     database:
       type: "MYSQL"
       url: "jdbc:mysql://mysql:3306/oh_db?allowPublicKeyRetrieval=true&useSSL=false"
+  optimizer:
+    base-uri: "http://openhouse-optimizer:8080"
   security:
     token:
       interceptor:
diff --git a/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml b/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml
index 0b8df01ac..73e8e2e85 100644
--- a/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml
+++ b/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml
@@ -81,6 +81,53 @@ services:
       - OTEL_LOGS_EXPORTER=none
       - OTEL_TRACES_SAMPLER=always_on
 
+  openhouse-optimizer:
+    container_name: local.openhouse-optimizer
+    extends:
+      file: ../common/oh-services.yml
+      service: openhouse-optimizer
+    volumes:
+      - ./:/var/config/
+    depends_on:
+      - mysql
+    environment:
+      - OPTIMIZER_DB_URL=jdbc:mysql://mysql:3306/oh_db?allowPublicKeyRetrieval=true&useSSL=false
+      - OPTIMIZER_DB_USER=oh_user
+      - OPTIMIZER_DB_PASSWORD=oh_password
+
+  openhouse-optimizer-analyzer:
+    container_name: local.openhouse-optimizer-analyzer
+    extends:
+      file: ../common/oh-services.yml
+      service: openhouse-optimizer-analyzer
+    volumes:
+      - ./:/var/config/
+    depends_on:
+      - openhouse-optimizer
+    environment:
+      - OPTIMIZER_DB_URL=jdbc:mysql://mysql:3306/oh_db?allowPublicKeyRetrieval=true&useSSL=false
+      - OPTIMIZER_DB_USER=oh_user
+      - OPTIMIZER_DB_PASSWORD=oh_password
+      - ANALYZER_INTERVAL_SECONDS=30
+
+  openhouse-optimizer-scheduler:
+    container_name: local.openhouse-optimizer-scheduler
+    extends:
+      file: ../common/oh-services.yml
+      service: openhouse-optimizer-scheduler
+    volumes:
+      - ./:/var/config/
+    depends_on:
+      - openhouse-optimizer
+      - openhouse-jobs
+    environment:
+      - OPTIMIZER_DB_URL=jdbc:mysql://mysql:3306/oh_db?allowPublicKeyRetrieval=true&useSSL=false
+      - OPTIMIZER_DB_USER=oh_user
+      - OPTIMIZER_DB_PASSWORD=oh_password
+      - JOBS_BASE_URI=http://openhouse-jobs:8080
+      - SCHEDULER_RESULTS_ENDPOINT=http://openhouse-optimizer:8080/v1/optimizer/operations
+      - SCHEDULER_INTERVAL_SECONDS=30
+
   jaeger:
     container_name: local.jaeger
     image: jaegertracing/all-in-one:1.54
diff --git a/optimizer-analyzer.Dockerfile b/optimizer-analyzer.Dockerfile
new file mode 100644
index 000000000..c2374c456
--- /dev/null
+++ b/optimizer-analyzer.Dockerfile
@@ -0,0 +1,44 @@
+FROM openjdk:23-ea-11-slim
+
+ARG USER=openhouse
+ARG USER_ID=1000
+ARG GROUP_ID=1000
+ENV APP_NAME=optimizer-analyzer
+ENV USER_HOME=/home/$USER
+ENV ANALYZER_INTERVAL_SECONDS=30
+
+# Create an openhouse user as there's no reason to run as root user
+RUN groupadd --force -g $GROUP_ID $USER && useradd -l -d $USER_HOME -m $USER -u $USER_ID -g $GROUP_ID
+
+WORKDIR $USER_HOME
+
+# IMAGE does not set the necessary paths by default.
+ENV PATH=$PATH:/export/apps/jdk/JDK-1_8_0_172/bin/:$USER_HOME
+
+ARG VERSION="1.0.0-SNAPSHOT"
+ARG BUILD_DIR="build/$APP_NAME/libs"
+ARG JAR_FILES=$BUILD_DIR/*.jar
+
+COPY $JAR_FILES ./
+
+# Delete unwanted JAR files
+RUN find . -name "*-sources.jar" -delete
+RUN find . -name "*-javadoc.jar" -delete
+RUN find . -name "*-lib.jar" -delete
+
+# Rename the JAR file.
+RUN find ./ -name "*.jar" -exec mv {} $APP_NAME.jar \;
+RUN ls $APP_NAME.jar
+
+# Ensure that everything in $USER_HOME is owned by openhouse user
+RUN chown -R openhouse:openhouse $USER_HOME
+
+# Setup default path for Java
+RUN mkdir -p /usr/java && ln -sfn /export/apps/jdk/JDK-1_8_0_172 /usr/java/default
+
+USER $USER
+
+# Loop the analyzer on a fixed interval. Each pass is a fresh JVM, so opt-3's
+# analyzer (which runs once per CommandLineRunner invocation and exits) becomes
+# a continuous service in this container.
+ENTRYPOINT ["sh", "-c", "while true; do echo \"Running $APP_NAME at $(date)\"; java -Xmx256M -Xms64M -XX:NativeMemoryTracking=summary -jar $APP_NAME.jar; echo \"Exited; sleeping ${ANALYZER_INTERVAL_SECONDS}s\"; sleep ${ANALYZER_INTERVAL_SECONDS}; done"]
diff --git a/optimizer-scheduler.Dockerfile b/optimizer-scheduler.Dockerfile
new file mode 100644
index 000000000..d83fbe9e6
--- /dev/null
+++ b/optimizer-scheduler.Dockerfile
@@ -0,0 +1,43 @@
+FROM openjdk:23-ea-11-slim
+
+ARG USER=openhouse
+ARG USER_ID=1000
+ARG GROUP_ID=1000
+ENV APP_NAME=optimizer-scheduler
+ENV USER_HOME=/home/$USER
+ENV SCHEDULER_INTERVAL_SECONDS=30
+
+# Create an openhouse user as there's no reason to run as root user
+RUN groupadd --force -g $GROUP_ID $USER && useradd -l -d $USER_HOME -m $USER -u $USER_ID -g $GROUP_ID
+
+WORKDIR $USER_HOME
+
+# IMAGE does not set the necessary paths by default.
+ENV PATH=$PATH:/export/apps/jdk/JDK-1_8_0_172/bin/:$USER_HOME
+
+ARG VERSION="1.0.0-SNAPSHOT"
+ARG BUILD_DIR="build/$APP_NAME/libs"
+ARG JAR_FILES=$BUILD_DIR/*.jar
+
+COPY $JAR_FILES ./
+
+# Delete unwanted JAR files
+RUN find . -name "*-sources.jar" -delete
+RUN find . -name "*-javadoc.jar" -delete
+RUN find . -name "*-lib.jar" -delete
+
+# Rename the JAR file.
+RUN find ./ -name "*.jar" -exec mv {} $APP_NAME.jar \;
+RUN ls $APP_NAME.jar
+
+# Ensure that everything in $USER_HOME is owned by openhouse user
+RUN chown -R openhouse:openhouse $USER_HOME
+
+# Setup default path for Java
+RUN mkdir -p /usr/java && ln -sfn /export/apps/jdk/JDK-1_8_0_172 /usr/java/default
+
+USER $USER
+
+# Loop the scheduler on a fixed interval; each pass is a fresh JVM (opt-4's
+# scheduler runs once per CommandLineRunner invocation and exits).
+ENTRYPOINT ["sh", "-c", "while true; do echo \"Running $APP_NAME at $(date)\"; java -Xmx256M -Xms64M -XX:NativeMemoryTracking=summary -jar $APP_NAME.jar; echo \"Exited; sleeping ${SCHEDULER_INTERVAL_SECONDS}s\"; sleep ${SCHEDULER_INTERVAL_SECONDS}; done"]
diff --git a/optimizer-service.Dockerfile b/optimizer-service.Dockerfile
new file mode 100644
index 000000000..8f2af923c
--- /dev/null
+++ b/optimizer-service.Dockerfile
@@ -0,0 +1,47 @@
+FROM openjdk:23-ea-11-slim
+
+ARG USER=openhouse
+ARG USER_ID=1000
+ARG GROUP_ID=1000
+ENV APP_NAME=optimizer
+ENV USER_HOME=/home/$USER
+
+# Create an openhouse user as there's no reason to run as root user
+RUN groupadd --force -g $GROUP_ID $USER && useradd -l -d $USER_HOME -m $USER -u $USER_ID -g $GROUP_ID
+
+WORKDIR $USER_HOME
+
+# IMAGE does not set the necessary paths by default.
+ENV PATH=$PATH:/export/apps/jdk/JDK-1_8_0_172/bin/:$USER_HOME
+
+ARG VERSION="1.0.0-SNAPSHOT"
+ARG BUILD_DIR="build/$APP_NAME/libs"
+ARG JAR_FILES=$BUILD_DIR/*.jar
+
+COPY $JAR_FILES ./
+
+# Delete unwanted JAR files
+RUN ls ./
+RUN find . -name "*-sources.jar" -delete
+RUN find . -name "*-javadoc.jar" -delete
+RUN find . -name "*-lib.jar" -delete
+
+# Rename the JAR file.
+RUN ls ./
+RUN find ./ -name "*.jar" -exec mv {} $APP_NAME.jar \;
+
+RUN ls $APP_NAME.jar
+
+COPY run.sh .
+
+
+# Ensure that everything in $USER_HOME is owned by openhouse user
+RUN chown -R openhouse:openhouse $USER_HOME
+
+# Setup default path for Java
+RUN mkdir -p /usr/java && ln -sfn /export/apps/jdk/JDK-1_8_0_172 /usr/java/default
+
+USER $USER
+
+EXPOSE 8080
+ENTRYPOINT ["sh", "-c", "./run.sh $APP_NAME.jar $@"]
diff --git a/services/optimizer/src/main/resources/application.properties b/services/optimizer/src/main/resources/application.properties
index c6c3f8437..e78745d00 100644
--- a/services/optimizer/src/main/resources/application.properties
+++ b/services/optimizer/src/main/resources/application.properties
@@ -12,7 +12,7 @@ spring.jpa.properties.hibernate.physical_naming_strategy=org.hibernate.boot.mode
 
 spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
 spring.datasource.url=${OPTIMIZER_DB_URL:jdbc:mysql://localhost:3306/oh_db}
-spring.datasource.username=${OPTIMIZER_DB_USERNAME:oh_user}
+spring.datasource.username=${OPTIMIZER_DB_USER:oh_user}
 spring.datasource.password=${OPTIMIZER_DB_PASSWORD:oh_password}
 spring.datasource.hikari.maximum-pool-size=20
 

From 37304cfe45dfbd8d21fd793cf3dd29b81aeb4be2 Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 20 May 2026 09:22:36 -0700
Subject: [PATCH 05/14] fix(optimizer): point @EntityScan at the actual db
 package

AnalyzerApplication and SchedulerApplication scanned
com.linkedin.openhouse.optimizer.entity for JPA-managed entities, but
the entities live in com.linkedin.openhouse.optimizer.db. Both apps
crashed at startup against a real database (MySQL backend in docker)
with "Not a managed type: class ...db.TableStatsRow". Unit tests
didn't catch it because they configure JPA differently.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../com/linkedin/openhouse/analyzer/AnalyzerApplication.java    | 2 +-
 .../com/linkedin/openhouse/scheduler/SchedulerApplication.java  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/optimizer-analyzer/src/main/java/com/linkedin/openhouse/analyzer/AnalyzerApplication.java b/apps/optimizer-analyzer/src/main/java/com/linkedin/openhouse/analyzer/AnalyzerApplication.java
index edee9c02e..1b6250c5e 100644
--- a/apps/optimizer-analyzer/src/main/java/com/linkedin/openhouse/analyzer/AnalyzerApplication.java
+++ b/apps/optimizer-analyzer/src/main/java/com/linkedin/openhouse/analyzer/AnalyzerApplication.java
@@ -10,7 +10,7 @@
 
 /** Entry point for the Optimizer Analyzer application. */
 @SpringBootApplication
-@EntityScan(basePackages = "com.linkedin.openhouse.optimizer.entity")
+@EntityScan(basePackages = "com.linkedin.openhouse.optimizer.db")
 @EnableJpaRepositories(basePackages = "com.linkedin.openhouse.optimizer.repository")
 public class AnalyzerApplication {
 
diff --git a/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/SchedulerApplication.java b/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/SchedulerApplication.java
index b28d1b73c..0b3b836d3 100644
--- a/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/SchedulerApplication.java
+++ b/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/SchedulerApplication.java
@@ -11,7 +11,7 @@
 
 /** Entry point for the Optimizer Scheduler application. */
 @SpringBootApplication
-@EntityScan(basePackages = "com.linkedin.openhouse.optimizer.entity")
+@EntityScan(basePackages = "com.linkedin.openhouse.optimizer.db")
 @EnableJpaRepositories(basePackages = "com.linkedin.openhouse.optimizer.repository")
 public class SchedulerApplication {
 

From 1ec3acf01bf31ca6bbd557ba4f66e568802ee8cd Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 20 May 2026 10:18:29 -0700
Subject: [PATCH 06/14] fix(optimizer): four gaps surfaced by the end-to-end
 docker run

Discovered while wiring up the OFD flow on the live oh-hadoop-spark
stack. Each fix lives in a file owned by a lower PR; consolidating on
opt-5 per the delta-discovery framing, to be relocated in a follow-up.

  1. FileCountBinPacker.cost() NPE on stats with null snapshot.
     The analyzer can legitimately schedule OFD against a table whose
     stats row has no snapshot (e.g. brand-new table). Treat that as
     cost 0 instead of crashing.

  2. SchedulerRunner.schedule(OperationType) was non-@Transactional;
     the 3-arg overload was annotated but self-invocation bypasses the
     CGLIB proxy, so the @Modifying repository calls hit
     TransactionRequiredException. Annotate the no-arg variant too.

  3. TableOperation model dropped jobId entirely (db row had it; api
     dto had a field but couldn't be populated). Wire jobId through
     model.toRow/fromRow and dto.toModel/fromModel so the SCHEDULED
     row exposes its Spark job id over the API.

  4. jobs.yaml ORPHAN_FILES_DELETION mapping pointed at the single-table
     OrphanFilesDeletionSparkApp, which doesn't recognize --tableNames
     and crashed at arg parsing. Route to BatchedOrphanFilesDeletionSparkApp.

Verified end-to-end: stats PUT -> analyzer PENDING -> scheduler SCHEDULED
with jobId -> Livy -> BatchedOFD launches -> NoSuchTableException for
the stub table -> POST /operations/{id}/complete with FAILED + errorMessage
+ errorType -> history row written. The success path is structurally the
same; we'll exercise it once we wire up real Spark-table creation.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../com/linkedin/openhouse/scheduler/FileCountBinPacker.java | 3 +++
 .../com/linkedin/openhouse/scheduler/SchedulerRunner.java    | 1 +
 infra/recipes/docker-compose/oh-hadoop-spark/jobs.yaml       | 2 +-
 .../openhouse/optimizer/api/spec/TableOperationsDto.java     | 2 ++
 .../linkedin/openhouse/optimizer/model/TableOperation.java   | 5 +++++
 5 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/FileCountBinPacker.java b/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/FileCountBinPacker.java
index 9110b0dea..060e001e8 100644
--- a/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/FileCountBinPacker.java
+++ b/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/FileCountBinPacker.java
@@ -74,6 +74,9 @@ public List<Bin> pack(List<SchedulingCandidate> pending) {
   }
 
   private static long cost(TableStats stats) {
+    if (stats == null || stats.getSnapshot() == null) {
+      return 0L;
+    }
     Long n = stats.getSnapshot().getNumCurrentFiles();
     return n != null ? n : 0L;
   }
diff --git a/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/SchedulerRunner.java b/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/SchedulerRunner.java
index 124633aa3..2c602ba0d 100644
--- a/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/SchedulerRunner.java
+++ b/apps/optimizer-scheduler/src/main/java/com/linkedin/openhouse/scheduler/SchedulerRunner.java
@@ -49,6 +49,7 @@ public SchedulerRunner(
   }
 
   /** Schedule all PENDING operations of the given type across all databases. */
+  @Transactional
   public void schedule(OperationType operationType) {
     schedule(operationType, Optional.empty(), Optional.empty());
   }
diff --git a/infra/recipes/docker-compose/oh-hadoop-spark/jobs.yaml b/infra/recipes/docker-compose/oh-hadoop-spark/jobs.yaml
index 3e795d4db..b666b45a8 100644
--- a/infra/recipes/docker-compose/oh-hadoop-spark/jobs.yaml
+++ b/infra/recipes/docker-compose/oh-hadoop-spark/jobs.yaml
@@ -51,7 +51,7 @@ jobs:
         << : *apps-defaults
         << : *livy-engine
       - type: ORPHAN_FILES_DELETION
-        class-name: com.linkedin.openhouse.jobs.spark.OrphanFilesDeletionSparkApp
+        class-name: com.linkedin.openhouse.jobs.spark.BatchedOrphanFilesDeletionSparkApp
         args: ["--backupDir", ".backup"]
         <<: *apps-defaults
         spark-properties:
diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsDto.java
index 496f59f42..14673c7c9 100644
--- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsDto.java
+++ b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsDto.java
@@ -52,6 +52,7 @@ public TableOperation toModel() {
         .status(status == null ? null : status.toModel())
         .createdAt(createdAt)
         .scheduledAt(scheduledAt)
+        .jobId(jobId)
         .build();
   }
 
@@ -69,6 +70,7 @@ public static TableOperationsDto fromModel(TableOperation op) {
         .status(OperationStatusDto.fromModel(op.getStatus()))
         .createdAt(op.getCreatedAt())
         .scheduledAt(op.getScheduledAt())
+        .jobId(op.getJobId())
         .build();
   }
 }
diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperation.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperation.java
index dcaf9aeea..6007ba0fb 100644
--- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperation.java
+++ b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperation.java
@@ -46,6 +46,9 @@ public class TableOperation {
   /** When the scheduler last submitted a job for this operation. */
   private Instant scheduledAt;
 
+  /** Job ID returned by the Jobs Service after the scheduler submitted; null until SCHEDULED. */
+  private String jobId;
+
   /** Create a new PENDING operation for the given table and operation type. */
   public static TableOperation pending(Table table, OperationType operationType) {
     return TableOperation.builder()
@@ -77,6 +80,7 @@ public TableOperationsRow toRow() {
         .status(status == null ? null : status.toDb())
         .createdAt(createdAt)
         .scheduledAt(scheduledAt)
+        .jobId(jobId)
         .build();
   }
 
@@ -94,6 +98,7 @@ public static TableOperation fromRow(TableOperationsRow row) {
         .status(OperationStatus.fromDb(row.getStatus()))
         .createdAt(row.getCreatedAt())
         .scheduledAt(row.getScheduledAt())
+        .jobId(row.getJobId())
         .build();
   }
 }

From 65b72051dab8593cc9d686403bec2c800dadb067 Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 27 May 2026 14:19:10 -0700
Subject: [PATCH 07/14] fix(optimizer-5): close five wiring gaps surfaced by
 the end-to-end OFD demo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SUCCESS-path demo on -5 exposed five separate gaps between the merged
component layers; each was a real bug that blocked the loop from running
green out-of-the-box.

1. build.gradle dockerPrereqs + Dockerfile BUILD_DIR pointed at the
   pre-split paths (apps/optimizer-analyzer, apps/optimizer-scheduler).
   After the analyzer/scheduler module split into
   apps/optimizer/{analyzerapp,schedulerapp} and the corresponding
   build outputs land at build/{analyzerapp,schedulerapp}/libs/,
   dockerUp failed with "Task with path ':apps:optimizer-analyzer:bootJar'
   not found".

2. tables-service env var CLUSTER_OPTIMIZER_BASE_URI. The
   OptimizerTableStatsClient @ConditionalOnProperty(name=
   "cluster.optimizer.base-uri") did not activate from the cluster.yaml-
   loaded value alone in the running container — likely a property-source
   ordering race between the tables scan-package and ClusterProperties'
   @PropertySource. Setting the env var lets Spring's relaxed binding
   resolve the property deterministically before the conditional fires.
   Verified via /actuator/beans: optimizerTableStatsClient now appears
   in the bean list.

3. CadenceBasedOrphanFilesDeletionAnalyzer ambiguous-constructor crash.
   The class has two constructors (a public @Value-injected one and a
   package-private test one). Without @Autowired on either, Spring
   couldn't pick and threw
   NoSuchMethodException: ...<init>()
   on every analyzer cron pass. The analyzer integration test wires the
   policy directly via the test constructor, so the @Component-scan path
   was never exercised — real bug only visible in the deployable app.

4. jobs.yaml --ttl 0 for ORPHAN_FILES_DELETION. The Spark app defaults to
   a 7-day older-than filter on candidate files. Newly-expired snapshots
   in the test stack are seconds old, so OFD reported 0 files deleted
   every run. Adding --ttl 0 makes all orphans eligible in the test
   recipe.

End-to-end SUCCESS verified on 2/3 tables in the docker stack:
demo_ofd_b (23 files / 101,503 bytes deleted) and demo_ofd_c (11 files /
47,385 bytes deleted) both reach status=SUCCESS in history with
per-table metrics. The 3rd table hits a docker-stack OOM
(executor SIGKILL/137) under three concurrent OFD jobs, not a code bug.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 build.gradle                                         | 12 ++++++------
 .../oh-hadoop-spark/docker-compose.yml               |  1 +
 .../recipes/docker-compose/oh-hadoop-spark/jobs.yaml |  2 +-
 optimizer-analyzer.Dockerfile                        |  2 +-
 optimizer-scheduler.Dockerfile                       |  2 +-
 .../CadenceBasedOrphanFilesDeletionAnalyzer.java     |  1 +
 6 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/build.gradle b/build.gradle
index 964b67e3a..1f951a91c 100644
--- a/build.gradle
+++ b/build.gradle
@@ -178,8 +178,8 @@ tasks.register('CopyGitHooksTask', Copy) {
 //   housetables-service.Dockerfile -> :services:housetables:bootJar
 //   jobs-service.Dockerfile        -> :services:jobs:bootJar
 //   optimizer-service.Dockerfile   -> :services:optimizer:bootJar
-//   optimizer-analyzer.Dockerfile  -> :apps:optimizer-analyzer:bootJar
-//   optimizer-scheduler.Dockerfile -> :apps:optimizer-scheduler:bootJar
+//   optimizer-analyzer.Dockerfile  -> :apps:optimizer:analyzerapp:bootJar
+//   optimizer-scheduler.Dockerfile -> :apps:optimizer:schedulerapp:bootJar
 //   jobs-scheduler.Dockerfile      -> :apps:openhouse-spark-apps_2.12:shadowJar (uber JAR)
 //   spark-base-hadoop2.8.dockerfile ->
 //       :integrations:spark:spark-3.1:openhouse-spark-runtime_2.12:shadowJar (uber JAR)
@@ -200,8 +200,8 @@ tasks.register('dockerPrereqs') {
   dependsOn ':services:housetables:bootJar'
   dependsOn ':services:jobs:bootJar'
   dependsOn ':services:optimizer:bootJar'
-  dependsOn ':apps:optimizer-analyzer:bootJar'
-  dependsOn ':apps:optimizer-scheduler:bootJar'
+  dependsOn ':apps:optimizer:analyzerapp:bootJar'
+  dependsOn ':apps:optimizer:schedulerapp:bootJar'
 
   // Spark runtime uber JARs (shadowJar)
   dependsOn ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12:shadowJar'
@@ -226,8 +226,8 @@ tasks.register('dockerPrereqs') {
     println '  build/housetables/libs/housetables.jar'
     println '  build/jobs/libs/jobs.jar'
     println '  build/optimizer/libs/optimizer.jar'
-    println '  build/optimizer-analyzer/libs/optimizer-analyzer.jar'
-    println '  build/optimizer-scheduler/libs/optimizer-scheduler.jar'
+    println '  build/analyzerapp/libs/analyzerapp.jar'
+    println '  build/schedulerapp/libs/schedulerapp.jar'
     println '  build/openhouse-spark-runtime_2.12/libs/openhouse-spark-runtime_2.12-uber.jar'
     println '  build/openhouse-spark-3.5-runtime_2.12/libs/openhouse-spark-3.5-runtime_2.12-uber.jar'
     println '  build/openhouse-spark-apps_2.12/libs/openhouse-spark-apps_2.12-uber.jar'
diff --git a/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml b/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml
index 73e8e2e85..ec404ae81 100644
--- a/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml
+++ b/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml
@@ -22,6 +22,7 @@ services:
       - OTEL_METRICS_EXPORTER=none
       - OTEL_LOGS_EXPORTER=none
       - OTEL_TRACES_SAMPLER=always_on
+      - CLUSTER_OPTIMIZER_BASE_URI=http://openhouse-optimizer:8080
 
   openhouse-jobs:
     container_name: local.openhouse-jobs
diff --git a/infra/recipes/docker-compose/oh-hadoop-spark/jobs.yaml b/infra/recipes/docker-compose/oh-hadoop-spark/jobs.yaml
index b666b45a8..7a0c51e01 100644
--- a/infra/recipes/docker-compose/oh-hadoop-spark/jobs.yaml
+++ b/infra/recipes/docker-compose/oh-hadoop-spark/jobs.yaml
@@ -52,7 +52,7 @@ jobs:
         << : *livy-engine
       - type: ORPHAN_FILES_DELETION
         class-name: com.linkedin.openhouse.jobs.spark.BatchedOrphanFilesDeletionSparkApp
-        args: ["--backupDir", ".backup"]
+        args: ["--backupDir", ".backup", "--ttl", "0"]
         <<: *apps-defaults
         spark-properties:
           <<: *spark-defaults
diff --git a/optimizer-analyzer.Dockerfile b/optimizer-analyzer.Dockerfile
index c2374c456..925018dc7 100644
--- a/optimizer-analyzer.Dockerfile
+++ b/optimizer-analyzer.Dockerfile
@@ -16,7 +16,7 @@ WORKDIR $USER_HOME
 ENV PATH=$PATH:/export/apps/jdk/JDK-1_8_0_172/bin/:$USER_HOME
 
 ARG VERSION="1.0.0-SNAPSHOT"
-ARG BUILD_DIR="build/$APP_NAME/libs"
+ARG BUILD_DIR="build/analyzerapp/libs"
 ARG JAR_FILES=$BUILD_DIR/*.jar
 
 COPY $JAR_FILES ./
diff --git a/optimizer-scheduler.Dockerfile b/optimizer-scheduler.Dockerfile
index d83fbe9e6..9be0f3d4c 100644
--- a/optimizer-scheduler.Dockerfile
+++ b/optimizer-scheduler.Dockerfile
@@ -16,7 +16,7 @@ WORKDIR $USER_HOME
 ENV PATH=$PATH:/export/apps/jdk/JDK-1_8_0_172/bin/:$USER_HOME
 
 ARG VERSION="1.0.0-SNAPSHOT"
-ARG BUILD_DIR="build/$APP_NAME/libs"
+ARG BUILD_DIR="build/schedulerapp/libs"
 ARG JAR_FILES=$BUILD_DIR/*.jar
 
 COPY $JAR_FILES ./
diff --git a/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzer.java b/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzer.java
index 1f4b31542..f344dcb2b 100644
--- a/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzer.java
+++ b/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzer.java
@@ -51,6 +51,7 @@ public class CadenceBasedOrphanFilesDeletionAnalyzer implements OperationAnalyze
 
   private final CadencePolicy cadencePolicy;
 
+  @org.springframework.beans.factory.annotation.Autowired
   public CadenceBasedOrphanFilesDeletionAnalyzer(
       @Value("${ofd.success-retry-hours:16}") long successRetryHours,
       @Value("${ofd.failure-retry-hours:1}") long failureRetryHours) {

From 205f86556d1b736048199624789e876c6dfa5776 Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 27 May 2026 14:19:26 -0700
Subject: [PATCH 08/14] feat(optimizer-5): port end-to-end OFD demo scripts
 from the integration tip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four scripts at the repo root that drive the optimizer loop against a
local oh-hadoop-spark docker stack, adapted for the -5 API surface:

- local-spark-sql.sh — Livy session-reuse shim for one-liner Spark SQL.
- demo-setup.sh — create N tables with maintenance.optimizer.ofd.enabled
  =true, populate via INSERT OVERWRITE, expire snapshots (producing
  orphan data files on HDFS), then wait for the continuous analyzer +
  scheduler to land SCHEDULED rows.
- demo-check.sh — show per-table HDFS file counts and the most recent
  operations-history entries per tableUuid.
- demo.sh — wraps the setup, polls for SUCCESS history per table, and
  runs demo-check.sh at the end.

URLs adapted from the tip's pre-stack surface
(/v1/table-stats, /v1/table-operations, port 8003) to -5's stacked
surface (/v1/optimizer/stats, /v1/optimizer/operations,
/v1/optimizer/operations-history/{tableUuid}, port 8005, required
?limit on list endpoints, history lookup by tableUuid rather than by
databaseName).

The analyzer + scheduler run continuously on -5 (every 30s loop in the
analyzer/scheduler containers), so the manual "run-analyzer"/
"run-scheduler" docker-compose profiles the tip used are dropped — the
demo just waits for the next cron pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 demo-check.sh      |  27 +++++++++
 demo-setup.sh      | 140 +++++++++++++++++++++++++++++++++++++++++++++
 demo.sh            |  39 +++++++++++++
 local-spark-sql.sh |  85 +++++++++++++++++++++++++++
 4 files changed, 291 insertions(+)
 create mode 100755 demo-check.sh
 create mode 100755 demo-setup.sh
 create mode 100755 demo.sh
 create mode 100755 local-spark-sql.sh

diff --git a/demo-check.sh b/demo-check.sh
new file mode 100755
index 000000000..d7c60c55d
--- /dev/null
+++ b/demo-check.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# demo-check.sh — Run after demo-setup.sh. Verifies SUCCESS history rows and
+# shows HDFS file counts per table.
+set -e
+
+OPT_API="http://localhost:8005"
+
+if [ ! -f /tmp/demo_ofd_locs.txt ] || [ ! -f /tmp/demo_ofd_uuids.txt ]; then
+  echo "ERROR: /tmp/demo_ofd_locs.txt or /tmp/demo_ofd_uuids.txt not found — run demo-setup.sh first"
+  exit 1
+fi
+
+echo "=== HDFS data files after OFD ==="
+while IFS='=' read -r TABLE TABLE_LOC; do
+  FILE_COUNT=$(docker exec local.namenode \
+    hdfs dfs -ls -R "$TABLE_LOC/data/" 2>/dev/null \
+    | grep -c "\.orc" || echo "0")
+  echo "  $TABLE: $FILE_COUNT files remaining"
+done < /tmp/demo_ofd_locs.txt
+
+echo ""
+echo "=== Operation history per table ==="
+while IFS='=' read -r TABLE TABLE_UUID; do
+  echo "  --- $TABLE ($TABLE_UUID) ---"
+  curl -sf "$OPT_API/v1/optimizer/operations-history/$TABLE_UUID?limit=10" \
+    | jq '.[] | {status, orphanFilesDeleted, orphanBytesDeleted, errorMessage, completedAt}'
+done < /tmp/demo_ofd_uuids.txt
diff --git a/demo-setup.sh b/demo-setup.sh
new file mode 100755
index 000000000..b39116a4d
--- /dev/null
+++ b/demo-setup.sh
@@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# demo-setup.sh — Populate tables, expire snapshots, wait for the continuous
+# analyzer + scheduler to schedule OFD jobs.
+#
+# Prerequisites: ./gradlew dockerUp  (stack must be healthy)
+#
+# After this script completes, batched-OFD Spark jobs have been submitted via
+# the continuous scheduler. Use demo-check.sh to watch for completion.
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+TOKEN=$(cat "$SCRIPT_DIR/services/common/src/main/resources/dummy.token")
+TABLES_API="http://localhost:8000"
+JOBS_API="http://localhost:8002/jobs"
+OPT_API="http://localhost:8005"
+LIVY_API="http://localhost:9003"
+
+TABLES="demo_ofd_a:5 demo_ofd_b:7 demo_ofd_c:4"
+TABLE_COUNT=3
+ANALYZER_WAIT_SECS=90
+SCHEDULER_WAIT_SECS=90
+
+wait_for_job() {
+  local JOB_ID="$1" LABEL="$2" MAX_SECS="${3:-180}"
+  local i=0
+  while [ $i -lt "$MAX_SECS" ]; do
+    STATE=$(curl -sf "$JOBS_API/$JOB_ID" | jq -r '.state // empty' 2>/dev/null || echo "")
+    [ "$STATE" = "SUCCEEDED" ] && return 0
+    [ "$STATE" = "FAILED" ] && { echo "FAIL: $LABEL job $JOB_ID FAILED"; exit 1; }
+    sleep 5
+    i=$((i + 5))
+  done
+  echo "FAIL: $LABEL job $JOB_ID timed out after ${MAX_SECS}s (last state: $STATE)"
+  exit 1
+}
+
+kill_idle_session() {
+  IDLE=$(curl -sf "$LIVY_API/sessions" \
+    | jq -r '[.sessions[] | select(.state=="idle")] | first | .id // empty')
+  if [ -n "$IDLE" ]; then
+    curl -sf -X DELETE "$LIVY_API/sessions/$IDLE" > /dev/null
+    echo "  Freed idle Livy session $IDLE"
+  fi
+}
+
+wait_for_count() {
+  local URL="$1" EXPECTED="$2" LABEL="$3" MAX_SECS="$4"
+  local i=0
+  while [ $i -lt "$MAX_SECS" ]; do
+    COUNT=$(curl -sf "$URL" | jq 'length' 2>/dev/null || echo "0")
+    [ "$COUNT" -ge "$EXPECTED" ] && return 0
+    printf "  %s: %d/%d (waited %ds)\r" "$LABEL" "$COUNT" "$EXPECTED" "$i"
+    sleep 5
+    i=$((i + 5))
+  done
+  echo ""
+  echo "FAIL: $LABEL expected $EXPECTED, got $COUNT after ${MAX_SECS}s"
+  exit 1
+}
+
+echo "=== [1/4] Create tables and populate with data ==="
+rm -f /tmp/demo_ofd_locs.txt /tmp/demo_ofd_uuids.txt
+
+for entry in $TABLES; do
+  TABLE="${entry%%:*}"
+  WRITES="${entry##*:}"
+  ORPHANS=$((WRITES - 2))
+
+  "$SCRIPT_DIR/local-spark-sql.sh" "DROP TABLE IF EXISTS openhouse.db1.$TABLE" > /dev/null
+  "$SCRIPT_DIR/local-spark-sql.sh" "CREATE TABLE openhouse.db1.$TABLE (
+    id STRING, val STRING
+  ) TBLPROPERTIES ('maintenance.optimizer.ofd.enabled'='true')" > /dev/null
+
+  for i in $(seq 1 "$WRITES"); do
+    "$SCRIPT_DIR/local-spark-sql.sh" \
+      "INSERT OVERWRITE openhouse.db1.$TABLE VALUES ('$i', 'row$i')" > /dev/null
+    printf "  $TABLE: insert %d/%d\r" "$i" "$WRITES"
+  done
+  echo ""
+
+  TBL_JSON=$(curl -sf -H "Authorization: Bearer $TOKEN" \
+    "$TABLES_API/v1/databases/db1/tables/$TABLE")
+  TABLE_LOC=$(dirname "$(echo "$TBL_JSON" | jq -r '.tableLocation')")
+  TABLE_UUID=$(echo "$TBL_JSON" | jq -r '.tableUUID')
+  echo "  $TABLE -> $TABLE_LOC ($WRITES snapshots, $ORPHANS will become orphans)"
+  echo "  $TABLE uuid=$TABLE_UUID"
+  echo "$TABLE=$TABLE_LOC" >> /tmp/demo_ofd_locs.txt
+  echo "$TABLE=$TABLE_UUID" >> /tmp/demo_ofd_uuids.txt
+done
+
+STATS_COUNT=$(curl -sf "$OPT_API/v1/optimizer/stats?limit=100" | jq 'length')
+[ "$STATS_COUNT" -ge "$TABLE_COUNT" ] \
+  || { echo "FAIL: expected $TABLE_COUNT stats rows, got $STATS_COUNT"; exit 1; }
+echo "PASS: $STATS_COUNT stats rows posted by Tables Service on-commit hook"
+
+echo ""
+echo "=== [2/4] Expire old snapshots (creates orphan data files) ==="
+kill_idle_session
+
+EXPIRE_JOBS=""
+for entry in $TABLES; do
+  TABLE="${entry%%:*}"
+  BODY=$(jq -n --arg n "demo-expire-$TABLE" --arg t "db1.$TABLE" \
+    '{jobName:$n, clusterId:"LocalHadoopCluster",
+      jobConf:{jobType:"SNAPSHOTS_EXPIRATION",
+        args:["--tableName",$t,"--maxAge","1","--granularity","days","--versions","1"]}}')
+  JOB_ID=$(curl -sf -X POST "$JOBS_API" -H "Content-Type: application/json" -d "$BODY" \
+    | jq -r '.jobId')
+  [ -n "$JOB_ID" ] || { echo "FAIL: could not submit expiration job for $TABLE"; exit 1; }
+  echo "  $TABLE: submitted $JOB_ID"
+  EXPIRE_JOBS="$EXPIRE_JOBS $JOB_ID"
+done
+
+for JOB_ID in $EXPIRE_JOBS; do
+  wait_for_job "$JOB_ID" "snapshot-expiration" 180
+  echo "  $JOB_ID: SUCCEEDED"
+done
+
+while IFS='=' read -r TABLE TABLE_LOC; do
+  COUNT=$(docker exec local.namenode \
+    hdfs dfs -ls -R "$TABLE_LOC/data/" 2>/dev/null | grep -c "\.orc" || echo "0")
+  [ "$COUNT" -ge 2 ] \
+    || { echo "FAIL: $TABLE has $COUNT files after expiration, expected >= 2"; exit 1; }
+  echo "  $TABLE: $COUNT files on HDFS ($((COUNT-1)) orphans + 1 live)"
+done < /tmp/demo_ofd_locs.txt
+
+echo ""
+echo "=== [3/4] Wait for analyzer → scheduler → SCHEDULED ==="
+echo "  (analyzer and scheduler both run every ~30s; PENDING is transient)"
+wait_for_count "$OPT_API/v1/optimizer/operations?status=SCHEDULED&limit=100" \
+  "$TABLE_COUNT" "SCHEDULED ops" "$(($ANALYZER_WAIT_SECS + $SCHEDULER_WAIT_SECS))"
+echo ""
+echo "PASS: $TABLE_COUNT operations SCHEDULED"
+
+echo ""
+echo "=== [4/4] (skip) — SCHEDULED is the SUCCESS-readiness signal ==="
+
+echo ""
+echo "Setup complete. Batched-OFD Spark jobs have been launched."
+echo "Run ./demo-check.sh to verify SUCCESS in history + orphan files deleted."
diff --git a/demo.sh b/demo.sh
new file mode 100755
index 000000000..89ea74af3
--- /dev/null
+++ b/demo.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# demo.sh — Full end-to-end optimizer demo from a clean docker stack.
+#
+# Prerequisites: ./gradlew dockerUp  (stack must be healthy)
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+OPT_API="http://localhost:8005"
+
+TABLE_COUNT=3
+SUCCESS_WAIT_SECS=300
+
+bash "$SCRIPT_DIR/demo-setup.sh"
+
+echo ""
+echo "=== [Wait] OFD Spark jobs to complete (up to 5 min) ==="
+for i in $(seq 1 30); do
+  TOTAL_HISTORY=0
+  SUCCESS_HISTORY=0
+  if [ -f /tmp/demo_ofd_uuids.txt ]; then
+    while IFS='=' read -r _ UUID; do
+      LATEST=$(curl -sf "$OPT_API/v1/optimizer/operations-history/$UUID?limit=1" \
+        | jq -r '.[0].status // empty' 2>/dev/null || echo "")
+      [ -n "$LATEST" ] && TOTAL_HISTORY=$((TOTAL_HISTORY + 1))
+      [ "$LATEST" = "SUCCESS" ] && SUCCESS_HISTORY=$((SUCCESS_HISTORY + 1))
+    done < /tmp/demo_ofd_uuids.txt
+  fi
+  [ "$SUCCESS_HISTORY" -ge "$TABLE_COUNT" ] && break
+  echo "  $i/30: $SUCCESS_HISTORY SUCCESS / $TOTAL_HISTORY total history rows..."
+  sleep 10
+done
+
+[ "$SUCCESS_HISTORY" -ge "$TABLE_COUNT" ] \
+  || { echo "FAIL: only $SUCCESS_HISTORY/$TABLE_COUNT operations reached SUCCESS"; bash "$SCRIPT_DIR/demo-check.sh"; exit 1; }
+
+echo ""
+echo "PASS: all $TABLE_COUNT operations completed with SUCCESS"
+echo ""
+bash "$SCRIPT_DIR/demo-check.sh"
diff --git a/local-spark-sql.sh b/local-spark-sql.sh
new file mode 100755
index 000000000..45582f698
--- /dev/null
+++ b/local-spark-sql.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# local-spark-sql.sh — Execute a SQL statement against the local OpenHouse docker cluster.
+#
+# Reuses an existing idle Livy session if one exists. First run pays the cold start (~20s),
+# subsequent runs execute immediately.
+#
+# Usage: ./local-spark-sql.sh "INSERT INTO openhouse.db1.smoke_tbl VALUES ('3', 'c')"
+#        ./local-spark-sql.sh --kill   # tear down the reusable session
+set -e
+
+LIVY="http://localhost:9003"
+TOKEN=$(cat /Users/mkuchenb/code/openhouse/services/common/src/main/resources/dummy.token)
+
+if [ "$1" = "--kill" ]; then
+  SESSIONS=$(curl -sf "$LIVY/sessions" | jq -r '.sessions[].id')
+  for sid in $SESSIONS; do
+    curl -sf -X DELETE "$LIVY/sessions/$sid" > /dev/null 2>&1
+    echo "Killed session $sid"
+  done
+  exit 0
+fi
+
+if [ -z "$1" ]; then
+  echo "Usage: local-spark-sql.sh \"<SQL statement>\""
+  echo "       local-spark-sql.sh --kill"
+  exit 1
+fi
+
+SQL="$1"
+
+SESSION_ID=$(curl -sf "$LIVY/sessions" | jq -r '[.sessions[] | select(.state == "idle")] | first | .id // empty')
+
+if [ -z "$SESSION_ID" ]; then
+  SPARK_CONF=$(jq -n --arg token "$TOKEN" '{
+    kind: "sql",
+    conf: {
+      "spark.jars": "local:/opt/spark/openhouse-spark-runtime_2.12-latest-all.jar",
+      "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:1.2.0",
+      "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,com.linkedin.openhouse.spark.extensions.OpenhouseSparkSessionExtensions",
+      "spark.sql.catalog.openhouse": "org.apache.iceberg.spark.SparkCatalog",
+      "spark.sql.catalog.openhouse.catalog-impl": "com.linkedin.openhouse.spark.OpenHouseCatalog",
+      "spark.sql.catalog.openhouse.uri": "http://openhouse-tables:8080",
+      "spark.sql.catalog.openhouse.auth-token": $token,
+      "spark.sql.catalog.openhouse.cluster": "LocalHadoopCluster"
+    }
+  }')
+
+  echo "No idle session found, creating one..."
+  SESSION_ID=$(curl -sf -X POST "$LIVY/sessions" \
+    -H "Content-Type: application/json" \
+    -d "$SPARK_CONF" | jq -r '.id')
+
+  for i in $(seq 1 60); do
+    STATE=$(curl -sf "$LIVY/sessions/$SESSION_ID/state" | jq -r '.state')
+    [ "$STATE" = "idle" ] && break
+    if [ "$STATE" = "error" ] || [ "$STATE" = "dead" ]; then
+      echo "FAIL: session state=$STATE"
+      exit 1
+    fi
+    sleep 2
+  done
+  [ "$STATE" = "idle" ] || { echo "FAIL: session never reached idle (state=$STATE)"; exit 1; }
+fi
+
+STMT_ID=$(curl -sf -X POST "$LIVY/sessions/$SESSION_ID/statements" \
+  -H "Content-Type: application/json" \
+  -d "$(jq -n --arg code "$SQL" '{code: $code}')" | jq -r '.id')
+
+for i in $(seq 1 60); do
+  STMT_STATE=$(curl -sf "$LIVY/sessions/$SESSION_ID/statements/$STMT_ID" | jq -r '.state')
+  [ "$STMT_STATE" = "available" ] && break
+  [ "$STMT_STATE" = "error" ] && break
+  sleep 1
+done
+
+RESULT=$(curl -sf "$LIVY/sessions/$SESSION_ID/statements/$STMT_ID")
+STATUS=$(echo "$RESULT" | jq -r '.output.status')
+
+if [ "$STATUS" = "error" ]; then
+  echo "FAIL: SQL execution error:"
+  echo "$RESULT" | jq -r '.output.evalue, .output.traceback[]?'
+  exit 1
+fi
+
+echo "$RESULT" | jq -r '.output.data["text/plain"] // ""'

From 6e2669ba5ca18ab88d8d5972ff7ce16240a7b8ab Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 27 May 2026 13:26:06 -0700
Subject: [PATCH 09/14] feat(tables): post-commit stats push to optimizer

After a successful Iceberg snapshot commit, push the table's current
snapshot summary to the optimizer's PUT /v1/optimizer/stats/{tableUuid}
endpoint. Opt-in per table via the maintenance.optimizer.stats.enabled
table property; globally gated by optimizer.stats.enabled in
application.properties (default false).

Implementation notes:
- Stats sourced from the in-memory Snapshot.summary() map captured at
  convertToTableDto(); no extra HDFS round-trip on the commit path.
- WebClient + Reactor pipeline runs fire-and-forget on the Netty event
  loop. The hook on the commit thread is .subscribe() and returns
  immediately, so a slow/down optimizer cannot impact commit latency.
- Per-attempt timeout 1000 ms, total budget 2000 ms, up to 3 attempts.
  Retries only fire on retryable errors (network, 408, 429, 5xx,
  TimeoutException). All terminal errors are swallowed; Micrometer
  counters and a warn log preserve observability.
- Bean wiring (WebClient, client component) is @ConditionalOnProperty
  on optimizer.stats.enabled=true. The service consumes the client as
  an Optional, so disabled deployments construct nothing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../common/metrics/MetricsConstant.java       |   6 +
 services/tables/build.gradle                  |   1 +
 .../openhouse/tables/model/TableDto.java      |  13 +
 .../impl/InternalRepositoryUtils.java         |   2 +
 .../services/IcebergSnapshotsServiceImpl.java |  17 +-
 .../optimizer/OptimizerStatsClient.java       | 245 ++++++++++++++++++
 .../optimizer/OptimizerStatsConfig.java       |  35 +++
 .../optimizer/OptimizerStatsProperties.java   |  43 +++
 .../optimizer/OptimizerStatsRequest.java      |  70 +++++
 .../src/main/resources/application.properties |  10 +-
 .../service/IcebergSnapshotsServiceTest.java  |  37 +++
 .../tables/model/TableDtoMappingTest.java     |   3 +-
 .../optimizer/OptimizerStatsClientTest.java   | 242 +++++++++++++++++
 13 files changed, 721 insertions(+), 3 deletions(-)
 create mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClient.java
 create mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
 create mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
 create mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
 create mode 100644 services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClientTest.java

diff --git a/services/common/src/main/java/com/linkedin/openhouse/common/metrics/MetricsConstant.java b/services/common/src/main/java/com/linkedin/openhouse/common/metrics/MetricsConstant.java
index baf65c627..7a47b790e 100644
--- a/services/common/src/main/java/com/linkedin/openhouse/common/metrics/MetricsConstant.java
+++ b/services/common/src/main/java/com/linkedin/openhouse/common/metrics/MetricsConstant.java
@@ -63,4 +63,10 @@ private MetricsConstant() {}
   public static final String HTS_LIST_TABLES_REQUEST = "hts_list_tables_request";
   public static final String HTS_LIST_TABLES_TIME = "hts_list_tables_time";
   public static final String HTS_SEARCH_TABLES_TIME = "hts_search_tables_time";
+
+  // Optimizer post-commit stats push (Tables → Optimizer)
+  public static final String OPTIMIZER_STATS_DURATION = "optimizer_stats_duration";
+  public static final String OPTIMIZER_STATS_ATTEMPTS = "optimizer_stats_attempts";
+  public static final String OPTIMIZER_STATS_SKIPPED = "optimizer_stats_skipped";
+  public static final String OPTIMIZER_STATS_FAILED_FINAL = "optimizer_stats_failed_final";
 }
diff --git a/services/tables/build.gradle b/services/tables/build.gradle
index c85a57131..da42c32ef 100644
--- a/services/tables/build.gradle
+++ b/services/tables/build.gradle
@@ -44,6 +44,7 @@ dependencies {
   testImplementation 'org.junit.jupiter:junit-jupiter-engine:' + junit_version
   testImplementation 'org.springframework.security:spring-security-test:5.7.3'
   testImplementation 'org.springframework:spring-context-support:5.3.18'
+  testImplementation 'com.squareup.okhttp3:mockwebserver:4.10.0'
   testImplementation(testFixtures(project(':services:common')))
   testImplementation (project(':tables-test-fixtures:tables-test-fixtures_2.12')) {
     exclude group: 'com.linkedin.iceberg'
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
index 917c632d8..24373f4a1 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
@@ -16,6 +16,7 @@
 import javax.persistence.Entity;
 import javax.persistence.Id;
 import javax.persistence.IdClass;
+import javax.persistence.Transient;
 import lombok.AccessLevel;
 import lombok.AllArgsConstructor;
 import lombok.Builder;
@@ -83,6 +84,18 @@ public class TableDto {
 
   private boolean replaceCommit;
 
+  /**
+   * Iceberg {@code Snapshot.summary()} for the current snapshot at the time the {@code TableDto}
+   * was constructed (post-commit on save; post-load on read). Populated only when an Iceberg {@code
+   * Table} is available — i.e. by {@code InternalRepositoryUtils.convertToTableDto}. Not persisted,
+   * not part of equality.
+   *
+   * <p>Used downstream by the optimizer post-commit stats push (see {@code
+   * services.optimizer.OptimizerStatsClient}) so that the service layer can read snapshot stats
+   * without a separate HDFS round-trip.
+   */
+  @Transient @EqualsAndHashCode.Exclude private Map<String, String> currentSnapshotSummary;
+
   /**
    * Bundling eligible string type field into a map as {@link org.mapstruct.Mapper} doesn't provide
    * easy interface to achieve so.
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/InternalRepositoryUtils.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/InternalRepositoryUtils.java
index 0dde039c0..9ba40c2d8 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/InternalRepositoryUtils.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/InternalRepositoryUtils.java
@@ -135,6 +135,8 @@ static TableDto convertToTableDto(
             .jsonSnapshots(null)
             .tableProperties(megaProps)
             .sortOrder(SortOrderParser.toJson(table.sortOrder()))
+            .currentSnapshotSummary(
+                table.currentSnapshot() == null ? null : table.currentSnapshot().summary())
             .build();
 
     return tableDto;
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java
index af43a169e..bb2cbde4c 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java
@@ -10,6 +10,7 @@
 import com.linkedin.openhouse.tables.model.TableDto;
 import com.linkedin.openhouse.tables.model.TableDtoPrimaryKey;
 import com.linkedin.openhouse.tables.repository.OpenHouseInternalRepository;
+import com.linkedin.openhouse.tables.services.optimizer.OptimizerStatsClient;
 import com.linkedin.openhouse.tables.utils.AuthorizationUtils;
 import com.linkedin.openhouse.tables.utils.TableUUIDGenerator;
 import java.util.Optional;
@@ -32,6 +33,13 @@ public class IcebergSnapshotsServiceImpl implements IcebergSnapshotsService {
 
   @Autowired AuthorizationUtils authorizationUtils;
 
+  /**
+   * Present only when {@code optimizer.stats.enabled=true}. When absent, no post-commit push is
+   * attempted.
+   */
+  @Autowired(required = false)
+  Optional<OptimizerStatsClient> optimizerStatsClient = Optional.empty();
+
   @Override
   public Pair<TableDto, Boolean> putIcebergSnapshots(
       String databaseId,
@@ -83,7 +91,14 @@ public Pair<TableDto, Boolean> putIcebergSnapshots(
           databaseId, tableCreatorUpdater, Privileges.CREATE_TABLE);
     }
     try {
-      return Pair.of(openHouseInternalRepository.save(tableDtoToSave), !tableDto.isPresent());
+      TableDto savedDto = openHouseInternalRepository.save(tableDtoToSave);
+      // Fire-and-forget push of the post-commit snapshot summary to the optimizer. Returns
+      // immediately; failures are swallowed inside the client. Skipped at the client when the
+      // table is not opted in via maintenance.optimizer.stats.enabled or when there is no current
+      // snapshot (e.g. CREATE TABLE without a data commit).
+      optimizerStatsClient.ifPresent(
+          client -> client.report(savedDto, savedDto.getCurrentSnapshotSummary()));
+      return Pair.of(savedDto, !tableDto.isPresent());
     } catch (BadRequestException e) {
       throw new RequestValidationFailureException(e.getMessage(), e);
     } catch (CommitFailedException ce) {
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClient.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClient.java
new file mode 100644
index 000000000..23c69082e
--- /dev/null
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClient.java
@@ -0,0 +1,245 @@
+package com.linkedin.openhouse.tables.services.optimizer;
+
+import com.linkedin.openhouse.common.metrics.MetricsConstant;
+import com.linkedin.openhouse.tables.model.TableDto;
+import io.micrometer.core.instrument.MeterRegistry;
+import io.micrometer.core.instrument.Timer;
+import java.time.Duration;
+import java.util.Map;
+import java.util.concurrent.TimeoutException;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
+import org.springframework.stereotype.Component;
+import org.springframework.web.reactive.function.client.WebClient;
+import org.springframework.web.reactive.function.client.WebClientRequestException;
+import org.springframework.web.reactive.function.client.WebClientResponseException;
+import reactor.core.publisher.Mono;
+import reactor.util.retry.Retry;
+import reactor.util.retry.RetrySpec;
+
+/**
+ * Pushes a stats record to the optimizer's {@code PUT /v1/optimizer/stats/{tableUuid}} endpoint
+ * after a successful Iceberg snapshot commit. Fire-and-forget — the {@link #report} call returns
+ * immediately on the commit thread; the HTTP exchange runs on the WebClient's Netty event loop.
+ *
+ * <p>Errors are recorded as Micrometer metrics and logged at {@code warn} level, but never
+ * propagated. A push failure must not break the commit.
+ *
+ * <p>Timeout / retry shape (from {@link OptimizerStatsProperties}):
+ *
+ * <ul>
+ *   <li>{@code perAttemptTimeoutMs} bounds each HTTP attempt (1000 ms default).
+ *   <li>{@code maxAttempts} caps the total number of attempts (3 default — one initial + two
+ *       retries). Retries only fire on retryable errors (network, 5xx, 408, 429, timeout).
+ *   <li>{@code totalTimeoutMs} is the outer wall-clock ceiling (2000 ms default). Hard cancel.
+ * </ul>
+ *
+ * <p>The bean is only wired when {@code optimizer.stats.enabled=true}.
+ */
+@Slf4j
+@Component
+@ConditionalOnProperty(prefix = "optimizer.stats", name = "enabled", havingValue = "true")
+public class OptimizerStatsClient {
+
+  /** Per-call URL path. */
+  static final String PATH_TEMPLATE = "/v1/optimizer/stats/{tableUuid}";
+
+  /** Table-property key that opts a table in for the post-commit stats push. */
+  static final String OPT_IN_PROPERTY = "maintenance.optimizer.stats.enabled";
+
+  /** Iceberg snapshot-summary keys we read. All values are decimal-string longs. */
+  static final String SUMMARY_TOTAL_DATA_FILES = "total-data-files";
+
+  static final String SUMMARY_TOTAL_FILES_SIZE = "total-files-size";
+  static final String SUMMARY_ADDED_DATA_FILES = "added-data-files";
+  static final String SUMMARY_DELETED_DATA_FILES = "deleted-data-files";
+  static final String SUMMARY_ADDED_FILES_SIZE = "added-files-size";
+  static final String SUMMARY_REMOVED_FILES_SIZE = "removed-files-size";
+
+  private final WebClient webClient;
+  private final MeterRegistry meterRegistry;
+  private final OptimizerStatsProperties properties;
+
+  public OptimizerStatsClient(
+      @Qualifier("optimizerStatsWebClient") WebClient webClient,
+      MeterRegistry meterRegistry,
+      OptimizerStatsProperties properties) {
+    this.webClient = webClient;
+    this.meterRegistry = meterRegistry;
+    this.properties = properties;
+  }
+
+  /**
+   * Build and dispatch a stats push for the just-committed table. Returns immediately; the HTTP
+   * call runs in the background. {@code snapshotSummary} is the in-memory {@code
+   * Snapshot.summary()} map from the latest snapshot (no HDFS round-trip).
+   *
+   * <p>The call is skipped (and a {@code skipped} counter is emitted) when:
+   *
+   * <ul>
+   *   <li>The table is not opted in via {@link #OPT_IN_PROPERTY}.
+   *   <li>{@code snapshotSummary} is null or empty (no committed snapshot yet — e.g. CREATE TABLE
+   *       with no rows).
+   * </ul>
+   */
+  public void report(TableDto saved, Map<String, String> snapshotSummary) {
+    reportAsync(saved, snapshotSummary).subscribe();
+  }
+
+  /**
+   * Same as {@link #report} but returns the underlying {@link Mono} so callers (notably tests) can
+   * block on completion. Production callers use {@link #report}.
+   */
+  Mono<Void> reportAsync(TableDto saved, Map<String, String> snapshotSummary) {
+    if (!isOptedIn(saved)) {
+      meterRegistry
+          .counter(MetricsConstant.OPTIMIZER_STATS_SKIPPED, "reason", "opt_out")
+          .increment();
+      return Mono.empty();
+    }
+    if (snapshotSummary == null || snapshotSummary.isEmpty()) {
+      meterRegistry
+          .counter(MetricsConstant.OPTIMIZER_STATS_SKIPPED, "reason", "no_snapshot")
+          .increment();
+      return Mono.empty();
+    }
+
+    OptimizerStatsRequest body = buildRequest(saved, snapshotSummary);
+    String tableUuid = saved.getTableUUID();
+    Timer.Sample sample = Timer.start(meterRegistry);
+
+    RetrySpec retrySpec =
+        Retry.max(Math.max(0, properties.getMaxAttempts() - 1))
+            .filter(this::isRetryable)
+            .onRetryExhaustedThrow((spec, signal) -> signal.failure());
+
+    return webClient
+        .put()
+        .uri(PATH_TEMPLATE, tableUuid)
+        .bodyValue(body)
+        .retrieve()
+        .toBodilessEntity()
+        .timeout(Duration.ofMillis(properties.getPerAttemptTimeoutMs()))
+        .doOnError(
+            e ->
+                meterRegistry
+                    .counter(
+                        MetricsConstant.OPTIMIZER_STATS_ATTEMPTS,
+                        "outcome",
+                        isRetryable(e) ? "retryable_failure" : "non_retryable_failure")
+                    .increment())
+        .doOnSuccess(
+            ignored ->
+                meterRegistry
+                    .counter(MetricsConstant.OPTIMIZER_STATS_ATTEMPTS, "outcome", "success")
+                    .increment())
+        .retryWhen(retrySpec)
+        .timeout(Duration.ofMillis(properties.getTotalTimeoutMs()))
+        .doOnSuccess(
+            ignored ->
+                sample.stop(
+                    meterRegistry.timer(
+                        MetricsConstant.OPTIMIZER_STATS_DURATION, "outcome", "success")))
+        .onErrorResume(
+            e -> {
+              String outcome = classifyOutcome(e);
+              sample.stop(
+                  meterRegistry.timer(
+                      MetricsConstant.OPTIMIZER_STATS_DURATION, "outcome", outcome));
+              meterRegistry
+                  .counter(MetricsConstant.OPTIMIZER_STATS_FAILED_FINAL, "outcome", outcome)
+                  .increment();
+              log.warn(
+                  "Optimizer stats push failed for table {} ({}): {}",
+                  tableUuid,
+                  outcome,
+                  e.toString());
+              return Mono.empty();
+            })
+        .then();
+  }
+
+  /** {@code true} iff the table's properties contain the literal opt-in value {@code "true"}. */
+  private boolean isOptedIn(TableDto saved) {
+    Map<String, String> props = saved.getTableProperties();
+    return props != null && "true".equals(props.get(OPT_IN_PROPERTY));
+  }
+
+  /** Build the wire body. Missing summary keys default to 0L. */
+  OptimizerStatsRequest buildRequest(TableDto saved, Map<String, String> summary) {
+    OptimizerStatsRequest.Snapshot snapshot =
+        OptimizerStatsRequest.Snapshot.builder()
+            .tableVersion(saved.getTableVersion())
+            .tableLocation(saved.getTableLocation())
+            .tableSizeBytes(longOrZero(summary, SUMMARY_TOTAL_FILES_SIZE))
+            .numCurrentFiles(longOrZero(summary, SUMMARY_TOTAL_DATA_FILES))
+            .build();
+    OptimizerStatsRequest.Delta delta =
+        OptimizerStatsRequest.Delta.builder()
+            .numFilesAdded(longOrZero(summary, SUMMARY_ADDED_DATA_FILES))
+            .numFilesDeleted(longOrZero(summary, SUMMARY_DELETED_DATA_FILES))
+            .addedSizeBytes(longOrZero(summary, SUMMARY_ADDED_FILES_SIZE))
+            .deletedSizeBytes(longOrZero(summary, SUMMARY_REMOVED_FILES_SIZE))
+            .build();
+    return OptimizerStatsRequest.builder()
+        .databaseName(saved.getDatabaseId())
+        .tableName(saved.getTableId())
+        .stats(OptimizerStatsRequest.Stats.builder().snapshot(snapshot).delta(delta).build())
+        .tableProperties(saved.getTableProperties())
+        .build();
+  }
+
+  /**
+   * Retryable errors: per-attempt timeout, network-level failures, 5xx, 408, 429. Other 4xx are
+   * client errors — retrying won't fix them.
+   */
+  boolean isRetryable(Throwable e) {
+    if (e instanceof TimeoutException) {
+      return true;
+    }
+    if (e instanceof WebClientRequestException) {
+      return true;
+    }
+    if (e instanceof WebClientResponseException) {
+      int code = ((WebClientResponseException) e).getStatusCode().value();
+      return code == 408 || code == 429 || (code >= 500 && code < 600);
+    }
+    return false;
+  }
+
+  /**
+   * Map a final-stage error to one of {@code success, timeout, network_error, server_error,
+   * client_error, unknown_error} for the {@code outcome} tag on duration / failed_final metrics.
+   */
+  private String classifyOutcome(Throwable e) {
+    if (e instanceof TimeoutException) {
+      return "timeout";
+    }
+    if (e instanceof WebClientRequestException) {
+      return "network_error";
+    }
+    if (e instanceof WebClientResponseException) {
+      int code = ((WebClientResponseException) e).getStatusCode().value();
+      if (code >= 500 && code < 600) {
+        return "server_error";
+      }
+      if (code >= 400 && code < 500) {
+        return "client_error";
+      }
+    }
+    return "unknown_error";
+  }
+
+  private static long longOrZero(Map<String, String> m, String key) {
+    String v = m.get(key);
+    if (v == null) {
+      return 0L;
+    }
+    try {
+      return Long.parseLong(v);
+    } catch (NumberFormatException nfe) {
+      return 0L;
+    }
+  }
+}
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
new file mode 100644
index 000000000..bbddbce6b
--- /dev/null
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
@@ -0,0 +1,35 @@
+package com.linkedin.openhouse.tables.services.optimizer;
+
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
+import org.springframework.boot.context.properties.EnableConfigurationProperties;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.http.client.reactive.ReactorClientHttpConnector;
+import org.springframework.web.reactive.function.client.WebClient;
+import reactor.netty.http.client.HttpClient;
+
+/**
+ * Wiring for the post-commit Tables → Optimizer stats push. Only active when {@code
+ * optimizer.stats.enabled=true}; otherwise no WebClient or client bean is constructed and the
+ * on-commit hook in {@code IcebergSnapshotsServiceImpl} is a no-op.
+ */
+@Configuration
+@EnableConfigurationProperties(OptimizerStatsProperties.class)
+@ConditionalOnProperty(prefix = "optimizer.stats", name = "enabled", havingValue = "true")
+public class OptimizerStatsConfig {
+
+  /**
+   * Dedicated WebClient for the optimizer stats endpoint. Per-attempt and outer timeouts are
+   * applied at the call site on the Reactor chain in {@link OptimizerStatsClient} — they are not
+   * configured on the underlying Netty client so that the timeout always emerges as a standard
+   * {@link java.util.concurrent.TimeoutException} (not a Netty {@code ReadTimeoutException}), which
+   * keeps the client's outcome classification simple.
+   */
+  @Bean("optimizerStatsWebClient")
+  public WebClient optimizerStatsWebClient(OptimizerStatsProperties properties) {
+    return WebClient.builder()
+        .baseUrl(properties.getBaseUri())
+        .clientConnector(new ReactorClientHttpConnector(HttpClient.create()))
+        .build();
+  }
+}
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
new file mode 100644
index 000000000..7d746ee93
--- /dev/null
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
@@ -0,0 +1,43 @@
+package com.linkedin.openhouse.tables.services.optimizer;
+
+import lombok.Getter;
+import lombok.Setter;
+import org.springframework.boot.context.properties.ConfigurationProperties;
+
+/**
+ * Configuration for the post-commit stats push from the Tables Service to the Optimizer's stats
+ * endpoint. Property prefix: {@code optimizer.stats}.
+ *
+ * <p>When {@code enabled} is false (the default), no client bean is constructed and the push is a
+ * no-op on every commit. Environments that want the feed turn it on and set {@link #baseUri}.
+ */
+@ConfigurationProperties("optimizer.stats")
+@Getter
+@Setter
+public class OptimizerStatsProperties {
+
+  /** Master switch. When {@code false}, no HTTP push is attempted and no client bean is wired. */
+  private boolean enabled = false;
+
+  /**
+   * Base URI of the Optimizer service. Path {@code /v1/optimizer/stats/{tableUuid}} is appended at
+   * call time. No default — required when {@link #enabled} is {@code true}.
+   */
+  private String baseUri;
+
+  /** Per-attempt request timeout in milliseconds. Default 1000. */
+  private long perAttemptTimeoutMs = 1000L;
+
+  /**
+   * Hard ceiling on total wall-clock time for the entire call (including retries) in milliseconds.
+   * Default 2000. When the outer timeout fires, the chain is cancelled and the error is swallowed.
+   */
+  private long totalTimeoutMs = 2000L;
+
+  /**
+   * Total attempt count (initial try plus retries). Default 3. With 1000 ms per attempt and a 2000
+   * ms outer ceiling, only about two attempts realistically fit on a slow path; the third is
+   * available if attempts return quickly.
+   */
+  private int maxAttempts = 3;
+}
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
new file mode 100644
index 000000000..68a8646da
--- /dev/null
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
@@ -0,0 +1,70 @@
+package com.linkedin.openhouse.tables.services.optimizer;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+import java.util.Map;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * Wire body for {@code PUT /v1/optimizer/stats/{tableUuid}}. Mirrors the optimizer's {@code
+ * UpsertTableStatsRequest} field-for-field — see {@code
+ * services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/UpsertTableStatsRequest.java}.
+ *
+ * <p>Tables service owns its own copy so that the wire contract is explicit at the call site and
+ * the optimizer client jar is not a compile-time dependency.
+ */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class OptimizerStatsRequest {
+
+  private String databaseName;
+  private String tableName;
+  private Stats stats;
+  private Map<String, String> tableProperties;
+
+  @Data
+  @Builder
+  @NoArgsConstructor
+  @AllArgsConstructor
+  @JsonInclude(JsonInclude.Include.NON_NULL)
+  public static class Stats {
+    private Snapshot snapshot;
+    private Delta delta;
+  }
+
+  /**
+   * Point-in-time snapshot metrics. Map to optimizer's {@code
+   * TableStatsPayload.SnapshotMetricsDto}.
+   */
+  @Data
+  @Builder
+  @NoArgsConstructor
+  @AllArgsConstructor
+  @JsonInclude(JsonInclude.Include.NON_NULL)
+  public static class Snapshot {
+    private String tableVersion;
+    private String tableLocation;
+    private Long tableSizeBytes;
+    private Long numCurrentFiles;
+  }
+
+  /**
+   * Per-commit incremental counters. Map to optimizer's {@code TableStatsPayload.CommitDeltaDto}.
+   */
+  @Data
+  @Builder
+  @NoArgsConstructor
+  @AllArgsConstructor
+  @JsonInclude(JsonInclude.Include.NON_NULL)
+  public static class Delta {
+    private Long numFilesAdded;
+    private Long numFilesDeleted;
+    private Long addedSizeBytes;
+    private Long deletedSizeBytes;
+  }
+}
diff --git a/services/tables/src/main/resources/application.properties b/services/tables/src/main/resources/application.properties
index 4a6c82a89..ddf4c3022 100644
--- a/services/tables/src/main/resources/application.properties
+++ b/services/tables/src/main/resources/application.properties
@@ -23,4 +23,12 @@ management.metrics.distribution.maximum-expected-value.catalog_metadata_retrieva
 management.metrics.distribution.maximum-expected-value.catalog_metadata_update_latency=600s
 management.metrics.distribution.maximum-expected-value.http.server.requests=600s
 server.shutdown=graceful
-spring.lifecycle.timeout-per-shutdown-phase=60s
\ No newline at end of file
+spring.lifecycle.timeout-per-shutdown-phase=60s
+
+# Post-commit Tables -> Optimizer stats push.
+# Disabled by default; environments that run the optimizer opt in via env vars.
+optimizer.stats.enabled=${OPTIMIZER_STATS_ENABLED:false}
+optimizer.stats.base-uri=${OPTIMIZER_STATS_BASE_URI:}
+optimizer.stats.per-attempt-timeout-ms=${OPTIMIZER_STATS_PER_ATTEMPT_TIMEOUT_MS:1000}
+optimizer.stats.total-timeout-ms=${OPTIMIZER_STATS_TOTAL_TIMEOUT_MS:2000}
+optimizer.stats.max-attempts=${OPTIMIZER_STATS_MAX_ATTEMPTS:3}
\ No newline at end of file
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java
index 44467b705..80f9233af 100644
--- a/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java
@@ -15,7 +15,10 @@
 import com.linkedin.openhouse.tables.model.TableDtoPrimaryKey;
 import com.linkedin.openhouse.tables.repository.OpenHouseInternalRepository;
 import com.linkedin.openhouse.tables.services.IcebergSnapshotsService;
+import com.linkedin.openhouse.tables.services.optimizer.OptimizerStatsClient;
 import com.linkedin.openhouse.tables.utils.TableUUIDGenerator;
+import java.util.Collections;
+import java.util.Map;
 import java.util.Optional;
 import java.util.UUID;
 import org.apache.iceberg.exceptions.BadRequestException;
@@ -45,6 +48,14 @@ public class IcebergSnapshotsServiceTest {
 
   @MockBean private TableUUIDGenerator tableUUIDGenerator;
 
+  /**
+   * Forces the optional optimizer-stats client into the service so that we can verify the
+   * post-commit push is invoked. Production wiring is conditional on {@code
+   * optimizer.stats.enabled=true}; this {@code @MockBean} bypasses that condition for the one test
+   * that exercises the hook.
+   */
+  @MockBean private OptimizerStatsClient optimizerStatsClient;
+
   private OpenHouseInternalRepository mockRepository;
 
   @Captor ArgumentCaptor<TableDto> tableDtoArgumentCaptor;
@@ -90,6 +101,32 @@ public void testTableCreated() {
     verifyCalls(key, TEST_TABLE_CREATOR, requestBody.getCreateUpdateTableRequestBody());
   }
 
+  @Test
+  public void testOptimizerStatsClientInvokedAfterSuccessfulCommit() {
+    final IcebergSnapshotsRequestBody requestBody =
+        TEST_ICEBERG_SNAPSHOTS_INITIAL_VERSION_REQUEST_BODY;
+    final String dbId = requestBody.getCreateUpdateTableRequestBody().getDatabaseId();
+    final String tableId = requestBody.getCreateUpdateTableRequestBody().getTableId();
+    final TableDtoPrimaryKey key =
+        TableDtoPrimaryKey.builder().databaseId(dbId).tableId(tableId).build();
+    final Map<String, String> summary = Collections.singletonMap("total-data-files", "7");
+    final TableDto savedDto =
+        TableDto.builder()
+            .databaseId(dbId)
+            .tableId(tableId)
+            .currentSnapshotSummary(summary)
+            .build();
+
+    Mockito.when(tableUUIDGenerator.generateUUID(Mockito.any(IcebergSnapshotsRequestBody.class)))
+        .thenReturn(UUID.randomUUID());
+    Mockito.when(mockRepository.findById(key)).thenReturn(Optional.empty());
+    Mockito.when(mockRepository.save(Mockito.any(TableDto.class))).thenReturn(savedDto);
+
+    service.putIcebergSnapshots(dbId, tableId, requestBody, TEST_TABLE_CREATOR);
+
+    Mockito.verify(optimizerStatsClient, Mockito.times(1)).report(savedDto, summary);
+  }
+
   @Test
   public void testPutTableExceptionHandling() {
     final IcebergSnapshotsRequestBody requestBody =
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/model/TableDtoMappingTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/model/TableDtoMappingTest.java
index e3e19a7d8..64aefd19b 100644
--- a/services/tables/src/test/java/com/linkedin/openhouse/tables/model/TableDtoMappingTest.java
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/model/TableDtoMappingTest.java
@@ -33,7 +33,8 @@ public class TableDtoMappingTest {
           "jsonSnapshots",
           "snapshotRefs",
           "policies",
-          "tableType");
+          "tableType",
+          "currentSnapshotSummary");
 
   /** Making all fields making it to map is expected, and all expected field are making it there. */
   @Test
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClientTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClientTest.java
new file mode 100644
index 000000000..33ea0d197
--- /dev/null
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClientTest.java
@@ -0,0 +1,242 @@
+package com.linkedin.openhouse.tables.services.optimizer;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.linkedin.openhouse.common.metrics.MetricsConstant;
+import com.linkedin.openhouse.tables.model.TableDto;
+import io.micrometer.core.instrument.MeterRegistry;
+import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import okhttp3.mockwebserver.MockResponse;
+import okhttp3.mockwebserver.MockWebServer;
+import okhttp3.mockwebserver.RecordedRequest;
+import okhttp3.mockwebserver.SocketPolicy;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.http.client.reactive.ReactorClientHttpConnector;
+import org.springframework.web.reactive.function.client.WebClient;
+import reactor.netty.http.client.HttpClient;
+
+class OptimizerStatsClientTest {
+
+  private static final String TABLE_UUID = "uuid-1";
+  private static final String DB = "db1";
+  private static final String TABLE = "tbl1";
+  private static final Duration BLOCK_MAX = Duration.ofSeconds(5);
+
+  private MockWebServer server;
+  private MeterRegistry meterRegistry;
+  private OptimizerStatsProperties properties;
+  private OptimizerStatsClient client;
+  private final ObjectMapper mapper = new ObjectMapper();
+
+  @BeforeEach
+  void setUp() throws IOException {
+    server = new MockWebServer();
+    server.start();
+    meterRegistry = new SimpleMeterRegistry();
+    properties = new OptimizerStatsProperties();
+    properties.setEnabled(true);
+    properties.setBaseUri(server.url("/").toString());
+    properties.setPerAttemptTimeoutMs(1000L);
+    properties.setTotalTimeoutMs(2000L);
+    properties.setMaxAttempts(3);
+    client = new OptimizerStatsClient(buildWebClient(), meterRegistry, properties);
+  }
+
+  @AfterEach
+  void tearDown() throws IOException {
+    server.shutdown();
+  }
+
+  private WebClient buildWebClient() {
+    HttpClient httpClient = HttpClient.create();
+    return WebClient.builder()
+        .baseUrl(properties.getBaseUri())
+        .clientConnector(new ReactorClientHttpConnector(httpClient))
+        .build();
+  }
+
+  @Test
+  void report_optedIn_sendsRequestWithPayloadFieldsFromSummary() throws Exception {
+    server.enqueue(new MockResponse().setResponseCode(200));
+
+    TableDto saved =
+        optedInBuilder()
+            .tableVersion("v3")
+            .tableLocation("/data/tables/db1/tbl1")
+            .currentSnapshotSummary(summary())
+            .build();
+
+    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+
+    RecordedRequest req = server.takeRequest(2, TimeUnit.SECONDS);
+    assertThat(req).isNotNull();
+    assertThat(req.getMethod()).isEqualTo("PUT");
+    assertThat(req.getPath()).isEqualTo("/v1/optimizer/stats/" + TABLE_UUID);
+
+    @SuppressWarnings("unchecked")
+    Map<String, Object> body = mapper.readValue(req.getBody().readUtf8(), Map.class);
+    assertThat(body).containsEntry("databaseName", DB).containsEntry("tableName", TABLE);
+    @SuppressWarnings("unchecked")
+    Map<String, Object> stats = (Map<String, Object>) body.get("stats");
+    @SuppressWarnings("unchecked")
+    Map<String, Object> snapshot = (Map<String, Object>) stats.get("snapshot");
+    @SuppressWarnings("unchecked")
+    Map<String, Object> delta = (Map<String, Object>) stats.get("delta");
+    assertThat(snapshot)
+        .containsEntry("tableVersion", "v3")
+        .containsEntry("tableLocation", "/data/tables/db1/tbl1")
+        .containsEntry("tableSizeBytes", 4096)
+        .containsEntry("numCurrentFiles", 12);
+    assertThat(delta)
+        .containsEntry("numFilesAdded", 5)
+        .containsEntry("numFilesDeleted", 2)
+        .containsEntry("addedSizeBytes", 2048)
+        .containsEntry("deletedSizeBytes", 1024);
+
+    assertThat(
+            meterRegistry
+                .counter(MetricsConstant.OPTIMIZER_STATS_ATTEMPTS, "outcome", "success")
+                .count())
+        .isEqualTo(1.0);
+  }
+
+  @Test
+  void report_notOptedIn_skipsCallAndIncrementsSkippedCounter() {
+    TableDto saved = builderWithoutOptIn().currentSnapshotSummary(summary()).build();
+
+    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+
+    assertThat(server.getRequestCount()).isZero();
+    assertThat(
+            meterRegistry
+                .counter(MetricsConstant.OPTIMIZER_STATS_SKIPPED, "reason", "opt_out")
+                .count())
+        .isEqualTo(1.0);
+  }
+
+  @Test
+  void report_noSnapshot_skipsCallAndIncrementsSkippedCounter() {
+    TableDto saved = optedInBuilder().currentSnapshotSummary(null).build();
+
+    client.reportAsync(saved, null).block(BLOCK_MAX);
+
+    assertThat(server.getRequestCount()).isZero();
+    assertThat(
+            meterRegistry
+                .counter(MetricsConstant.OPTIMIZER_STATS_SKIPPED, "reason", "no_snapshot")
+                .count())
+        .isEqualTo(1.0);
+  }
+
+  @Test
+  void report_5xxThenSuccess_retriesAndSucceeds() {
+    server.enqueue(new MockResponse().setResponseCode(503));
+    server.enqueue(new MockResponse().setResponseCode(200));
+
+    TableDto saved = optedInBuilder().currentSnapshotSummary(summary()).build();
+    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+
+    assertThat(server.getRequestCount()).isEqualTo(2);
+    assertThat(
+            meterRegistry
+                .counter(MetricsConstant.OPTIMIZER_STATS_ATTEMPTS, "outcome", "success")
+                .count())
+        .isEqualTo(1.0);
+    assertThat(
+            meterRegistry
+                .counter(MetricsConstant.OPTIMIZER_STATS_ATTEMPTS, "outcome", "retryable_failure")
+                .count())
+        .isEqualTo(1.0);
+  }
+
+  @Test
+  void report_allAttemptsFail_swallowsAndIncrementsFailedFinal() {
+    properties.setMaxAttempts(2);
+    server.enqueue(new MockResponse().setResponseCode(503));
+    server.enqueue(new MockResponse().setResponseCode(503));
+
+    TableDto saved = optedInBuilder().currentSnapshotSummary(summary()).build();
+    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+
+    assertThat(server.getRequestCount()).isEqualTo(2);
+    assertThat(
+            meterRegistry
+                .counter(MetricsConstant.OPTIMIZER_STATS_FAILED_FINAL, "outcome", "server_error")
+                .count())
+        .isEqualTo(1.0);
+  }
+
+  @Test
+  void report_4xxNonRetryable_doesNotRetry() {
+    server.enqueue(new MockResponse().setResponseCode(400));
+
+    TableDto saved = optedInBuilder().currentSnapshotSummary(summary()).build();
+    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+
+    assertThat(server.getRequestCount()).isEqualTo(1);
+    assertThat(
+            meterRegistry
+                .counter(MetricsConstant.OPTIMIZER_STATS_FAILED_FINAL, "outcome", "client_error")
+                .count())
+        .isEqualTo(1.0);
+  }
+
+  @Test
+  void report_perAttemptTimeoutTrips_recordsTimeout() {
+    properties.setPerAttemptTimeoutMs(150L);
+    properties.setTotalTimeoutMs(600L);
+    properties.setMaxAttempts(1);
+    client = new OptimizerStatsClient(buildWebClient(), meterRegistry, properties);
+    // Connection opens but the server never sends any response — forces Reactor's per-attempt
+    // .timeout(150ms) to fire with TimeoutException.
+    server.enqueue(new MockResponse().setSocketPolicy(SocketPolicy.NO_RESPONSE));
+
+    TableDto saved = optedInBuilder().currentSnapshotSummary(summary()).build();
+    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+
+    assertThat(
+            meterRegistry
+                .counter(MetricsConstant.OPTIMIZER_STATS_FAILED_FINAL, "outcome", "timeout")
+                .count())
+        .isEqualTo(1.0);
+  }
+
+  // ---- helpers ----
+
+  private static Map<String, String> summary() {
+    Map<String, String> s = new HashMap<>();
+    s.put("total-data-files", "12");
+    s.put("total-files-size", "4096");
+    s.put("added-data-files", "5");
+    s.put("deleted-data-files", "2");
+    s.put("added-files-size", "2048");
+    s.put("removed-files-size", "1024");
+    return s;
+  }
+
+  private static TableDto.TableDtoBuilder optedInBuilder() {
+    Map<String, String> props = new HashMap<>();
+    props.put(OptimizerStatsClient.OPT_IN_PROPERTY, "true");
+    return TableDto.builder()
+        .tableUUID(TABLE_UUID)
+        .databaseId(DB)
+        .tableId(TABLE)
+        .tableProperties(props);
+  }
+
+  private static TableDto.TableDtoBuilder builderWithoutOptIn() {
+    return TableDto.builder()
+        .tableUUID(TABLE_UUID)
+        .databaseId(DB)
+        .tableId(TABLE)
+        .tableProperties(new HashMap<>());
+  }
+}

From af025aef96153de3852408e78bc0f2d8a43c5aef Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 27 May 2026 14:16:30 -0700
Subject: [PATCH 10/14] refactor(tables): generic PostCommitOperation
 framework; address review

Restructures the post-commit stats push around a small generic dispatcher
so async/timeout/swallow plumbing lives in one place. Addresses the
review comments on PR #608.

Framework (new):
- PostCommitOperation interface: String name() + Optional<Mono<Void>>
  prepare(TableDto). Implementations describe what to do; the dispatcher
  owns how it runs.
- PostCommitDispatcher (@ConditionalOnProperty tables.postcommit.enabled):
  per-op wall-clock timeout, error swallow, metric emission, fire-and-
  forget subscribe. Exposes package-private decorate() for synchronous
  testing.
- PostCommitProperties: enabled (default false), per-op-timeout-ms
  (default 3000).
- Metric keys renamed: POSTCOMMIT_OP_DURATION / POSTCOMMIT_OP_SKIPPED /
  POSTCOMMIT_OP_FAILED, tagged op={name} and outcome={success, timeout,
  network_error, server_error, client_error, prepare_threw,
  unknown_error}.

Why async: a synchronous post-commit push converts an optimizer outage
into a tables write outage. Stats are a best-effort scheduling signal
and must not block the write path. Crash-loss is bounded because state
is cumulative: the next commit re-pushes the current state.

Operation (refactor):
- OptimizerStatsClient -> OptimizerStatsPostCommitOperation implementing
  PostCommitOperation. Drops outer timeout, onErrorResume, .subscribe()
  (dispatcher owns these). Keeps per-attempt timeout + bounded retry on
  retryable errors only.
- Adds snapshotId to OptimizerStatsRequest.Snapshot; tracked for
  server-side idempotency wiring in BDP-102985.
- Path constant comment now cites TableStatsController.TABLE_PATH_TEMPLATE
  as the canonical source.

TableDto:
- currentSnapshotSummary (Map) replaced by Optional<CurrentSnapshotInfo>
  (snapshotId + summary). Stored nullable internally; consumers read
  through getCurrentSnapshot() which returns Optional. Javadoc rewritten
  to describe the semantic condition (presence ~ table has a committed
  snapshot), not the private impl that populates it.
- New CurrentSnapshotInfo value class.
- InternalRepositoryUtils populates the new field with both snapshotId
  and summary; no extra I/O (still purely in-memory).

Service:
- IcebergSnapshotsServiceImpl: Optional<OptimizerStatsClient> replaced
  by Optional<PostCommitDispatcher>; on-commit hook is one line.

Optimizer side:
- TableStatsController publishes BASE_PATH and TABLE_PATH_TEMPLATE as
  public constants; @RequestMapping refactored to use BASE_PATH.

Config:
- application.properties: drop optimizer.stats.total-timeout-ms; add
  tables.postcommit.enabled + tables.postcommit.per-op-timeout-ms.

Tests:
- New PostCommitDispatcherTest (6 cases, no sleeps; uses decorate() to
  block synchronously).
- OptimizerStatsClientTest renamed -> OptimizerStatsPostCommitOperationTest
  (6 cases adapted to the new prepare() contract; timeout case moved to
  PostCommitDispatcherTest).
- IcebergSnapshotsServiceTest swaps @MockBean OptimizerStatsClient for
  @MockBean PostCommitDispatcher and verifies dispatch(savedDto) once.
- TableDtoMappingTest updated for the renamed field.
- All :services:tables and :services:optimizer tests pass; spotless
  clean.

Filed: BDP-102985 (Optimizer stats: use snapshot ID as idempotency
token on upsert) under epic BDP-102026.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../common/metrics/MetricsConstant.java       |  12 +-
 .../api/controller/TableStatsController.java  |  14 +-
 .../tables/dto/mapper/TablesMapper.java       |   4 +-
 .../tables/model/CurrentSnapshotInfo.java     |  33 +++
 .../openhouse/tables/model/TableDto.java      |  24 +-
 .../impl/InternalRepositoryUtils.java         |  10 +-
 .../services/IcebergSnapshotsServiceImpl.java |  15 +-
 .../optimizer/OptimizerStatsClient.java       | 245 ------------------
 .../optimizer/OptimizerStatsConfig.java       |   9 +-
 .../OptimizerStatsPostCommitOperation.java    | 173 +++++++++++++
 .../optimizer/OptimizerStatsProperties.java   |  15 +-
 .../optimizer/OptimizerStatsRequest.java      |   6 +
 .../postcommit/PostCommitDispatcher.java      | 147 +++++++++++
 .../postcommit/PostCommitOperation.java       |  39 +++
 .../postcommit/PostCommitProperties.java      |  31 +++
 .../src/main/resources/application.properties |  10 +-
 .../service/IcebergSnapshotsServiceTest.java  |  29 +--
 .../tables/model/TableDtoMappingTest.java     |   2 +-
 ...ptimizerStatsPostCommitOperationTest.java} | 145 ++++-------
 .../postcommit/PostCommitDispatcherTest.java  | 177 +++++++++++++
 20 files changed, 747 insertions(+), 393 deletions(-)
 create mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java
 delete mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClient.java
 create mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java
 create mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java
 create mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java
 create mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java
 rename services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/{OptimizerStatsClientTest.java => OptimizerStatsPostCommitOperationTest.java} (53%)
 create mode 100644 services/tables/src/test/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcherTest.java

diff --git a/services/common/src/main/java/com/linkedin/openhouse/common/metrics/MetricsConstant.java b/services/common/src/main/java/com/linkedin/openhouse/common/metrics/MetricsConstant.java
index 7a47b790e..1c7945f9c 100644
--- a/services/common/src/main/java/com/linkedin/openhouse/common/metrics/MetricsConstant.java
+++ b/services/common/src/main/java/com/linkedin/openhouse/common/metrics/MetricsConstant.java
@@ -64,9 +64,11 @@ private MetricsConstant() {}
   public static final String HTS_LIST_TABLES_TIME = "hts_list_tables_time";
   public static final String HTS_SEARCH_TABLES_TIME = "hts_search_tables_time";
 
-  // Optimizer post-commit stats push (Tables → Optimizer)
-  public static final String OPTIMIZER_STATS_DURATION = "optimizer_stats_duration";
-  public static final String OPTIMIZER_STATS_ATTEMPTS = "optimizer_stats_attempts";
-  public static final String OPTIMIZER_STATS_SKIPPED = "optimizer_stats_skipped";
-  public static final String OPTIMIZER_STATS_FAILED_FINAL = "optimizer_stats_failed_final";
+  // Tables post-commit operation framework (bounded, best-effort actions after commit).
+  // Tagged with op={operation name}; on failure additionally tagged with outcome={timeout,
+  // network_error, server_error, client_error, prepare_threw, unknown_error}.
+  // The duration timer is bounded by tables.postcommit.per-op-timeout-ms.
+  public static final String POSTCOMMIT_OP_DURATION = "postcommit_op_duration";
+  public static final String POSTCOMMIT_OP_SKIPPED = "postcommit_op_skipped";
+  public static final String POSTCOMMIT_OP_FAILED = "postcommit_op_failed";
 }
diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableStatsController.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableStatsController.java
index b119dd1c7..87fbfa709 100644
--- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableStatsController.java
+++ b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableStatsController.java
@@ -24,10 +24,22 @@
 
 /** REST controller for managing per-table stats in the optimizer DB. */
 @RestController
-@RequestMapping("/v1/optimizer/stats")
+@RequestMapping(TableStatsController.BASE_PATH)
 @RequiredArgsConstructor
 public class TableStatsController {
 
+  /**
+   * Base path for the stats endpoints. Single source of truth — external callers must reference.
+   */
+  public static final String BASE_PATH = "/v1/optimizer/stats";
+
+  /**
+   * URI template (with {@code {tableUuid}} placeholder) for the per-table stats upsert/fetch
+   * endpoint. Use this from any client that calls the optimizer over HTTP, rather than rebuilding
+   * the path inline.
+   */
+  public static final String TABLE_PATH_TEMPLATE = BASE_PATH + "/{tableUuid}";
+
   private final OptimizerDataService service;
 
   /**
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/dto/mapper/TablesMapper.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/dto/mapper/TablesMapper.java
index 08111cd82..9315df9d8 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/dto/mapper/TablesMapper.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/dto/mapper/TablesMapper.java
@@ -73,6 +73,7 @@ public interface TablesMapper {
     @Mapping(source = "requestBody.sortOrder", target = "sortOrder"),
     @Mapping(target = "lastModifiedTime", ignore = true),
     @Mapping(target = "creationTime", ignore = true),
+    @Mapping(target = "currentSnapshot", ignore = true)
   })
   TableDto toTableDto(TableDto tableDto, CreateUpdateTableRequestBody requestBody);
 
@@ -120,7 +121,8 @@ public interface TablesMapper {
         defaultExpression = "java(TableType.PRIMARY_TABLE)"),
     @Mapping(source = "requestBody.createUpdateTableRequestBody.sortOrder", target = "sortOrder"),
     @Mapping(target = "lastModifiedTime", ignore = true),
-    @Mapping(target = "creationTime", ignore = true)
+    @Mapping(target = "creationTime", ignore = true),
+    @Mapping(target = "currentSnapshot", ignore = true)
   })
   TableDto toTableDto(TableDto tableDto, IcebergSnapshotsRequestBody requestBody);
 
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java
new file mode 100644
index 000000000..9f363b4ec
--- /dev/null
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java
@@ -0,0 +1,33 @@
+package com.linkedin.openhouse.tables.model;
+
+import java.util.Map;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+/**
+ * In-memory snapshot of the Iceberg current-snapshot metadata that was loaded alongside a {@link
+ * TableDto}. Carries only fields already materialized by the catalog client — never triggers a
+ * separate HDFS / object-store read.
+ *
+ * <p>Present whenever the table has at least one committed snapshot at construction time; absent
+ * (modeled as {@link java.util.Optional#empty()} on {@link TableDto}) when the table has no
+ * committed data — e.g. a {@code CREATE TABLE} with no rows yet.
+ */
+@Getter
+@Builder
+@AllArgsConstructor
+@EqualsAndHashCode
+public class CurrentSnapshotInfo {
+
+  /** Iceberg snapshot ID (decimal long). Stable per commit; usable as an idempotency token. */
+  private final long snapshotId;
+
+  /**
+   * Iceberg {@code Snapshot.summary()} map, unmodified. Keys include {@code total-data-files},
+   * {@code total-files-size}, {@code added-data-files}, {@code deleted-data-files}, {@code
+   * added-files-size}, {@code removed-files-size}.
+   */
+  private final Map<String, String> summary;
+}
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
index 24373f4a1..16608f286 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
@@ -11,6 +11,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import javax.persistence.Convert;
 import javax.persistence.ElementCollection;
 import javax.persistence.Entity;
@@ -85,16 +86,21 @@ public class TableDto {
   private boolean replaceCommit;
 
   /**
-   * Iceberg {@code Snapshot.summary()} for the current snapshot at the time the {@code TableDto}
-   * was constructed (post-commit on save; post-load on read). Populated only when an Iceberg {@code
-   * Table} is available — i.e. by {@code InternalRepositoryUtils.convertToTableDto}. Not persisted,
-   * not part of equality.
-   *
-   * <p>Used downstream by the optimizer post-commit stats push (see {@code
-   * services.optimizer.OptimizerStatsClient}) so that the service layer can read snapshot stats
-   * without a separate HDFS round-trip.
+   * In-memory current-snapshot metadata captured when this {@code TableDto} was built from an
+   * Iceberg {@code Table}. Present whenever the underlying table has at least one committed
+   * snapshot at that point; absent for tables with no committed data (e.g. {@code CREATE TABLE}
+   * with no rows). Not persisted, not part of equality. Stored nullable internally; consumers must
+   * read through {@link #getCurrentSnapshot()} to get the {@link Optional}.
    */
-  @Transient @EqualsAndHashCode.Exclude private Map<String, String> currentSnapshotSummary;
+  @Getter(AccessLevel.NONE)
+  @Transient
+  @EqualsAndHashCode.Exclude
+  private CurrentSnapshotInfo currentSnapshot;
+
+  /** Returns the current-snapshot metadata if any, else {@link Optional#empty()}. */
+  public Optional<CurrentSnapshotInfo> getCurrentSnapshot() {
+    return Optional.ofNullable(currentSnapshot);
+  }
 
   /**
    * Bundling eligible string type field into a map as {@link org.mapstruct.Mapper} doesn't provide
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/InternalRepositoryUtils.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/InternalRepositoryUtils.java
index 9ba40c2d8..d3ced5458 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/InternalRepositoryUtils.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/InternalRepositoryUtils.java
@@ -9,6 +9,7 @@
 import com.linkedin.openhouse.tables.dto.mapper.iceberg.PartitionSpecMapper;
 import com.linkedin.openhouse.tables.dto.mapper.iceberg.PoliciesSpecMapper;
 import com.linkedin.openhouse.tables.dto.mapper.iceberg.TableTypeMapper;
+import com.linkedin.openhouse.tables.model.CurrentSnapshotInfo;
 import com.linkedin.openhouse.tables.model.TableDto;
 import com.linkedin.openhouse.tables.repository.PreservedKeyChecker;
 import java.net.URI;
@@ -135,8 +136,13 @@ static TableDto convertToTableDto(
             .jsonSnapshots(null)
             .tableProperties(megaProps)
             .sortOrder(SortOrderParser.toJson(table.sortOrder()))
-            .currentSnapshotSummary(
-                table.currentSnapshot() == null ? null : table.currentSnapshot().summary())
+            .currentSnapshot(
+                table.currentSnapshot() == null
+                    ? null
+                    : CurrentSnapshotInfo.builder()
+                        .snapshotId(table.currentSnapshot().snapshotId())
+                        .summary(table.currentSnapshot().summary())
+                        .build())
             .build();
 
     return tableDto;
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java
index bb2cbde4c..6623b5f37 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/IcebergSnapshotsServiceImpl.java
@@ -10,7 +10,7 @@
 import com.linkedin.openhouse.tables.model.TableDto;
 import com.linkedin.openhouse.tables.model.TableDtoPrimaryKey;
 import com.linkedin.openhouse.tables.repository.OpenHouseInternalRepository;
-import com.linkedin.openhouse.tables.services.optimizer.OptimizerStatsClient;
+import com.linkedin.openhouse.tables.services.postcommit.PostCommitDispatcher;
 import com.linkedin.openhouse.tables.utils.AuthorizationUtils;
 import com.linkedin.openhouse.tables.utils.TableUUIDGenerator;
 import java.util.Optional;
@@ -34,11 +34,11 @@ public class IcebergSnapshotsServiceImpl implements IcebergSnapshotsService {
   @Autowired AuthorizationUtils authorizationUtils;
 
   /**
-   * Present only when {@code optimizer.stats.enabled=true}. When absent, no post-commit push is
-   * attempted.
+   * Present only when {@code tables.postcommit.enabled=true}. When absent, the on-commit hook is a
+   * literal no-op and no post-commit operations run.
    */
   @Autowired(required = false)
-  Optional<OptimizerStatsClient> optimizerStatsClient = Optional.empty();
+  Optional<PostCommitDispatcher> postCommitDispatcher = Optional.empty();
 
   @Override
   public Pair<TableDto, Boolean> putIcebergSnapshots(
@@ -92,12 +92,7 @@ public Pair<TableDto, Boolean> putIcebergSnapshots(
     }
     try {
       TableDto savedDto = openHouseInternalRepository.save(tableDtoToSave);
-      // Fire-and-forget push of the post-commit snapshot summary to the optimizer. Returns
-      // immediately; failures are swallowed inside the client. Skipped at the client when the
-      // table is not opted in via maintenance.optimizer.stats.enabled or when there is no current
-      // snapshot (e.g. CREATE TABLE without a data commit).
-      optimizerStatsClient.ifPresent(
-          client -> client.report(savedDto, savedDto.getCurrentSnapshotSummary()));
+      postCommitDispatcher.ifPresent(d -> d.dispatch(savedDto));
       return Pair.of(savedDto, !tableDto.isPresent());
     } catch (BadRequestException e) {
       throw new RequestValidationFailureException(e.getMessage(), e);
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClient.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClient.java
deleted file mode 100644
index 23c69082e..000000000
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClient.java
+++ /dev/null
@@ -1,245 +0,0 @@
-package com.linkedin.openhouse.tables.services.optimizer;
-
-import com.linkedin.openhouse.common.metrics.MetricsConstant;
-import com.linkedin.openhouse.tables.model.TableDto;
-import io.micrometer.core.instrument.MeterRegistry;
-import io.micrometer.core.instrument.Timer;
-import java.time.Duration;
-import java.util.Map;
-import java.util.concurrent.TimeoutException;
-import lombok.extern.slf4j.Slf4j;
-import org.springframework.beans.factory.annotation.Qualifier;
-import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
-import org.springframework.stereotype.Component;
-import org.springframework.web.reactive.function.client.WebClient;
-import org.springframework.web.reactive.function.client.WebClientRequestException;
-import org.springframework.web.reactive.function.client.WebClientResponseException;
-import reactor.core.publisher.Mono;
-import reactor.util.retry.Retry;
-import reactor.util.retry.RetrySpec;
-
-/**
- * Pushes a stats record to the optimizer's {@code PUT /v1/optimizer/stats/{tableUuid}} endpoint
- * after a successful Iceberg snapshot commit. Fire-and-forget — the {@link #report} call returns
- * immediately on the commit thread; the HTTP exchange runs on the WebClient's Netty event loop.
- *
- * <p>Errors are recorded as Micrometer metrics and logged at {@code warn} level, but never
- * propagated. A push failure must not break the commit.
- *
- * <p>Timeout / retry shape (from {@link OptimizerStatsProperties}):
- *
- * <ul>
- *   <li>{@code perAttemptTimeoutMs} bounds each HTTP attempt (1000 ms default).
- *   <li>{@code maxAttempts} caps the total number of attempts (3 default — one initial + two
- *       retries). Retries only fire on retryable errors (network, 5xx, 408, 429, timeout).
- *   <li>{@code totalTimeoutMs} is the outer wall-clock ceiling (2000 ms default). Hard cancel.
- * </ul>
- *
- * <p>The bean is only wired when {@code optimizer.stats.enabled=true}.
- */
-@Slf4j
-@Component
-@ConditionalOnProperty(prefix = "optimizer.stats", name = "enabled", havingValue = "true")
-public class OptimizerStatsClient {
-
-  /** Per-call URL path. */
-  static final String PATH_TEMPLATE = "/v1/optimizer/stats/{tableUuid}";
-
-  /** Table-property key that opts a table in for the post-commit stats push. */
-  static final String OPT_IN_PROPERTY = "maintenance.optimizer.stats.enabled";
-
-  /** Iceberg snapshot-summary keys we read. All values are decimal-string longs. */
-  static final String SUMMARY_TOTAL_DATA_FILES = "total-data-files";
-
-  static final String SUMMARY_TOTAL_FILES_SIZE = "total-files-size";
-  static final String SUMMARY_ADDED_DATA_FILES = "added-data-files";
-  static final String SUMMARY_DELETED_DATA_FILES = "deleted-data-files";
-  static final String SUMMARY_ADDED_FILES_SIZE = "added-files-size";
-  static final String SUMMARY_REMOVED_FILES_SIZE = "removed-files-size";
-
-  private final WebClient webClient;
-  private final MeterRegistry meterRegistry;
-  private final OptimizerStatsProperties properties;
-
-  public OptimizerStatsClient(
-      @Qualifier("optimizerStatsWebClient") WebClient webClient,
-      MeterRegistry meterRegistry,
-      OptimizerStatsProperties properties) {
-    this.webClient = webClient;
-    this.meterRegistry = meterRegistry;
-    this.properties = properties;
-  }
-
-  /**
-   * Build and dispatch a stats push for the just-committed table. Returns immediately; the HTTP
-   * call runs in the background. {@code snapshotSummary} is the in-memory {@code
-   * Snapshot.summary()} map from the latest snapshot (no HDFS round-trip).
-   *
-   * <p>The call is skipped (and a {@code skipped} counter is emitted) when:
-   *
-   * <ul>
-   *   <li>The table is not opted in via {@link #OPT_IN_PROPERTY}.
-   *   <li>{@code snapshotSummary} is null or empty (no committed snapshot yet — e.g. CREATE TABLE
-   *       with no rows).
-   * </ul>
-   */
-  public void report(TableDto saved, Map<String, String> snapshotSummary) {
-    reportAsync(saved, snapshotSummary).subscribe();
-  }
-
-  /**
-   * Same as {@link #report} but returns the underlying {@link Mono} so callers (notably tests) can
-   * block on completion. Production callers use {@link #report}.
-   */
-  Mono<Void> reportAsync(TableDto saved, Map<String, String> snapshotSummary) {
-    if (!isOptedIn(saved)) {
-      meterRegistry
-          .counter(MetricsConstant.OPTIMIZER_STATS_SKIPPED, "reason", "opt_out")
-          .increment();
-      return Mono.empty();
-    }
-    if (snapshotSummary == null || snapshotSummary.isEmpty()) {
-      meterRegistry
-          .counter(MetricsConstant.OPTIMIZER_STATS_SKIPPED, "reason", "no_snapshot")
-          .increment();
-      return Mono.empty();
-    }
-
-    OptimizerStatsRequest body = buildRequest(saved, snapshotSummary);
-    String tableUuid = saved.getTableUUID();
-    Timer.Sample sample = Timer.start(meterRegistry);
-
-    RetrySpec retrySpec =
-        Retry.max(Math.max(0, properties.getMaxAttempts() - 1))
-            .filter(this::isRetryable)
-            .onRetryExhaustedThrow((spec, signal) -> signal.failure());
-
-    return webClient
-        .put()
-        .uri(PATH_TEMPLATE, tableUuid)
-        .bodyValue(body)
-        .retrieve()
-        .toBodilessEntity()
-        .timeout(Duration.ofMillis(properties.getPerAttemptTimeoutMs()))
-        .doOnError(
-            e ->
-                meterRegistry
-                    .counter(
-                        MetricsConstant.OPTIMIZER_STATS_ATTEMPTS,
-                        "outcome",
-                        isRetryable(e) ? "retryable_failure" : "non_retryable_failure")
-                    .increment())
-        .doOnSuccess(
-            ignored ->
-                meterRegistry
-                    .counter(MetricsConstant.OPTIMIZER_STATS_ATTEMPTS, "outcome", "success")
-                    .increment())
-        .retryWhen(retrySpec)
-        .timeout(Duration.ofMillis(properties.getTotalTimeoutMs()))
-        .doOnSuccess(
-            ignored ->
-                sample.stop(
-                    meterRegistry.timer(
-                        MetricsConstant.OPTIMIZER_STATS_DURATION, "outcome", "success")))
-        .onErrorResume(
-            e -> {
-              String outcome = classifyOutcome(e);
-              sample.stop(
-                  meterRegistry.timer(
-                      MetricsConstant.OPTIMIZER_STATS_DURATION, "outcome", outcome));
-              meterRegistry
-                  .counter(MetricsConstant.OPTIMIZER_STATS_FAILED_FINAL, "outcome", outcome)
-                  .increment();
-              log.warn(
-                  "Optimizer stats push failed for table {} ({}): {}",
-                  tableUuid,
-                  outcome,
-                  e.toString());
-              return Mono.empty();
-            })
-        .then();
-  }
-
-  /** {@code true} iff the table's properties contain the literal opt-in value {@code "true"}. */
-  private boolean isOptedIn(TableDto saved) {
-    Map<String, String> props = saved.getTableProperties();
-    return props != null && "true".equals(props.get(OPT_IN_PROPERTY));
-  }
-
-  /** Build the wire body. Missing summary keys default to 0L. */
-  OptimizerStatsRequest buildRequest(TableDto saved, Map<String, String> summary) {
-    OptimizerStatsRequest.Snapshot snapshot =
-        OptimizerStatsRequest.Snapshot.builder()
-            .tableVersion(saved.getTableVersion())
-            .tableLocation(saved.getTableLocation())
-            .tableSizeBytes(longOrZero(summary, SUMMARY_TOTAL_FILES_SIZE))
-            .numCurrentFiles(longOrZero(summary, SUMMARY_TOTAL_DATA_FILES))
-            .build();
-    OptimizerStatsRequest.Delta delta =
-        OptimizerStatsRequest.Delta.builder()
-            .numFilesAdded(longOrZero(summary, SUMMARY_ADDED_DATA_FILES))
-            .numFilesDeleted(longOrZero(summary, SUMMARY_DELETED_DATA_FILES))
-            .addedSizeBytes(longOrZero(summary, SUMMARY_ADDED_FILES_SIZE))
-            .deletedSizeBytes(longOrZero(summary, SUMMARY_REMOVED_FILES_SIZE))
-            .build();
-    return OptimizerStatsRequest.builder()
-        .databaseName(saved.getDatabaseId())
-        .tableName(saved.getTableId())
-        .stats(OptimizerStatsRequest.Stats.builder().snapshot(snapshot).delta(delta).build())
-        .tableProperties(saved.getTableProperties())
-        .build();
-  }
-
-  /**
-   * Retryable errors: per-attempt timeout, network-level failures, 5xx, 408, 429. Other 4xx are
-   * client errors — retrying won't fix them.
-   */
-  boolean isRetryable(Throwable e) {
-    if (e instanceof TimeoutException) {
-      return true;
-    }
-    if (e instanceof WebClientRequestException) {
-      return true;
-    }
-    if (e instanceof WebClientResponseException) {
-      int code = ((WebClientResponseException) e).getStatusCode().value();
-      return code == 408 || code == 429 || (code >= 500 && code < 600);
-    }
-    return false;
-  }
-
-  /**
-   * Map a final-stage error to one of {@code success, timeout, network_error, server_error,
-   * client_error, unknown_error} for the {@code outcome} tag on duration / failed_final metrics.
-   */
-  private String classifyOutcome(Throwable e) {
-    if (e instanceof TimeoutException) {
-      return "timeout";
-    }
-    if (e instanceof WebClientRequestException) {
-      return "network_error";
-    }
-    if (e instanceof WebClientResponseException) {
-      int code = ((WebClientResponseException) e).getStatusCode().value();
-      if (code >= 500 && code < 600) {
-        return "server_error";
-      }
-      if (code >= 400 && code < 500) {
-        return "client_error";
-      }
-    }
-    return "unknown_error";
-  }
-
-  private static long longOrZero(Map<String, String> m, String key) {
-    String v = m.get(key);
-    if (v == null) {
-      return 0L;
-    }
-    try {
-      return Long.parseLong(v);
-    } catch (NumberFormatException nfe) {
-      return 0L;
-    }
-  }
-}
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
index bbddbce6b..8061fb644 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
@@ -20,10 +20,11 @@ public class OptimizerStatsConfig {
 
   /**
    * Dedicated WebClient for the optimizer stats endpoint. Per-attempt and outer timeouts are
-   * applied at the call site on the Reactor chain in {@link OptimizerStatsClient} — they are not
-   * configured on the underlying Netty client so that the timeout always emerges as a standard
-   * {@link java.util.concurrent.TimeoutException} (not a Netty {@code ReadTimeoutException}), which
-   * keeps the client's outcome classification simple.
+   * applied at the call site on the Reactor chain (per-attempt in {@link
+   * OptimizerStatsPostCommitOperation}, outer per-op in the {@code PostCommitDispatcher}) — they
+   * are not configured on the underlying Netty client so that the timeout always emerges as a
+   * standard {@link java.util.concurrent.TimeoutException} (not a Netty {@code
+   * ReadTimeoutException}), which keeps the dispatcher's outcome classification simple.
    */
   @Bean("optimizerStatsWebClient")
   public WebClient optimizerStatsWebClient(OptimizerStatsProperties properties) {
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java
new file mode 100644
index 000000000..0a82d8598
--- /dev/null
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java
@@ -0,0 +1,173 @@
+package com.linkedin.openhouse.tables.services.optimizer;
+
+import com.linkedin.openhouse.tables.model.CurrentSnapshotInfo;
+import com.linkedin.openhouse.tables.model.TableDto;
+import com.linkedin.openhouse.tables.services.postcommit.PostCommitOperation;
+import java.time.Duration;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.TimeoutException;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
+import org.springframework.stereotype.Component;
+import org.springframework.web.reactive.function.client.WebClient;
+import org.springframework.web.reactive.function.client.WebClientRequestException;
+import org.springframework.web.reactive.function.client.WebClientResponseException;
+import reactor.core.publisher.Mono;
+import reactor.util.retry.Retry;
+import reactor.util.retry.RetrySpec;
+
+/**
+ * {@link PostCommitOperation} that PUTs a snapshot-stats record to the optimizer's per-table stats
+ * endpoint. {@link #prepare(TableDto)} returns a {@link Mono} that completes on HTTP 2xx and
+ * signals an error otherwise; the dispatcher owns timeout, subscription, error swallowing, and
+ * metric emission.
+ *
+ * <p>Skipped (operation returns {@link Optional#empty()}) when the table is not opted in via the
+ * {@link #OPT_IN_PROPERTY} table property, or when no current snapshot is present (e.g. {@code
+ * CREATE TABLE} with no rows yet).
+ *
+ * <p>Internal retry: bounded by {@link OptimizerStatsProperties#getMaxAttempts()}, fires only on
+ * retryable errors (network, {@link TimeoutException}, HTTP 408 / 429 / 5xx). The dispatcher's
+ * outer per-op timeout is the hard ceiling on the whole chain.
+ *
+ * <p>Bean is only wired when {@code optimizer.stats.enabled=true}. The path constant is
+ * intentionally duplicated from {@code TableStatsController.TABLE_PATH_TEMPLATE}; keep in sync.
+ * Tables service does not take a compile-time dependency on the optimizer service jar.
+ */
+@Slf4j
+@Component
+@ConditionalOnProperty(prefix = "optimizer.stats", name = "enabled", havingValue = "true")
+public class OptimizerStatsPostCommitOperation implements PostCommitOperation {
+
+  /** Metric tag value for {@code op}. */
+  static final String OP_NAME = "optimizer_stats";
+
+  /**
+   * Per-call URL path. Intentionally duplicated from {@code
+   * TableStatsController.TABLE_PATH_TEMPLATE}; keep in sync.
+   */
+  static final String PATH_TEMPLATE = "/v1/optimizer/stats/{tableUuid}";
+
+  /** Table-property key that opts a table in for the post-commit stats push. */
+  static final String OPT_IN_PROPERTY = "maintenance.optimizer.stats.enabled";
+
+  /** Iceberg snapshot-summary keys we read. All values are decimal-string longs. */
+  static final String SUMMARY_TOTAL_DATA_FILES = "total-data-files";
+
+  static final String SUMMARY_TOTAL_FILES_SIZE = "total-files-size";
+  static final String SUMMARY_ADDED_DATA_FILES = "added-data-files";
+  static final String SUMMARY_DELETED_DATA_FILES = "deleted-data-files";
+  static final String SUMMARY_ADDED_FILES_SIZE = "added-files-size";
+  static final String SUMMARY_REMOVED_FILES_SIZE = "removed-files-size";
+
+  private final WebClient webClient;
+  private final OptimizerStatsProperties properties;
+
+  public OptimizerStatsPostCommitOperation(
+      @Qualifier("optimizerStatsWebClient") WebClient webClient,
+      OptimizerStatsProperties properties) {
+    this.webClient = webClient;
+    this.properties = properties;
+  }
+
+  @Override
+  public String name() {
+    return OP_NAME;
+  }
+
+  @Override
+  public Optional<Mono<Void>> prepare(TableDto savedDto) {
+    if (!isOptedIn(savedDto)) {
+      return Optional.empty();
+    }
+    Optional<CurrentSnapshotInfo> snapshot = savedDto.getCurrentSnapshot();
+    if (!snapshot.isPresent()) {
+      return Optional.empty();
+    }
+
+    OptimizerStatsRequest body = buildRequest(savedDto, snapshot.get());
+    String tableUuid = savedDto.getTableUUID();
+
+    RetrySpec retrySpec =
+        Retry.max(Math.max(0, properties.getMaxAttempts() - 1)).filter(this::isRetryable);
+
+    Mono<Void> chain =
+        webClient
+            .put()
+            .uri(PATH_TEMPLATE, tableUuid)
+            .bodyValue(body)
+            .retrieve()
+            .toBodilessEntity()
+            .timeout(Duration.ofMillis(properties.getPerAttemptTimeoutMs()))
+            .retryWhen(retrySpec.onRetryExhaustedThrow((spec, signal) -> signal.failure()))
+            .then();
+    return Optional.of(chain);
+  }
+
+  /** {@code true} iff the table's properties contain the literal opt-in value {@code "true"}. */
+  private boolean isOptedIn(TableDto saved) {
+    Map<String, String> props = saved.getTableProperties();
+    return props != null && "true".equals(props.get(OPT_IN_PROPERTY));
+  }
+
+  /** Build the wire body. Missing summary keys default to 0L. */
+  OptimizerStatsRequest buildRequest(TableDto saved, CurrentSnapshotInfo snapshot) {
+    Map<String, String> summary = snapshot.getSummary();
+    OptimizerStatsRequest.Snapshot snapshotPayload =
+        OptimizerStatsRequest.Snapshot.builder()
+            .snapshotId(snapshot.getSnapshotId())
+            .tableVersion(saved.getTableVersion())
+            .tableLocation(saved.getTableLocation())
+            .tableSizeBytes(longOrZero(summary, SUMMARY_TOTAL_FILES_SIZE))
+            .numCurrentFiles(longOrZero(summary, SUMMARY_TOTAL_DATA_FILES))
+            .build();
+    OptimizerStatsRequest.Delta delta =
+        OptimizerStatsRequest.Delta.builder()
+            .numFilesAdded(longOrZero(summary, SUMMARY_ADDED_DATA_FILES))
+            .numFilesDeleted(longOrZero(summary, SUMMARY_DELETED_DATA_FILES))
+            .addedSizeBytes(longOrZero(summary, SUMMARY_ADDED_FILES_SIZE))
+            .deletedSizeBytes(longOrZero(summary, SUMMARY_REMOVED_FILES_SIZE))
+            .build();
+    return OptimizerStatsRequest.builder()
+        .databaseName(saved.getDatabaseId())
+        .tableName(saved.getTableId())
+        .stats(OptimizerStatsRequest.Stats.builder().snapshot(snapshotPayload).delta(delta).build())
+        .tableProperties(saved.getTableProperties())
+        .build();
+  }
+
+  /**
+   * Retryable errors: per-attempt timeout, network-level failures, 5xx, 408, 429. Other 4xx are
+   * client errors — retrying won't fix them.
+   */
+  boolean isRetryable(Throwable e) {
+    if (e instanceof TimeoutException) {
+      return true;
+    }
+    if (e instanceof WebClientRequestException) {
+      return true;
+    }
+    if (e instanceof WebClientResponseException) {
+      int code = ((WebClientResponseException) e).getStatusCode().value();
+      return code == 408 || code == 429 || (code >= 500 && code < 600);
+    }
+    return false;
+  }
+
+  private static long longOrZero(Map<String, String> m, String key) {
+    if (m == null) {
+      return 0L;
+    }
+    String v = m.get(key);
+    if (v == null) {
+      return 0L;
+    }
+    try {
+      return Long.parseLong(v);
+    } catch (NumberFormatException nfe) {
+      return 0L;
+    }
+  }
+}
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
index 7d746ee93..53b1844be 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
@@ -25,19 +25,16 @@ public class OptimizerStatsProperties {
    */
   private String baseUri;
 
-  /** Per-attempt request timeout in milliseconds. Default 1000. */
-  private long perAttemptTimeoutMs = 1000L;
-
   /**
-   * Hard ceiling on total wall-clock time for the entire call (including retries) in milliseconds.
-   * Default 2000. When the outer timeout fires, the chain is cancelled and the error is swallowed.
+   * Per-attempt HTTP timeout in milliseconds. Bounds each individual attempt so retries can fit
+   * inside the dispatcher's outer per-op budget ({@code tables.postcommit.per-op-timeout-ms},
+   * default 3000 ms). Default 1000 ms.
    */
-  private long totalTimeoutMs = 2000L;
+  private long perAttemptTimeoutMs = 1000L;
 
   /**
-   * Total attempt count (initial try plus retries). Default 3. With 1000 ms per attempt and a 2000
-   * ms outer ceiling, only about two attempts realistically fit on a slow path; the third is
-   * available if attempts return quickly.
+   * Total attempt count (initial try plus retries). Default 3. Retries fire only on retryable
+   * errors (network, timeout, 408/429/5xx); other 4xx fail fast without retry.
    */
   private int maxAttempts = 3;
 }
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
index 68a8646da..df929f666 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
@@ -47,6 +47,12 @@ public static class Stats {
   @AllArgsConstructor
   @JsonInclude(JsonInclude.Include.NON_NULL)
   public static class Snapshot {
+    /**
+     * Iceberg snapshot ID. Sent so the optimizer can use it as an idempotency token on upsert and
+     * reject out-of-order replays — server-side wiring tracked in BDP-102985.
+     */
+    private Long snapshotId;
+
     private String tableVersion;
     private String tableLocation;
     private Long tableSizeBytes;
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java
new file mode 100644
index 000000000..d205aee12
--- /dev/null
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java
@@ -0,0 +1,147 @@
+package com.linkedin.openhouse.tables.services.postcommit;
+
+import com.linkedin.openhouse.common.metrics.MetricsConstant;
+import com.linkedin.openhouse.tables.model.TableDto;
+import io.micrometer.core.instrument.MeterRegistry;
+import io.micrometer.core.instrument.Timer;
+import java.time.Duration;
+import java.util.List;
+import java.util.Optional;
+import java.util.concurrent.TimeoutException;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
+import org.springframework.boot.context.properties.EnableConfigurationProperties;
+import org.springframework.stereotype.Component;
+import org.springframework.web.reactive.function.client.WebClientRequestException;
+import org.springframework.web.reactive.function.client.WebClientResponseException;
+import reactor.core.publisher.Mono;
+
+/**
+ * Runs {@link PostCommitOperation}s after a successful Iceberg commit. Best-effort and bounded:
+ * each operation gets a wall-clock timeout, errors are swallowed after metric/log, and dispatch
+ * itself is fire-and-forget so the commit thread is never blocked on operation work.
+ *
+ * <p><b>Why async.</b> A synchronous post-commit hook converts a downstream outage (the optimizer
+ * being slow or unavailable, a network glitch) into a Tables-Service write outage — the post-commit
+ * push is a best-effort scheduling signal, not a write-correctness step, and its blast radius must
+ * not include the write path. The crash-loss window (a JVM dying after commit and before the HTTP
+ * push completes) is acceptable because operations are designed to be cumulative: the next commit
+ * carries the same state forward and the consumer self-corrects from missing data.
+ *
+ * <p><b>Why the dispatcher owns timeouts.</b> Operations only describe payload + endpoint. The
+ * timeout/swallow/subscribe machinery lives here once so individual operations stay small and so
+ * the contract across operations is uniform (one knob: {@code
+ * tables.postcommit.per-op-timeout-ms}).
+ *
+ * <p>Bean is only wired when {@code tables.postcommit.enabled=true}.
+ */
+@Slf4j
+@Component
+@EnableConfigurationProperties(PostCommitProperties.class)
+@ConditionalOnProperty(prefix = "tables.postcommit", name = "enabled", havingValue = "true")
+public class PostCommitDispatcher {
+
+  private final List<PostCommitOperation> operations;
+  private final PostCommitProperties properties;
+  private final MeterRegistry meterRegistry;
+
+  public PostCommitDispatcher(
+      List<PostCommitOperation> operations,
+      PostCommitProperties properties,
+      MeterRegistry meterRegistry) {
+    this.operations = operations;
+    this.properties = properties;
+    this.meterRegistry = meterRegistry;
+  }
+
+  /**
+   * Dispatch all registered operations for {@code savedDto}. Returns immediately on the calling
+   * thread; each operation runs on its underlying reactive scheduler. Never throws.
+   */
+  public void dispatch(TableDto savedDto) {
+    for (PostCommitOperation op : operations) {
+      decorate(op, savedDto).ifPresent(Mono::subscribe);
+    }
+  }
+
+  /**
+   * Returns the fully-decorated {@link Mono} for {@code op} (per-op timeout, success / error metric
+   * emission, error swallow) without subscribing. Emits the {@code skipped} or {@code
+   * prepare_threw} metric synchronously and returns {@link Optional#empty()} in those cases.
+   *
+   * <p>Package-private so tests can {@code .block()} on the decorated chain rather than polling for
+   * metric emission after a fire-and-forget {@code subscribe()}.
+   */
+  Optional<Mono<Void>> decorate(PostCommitOperation op, TableDto savedDto) {
+    Optional<Mono<Void>> work;
+    try {
+      work = op.prepare(savedDto);
+    } catch (RuntimeException e) {
+      // Defensive: a prepare() that throws synchronously must not break dispatch of later ops.
+      meterRegistry
+          .counter(
+              MetricsConstant.POSTCOMMIT_OP_FAILED, "op", op.name(), "outcome", "prepare_threw")
+          .increment();
+      log.warn("Post-commit op {} prepare() threw {}", op.name(), e.toString());
+      return Optional.empty();
+    }
+    if (!work.isPresent()) {
+      meterRegistry.counter(MetricsConstant.POSTCOMMIT_OP_SKIPPED, "op", op.name()).increment();
+      return Optional.empty();
+    }
+    Timer.Sample sample = Timer.start(meterRegistry);
+    Mono<Void> decorated =
+        work.get()
+            .timeout(Duration.ofMillis(properties.getPerOpTimeoutMs()))
+            .doOnSuccess(
+                ignored ->
+                    sample.stop(
+                        meterRegistry.timer(
+                            MetricsConstant.POSTCOMMIT_OP_DURATION,
+                            "op",
+                            op.name(),
+                            "outcome",
+                            "success")))
+            .onErrorResume(
+                e -> {
+                  String outcome = classifyOutcome(e);
+                  sample.stop(
+                      meterRegistry.timer(
+                          MetricsConstant.POSTCOMMIT_OP_DURATION,
+                          "op",
+                          op.name(),
+                          "outcome",
+                          outcome));
+                  meterRegistry
+                      .counter(
+                          MetricsConstant.POSTCOMMIT_OP_FAILED, "op", op.name(), "outcome", outcome)
+                      .increment();
+                  log.warn("Post-commit op {} failed ({}): {}", op.name(), outcome, e.toString());
+                  return Mono.empty();
+                });
+    return Optional.of(decorated);
+  }
+
+  /**
+   * Map a terminal error to a small set of outcome tags. Kept here so all operations share the same
+   * taxonomy.
+   */
+  private static String classifyOutcome(Throwable e) {
+    if (e instanceof TimeoutException) {
+      return "timeout";
+    }
+    if (e instanceof WebClientRequestException) {
+      return "network_error";
+    }
+    if (e instanceof WebClientResponseException) {
+      int code = ((WebClientResponseException) e).getStatusCode().value();
+      if (code >= 500 && code < 600) {
+        return "server_error";
+      }
+      if (code >= 400 && code < 500) {
+        return "client_error";
+      }
+    }
+    return "unknown_error";
+  }
+}
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java
new file mode 100644
index 000000000..5ba49b371
--- /dev/null
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java
@@ -0,0 +1,39 @@
+package com.linkedin.openhouse.tables.services.postcommit;
+
+import com.linkedin.openhouse.tables.model.TableDto;
+import java.util.Optional;
+import reactor.core.publisher.Mono;
+
+/**
+ * A best-effort, bounded, asynchronous action run by the Tables Service after a successful Iceberg
+ * commit. Operations are invoked by {@link PostCommitDispatcher} and are subject to a single per-op
+ * wall-clock timeout owned by the dispatcher. Operations never affect commit correctness: any error
+ * they signal is recorded as a metric and a log line, then swallowed.
+ *
+ * <p>Implementations describe <em>what</em> to do; the dispatcher owns <em>how</em> it runs
+ * (subscription, timeout, error handling, metric emission). Each operation contributes:
+ *
+ * <ul>
+ *   <li>{@link #name()} — short identifier used as a metric tag.
+ *   <li>{@link #prepare(TableDto)} — returns the work to perform, or {@link Optional#empty()} when
+ *       the operation does not apply to this commit (e.g. table not opted in, no committed
+ *       snapshot). Returning empty is a normal outcome; the dispatcher emits a {@code skipped}
+ *       metric and moves on.
+ * </ul>
+ *
+ * <p>The returned {@link Mono} must not subscribe itself, apply its own outer timeout, or swallow
+ * errors — those concerns belong to the dispatcher. Implementations may apply internal retries with
+ * bounded budgets.
+ */
+public interface PostCommitOperation {
+
+  /** Short, stable identifier for this operation. Used as the {@code op} metric tag. */
+  String name();
+
+  /**
+   * Build the work to run for {@code savedDto}, or return {@link Optional#empty()} if this
+   * operation does not apply. The dispatcher only invokes the returned {@link Mono} after applying
+   * its own timeout / error-swallow plumbing.
+   */
+  Optional<Mono<Void>> prepare(TableDto savedDto);
+}
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java
new file mode 100644
index 000000000..8a153cd79
--- /dev/null
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java
@@ -0,0 +1,31 @@
+package com.linkedin.openhouse.tables.services.postcommit;
+
+import lombok.Getter;
+import lombok.Setter;
+import org.springframework.boot.context.properties.ConfigurationProperties;
+
+/**
+ * Configuration for the Tables Service post-commit operation framework. Property prefix: {@code
+ * tables.postcommit}.
+ *
+ * <p>When {@code enabled} is {@code false} (the default), no dispatcher bean is constructed and
+ * registered {@link PostCommitOperation} beans, if any, are never invoked. Per-OH-instance roll-out
+ * flips a single flag.
+ */
+@ConfigurationProperties("tables.postcommit")
+@Getter
+@Setter
+public class PostCommitProperties {
+
+  /** Master switch. When {@code false}, no operations are dispatched on commit. */
+  private boolean enabled = false;
+
+  /**
+   * Wall-clock ceiling applied by the dispatcher to each operation's {@link
+   * PostCommitOperation#prepare(com.linkedin.openhouse.tables.model.TableDto)} {@code Mono}. Bounds
+   * how long a misbehaving operation can keep resources (connections, threads) tied up. Does not
+   * bound commit-thread latency — operations run on the executing reactive scheduler, not on the
+   * commit thread. Default 3000 ms.
+   */
+  private long perOpTimeoutMs = 3000L;
+}
diff --git a/services/tables/src/main/resources/application.properties b/services/tables/src/main/resources/application.properties
index ddf4c3022..24f5554c5 100644
--- a/services/tables/src/main/resources/application.properties
+++ b/services/tables/src/main/resources/application.properties
@@ -25,10 +25,14 @@ management.metrics.distribution.maximum-expected-value.http.server.requests=600s
 server.shutdown=graceful
 spring.lifecycle.timeout-per-shutdown-phase=60s
 
-# Post-commit Tables -> Optimizer stats push.
-# Disabled by default; environments that run the optimizer opt in via env vars.
+# Post-commit operation framework. Disabled by default; flip per OH instance.
+# Outer per-op wall-clock budget is enforced by PostCommitDispatcher.
+tables.postcommit.enabled=${TABLES_POSTCOMMIT_ENABLED:false}
+tables.postcommit.per-op-timeout-ms=${TABLES_POSTCOMMIT_PER_OP_TIMEOUT_MS:3000}
+
+# Optimizer stats post-commit operation (one impl of PostCommitOperation).
+# Both flags must be true for the push to be wired and dispatched.
 optimizer.stats.enabled=${OPTIMIZER_STATS_ENABLED:false}
 optimizer.stats.base-uri=${OPTIMIZER_STATS_BASE_URI:}
 optimizer.stats.per-attempt-timeout-ms=${OPTIMIZER_STATS_PER_ATTEMPT_TIMEOUT_MS:1000}
-optimizer.stats.total-timeout-ms=${OPTIMIZER_STATS_TOTAL_TIMEOUT_MS:2000}
 optimizer.stats.max-attempts=${OPTIMIZER_STATS_MAX_ATTEMPTS:3}
\ No newline at end of file
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java
index 80f9233af..648ae1871 100644
--- a/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/service/IcebergSnapshotsServiceTest.java
@@ -11,14 +11,14 @@
 import com.linkedin.openhouse.tables.api.spec.v0.request.components.Policies;
 import com.linkedin.openhouse.tables.dto.mapper.TablesMapper;
 import com.linkedin.openhouse.tables.dto.mapper.TablesMapperImpl;
+import com.linkedin.openhouse.tables.model.CurrentSnapshotInfo;
 import com.linkedin.openhouse.tables.model.TableDto;
 import com.linkedin.openhouse.tables.model.TableDtoPrimaryKey;
 import com.linkedin.openhouse.tables.repository.OpenHouseInternalRepository;
 import com.linkedin.openhouse.tables.services.IcebergSnapshotsService;
-import com.linkedin.openhouse.tables.services.optimizer.OptimizerStatsClient;
+import com.linkedin.openhouse.tables.services.postcommit.PostCommitDispatcher;
 import com.linkedin.openhouse.tables.utils.TableUUIDGenerator;
 import java.util.Collections;
-import java.util.Map;
 import java.util.Optional;
 import java.util.UUID;
 import org.apache.iceberg.exceptions.BadRequestException;
@@ -49,12 +49,11 @@ public class IcebergSnapshotsServiceTest {
   @MockBean private TableUUIDGenerator tableUUIDGenerator;
 
   /**
-   * Forces the optional optimizer-stats client into the service so that we can verify the
-   * post-commit push is invoked. Production wiring is conditional on {@code
-   * optimizer.stats.enabled=true}; this {@code @MockBean} bypasses that condition for the one test
-   * that exercises the hook.
+   * Forces the optional post-commit dispatcher into the service so that we can verify it is invoked
+   * after a successful save. Production wiring is conditional on {@code
+   * tables.postcommit.enabled=true}; this {@code @MockBean} bypasses that condition.
    */
-  @MockBean private OptimizerStatsClient optimizerStatsClient;
+  @MockBean private PostCommitDispatcher postCommitDispatcher;
 
   private OpenHouseInternalRepository mockRepository;
 
@@ -102,20 +101,20 @@ public void testTableCreated() {
   }
 
   @Test
-  public void testOptimizerStatsClientInvokedAfterSuccessfulCommit() {
+  public void testPostCommitDispatcherInvokedAfterSuccessfulCommit() {
     final IcebergSnapshotsRequestBody requestBody =
         TEST_ICEBERG_SNAPSHOTS_INITIAL_VERSION_REQUEST_BODY;
     final String dbId = requestBody.getCreateUpdateTableRequestBody().getDatabaseId();
     final String tableId = requestBody.getCreateUpdateTableRequestBody().getTableId();
     final TableDtoPrimaryKey key =
         TableDtoPrimaryKey.builder().databaseId(dbId).tableId(tableId).build();
-    final Map<String, String> summary = Collections.singletonMap("total-data-files", "7");
-    final TableDto savedDto =
-        TableDto.builder()
-            .databaseId(dbId)
-            .tableId(tableId)
-            .currentSnapshotSummary(summary)
+    final CurrentSnapshotInfo snapshot =
+        CurrentSnapshotInfo.builder()
+            .snapshotId(42L)
+            .summary(Collections.singletonMap("total-data-files", "7"))
             .build();
+    final TableDto savedDto =
+        TableDto.builder().databaseId(dbId).tableId(tableId).currentSnapshot(snapshot).build();
 
     Mockito.when(tableUUIDGenerator.generateUUID(Mockito.any(IcebergSnapshotsRequestBody.class)))
         .thenReturn(UUID.randomUUID());
@@ -124,7 +123,7 @@ public void testOptimizerStatsClientInvokedAfterSuccessfulCommit() {
 
     service.putIcebergSnapshots(dbId, tableId, requestBody, TEST_TABLE_CREATOR);
 
-    Mockito.verify(optimizerStatsClient, Mockito.times(1)).report(savedDto, summary);
+    Mockito.verify(postCommitDispatcher, Mockito.times(1)).dispatch(savedDto);
   }
 
   @Test
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/model/TableDtoMappingTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/model/TableDtoMappingTest.java
index 64aefd19b..53fb633e3 100644
--- a/services/tables/src/test/java/com/linkedin/openhouse/tables/model/TableDtoMappingTest.java
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/model/TableDtoMappingTest.java
@@ -34,7 +34,7 @@ public class TableDtoMappingTest {
           "snapshotRefs",
           "policies",
           "tableType",
-          "currentSnapshotSummary");
+          "currentSnapshot");
 
   /** Making all fields making it to map is expected, and all expected field are making it there. */
   @Test
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClientTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperationTest.java
similarity index 53%
rename from services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClientTest.java
rename to services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperationTest.java
index 33ea0d197..417db2abf 100644
--- a/services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsClientTest.java
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperationTest.java
@@ -3,51 +3,49 @@
 import static org.assertj.core.api.Assertions.assertThat;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
-import com.linkedin.openhouse.common.metrics.MetricsConstant;
+import com.linkedin.openhouse.tables.model.CurrentSnapshotInfo;
 import com.linkedin.openhouse.tables.model.TableDto;
-import io.micrometer.core.instrument.MeterRegistry;
-import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
 import java.io.IOException;
 import java.time.Duration;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Optional;
 import java.util.concurrent.TimeUnit;
 import okhttp3.mockwebserver.MockResponse;
 import okhttp3.mockwebserver.MockWebServer;
 import okhttp3.mockwebserver.RecordedRequest;
-import okhttp3.mockwebserver.SocketPolicy;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.springframework.http.client.reactive.ReactorClientHttpConnector;
 import org.springframework.web.reactive.function.client.WebClient;
+import org.springframework.web.reactive.function.client.WebClientResponseException;
+import reactor.core.publisher.Mono;
 import reactor.netty.http.client.HttpClient;
 
-class OptimizerStatsClientTest {
+class OptimizerStatsPostCommitOperationTest {
 
   private static final String TABLE_UUID = "uuid-1";
   private static final String DB = "db1";
   private static final String TABLE = "tbl1";
+  private static final long SNAPSHOT_ID = 9876543210L;
   private static final Duration BLOCK_MAX = Duration.ofSeconds(5);
 
   private MockWebServer server;
-  private MeterRegistry meterRegistry;
   private OptimizerStatsProperties properties;
-  private OptimizerStatsClient client;
+  private OptimizerStatsPostCommitOperation op;
   private final ObjectMapper mapper = new ObjectMapper();
 
   @BeforeEach
   void setUp() throws IOException {
     server = new MockWebServer();
     server.start();
-    meterRegistry = new SimpleMeterRegistry();
     properties = new OptimizerStatsProperties();
     properties.setEnabled(true);
     properties.setBaseUri(server.url("/").toString());
     properties.setPerAttemptTimeoutMs(1000L);
-    properties.setTotalTimeoutMs(2000L);
     properties.setMaxAttempts(3);
-    client = new OptimizerStatsClient(buildWebClient(), meterRegistry, properties);
+    op = new OptimizerStatsPostCommitOperation(buildWebClient(), properties);
   }
 
   @AfterEach
@@ -56,25 +54,31 @@ void tearDown() throws IOException {
   }
 
   private WebClient buildWebClient() {
-    HttpClient httpClient = HttpClient.create();
     return WebClient.builder()
         .baseUrl(properties.getBaseUri())
-        .clientConnector(new ReactorClientHttpConnector(httpClient))
+        .clientConnector(new ReactorClientHttpConnector(HttpClient.create()))
         .build();
   }
 
   @Test
-  void report_optedIn_sendsRequestWithPayloadFieldsFromSummary() throws Exception {
+  void name_isStableTagValue() {
+    assertThat(op.name()).isEqualTo("optimizer_stats");
+  }
+
+  @Test
+  void prepare_optedIn_emitsRequestWithFullPayload() throws Exception {
     server.enqueue(new MockResponse().setResponseCode(200));
 
     TableDto saved =
         optedInBuilder()
             .tableVersion("v3")
             .tableLocation("/data/tables/db1/tbl1")
-            .currentSnapshotSummary(summary())
+            .currentSnapshot(snapshot())
             .build();
 
-    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+    Optional<Mono<Void>> work = op.prepare(saved);
+    assertThat(work).isPresent();
+    work.get().block(BLOCK_MAX);
 
     RecordedRequest req = server.takeRequest(2, TimeUnit.SECONDS);
     assertThat(req).isNotNull();
@@ -91,6 +95,7 @@ void report_optedIn_sendsRequestWithPayloadFieldsFromSummary() throws Exception
     @SuppressWarnings("unchecked")
     Map<String, Object> delta = (Map<String, Object>) stats.get("delta");
     assertThat(snapshot)
+        .containsEntry("snapshotId", SNAPSHOT_ID)
         .containsEntry("tableVersion", "v3")
         .containsEntry("tableLocation", "/data/tables/db1/tbl1")
         .containsEntry("tableSizeBytes", 4096)
@@ -100,116 +105,80 @@ void report_optedIn_sendsRequestWithPayloadFieldsFromSummary() throws Exception
         .containsEntry("numFilesDeleted", 2)
         .containsEntry("addedSizeBytes", 2048)
         .containsEntry("deletedSizeBytes", 1024);
-
-    assertThat(
-            meterRegistry
-                .counter(MetricsConstant.OPTIMIZER_STATS_ATTEMPTS, "outcome", "success")
-                .count())
-        .isEqualTo(1.0);
   }
 
   @Test
-  void report_notOptedIn_skipsCallAndIncrementsSkippedCounter() {
-    TableDto saved = builderWithoutOptIn().currentSnapshotSummary(summary()).build();
-
-    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
-
+  void prepare_notOptedIn_returnsEmpty() {
+    TableDto saved = builderWithoutOptIn().currentSnapshot(snapshot()).build();
+    assertThat(op.prepare(saved)).isEmpty();
     assertThat(server.getRequestCount()).isZero();
-    assertThat(
-            meterRegistry
-                .counter(MetricsConstant.OPTIMIZER_STATS_SKIPPED, "reason", "opt_out")
-                .count())
-        .isEqualTo(1.0);
   }
 
   @Test
-  void report_noSnapshot_skipsCallAndIncrementsSkippedCounter() {
-    TableDto saved = optedInBuilder().currentSnapshotSummary(null).build();
-
-    client.reportAsync(saved, null).block(BLOCK_MAX);
-
+  void prepare_noSnapshot_returnsEmpty() {
+    TableDto saved = optedInBuilder().currentSnapshot(null).build();
+    assertThat(op.prepare(saved)).isEmpty();
     assertThat(server.getRequestCount()).isZero();
-    assertThat(
-            meterRegistry
-                .counter(MetricsConstant.OPTIMIZER_STATS_SKIPPED, "reason", "no_snapshot")
-                .count())
-        .isEqualTo(1.0);
   }
 
   @Test
-  void report_5xxThenSuccess_retriesAndSucceeds() {
+  void prepare_5xxThenSuccess_retriesAndCompletes() {
     server.enqueue(new MockResponse().setResponseCode(503));
     server.enqueue(new MockResponse().setResponseCode(200));
 
-    TableDto saved = optedInBuilder().currentSnapshotSummary(summary()).build();
-    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+    TableDto saved = optedInBuilder().currentSnapshot(snapshot()).build();
+    op.prepare(saved).orElseThrow(AssertionError::new).block(BLOCK_MAX);
 
     assertThat(server.getRequestCount()).isEqualTo(2);
-    assertThat(
-            meterRegistry
-                .counter(MetricsConstant.OPTIMIZER_STATS_ATTEMPTS, "outcome", "success")
-                .count())
-        .isEqualTo(1.0);
-    assertThat(
-            meterRegistry
-                .counter(MetricsConstant.OPTIMIZER_STATS_ATTEMPTS, "outcome", "retryable_failure")
-                .count())
-        .isEqualTo(1.0);
   }
 
   @Test
-  void report_allAttemptsFail_swallowsAndIncrementsFailedFinal() {
+  void prepare_allAttemptsFail_propagatesUnderlyingError() {
     properties.setMaxAttempts(2);
     server.enqueue(new MockResponse().setResponseCode(503));
     server.enqueue(new MockResponse().setResponseCode(503));
 
-    TableDto saved = optedInBuilder().currentSnapshotSummary(summary()).build();
-    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+    TableDto saved = optedInBuilder().currentSnapshot(snapshot()).build();
+    Mono<Void> chain = op.prepare(saved).orElseThrow(AssertionError::new);
 
+    assertThatChainErrors(chain, WebClientResponseException.class);
     assertThat(server.getRequestCount()).isEqualTo(2);
-    assertThat(
-            meterRegistry
-                .counter(MetricsConstant.OPTIMIZER_STATS_FAILED_FINAL, "outcome", "server_error")
-                .count())
-        .isEqualTo(1.0);
   }
 
   @Test
-  void report_4xxNonRetryable_doesNotRetry() {
+  void prepare_4xxNonRetryable_doesNotRetry() {
     server.enqueue(new MockResponse().setResponseCode(400));
 
-    TableDto saved = optedInBuilder().currentSnapshotSummary(summary()).build();
-    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+    TableDto saved = optedInBuilder().currentSnapshot(snapshot()).build();
+    Mono<Void> chain = op.prepare(saved).orElseThrow(AssertionError::new);
 
+    assertThatChainErrors(chain, WebClientResponseException.class);
     assertThat(server.getRequestCount()).isEqualTo(1);
-    assertThat(
-            meterRegistry
-                .counter(MetricsConstant.OPTIMIZER_STATS_FAILED_FINAL, "outcome", "client_error")
-                .count())
-        .isEqualTo(1.0);
   }
 
-  @Test
-  void report_perAttemptTimeoutTrips_recordsTimeout() {
-    properties.setPerAttemptTimeoutMs(150L);
-    properties.setTotalTimeoutMs(600L);
-    properties.setMaxAttempts(1);
-    client = new OptimizerStatsClient(buildWebClient(), meterRegistry, properties);
-    // Connection opens but the server never sends any response — forces Reactor's per-attempt
-    // .timeout(150ms) to fire with TimeoutException.
-    server.enqueue(new MockResponse().setSocketPolicy(SocketPolicy.NO_RESPONSE));
+  // ---- helpers ----
 
-    TableDto saved = optedInBuilder().currentSnapshotSummary(summary()).build();
-    client.reportAsync(saved, saved.getCurrentSnapshotSummary()).block(BLOCK_MAX);
+  private void assertThatChainErrors(Mono<Void> chain, Class<? extends Throwable> errorType) {
+    try {
+      chain.block(BLOCK_MAX);
+      throw new AssertionError("expected chain to signal an error of type " + errorType);
+    } catch (RuntimeException e) {
+      Throwable cause = unwrap(e);
+      assertThat(cause).isInstanceOf(errorType);
+    }
+  }
 
-    assertThat(
-            meterRegistry
-                .counter(MetricsConstant.OPTIMIZER_STATS_FAILED_FINAL, "outcome", "timeout")
-                .count())
-        .isEqualTo(1.0);
+  private static Throwable unwrap(Throwable t) {
+    Throwable cur = t;
+    while (cur.getCause() != null && cur.getCause() != cur) {
+      cur = cur.getCause();
+    }
+    return cur;
   }
 
-  // ---- helpers ----
+  private static CurrentSnapshotInfo snapshot() {
+    return CurrentSnapshotInfo.builder().snapshotId(SNAPSHOT_ID).summary(summary()).build();
+  }
 
   private static Map<String, String> summary() {
     Map<String, String> s = new HashMap<>();
@@ -224,7 +193,7 @@ private static Map<String, String> summary() {
 
   private static TableDto.TableDtoBuilder optedInBuilder() {
     Map<String, String> props = new HashMap<>();
-    props.put(OptimizerStatsClient.OPT_IN_PROPERTY, "true");
+    props.put(OptimizerStatsPostCommitOperation.OPT_IN_PROPERTY, "true");
     return TableDto.builder()
         .tableUUID(TABLE_UUID)
         .databaseId(DB)
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcherTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcherTest.java
new file mode 100644
index 000000000..14b6f2dde
--- /dev/null
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcherTest.java
@@ -0,0 +1,177 @@
+package com.linkedin.openhouse.tables.services.postcommit;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import com.linkedin.openhouse.common.metrics.MetricsConstant;
+import com.linkedin.openhouse.tables.model.TableDto;
+import io.micrometer.core.instrument.MeterRegistry;
+import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.Optional;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import reactor.core.publisher.Mono;
+
+class PostCommitDispatcherTest {
+
+  private static final Duration BLOCK_MAX = Duration.ofSeconds(5);
+
+  private MeterRegistry meterRegistry;
+  private PostCommitProperties properties;
+  private final TableDto savedDto = TableDto.builder().databaseId("db").tableId("t").build();
+
+  @BeforeEach
+  void setUp() {
+    meterRegistry = new SimpleMeterRegistry();
+    properties = new PostCommitProperties();
+    properties.setEnabled(true);
+    properties.setPerOpTimeoutMs(500L);
+  }
+
+  private PostCommitDispatcher dispatcherWith(PostCommitOperation... ops) {
+    return new PostCommitDispatcher(Arrays.asList(ops), properties, meterRegistry);
+  }
+
+  @Test
+  void dispatch_invokesEveryRegisteredOperationsPrepare() {
+    AtomicInteger aCount = new AtomicInteger();
+    AtomicInteger bCount = new AtomicInteger();
+    dispatcherWith(new EmptyOp("a", aCount), new EmptyOp("b", bCount)).dispatch(savedDto);
+
+    assertThat(aCount.get()).isEqualTo(1);
+    assertThat(bCount.get()).isEqualTo(1);
+  }
+
+  @Test
+  void decorate_emptyPrepare_incrementsSkippedAndReturnsEmpty() {
+    PostCommitOperation op = new EmptyOp("opx", new AtomicInteger());
+
+    Optional<Mono<Void>> decorated = dispatcherWith(op).decorate(op, savedDto);
+
+    assertThat(decorated).isEmpty();
+    assertThat(meterRegistry.counter(MetricsConstant.POSTCOMMIT_OP_SKIPPED, "op", "opx").count())
+        .isEqualTo(1.0);
+  }
+
+  @Test
+  void decorate_successfulWork_recordsDurationWithSuccessOutcome() {
+    PostCommitOperation op = new SimpleOp("ok", Mono::empty);
+
+    dispatcherWith(op).decorate(op, savedDto).orElseThrow(AssertionError::new).block(BLOCK_MAX);
+
+    assertThat(
+            meterRegistry
+                .timer(MetricsConstant.POSTCOMMIT_OP_DURATION, "op", "ok", "outcome", "success")
+                .count())
+        .isEqualTo(1L);
+  }
+
+  @Test
+  void decorate_workSignalsError_incrementsFailedWithClassifiedOutcomeAndSwallowsError() {
+    RuntimeException boom = new RuntimeException("boom");
+    PostCommitOperation op = new SimpleOp("broke", () -> Mono.error(boom));
+
+    // Dispatcher swallows the error — block() returns normally.
+    dispatcherWith(op).decorate(op, savedDto).orElseThrow(AssertionError::new).block(BLOCK_MAX);
+
+    assertThat(
+            meterRegistry
+                .counter(
+                    MetricsConstant.POSTCOMMIT_OP_FAILED, "op", "broke", "outcome", "unknown_error")
+                .count())
+        .isEqualTo(1.0);
+  }
+
+  @Test
+  void decorate_prepareThrowsSynchronously_incrementsFailedAndDispatchContinuesToLaterOps() {
+    AtomicInteger laterCount = new AtomicInteger();
+    PostCommitOperation thrower =
+        new PostCommitOperation() {
+          @Override
+          public String name() {
+            return "thrower";
+          }
+
+          @Override
+          public Optional<Mono<Void>> prepare(TableDto dto) {
+            throw new IllegalStateException("nope");
+          }
+        };
+    PostCommitOperation later = new EmptyOp("later", laterCount);
+
+    dispatcherWith(thrower, later).dispatch(savedDto);
+
+    assertThat(
+            meterRegistry
+                .counter(
+                    MetricsConstant.POSTCOMMIT_OP_FAILED,
+                    "op",
+                    "thrower",
+                    "outcome",
+                    "prepare_threw")
+                .count())
+        .isEqualTo(1.0);
+    assertThat(laterCount.get())
+        .as("later operations must still run after a synchronous prepare() throw")
+        .isEqualTo(1);
+  }
+
+  @Test
+  void decorate_workExceedsPerOpTimeout_incrementsFailedWithTimeoutOutcome() {
+    properties.setPerOpTimeoutMs(50L);
+    PostCommitOperation op = new SimpleOp("slow", Mono::never);
+
+    dispatcherWith(op).decorate(op, savedDto).orElseThrow(AssertionError::new).block(BLOCK_MAX);
+
+    assertThat(
+            meterRegistry
+                .counter(MetricsConstant.POSTCOMMIT_OP_FAILED, "op", "slow", "outcome", "timeout")
+                .count())
+        .isEqualTo(1.0);
+  }
+
+  // ---- helpers ----
+
+  private static class EmptyOp implements PostCommitOperation {
+    private final String name;
+    private final AtomicInteger calls;
+
+    EmptyOp(String name, AtomicInteger calls) {
+      this.name = name;
+      this.calls = calls;
+    }
+
+    @Override
+    public String name() {
+      return name;
+    }
+
+    @Override
+    public Optional<Mono<Void>> prepare(TableDto dto) {
+      calls.incrementAndGet();
+      return Optional.empty();
+    }
+  }
+
+  private static class SimpleOp implements PostCommitOperation {
+    private final String name;
+    private final java.util.function.Supplier<Mono<Void>> work;
+
+    SimpleOp(String name, java.util.function.Supplier<Mono<Void>> work) {
+      this.name = name;
+      this.work = work;
+    }
+
+    @Override
+    public String name() {
+      return name;
+    }
+
+    @Override
+    public Optional<Mono<Void>> prepare(TableDto dto) {
+      return Optional.of(work.get());
+    }
+  }
+}

From 119c609f9cd36a07bdcd8a2be2992d03e3e332dc Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 27 May 2026 14:49:25 -0700
Subject: [PATCH 11/14] docs: collapse <p>/<ul>/<em>/<b> noise in postcommit
 javadoc

Per review feedback that the HTML tags made the diff ugly. Drop
paragraph breaks where the comment can be one sentence/paragraph,
drop bullet lists in favor of inline prose, drop <em>/<b> emphasis
entirely. No behavior change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../tables/model/CurrentSnapshotInfo.java     | 12 +++----
 .../openhouse/tables/model/TableDto.java      |  3 +-
 .../optimizer/OptimizerStatsConfig.java       | 17 +++++-----
 .../OptimizerStatsPostCommitOperation.java    | 20 ++++-------
 .../optimizer/OptimizerStatsProperties.java   |  7 ++--
 .../optimizer/OptimizerStatsRequest.java      |  7 ++--
 .../postcommit/PostCommitDispatcher.java      | 33 ++++++-------------
 .../postcommit/PostCommitOperation.java       | 28 ++++------------
 .../postcommit/PostCommitProperties.java      | 19 ++++-------
 9 files changed, 50 insertions(+), 96 deletions(-)

diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java
index 9f363b4ec..74644ac3f 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java
@@ -7,13 +7,11 @@
 import lombok.Getter;
 
 /**
- * In-memory snapshot of the Iceberg current-snapshot metadata that was loaded alongside a {@link
- * TableDto}. Carries only fields already materialized by the catalog client — never triggers a
- * separate HDFS / object-store read.
- *
- * <p>Present whenever the table has at least one committed snapshot at construction time; absent
- * (modeled as {@link java.util.Optional#empty()} on {@link TableDto}) when the table has no
- * committed data — e.g. a {@code CREATE TABLE} with no rows yet.
+ * In-memory snapshot of the Iceberg current-snapshot metadata loaded alongside a {@link TableDto}.
+ * Carries only fields already materialized by the catalog client — never triggers a separate HDFS
+ * or object-store read. Present whenever the table has at least one committed snapshot at
+ * construction time; absent (modeled as {@link java.util.Optional#empty()} on {@link TableDto}) for
+ * tables with no committed data, e.g. a {@code CREATE TABLE} with no rows yet.
  */
 @Getter
 @Builder
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
index 16608f286..bd44653e7 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
@@ -89,8 +89,7 @@ public class TableDto {
    * In-memory current-snapshot metadata captured when this {@code TableDto} was built from an
    * Iceberg {@code Table}. Present whenever the underlying table has at least one committed
    * snapshot at that point; absent for tables with no committed data (e.g. {@code CREATE TABLE}
-   * with no rows). Not persisted, not part of equality. Stored nullable internally; consumers must
-   * read through {@link #getCurrentSnapshot()} to get the {@link Optional}.
+   * with no rows). Not persisted, not part of equality. Read through {@link #getCurrentSnapshot()}.
    */
   @Getter(AccessLevel.NONE)
   @Transient
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
index 8061fb644..351ac604d 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
@@ -9,9 +9,9 @@
 import reactor.netty.http.client.HttpClient;
 
 /**
- * Wiring for the post-commit Tables → Optimizer stats push. Only active when {@code
- * optimizer.stats.enabled=true}; otherwise no WebClient or client bean is constructed and the
- * on-commit hook in {@code IcebergSnapshotsServiceImpl} is a no-op.
+ * Wiring for the post-commit Tables → Optimizer stats push. Active only when {@code
+ * optimizer.stats.enabled=true}; otherwise no WebClient bean is constructed and the dispatcher sees
+ * no optimizer-stats operation.
  */
 @Configuration
 @EnableConfigurationProperties(OptimizerStatsProperties.class)
@@ -19,12 +19,11 @@
 public class OptimizerStatsConfig {
 
   /**
-   * Dedicated WebClient for the optimizer stats endpoint. Per-attempt and outer timeouts are
-   * applied at the call site on the Reactor chain (per-attempt in {@link
-   * OptimizerStatsPostCommitOperation}, outer per-op in the {@code PostCommitDispatcher}) — they
-   * are not configured on the underlying Netty client so that the timeout always emerges as a
-   * standard {@link java.util.concurrent.TimeoutException} (not a Netty {@code
-   * ReadTimeoutException}), which keeps the dispatcher's outcome classification simple.
+   * Dedicated WebClient for the optimizer stats endpoint. Per-attempt timeout is applied on the
+   * Reactor chain in {@link OptimizerStatsPostCommitOperation} and the outer per-op timeout in
+   * {@code PostCommitDispatcher}; neither is configured on the Netty client so the timeout always
+   * emerges as a standard {@link java.util.concurrent.TimeoutException} rather than a Netty {@code
+   * ReadTimeoutException}, keeping the dispatcher's outcome classification simple.
    */
   @Bean("optimizerStatsWebClient")
   public WebClient optimizerStatsWebClient(OptimizerStatsProperties properties) {
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java
index 0a82d8598..eb858dece 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java
@@ -22,19 +22,13 @@
  * {@link PostCommitOperation} that PUTs a snapshot-stats record to the optimizer's per-table stats
  * endpoint. {@link #prepare(TableDto)} returns a {@link Mono} that completes on HTTP 2xx and
  * signals an error otherwise; the dispatcher owns timeout, subscription, error swallowing, and
- * metric emission.
- *
- * <p>Skipped (operation returns {@link Optional#empty()}) when the table is not opted in via the
- * {@link #OPT_IN_PROPERTY} table property, or when no current snapshot is present (e.g. {@code
- * CREATE TABLE} with no rows yet).
- *
- * <p>Internal retry: bounded by {@link OptimizerStatsProperties#getMaxAttempts()}, fires only on
- * retryable errors (network, {@link TimeoutException}, HTTP 408 / 429 / 5xx). The dispatcher's
- * outer per-op timeout is the hard ceiling on the whole chain.
- *
- * <p>Bean is only wired when {@code optimizer.stats.enabled=true}. The path constant is
- * intentionally duplicated from {@code TableStatsController.TABLE_PATH_TEMPLATE}; keep in sync.
- * Tables service does not take a compile-time dependency on the optimizer service jar.
+ * metric emission. Returns {@link Optional#empty()} when the table is not opted in via {@link
+ * #OPT_IN_PROPERTY} or has no current snapshot. Internal retry is bounded by {@link
+ * OptimizerStatsProperties#getMaxAttempts()} and fires only on retryable errors (network, {@link
+ * TimeoutException}, HTTP 408 / 429 / 5xx); the dispatcher's per-op timeout is the hard ceiling.
+ * Bean wired only when {@code optimizer.stats.enabled=true}. Path constant is intentionally
+ * duplicated from {@code TableStatsController.TABLE_PATH_TEMPLATE} (keep in sync) so the tables
+ * service does not take a compile-time dependency on the optimizer jar.
  */
 @Slf4j
 @Component
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
index 53b1844be..ca2b78a2d 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
@@ -6,10 +6,9 @@
 
 /**
  * Configuration for the post-commit stats push from the Tables Service to the Optimizer's stats
- * endpoint. Property prefix: {@code optimizer.stats}.
- *
- * <p>When {@code enabled} is false (the default), no client bean is constructed and the push is a
- * no-op on every commit. Environments that want the feed turn it on and set {@link #baseUri}.
+ * endpoint. Property prefix {@code optimizer.stats}. When {@code enabled} is false (the default),
+ * no client bean is constructed and the operation is absent from the dispatcher. Environments that
+ * want the feed turn it on and set {@link #baseUri}.
  */
 @ConfigurationProperties("optimizer.stats")
 @Getter
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
index df929f666..2496436d3 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
@@ -9,11 +9,8 @@
 
 /**
  * Wire body for {@code PUT /v1/optimizer/stats/{tableUuid}}. Mirrors the optimizer's {@code
- * UpsertTableStatsRequest} field-for-field — see {@code
- * services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/UpsertTableStatsRequest.java}.
- *
- * <p>Tables service owns its own copy so that the wire contract is explicit at the call site and
- * the optimizer client jar is not a compile-time dependency.
+ * UpsertTableStatsRequest} field-for-field. Duplicated here so the tables service does not take a
+ * compile-time dependency on the optimizer jar; keep in sync with that type.
  */
 @Data
 @Builder
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java
index d205aee12..44c8a166b 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java
@@ -17,23 +17,12 @@
 import reactor.core.publisher.Mono;
 
 /**
- * Runs {@link PostCommitOperation}s after a successful Iceberg commit. Best-effort and bounded:
- * each operation gets a wall-clock timeout, errors are swallowed after metric/log, and dispatch
- * itself is fire-and-forget so the commit thread is never blocked on operation work.
- *
- * <p><b>Why async.</b> A synchronous post-commit hook converts a downstream outage (the optimizer
- * being slow or unavailable, a network glitch) into a Tables-Service write outage — the post-commit
- * push is a best-effort scheduling signal, not a write-correctness step, and its blast radius must
- * not include the write path. The crash-loss window (a JVM dying after commit and before the HTTP
- * push completes) is acceptable because operations are designed to be cumulative: the next commit
- * carries the same state forward and the consumer self-corrects from missing data.
- *
- * <p><b>Why the dispatcher owns timeouts.</b> Operations only describe payload + endpoint. The
- * timeout/swallow/subscribe machinery lives here once so individual operations stay small and so
- * the contract across operations is uniform (one knob: {@code
- * tables.postcommit.per-op-timeout-ms}).
- *
- * <p>Bean is only wired when {@code tables.postcommit.enabled=true}.
+ * Runs {@link PostCommitOperation}s after a successful Iceberg commit. Each operation gets a
+ * wall-clock timeout ({@code tables.postcommit.per-op-timeout-ms}), errors are swallowed after
+ * metric/log, and dispatch is fire-and-forget so the commit thread is never blocked. Operations
+ * describe payload and endpoint only — the timeout/swallow/subscribe machinery lives here so the
+ * contract across operations is uniform. Bean wired only when {@code
+ * tables.postcommit.enabled=true}.
  */
 @Slf4j
 @Component
@@ -65,12 +54,10 @@ public void dispatch(TableDto savedDto) {
   }
 
   /**
-   * Returns the fully-decorated {@link Mono} for {@code op} (per-op timeout, success / error metric
-   * emission, error swallow) without subscribing. Emits the {@code skipped} or {@code
-   * prepare_threw} metric synchronously and returns {@link Optional#empty()} in those cases.
-   *
-   * <p>Package-private so tests can {@code .block()} on the decorated chain rather than polling for
-   * metric emission after a fire-and-forget {@code subscribe()}.
+   * Returns the fully-decorated {@link Mono} for {@code op} (per-op timeout, success/error metric
+   * emission, error swallow) without subscribing. Emits {@code skipped} or {@code prepare_threw}
+   * synchronously and returns {@link Optional#empty()} in those cases. Package-private so tests can
+   * {@code .block()} on the chain rather than poll for metric emission.
    */
   Optional<Mono<Void>> decorate(PostCommitOperation op, TableDto savedDto) {
     Optional<Mono<Void>> work;
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java
index 5ba49b371..b94804b34 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java
@@ -6,24 +6,10 @@
 
 /**
  * A best-effort, bounded, asynchronous action run by the Tables Service after a successful Iceberg
- * commit. Operations are invoked by {@link PostCommitDispatcher} and are subject to a single per-op
- * wall-clock timeout owned by the dispatcher. Operations never affect commit correctness: any error
- * they signal is recorded as a metric and a log line, then swallowed.
- *
- * <p>Implementations describe <em>what</em> to do; the dispatcher owns <em>how</em> it runs
- * (subscription, timeout, error handling, metric emission). Each operation contributes:
- *
- * <ul>
- *   <li>{@link #name()} — short identifier used as a metric tag.
- *   <li>{@link #prepare(TableDto)} — returns the work to perform, or {@link Optional#empty()} when
- *       the operation does not apply to this commit (e.g. table not opted in, no committed
- *       snapshot). Returning empty is a normal outcome; the dispatcher emits a {@code skipped}
- *       metric and moves on.
- * </ul>
- *
- * <p>The returned {@link Mono} must not subscribe itself, apply its own outer timeout, or swallow
- * errors — those concerns belong to the dispatcher. Implementations may apply internal retries with
- * bounded budgets.
+ * commit. Operations are invoked by {@link PostCommitDispatcher}, which owns the per-op timeout,
+ * subscription, error swallowing, and metric emission. An error from prepare() is recorded and
+ * dropped — it never affects commit correctness. Implementations may apply internal retries with
+ * bounded budgets but must not subscribe themselves or apply an outer timeout.
  */
 public interface PostCommitOperation {
 
@@ -31,9 +17,9 @@ public interface PostCommitOperation {
   String name();
 
   /**
-   * Build the work to run for {@code savedDto}, or return {@link Optional#empty()} if this
-   * operation does not apply. The dispatcher only invokes the returned {@link Mono} after applying
-   * its own timeout / error-swallow plumbing.
+   * Build the work to run for {@code savedDto}, or {@link Optional#empty()} when the operation does
+   * not apply (e.g. table not opted in, no committed snapshot). The dispatcher records empty as a
+   * {@code skipped} metric.
    */
   Optional<Mono<Void>> prepare(TableDto savedDto);
 }
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java
index 8a153cd79..bc757d37f 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java
@@ -5,27 +5,22 @@
 import org.springframework.boot.context.properties.ConfigurationProperties;
 
 /**
- * Configuration for the Tables Service post-commit operation framework. Property prefix: {@code
- * tables.postcommit}.
- *
- * <p>When {@code enabled} is {@code false} (the default), no dispatcher bean is constructed and
- * registered {@link PostCommitOperation} beans, if any, are never invoked. Per-OH-instance roll-out
- * flips a single flag.
+ * Configuration for the Tables Service post-commit operation framework. Property prefix {@code
+ * tables.postcommit}. When {@code enabled} is false (the default), no dispatcher bean is
+ * constructed and registered {@link PostCommitOperation} beans are never invoked.
  */
 @ConfigurationProperties("tables.postcommit")
 @Getter
 @Setter
 public class PostCommitProperties {
 
-  /** Master switch. When {@code false}, no operations are dispatched on commit. */
+  /** Master switch. When false, no operations are dispatched on commit. */
   private boolean enabled = false;
 
   /**
-   * Wall-clock ceiling applied by the dispatcher to each operation's {@link
-   * PostCommitOperation#prepare(com.linkedin.openhouse.tables.model.TableDto)} {@code Mono}. Bounds
-   * how long a misbehaving operation can keep resources (connections, threads) tied up. Does not
-   * bound commit-thread latency — operations run on the executing reactive scheduler, not on the
-   * commit thread. Default 3000 ms.
+   * Wall-clock ceiling applied by the dispatcher to each operation's prepared {@code Mono}. Bounds
+   * resource occupancy (connections, threads) but does not block the commit thread. Default 3000
+   * ms.
    */
   private long perOpTimeoutMs = 3000L;
 }

From 1c7fbd79cdf28c511ac90d8122f7a78ba3241725 Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 27 May 2026 14:57:49 -0700
Subject: [PATCH 12/14] docs: switch to // comments with line breaks, drop
 block javadoc

Spotless's googlejavaformat rewrites multi-paragraph block javadoc to
re-insert <p> tags. // comments are left untouched. Per review
feedback that the source should read like prose with paragraph
breaks, not like a webpage, swap all multi-paragraph descriptions on
the new files to // blocks. Single-line descriptors that were javadoc
become a single // line above the declaration. Complete sentences
throughout. No behavior change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../tables/model/CurrentSnapshotInfo.java     | 25 +++++----
 .../openhouse/tables/model/TableDto.java      | 15 +++---
 .../optimizer/OptimizerStatsConfig.java       | 25 ++++-----
 .../OptimizerStatsPostCommitOperation.java    | 53 ++++++++++---------
 .../optimizer/OptimizerStatsProperties.java   | 36 ++++++-------
 .../optimizer/OptimizerStatsRequest.java      | 26 ++++-----
 .../postcommit/PostCommitDispatcher.java      | 46 ++++++++--------
 .../postcommit/PostCommitOperation.java       | 27 +++++-----
 .../postcommit/PostCommitProperties.java      | 23 ++++----
 9 files changed, 136 insertions(+), 140 deletions(-)

diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java
index 74644ac3f..90437d418 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/CurrentSnapshotInfo.java
@@ -6,26 +6,25 @@
 import lombok.EqualsAndHashCode;
 import lombok.Getter;
 
-/**
- * In-memory snapshot of the Iceberg current-snapshot metadata loaded alongside a {@link TableDto}.
- * Carries only fields already materialized by the catalog client — never triggers a separate HDFS
- * or object-store read. Present whenever the table has at least one committed snapshot at
- * construction time; absent (modeled as {@link java.util.Optional#empty()} on {@link TableDto}) for
- * tables with no committed data, e.g. a {@code CREATE TABLE} with no rows yet.
- */
+// In-memory snapshot of the Iceberg current-snapshot metadata that was loaded alongside a
+// TableDto.
+//
+// This carries only fields already materialized by the catalog client. Constructing it never
+// triggers a separate HDFS or object-store read.
+//
+// The value is present whenever the underlying table has at least one committed snapshot at
+// construction time. It is absent (modeled as Optional.empty() on TableDto) for tables with no
+// committed data, such as a CREATE TABLE with no rows yet.
 @Getter
 @Builder
 @AllArgsConstructor
 @EqualsAndHashCode
 public class CurrentSnapshotInfo {
 
-  /** Iceberg snapshot ID (decimal long). Stable per commit; usable as an idempotency token. */
+  // Iceberg snapshot ID. Stable per commit; usable as an idempotency token.
   private final long snapshotId;
 
-  /**
-   * Iceberg {@code Snapshot.summary()} map, unmodified. Keys include {@code total-data-files},
-   * {@code total-files-size}, {@code added-data-files}, {@code deleted-data-files}, {@code
-   * added-files-size}, {@code removed-files-size}.
-   */
+  // Iceberg Snapshot.summary() map, unmodified. Keys include total-data-files, total-files-size,
+  // added-data-files, deleted-data-files, added-files-size, removed-files-size.
   private final Map<String, String> summary;
 }
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
index bd44653e7..59dc2e80f 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/model/TableDto.java
@@ -85,18 +85,19 @@ public class TableDto {
 
   private boolean replaceCommit;
 
-  /**
-   * In-memory current-snapshot metadata captured when this {@code TableDto} was built from an
-   * Iceberg {@code Table}. Present whenever the underlying table has at least one committed
-   * snapshot at that point; absent for tables with no committed data (e.g. {@code CREATE TABLE}
-   * with no rows). Not persisted, not part of equality. Read through {@link #getCurrentSnapshot()}.
-   */
+  // In-memory current-snapshot metadata captured when this TableDto was built from an Iceberg
+  // Table.
+  //
+  // Present whenever the underlying table has at least one committed snapshot at that point.
+  // Absent for tables with no committed data, such as a CREATE TABLE with no rows yet.
+  //
+  // Not persisted and not part of equality. Read through getCurrentSnapshot().
   @Getter(AccessLevel.NONE)
   @Transient
   @EqualsAndHashCode.Exclude
   private CurrentSnapshotInfo currentSnapshot;
 
-  /** Returns the current-snapshot metadata if any, else {@link Optional#empty()}. */
+  // Returns the current-snapshot metadata if any, else Optional.empty().
   public Optional<CurrentSnapshotInfo> getCurrentSnapshot() {
     return Optional.ofNullable(currentSnapshot);
   }
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
index 351ac604d..47613f98c 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsConfig.java
@@ -8,23 +8,24 @@
 import org.springframework.web.reactive.function.client.WebClient;
 import reactor.netty.http.client.HttpClient;
 
-/**
- * Wiring for the post-commit Tables → Optimizer stats push. Active only when {@code
- * optimizer.stats.enabled=true}; otherwise no WebClient bean is constructed and the dispatcher sees
- * no optimizer-stats operation.
- */
+// Wiring for the post-commit Tables-to-Optimizer stats push.
+//
+// This configuration is active only when optimizer.stats.enabled=true. When the flag is false,
+// no WebClient bean is constructed, and the dispatcher sees no optimizer-stats operation.
 @Configuration
 @EnableConfigurationProperties(OptimizerStatsProperties.class)
 @ConditionalOnProperty(prefix = "optimizer.stats", name = "enabled", havingValue = "true")
 public class OptimizerStatsConfig {
 
-  /**
-   * Dedicated WebClient for the optimizer stats endpoint. Per-attempt timeout is applied on the
-   * Reactor chain in {@link OptimizerStatsPostCommitOperation} and the outer per-op timeout in
-   * {@code PostCommitDispatcher}; neither is configured on the Netty client so the timeout always
-   * emerges as a standard {@link java.util.concurrent.TimeoutException} rather than a Netty {@code
-   * ReadTimeoutException}, keeping the dispatcher's outcome classification simple.
-   */
+  // Dedicated WebClient for the optimizer stats endpoint.
+  //
+  // The per-attempt timeout is applied on the Reactor chain in OptimizerStatsPostCommitOperation,
+  // and the outer per-op timeout is applied by PostCommitDispatcher. Neither timeout is
+  // configured on the underlying Netty client.
+  //
+  // This arrangement ensures that any timeout always emerges as a standard
+  // java.util.concurrent.TimeoutException rather than a Netty ReadTimeoutException, which keeps
+  // the dispatcher's outcome classification simple.
   @Bean("optimizerStatsWebClient")
   public WebClient optimizerStatsWebClient(OptimizerStatsProperties properties) {
     return WebClient.builder()
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java
index eb858dece..756da2d10 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsPostCommitOperation.java
@@ -18,38 +18,38 @@
 import reactor.util.retry.Retry;
 import reactor.util.retry.RetrySpec;
 
-/**
- * {@link PostCommitOperation} that PUTs a snapshot-stats record to the optimizer's per-table stats
- * endpoint. {@link #prepare(TableDto)} returns a {@link Mono} that completes on HTTP 2xx and
- * signals an error otherwise; the dispatcher owns timeout, subscription, error swallowing, and
- * metric emission. Returns {@link Optional#empty()} when the table is not opted in via {@link
- * #OPT_IN_PROPERTY} or has no current snapshot. Internal retry is bounded by {@link
- * OptimizerStatsProperties#getMaxAttempts()} and fires only on retryable errors (network, {@link
- * TimeoutException}, HTTP 408 / 429 / 5xx); the dispatcher's per-op timeout is the hard ceiling.
- * Bean wired only when {@code optimizer.stats.enabled=true}. Path constant is intentionally
- * duplicated from {@code TableStatsController.TABLE_PATH_TEMPLATE} (keep in sync) so the tables
- * service does not take a compile-time dependency on the optimizer jar.
- */
+// A PostCommitOperation that PUTs a snapshot-stats record to the optimizer's per-table stats
+// endpoint.
+//
+// prepare() returns a Mono that completes on HTTP 2xx and signals an error otherwise. The
+// dispatcher owns the timeout, the subscription, error swallowing, and metric emission. The
+// returned value is Optional.empty() when the table is not opted in via OPT_IN_PROPERTY or has
+// no current snapshot.
+//
+// Internal retry is bounded by OptimizerStatsProperties.maxAttempts and fires only on retryable
+// errors: network failures, a TimeoutException, or HTTP 408 / 429 / 5xx responses. The
+// dispatcher's per-op timeout is the hard ceiling on the whole chain.
+//
+// The bean is wired only when optimizer.stats.enabled=true. PATH_TEMPLATE is intentionally
+// duplicated from TableStatsController.TABLE_PATH_TEMPLATE so the tables service does not take a
+// compile-time dependency on the optimizer jar. Keep both copies in sync.
 @Slf4j
 @Component
 @ConditionalOnProperty(prefix = "optimizer.stats", name = "enabled", havingValue = "true")
 public class OptimizerStatsPostCommitOperation implements PostCommitOperation {
 
-  /** Metric tag value for {@code op}. */
+  // Metric tag value for the "op" tag.
   static final String OP_NAME = "optimizer_stats";
 
-  /**
-   * Per-call URL path. Intentionally duplicated from {@code
-   * TableStatsController.TABLE_PATH_TEMPLATE}; keep in sync.
-   */
+  // Per-call URL path. Intentionally duplicated from TableStatsController.TABLE_PATH_TEMPLATE so
+  // that we avoid a compile-time dependency on the optimizer jar. Keep both copies in sync.
   static final String PATH_TEMPLATE = "/v1/optimizer/stats/{tableUuid}";
 
-  /** Table-property key that opts a table in for the post-commit stats push. */
+  // Table-property key that opts a table in for the post-commit stats push.
   static final String OPT_IN_PROPERTY = "maintenance.optimizer.stats.enabled";
 
-  /** Iceberg snapshot-summary keys we read. All values are decimal-string longs. */
+  // Iceberg snapshot-summary keys we read. All values are decimal-string longs.
   static final String SUMMARY_TOTAL_DATA_FILES = "total-data-files";
-
   static final String SUMMARY_TOTAL_FILES_SIZE = "total-files-size";
   static final String SUMMARY_ADDED_DATA_FILES = "added-data-files";
   static final String SUMMARY_DELETED_DATA_FILES = "deleted-data-files";
@@ -100,13 +100,14 @@ public Optional<Mono<Void>> prepare(TableDto savedDto) {
     return Optional.of(chain);
   }
 
-  /** {@code true} iff the table's properties contain the literal opt-in value {@code "true"}. */
+  // Returns true when the table's properties contain the literal opt-in value "true" for the
+  // configured OPT_IN_PROPERTY.
   private boolean isOptedIn(TableDto saved) {
     Map<String, String> props = saved.getTableProperties();
     return props != null && "true".equals(props.get(OPT_IN_PROPERTY));
   }
 
-  /** Build the wire body. Missing summary keys default to 0L. */
+  // Builds the wire body. Missing summary keys default to 0L.
   OptimizerStatsRequest buildRequest(TableDto saved, CurrentSnapshotInfo snapshot) {
     Map<String, String> summary = snapshot.getSummary();
     OptimizerStatsRequest.Snapshot snapshotPayload =
@@ -132,10 +133,10 @@ OptimizerStatsRequest buildRequest(TableDto saved, CurrentSnapshotInfo snapshot)
         .build();
   }
 
-  /**
-   * Retryable errors: per-attempt timeout, network-level failures, 5xx, 408, 429. Other 4xx are
-   * client errors — retrying won't fix them.
-   */
+  // Returns true for errors that a later attempt could plausibly succeed against: a per-attempt
+  // timeout, a network-level failure, or an HTTP 5xx / 408 / 429 response.
+  //
+  // Other 4xx responses are client errors that retries cannot fix, so we fail fast on them.
   boolean isRetryable(Throwable e) {
     if (e instanceof TimeoutException) {
       return true;
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
index ca2b78a2d..877ce3423 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsProperties.java
@@ -4,36 +4,32 @@
 import lombok.Setter;
 import org.springframework.boot.context.properties.ConfigurationProperties;
 
-/**
- * Configuration for the post-commit stats push from the Tables Service to the Optimizer's stats
- * endpoint. Property prefix {@code optimizer.stats}. When {@code enabled} is false (the default),
- * no client bean is constructed and the operation is absent from the dispatcher. Environments that
- * want the feed turn it on and set {@link #baseUri}.
- */
+// Configuration for the post-commit stats push from the Tables Service to the Optimizer's stats
+// endpoint. Property prefix is optimizer.stats.
+//
+// When enabled is false (the default), no client bean is constructed, and the operation is
+// absent from the dispatcher. Environments that want the feed turn it on and set baseUri.
 @ConfigurationProperties("optimizer.stats")
 @Getter
 @Setter
 public class OptimizerStatsProperties {
 
-  /** Master switch. When {@code false}, no HTTP push is attempted and no client bean is wired. */
+  // Master switch. When false, no HTTP push is attempted and no client bean is wired.
   private boolean enabled = false;
 
-  /**
-   * Base URI of the Optimizer service. Path {@code /v1/optimizer/stats/{tableUuid}} is appended at
-   * call time. No default — required when {@link #enabled} is {@code true}.
-   */
+  // Base URI of the Optimizer service. The path /v1/optimizer/stats/{tableUuid} is appended at
+  // call time. No default; required when enabled is true.
   private String baseUri;
 
-  /**
-   * Per-attempt HTTP timeout in milliseconds. Bounds each individual attempt so retries can fit
-   * inside the dispatcher's outer per-op budget ({@code tables.postcommit.per-op-timeout-ms},
-   * default 3000 ms). Default 1000 ms.
-   */
+  // Per-attempt HTTP timeout in milliseconds.
+  //
+  // Bounds each individual attempt so retries can fit inside the dispatcher's outer per-op budget
+  // (tables.postcommit.per-op-timeout-ms, default 3000 ms). Default is 1000 ms.
   private long perAttemptTimeoutMs = 1000L;
 
-  /**
-   * Total attempt count (initial try plus retries). Default 3. Retries fire only on retryable
-   * errors (network, timeout, 408/429/5xx); other 4xx fail fast without retry.
-   */
+  // Total attempt count (the initial try plus retries). Default is 3.
+  //
+  // Retries fire only on retryable errors: network failures, a timeout, or an HTTP 408 / 429 /
+  // 5xx response. Other 4xx responses fail fast without retry.
   private int maxAttempts = 3;
 }
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
index 2496436d3..8239184ec 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
@@ -7,11 +7,11 @@
 import lombok.Data;
 import lombok.NoArgsConstructor;
 
-/**
- * Wire body for {@code PUT /v1/optimizer/stats/{tableUuid}}. Mirrors the optimizer's {@code
- * UpsertTableStatsRequest} field-for-field. Duplicated here so the tables service does not take a
- * compile-time dependency on the optimizer jar; keep in sync with that type.
- */
+// Wire body for PUT /v1/optimizer/stats/{tableUuid}.
+//
+// This mirrors the optimizer's UpsertTableStatsRequest field-for-field. The type is duplicated
+// here so that the tables service does not take a compile-time dependency on the optimizer jar.
+// Keep both copies in sync.
 @Data
 @Builder
 @NoArgsConstructor
@@ -34,20 +34,16 @@ public static class Stats {
     private Delta delta;
   }
 
-  /**
-   * Point-in-time snapshot metrics. Map to optimizer's {@code
-   * TableStatsPayload.SnapshotMetricsDto}.
-   */
+  // Point-in-time snapshot metrics. Maps to the optimizer's
+  // TableStatsPayload.SnapshotMetricsDto.
   @Data
   @Builder
   @NoArgsConstructor
   @AllArgsConstructor
   @JsonInclude(JsonInclude.Include.NON_NULL)
   public static class Snapshot {
-    /**
-     * Iceberg snapshot ID. Sent so the optimizer can use it as an idempotency token on upsert and
-     * reject out-of-order replays — server-side wiring tracked in BDP-102985.
-     */
+    // Iceberg snapshot ID. Sent so the optimizer can use it as an idempotency token on upsert and
+    // reject out-of-order replays. Server-side wiring is tracked in BDP-102985.
     private Long snapshotId;
 
     private String tableVersion;
@@ -56,9 +52,7 @@ public static class Snapshot {
     private Long numCurrentFiles;
   }
 
-  /**
-   * Per-commit incremental counters. Map to optimizer's {@code TableStatsPayload.CommitDeltaDto}.
-   */
+  // Per-commit incremental counters. Maps to the optimizer's TableStatsPayload.CommitDeltaDto.
   @Data
   @Builder
   @NoArgsConstructor
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java
index 44c8a166b..5a04e2df7 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitDispatcher.java
@@ -16,14 +16,16 @@
 import org.springframework.web.reactive.function.client.WebClientResponseException;
 import reactor.core.publisher.Mono;
 
-/**
- * Runs {@link PostCommitOperation}s after a successful Iceberg commit. Each operation gets a
- * wall-clock timeout ({@code tables.postcommit.per-op-timeout-ms}), errors are swallowed after
- * metric/log, and dispatch is fire-and-forget so the commit thread is never blocked. Operations
- * describe payload and endpoint only — the timeout/swallow/subscribe machinery lives here so the
- * contract across operations is uniform. Bean wired only when {@code
- * tables.postcommit.enabled=true}.
- */
+// Runs PostCommitOperations after a successful Iceberg commit.
+//
+// Each operation receives a wall-clock timeout from tables.postcommit.per-op-timeout-ms. Any
+// error the operation signals is recorded as a metric and a log line, then swallowed. Dispatch
+// is fire-and-forget, so the commit thread is never blocked on operation work.
+//
+// Operations describe payload and endpoint only. The timeout, error swallowing, subscription,
+// and metric emission all live here, so the contract across operations stays uniform.
+//
+// The bean is constructed only when tables.postcommit.enabled=true.
 @Slf4j
 @Component
 @EnableConfigurationProperties(PostCommitProperties.class)
@@ -43,22 +45,24 @@ public PostCommitDispatcher(
     this.meterRegistry = meterRegistry;
   }
 
-  /**
-   * Dispatch all registered operations for {@code savedDto}. Returns immediately on the calling
-   * thread; each operation runs on its underlying reactive scheduler. Never throws.
-   */
+  // Dispatches all registered operations for savedDto.
+  //
+  // Returns immediately on the calling thread. Each operation runs on its underlying reactive
+  // scheduler. This method never throws.
   public void dispatch(TableDto savedDto) {
     for (PostCommitOperation op : operations) {
       decorate(op, savedDto).ifPresent(Mono::subscribe);
     }
   }
 
-  /**
-   * Returns the fully-decorated {@link Mono} for {@code op} (per-op timeout, success/error metric
-   * emission, error swallow) without subscribing. Emits {@code skipped} or {@code prepare_threw}
-   * synchronously and returns {@link Optional#empty()} in those cases. Package-private so tests can
-   * {@code .block()} on the chain rather than poll for metric emission.
-   */
+  // Returns the fully-decorated Mono for op without subscribing to it.
+  //
+  // The decoration applies the per-op timeout, records the success or error metric, and swallows
+  // any error. When the operation does not apply (or its prepare() throws synchronously), this
+  // method emits the "skipped" or "prepare_threw" metric and returns Optional.empty().
+  //
+  // Package-private so that tests can .block() on the chain rather than poll for metric emission
+  // after a fire-and-forget subscription.
   Optional<Mono<Void>> decorate(PostCommitOperation op, TableDto savedDto) {
     Optional<Mono<Void>> work;
     try {
@@ -109,10 +113,8 @@ Optional<Mono<Void>> decorate(PostCommitOperation op, TableDto savedDto) {
     return Optional.of(decorated);
   }
 
-  /**
-   * Map a terminal error to a small set of outcome tags. Kept here so all operations share the same
-   * taxonomy.
-   */
+  // Maps a terminal error to a small set of outcome tags. The classifier lives here so that all
+  // operations share the same taxonomy.
   private static String classifyOutcome(Throwable e) {
     if (e instanceof TimeoutException) {
       return "timeout";
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java
index b94804b34..6cf99cd7e 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitOperation.java
@@ -4,22 +4,23 @@
 import java.util.Optional;
 import reactor.core.publisher.Mono;
 
-/**
- * A best-effort, bounded, asynchronous action run by the Tables Service after a successful Iceberg
- * commit. Operations are invoked by {@link PostCommitDispatcher}, which owns the per-op timeout,
- * subscription, error swallowing, and metric emission. An error from prepare() is recorded and
- * dropped — it never affects commit correctness. Implementations may apply internal retries with
- * bounded budgets but must not subscribe themselves or apply an outer timeout.
- */
+// A best-effort, bounded, asynchronous action that the Tables Service runs after a successful
+// Iceberg commit.
+//
+// Operations are invoked by PostCommitDispatcher, which owns the per-op timeout, the
+// subscription, error swallowing, and metric emission. An error signaled from prepare() is
+// recorded and dropped, so it never affects commit correctness.
+//
+// Implementations describe what to push and where. They may apply internal retries within a
+// bounded budget. They must not subscribe themselves or apply an outer timeout, because the
+// dispatcher already owns both concerns.
 public interface PostCommitOperation {
 
-  /** Short, stable identifier for this operation. Used as the {@code op} metric tag. */
+  // Returns a short, stable identifier for this operation. Used as the "op" metric tag.
   String name();
 
-  /**
-   * Build the work to run for {@code savedDto}, or {@link Optional#empty()} when the operation does
-   * not apply (e.g. table not opted in, no committed snapshot). The dispatcher records empty as a
-   * {@code skipped} metric.
-   */
+  // Builds the work to run for savedDto, or returns Optional.empty() when the operation does not
+  // apply to this commit (for example, when the table is not opted in or has no committed
+  // snapshot). The dispatcher records an empty return as a "skipped" metric.
   Optional<Mono<Void>> prepare(TableDto savedDto);
 }
diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java
index bc757d37f..98e8b02eb 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/postcommit/PostCommitProperties.java
@@ -4,23 +4,24 @@
 import lombok.Setter;
 import org.springframework.boot.context.properties.ConfigurationProperties;
 
-/**
- * Configuration for the Tables Service post-commit operation framework. Property prefix {@code
- * tables.postcommit}. When {@code enabled} is false (the default), no dispatcher bean is
- * constructed and registered {@link PostCommitOperation} beans are never invoked.
- */
+// Configuration for the Tables Service post-commit operation framework. Property prefix is
+// tables.postcommit.
+//
+// When enabled is false (the default), the dispatcher bean is not constructed, and any
+// registered PostCommitOperation beans are never invoked.
 @ConfigurationProperties("tables.postcommit")
 @Getter
 @Setter
 public class PostCommitProperties {
 
-  /** Master switch. When false, no operations are dispatched on commit. */
+  // Master switch. When false, no operations are dispatched on commit.
   private boolean enabled = false;
 
-  /**
-   * Wall-clock ceiling applied by the dispatcher to each operation's prepared {@code Mono}. Bounds
-   * resource occupancy (connections, threads) but does not block the commit thread. Default 3000
-   * ms.
-   */
+  // Wall-clock ceiling that the dispatcher applies to each operation's prepared Mono.
+  //
+  // This bounds resource occupancy (connections, threads) for a misbehaving operation. It does
+  // not block the commit thread, because operations run on the underlying reactive scheduler.
+  //
+  // Default is 3000 ms.
   private long perOpTimeoutMs = 3000L;
 }

From ecc65f8ff9bed21165f9f10002138af752bcee7e Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 27 May 2026 15:43:21 -0700
Subject: [PATCH 13/14] docs: drop internal ticket reference from
 OptimizerStatsRequest comment

Server-side idempotency wiring is a follow-up; the comment now states
that without naming an internal tracker.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../tables/services/optimizer/OptimizerStatsRequest.java        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
index 8239184ec..2cb2b2873 100644
--- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
+++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/optimizer/OptimizerStatsRequest.java
@@ -43,7 +43,7 @@ public static class Stats {
   @JsonInclude(JsonInclude.Include.NON_NULL)
   public static class Snapshot {
     // Iceberg snapshot ID. Sent so the optimizer can use it as an idempotency token on upsert and
-    // reject out-of-order replays. Server-side wiring is tracked in BDP-102985.
+    // reject out-of-order replays.
     private Long snapshotId;
 
     private String tableVersion;

From f31fea01b6ada5855baba88b449ff85723b2603b Mon Sep 17 00:00:00 2001
From: mkuchenbecker <mkuchenbecker@users.noreply.github.com>
Date: Wed, 27 May 2026 17:53:34 -0700
Subject: [PATCH 14/14] test(optimizer-5): wire end-to-end demo to the new
 post-commit framework

The on-commit stats hook used to be gated by cluster.optimizer.base-uri
and pushed a minimal payload (databaseName/tableName/properties only).
The merge replaces it with the PostCommitOperation framework, which is
opt-in per table and pushes the full snapshot summary + snapshot id.

Update the demo to exercise the new path:

  * docker-compose: set TABLES_POSTCOMMIT_ENABLED=true,
    OPTIMIZER_STATS_ENABLED=true, OPTIMIZER_STATS_BASE_URI (replaces
    CLUSTER_OPTIMIZER_BASE_URI, which is no longer read by anything).
  * CREATE TABLE in demo-setup.sh now sets
    maintenance.optimizer.stats.enabled=true alongside the OFD flag,
    so each demo table is opted in to the stats push.
  * Stats verification is now a bounded poll (30s) because the push is
    async fire-and-forget. After settling, the check also asserts the
    new payload shape: a sample table's stats row carries non-zero
    numCurrentFiles, non-zero tableSizeBytes, and the snapshotId.

End result: the demo still passes if and only if every committed table
generated a stats row, and the row actually contains snapshot stats
(not just identity fields).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 demo-setup.sh                                 | 40 +++++++++++++++++--
 .../oh-hadoop-spark/docker-compose.yml        |  5 ++-
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/demo-setup.sh b/demo-setup.sh
index b39116a4d..6fc5491f7 100755
--- a/demo-setup.sh
+++ b/demo-setup.sh
@@ -69,7 +69,10 @@ for entry in $TABLES; do
   "$SCRIPT_DIR/local-spark-sql.sh" "DROP TABLE IF EXISTS openhouse.db1.$TABLE" > /dev/null
   "$SCRIPT_DIR/local-spark-sql.sh" "CREATE TABLE openhouse.db1.$TABLE (
     id STRING, val STRING
-  ) TBLPROPERTIES ('maintenance.optimizer.ofd.enabled'='true')" > /dev/null
+  ) TBLPROPERTIES (
+    'maintenance.optimizer.ofd.enabled'='true',
+    'maintenance.optimizer.stats.enabled'='true'
+  )" > /dev/null
 
   for i in $(seq 1 "$WRITES"); do
     "$SCRIPT_DIR/local-spark-sql.sh" \
@@ -88,11 +91,42 @@ for entry in $TABLES; do
   echo "$TABLE=$TABLE_UUID" >> /tmp/demo_ofd_uuids.txt
 done
 
-STATS_COUNT=$(curl -sf "$OPT_API/v1/optimizer/stats?limit=100" | jq 'length')
+# Stats push is async fire-and-forget; the dispatcher subscribes on the Netty event
+# loop after the commit thread returns. Poll briefly for the rows to settle so we
+# don't false-fail before the loop wakes up.
+echo ""
+echo "=== [Wait] Tables on-commit stats push to land in optimizer DB ==="
+STATS_WAIT_SECS=30
+i=0
+STATS_COUNT=0
+while [ $i -lt "$STATS_WAIT_SECS" ]; do
+  STATS_COUNT=$(curl -sf "$OPT_API/v1/optimizer/stats?limit=100" | jq 'length')
+  [ "$STATS_COUNT" -ge "$TABLE_COUNT" ] && break
+  printf "  stats rows: %d/%d (waited %ds)\r" "$STATS_COUNT" "$TABLE_COUNT" "$i"
+  sleep 2
+  i=$((i + 2))
+done
+echo ""
 [ "$STATS_COUNT" -ge "$TABLE_COUNT" ] \
-  || { echo "FAIL: expected $TABLE_COUNT stats rows, got $STATS_COUNT"; exit 1; }
+  || { echo "FAIL: expected $TABLE_COUNT stats rows, got $STATS_COUNT after ${STATS_WAIT_SECS}s"; exit 1; }
 echo "PASS: $STATS_COUNT stats rows posted by Tables Service on-commit hook"
 
+# Verify the new payload shape carries snapshot stats, not just identity fields.
+# Pick one table and confirm the optimizer recorded a non-zero numCurrentFiles, a
+# non-zero tableSizeBytes, and the snapshot ID we sent.
+SAMPLE_UUID=$(head -n1 /tmp/demo_ofd_uuids.txt | cut -d= -f2)
+SAMPLE_ROW=$(curl -sf "$OPT_API/v1/optimizer/stats/$SAMPLE_UUID")
+SAMPLE_NUM_FILES=$(echo "$SAMPLE_ROW" | jq -r '.stats.snapshot.numCurrentFiles // 0')
+SAMPLE_SIZE=$(echo "$SAMPLE_ROW" | jq -r '.stats.snapshot.tableSizeBytes // 0')
+SAMPLE_SNAPSHOT_ID=$(echo "$SAMPLE_ROW" | jq -r '.stats.snapshot.snapshotId // empty')
+[ "$SAMPLE_NUM_FILES" -gt 0 ] \
+  || { echo "FAIL: sample stats row has numCurrentFiles=$SAMPLE_NUM_FILES, expected > 0"; exit 1; }
+[ "$SAMPLE_SIZE" -gt 0 ] \
+  || { echo "FAIL: sample stats row has tableSizeBytes=$SAMPLE_SIZE, expected > 0"; exit 1; }
+[ -n "$SAMPLE_SNAPSHOT_ID" ] \
+  || { echo "FAIL: sample stats row missing snapshotId"; exit 1; }
+echo "PASS: sample stats row carries snapshot payload (files=$SAMPLE_NUM_FILES, bytes=$SAMPLE_SIZE, snapshotId=$SAMPLE_SNAPSHOT_ID)"
+
 echo ""
 echo "=== [2/4] Expire old snapshots (creates orphan data files) ==="
 kill_idle_session
diff --git a/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml b/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml
index ec404ae81..bcf4787bf 100644
--- a/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml
+++ b/infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml
@@ -22,7 +22,10 @@ services:
       - OTEL_METRICS_EXPORTER=none
       - OTEL_LOGS_EXPORTER=none
       - OTEL_TRACES_SAMPLER=always_on
-      - CLUSTER_OPTIMIZER_BASE_URI=http://openhouse-optimizer:8080
+      # Post-commit operation framework: enable the dispatcher and the optimizer-stats op.
+      - TABLES_POSTCOMMIT_ENABLED=true
+      - OPTIMIZER_STATS_ENABLED=true
+      - OPTIMIZER_STATS_BASE_URI=http://openhouse-optimizer:8080
 
   openhouse-jobs:
     container_name: local.openhouse-jobs