From e0f4458190fafd710a5b655cab49eb0d2a0b5c78 Mon Sep 17 00:00:00 2001
From: bnsoni <bhargavsoni8@gmail.com>
Date: Wed, 29 Apr 2026 22:38:56 +0300
Subject: [PATCH] fix(ibl_edx): layout-agnostic LMS readiness probe + explicit
 migration check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `Wait for LMS to be ready` task in the `ibl_edx` ansible role has been
the silent failure point of every fresh-bootstrap deployment that takes
the non-AMI compose path. Two compounding bugs land the operator at the
same misleading "wait timed out after 40 retries" message after 10 minutes:

1. The probe target — `http://localhost:8600/heartbeat` — only exists on
   tutor's published-port layout. On bootstrap paths that render a
   Caddy-fronted layout (LMS reachable only via a Host header on the
   local reverse proxy), `:8600` is never bound on the host. The wait
   times out even though LMS is healthy on a different port.

2. The launch step has `ignore_errors: true`. When `ibl tutor local
   launch -I` silently skips its `tutor local do init` migration step,
   the openedx schema stays empty, LMS crash-loops on
   `ProgrammingError: Table 'openedx.<table>' doesn't exist`, and the
   wait task — not the launch task — is the one that fails. The actual
   error gets buried 10 minutes deep in retry noise.

This commit makes the role self-diagnose both failure modes:

- Replace the `:8600` probe with a layout-agnostic
  `--resolve learn.{{ base_domain }}:80:127.0.0.1` probe through the
  host nginx (always on `:80`, routes by Host to whichever upstream the
  deployment rendered — direct LMS, Caddy, etc.). Works uniformly on
  every bootstrap path.

- Add a dedicated `Verify openedx migrations ran` task between launch
  and the readiness wait. Polls for `waffle_flag` (created early in the
  migration sequence) and fails in ≤3 minutes with an actionable
  recovery message ("SSH in and run `ibl tutor local do init`, OR
  `docker compose down -v` and re-run `ibl edx launch`") if the
  schema is empty.

- Move `RestartCount` sampling INTO the readiness loop. A real LMS
  crash loop now returns rc=2 from the second iteration (~30s) with the
  last error from `docker logs` in stderr, instead of consuming the
  full 40-retry budget. rc=1 (not yet ready) remains the normal retry
  signal.

Net: an empty-schema LMS now surfaces in ~3 min with the right diagnosis;
a crashing LMS surfaces in ~30s with the actual log error; a healthy
LMS reaches ready on every layout that has a working host nginx.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../roles/ibl_edx/tasks/main.yml              | 83 ++++++++++++++++---
 1 file changed, 70 insertions(+), 13 deletions(-)
diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml
index 9e30866..8d7ce9f 100644
--- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml
+++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml
@@ -50,28 +50,85 @@
       Check 'ibl edx launch' output above for errors.
   when: edx_containers.stdout | trim | length == 0
 
-- name: Wait for LMS to be ready
+# Verify the openedx schema actually got migrated by `ibl edx launch`.
+#
+# `ibl edx launch` calls `ibl tutor local launch -I`, which is supposed to
+# run `tutor local do init` (and therefore the openedx migrations). On
+# certain bootstrap paths the init step has been observed to silently skip
+# migrations, leaving the LMS container in a permanent crash loop with
+# `(1146, "Table 'openedx.<table>' doesn't exist")`. Catch it directly:
+# poll ~3 minutes for `waffle_flag` (a table created early in the migration
+# sequence). Ansible marks a task failed once its `until` retries run out,
+# regardless of `failed_when`, so `ignore_errors: true` is required here:
+# it lets the play continue to the next task, which turns the non-zero
+# `openedx_migrations_ok.rc` into an actionable failure message.
+- name: Verify openedx migrations ran (waffle_flag must exist)
   become: false
   shell: |
-    curl -s -o /dev/null -w '%{http_code}' http://localhost:8600/heartbeat 2>/dev/null | grep -q '200'
+    set -o pipefail
+    PW=$(docker exec ibl_prod_mysql printenv MYSQL_ROOT_PASSWORD 2>/dev/null)
+    docker exec ibl_prod_mysql mysql -uroot -p"$PW" \
+      -e "SHOW TABLES LIKE 'waffle_flag';" openedx 2>/dev/null \
+      | grep -q waffle_flag
   args:
     executable: /bin/bash
-  register: lms_health
-  until: lms_health.rc == 0
-  retries: 40
-  delay: 15
+  register: openedx_migrations_ok
+  retries: 6
+  delay: 30
+  until: openedx_migrations_ok.rc == 0
+  ignore_errors: true
 
-- name: Check LMS container is not crash-looping
+- name: Fail with a clear message if openedx migrations didn't run
+  when: openedx_migrations_ok.rc | default(1) | int != 0
+  fail:
+    msg: >-
+      `ibl edx launch` returned but `openedx.waffle_flag` does not exist —
+      the `tutor local do init` step did not complete migrations. SSH in
+      and run `ibl tutor local do init` manually, OR `docker compose down -v`
+      from `{{ ibl_root }}/app/ibl-edx/ibl-edx-pro/env/local/` and re-run
+      `ibl edx launch`.
+
+# LMS readiness probe — layout-agnostic. The earlier version polled
+# `http://localhost:8600/heartbeat` directly, which only exists when edX
+# is running with tutor's published-port layout. Some bootstrap paths
+# render a Caddy-fronted layout where LMS is reachable only via a Host
+# header on the local reverse proxy, NOT on `:8600`. Probing through the
+# host nginx (always on `:80`) with a `--resolve` to `127.0.0.1` works
+# uniformly: nginx routes by Host to whichever upstream was rendered.
+#
+# Script exit codes: 0 = heartbeat returned 200, 1 = not ready yet (the
+# only code that is retried), 2 = RestartCount > 5, i.e. a crash loop.
+# `until` must accept 2 as well as 0, otherwise Ansible keeps retrying a
+# crash-looping LMS for the full budget. `failed_when: false` hands rc=2
+# to the next task for diagnosis; exhausted retries still fail the task.
+- name: Wait for LMS to be ready (with crash-loop early-exit)
   become: false
   shell: |
-    docker inspect --format '{{ '{{' }}.RestartCount{{ '}}' }}' ibl_prod_lms
+    set +e
+    HTTP=$(curl -sk --resolve learn.{{ base_domain }}:80:127.0.0.1 \
+      -o /dev/null -w '%{http_code}' --max-time 5 \
+      http://learn.{{ base_domain }}/heartbeat 2>/dev/null)
+    RESTARTS=$(docker inspect --format '{{ '{{' }}.RestartCount{{ '}}' }}' ibl_prod_lms 2>/dev/null || echo 0)
+    if [ "$RESTARTS" -gt 5 ]; then
+      LAST=$(docker logs --tail 30 ibl_prod_lms 2>&1 \
+        | grep -Ei 'error|exception|traceback|fatal' \
+        | tail -3 | head -c 600)
+      echo "LMS_CRASHLOOP RestartCount=$RESTARTS last_error: $LAST" >&2
+      exit 2
+    fi
+    [ "$HTTP" = "200" ] && exit 0
+    exit 1
   args:
     executable: /bin/bash
-  register: lms_restart_count
+  register: lms_health
+  retries: 40
+  delay: 15
+  until: lms_health.rc in [0, 2]
+  failed_when: false
 
-- name: Fail if LMS is crash-looping
+- name: Fail with crash-loop diagnosis if LMS never stabilized
+  when: lms_health.rc is defined and lms_health.rc == 2
   fail:
     msg: >-
-      ibl_prod_lms has restarted {{ lms_restart_count.stdout | trim }} times.
-      Run 'docker logs ibl_prod_lms' on the server to diagnose.
-  when: lms_restart_count.stdout | trim | int > 3
+      ibl_prod_lms is in a crash loop (RestartCount > 5). Last error
+      from the container: {{ lms_health.stderr | default('see docker logs ibl_prod_lms on the server') }}