diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml
index 9e30866..8d7ce9f 100644
--- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml
+++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml
@@ -50,28 +50,85 @@
Check 'ibl edx launch' output above for errors.
when: edx_containers.stdout | trim | length == 0
-- name: Wait for LMS to be ready
+# Verify the openedx schema actually got migrated by `ibl edx launch`.
+#
+# `ibl edx launch` calls `ibl tutor local launch -I`, which is supposed to
+# run `tutor local do init` (and therefore the openedx migrations). On
+# certain bootstrap paths the init step has been observed to silently skip
+# migrations, leaving the LMS container in a permanent crash loop with
+# `(1146, "Table 'openedx.<table>' doesn't exist")`. Without this guard,
+# the operator only finds out 10 minutes later when the LMS-readiness
+# wait task times out — buried under retry noise. Catch it directly: if
+# `waffle_flag` (a table created early in the migration sequence) doesn't
+# exist within ~3 minutes of launch returning, fail with an actionable
+# message.
+- name: Verify openedx migrations ran (waffle_flag must exist)
+ # Polls MySQL inside the `ibl_prod_mysql` container until the
+ # `waffle_flag` table shows up in the `openedx` database. The root
+ # password is read from the container's own environment.
become: false
shell: |
- curl -s -o /dev/null -w '%{http_code}' http://localhost:8600/heartbeat 2>/dev/null | grep -q '200'
+ set -o pipefail
+ PW=$(docker exec ibl_prod_mysql printenv MYSQL_ROOT_PASSWORD 2>/dev/null)
+ docker exec ibl_prod_mysql mysql -uroot -p"$PW" \
+ -e "SHOW TABLES LIKE 'waffle_flag';" openedx 2>/dev/null \
+ | grep -q waffle_flag
args:
executable: /bin/bash
- register: lms_health
- until: lms_health.rc == 0
- retries: 40
- delay: 15
+ register: openedx_migrations_ok
+ retries: 6
+ delay: 30
+ until: openedx_migrations_ok.rc == 0
+ # Read-only probe: never report "changed".
+ changed_when: false
+ failed_when: false
+ # `failed_when: false` only covers the individual attempts. When `until`
+ # is still false after the last retry, Ansible marks the result failed on
+ # its own, which would stop the play here before the follow-up task can
+ # print its remediation message. `ignore_errors` lets execution continue
+ # so the registered rc can be inspected there.
+ ignore_errors: true
-- name: Check LMS container is not crash-looping
+# Emits the remediation message when the registered migration probe's rc
+# is non-zero; a missing rc is treated as 1 and therefore also triggers it.
+- name: Fail with a clear message if openedx migrations didn't run
+ fail:
+ msg: >-
+ `ibl edx launch` returned but `openedx.waffle_flag` does not exist —
+ the `tutor local do init` step did not complete migrations. SSH in
+ and run `ibl tutor local do init` manually, OR `docker compose down -v`
+ from `{{ ibl_root }}/app/ibl-edx/ibl-edx-pro/env/local/` and re-run
+ `ibl edx launch`.
+ when: openedx_migrations_ok.rc | default(1) | int != 0
+
+# LMS readiness probe — layout-agnostic. The earlier version polled
+# `http://localhost:8600/heartbeat` directly, which only exists when edX
+# is running with tutor's published-port layout. Some bootstrap paths
+# render a Caddy-fronted layout where LMS is reachable only via a Host
+# header on the local reverse proxy, NOT on `:8600`. Probing through the
+# host nginx (always on `:80`) with a `--resolve` to `127.0.0.1` works
+# uniformly: nginx routes by Host to whichever upstream the deployment
+# rendered (LMS direct, or Caddy on `:81`, or anything else).
+#
+# We also sample `RestartCount` inside the same loop so a real crash loop
+# (e.g. the schema-mismatch failure that motivated the migration check
+# above) returns rc=2 immediately instead of letting the operator burn
+# the full retry budget. rc=1 (not yet ready) is the normal retry signal.
+- name: Wait for LMS to be ready (with crash-loop early-exit)
+ # Each attempt requests /heartbeat for `learn.{{ base_domain }}` pinned to
+ # 127.0.0.1:80 and reads the `ibl_prod_lms` container's RestartCount.
+ # Script exit codes: 0 = heartbeat returned 200, 1 = not ready yet,
+ # 2 = RestartCount above 5 (diagnostic line written to stderr).
become: false
shell: |
- docker inspect --format '{{ '{{' }}.RestartCount{{ '}}' }}' ibl_prod_lms
+ set +e
+ HTTP=$(curl -sk --resolve learn.{{ base_domain }}:80:127.0.0.1 \
+ -o /dev/null -w '%{http_code}' --max-time 5 \
+ http://learn.{{ base_domain }}/heartbeat 2>/dev/null)
+ RC=$(docker inspect --format '{{ '{{' }}.RestartCount{{ '}}' }}' ibl_prod_lms 2>/dev/null || echo 0)
+ if [ "$RC" -gt 5 ]; then
+ LAST=$(docker logs --tail 30 ibl_prod_lms 2>&1 \
+ | grep -Ei 'error|exception|traceback|fatal' \
+ | tail -3 | head -c 600)
+ echo "LMS_CRASHLOOP RestartCount=$RC last_error: $LAST" >&2
+ exit 2
+ fi
+ [ "$HTTP" = "200" ] && exit 0
+ exit 1
args:
executable: /bin/bash
- register: lms_restart_count
+ register: lms_health
+ retries: 40
+ delay: 15
+ # Only rc=1 should trigger another attempt. `until: rc == 0` would keep
+ # retrying on rc=2 as well, because `until` (not the failed state)
+ # decides whether the retry loop continues.
+ until: lms_health.rc != 1
+ # rc=2 is left non-failing so the next task can report the crash-loop
+ # diagnosis from stderr; any other unexpected rc fails here. Running out
+ # of retries while still at rc=1 is marked failed by Ansible itself.
+ failed_when: lms_health.rc not in [0, 2]
+ # Read-only probe: never report "changed".
+ changed_when: false
-- name: Fail if LMS is crash-looping
+# Reports the crash-loop case: runs only when the registered
+# `lms_health.rc` is defined and equals 2, and puts the probe's stderr in
+# the message (the fallback text applies only if stderr is undefined).
+- name: Fail with crash-loop diagnosis if LMS never stabilized
fail:
msg: >-
- ibl_prod_lms has restarted {{ lms_restart_count.stdout | trim }} times.
- Run 'docker logs ibl_prod_lms' on the server to diagnose.
- when: lms_restart_count.stdout | trim | int > 3
+ ibl_prod_lms is in a crash loop (RestartCount > 5). Last error
+ from the container: {{ lms_health.stderr | default('see docker logs ibl_prod_lms on the server') }}
+ when: lms_health.rc is defined and lms_health.rc == 2