From e0f4458190fafd710a5b655cab49eb0d2a0b5c78 Mon Sep 17 00:00:00 2001
From: bnsoni
Date: Wed, 29 Apr 2026 22:38:56 +0300
Subject: [PATCH] fix(ibl_edx): layout-agnostic LMS readiness probe + explicit migration check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `Wait for LMS to be ready` task in the `ibl_edx` ansible role has
been the silent failure point of every fresh-bootstrap deployment that
takes the non-AMI compose path. Two compounding bugs land the operator
at the same misleading "wait timed out after 40 retries" message after
10 minutes:

1. The probe target — `http://localhost:8600/heartbeat` — only exists
   on tutor's published-port layout. On bootstrap paths that render a
   Caddy-fronted layout (LMS reachable only via a Host header on the
   local reverse proxy), `:8600` is never bound on the host. The wait
   times out even though LMS is healthy on a different port.

2. The launch step has `ignore_errors: true`. When `ibl tutor local
   launch -I` silently skips its `tutor local do init` migration step,
   the openedx schema stays empty, LMS crash-loops on
   `ProgrammingError: Table 'openedx.<table>' doesn't exist`, and the
   wait task — not the launch task — is the one that fails. The actual
   error gets buried 10 minutes deep in retry noise.

This commit makes the role self-diagnose both failure modes:

- Replace the `:8600` probe with a layout-agnostic
  `--resolve learn.{{ base_domain }}:80:127.0.0.1` probe through the
  host nginx (always on `:80`, routes by Host to whichever upstream the
  deployment rendered — direct LMS, Caddy, etc.). Works uniformly on
  every bootstrap path.
- Add a dedicated `Verify openedx migrations ran` task between launch
  and the readiness wait. Polls for `waffle_flag` (created early in the
  migration sequence) and fails in ≤3 minutes with an actionable
  recovery message ("SSH in and run `ibl tutor local do init`, OR
  `docker compose down -v` and re-run `ibl edx launch`") if the schema
  is empty.
- Move `RestartCount` sampling INTO the readiness loop. A real LMS
  crash loop now returns rc=2 from the second iteration (~30s) with the
  last error from `docker logs` in stderr, instead of consuming the full
  40-retry budget. rc=1 (not yet ready) remains the normal retry signal.

Net: an empty-schema LMS now surfaces in ~3 min with the right
diagnosis; a crashing LMS surfaces in ~30s with the actual log error; a
healthy LMS reaches ready on every layout that has a working host nginx.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../roles/ibl_edx/tasks/main.yml | 83 ++++++++++++++++---
 1 file changed, 70 insertions(+), 13 deletions(-)

diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml
index 9e30866..8d7ce9f 100644
--- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml
+++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_edx/tasks/main.yml
@@ -50,28 +50,85 @@
       Check 'ibl edx launch' output above for errors.
   when: edx_containers.stdout | trim | length == 0
 
-- name: Wait for LMS to be ready
+# Verify the openedx schema actually got migrated by `ibl edx launch`.
+#
+# `ibl edx launch` calls `ibl tutor local launch -I`, which is supposed to
+# run `tutor local do init` (and therefore the openedx migrations). On
+# certain bootstrap paths the init step has been observed to silently skip
+# migrations, leaving the LMS container in a permanent crash loop with
+# `(1146, "Table 'openedx.<table>' doesn't exist")`. Without this guard,
+# the operator only finds out 10 minutes later when the LMS-readiness
+# wait task times out — buried under retry noise. Catch it directly: if
+# `waffle_flag` (a table created early in the migration sequence) doesn't
+# exist within ~3 minutes of launch returning, fail with an actionable
+# message (`ignore_errors` lets the play reach it once retries run out).
+- name: Verify openedx migrations ran (waffle_flag must exist)
   become: false
   shell: |
-    curl -s -o /dev/null -w '%{http_code}' http://localhost:8600/heartbeat 2>/dev/null | grep -q '200'
+    set -o pipefail
+    PW=$(docker exec ibl_prod_mysql printenv MYSQL_ROOT_PASSWORD 2>/dev/null)
+    docker exec ibl_prod_mysql mysql -uroot -p"$PW" \
+      -e "SHOW TABLES LIKE 'waffle_flag';" openedx 2>/dev/null \
+      | grep -q waffle_flag
   args:
     executable: /bin/bash
-  register: lms_health
-  until: lms_health.rc == 0
-  retries: 40
-  delay: 15
+  register: openedx_migrations_ok
+  retries: 6
+  delay: 30
+  until: openedx_migrations_ok.rc == 0
+  ignore_errors: true
 
-- name: Check LMS container is not crash-looping
+- name: Fail with a clear message if openedx migrations didn't run
+  fail:
+    msg: >-
+      `ibl edx launch` returned but `openedx.waffle_flag` does not exist —
+      the `tutor local do init` step did not complete migrations. SSH in
+      and run `ibl tutor local do init` manually, OR `docker compose down -v`
+      from `{{ ibl_root }}/app/ibl-edx/ibl-edx-pro/env/local/` and re-run
+      `ibl edx launch`.
+  when: openedx_migrations_ok.rc | default(1) | int != 0
+
+# LMS readiness probe — layout-agnostic. The earlier version polled
+# `http://localhost:8600/heartbeat` directly, which only exists when edX
+# is running with tutor's published-port layout. Some bootstrap paths
+# render a Caddy-fronted layout where LMS is reachable only via a Host
+# header on the local reverse proxy, NOT on `:8600`. Probing through the
+# host nginx (always on `:80`) with a `--resolve` to `127.0.0.1` works
+# uniformly: nginx routes by Host to whichever upstream the deployment
+# rendered (LMS direct, or Caddy on `:81`, or anything else).
+#
+# We also sample `RestartCount` inside the same loop so a real crash loop
+# (e.g. the schema-mismatch failure that motivated the migration check
+# above) returns rc=2, which satisfies `until` and stops the retries at
+# once; the next task then fails with the diagnosis. rc=1 means retry.
+- name: Wait for LMS to be ready (with crash-loop early-exit)
   become: false
   shell: |
-    docker inspect --format '{{ '{{' }}.RestartCount{{ '}}' }}' ibl_prod_lms
+    set +e
+    HTTP=$(curl -sk --resolve learn.{{ base_domain }}:80:127.0.0.1 \
+      -o /dev/null -w '%{http_code}' --max-time 5 \
+      http://learn.{{ base_domain }}/heartbeat 2>/dev/null)
+    RC=$(docker inspect --format '{{ '{{' }}.RestartCount{{ '}}' }}' ibl_prod_lms 2>/dev/null || echo 0)
+    if [ "$RC" -gt 5 ]; then
+      LAST=$(docker logs --tail 30 ibl_prod_lms 2>&1 \
+        | grep -Ei 'error|exception|traceback|fatal' \
+        | tail -3 | head -c 600)
+      echo "LMS_CRASHLOOP RestartCount=$RC last_error: $LAST" >&2
+      exit 2
+    fi
+    [ "$HTTP" = "200" ] && exit 0
+    exit 1
   args:
     executable: /bin/bash
-  register: lms_restart_count
+  register: lms_health
+  retries: 40
+  delay: 15
+  until: lms_health.rc in [0, 2]
+  failed_when: lms_health.rc not in [0, 2]
 
-- name: Fail if LMS is crash-looping
+- name: Fail with crash-loop diagnosis if LMS never stabilized
   fail:
     msg: >-
-      ibl_prod_lms has restarted {{ lms_restart_count.stdout | trim }} times.
-      Run 'docker logs ibl_prod_lms' on the server to diagnose.
-  when: lms_restart_count.stdout | trim | int > 3
+      ibl_prod_lms is in a crash loop (RestartCount > 5). Last error
+      from the container: {{ lms_health.stderr | default('see docker logs ibl_prod_lms on the server') }}
+  when: lms_health.rc is defined and lms_health.rc == 2