From a297c12e2fc1a6350abbd370040c7e2f42348e63 Mon Sep 17 00:00:00 2001 From: Ravil Khusnutdinov Date: Wed, 10 Jun 2026 03:55:39 +0500 Subject: [PATCH 1/9] docs(lab1): complete Lab 1 submission with Task 1, 2 and bonus --- app/gateway/main.py | 29 +++---- submissions/lab1.md | 179 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+), 13 deletions(-) create mode 100644 submissions/lab1.md diff --git a/app/gateway/main.py b/app/gateway/main.py index c86db33..ef164ff 100644 --- a/app/gateway/main.py +++ b/app/gateway/main.py @@ -310,14 +310,10 @@ async def _notify_order_confirmed(reservation_id: str): log.warning(f"notify failed (non-critical) order={reservation_id} err={e}") +@app.post("/reserve/{reservation_id}/pay") @app.post("/reserve/{reservation_id}/pay") async def pay_reservation(reservation_id: str): - # 1. Call payments — wrapped in circuit breaker + retry. - # - # Composition order matters: cb.call(retry(_charge)) means each CB-tracked - # invocation includes its retries internally; the CB only sees the FINAL - # outcome. The reverse — retry(cb.call(_charge)) — would retry past the - # CircuitOpenError, defeating the fast-fail. See lab 11 §11.4. 
+ """Pay for reservation with graceful degradation when payments service is down.""" async def _charge(): resp = await client.post( f"{PAYMENTS_URL}/charge", @@ -327,20 +323,27 @@ async def _charge(): return resp try: + # Try to call payments with circuit breaker + retry pay_resp = await payments_cb.call(lambda: call_with_retry(_charge, target="payments")) payment_ref = pay_resp.json().get("payment_ref", "unknown") - except CircuitOpenError: - log.error("circuit open, skipping payments call") - raise HTTPException(503, "Payment service temporarily unavailable (circuit open)") - except httpx.TimeoutException: - raise HTTPException(504, "Payment service timeout") + except (CircuitOpenError, httpx.ConnectError, httpx.TimeoutException, httpx.RequestError) as e: + # === GRACEFUL DEGRADATION === + log.warning(f"Payments service unavailable for reservation {reservation_id}: {e}") + return JSONResponse( + status_code=503, + content={ + "error": "payments_unavailable", + "message": "Payment service is temporarily down. Your reservation is held — try again in a few minutes.", + "reservation_id": reservation_id + } + ) except httpx.HTTPStatusError as e: raise HTTPException(e.response.status_code, "Payment failed") except Exception as e: log.error(f"payment error: {e}") raise HTTPException(502, "Payment service unavailable") - # 2. Confirm reservation in events. + # 2. Confirm reservation in events (only if payment succeeded) try: confirm_resp = await client.post( f"{EVENTS_URL}/reservations/{reservation_id}/confirm", @@ -352,7 +355,7 @@ async def _charge(): log.error(f"confirm error after payment: {e}") raise HTTPException(500, "Payment succeeded but confirmation failed — contact support") - # 3. Fire-and-forget notify (don't await → don't add latency, don't fail user). + # 3. 
Fire-and-forget notify asyncio.create_task(_notify_order_confirmed(reservation_id)) return result diff --git a/submissions/lab1.md b/submissions/lab1.md new file mode 100644 index 0000000..3af1583 --- /dev/null +++ b/submissions/lab1.md @@ -0,0 +1,179 @@ +# Lab 1 — SRE Philosophy: Deploy, Break, Understand + +## Docker Compose Status + +All 5 services are running successfully: + +```bash +NAME IMAGE STATUS PORTS +app-events-1 app-events Up 0.0.0.0:8081->8081/tcp +app-gateway-1 app-gateway Up 0.0.0.0:3080->8080/tcp +app-payments-1 app-payments Up 0.0.0.0:8082->8082/tcp +app-postgres-1 postgres:17-alpine Up (healthy) 0.0.0.0:5432->5432/tcp +app-redis-1 redis:7-alpine Up (healthy) 0.0.0.0:6379->6379/tcp +``` + +## Critical Path (Everything Working) + +### 1. List Events + +```json +[ + { + "id": 1, + "name": "Go Conference 2026", + "venue": "Main Hall A", + "date": "2026-09-15T09:00:00+00:00", + "total_tickets": 100, + "price_cents": 5000, + "available": 99 + }, + { + "id": 4, + "name": "Python Workshop", + "venue": "Lab 301", + "date": "2026-09-22T14:00:00+00:00", + "total_tickets": 25, + "price_cents": 2000, + "available": 25 + }, + { + "id": 2, + "name": "SRE Meetup", + "venue": "Room 204", + "date": "2026-10-01T18:00:00+00:00", + "total_tickets": 30, + "price_cents": 0, + "available": 30 + }, + { + "id": 5, + "name": "Kubernetes Deep Dive", + "venue": "Auditorium B", + "date": "2026-10-10T10:00:00+00:00", + "total_tickets": 80, + "price_cents": 8000, + "available": 80 + }, + { + "id": 3, + "name": "Cloud Native Summit", + "venue": "Expo Center", + "date": "2026-11-20T10:00:00+00:00", + "total_tickets": 500, + "price_cents": 15000, + "available": 500 + } +] +``` + +### 2. Reserve a Ticket + +```json +{ + "reservation_id": "a3370485-51ea-46bf-a3b1-c6cf7a101df4", + "event_id": 1, + "quantity": 1, + "total_cents": 5000, + "expires_in_seconds": 300 +} +``` + +### 3. 
Pay for Reservation + +```json +{ + "order_id": "a3370485-51ea-46bf-a3b1-c6cf7a101df4", + "event_id": 1, + "quantity": 1, + "total_cents": 5000, + "status": "confirmed" +} +``` + +### 4. Health Check + +```json +{ + "status": "healthy", + "checks": { + "events": "ok", + "payments": "ok", + "circuit_payments": "CLOSED" + } +} +``` + +## Dependency Map + +```mermaid +graph TD + Gateway --> Events + Gateway --> Payments + Events --> Postgres + Events --> Redis +``` + +## Failure Table + +| Component Killed | Events List | Reserve | Pay | Health Check | User Impact | +| ---------------- | ----------- | ------- | ----- | ------------ | -------------------------------- | +| payments | Works | Works | Fails | degraded | Can reserve but cannot pay | +| events | Fails | Fails | Fails | degraded | Cannot browse or buy tickets | +| redis | Works | Works | Works | ok | Minor impact | +| postgres | Fails | Fails | Fails | degraded | Events service completely broken | + +## Load Generator Test + +I ran the load generator: + +```bash +../loadgen/run.sh 5 30 +``` + +While it was running, I stopped the payments service. The error rate increased significantly, but list and reserve endpoints continued working. This demonstrates the blast radius of the payments service and validates graceful degradation behavior. + +## Task 2 — Graceful Degradation + +Modified `gateway/main.py` to return a clear 503 response when payments are unavailable. + +Example response: + +```json +{ + "error": "payments_unavailable", + "message": "Payment service is temporarily down. Your reservation is held — try again in a few minutes.", + "reservation_id": "..." +} +``` + +Results: + +* Reserve endpoint continued working. +* Pay endpoint returned a friendly error message. +* User experience degraded gracefully instead of failing unexpectedly. 
+ +## Bonus Task — Resource Usage + +### Idle + +```bash +NAME CPU % MEM USAGE +app-gateway-1 0.25% 38.11MiB +app-events-1 0.25% 41MiB +app-payments-1 0.23% 32.96MiB +app-postgres-1 2.59% 23.89MiB +app-redis-1 0.86% 3.66MiB +``` + +### Observations + +* PostgreSQL consumed the highest CPU while idle. +* Redis used the least memory. +* Gateway and Events services increased CPU usage under load because they handled incoming traffic. +* When Payments was unavailable Gateway retained requests longer and showed increased resource utilization. + +## GitHub Community +I starred the course repository and the `simple-container-com/api` project. +I followed the professor (@Cre-eD), TAs (@Naghme98, @pierrepicaud), and several classmates. +Starring repositories supports maintainers and helps useful projects gain visibility. Following developers helps me learn from their work and expand my professional network. From 32ffdb6480c481259e7685c54df6f7eebbc5303a Mon Sep 17 00:00:00 2001 From: Ravil Khusnutdinov Date: Fri, 12 Jun 2026 21:19:22 +0500 Subject: [PATCH 2/9] docs(lab2): complete Lab 2 with inspection, optimization and bonus --- app/events/.dockerignore | 10 ++++ app/gateway/.dockerignore | 10 ++++ app/gateway/Dockerfile | 4 +- app/payments/.dockerignore | 10 ++++ submissions/lab2.md | 111 +++++++++++++++++++++++++++++++++++++ 5 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 app/events/.dockerignore create mode 100644 app/gateway/.dockerignore create mode 100644 app/payments/.dockerignore create mode 100644 submissions/lab2.md diff --git a/app/events/.dockerignore b/app/events/.dockerignore new file mode 100644 index 0000000..ce2bb52 --- /dev/null +++ b/app/events/.dockerignore @@ -0,0 +1,10 @@ +__pycache__ +*.pyc +*.pyo +.git +.gitignore +.env +README.md +*.md +.vscode +__MACOSX diff --git a/app/gateway/.dockerignore b/app/gateway/.dockerignore new file mode 100644 index 0000000..ce2bb52 --- /dev/null +++ b/app/gateway/.dockerignore @@ -0,0 +1,10 @@ 
+__pycache__ +*.pyc +*.pyo +.git +.gitignore +.env +README.md +*.md +.vscode +__MACOSX diff --git a/app/gateway/Dockerfile b/app/gateway/Dockerfile index 68ef075..ffcaed8 100644 --- a/app/gateway/Dockerfile +++ b/app/gateway/Dockerfile @@ -3,7 +3,9 @@ FROM python:3.13-slim WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt +RUN addgroup --system app && adduser --system --ingroup app app COPY main.py . - +RUN chown -R app:app /app +USER app EXPOSE 8080 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/app/payments/.dockerignore b/app/payments/.dockerignore new file mode 100644 index 0000000..ce2bb52 --- /dev/null +++ b/app/payments/.dockerignore @@ -0,0 +1,10 @@ +__pycache__ +*.pyc +*.pyo +.git +.gitignore +.env +README.md +*.md +.vscode +__MACOSX diff --git a/submissions/lab2.md b/submissions/lab2.md new file mode 100644 index 0000000..f80e518 --- /dev/null +++ b/submissions/lab2.md @@ -0,0 +1,111 @@ +# Lab 2 Containerization: Inspect, Understand, Optimize + +## Task 1 Docker Inspection and Operations + +### 1.1 Image inspection +```bash +docker images | grep app +``` + +I checked the app images here. + +- app-events:latest — about 233MB +- app-gateway:latest — about 213MB +- app-payments:latest — about 211MB + +The biggest part is the Python installation and the pip packages. + +### 1.2 Container inspection + +I checked the IP addresses of the services: + +- gateway: 172.21.0.6 +- events: 172.21.0.5 +- payments: 172.21.0.4 + +Payments env variables: + +- PAYMENT_FAILURE_RATE=0.0 +- PAYMENT_LATENCY_MS=0 + +### 1.3 Live debugging with exec + +```bash +docker exec app-gateway-1 whoami +# root (before Task 2) +``` + +The DNS resolver is: + +- nameserver 127.0.0.11 + +Connection check: + +- http://events:8081/health -> works +- http://payments:8082/health -> works + +So services talk to each other by names like events and payments. Docker DNS helps here. 
+ +### 1.4 Logs analysis + +Logs show request flow: + +- Gateway -> Events (reserve) +- Gateway -> Payments (charge) +- Events -> confirm + +### 1.5 Network inspection + +All containers are in network `app_default`. + +IP range is like `172.21.0.0/16`. + +--- + +## Task 2 — Dockerfile Optimization + +I do some small optimization: + +- make `.dockerignore` in `gateway/`, `events/`, `payments/` +- update `gateway/Dockerfile` +- add non-root user `app` + +Check: + +```bash +docker exec app-gateway-1 whoami +# app +``` + +So gateway now run not as root. + +--- + +## Bonus Task — Trace a Request Across Services + +I trace one ticket buy request. + +Reservation ID: `cbb0db56-1b8b-4b10-a0f2-25b5e3378f3e` + +Log flow: + +- Gateway get `POST /events/1/reserve` -> `200 OK` +- Events reserve ticket +- Gateway -> Payments `/charge` -> `200 OK` +- Gateway -> Events `/confirm` -> `200 OK` +- User get confirmation + +End to end time is about 100-200 ms. It is fast. + +--- + +## Conclusions + +In Lab 2 I learn: + +- Docker image layers +- service discovery by name +- how to debug with `docker exec` and `logs` +- basic optimization and security with non-root user + +I am ready for next labs From 9ee97c7d1e93e1b3a505b987a24b5c92f3332e0b Mon Sep 17 00:00:00 2001 From: Ravil Khusnutdinov Date: Sat, 13 Jun 2026 22:37:57 +0500 Subject: [PATCH 3/9] docs(lab3): complete monitoring, SLOs and bonus --- docker-compose.monitoring.yaml | 2 +- monitoring/prometheus/prometheus.yml | 23 +++++++++++++++ monitoring/prometheus/rules.yml | 12 ++++++++ submissions/lab3.md | 44 ++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 monitoring/prometheus/prometheus.yml create mode 100644 monitoring/prometheus/rules.yml create mode 100644 submissions/lab3.md diff --git a/docker-compose.monitoring.yaml b/docker-compose.monitoring.yaml index 06e19bd..2f70089 100644 --- a/docker-compose.monitoring.yaml +++ b/docker-compose.monitoring.yaml @@ -5,10 +5,10 @@ services: 
- "9090:9090" volumes: - ../monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ../monitoring/prometheus/rules.yml:/etc/prometheus/rules.yml:ro # recording rules, loaded via rule_files in prometheus.yml command: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.retention.time=7d" - grafana: image: grafana/grafana:13.0.1 ports: diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..3a5352b --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,23 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +rule_files: + - "rules.yml" + +scrape_configs: + - job_name: 'gateway' + static_configs: + - targets: ['gateway:8080'] + + - job_name: 'events' + static_configs: + - targets: ['events:8081'] + + - job_name: 'payments' + static_configs: + - targets: ['payments:8082'] + + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] diff --git a/monitoring/prometheus/rules.yml b/monitoring/prometheus/rules.yml new file mode 100644 index 0000000..9c98f7a --- /dev/null +++ b/monitoring/prometheus/rules.yml @@ -0,0 +1,12 @@ +groups: + - name: quickticket_slo_rules + interval: 30s + rules: + - record: gateway:sli_availability:ratio_rate5m + expr: sum(rate(gateway_requests_total{status!~"5.."}[5m])) / sum(rate(gateway_requests_total[5m])) + + - record: gateway:sli_latency_500ms:ratio_rate5m + expr: sum(rate(gateway_request_duration_seconds_bucket{le="0.5"}[5m])) / sum(rate(gateway_request_duration_seconds_count[5m])) + + - record: gateway:error_budget_burn_rate:ratio_rate5m + expr: (1 - gateway:sli_availability:ratio_rate5m) / (1 - 0.995) diff --git a/submissions/lab3.md b/submissions/lab3.md new file mode 100644 index 0000000..47519ff --- /dev/null +++ b/submissions/lab3.md @@ -0,0 +1,44 @@ +# Lab 3 Monitoring, Observability & SLOs + +## Task 1 Monitoring Setup + +**Prometheus Configuration** (`monitoring/prometheus/prometheus.yml`) +I set scrape targets for gateway, 
events and payments. + +**Monitoring Stack** is running now, with 7 services. + +**Prometheus Targets** are all **up**. + +**Golden Signals Dashboard** in Grafana: + +- I added a **Latency** panel (p50, p95, p99) +- I added a **Saturation** panel (DB pool gauge) + +When I stopped payments, I could see a big increase in Error Rate, and Service Health went down. + +## Task 2 SLOs and Recording Rules + +I created `monitoring/prometheus/rules.yml` with three recording rules: + +- `gateway:sli_availability:ratio_rate5m` +- `gateway:sli_latency_500ms:ratio_rate5m` +- `gateway:error_budget_burn_rate:ratio_rate5m` + +Rules are loaded in Prometheus successfully. + +**SLI/SLO:** + +- Availability SLO: **99.5%** +- Latency SLO (< 500ms): **95%** + +## Bonus Task Failure Correlation + +I ran load, injected a failure in payments and watched the dashboard and logs. + +**Conclusion:** The failure first showed in **Error Rate**, then in **Service Health**. Latency increased later. + +## Final + +In Lab 3 I set up monitoring for QuickTicket with Prometheus + Grafana, made a Golden Signals dashboard and defined basic SLOs. 
+ + From cd0f334ab4a478bdf55b86b2dd3c95c723e07140 Mon Sep 17 00:00:00 2001 From: Ravil Khusnutdinov Date: Sun, 14 Jun 2026 20:18:34 +0500 Subject: [PATCH 4/9] ci: add GitHub Actions CI pipeline for QuickTicket --- .github/workflows/ci.yml | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0e69d8c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,37 @@ +name: CI - Build and Push QuickTicket Images + +on: + push: + branches: [ main ] + +jobs: + build-and-push: + runs-on: ubuntu-latest + permissions: + packages: write + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push gateway + run: | + docker build -t ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }} ./app/gateway + docker push ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }} + + - name: Build and push events + run: | + docker build -t ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }} ./app/events + docker push ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }} + + - name: Build and push payments + run: | + docker build -t ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }} ./app/payments + docker push ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }} From 06761253d5701adf808b640db2b23cbfba81ac94 Mon Sep 17 00:00:00 2001 From: Ravil Khusnutdinov Date: Mon, 15 Jun 2026 15:36:18 +0500 Subject: [PATCH 5/9] Revert "ci: add GitHub Actions CI pipeline for QuickTicket" This reverts commit cd0f334ab4a478bdf55b86b2dd3c95c723e07140. 
--- .github/workflows/ci.yml | 37 ------------------------------------- 1 file changed, 37 deletions(-) delete mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 0e69d8c..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: CI - Build and Push QuickTicket Images - -on: - push: - branches: [ main ] - -jobs: - build-and-push: - runs-on: ubuntu-latest - permissions: - packages: write - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push gateway - run: | - docker build -t ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }} ./app/gateway - docker push ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }} - - - name: Build and push events - run: | - docker build -t ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }} ./app/events - docker push ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }} - - - name: Build and push payments - run: | - docker build -t ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }} ./app/payments - docker push ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }} From 86e2df33aa9bcba79abc9d5d09fd673fd5b723f2 Mon Sep 17 00:00:00 2001 From: Ravil Khusnutdinov Date: Mon, 15 Jun 2026 15:43:08 +0500 Subject: [PATCH 6/9] docs(lab5): complete CI/CD, ArgoCD and rollback --- submissions/lab5.md | 56 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 submissions/lab5.md diff --git a/submissions/lab5.md b/submissions/lab5.md new file mode 100644 index 0000000..5720336 --- /dev/null +++ b/submissions/lab5.md @@ -0,0 +1,56 @@ +# Lab 5 — CI/CD & GitOps + +## Task 1 CI Pipeline + ArgoCD + +I created GitHub Actions CI 
workflow (`.github/workflows/ci.yml`). + +The workflow finished successfully: + +* build Docker images +* push images to ghcr.io + +I installed ArgoCD and created Application `quickticket`. + +I tested GitOps workflow. + +When I push changes to Git repository, ArgoCD automatically deploy new version. + +## Task 2 Rollback via GitOps + +### 1. Deploy bad version + +I changed image tag in `k8s/gateway.yaml` to wrong tag. + +After git push, ArgoCD tried to sync application. + +Gateway pod went to `ImagePullBackOff` state. + +### 2. Rollback + +```bash +git revert HEAD --no-edit +git push origin main +``` + +ArgoCD automatically rollback changes. + +Application returned to Healthy status. + +Recovery time was about 1 to 2 minutes after git push. + +## Bonus Task + +I did not do bonus task because I had some problems with ArgoCD path configuration. + +But I understand the idea of automatic image tag updates. + +## Final + +In this lab I: + +* setup CI/CD pipeline with GitHub Actions +* installed ArgoCD +* used GitOps workflow +* tested rollback with git revert + +This lab helped me understand how modern deployment and rollback work in DevOps and SRE. 
From a297df1768d0ff24b52bf2aa70f6dce70b9fbc39 Mon Sep 17 00:00:00 2001 From: Ravil Khusnutdinov Date: Mon, 15 Jun 2026 15:56:58 +0500 Subject: [PATCH 7/9] ci: add automated image tag update (bonus task) --- .github/workflows/ci.yml | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..1fd2c2f --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,38 @@ +name: CI - Build and Push QuickTicket Images + +on: + push: + branches: [ main ] + +jobs: + build-and-push: + runs-on: ubuntu-latest + permissions: + packages: write + contents: write + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push gateway + run: | + docker build -t ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }} ./app/gateway + docker push ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }} + + - name: Build and push events + run: | + docker build -t ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }} ./app/events + docker push ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }} + + - name: Build and push payments + run: | + docker build -t ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }} ./app/payments + docker push ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }} From 410bf299d5dc020a7292a912d442c2fde42d1c2e Mon Sep 17 00:00:00 2001 From: Ravil Khusnutdinov Date: Wed, 17 Jun 2026 21:18:48 +0500 Subject: [PATCH 8/9] feat(lab7): complete canary rollout + bonus automated analysis --- k8s/analysis-template.yaml | 18 ++++++++++ k8s/gateway.yaml | 42 +++++++++++++++++++++++ submissions/lab7.md | 69 ++++++++++++++++++++++++++++++++++++++ 3 files 
changed, 129 insertions(+) create mode 100644 k8s/analysis-template.yaml create mode 100644 k8s/gateway.yaml create mode 100644 submissions/lab7.md diff --git a/k8s/analysis-template.yaml b/k8s/analysis-template.yaml new file mode 100644 index 0000000..a6e0404 --- /dev/null +++ b/k8s/analysis-template.yaml @@ -0,0 +1,18 @@ +apiVersion: argoproj.io/v1alpha1 +kind: AnalysisTemplate +metadata: + name: gateway-error-rate +spec: + metrics: + - name: error-rate + interval: 30s + count: 5 + successCondition: result[0] < 0.1 # less than 10% error rate + failureLimit: 2 + provider: + prometheus: + address: http://prometheus:9090 + query: | + sum(rate(gateway_requests_total{status=~"5..", rs_hash="{{args.canary-hash}}"}[1m])) + / + sum(rate(gateway_requests_total{rs_hash="{{args.canary-hash}}"}[1m])) diff --git a/k8s/gateway.yaml b/k8s/gateway.yaml new file mode 100644 index 0000000..f48755c --- /dev/null +++ b/k8s/gateway.yaml @@ -0,0 +1,42 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Rollout +metadata: + name: gateway +spec: + replicas: 5 + strategy: + canary: + steps: + - setWeight: 20 + - pause: {duration: 30s} + - analysis: + templates: + - templateName: gateway-error-rate + args: + - name: canary-hash + valueFrom: + podTemplateHashValue: Latest + - setWeight: 60 + - pause: {duration: 30s} + - setWeight: 100 + selector: + matchLabels: + app: gateway + template: + metadata: + labels: + app: gateway + spec: + containers: + - name: gateway + image: quickticket-gateway:v1 + imagePullPolicy: Never + ports: + - containerPort: 8080 + env: + - name: EVENTS_URL + value: "http://events:8081" + - name: PAYMENTS_URL + value: "http://payments:8082" + - name: APP_VERSION + value: "v4-auto-analysis" diff --git a/submissions/lab7.md b/submissions/lab7.md new file mode 100644 index 0000000..bf3ca17 --- /dev/null +++ b/submissions/lab7.md @@ -0,0 +1,69 @@ +# Lab 7 — Progressive Delivery: Canary Deployments + +## Task 1 — Manual Canary + +For this task I installed Argo Rollouts. 
+ +I changed `gateway` from Deployment to Rollout and used canary strategy. + +I started canary deployment with 20% traffic. + +After checking that everything worked, I did manual promotion. + +I also tested a bad version and used **abort**. The rollback was very fast. + +**What I learned:** Abort is much faster than using `git revert` like in Lab 5. + +--- + +## Task 2 — Multi-step Canary + +I used a multi-step canary strategy: + +```yaml +steps: + - setWeight: 20 + - pause: {duration: 60s} + - setWeight: 40 + - pause: {duration: 60s} + - setWeight: 60 + - pause: {duration: 60s} + - setWeight: 80 + - pause: {duration: 30s} + - setWeight: 100 +``` + +I watched rollout progress using: + +```bash +kubectl get rollout gateway +kubectl get pods +``` + +The traffic slowly moved to the new version step by step. + +--- + +## Bonus Task — Automated Canary Analysis + +I created an AnalysisTemplate called `gateway-error-rate`. + +After that I added analysis to the Rollout. + +I tested both auto-promote and auto-abort. + +The most interesting thing was automatic rollback. If the new version had problems, Rollouts stopped it and returned to the old version automatically. + +--- + +## Final Thoughts + +In this lab I learned: + +* Canary deployments +* Manual promotion +* Manual abort +* Multi-step rollout strategy +* Automated analysis with Argo Rollouts + +I think this is a very useful way to deploy applications more safely in production. 
From e2d69f5e770f92592a03039b9434a757735b74ac Mon Sep 17 00:00:00 2001 From: Ravil Khusnutdinov Date: Wed, 17 Jun 2026 21:40:09 +0500 Subject: [PATCH 9/9] docs(lab9): complete database migrations, backup/restore and disaster recovery --- k8s/postgres.yaml | 37 +++++++++++++ submissions/lab9.md | 130 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 k8s/postgres.yaml create mode 100644 submissions/lab9.md diff --git a/k8s/postgres.yaml b/k8s/postgres.yaml new file mode 100644 index 0000000..5a25643 --- /dev/null +++ b/k8s/postgres.yaml @@ -0,0 +1,37 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres +spec: + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + containers: + - name: postgres + image: postgres:17-alpine + ports: + - containerPort: 5432 + env: + - name: POSTGRES_DB + value: quickticket + - name: POSTGRES_USER + value: quickticket + - name: POSTGRES_PASSWORD + value: quickticket +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres +spec: + selector: + app: postgres + ports: + - port: 5432 + targetPort: 5432 diff --git a/submissions/lab9.md b/submissions/lab9.md new file mode 100644 index 0000000..999e5db --- /dev/null +++ b/submissions/lab9.md @@ -0,0 +1,130 @@ +# Lab 9 - Stateful Services & DB Reliability + +## Task 1 - Migrations and Backup/Restore + +### Alembic Setup + +I initialized Alembic and created migration files. + +Steps: + +* initialized Alembic (`alembic init migrations`) +* created baseline for existing database schema +* created migration to add `email` column to events table + +### Migration Under Load + +I started mixed load testing while migration was running. + +Migration: + +```sql +ALTER TABLE events ADD COLUMN email VARCHAR(255); +``` + +The migration finished in less than 1 second. + +There were no extra errors during the test because the new column was nullable. 
+ +### Backup and Restore + +I created a database backup: + +```bash +pg_dump -U quickticket -Fc quickticket > /tmp/quickticket.dump +``` + +To simulate data loss, I removed one table: + +```sql +DROP TABLE orders CASCADE; +``` + +Then I restored the database: + +```bash +pg_restore -U quickticket -d quickticket --clean --if-exists /tmp/quickticket.dump +``` + +After restore, the tables were available again and the API worked normally. + +### RPO Observation + +* Before disaster: orders existed +* After DROP: orders missing +* After restore: orders returned + +--- + +## Task 2 - Disaster Recovery Under Load + +### Experiment + +I deleted the PostgreSQL pod: + +```bash +kubectl delete pod -l app=postgres --grace-period=0 --force +``` + +A new pod was created automatically. + +Because storage was ephemeral, database data was lost. + +I restored the database from backup and restarted the events service. + +```bash +kubectl rollout restart deployment/events +``` + +### RTO and RPO + +* RTO was about 1.5 to 2 minutes +* RPO depended on the last backup time, about a few minutes + +### Conclusion + +Without a PersistentVolumeClaim, PostgreSQL data can be lost when the pod is recreated. + +This is a serious problem for stateful applications. + +--- + +## Bonus Task - Persistent Storage and Automated Backup + +I added a PersistentVolumeClaim to the PostgreSQL deployment. + +Storage size: + +```yaml +storage: 1Gi +``` + +I also created automated backups with a CronJob. + +* backup every 5 minutes +* keep last 5 backup files + +### Disaster Recovery Test After PVC + +After adding the PVC, PostgreSQL started with existing data after pod recreation. + +Recovery was much faster because restore from backup was not needed. + +RTO became only the pod startup time. 
+ +--- + +## Final Thoughts + +In this lab I learned: + +* how to use Alembic migrations +* how to perform database migration under load +* how to use pg_dump and pg_restore +* how to measure RTO and RPO +* how PersistentVolumeClaim protects database data +* how automated backups improve reliability + +The most important lesson for me was: + +**Stateful services without persistent storage are very risky because data can be lost after pod recreation.**