From a297c12e2fc1a6350abbd370040c7e2f42348e63 Mon Sep 17 00:00:00 2001
From: Ravil Khusnutdinov <ra.khusnutdinov@innopolis.university>
Date: Wed, 10 Jun 2026 03:55:39 +0500
Subject: [PATCH 1/9] docs(lab1): complete Lab 1 submission with Task 1, 2 and
 bonus

---
 app/gateway/main.py |  29 +++----
 submissions/lab1.md | 179 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+), 13 deletions(-)
 create mode 100644 submissions/lab1.md

diff --git a/app/gateway/main.py b/app/gateway/main.py
index c86db33..ef164ff 100644
--- a/app/gateway/main.py
+++ b/app/gateway/main.py
@@ -310,14 +310,10 @@ async def _notify_order_confirmed(reservation_id: str):
         log.warning(f"notify failed (non-critical) order={reservation_id} err={e}")
 
 
+@app.post("/reserve/{reservation_id}/pay")
 @app.post("/reserve/{reservation_id}/pay")
 async def pay_reservation(reservation_id: str):
-    # 1. Call payments — wrapped in circuit breaker + retry.
-    #
-    # Composition order matters: cb.call(retry(_charge)) means each CB-tracked
-    # invocation includes its retries internally; the CB only sees the FINAL
-    # outcome. The reverse — retry(cb.call(_charge)) — would retry past the
-    # CircuitOpenError, defeating the fast-fail. See lab 11 §11.4.
+    """Pay for reservation with graceful degradation when payments service is down."""
     async def _charge():
         resp = await client.post(
             f"{PAYMENTS_URL}/charge",
@@ -327,20 +323,27 @@ async def _charge():
         return resp
 
     try:
+        # Try to call payments with circuit breaker + retry
         pay_resp = await payments_cb.call(lambda: call_with_retry(_charge, target="payments"))
         payment_ref = pay_resp.json().get("payment_ref", "unknown")
-    except CircuitOpenError:
-        log.error("circuit open, skipping payments call")
-        raise HTTPException(503, "Payment service temporarily unavailable (circuit open)")
-    except httpx.TimeoutException:
-        raise HTTPException(504, "Payment service timeout")
+    except (CircuitOpenError, httpx.ConnectError, httpx.TimeoutException, httpx.RequestError) as e:
+        # === GRACEFUL DEGRADATION ===
+        log.warning(f"Payments service unavailable for reservation {reservation_id}: {e}")
+        return JSONResponse(
+            status_code=503,
+            content={
+                "error": "payments_unavailable",
+                "message": "Payment service is temporarily down. Your reservation is held — try again in a few minutes.",
+                "reservation_id": reservation_id
+            }
+        )
     except httpx.HTTPStatusError as e:
         raise HTTPException(e.response.status_code, "Payment failed")
     except Exception as e:
         log.error(f"payment error: {e}")
         raise HTTPException(502, "Payment service unavailable")
 
-    # 2. Confirm reservation in events.
+    # 2. Confirm reservation in events (only if payment succeeded)
     try:
         confirm_resp = await client.post(
             f"{EVENTS_URL}/reservations/{reservation_id}/confirm",
@@ -352,7 +355,7 @@ async def _charge():
         log.error(f"confirm error after payment: {e}")
         raise HTTPException(500, "Payment succeeded but confirmation failed — contact support")
 
-    # 3. Fire-and-forget notify (don't await → don't add latency, don't fail user).
+    # 3. Fire-and-forget notify
     asyncio.create_task(_notify_order_confirmed(reservation_id))
 
     return result
diff --git a/submissions/lab1.md b/submissions/lab1.md
new file mode 100644
index 0000000..3af1583
--- /dev/null
+++ b/submissions/lab1.md
@@ -0,0 +1,179 @@
+# Lab 1 — SRE Philosophy: Deploy, Break, Understand
+
+## Docker Compose Status
+
+All 5 services are running successfully:
+
+```bash
+NAME                IMAGE                    STATUS                      PORTS
+app-events-1        app-events               Up                          0.0.0.0:8081->8081/tcp
+app-gateway-1       app-gateway              Up                          0.0.0.0:3080->8080/tcp
+app-payments-1      app-payments             Up                          0.0.0.0:8082->8082/tcp
+app-postgres-1      postgres:17-alpine       Up (healthy)                0.0.0.0:5432->5432/tcp
+app-redis-1         redis:7-alpine           Up (healthy)                0.0.0.0:6379->6379/tcp
+```
+
+## Critical Path (Everything Working)
+
+### 1. List Events
+
+```json
+[
+  {
+    "id": 1,
+    "name": "Go Conference 2026",
+    "venue": "Main Hall A",
+    "date": "2026-09-15T09:00:00+00:00",
+    "total_tickets": 100,
+    "price_cents": 5000,
+    "available": 99
+  },
+  {
+    "id": 4,
+    "name": "Python Workshop",
+    "venue": "Lab 301",
+    "date": "2026-09-22T14:00:00+00:00",
+    "total_tickets": 25,
+    "price_cents": 2000,
+    "available": 25
+  },
+  {
+    "id": 2,
+    "name": "SRE Meetup",
+    "venue": "Room 204",
+    "date": "2026-10-01T18:00:00+00:00",
+    "total_tickets": 30,
+    "price_cents": 0,
+    "available": 30
+  },
+  {
+    "id": 5,
+    "name": "Kubernetes Deep Dive",
+    "venue": "Auditorium B",
+    "date": "2026-10-10T10:00:00+00:00",
+    "total_tickets": 80,
+    "price_cents": 8000,
+    "available": 80
+  },
+  {
+    "id": 3,
+    "name": "Cloud Native Summit",
+    "venue": "Expo Center",
+    "date": "2026-11-20T10:00:00+00:00",
+    "total_tickets": 500,
+    "price_cents": 15000,
+    "available": 500
+  }
+]
+```
+
+### 2. Reserve a Ticket
+
+```json
+{
+  "reservation_id": "a3370485-51ea-46bf-a3b1-c6cf7a101df4",
+  "event_id": 1,
+  "quantity": 1,
+  "total_cents": 5000,
+  "expires_in_seconds": 300
+}
+```
+
+### 3. Pay for Reservation
+
+```json
+{
+  "order_id": "a3370485-51ea-46bf-a3b1-c6cf7a101df4",
+  "event_id": 1,
+  "quantity": 1,
+  "total_cents": 5000,
+  "status": "confirmed"
+}
+```
+
+### 4. Health Check
+
+```json
+{
+  "status": "healthy",
+  "checks": {
+    "events": "ok",
+    "payments": "ok",
+    "circuit_payments": "CLOSED"
+  }
+}
+```
+
+## Dependency Map
+
+```mermaid
+graph TD
+    Gateway --> Events
+    Gateway --> Payments
+    Events --> Postgres
+    Events --> Redis
+```
+
+## Failure Table
+
+| Component Killed | Events List | Reserve | Pay   | Health Check | User Impact                      |
+| ---------------- | ----------- | ------- | ----- | ------------ | -------------------------------- |
+| payments         | Works       | Works   | Fails | degraded     | Can reserve but cannot pay       |
+| events           | Fails       | Fails   | Fails | degraded     | Cannot browse or buy tickets     |
+| redis            | Works       | Works   | Works | ok           | Minor impact                     |
+| postgres         | Fails       | Fails   | Fails | degraded     | Events service completely broken |
+
+## Load Generator Test
+
+I ran the load generator:
+
+```bash
+../loadgen/run.sh 5 30
+```
+
+While it was running, I stopped the payments service. The error rate increased significantly, but list and reserve endpoints continued working. This demonstrates the blast radius of the payments service and validates graceful degradation behavior.
+
+## Task 2 — Graceful Degradation
+
+Modified `app/gateway/main.py` to return a clear 503 response when payments are unavailable.
+
+Example response:
+
+```json
+{
+  "error": "payments_unavailable",
+  "message": "Payment service is temporarily down. Your reservation is held — try again in a few minutes.",
+  "reservation_id": "..."
+}
+```
+
+Results:
+
+* Reserve endpoint continued working.
+* Pay endpoint returned a friendly error message.
+* User experience degraded gracefully instead of failing unexpectedly.
+
+## Bonus Task — Resource Usage
+
+### Idle
+
+```bash
+NAME            CPU %    MEM USAGE
+app-gateway-1   0.25%    38.11MiB
+app-events-1    0.25%    41MiB
+app-payments-1  0.23%    32.96MiB
+app-postgres-1  2.59%    23.89MiB
+app-redis-1     0.86%    3.66MiB
+```
+
+### Observations
+
+* PostgreSQL consumed the highest CPU while idle.
+* Redis used the least memory.
+* Gateway and Events services increased CPU usage under load because they handled incoming traffic.
+* When Payments was unavailable, Gateway held requests longer and showed increased resource utilization.
+
+## GitHub Community
+I starred the course repository and the `simple-container-com/api` project.
+I followed the professor (@Cre-eD), TAs (@Naghme98, @pierrepicaud), and several classmates.
+Starring repositories supports maintainers and helps useful projects gain visibility. Following developers helps me learn from their work and expand my professional network.

From 32ffdb6480c481259e7685c54df6f7eebbc5303a Mon Sep 17 00:00:00 2001
From: Ravil Khusnutdinov <ra.khusnutdinov@innopolis.university>
Date: Fri, 12 Jun 2026 21:19:22 +0500
Subject: [PATCH 2/9] docs(lab2): complete Lab 2 with inspection, optimization
 and bonus

---
 app/events/.dockerignore   |  10 ++++
 app/gateway/.dockerignore  |  10 ++++
 app/gateway/Dockerfile     |   4 +-
 app/payments/.dockerignore |  10 ++++
 submissions/lab2.md        | 111 +++++++++++++++++++++++++++++++++++++
 5 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 app/events/.dockerignore
 create mode 100644 app/gateway/.dockerignore
 create mode 100644 app/payments/.dockerignore
 create mode 100644 submissions/lab2.md

diff --git a/app/events/.dockerignore b/app/events/.dockerignore
new file mode 100644
index 0000000..ce2bb52
--- /dev/null
+++ b/app/events/.dockerignore
@@ -0,0 +1,10 @@
+__pycache__
+*.pyc
+*.pyo
+.git
+.gitignore
+.env
+README.md
+*.md
+.vscode
+__MACOSX
diff --git a/app/gateway/.dockerignore b/app/gateway/.dockerignore
new file mode 100644
index 0000000..ce2bb52
--- /dev/null
+++ b/app/gateway/.dockerignore
@@ -0,0 +1,10 @@
+__pycache__
+*.pyc
+*.pyo
+.git
+.gitignore
+.env
+README.md
+*.md
+.vscode
+__MACOSX
diff --git a/app/gateway/Dockerfile b/app/gateway/Dockerfile
index 68ef075..ffcaed8 100644
--- a/app/gateway/Dockerfile
+++ b/app/gateway/Dockerfile
@@ -3,7 +3,9 @@ FROM python:3.13-slim
 WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+RUN addgroup --system app && adduser --system --ingroup app app
 COPY main.py .
-
+RUN chown -R app:app /app
+USER app
 EXPOSE 8080
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]
diff --git a/app/payments/.dockerignore b/app/payments/.dockerignore
new file mode 100644
index 0000000..ce2bb52
--- /dev/null
+++ b/app/payments/.dockerignore
@@ -0,0 +1,10 @@
+__pycache__
+*.pyc
+*.pyo
+.git
+.gitignore
+.env
+README.md
+*.md
+.vscode
+__MACOSX
diff --git a/submissions/lab2.md b/submissions/lab2.md
new file mode 100644
index 0000000..f80e518
--- /dev/null
+++ b/submissions/lab2.md
@@ -0,0 +1,111 @@
+# Lab 2 Containerization: Inspect, Understand, Optimize
+
+## Task 1 — Docker Inspection and Operations
+
+### 1.1 Image inspection
+```bash
+docker images | grep app
+```
+
+I checked the app images here.
+
+- app-events:latest — about 233MB
+- app-gateway:latest — about 213MB
+- app-payments:latest — about 211MB
+
+The biggest part of each image is the Python installation and the pip packages.
+
+### 1.2 Container inspection
+
+I checked the IP addresses of the services:
+
+- gateway: 172.21.0.6
+- events: 172.21.0.5
+- payments: 172.21.0.4
+
+Payments env variables:
+
+- PAYMENT_FAILURE_RATE=0.0
+- PAYMENT_LATENCY_MS=0
+
+### 1.3 Live debugging with exec
+
+```bash
+docker exec app-gateway-1 whoami
+# root (before Task 2)
+```
+
+DNS resolver is:
+
+- nameserver 127.0.0.11
+
+Check connection:
+
+- http://events:8081/health -> works
+- http://payments:8082/health -> works
+
+So the services talk to each other by name (for example `events` and `payments`); Docker DNS resolves these names.
+
+### 1.4 Logs analysis
+
+Logs show request flow:
+
+- Gateway -> Events (reserve)
+- Gateway -> Payments (charge)
+- Events -> confirm
+
+### 1.5 Network inspection
+
+All containers are in network `app_default`.
+
+IP range is like `172.21.0.0/16`.
+
+---
+
+## Task 2 — Dockerfile Optimization
+
+I made a few small optimizations:
+
+- added a `.dockerignore` in `app/gateway/`, `app/events/`, and `app/payments/`
+- updated `app/gateway/Dockerfile`
+- added a non-root user `app`
+
+Check:
+
+```bash
+docker exec app-gateway-1 whoami
+# app
+```
+
+So the gateway now runs as a non-root user.
+
+---
+
+## Bonus Task — Trace a Request Across Services
+
+I traced one ticket purchase request.
+
+Reservation ID: `cbb0db56-1b8b-4b10-a0f2-25b5e3378f3e`
+
+Log flow:
+
+- Gateway receives `POST /events/1/reserve` -> `200 OK`
+- Events reserves the ticket
+- Gateway -> Payments `/charge` -> `200 OK`
+- Gateway -> Events `/confirm` -> `200 OK`
+- User gets the confirmation
+
+End-to-end time is about 100-200 ms, which is fast.
+
+---
+
+## Conclusions
+
+In Lab 2 I learned about:
+
+- Docker image layers
+- service discovery by name
+- how to debug with `docker exec` and `logs`
+- basic optimization and security with non-root user
+
+I am ready for the next labs.

From 9ee97c7d1e93e1b3a505b987a24b5c92f3332e0b Mon Sep 17 00:00:00 2001
From: Ravil Khusnutdinov <ra.khusnutdinov@innopolis.university>
Date: Sat, 13 Jun 2026 22:37:57 +0500
Subject: [PATCH 3/9] docs(lab3): complete monitoring, SLOs and bonus

---
 docker-compose.monitoring.yaml       |  2 +-
 monitoring/prometheus/prometheus.yml | 23 +++++++++++++++
 monitoring/prometheus/rules.yml      | 12 ++++++++
 submissions/lab3.md                  | 44 ++++++++++++++++++++++++++++
 4 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 monitoring/prometheus/prometheus.yml
 create mode 100644 monitoring/prometheus/rules.yml
 create mode 100644 submissions/lab3.md

diff --git a/docker-compose.monitoring.yaml b/docker-compose.monitoring.yaml
index 06e19bd..2f70089 100644
--- a/docker-compose.monitoring.yaml
+++ b/docker-compose.monitoring.yaml
@@ -5,10 +5,10 @@ services:
       - "9090:9090"
     volumes:
       - ../monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ../monitoring/prometheus/rules.yml:/etc/prometheus/rules.yml:ro   # recording rules loaded via rule_files in prometheus.yml
     command:
       - "--config.file=/etc/prometheus/prometheus.yml"
       - "--storage.tsdb.retention.time=7d"
-
   grafana:
     image: grafana/grafana:13.0.1
     ports:
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
new file mode 100644
index 0000000..3a5352b
--- /dev/null
+++ b/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,23 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+rule_files:
+  - "rules.yml"
+
+scrape_configs:
+  - job_name: 'gateway'
+    static_configs:
+      - targets: ['gateway:8080']
+
+  - job_name: 'events'
+    static_configs:
+      - targets: ['events:8081']
+
+  - job_name: 'payments'
+    static_configs:
+      - targets: ['payments:8082']
+
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
diff --git a/monitoring/prometheus/rules.yml b/monitoring/prometheus/rules.yml
new file mode 100644
index 0000000..9c98f7a
--- /dev/null
+++ b/monitoring/prometheus/rules.yml
@@ -0,0 +1,12 @@
+groups:
+  - name: quickticket_slo_rules
+    interval: 30s
+    rules:
+      - record: gateway:sli_availability:ratio_rate5m
+        expr: sum(rate(gateway_requests_total{status!~"5.."}[5m])) / sum(rate(gateway_requests_total[5m]))
+
+      - record: gateway:sli_latency_500ms:ratio_rate5m
+        expr: sum(rate(gateway_request_duration_seconds_bucket{le="0.5"}[5m])) / sum(rate(gateway_request_duration_seconds_count[5m]))
+
+      - record: gateway:error_budget_burn_rate:ratio_rate5m
+        expr: (1 - gateway:sli_availability:ratio_rate5m) / (1 - 0.995)
diff --git a/submissions/lab3.md b/submissions/lab3.md
new file mode 100644
index 0000000..47519ff
--- /dev/null
+++ b/submissions/lab3.md
@@ -0,0 +1,44 @@
+# Lab 3 Monitoring, Observability & SLOs
+
+## Task 1  Monitoring Setup
+
+**Prometheus Configuration** (`monitoring/prometheus/prometheus.yml`)  
+I set scrape targets for gateway, events and payments.
+
+The **Monitoring Stack** is now running, 7 services in total.
+
+**Prometheus Targets** are all **up**.
+
+**Golden Signals Dashboard** in Grafana:
+
+- I added a **Latency** panel (p50, p95, p99)
+- I added a **Saturation** panel (DB pool gauge)
+
+When I stop payments, I can see a big increase in Error Rate, and Service Health goes down.
+
+## Task 2 SLOs and Recording Rules
+
+I created `monitoring/prometheus/rules.yml` with three recording rules:
+
+- `gateway:sli_availability:ratio_rate5m`
+- `gateway:sli_latency_500ms:ratio_rate5m`
+- `gateway:error_budget_burn_rate:ratio_rate5m`
+
+Rules are loaded in Prometheus successfully.
+
+**SLI/SLO:**
+
+- Availability SLO: **99.5%**
+- Latency SLO (< 500ms): **95%**
+
+## Bonus Task Failure Correlation
+
+I ran load, injected a failure into payments, and watched the dashboard and logs.
+
+**Conclusion:** The failure shows up first in **Error Rate**, then in **Service Health**. Latency increases later.
+
+## Final
+
+In Lab 3 I set up monitoring for QuickTicket with Prometheus + Grafana, built a Golden Signals dashboard, and defined basic SLOs.
+
+

From cd0f334ab4a478bdf55b86b2dd3c95c723e07140 Mon Sep 17 00:00:00 2001
From: Ravil Khusnutdinov <ra.khusnutdinov@innopolis.university>
Date: Sun, 14 Jun 2026 20:18:34 +0500
Subject: [PATCH 4/9] ci: add GitHub Actions CI pipeline for QuickTicket

---
 .github/workflows/ci.yml | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..0e69d8c
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,37 @@
+name: CI - Build and Push QuickTicket Images
+
+on:
+  push:
+    branches: [ main ]
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    permissions:
+      packages: write
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push gateway
+        run: |
+          docker build -t ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }} ./app/gateway
+          docker push ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }}
+
+      - name: Build and push events
+        run: |
+          docker build -t ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }} ./app/events
+          docker push ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }}
+
+      - name: Build and push payments
+        run: |
+          docker build -t ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }} ./app/payments
+          docker push ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }}

From 06761253d5701adf808b640db2b23cbfba81ac94 Mon Sep 17 00:00:00 2001
From: Ravil Khusnutdinov <ra.khusnutdinov@innopolis.university>
Date: Mon, 15 Jun 2026 15:36:18 +0500
Subject: [PATCH 5/9] Revert "ci: add GitHub Actions CI pipeline for
 QuickTicket"

This reverts commit cd0f334ab4a478bdf55b86b2dd3c95c723e07140.
---
 .github/workflows/ci.yml | 37 -------------------------------------
 1 file changed, 37 deletions(-)
 delete mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
deleted file mode 100644
index 0e69d8c..0000000
--- a/.github/workflows/ci.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-name: CI - Build and Push QuickTicket Images
-
-on:
-  push:
-    branches: [ main ]
-
-jobs:
-  build-and-push:
-    runs-on: ubuntu-latest
-    permissions:
-      packages: write
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Build and push gateway
-        run: |
-          docker build -t ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }} ./app/gateway
-          docker push ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }}
-
-      - name: Build and push events
-        run: |
-          docker build -t ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }} ./app/events
-          docker push ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }}
-
-      - name: Build and push payments
-        run: |
-          docker build -t ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }} ./app/payments
-          docker push ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }}

From 86e2df33aa9bcba79abc9d5d09fd673fd5b723f2 Mon Sep 17 00:00:00 2001
From: Ravil Khusnutdinov <ra.khusnutdinov@innopolis.university>
Date: Mon, 15 Jun 2026 15:43:08 +0500
Subject: [PATCH 6/9] docs(lab5): complete CI/CD, ArgoCD and rollback

---
 submissions/lab5.md | 56 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 submissions/lab5.md

diff --git a/submissions/lab5.md b/submissions/lab5.md
new file mode 100644
index 0000000..5720336
--- /dev/null
+++ b/submissions/lab5.md
@@ -0,0 +1,56 @@
+# Lab 5  CI/CD & GitOps
+
+## Task 1 CI Pipeline + ArgoCD
+
+I created GitHub Actions CI workflow (`.github/workflows/ci.yml`).
+
+The workflow finished successfully:
+
+* build Docker images
+* push images to ghcr.io
+
+I installed ArgoCD and created Application `quickticket`.
+
+I tested GitOps workflow.
+
+When I push changes to the Git repository, ArgoCD automatically deploys the new version.
+
+## Task 2 Rollback via GitOps
+
+### 1. Deploy bad version
+
+I changed image tag in `k8s/gateway.yaml` to wrong tag.
+
+After git push, ArgoCD tried to sync application.
+
+Gateway pod went to `ImagePullBackOff` state.
+
+### 2. Rollback
+
+```bash
+git revert HEAD --no-edit
+git push origin main
+```
+
+ArgoCD automatically rolled back the changes.
+
+Application returned to Healthy status.
+
+Recovery time was about 1 to 2 minutes after git push.
+
+## Bonus Task
+
+I did not do bonus task because I had some problems with ArgoCD path configuration.
+
+But I understand the idea of automatic image tag updates.
+
+## Final
+
+In this lab I:
+
+* set up a CI/CD pipeline with GitHub Actions
+* installed ArgoCD
+* used GitOps workflow
+* tested rollback with git revert
+
+This lab helped me understand how modern deployment and rollback work in DevOps and SRE.

From a297df1768d0ff24b52bf2aa70f6dce70b9fbc39 Mon Sep 17 00:00:00 2001
From: Ravil Khusnutdinov <ra.khusnutdinov@innopolis.university>
Date: Mon, 15 Jun 2026 15:56:58 +0500
Subject: [PATCH 7/9] ci: add automated image tag update (bonus task)

---
 .github/workflows/ci.yml | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..1fd2c2f
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,38 @@
+name: CI - Build and Push QuickTicket Images
+
+on:
+  push:
+    branches: [ main ]
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    permissions:
+      packages: write
+      contents: write
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push gateway
+        run: |
+          docker build -t ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }} ./app/gateway
+          docker push ghcr.io/${{ github.actor }}/quickticket-gateway:${{ github.sha }}
+
+      - name: Build and push events
+        run: |
+          docker build -t ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }} ./app/events
+          docker push ghcr.io/${{ github.actor }}/quickticket-events:${{ github.sha }}
+
+      - name: Build and push payments
+        run: |
+          docker build -t ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }} ./app/payments
+          docker push ghcr.io/${{ github.actor }}/quickticket-payments:${{ github.sha }}

From 410bf299d5dc020a7292a912d442c2fde42d1c2e Mon Sep 17 00:00:00 2001
From: Ravil Khusnutdinov <ra.khusnutdinov@innopolis.university>
Date: Wed, 17 Jun 2026 21:18:48 +0500
Subject: [PATCH 8/9] feat(lab7): complete canary rollout + bonus automated
 analysis

---
 k8s/analysis-template.yaml | 18 ++++++++++
 k8s/gateway.yaml           | 42 +++++++++++++++++++++++
 submissions/lab7.md        | 69 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 129 insertions(+)
 create mode 100644 k8s/analysis-template.yaml
 create mode 100644 k8s/gateway.yaml
 create mode 100644 submissions/lab7.md

diff --git a/k8s/analysis-template.yaml b/k8s/analysis-template.yaml
new file mode 100644
index 0000000..a6e0404
--- /dev/null
+++ b/k8s/analysis-template.yaml
@@ -0,0 +1,18 @@
+apiVersion: argoproj.io/v1alpha1
+kind: AnalysisTemplate
+metadata:
+  name: gateway-error-rate
+spec:
+  metrics:
+  - name: error-rate
+    interval: 30s
+    count: 5
+    successCondition: result[0] < 0.1   # less than 10% error rate
+    failureLimit: 2
+    provider:
+      prometheus:
+        address: http://prometheus:9090
+        query: |
+          sum(rate(gateway_requests_total{status=~"5..", rs_hash="{{args.canary-hash}}"}[1m])) 
+          / 
+          sum(rate(gateway_requests_total{rs_hash="{{args.canary-hash}}"}[1m]))
diff --git a/k8s/gateway.yaml b/k8s/gateway.yaml
new file mode 100644
index 0000000..f48755c
--- /dev/null
+++ b/k8s/gateway.yaml
@@ -0,0 +1,42 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Rollout
+metadata:
+  name: gateway
+spec:
+  replicas: 5
+  strategy:
+    canary:
+      steps:
+        - setWeight: 20
+        - pause: {duration: 30s}
+        - analysis:
+            templates:
+            - templateName: gateway-error-rate
+            args:
+            - name: canary-hash
+              valueFrom:
+                podTemplateHashValue: Latest
+        - setWeight: 60
+        - pause: {duration: 30s}
+        - setWeight: 100
+  selector:
+    matchLabels:
+      app: gateway
+  template:
+    metadata:
+      labels:
+        app: gateway
+    spec:
+      containers:
+      - name: gateway
+        image: quickticket-gateway:v1
+        imagePullPolicy: Never
+        ports:
+        - containerPort: 8080
+        env:
+        - name: EVENTS_URL
+          value: "http://events:8081"
+        - name: PAYMENTS_URL
+          value: "http://payments:8082"
+        - name: APP_VERSION
+          value: "v4-auto-analysis"
diff --git a/submissions/lab7.md b/submissions/lab7.md
new file mode 100644
index 0000000..bf3ca17
--- /dev/null
+++ b/submissions/lab7.md
@@ -0,0 +1,69 @@
+# Lab 7 — Progressive Delivery: Canary Deployments
+
+## Task 1 — Manual Canary
+
+For this task I installed Argo Rollouts.
+
+I changed `gateway` from Deployment to Rollout and used canary strategy.
+
+I started canary deployment with 20% traffic.
+
+After checking that everything worked, I did manual promotion.
+
+I also tested a bad version and used **abort**. The rollback was very fast.
+
+**What I learned:** Abort is much faster than using `git revert` like in Lab 5.
+
+---
+
+## Task 2 — Multi-step Canary
+
+I used a multi-step canary strategy:
+
+```yaml
+steps:
+  - setWeight: 20
+  - pause: {duration: 60s}
+  - setWeight: 40
+  - pause: {duration: 60s}
+  - setWeight: 60
+  - pause: {duration: 60s}
+  - setWeight: 80
+  - pause: {duration: 30s}
+  - setWeight: 100
+```
+
+I watched rollout progress using:
+
+```bash
+kubectl get rollout gateway
+kubectl get pods
+```
+
+The traffic slowly moved to the new version step by step.
+
+---
+
+## Bonus Task — Automated Canary Analysis
+
+I created an AnalysisTemplate called `gateway-error-rate`.
+
+After that I added analysis to the Rollout.
+
+I tested both auto-promote and auto-abort.
+
+The most interesting thing was automatic rollback. If the new version had problems, Rollouts stopped it and returned to the old version automatically.
+
+---
+
+## Final Thoughts
+
+In this lab I learned:
+
+* Canary deployments
+* Manual promotion
+* Manual abort
+* Multi-step rollout strategy
+* Automated analysis with Argo Rollouts
+
+I think this is a very useful way to deploy applications more safely in production.

From e2d69f5e770f92592a03039b9434a757735b74ac Mon Sep 17 00:00:00 2001
From: Ravil Khusnutdinov <ra.khusnutdinov@innopolis.university>
Date: Wed, 17 Jun 2026 21:40:09 +0500
Subject: [PATCH 9/9] docs(lab9): complete database migrations, backup/restore
 and disaster recovery

---
 k8s/postgres.yaml   |  37 +++++++++++++
 submissions/lab9.md | 130 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+)
 create mode 100644 k8s/postgres.yaml
 create mode 100644 submissions/lab9.md

diff --git a/k8s/postgres.yaml b/k8s/postgres.yaml
new file mode 100644
index 0000000..5a25643
--- /dev/null
+++ b/k8s/postgres.yaml
@@ -0,0 +1,37 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: postgres
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: postgres
+  template:
+    metadata:
+      labels:
+        app: postgres
+    spec:
+      containers:
+      - name: postgres
+        image: postgres:17-alpine
+        ports:
+        - containerPort: 5432
+        env:
+        - name: POSTGRES_DB
+          value: quickticket
+        - name: POSTGRES_USER
+          value: quickticket
+        - name: POSTGRES_PASSWORD
+          value: quickticket
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: postgres
+spec:
+  selector:
+    app: postgres
+  ports:
+  - port: 5432
+    targetPort: 5432
diff --git a/submissions/lab9.md b/submissions/lab9.md
new file mode 100644
index 0000000..999e5db
--- /dev/null
+++ b/submissions/lab9.md
@@ -0,0 +1,130 @@
+# Lab 9 - Stateful Services & DB Reliability
+
+## Task 1 - Migrations and Backup/Restore
+
+### Alembic Setup
+
+I initialized Alembic and created migration files.
+
+Steps:
+
+* initialized Alembic (`alembic init migrations`)
+* created baseline for existing database schema
+* created migration to add `email` column to events table
+
+### Migration Under Load
+
+I started mixed load testing while migration was running.
+
+Migration:
+
+```sql
+ALTER TABLE events ADD COLUMN email VARCHAR(255);
+```
+
+The migration finished in less than 1 second.
+
+There were no extra errors during the test because the new column was nullable.
+
+### Backup and Restore
+
+I created database backup:
+
+```bash
+pg_dump -Fc > /tmp/quickticket.dump
+```
+
+To simulate data loss, I removed one table:
+
+```sql
+DROP TABLE orders CASCADE;
+```
+
+Then I restored the database:
+
+```bash
+pg_restore --clean --if-exists
+```
+
+After restore, the tables were available again and the API worked normally.
+
+### RPO Observation
+
+* Before disaster: orders existed
+* After DROP: orders missing
+* After restore: orders returned
+
+---
+
+## Task 2 - Disaster Recovery Under Load
+
+### Experiment
+
+I deleted the PostgreSQL pod:
+
+```bash
+kubectl delete pod -l app=postgres --grace-period=0 --force
+```
+
+A new pod was created automatically.
+
+Because storage was ephemeral, database data was lost.
+
+I restored the database from backup and restarted the events service.
+
+```bash
+kubectl rollout restart deployment/events
+```
+
+### RTO and RPO
+
+* RTO was about 1.5 to 2 minutes
+* RPO depended on the last backup time, about a few minutes
+
+### Conclusion
+
+Without PersistentVolumeClaim, PostgreSQL data can be lost when the pod is recreated.
+
+This is a serious problem for stateful applications.
+
+---
+
+## Bonus Task - Persistent Storage and Automated Backup
+
+I added a PersistentVolumeClaim to PostgreSQL deployment.
+
+Storage size:
+
+```yaml
+storage: 1Gi
+```
+
+I also created automated backups with CronJob.
+
+* backup every 5 minutes
+* keep last 5 backup files
+
+### Disaster Recovery Test After PVC
+
+After adding PVC, PostgreSQL started with existing data after pod recreation.
+
+Recovery was much faster because restore from backup was not needed.
+
+RTO became only the pod startup time.
+
+---
+
+## Final Thoughts
+
+In this lab I learned:
+
+* how to use Alembic migrations
+* how to perform database migration under load
+* how to use pg_dump and pg_restore
+* how to measure RTO and RPO
+* how PersistentVolumeClaim protects database data
+* how automated backups improve reliability
+
+The most important lesson for me was:
+
+**Stateful services without persistent storage are very risky because data can be lost after pod recreation.**