{status}
-diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..7814c59 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,59 @@ +name: CI + +on: + pull_request: + +jobs: + python-check: + runs-on: ubuntu-latest + strategy: + matrix: + module: + - backend + - autoscaler + - load_balancer + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Compile Python files + run: python -m compileall ${{ matrix.module }} + + frontend-build: + runs-on: ubuntu-latest + defaults: + run: + working-directory: fe + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: fe/package-lock.json + + - name: Install dependencies + run: npm ci + + - name: Lint frontend + run: npm run lint + + - name: Build frontend + run: npm run build + + docker-compose-check: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Validate docker compose configuration + run: docker compose config diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b617770 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.idea/ +__pycache__/ +*.py[cod] diff --git a/autoscaler/autoscaler.py b/autoscaler/autoscaler.py index 7d01c13..d5bae08 100644 --- a/autoscaler/autoscaler.py +++ b/autoscaler/autoscaler.py @@ -1,175 +1,34 @@ -import time import logging -import os -import multiprocessing -import signal, sys, docker, json -import requests -from metrics import PrometheusClient, DockerManager, clear_prometheus_targets +from cleanup import register_signal_handlers +from config import load_settings +from scaler import AutoScaler +from targets import clear_prometheus_targets -class AutoScaler: - def __init__( - self, - prom_url: str, - docker_image: str, - label: str = 'autoscale_service', - cpu_threshold: float = 0.7, - min_instances: int = 1, - max_instances: int = 10, - check_interval: int = 10, - load_balancer_url: str = "http://host.docker.internal:8000" - ): - self.prom = PrometheusClient(prom_url) - self.dock = DockerManager() - self.image = docker_image - self.label = label - self.threshold = cpu_threshold - self.min = min_instances - self.max = max_instances - self.interval = check_interval - self.above_since = None - self.below_since = None - - self.load_balancer_url = load_balancer_url - - def notify_load_balancer(self): - """로드밸런서에 서버 목록 갱신 요청""" - try: - refresh_url = f"{self.load_balancer_url}/refresh-servers" - - response = requests.post(refresh_url, timeout=3) - - if response.status_code == 200: - logging.info("✅ Load balancer server refresh triggered") - else: - logging.warning(f"⚠️ Load balancer refresh failed: {response.status_code}") - - except requests.exceptions.ConnectionError: - logging.warning("🔌 Could not connect to load balancer") - except requests.exceptions.Timeout: - logging.warning("⏰ Load balancer request timed out") - except Exception as e: - logging.error(f"❗ Error notifying load balancer: {e}") - - def scale(self) -> None: - containers = self.dock.list_containers(self.label) - autoscaled_containers = [c for c in containers if not self.dock._is_fixed(c)] - count = len(containers) - - # under min instances - if count < self.min: - logging.info(f"Instances below minimum ({count} < {self.min}). Scaling up.") - self.dock.run_container(self.image, self.label) - #서버 갱신 요청 - self.notify_load_balancer() - - self.above_since = None - self.below_since = None - return - - num_cpus = multiprocessing.cpu_count() - usages = [self.dock.get_container_cpu(c) for c in containers] - raw_avg = sum(usages) / count if usages else 0.0 - avg_cpu = raw_avg / num_cpus - - logging.info( - f"Avg CPU: {avg_cpu:.2f}% (per core) " - f"across {count} containers" - ) - - now = time.time() - - if avg_cpu > (self.threshold*100): - if self.above_since is None: - self.above_since = now - logging.debug("CPU above threshold, starting timer for scale-out.") - elif now - self.above_since >= 30 and count < self.max: - logging.info("CPU above threshold for ≥ 2 minutes. Scaling up by 1.") - self.dock.run_container(self.image, self.label) - - # 서버 갱신 요청 - self.notify_load_balancer() - - self.above_since = None - self.below_since = None - else: - self.above_since = None - - if avg_cpu < (self.threshold * 50): - if self.below_since is None: - self.below_since = now - logging.debug("CPU below half-threshold, starting timer for scale-in.") - elif now - self.below_since >= 15 and len(autoscaled_containers) > 0: - target = autoscaled_containers[-1] - logging.info(f"CPU below half-threshold for ≥ 1 minute. Scaling down container: {target.name}") - self.dock.remove_container(target) - - # 서버 갱신 요청 - self.notify_load_balancer() - - self.above_since = None - self.below_since = None - elif now - self.below_since >= 30: - logging.info("CPU below half-threshold, but no removable container found (all fixed).") - else: - self.below_since = None - - def run(self) -> None: - logging.info("Starting AutoScaler loop.") - while True: - try: - self.scale() - except Exception as e: - logging.error(f"Error during scaling: {e}") - time.sleep(self.interval) - - -if __name__ == '__main__': +def main(): logging.basicConfig( level=logging.INFO, - format='%(asctime)s [%(levelname)s] %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", ) - clear_prometheus_targets() - prom_url = os.getenv('PROM_URL', 'http://localhost:9090') - docker_img = os.getenv('DOCKER_IMAGE', '') - min_i = int(os.getenv('MIN_INSTANCES', 1)) - max_i = int(os.getenv('MAX_INSTANCES', 10)) - cpu_th = float(os.getenv('CPU_THRESHOLD', 0.7)) - interval = int(os.getenv('CHECK_INTERVAL', 30)) - - def graceful_shutdown(signum, frame): - print("📦 Shutting down autoscaler...") - - # 1. Docker에서 autoscale_service-* 컨테이너 삭제 - client = docker.from_env() - for container in client.containers.list(all=True): - if container.name.startswith("autoscale_service-"): - print(f"🗑 Removing container {container.name}") - try: - container.remove(force=True) - except Exception as e: - print(f"❌ Failed to remove {container.name}: {e}") - # 2. flask.json 초기화 - try: - flask_json_path = "/app/prometheus/targets/flask.json" - if os.path.exists(flask_json_path): - with open(flask_json_path, "w") as f: - json.dump([], f) - print("🧹 flask.json cleared") - except Exception as e: - print(f"❌ Failed to clear flask.json: {e}") - - sys.exit(0) + register_signal_handlers() + clear_prometheus_targets() + settings = load_settings() scaler = AutoScaler( - prom_url, - docker_img, - min_instances=min_i, - max_instances=max_i, - cpu_threshold=cpu_th, - check_interval=interval + prom_url=settings.prom_url, + docker_image=settings.docker_image, + label=settings.label, + min_instances=settings.min_instances, + max_instances=settings.max_instances, + cpu_threshold=settings.cpu_threshold, + check_interval=settings.check_interval, + load_balancer_url=settings.load_balancer_url, ) scaler.run() + + +if __name__ == "__main__": + main() diff --git a/autoscaler/cleanup.py b/autoscaler/cleanup.py new file mode 100644 index 0000000..513190d --- /dev/null +++ b/autoscaler/cleanup.py @@ -0,0 +1,35 @@ +import signal +import sys + +import docker +from targets import FLASK_TARGET_PATH, clear_prometheus_targets + + +def cleanup_autoscaled_containers(): + client = docker.from_env() + for container in client.containers.list(all=True): + if container.name.startswith("autoscale_service-"): + print(f"Removing container {container.name}") + try: + container.remove(force=True) + except Exception as error: + print(f"Failed to remove {container.name}: {error}") + + +def clear_local_target_file(): + clear_prometheus_targets() + print(f"Cleared target file: {FLASK_TARGET_PATH}") + + +def register_signal_handlers(): + def graceful_shutdown(signum, frame): + print("Shutting down autoscaler...") + cleanup_autoscaled_containers() + try: + clear_local_target_file() + except Exception as error: + print(f"Failed to clear flask.json: {error}") + sys.exit(0) + + signal.signal(signal.SIGINT, graceful_shutdown) + signal.signal(signal.SIGTERM, graceful_shutdown) diff --git a/autoscaler/config.py b/autoscaler/config.py new file mode 100644 index 0000000..dfde555 --- /dev/null +++ b/autoscaler/config.py @@ -0,0 +1,30 @@ +import os +from dataclasses import dataclass + + +@dataclass +class AutoScalerSettings: + prom_url: str + docker_image: str + label: str + cpu_threshold: float + min_instances: int + max_instances: int + check_interval: int + load_balancer_url: str + + +def load_settings() -> AutoScalerSettings: + return AutoScalerSettings( + prom_url=os.getenv("PROM_URL", "http://localhost:9090"), + docker_image=os.getenv("DOCKER_IMAGE", ""), + label=os.getenv("AUTOSCALE_LABEL", "autoscale_service"), + cpu_threshold=float(os.getenv("CPU_THRESHOLD", 0.7)), + min_instances=int(os.getenv("MIN_INSTANCES", 1)), + max_instances=int(os.getenv("MAX_INSTANCES", 10)), + check_interval=int(os.getenv("CHECK_INTERVAL", 30)), + load_balancer_url=os.getenv( + "LOAD_BALANCER_URL", + "http://host.docker.internal:8000", + ), + ) diff --git a/autoscaler/docker_manager.py b/autoscaler/docker_manager.py new file mode 100644 index 0000000..729536d --- /dev/null +++ b/autoscaler/docker_manager.py @@ -0,0 +1,88 @@ +import logging +import os +import time +import uuid + +import docker + +from targets import write_prometheus_targets + + +class DockerManager: + def __init__(self): + self.client = docker.from_env() + + def list_containers(self, label: str): + containers = self.client.containers.list(filters={"label": label}) + for container in containers: + logging.debug( + "Container %s (%s) - fixed=%s", + container.name, + container.short_id, + container.labels.get("fixed"), + ) + return containers + + def run_container(self, image: str, label: str): + project = os.getenv("COMPOSE_PROJECT_NAME", "pnu_cloud_computing") + compose_labels = { + "com.docker.compose.project": project, + "com.docker.compose.service": label, + "com.docker.compose.oneoff": "False", + "autoscale_service": label, + } + labels = {"autoscale_service": label} + existing = self.list_containers(label) + if not any(self.is_fixed(container) for container in existing): + labels["fixed"] = "true" + + container_name = f"{label}-{uuid.uuid4().hex[:5]}" + container = self.client.containers.run( + image, + name=container_name, + labels={**compose_labels, **labels}, + detach=True, + ports={"5000/tcp": None}, + network="pnu_cloud_computing_mynet", + ) + self.update_prometheus_targets(label) + return container + + def remove_container(self, container): + if self.is_fixed(container): + logging.info("Skipping fixed container: %s (%s)", container.name, container.short_id) + return + + logging.info("Removing container: %s (%s)", container.name, container.short_id) + container.stop() + container.remove() + self.update_prometheus_targets(container.labels.get("autoscale_service")) + + def get_container_cpu(self, container): + stats_stream = container.stats(stream=True, decode=True) + first = next(stats_stream) + time.sleep(1) + second = next(stats_stream) + del stats_stream + + cpu_delta = ( + second["cpu_stats"]["cpu_usage"]["total_usage"] + - first["cpu_stats"]["cpu_usage"]["total_usage"] + ) + system_delta = ( + second["cpu_stats"]["system_cpu_usage"] + - first["cpu_stats"]["system_cpu_usage"] + ) + + if system_delta > 0.0 and cpu_delta > 0.0: + num_cpus = len(second["cpu_stats"]["cpu_usage"].get("percpu_usage", [])) or 1 + return (cpu_delta / system_delta) * num_cpus * 100.0 + return 0.0 + + def update_prometheus_targets(self, label: str): + containers = self.list_containers(label) + targets = [f"{container.name}:5000" for container in containers if not self.is_fixed(container)] + write_prometheus_targets(targets) + + def is_fixed(self, container) -> bool: + return str(container.labels.get("fixed", "")).lower() == "true" diff --git a/autoscaler/metrics.py b/autoscaler/metrics.py deleted file mode 100644 index 75ec79d..0000000 --- a/autoscaler/metrics.py +++ /dev/null @@ -1,120 +0,0 @@ -import requests -import docker -import json -import time -import logging -import os -import uuid - -FLASK_TARGET_PATH = '/etc/prometheus/targets/flask.json' - -class PrometheusClient: - def __init__(self, base_url: str): - self.base_url = base_url.rstrip('/') - - def get_metric(self, query: str) -> float: - url = f"{self.base_url}/api/v1/query" - resp = requests.get(url, params={'query': query}) - resp.raise_for_status() - data = resp.json() - if data.get('status') != 'success' or not data['data']['result']: - return 0.0 - return float(data['data']['result'][0]['value'][1]) - - def get_avg_cpu_usage(self, label: str) -> float: - query = ( - 'sum(rate(container_cpu_usage_seconds_total{job="cadvisor"}[1m]) * 0.01)' - ) - return self.get_metric(query) - - def get_container_count(self, label: str) -> int: - query = ( - 'count(' - 'container_memory_usage_bytes' - '{job="cadvisor",container_label_autoscale_service="' + label + '"}' - ')' - ) - return int(self.get_metric(query)) - -class DockerManager: - def __init__(self): - self.client = docker.from_env() - - def list_containers(self, label: str): - containers = self.client.containers.list(filters={'label': label}) - for c in containers: - logging.debug(f"Container {c.name} ({c.short_id}) - fixed={c.labels.get('fixed')}") - return containers - - def run_container(self, image: str, label: str): - project = os.getenv('COMPOSE_PROJECT_NAME', 'pnu_cloud_computing') - service = label - compose_labels = { - 'com.docker.compose.project': project, - 'com.docker.compose.service': service, - 'com.docker.compose.oneoff': 'False', - 'autoscale_service': label, - } - labels = {'autoscale_service': label} - existing = self.list_containers(label) - if not any(self._is_fixed(c) for c in existing): - labels['fixed'] = 'true' - - labels = {**compose_labels, **labels} - container_name = f"{label}-{uuid.uuid4().hex[:5]}" - - container = self.client.containers.run( - image, - name=container_name, - labels=labels, - detach=True, - ports={'5000/tcp': None}, - network='pnu_cloud_computing_mynet' - ) - self.update_prometheus_targets(label) - return container - - def remove_container(self, container): - if self._is_fixed(container): - logging.info(f"Skipping fixed container: {container.name} ({container.short_id})") - return - logging.info(f"Removing container: {container.name} ({container.short_id})") - container.stop() - container.remove() - self.update_prometheus_targets(container.labels.get('autoscale_service')) - - def get_container_cpu(self, container): - stats_stream = container.stats(stream=True, decode=True) - first = next(stats_stream) - time.sleep(1) - second = next(stats_stream) - del stats_stream - - cpu_delta = second['cpu_stats']['cpu_usage']['total_usage'] - first['cpu_stats']['cpu_usage']['total_usage'] - system_delta = second['cpu_stats']['system_cpu_usage'] - first['cpu_stats']['system_cpu_usage'] - - if system_delta > 0.0 and cpu_delta > 0.0: - num_cpus = len(second['cpu_stats']['cpu_usage'].get('percpu_usage', [])) or 1 - return (cpu_delta / system_delta) * num_cpus * 100.0 - return 0.0 - - def update_prometheus_targets(self, label: str): - containers = self.list_containers(label) - targets = [] - - for c in containers: - if not self._is_fixed(c): - targets.append(f"{c.name}:5000") - - os.makedirs(os.path.dirname(FLASK_TARGET_PATH), exist_ok=True) - with open(FLASK_TARGET_PATH, 'w') as f: - json.dump([{"targets": targets, "labels": {"job": "flask-autoscaled"}}], f) - - - def _is_fixed(self, container) -> bool: - return str(container.labels.get('fixed', '')).lower() == 'true' - -def clear_prometheus_targets(): - os.makedirs(os.path.dirname(FLASK_TARGET_PATH), exist_ok=True) - with open(FLASK_TARGET_PATH, 'w') as f: - json.dump([{"targets": [], "labels": {"job": "flask-autoscaled"}}], f) diff --git a/autoscaler/notifier.py b/autoscaler/notifier.py new file mode 100644 index 0000000..f61bf4b --- /dev/null +++ b/autoscaler/notifier.py @@ -0,0 +1,24 @@ +import logging + +import requests + + +class LoadBalancerNotifier: + def __init__(self, load_balancer_url: str): + self.load_balancer_url = load_balancer_url + + def notify_refresh(self) -> None: + try: + refresh_url = f"{self.load_balancer_url}/refresh-servers" + response = requests.post(refresh_url, timeout=3) + + if response.status_code == 200: + logging.info("Load balancer server refresh triggered") + else: + logging.warning("Load balancer refresh failed: %s", response.status_code) + except requests.exceptions.ConnectionError: + logging.warning("Could not connect to load balancer") + except requests.exceptions.Timeout: + logging.warning("Load balancer request timed out") + except Exception as error: + logging.error("Error notifying load balancer: %s", error) diff --git a/autoscaler/prometheus_client.py b/autoscaler/prometheus_client.py new file mode 100644 index 0000000..0c0f5e1 --- /dev/null +++ b/autoscaler/prometheus_client.py @@ -0,0 +1,28 @@ +import requests + + +class PrometheusClient: + def __init__(self, base_url: str): + self.base_url = base_url.rstrip("/") + + def get_metric(self, query: str) -> float: + url = f"{self.base_url}/api/v1/query" + response = requests.get(url, params={"query": query}) + response.raise_for_status() + data = response.json() + if data.get("status") != "success" or not data["data"]["result"]: + return 0.0 + return float(data["data"]["result"][0]["value"][1]) + + def get_avg_cpu_usage(self, label: str) -> float: + query = 'sum(rate(container_cpu_usage_seconds_total{job="cadvisor"}[1m]) * 0.01)' + return self.get_metric(query) + + def get_container_count(self, label: str) -> int: + query = ( + 'count(' + 'container_memory_usage_bytes' + '{job="cadvisor",container_label_autoscale_service="' + label + '"}' + ')' + ) + return int(self.get_metric(query)) diff --git a/autoscaler/scaler.py b/autoscaler/scaler.py new file mode 100644 index 0000000..d1490f4 --- /dev/null +++ b/autoscaler/scaler.py @@ -0,0 +1,94 @@ +import logging +import multiprocessing +import time + +from docker_manager import DockerManager +from notifier import LoadBalancerNotifier +from prometheus_client import PrometheusClient + + +class AutoScaler: + def __init__( + self, + prom_url: str, + docker_image: str, + label: str = "autoscale_service", + cpu_threshold: float = 0.7, + min_instances: int = 1, + max_instances: int = 10, + check_interval: int = 10, + load_balancer_url: str = "http://host.docker.internal:8000", + ): + self.prom = PrometheusClient(prom_url) + self.dock = DockerManager() + self.notifier = LoadBalancerNotifier(load_balancer_url) + self.image = docker_image + self.label = label + self.threshold = cpu_threshold + self.min = min_instances + self.max = max_instances + self.interval = check_interval + self.above_since = None + self.below_since = None + + def notify_load_balancer(self) -> None: + self.notifier.notify_refresh() + + def scale(self) -> None: + containers = self.dock.list_containers(self.label) + autoscaled_containers = [container for container in containers if not self.dock.is_fixed(container)] + count = len(containers) + + if count < self.min: + logging.info("Instances below minimum (%s < %s). Scaling up.", count, self.min) + self.dock.run_container(self.image, self.label) + self.notify_load_balancer() + self.above_since = None + self.below_since = None + return + + num_cpus = multiprocessing.cpu_count() + usages = [self.dock.get_container_cpu(container) for container in containers] + raw_avg = sum(usages) / count if usages else 0.0 + avg_cpu = raw_avg + + logging.info("Avg CPU: %.2f%% (per core) across %s containers", avg_cpu, count) + + now = time.time() + if avg_cpu > (self.threshold * 100): + if self.above_since is None: + self.above_since = now + logging.debug("CPU above threshold, starting timer for scale-out.") + elif now - self.above_since >= 30 and count < self.max: + logging.info("CPU above threshold long enough. Scaling up by 1.") + self.dock.run_container(self.image, self.label) + self.notify_load_balancer() + self.above_since = None + self.below_since = None + else: + self.above_since = None + + if avg_cpu < (self.threshold * 50): + if self.below_since is None: + self.below_since = now + logging.debug("CPU below half-threshold, starting timer for scale-in.") + elif now - self.below_since >= 15 and autoscaled_containers: + target = autoscaled_containers[-1] + logging.info("CPU below half-threshold long enough. Scaling down container: %s", target.name) + self.dock.remove_container(target) + self.notify_load_balancer() + self.above_since = None + self.below_since = None + elif now - self.below_since >= 30: + logging.info("CPU below half-threshold, but no removable container found (all fixed).") + else: + self.below_since = None + + def run(self) -> None: + logging.info("Starting AutoScaler loop.") + while True: + try: + self.scale() + except Exception as error: + logging.error("Error during scaling: %s", error) + time.sleep(self.interval) diff --git a/autoscaler/targets.py b/autoscaler/targets.py new file mode 100644 index 0000000..046445e --- /dev/null +++ b/autoscaler/targets.py @@ -0,0 +1,14 @@ +import json +import os + +FLASK_TARGET_PATH = "/etc/prometheus/targets/flask.json" + + +def write_prometheus_targets(targets): + os.makedirs(os.path.dirname(FLASK_TARGET_PATH), exist_ok=True) + with open(FLASK_TARGET_PATH, "w") as file: + json.dump([{"targets": targets, "labels": {"job": "flask-autoscaled"}}], file) + + +def clear_prometheus_targets(): + write_prometheus_targets([]) diff --git a/backend/Dockerfile b/backend/Dockerfile index d8d3969..0615082 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -2,9 +2,9 @@ FROM python:3.10-slim WORKDIR /app -COPY src/requirements.txt /app/ +COPY requirements.txt /app/ RUN pip install --no-cache-dir -r /app/requirements.txt -COPY src/ /app/ +COPY . /app/ CMD ["python", "/app/server.py"] diff --git a/backend/__init__.py b/backend/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/backend/__init__.py @@ -0,0 +1 @@ + diff --git a/backend/app.py b/backend/app.py new file mode 100644 index 0000000..06b96b5 --- /dev/null +++ b/backend/app.py @@ -0,0 +1,47 @@ +from flask import Flask, Response, request +from flask_cors import CORS +from prometheus_client import CONTENT_TYPE_LATEST, Counter, generate_latest +from prometheus_flask_exporter import PrometheusMetrics + +from config import DEFAULT_LOAD_DURATION, TARGET_URL +from stress import StressController + + +def create_app() -> Flask: + app = Flask(__name__) + CORS(app) + + PrometheusMetrics(app, group_by="endpoint") + load_request_counter = Counter( + "load_requests_total", + "Total /load POST requests", + ) + stress_controller = StressController(TARGET_URL) + + @app.route("/load", methods=["GET", "POST", "PUT", "DELETE", "PATCH"]) + def load_handler(): + load_request_counter.inc() + duration = float(request.args.get("duration", str(DEFAULT_LOAD_DURATION))) + stress_controller.enqueue_cpu_load(duration) + return "ok" + + @app.route("/cpu/toggle", methods=["POST"]) + def cpu_toggle(): + return stress_controller.toggle() + + @app.route("/health", methods=["GET", "POST"]) + def health_check(): + return "OK", 200 + + @app.route("/metrics") + def metrics_handler(): + return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST) + + @app.route("/") + def home(): + return "hello, this is pnu cloud computing term project", 200 + + return app + + +app = create_app() diff --git a/backend/config.py b/backend/config.py new file mode 100644 index 0000000..390e4e8 --- /dev/null +++ b/backend/config.py @@ -0,0 +1,9 @@ +TARGET_URL = "http://load_balancer:8000/load" +DEFAULT_LOAD_DURATION = 0.5 +LOAD_STEP_DURATION = 5 +MAX_RPS = 150 +INITIAL_RPS = 20 +RPS_INCREMENT = 20 +REQUEST_TIMEOUT_SECONDS = 5.0 +THREAD_POOL_WORKERS = 50 +CPU_POOL_WORKERS = 4 diff --git a/backend/src/requirements.txt b/backend/requirements.txt similarity index 87% rename from backend/src/requirements.txt rename to backend/requirements.txt index 1e84286..070f8d0 100644 --- a/backend/src/requirements.txt +++ b/backend/requirements.txt @@ -2,4 +2,4 @@ flask prometheus_client flask-cors prometheus_flask_exporter -requests \ No newline at end of file +requests diff --git a/backend/server.py b/backend/server.py new file mode 100644 index 0000000..1f53e1e --- /dev/null +++ b/backend/server.py @@ -0,0 +1,5 @@ +from app import app + + +if __name__ == "__main__": + app.run(host="0.0.0.0", port=5000, threaded=True) diff --git a/backend/src/server.py b/backend/src/server.py deleted file mode 100644 index ef2da0d..0000000 --- a/backend/src/server.py +++ /dev/null @@ -1,127 +0,0 @@ -from flask import Flask, request, Response -from flask_cors import CORS -from prometheus_flask_exporter import PrometheusMetrics -from prometheus_client import Counter, generate_latest, CONTENT_TYPE_LATEST -from multiprocessing import Process, Manager, cpu_count -import multiprocessing -from http.server import HTTPServer, BaseHTTPRequestHandler -import json -import time -import requests - - -app = Flask(__name__) -health_app = Flask(__name__) -CORS(app) - -metrics = PrometheusMetrics(app, group_by='endpoint') -load_request_counter = Counter("load_requests_total", "Total /load POST requests") - -TARGET_URL = "http://load_balancer:8000/load" - -# 전역 상태 -manager = Manager() -stop_event = manager.Event() -load_process = None - -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor - -# 전역 프로세스 풀 (부하 연산용) -cpu_executor = ProcessPoolExecutor(max_workers=4) - -# 실제 CPU 부하를 주는 작업 (휴식 제거) -def cpu_stress_worker(duration): - # stop_event 제거 (Pool에서 실행하기 위해 단순화) - end = time.time() + duration - while time.time() < end: - # 타이트한 루프 연산 - _ = [i * i for i in range(1000)] - return "done" - -# 백엔드 서버에서 부하 처리 -@app.route('/load', methods=['GET', 'POST', 'PUT', 'DELETE', 'PATCH']) -def load_handler(): - load_request_counter.inc() - duration = float(request.args.get("duration", "0.5")) - - # 프로세스 풀에 부하 작업 던지기 (비차단) - # 헬스체크 응답을 방해하지 않기 위해 메인 스레드는 즉시 반환하거나 짧게만 대기 - cpu_executor.submit(cpu_stress_worker, duration) - - return "ok" - -# rps만큼 병렬로 POST /load 호출 -def _send_requests(rps, duration_sec, url, stop_event): - if not url: - return - - interval = 1.0 / rps - end_time = time.time() + duration_sec - - with ThreadPoolExecutor(max_workers=50) as executor: - while time.time() < end_time: - if stop_event.is_set(): - break - # 비동기적으로 요청 발사 - executor.submit(requests.post, url, timeout=5.0) - time.sleep(interval) - -# 증가하는 RPS로 부하 주기 루프 -def send_http_load_loop(stop_event): - url = f"{TARGET_URL}?duration=0.5" - step_duration = 5 - max_rps = 150 - rps = 20 - rps_increment = 20 - - print(f"🚀 부하 생성 루프 시작 (Target: {url})") - while not stop_event.is_set(): - print(f"🔥 현재 부하 강도: {rps} RPS") - _send_requests(rps, step_duration, url, stop_event) - if rps < max_rps: - rps += rps_increment - - -@app.route('/cpu/toggle', methods=['POST']) -def cpu_toggle(): - global load_process, stop_event - - if load_process is None or not load_process.is_alive(): - print("📌 부하 시작") - stop_event = manager.Event() - load_process = Process(target=send_http_load_loop, args=(stop_event,)) - load_process.start() - print(f"✅ 부하 프로세스 시작됨: pid={load_process.pid}") - return "started" - else: - print("🛑 부하 중지 요청 받음") - stop_event.set() - load_process.join(timeout=3) - if load_process.is_alive(): - print("⚠️ 프로세스가 살아있어서 강제 종료합니다.") - load_process.terminate() - load_process.join() - else: - print("✅ 프로세스 정상 종료됨.") - load_process = None - return "stopped" - - -@app.route('/health', methods=['GET','POST']) -def health_check(): - return "OK", 200 - -@app.route('/metrics') -def metrics_handler(): - return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST) - -# 기본 루트 -@app.route('/') -def home(): - return "hello, this is pnu cloud computing term project", 200 - -if __name__ == '__main__': - # health_process = multiprocessing.Process(target=run_health_process, daemon=True) - # health_process.start() - - app.run(host='0.0.0.0', port=5000, threaded=True) diff --git a/backend/stress.py b/backend/stress.py new file mode 100644 index 0000000..85d3293 --- /dev/null +++ b/backend/stress.py @@ -0,0 +1,86 @@ +import time +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from multiprocessing import Manager, Process + +import requests + +from config import ( + CPU_POOL_WORKERS, + DEFAULT_LOAD_DURATION, + INITIAL_RPS, + LOAD_STEP_DURATION, + MAX_RPS, + REQUEST_TIMEOUT_SECONDS, + RPS_INCREMENT, + THREAD_POOL_WORKERS, +) + + +def cpu_stress_worker(duration: float): + end = time.time() + duration + while time.time() < end: + _ = [i * i for i in range(1000)] + return "done" + + +def send_requests(rps: int, duration_sec: int, url: str, stop_event) -> None: + if not url: + return + + interval = 1.0 / rps + end_time = time.time() + duration_sec + + with ThreadPoolExecutor(max_workers=THREAD_POOL_WORKERS) as executor: + while time.time() < end_time: + if stop_event.is_set(): + break + executor.submit(requests.post, url, timeout=REQUEST_TIMEOUT_SECONDS) + time.sleep(interval) + + +def send_http_load_loop(stop_event, target_url: str) -> None: + url = f"{target_url}?duration={DEFAULT_LOAD_DURATION}" + rps = INITIAL_RPS + + print(f"Starting load generation loop (target: {url})") + while not stop_event.is_set(): + print(f"Current load level: {rps} RPS") + send_requests(rps, LOAD_STEP_DURATION, url, stop_event) + if rps < MAX_RPS: + rps += RPS_INCREMENT + + +class StressController: + def __init__(self, target_url: str): + self.target_url = target_url + self.manager = Manager() + self.stop_event = self.manager.Event() + self.load_process = None + self.cpu_executor = ProcessPoolExecutor(max_workers=CPU_POOL_WORKERS) + + def enqueue_cpu_load(self, duration: float) -> None: + self.cpu_executor.submit(cpu_stress_worker, duration) + + def toggle(self) -> str: + if self.load_process is None or not self.load_process.is_alive(): + print("Starting stress process") + self.stop_event = self.manager.Event() + self.load_process = Process( + target=send_http_load_loop, + args=(self.stop_event, self.target_url), + ) + self.load_process.start() + print(f"Stress process started with pid={self.load_process.pid}") + return "started" + + print("Received request to stop stress process") + self.stop_event.set() + self.load_process.join(timeout=3) + if self.load_process.is_alive(): + print("Stress process is still alive. Terminating it forcefully.") + self.load_process.terminate() + self.load_process.join() + else: + print("Stress process stopped cleanly.") + self.load_process = None + return "stopped" diff --git a/fe/src/App.css b/fe/src/App.css deleted file mode 100644 index 92b61cc..0000000 --- a/fe/src/App.css +++ /dev/null @@ -1,9 +0,0 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; - -#root { - margin: 0; - padding: 0; -} - diff --git a/fe/src/App.tsx b/fe/src/App.tsx deleted file mode 100644 index 3d7ded3..0000000 --- a/fe/src/App.tsx +++ /dev/null @@ -1,35 +0,0 @@ -import { useState } from 'react' -import reactLogo from './assets/react.svg' -import viteLogo from '/vite.svg' -import './App.css' - -function App() { - const [count, setCount] = useState(0) - - return ( - <> -
-
- Edit src/App.tsx and save to test HMR
-
- Click on the Vite and React logos to learn more -
- > - ) -} - -export default App diff --git a/fe/src/assets/react.svg b/fe/src/assets/react.svg deleted file mode 100644 index 6c87de9..0000000 --- a/fe/src/assets/react.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/fe/src/components/Overview.tsx b/fe/src/components/Overview.tsx index 2e99749..606506b 100644 --- a/fe/src/components/Overview.tsx +++ b/fe/src/components/Overview.tsx @@ -1,21 +1,16 @@ +import { GRAFANA_PANELS, buildGrafanaPanelUrl } from '../config/grafana'; + const Overview = () => { return ( -{status}
-{status}