diff --git a/attendee/settings/base.py b/attendee/settings/base.py index 87a41a28..d2ee6361 100644 --- a/attendee/settings/base.py +++ b/attendee/settings/base.py @@ -270,6 +270,9 @@ CHARGE_CREDITS_FOR_BOTS = os.getenv("CHARGE_CREDITS_FOR_BOTS", "false") == "true" +# Celery task time limits +BOT_TASK_SOFT_TIME_LIMIT_SECONDS = int(os.getenv("BOT_TASK_SOFT_TIME_LIMIT_SECONDS", 14400)) # 4 hours default + BOT_POD_NAMESPACE = os.getenv("BOT_POD_NAMESPACE", "attendee") WEBPAGE_STREAMER_POD_NAMESPACE = os.getenv("WEBPAGE_STREAMER_POD_NAMESPACE", "attendee-webpage-streamer") REQUIRE_HTTPS_WEBHOOKS = os.getenv("REQUIRE_HTTPS_WEBHOOKS", "true") == "true" diff --git a/bots/models.py b/bots/models.py index d1f88426..d8d17008 100644 --- a/bots/models.py +++ b/bots/models.py @@ -1135,6 +1135,7 @@ class BotEventSubTypes(models.IntegerChoices): BOT_RECORDING_PERMISSION_DENIED_HOST_DENIED_PERMISSION = 23, "Bot recording permission denied - Host denied permission" BOT_RECORDING_PERMISSION_DENIED_REQUEST_TIMED_OUT = 24, "Bot recording permission denied - Request timed out" BOT_RECORDING_PERMISSION_DENIED_HOST_CLIENT_CANNOT_GRANT_PERMISSION = 25, "Bot recording permission denied - Host client cannot grant permission" + FATAL_ERROR_SOFT_TIME_LIMIT_EXCEEDED = 26, "Fatal error - Soft time limit exceeded" @classmethod def sub_type_to_api_code(cls, value): @@ -1165,6 +1166,7 @@ def sub_type_to_api_code(cls, value): cls.BOT_RECORDING_PERMISSION_DENIED_HOST_DENIED_PERMISSION: "host_denied_permission", cls.BOT_RECORDING_PERMISSION_DENIED_REQUEST_TIMED_OUT: "request_timed_out", cls.BOT_RECORDING_PERMISSION_DENIED_HOST_CLIENT_CANNOT_GRANT_PERMISSION: "host_client_cannot_grant_permission", + cls.FATAL_ERROR_SOFT_TIME_LIMIT_EXCEEDED: "soft_time_limit_exceeded", } return mapping.get(value) @@ -1205,7 +1207,7 @@ class Meta: models.CheckConstraint( check=( # For FATAL_ERROR event type, must have one of the valid event subtypes - (Q(event_type=BotEventTypes.FATAL_ERROR) & (Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_PROCESS_TERMINATED) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_ATTENDEE_INTERNAL_ERROR) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_OUT_OF_CREDITS) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_RTMP_CONNECTION_FAILED) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_UI_ELEMENT_NOT_FOUND) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_HEARTBEAT_TIMEOUT) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_BOT_NOT_LAUNCHED))) + (Q(event_type=BotEventTypes.FATAL_ERROR) & (Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_PROCESS_TERMINATED) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_ATTENDEE_INTERNAL_ERROR) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_OUT_OF_CREDITS) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_RTMP_CONNECTION_FAILED) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_UI_ELEMENT_NOT_FOUND) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_HEARTBEAT_TIMEOUT) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_BOT_NOT_LAUNCHED) | Q(event_sub_type=BotEventSubTypes.FATAL_ERROR_SOFT_TIME_LIMIT_EXCEEDED))) | # For COULD_NOT_JOIN event type, must have one of the valid event subtypes (Q(event_type=BotEventTypes.COULD_NOT_JOIN) & (Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_MEETING_NOT_STARTED_WAITING_FOR_HOST) | Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_UNABLE_TO_CONNECT_TO_MEETING) | Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_MEETING_WAITING_ROOM_TIMEOUT_EXCEEDED) | Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_MEETING_ZOOM_AUTHORIZATION_FAILED) | Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_MEETING_LOGIN_REQUIRED) | Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_MEETING_BOT_LOGIN_ATTEMPT_FAILED) | Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_MEETING_ZOOM_MEETING_STATUS_FAILED) | Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_MEETING_UNPUBLISHED_ZOOM_APP) | Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_MEETING_ZOOM_SDK_INTERNAL_ERROR) | Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_MEETING_REQUEST_TO_JOIN_DENIED) | Q(event_sub_type=BotEventSubTypes.COULD_NOT_JOIN_MEETING_MEETING_NOT_FOUND))) diff --git a/bots/tasks/run_bot_task.py b/bots/tasks/run_bot_task.py index 6bece3fd..0264ab41 100644 --- a/bots/tasks/run_bot_task.py +++ b/bots/tasks/run_bot_task.py @@ -3,9 +3,12 @@ import signal from celery import shared_task +from celery.exceptions import SoftTimeLimitExceeded from celery.signals import worker_shutting_down +from django.conf import settings from bots.bot_controller import BotController +from bots.models import Bot, BotEventManager, BotEventSubTypes, BotEventTypes logger = logging.getLogger(__name__) @@ -15,12 +18,27 @@ class StagedBotInterrupted(Exception): pass -@shared_task(bind=True, soft_time_limit=14400, autoretry_for=(StagedBotInterrupted,), retry_kwargs={'max_retries': 5}) # 4 hours - must exceed BOT_MAX_UPTIME_SECONDS +@shared_task(bind=True, soft_time_limit=settings.BOT_TASK_SOFT_TIME_LIMIT_SECONDS, autoretry_for=(StagedBotInterrupted,), retry_kwargs={'max_retries': 5}) def run_bot(self, bot_id): logger.info(f"Running bot {bot_id}") bot_controller = BotController(bot_id) - bot_controller.run() + try: + bot_controller.run() + except SoftTimeLimitExceeded: + logger.warning(f"Bot {bot_id} exceeded soft time limit ({settings.BOT_TASK_SOFT_TIME_LIMIT_SECONDS}s)") + try: + bot = Bot.objects.get(id=bot_id) + BotEventManager.create_event( + bot=bot, + event_type=BotEventTypes.FATAL_ERROR, + event_sub_type=BotEventSubTypes.FATAL_ERROR_SOFT_TIME_LIMIT_EXCEEDED, + ) + except Exception as e: + logger.error(f"Failed to create FATAL_ERROR event for bot {bot_id}: {e}") + finally: + bot_controller.cleanup() + return # After run() completes, check if the bot was interrupted while in STAGED state # If so, raise an exception to trigger retry