diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index a2a710b8..839d5891 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -1,9 +1,7 @@ on: push - jobs: build: - runs-on: ubuntu-latest services: @@ -18,40 +16,36 @@ jobs: # needed because the postgres container does not provide a healthcheck options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.9 - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - name: Install dependencies - run: | - pip install pip==19.3.1 - pip install --force-reinstall 'setuptools<58.0.0' - pip install -r requirements.txt - pip install -r requirements-toil.txt - #python manage.py migrate - - name: Run migrations - run: python manage.py migrate - - name: Run test - #run: python manage.py test - run: | - coverage run --source='.' manage.py test - coverage report -m --fail-under=75 - - name: Run flake8 - uses: suo/flake8-github-action@releases/v1 - with: - checkName: 'build' # NOTE: this needs to be the same as the job name + - uses: actions/checkout@v2 + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + pip install pip==19.3.1 + pip install -r requirements.txt + pip install -r requirements-toil.txt + - name: Run migrations + run: python manage.py migrate + - name: Run test + #run: python manage.py test + run: | + coverage run --source='.' 
manage.py test + coverage report -m --fail-under=75 + - name: Run flake8 + uses: suo/flake8-github-action@releases/v1 + with: + checkName: "build" # NOTE: this needs to be the same as the job name env: - #database env variables + #database env variables RIDGEBACK_DB_NAME: github_actions RIDGEBACK_DB_PASSWORD: postgres RIDGEBACK_DB_USERNAME: postgres #lsf env variables RIDGEBACK_LSF_WALLTIME: 10:00 - RIDGEBACK_LSF_SLA: SLA #celery env variables CELERY_LOG_PATH: /sample_path @@ -69,23 +63,33 @@ jobs: RIDGEBACK_RABBITMQ_USERNAME: sample_username RIDGEBACK_RABBITMQ_PASSWORD: sample_password - #pipeline env variables + #pipeline env variables ARGOS_JOB_STORE_ROOT: /sample_path ARGOS_WORK_DIR_ROOT: /sample_path ARGOS_TMP_DIR_ROOT: /sample_path + ARGOS_PARTITION: sample_SLA TEMPO_JOB_STORE_ROOT: /sample_path TEMPO_WORK_DIR_ROOT: /sample_path TEMPO_TMP_DIR_ROOT: /sample_path + TEMPO_PARTITION: sample_SLA ACCESS_JOB_STORE_ROOT: /sample_path ACCESS_WORK_DIR_ROOT: /sample_path ACCESS_TMP_DIR_ROOT: /sample_path + ACCESS_PARTITION: sample_SLA CMO_CH_JOB_STORE_ROOT: /sample_path CMO_CH_WORK_DIR_ROOT: /sample_path CMO_CH_TMP_DIR_ROOT: /sample_path + CMO_CH_PARTITION: sample_SLA ACCESS_HEME_JOB_STORE_ROOT: /sample_path ACCESS_HEME_WORK_DIR_ROOT: /sample_path ACCESS_HEME_TMP_DIR_ROOT: /sample_path + ACCESS_HEME_PARTITION: sample_SLA + MICROBIOME_JOB_STORE_ROOT: /sample_path + MICROBIOME_WORK_DIR_ROOT: /sample_path + MICROBIOME_TMP_DIR_ROOT: /sample_path + MICROBIOME_PARTITION: sample_SLA DEFAULT_JOB_STORE_ROOT: /sample_path DEFAULT_WORK_DIR_ROOT: /sample_path DEFAULT_TMP_DIR_ROOT: /sample_path + DEFAULT_PARTITION: sample_SLA diff --git a/.gitignore b/.gitignore index 67398010..cd8cb177 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ conda/ toil .toil/ *.sif +.env diff --git a/.travis.yml b/.travis.yml index a54f4780..7966b9d6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ addons: - postgresql-14 - postgresql-client-14 python: - - "3.9" + - "3.10" 
# Base image for Ridgeback: Python 3.10 on Debian slim plus the system
# libraries needed to build and run the Python requirements.
FROM python:3.10-slim

LABEL org.opencontainers.image.vendor="MSKCC" \
      org.opencontainers.image.authors="Nikhil Kumar (kumarn1@mskcc.org)" \
      org.opencontainers.image.created="2025-09-15T16:04:00Z" \
      org.opencontainers.image.licenses="Apache-2.0" \
      org.opencontainers.image.version="1.0.0" \
      org.opencontainers.image.source="https://github.com/mskcc/ridgeback" \
      org.opencontainers.image.title="Ridgeback" \
      org.opencontainers.image.description="API for running Toil and Nextflow jobs, supports LSF, SLURM, and singleMachine mode"

# Use the key=value form; the space-separated "ENV key value" form is legacy
# syntax that current Docker build checks warn about.
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update \
    # Install dependencies
    && apt-get -y --no-install-recommends install \
        wget curl libldap2-dev libsasl2-dev procps libssl-dev libxml2-dev libxslt-dev \
        libpq-dev gawk nodejs git build-essential openssh-client \
    # Install alternative ssl library
    && wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb \
    && dpkg -i libssl1.1_1.1.1f-1ubuntu2_amd64.deb \
    # Install libffi6 for local python commands
    && wget http://archive.ubuntu.com/ubuntu/pool/main/libf/libffi/libffi6_3.2.1-8_amd64.deb \
    && dpkg -i libffi6_3.2.1-8_amd64.deb \
    # Clean up image: remove the downloaded packages (they are already
    # installed) in the same layer, together with the apt package lists
    && rm -f libssl1.1_1.1.1f-1ubuntu2_amd64.deb libffi6_3.2.1-8_amd64.deb \
    && rm -rf /var/lib/apt/lists/*
class BatchClient:
    """
    Base client describing the interface shared by the batch system clients

    Every method is a neutral placeholder: the argument builders return an
    empty list and the job-control methods return None. The LSF and SLURM
    clients subclass this and override the methods with real behaviour.

    Attributes:
        logger (logging.Logger): logger named "BATCH_client"
        logfileName (str): default log file name
        name (str): short name of the batch system
        user (str): user the client was created for
    """

    def __init__(self, user=getuser()):
        """
        Store the user and set up the logger and default names

        Args:
            user (str): user to act as, defaults to the current user
        """
        self.user = user
        self.name = "batch"
        self.logfileName = "batch.log"
        self.logger = logging.getLogger("BATCH_client")

    def submit(self, command, job_args, stdout, job_id, partition, env={}):
        """
        Submit a command to the batch system, logging to stdout

        Args:
            command (list): command to submit
            job_args (list): Additional options for leader job
            stdout (str): log file path
            job_id (str): job_id
            partition (str): the batch system partition to use
            env (dict): Environment variables

        Returns:
            int: batch job id (this base implementation returns None)
        """
        return None

    def terminate(self, job_id):
        """
        Kill a batch job

        Args:
            job_id (str): job_id

        Returns:
            bool: successful (this base implementation returns None)
        """
        return None

    def set_walltime(self, expected_limit, hard_limit):
        """
        Build the walltime arguments of the batch job

        Returns:
            list: always empty in the base client
        """
        return []

    def set_memlimit(self, mem_limit, default=None):
        """
        Build the memory limit arguments of the batch job

        Returns:
            list: always empty in the base client
        """
        return []

    def set_num_tasks(self, num_tasks, default=None):
        """
        Build the number-of-tasks arguments of the batch job

        Returns:
            list: always empty in the base client
        """
        return []

    def get_env_export_flag(self):
        """
        Flag that enables env propagation for the batch jobs

        Returns:
            str: CLI flag to enable env propagation (None in the base client)
        """
        return None

    def set_group(self, group_id):
        """
        Build the group arguments of the batch job

        Returns:
            list: always empty in the base client
        """
        return []

    def set_stdout_file(self, stdout_file):
        """
        Build the arguments selecting the output path of the log file

        Returns:
            list: always empty in the base client
        """
        return []

    def set_service_queue(self, partition):
        """
        Build the service queue arguments

        Returns:
            list: always empty in the base client
        """
        return []

    def status(self, external_job_id):
        """Report the status of a batch job

        Args:
            external_job_id (str): Batch id

        Returns:
            tuple: (Ridgeback Status int, extra info); None in the base client
        """
        return None

    def suspend(self, job_id):
        """
        Suspend a batch job

        Args:
            job_id (str): id of job

        Returns:
            bool: successful (this base implementation returns None)
        """
        return None

    def resume(self, job_id):
        """
        Resume a batch job

        Args:
            job_id (str): id of job

        Returns:
            bool: successful (this base implementation returns None)
        """
        return None
def set_walltime(self, expected_limit, hard_limit):
    """
    Build the bsub walltime arguments

    Args:
        expected_limit: expected runtime, emitted with "-We" when truthy
        hard_limit: hard runtime limit, emitted with "-W" when truthy

    Returns:
        list: flag/value pairs (values stringified), expected limit first
    """
    flags = []
    for option, limit in (("-We", expected_limit), ("-W", hard_limit)):
        if limit:
            flags.extend([option, str(limit)])
    return flags
def set_num_tasks(self, num_tasks, default=None):
    """
    Set the number of tasks for the batch job

    Args:
        num_tasks: requested number of tasks; takes precedence when truthy
        default: fallback used when num_tasks is falsy

    Returns:
        list: ["-n", "<count>"] or an empty list when neither value is set
    """
    chosen = num_tasks if num_tasks else default
    if not chosen:
        return []
    # Stringify like set_walltime does: these lists end up in a subprocess
    # argument list, which rejects non-string elements such as ints.
    return ["-n", str(chosen)]
class SLURMClient(BatchClient):
    """
    Client for SLURM

    Attributes:
        logger (logging): logging module
        logfileName (str): default log file name used when no stdout is given
        name (str): short name of the batch system
        user (str): user the client was created for
    """

    # SLURM job states grouped by the Ridgeback status they map to.
    _PENDING_STATES = (
        "PENDING",
        "CONFIGURING",
        "REQUEUED",
        "REQUEUE_FED",
        "REQUEUE_HOLD",
        "RESIZING",
        "RESV_DEL_HOLD",
        "POWER_UP_NODE",
    )
    _FAILED_STATES = (
        "BOOT_FAIL",
        "LAUNCH_FAILED",
        "CANCELLED",
        "DEADLINE",
        "FAILED",
        "NODE_FAIL",
        "OUT_OF_MEMORY",
        "PREEMPTED",
        "REVOKED",
        "SPECIAL_EXIT",
        "RECONFIG_FAIL",
        "TIMEOUT",
    )
    _RUNNING_STATES = ("RUNNING", "COMPLETING", "STAGE_OUT")
    _SUSPENDED_STATES = ("SUSPENDED", "STOPPED")

    def __init__(self, user=getuser()):
        """
        init function
        """
        self.logger = logging.getLogger("SLURM_client")
        self.logfileName = "slurm.log"
        self.name = "slurm"
        self.user = user

    @userswitch
    def submit(self, command, job_args, stdout, job_id, partition, env={}):
        """
        Submit command to SLURM and store log in stdout

        Args:
            command (list): command to submit
            job_args (list): Additional options for leader sbatch
            stdout (str): log file path
            job_id (str): job_id
            partition (str): the batch system partition to use
            env (dict): Environment variables; a falsy value removes the
                variable from the submission environment

        Returns:
            int: slurm job id

        Raises:
            FailToSubmitToSchedulerException: sbatch exited non-zero or its
                output did not contain a job id
        """
        # dirname() is "" for a bare file name; subprocess needs None to
        # mean "current directory".
        work_dir = os.path.dirname(stdout) or None

        # NOTE(review): the command is joined unquoted and interpreted by the
        # shell that sbatch --wrap starts; arguments containing spaces or
        # shell metacharacters are not protected. Confirm callers never pass
        # untrusted values here.
        command_str = " ".join(command)

        sbatch_command = (
            ["sbatch"]
            + self.set_service_queue(partition)
            + self.set_group(job_id)
            + self.set_stdout_file(stdout)
            + job_args
            + [f"--wrap=exec {command_str}"]
        )
        current_env = os.environ.copy()
        for k, v in env.items():
            if v:
                current_env[k] = v
            elif k in current_env:
                current_env.pop(k)
        self.logger.debug("Running command: %s\nEnv: %s", sbatch_command, current_env)
        # No check=True: a non-zero exit must reach the branch below so the
        # documented scheduler exception is raised instead of
        # subprocess.CalledProcessError.
        process = subprocess.run(
            sbatch_command,
            stdout=subprocess.PIPE,
            universal_newlines=True,
            cwd=work_dir,
            env=current_env,
        )
        if process.returncode != 0:
            failure = f"Failed to submit job to SLURM. Process return_code: {process.returncode}"
            # logger.error, not logger.exception: there is no active exception
            self.logger.error(failure)
            raise FailToSubmitToSchedulerException(failure)
        return self._parse_procid(process.stdout)

    @userswitch
    def terminate(self, job_id):
        """
        Kill SLURM job

        Args:
            job_id (str): job_id

        Returns:
            bool: successful
        """
        self.logger.debug("Terminating SLURM jobs for job %s", job_id)
        scancel_command = ["scancel", f"--wckey={job_id}"]
        # No check=True, otherwise the exit code 255 accepted below could
        # never be observed and failures would raise instead of returning
        # False.
        process = subprocess.run(scancel_command, stdout=subprocess.PIPE, universal_newlines=True)
        return process.returncode in (0, 255)

    def set_walltime(self, expected_limit, hard_limit):
        """
        Set the walltime args of the batch job

        Only the hard limit is passed to sbatch; an expected limit is logged
        and otherwise ignored.
        """
        walltime_args = []
        if expected_limit:
            self.logger.debug(
                "Expected limits on submit are no supported, please check the cluster KillWait and OverTimeLimit params"
            )
        if hard_limit:
            walltime_args = [f"--time={hard_limit}"]
        return walltime_args

    def set_memlimit(self, mem_limit, default=None):
        """
        Set the memlimit args of the batch job, preferring mem_limit over default

        The value is suffixed with "G" before being passed to --mem.
        """
        mem_limit_args = []
        if mem_limit:
            return [f"--mem={mem_limit}G"]
        if default:
            mem_limit_args = [f"--mem={default}G"]
        return mem_limit_args

    def set_num_tasks(self, num_tasks, default=None):
        """
        Set the number of tasks for the batch job
        """
        num_task_args = []
        if default:
            num_task_args = [f"--cpus-per-task={default}"]
        if num_tasks:
            num_task_args = [f"--cpus-per-task={num_tasks}"]
        return num_task_args

    def get_env_export_flag(self):
        """
        Flag to enable env propagation for the batch jobs

        Returns:
            str: CLI flag to enable env propagation
        """
        return "--export=ALL"

    def set_group(self, group_id):
        """
        Set the group args of the batch job (the group is stored as the wckey)
        """
        group_id_args = []
        if group_id:
            group_id_args = [f"--wckey={group_id}"]
        return group_id_args

    def set_stdout_file(self, stdout_file):
        """
        Set the output path of the log file, falling back to logfileName
        """
        if stdout_file:
            return [f"--output={stdout_file}"]
        return [f"--output={self.logfileName}"]

    def set_service_queue(self, partition):
        """
        Set the service queue (partition) parameter
        """
        service_queue_args = []
        if partition:
            service_queue_args = [f"--partition={partition}"]
        return service_queue_args

    def _parse_sacct(self, sacct_output_str, external_job_id):
        """
        Parse the output of sacct into a job record

        The "<id>.batch" step is preferred; the job-level "<id>" row is used
        only when no batch step row is present.

        Args:
            sacct_output_str (str): Stdout from sacct
            external_job_id (str): SLURM job id

        Returns:
            tuple: sacct job record (id,status,batch_system_exitcode,tool_exitcode)
                or None when the job is not in the output
        """
        job_key = f"{external_job_id}"
        batch_key = f"{external_job_id}.batch"
        job_record = None
        batch_record = None
        for single_sacct_line in (sacct_output_str or "").strip().split("\n"):
            slurm_job_info = single_sacct_line.strip().split("|")
            if len(slurm_job_info) < 3:
                # blank or malformed row; skip instead of raising IndexError
                continue
            slurm_id = slurm_job_info[0]
            if slurm_id not in (job_key, batch_key):
                continue
            # sacct can report e.g. "CANCELLED by <uid>"; keep the state word
            state_words = slurm_job_info[1].split()
            status = state_words[0] if state_words else ""
            exitcode_batch, _, exitcode_tool = slurm_job_info[2].partition(":")
            record = (slurm_id, status, exitcode_batch, exitcode_tool)
            if slurm_id == batch_key:
                batch_record = record
            elif job_record is None:
                job_record = record
        sacct_record = batch_record or job_record
        if not sacct_record:
            self.logger.error(f"Error - sacct command could not find job {external_job_id}")
        return sacct_record

    def _parse_procid(self, stdout):
        """
        Parse sbatch output and retrieve the SLURM id

        Args:
            stdout (str): sbatch output

        Returns:
            int: SLURM id

        Raises:
            FailToSubmitToSchedulerException: no job id found in the output
        """
        self.logger.debug("SLURM returned %s", stdout)
        # Capture digits only so trailing text after the id cannot break int()
        slurm_job_id_search = re.search(r"Submitted batch job (\d+)", stdout)
        if slurm_job_id_search:
            slurm_job_id = int(slurm_job_id_search[1])
            self.logger.debug("Got the job id: %s", slurm_job_id)
            return slurm_job_id
        self.logger.error("Could not parse job_id. Job is not submitted to SLURM\nReason: %s", stdout)
        raise FailToSubmitToSchedulerException(f"Reason: {stdout}")

    def _handle_status(self, process_status, batchsystem_exitcode, tool_exitcode, external_job_id):
        """
        Map SLURM status to Ridgeback status

        Args:
            process_status (str): SLURM status of process
            batchsystem_exitcode (str): Exitcode from the batchsystem
            tool_exitcode (str): Exitcode from the tool
            external_job_id (str): SLURM id, used for logging

        Returns:
            tuple: (Ridgeback Status int, extra info)
        """
        if process_status == "COMPLETED":
            self.logger.debug("Job [%s] completed", external_job_id)
            return Status.COMPLETED, None
        if process_status in self._PENDING_STATES:
            self.logger.debug("Job [%s] is pending", external_job_id)
            return Status.PENDING, None
        if process_status in self._FAILED_STATES:
            exit_info = (
                f"{process_status}, tool exit code: {tool_exitcode}, batchsystem exit code: {batchsystem_exitcode}"
            )
            self.logger.error("Job [%s] failed with: %s", external_job_id, exit_info)
            return Status.FAILED, exit_info.strip()
        if process_status in self._RUNNING_STATES:
            self.logger.debug("Job [%s] is running", external_job_id)
            return Status.RUNNING, None
        if process_status in self._SUSPENDED_STATES:
            self.logger.debug("Job [%s] is suspended", external_job_id)
            suspended_info = "Job suspended"
            return Status.SUSPENDED, suspended_info.strip()
        self.logger.debug("Job [%s] is in an unhandled state (%s)", external_job_id, process_status)
        status_info = "Job is in an unhandles state: {}".format(process_status)
        return Status.UNKNOWN, status_info.strip()

    def _parse_status(self, stdout, external_job_id):
        """Parse SLURM stdout helper

        Args:
            stdout (str): stdout of sacct
            external_job_id (str): SLURM id

        Returns:
            tuple: (Ridgeback Status int, extra info)

        Raises:
            FetchStatusException: the job is not present in the sacct output
        """
        sacct_record = self._parse_sacct(stdout, external_job_id)
        if sacct_record:
            _, status, exit_batch, exit_tool = sacct_record
            return self._handle_status(status, exit_batch, exit_tool, external_job_id)

        raise FetchStatusException(f"Failed to get status for job {external_job_id}")

    @userswitch
    def status(self, external_job_id):
        """Parse SLURM status

        Args:
            external_job_id (str): SLURM id

        Returns:
            tuple: (Ridgeback Status int, extra info)
        """
        # Query the whole job rather than only "<id>.batch": a job whose
        # batch step has not started yet (e.g. still pending) has no such
        # step, and _parse_sacct already prefers the batch row when present.
        sacct_command = ["sacct", f"--jobs={external_job_id}", "--format=jobid,state,exitcode", "-n", "-P"]
        self.logger.debug("Checking slurm status for job: %s", external_job_id)
        process = subprocess.run(sacct_command, check=True, stdout=subprocess.PIPE, universal_newlines=True)
        status = self._parse_status(process.stdout, str(external_job_id))
        return status

    @userswitch
    def _get_job_list(self, job_id):
        """Get slurm job ids in a group

        Args:
            job_id (str): id of job

        Returns:
            list: SLURM job ids (step suffixes removed, duplicates dropped)
        """
        slurm_jobs = []
        sacct_command = ["sacct", f"--wckeys={job_id}", "--format=jobid", "-n", "-P"]
        process = subprocess.run(sacct_command, check=True, stdout=subprocess.PIPE, universal_newlines=True)
        for single_line in process.stdout.strip().split("\n"):
            single_slurm_id = single_line.split(".")[0]
            if single_slurm_id and single_slurm_id not in slurm_jobs:
                slurm_jobs.append(single_slurm_id)
        return slurm_jobs

    def _scontrol_job_group(self, action, job_id):
        """Run `scontrol <action>` on every SLURM job in the group of job_id

        Returns:
            bool: True when scontrol exited with 0
        """
        job_list = self._get_job_list(job_id)
        if not job_list:
            # Nothing to act on; do not invoke scontrol with an empty id
            self.logger.error("No SLURM jobs found for job %s", job_id)
            return False
        scontrol_command = ["scontrol", action, ",".join(job_list)]
        process = subprocess.run(scontrol_command, stdout=subprocess.PIPE, universal_newlines=True)
        return process.returncode == 0

    @userswitch
    def suspend(self, job_id):
        """
        Suspend SLURM job
        Args:
            job_id (str): id of job
        Returns:
            bool: successful
        """
        self.logger.debug("Suspending SLURM jobs for job %s", job_id)
        return self._scontrol_job_group("suspend", job_id)

    @userswitch
    def resume(self, job_id):
        """
        Resume SLURM job
        Args:
            job_id (str): id of job
        Returns:
            bool: successful
        """
        self.logger.debug("Resuming SLURM jobs for job %s", job_id)
        return self._scontrol_job_group("resume", job_id)
-G ${DOCKER_GROUPS} + id -u slurm &>/dev/null || useradd -s /bin/bash -d /var/lib/slurm --uid ${SLURM_UID} -m slurm -g slurm + id -u munge &>/dev/null || useradd -s /sbin/nologin -d ${SLURM_MUNGE_VAR} --uid ${MUNGE_UID} -m munge -g munge + for single_user in ${all_users}; do + single_user_items=($${!single_user[0]}) + id -u $${single_user_items[1]} &>/dev/null || useradd -s /bin/bash --uid $${single_user_items[0]} -m $${single_user_items[1]} -G $${single_user_items[2]} + done + user: root + env_file: .env + group_add: + - ${DOCKER_GROUP_1} + - ${DOCKER_GROUP_2} + environment: + - RIDGEBACK_DB_URL=ridgeback_postgres + - RIDGEBACK_MEMCACHED_HOST=ridgeback_memcached + - RIDGEBACK_RABBITMQ_URL=ridgeback_rabbitmq + - RIDGEBACK_DB_PORT=5432 + - RIDGEBACK_MEMCACHED_PORT=11211 + - RIDGEBACK_RABBITMQ_PORT=5672 + - RIDGEBACK_LOG_PATH=/ridgeback/celery/logs/django_server.log + - PATH=${PATH}:/usr/batchsystem/bin + volumes: + - ${SLURM_BIN_PATH}:/usr/batchsystem/bin + - logs_path:/ridgeback/celery/logs/ + - celery_path:/ridgeback/celery/ + - ${CLUSTER_FILESYSTEM_MOUNT}:${CLUSTER_FILESYSTEM_MOUNT} + - ${CLUSTER_SCRATCH_MOUNT}:${CLUSTER_SCRATCH_MOUNT} + - ${CLUSTER_PYTHON_MOUNT}:${CLUSTER_PYTHON_MOUNT} + - ${CLUSTER_PYTHON_LIBSO_MOUNT}:${CONTAINER_PYTHON_LIBSO_MOUNT} + - ${CLUSTER_PYTHON_LIB_MOUNT}:${CLUSTER_PYTHON_LIB_MOUNT} + - ${CLUSTER_PYTHON_INCLUDE_MOUNT}:${CLUSTER_PYTHON_INCLUDE_MOUNT} + - ${CLUSTER_CODE_PATH:-/dev/null}:${CLUSTER_CODE_PATH:-/dev/null} + - type: bind + source: /etc/profile + target: /etc/profile + read_only: true + - type: bind + source: /etc/bashrc + target: /etc/bashrc + read_only: true + - type: bind + source: ${SLURM_ETC} + target: ${SLURM_ETC} + read_only: true + - type: bind + source: ${SLURM_LIB_PATH} + target: ${SLURM_LIB_PATH} + read_only: true + - type: bind + source: ${SLURM_MUNGE_VAR} + target: ${SLURM_MUNGE_VAR} + read_only: true + - type: bind + source: ${SLURM_LIBMUNGE_OBJECT} + target: ${SLURM_LIBMUNGE_OBJECT} + read_only: true + - type: 
# One-shot helper: hands ownership of the named volumes to the deployment
# user so the other services can write to them.
ridgeback_create_volumes:
  image: alpine:3.8
  # Must be quoted: a bare `no` is a YAML boolean, not the restart policy
  # string that Compose expects.
  restart: "no"
  volumes:
    - postgres_path:/postgres
    - logs_path:/logs
    - celery_path:/celery
    - rabbitmq_path:/rabbitmq
    - server_path:/server
    - logrotate_path:/logrotate
    - db_backup_path:/db_backup
  entrypoint: ["/bin/sh", "-c"]
  command:
    - |
      chown -R ${DOCKER_UID}:${DOCKER_GID} /postgres
      chown -R ${DOCKER_UID}:${DOCKER_GID} /logs
      chmod -R 777 /logs
      chown -R ${DOCKER_UID}:${DOCKER_GID} /celery
      chown -R ${DOCKER_UID}:${DOCKER_GID} /rabbitmq
      chown -R ${DOCKER_UID}:${DOCKER_GID} /server
      chown -R ${DOCKER_UID}:${DOCKER_GID} /logrotate
      chown -R ${DOCKER_UID}:${DOCKER_GID} /db_backup
# Ridgeback web/API service: installs the Python requirements on first start,
# runs migrations, ensures the admin user exists and serves the app.
ridgeback:
  image: mskcc/ridgeback:${RIDGEBACK_VERSION}
  restart: always
  user: "${DOCKER_UID}:${DOCKER_GID}"
  networks:
    - voyager_net
  env_file: .env
  group_add:
    - ${DOCKER_GROUP_1}
    - ${DOCKER_GROUP_2}
  environment:
    - RIDGEBACK_DB_URL=ridgeback_postgres
    - RIDGEBACK_MEMCACHED_HOST=ridgeback_memcached
    - RIDGEBACK_RABBITMQ_URL=ridgeback_rabbitmq
    - RIDGEBACK_DB_PORT=5432
    - RIDGEBACK_MEMCACHED_PORT=11211
    - RIDGEBACK_RABBITMQ_PORT=5672
    - RIDGEBACK_LOG_PATH=/ridgeback/server/django_server.log
  volumes:
    - logs_path:/ridgeback/server/
    - server_path:/ridgeback_staticfiles/
    - ${CLUSTER_FILESYSTEM_MOUNT}:${CLUSTER_FILESYSTEM_MOUNT}
    - ${CLUSTER_CODE_PATH:-/dev/null}:${CLUSTER_CODE_PATH:-/dev/null}
    - ${CLUSTER_SCRATCH_MOUNT}:${CLUSTER_SCRATCH_MOUNT}
    - ${CLUSTER_PYTHON_MOUNT}:${CLUSTER_PYTHON_MOUNT}
    - ${CLUSTER_PYTHON_LIBSO_MOUNT}:${CONTAINER_PYTHON_LIBSO_MOUNT}
    - ${CLUSTER_PYTHON_LIB_MOUNT}:${CLUSTER_PYTHON_LIB_MOUNT}
    - ${CLUSTER_PYTHON_INCLUDE_MOUNT}:${CLUSTER_PYTHON_INCLUDE_MOUNT}
  ports:
    # Quoted so the HOST:CONTAINER mapping is always read as a string
    - "${RIDGEBACK_PORT}:${RIDGEBACK_PORT}"
  # Runs as root after the container starts: widens the UID/GID ranges in
  # /etc/login.defs, then creates the configured groups and the deployment
  # user if they do not exist yet.
  post_start:
    - command:
        - /bin/bash
        - -c
        - |
          sed -i 's/UID_MAX 60000/UID_MAX 6000000000/g' /etc/login.defs
          sed -i 's/GID_MAX 60000/GID_MAX 6000000000/g' /etc/login.defs
          sed -i 's/UID_MIN 1000/UID_MIN 100/g' /etc/login.defs
          sed -i 's/GID_MIN 1000/GID_MIN 100/g' /etc/login.defs
          for single_group in ${all_groups}; do
            single_group_items=($${!single_group[0]})
            getent group $${single_group_items[1]} &>/dev/null || groupadd -g $${single_group_items[0]} $${single_group_items[1]}
          done
          id -u ${DOCKER_USERNAME} &>/dev/null || useradd -s /bin/bash -d ${HOME} --uid ${DOCKER_UID} -m ${DOCKER_USERNAME} -G ${DOCKER_GROUPS}
      user: root
  entrypoint: ["/bin/bash", "-c"]
  # The import probe redirects stdout first and then stderr (">/dev/null 2>&1")
  # so the expected ImportError on a fresh environment is actually silenced.
  command:
    - |
      source ${RIDGEBACK_VENV}/bin/activate

      if ! python3 -c "import toil" >/dev/null 2>&1
      then
        pip3 install --upgrade pip &&
        pip3 install --force-reinstall 'setuptools<58.0.0' &&
        pip3 install "cython<3.0.0" wheel &&
        pip3 install "pyyaml==5.4.1" --no-build-isolation &&
        pip3 install --use-pep517 -r ${RIDGEBACK_PATH}/requirements.txt &&
        pip3 install --use-pep517 -r ${RIDGEBACK_PATH}/requirements-toil.txt
      fi

      python3 ${RIDGEBACK_PATH}/manage.py migrate --noinput
      echo "User.objects.filter(username='admin').exists() or User.objects.create_superuser('admin','voyager@mskcc.org','${RIDGEBACK_DB_PASSWORD}')" | python3 ${RIDGEBACK_PATH}/manage.py shell_plus
      python3 ${RIDGEBACK_PATH}/manage.py collectstatic --noinput
      python3 ${RIDGEBACK_PATH}/manage.py runserver 0.0.0.0:${RIDGEBACK_PORT} >> /ridgeback/server/web_server.log 2>&1
  healthcheck:
    test:
      ["CMD-SHELL", "curl -sSf http://localhost:${RIDGEBACK_PORT}/ || exit 1"]
    interval: 30s
    timeout: 10s
    retries: 3
  depends_on:
    ridgeback_postgres:
      condition: service_healthy
      restart: true
    ridgeback_memcached:
      condition: service_healthy
      restart: false
    ridgeback_rabbitmq:
      condition: service_healthy
      restart: false
+ + celery --workdir ${RIDGEBACK_PATH} \ + -A orchestrator beat \ + -l info \ + -f /ridgeback/celery/logs/ridgeback_beat.log \ + --pidfile $$PIDFILE \ + -s /ridgeback/celery/ridgeback.${RIDGEBACK_DEPLOYMENT}.celerybeat-schedule + healthcheck: + test: + [ + "CMD-SHELL", + "ps -p $(pgrep -F /ridgeback/celery/ridgeback.${RIDGEBACK_DEPLOYMENT}.ridgeback_beat.pid)", + ] + interval: 30s + timeout: 10s + retries: 3 + depends_on: + ridgeback_postgres: + condition: service_healthy + restart: true + ridgeback_memcached: + condition: service_healthy + restart: false + ridgeback_rabbitmq: + condition: service_healthy + restart: false + ridgeback: + condition: service_healthy + restart: false + ridgeback_celery_command_queue: + <<: *ridgeback_celery + tmpfs: + - /tmp:size=500M,mode=777 + command: + - | + source ${RIDGEBACK_VENV}/bin/activate + + PIDFILE=/ridgeback/celery/ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_COMMAND_QUEUE}.pid + [ -e $$PIDFILE ] && rm $$PIDFILE + echo 'Running command queue worker...' + + celery --workdir ${RIDGEBACK_PATH} \ + -A orchestrator worker \ + -l info \ + -Q ${RIDGEBACK_COMMAND_QUEUE} \ + -f /ridgeback/celery/logs/${RIDGEBACK_COMMAND_QUEUE}.log \ + --pidfile $$PIDFILE \ + --concurrency=30 \ + -n ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_COMMAND_QUEUE} + healthcheck: + test: ". ${RIDGEBACK_VENV}/bin/activate; celery --workdir ${RIDGEBACK_PATH} -A orchestrator status -d celery@ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_COMMAND_QUEUE} || exit 1" + interval: 30s + timeout: 10s + retries: 3 + ridgeback_celery_action_queue: + <<: *ridgeback_celery + tmpfs: + - /tmp:size=500M,mode=777 + command: + - | + source ${RIDGEBACK_VENV}/bin/activate + + PIDFILE=/ridgeback/celery/ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_ACTION_QUEUE}.pid + [ -e $$PIDFILE ] && rm $$PIDFILE + echo 'Running action queue worker...' 
+ celery --workdir ${RIDGEBACK_PATH} \ + -A orchestrator worker \ + -l info \ + -Q ${RIDGEBACK_ACTION_QUEUE} \ + -f /ridgeback/celery/logs/${RIDGEBACK_ACTION_QUEUE}.log \ + --pidfile $$PIDFILE \ + --concurrency=10 \ + -n ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_ACTION_QUEUE} + healthcheck: + test: ". ${RIDGEBACK_VENV}/bin/activate; celery --workdir ${RIDGEBACK_PATH} -A orchestrator status -d celery@ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_ACTION_QUEUE} || exit 1" + interval: 30s + timeout: 10s + retries: 3 + ridgeback_celery_check_status_queue: + <<: *ridgeback_celery + command: + - | + source ${RIDGEBACK_VENV}/bin/activate + + PIDFILE=/ridgeback/celery/ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_CHECK_STATUS_QUEUE}.pid + [ -e $$PIDFILE ] && rm $$PIDFILE + echo 'Running check status queue worker...' + + celery --workdir ${RIDGEBACK_PATH} \ + -A orchestrator worker \ + -l info \ + -Q ${RIDGEBACK_CHECK_STATUS_QUEUE} \ + -f /ridgeback/celery/logs/${RIDGEBACK_CHECK_STATUS_QUEUE}.log \ + --pidfile $$PIDFILE \ + --concurrency=10 \ + -n ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_CHECK_STATUS_QUEUE} + healthcheck: + test: ". ${RIDGEBACK_VENV}/bin/activate; celery --workdir ${RIDGEBACK_PATH} -A orchestrator status -d celery@ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_CHECK_STATUS_QUEUE} || exit 1" + interval: 30s + timeout: 10s + retries: 3 + ridgeback_celery_submit_job_queue: + <<: *ridgeback_celery + command: + - | + source ${RIDGEBACK_VENV}/bin/activate + + PIDFILE=/ridgeback/celery/ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_SUBMIT_JOB_QUEUE}.pid + [ -e $$PIDFILE ] && rm $$PIDFILE + echo 'Running submit job queue worker...' + + celery --workdir ${RIDGEBACK_PATH} \ + -A orchestrator worker \ + -l info \ + -Q ${RIDGEBACK_SUBMIT_JOB_QUEUE} \ + -f /ridgeback/celery/logs/${RIDGEBACK_SUBMIT_JOB_QUEUE}.log \ + --pidfile $$PIDFILE \ + --concurrency=5 \ + -n ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_SUBMIT_JOB_QUEUE} + healthcheck: + test: ". 
${RIDGEBACK_VENV}/bin/activate; celery --workdir ${RIDGEBACK_PATH} -A orchestrator status -d celery@ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_SUBMIT_JOB_QUEUE} || exit 1" + interval: 30s + timeout: 10s + retries: 3 + ridgeback_celery_set_permission_queue: + <<: *ridgeback_celery + command: + - | + source ${RIDGEBACK_VENV}/bin/activate + + PIDFILE=/ridgeback/celery/ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_SET_PERMISSIONS_QUEUE}.pid + [ -e $$PIDFILE ] && rm $$PIDFILE + echo 'Running set permission queue worker...' + + celery --workdir ${RIDGEBACK_PATH} \ + -A orchestrator worker \ + -l info \ + -Q ${RIDGEBACK_SET_PERMISSIONS_QUEUE} \ + -f /ridgeback/celery/logs/${RIDGEBACK_SET_PERMISSIONS_QUEUE}.log \ + --pidfile $$PIDFILE \ + --concurrency=10 \ + -n ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_SET_PERMISSIONS_QUEUE} + healthcheck: + test: ". ${RIDGEBACK_VENV}/bin/activate; celery --workdir ${RIDGEBACK_PATH} -A orchestrator status -d celery@ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_SET_PERMISSIONS_QUEUE} || exit 1" + interval: 30s + timeout: 10s + retries: 3 + ridgeback_celery_cleanup_queue: + <<: *ridgeback_celery + command: + - | + source ${RIDGEBACK_VENV}/bin/activate + + PIDFILE=/ridgeback/celery/ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_CLEANUP_QUEUE}.pid + [ -e $$PIDFILE ] && rm $$PIDFILE + echo 'Running cleanup queue worker...' + + celery --workdir ${RIDGEBACK_PATH} \ + -A orchestrator worker \ + -l info \ + -Q ${RIDGEBACK_CLEANUP_QUEUE} \ + -f /ridgeback/celery/logs/${RIDGEBACK_CLEANUP_QUEUE}.log \ + --pidfile $$PIDFILE \ + --concurrency=2 \ + -n ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_CLEANUP_QUEUE} + healthcheck: + test: ". 
${RIDGEBACK_VENV}/bin/activate; celery --workdir ${RIDGEBACK_PATH} -A orchestrator status -d celery@ridgeback.${RIDGEBACK_DEPLOYMENT}.${RIDGEBACK_CLEANUP_QUEUE} || exit 1" + interval: 30s + timeout: 10s + retries: 3 + ridgeback_logrotate: + image: mskcc/voyager-compose-utils:1.0.1 + restart: always + user: "${DOCKER_UID}:${DOCKER_GID}" + networks: + - voyager_net + volumes: + - logs_path:/logs + - logrotate_path:/logrotate + entrypoint: ["/bin/sh", "-c"] + command: + - | + mkdir -p /logs/archive + cat > /logrotate/logrotate.conf < /logrotate/logrotate.cron <> /logs/logrotate_cron.log 2>&1 + healthcheck: + test: "find /logs/last_completed_logrotate_cron -type f -mtime -2 | read" + interval: 12h + timeout: 10s + retries: 3 + depends_on: + ridgeback_celery_beat: + condition: service_healthy + restart: false + ridgeback_celery_command_queue: + condition: service_healthy + restart: false + ridgeback_db_backup: + image: mskcc/voyager-compose-utils:1.0.1 + restart: always + user: "${DOCKER_UID}:${DOCKER_GID}" + networks: + - voyager_net + volumes: + - logs_path:/logs + - db_backup_path:/db_backup + environment: + - PGPASSWORD=${RIDGEBACK_DB_PASSWORD} + entrypoint: ["/bin/sh", "-c"] + command: + - | + mkdir -p /db_backup/archive + cat > /db_backup/db_backup.cron <> /logs/db_backup_cron.log 2>&1 + healthcheck: + test: "find /logs/last_completed_db_backup_cron -type f -mtime -2 | read" + interval: 12h + timeout: 10s + retries: 3 + depends_on: + ridgeback_celery_beat: + condition: service_healthy + restart: false + ridgeback_celery_command_queue: + condition: service_healthy + restart: false +volumes: + postgres_path: + driver: local + driver_opts: + type: "none" + o: "bind" + device: "${RIDGEBACK_POSTGRES_PATH}" + logs_path: + driver: local + driver_opts: + type: "none" + o: "bind" + device: "${RIDGEBACK_LOGS_PATH}" + celery_path: + driver: local + driver_opts: + type: "none" + o: "bind" + device: "${RIDGEBACK_CELERY_PATH}" + rabbitmq_path: + driver: local + driver_opts: + 
type: "none" + o: "bind" + device: "${RIDGEBACK_RABITMQ_PATH}" + server_path: + driver: local + driver_opts: + type: "none" + o: "bind" + device: "${RIDGEBACK_SERVER_PATH}" + logrotate_path: + driver: local + driver_opts: + type: "none" + o: "bind" + device: "${RIDGEBACK_LOGROTATE_PATH}" + db_backup_path: + driver: local + driver_opts: + type: "none" + o: "bind" + device: "${RIDGEBACK_DB_BACKUP_PATH}" +networks: + voyager_net: + name: voyager_network_${RIDGEBACK_DEPLOYMENT} + driver: bridge diff --git a/container/Readme.md b/container/Readme.md index beabc523..d05ea173 100644 --- a/container/Readme.md +++ b/container/Readme.md @@ -68,6 +68,13 @@ The default rabbitmq queue should be the same queue set in orchestrator/celery.p | :-------------------------------------- | :----------------- | :------ | | SINGULARITYENV_RIDGEBACK_LSF_STACKLIMIT | stacklimit for LSF | None | +##### SLURM + +| Variable | Description | +| :--------------------------------------- | :--------------------------- | +| SINGULARITYENV_SLURM_BINDIR | The path to the SLURM bin dir | +| SINGULARITYENV_RIDGEBACK_SLURM_PARTITION | Partition for the SLURM jobs | + ##### Celery | Optional Variable | Description | Default | diff --git a/container/celery_services.def b/container/celery_services.def index c5983b46..75577914 100644 --- a/container/celery_services.def +++ b/container/celery_services.def @@ -105,53 +105,50 @@ Includecmd: no echo "ERROR: SINGULARITYENV_RIDGEBACK_CELERY_EVENT_QUEUE_PREFIX not set." 
exit 1 fi + + ### SLURM env variables + if [ -z "$SLURM_BINDIR" ]; then + echo "WARNING environment variable SLURM_BINDIR is not defined, SLURM may not work" + fi ### LSF env variables if [ -z "$LSF_LIBDIR" ]; then - echo "ERROR environment variable LSF_LIBDIR is not defined, LSF will not work" - exit 1 + echo "WARNING environment variable LSF_LIBDIR is not defined, LSF will not work" fi if [ -z "$LSF_SERVERDIR" ]; then - echo "ERROR environment variable LSF_SERVERDIR is not defined, LSF will not work" - exit 1 + echo "WARNING environment variable LSF_SERVERDIR is not defined, LSF will not work" fi if [ -z "$LSF_ENVDIR" ]; then - echo "ERROR environment variable LSF_ENVDIR is not defined, LSF will not work" - exit 1 + echo "WARNING environment variable LSF_ENVDIR is not defined, LSF will not work" fi if [ -z "$LSF_BINDIR" ]; then - echo "ERROR environment variable LSF_BINDIR is not defined, LSF will not work" - exit 1 + echo "WARNING environment variable LSF_BINDIR is not defined, LSF will not work" fi if [ ! -d "$LSF_LIBDIR" ]; then - echo "ERROR $LSF_LIBDIR is not mounted or does not exist" - exit 1 + echo "WARNING $LSF_LIBDIR is not mounted or does not exist" fi if [ ! -d "$LSF_SERVERDIR" ]; then - echo "ERROR $LSF_SERVERDIR is not mounted or does not exist" - exit 1 + echo "WARNING $LSF_SERVERDIR is not mounted or does not exist" fi if [ ! -d "$LSF_ENVDIR" ]; then - echo "ERROR $LSF_ENVDIR is not mounted or does not exist" - exit 1 + echo "WARNING $LSF_ENVDIR is not mounted or does not exist" fi if [ ! -d "$LSF_BINDIR" ]; then - echo "ERROR $LSF_BINDIR is not mounted or does not exist" - exit 1 + echo "WARNING $LSF_BINDIR is not mounted or does not exist" fi RIDGEBACK_VENV_ACTIVATE=$RIDGEBACK_VENV/bin/activate echo "Activating venv $RIGEBACK_VENV" . $RIDGEBACK_VENV_ACTIVATE - export PATH=$LSF_BINDIR:$SINGULARITY_BIN_PATH:$PATH + export PATH=$SLURM_BINDIR:$LSF_BINDIR:$SINGULARITY_BIN_PATH:$PATH echo "Running orchestrator beat..." 
nohup celery --workdir ${RIDGEBACK_PATH} \ diff --git a/container/ridgeback_service.def b/container/ridgeback_service.def index 154c83a2..30f046f8 100644 --- a/container/ridgeback_service.def +++ b/container/ridgeback_service.def @@ -64,25 +64,26 @@ Includecmd: no echo "RIDGEBACK_TOIL set to $RIDGEBACK_TOIL" >> $RIDGEBACK_HOME/logs/boot.log fi + ### SLURM env variables + if [ -z "$SLURM_BINDIR" ]; then + echo "WARNING environment variable SLURM_BINDIR is not defined, SLURM may not work" + fi + ### LSF Parameters to communicate with LSF if [ -z "$LSF_ENVDIR" ]; then - echo "ERROR: SINGULARITYENV_LSF_ENVDIR is not set." >> $RIDGEBACK_HOME/logs/boot.log - exit 1 + echo "WARNING: SINGULARITYENV_LSF_ENVDIR is not set." >> $RIDGEBACK_HOME/logs/boot.log fi if [ -z "$LSF_BINDIR" ]; then - echo "ERROR: SINGULARITYENV_LSF_BINDIR is not set." >> $RIDGEBACK_HOME/logs/boot.log - exit 1 + echo "WARNING: SINGULARITYENV_LSF_BINDIR is not set." >> $RIDGEBACK_HOME/logs/boot.log fi if [ -z "$LSF_LIBDIR" ]; then - echo "ERROR: SINGULARITYENV_LSF_LIBDIR is not set." >> $RIDGEBACK_HOME/logs/boot.log - exit 1 + echo "WARNING: SINGULARITYENV_LSF_LIBDIR is not set." >> $RIDGEBACK_HOME/logs/boot.log fi if [ -z "$LSF_SERVERDIR" ]; then - echo "ERROR: SINGULARITYENV_LSF_SERVERDIR is not set." >> $RIDGEBACK_HOME/logs/boot.log - exit 1 + echo "WARNING: SINGULARITYENV_LSF_SERVERDIR is not set." >> $RIDGEBACK_HOME/logs/boot.log fi python3 ${RIDGEBACK_PATH}/manage.py migrate --noinput diff --git a/docs/compose.md b/docs/compose.md new file mode 100644 index 00000000..6ff29be5 --- /dev/null +++ b/docs/compose.md @@ -0,0 +1,70 @@ +# Ridgeback Docker Compose Overview + +This document explains the `compose.yml` configuration used to run Ridgeback. It covers each service, its purpose, key environment variables, and how the components interact. 
+ +## Service Overview + +| Service | Purpose | +| -------------------------- | ------------------------------------------------------------------------------------------------------------------------- | +| `ridgeback_create_volumes` | Create host directories (postgres, logs, celery, rabbitmq, server, logrotate) with the correct UID/GID. | +| `ridgeback_postgres` | PostgreSQL database instance for Ridgeback. | +| `ridgeback_memcached` | Memcached cache server. | +| `ridgeback_rabbitmq` | RabbitMQ message broker with management UI. | +| `ridgeback` | Django application that hosts the Ridgeback web interface and API. | +| `ridgeback_celery_beat` | Celery beat scheduler that triggers periodic tasks. | +| `ridgeback_celery_*_queue` | Various Celery workers that process different task queues (command, action, status, submit‑job, set‑permission, cleanup). | +| `ridgeback_logrotate` | Periodically rotates application logs. | +| `ridgeback_db_backup` | Schedules regular database backups. | + +--- + +### 1. Database & Cache Services + +| Service | Image | Key Environment Variables | Notes | +| --------------------- | ---------------------------------- | --------------------------------------------------- | -------------------------------------------------------- | +| `ridgeback_postgres` | `postgres:17` | `POSTGRES_USER`, `POSTGRES_PASSWORD`, `POSTGRES_DB` | Exposes port `${RIDGEBACK_DB_PORT}` | +| `ridgeback_memcached` | `mskcc/memcached-nc:1.6.39` | `RIDGEBACK_MEMCACHED_PORT` | Exposes port `${RIDGEBACK_MEMCACHED_PORT}` | +| `ridgeback_rabbitmq` | `rabbitmq:4.0.6-management-alpine` | `RABBITMQ_DEFAULT_USER`, `RABBITMQ_DEFAULT_PASS` | Management UI on `${RIDGEBACK_RABBITMQ_MANAGEMENT_PORT}` | + +All three depend on the volume‑creation service. + +### 2. Celery Workers & Beat + +All workers share the anchor `x-ridgeback_celery` for common settings (image, user, network, env_file, volumes). Each worker overrides the `command` to start its specific queue. 
+ +| Worker | Queue | Concurrency | Command | +| --------------------------------------- | ------------------------------------ | ----------- | -------------------------------------- | +| `ridgeback_celery_beat` | N/A (beat) | 1 | Starts Celery beat with schedule file. | +| `ridgeback_celery_command_queue` | `${RIDGEBACK_COMMAND_QUEUE}` | 30 | Worker for command queue. | +| `ridgeback_celery_action_queue` | `${RIDGEBACK_ACTION_QUEUE}` | 10 | Worker for action queue. | +| `ridgeback_celery_check_status_queue` | `${RIDGEBACK_CHECK_STATUS_QUEUE}` | 10 | Worker for status checks. | +| `ridgeback_celery_submit_job_queue` | `${RIDGEBACK_SUBMIT_JOB_QUEUE}` | 5 | Worker for job submission. | +| `ridgeback_celery_set_permission_queue` | `${RIDGEBACK_SET_PERMISSIONS_QUEUE}` | 10 | Worker for permission setting. | +| `ridgeback_celery_cleanup_queue` | `${RIDGEBACK_CLEANUP_QUEUE}` | 2 | Worker for cleanup tasks. | + +All depend on PostgreSQL, Memcached, RabbitMQ, and `ridgeback_celery_beat` for health. + +### 3. Auxiliary Services + +| Service | Image | Purpose | +| --------------------- | ----------------------------------- | ------------------------------------------- | +| `ridgeback_logrotate` | `mskcc/voyager-compose-utils:1.0.1` | Rotates logs weekly | +| `ridgeback_db_backup` | Same image | Schedules database backups using `pg_dump`. | + +Both depend on the beat and command‑queue workers. + +### 4. Network + +- **`voyager_net`** – Bridge network shared by all services and also [Beagle](https://github.com/mskcc/beagle) + +## Key Environment Variables + +| Variable | Description | +| -------------------------------------------------------------------------- | ----------------------------------------------- | +| `RIDGEBACK_VERSION` | Docker image tag for Ridgeback. | +| `DOCKER_UID`, `DOCKER_GID` | UID/GID for container processes. | +| `RIDGEBACK_DB_USERNAME`, `_PASSWORD`, `_NAME` | PostgreSQL credentials. 
| +| `RIDGEBACK_RABBITMQ_USERNAME`, `_PASSWORD` | RabbitMQ credentials. | +| `CLUSTER_FILESYSTEM_MOUNT`, `CLUSTER_SCRATCH_MOUNT`, `CLUSTER_ADMIN_MOUNT` | Bind mounts for cluster file system access. | +| `CLUSTER_CODE_PATH` | Path for the ridgeback code base on the cluster | +| `LOGROTATE_*`, `DB_BACKUP_*` | Log rotation and backup scheduling options. | diff --git a/orchestrator/commands/command.py b/orchestrator/commands/command.py index c621c57b..f54cc6cd 100644 --- a/orchestrator/commands/command.py +++ b/orchestrator/commands/command.py @@ -3,7 +3,7 @@ class CommandType(IntEnum): - CHECK_STATUS_ON_LSF = 0 + CHECK_STATUS_ON_BATCH_SYSTEM = 0 CHECK_COMMAND_LINE_STATUS = 1 PREPARE = 2 SUBMIT = 3 diff --git a/orchestrator/migrations/0020_auto_20250402_0942.py b/orchestrator/migrations/0020_auto_20250402_0942.py new file mode 100644 index 00000000..9243e6b0 --- /dev/null +++ b/orchestrator/migrations/0020_auto_20250402_0942.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.28 on 2025-04-02 09:42 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("orchestrator", "0019_job_log_prefix"), + ] + + operations = [ + migrations.AlterField( + model_name="commandlinetooljob", + name="job_id", + field=models.CharField(max_length=50), + ), + ] diff --git a/orchestrator/migrations/0021_job_user.py b/orchestrator/migrations/0021_job_user.py new file mode 100644 index 00000000..94373434 --- /dev/null +++ b/orchestrator/migrations/0021_job_user.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.28 on 2025-07-30 18:09 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("orchestrator", "0020_auto_20250402_0942"), + ] + + operations = [ + migrations.AddField( + model_name="job", + name="user", + field=models.CharField(default="kumarn1", max_length=100), + ), + ] diff --git a/orchestrator/migrations/0022_auto_20250805_1406.py 
b/orchestrator/migrations/0022_auto_20250805_1406.py new file mode 100644 index 00000000..bd432693 --- /dev/null +++ b/orchestrator/migrations/0022_auto_20250805_1406.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.28 on 2025-08-05 18:06 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("orchestrator", "0021_job_user"), + ] + + operations = [ + migrations.AlterField( + model_name="job", + name="output_uid", + field=models.IntegerField(default=6146), + ), + ] diff --git a/orchestrator/migrations/0023_auto_20251210_1043.py b/orchestrator/migrations/0023_auto_20251210_1043.py new file mode 100644 index 00000000..d892cdc5 --- /dev/null +++ b/orchestrator/migrations/0023_auto_20251210_1043.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.28 on 2025-12-10 15:43 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("orchestrator", "0022_auto_20250805_1406"), + ] + + operations = [ + migrations.AlterField( + model_name="job", + name="root_permission", + field=models.CharField(default="750", max_length=4), + ), + ] diff --git a/orchestrator/models.py b/orchestrator/models.py index bf11138c..6f2f57c8 100755 --- a/orchestrator/models.py +++ b/orchestrator/models.py @@ -7,6 +7,7 @@ from django.utils.dateparse import parse_datetime from django.utils.timezone import is_aware, make_aware, now from django.conf import settings +from getpass import getuser logger = logging.getLogger(__name__) @@ -22,6 +23,15 @@ def message_default(): return message_default_dict +def get_default_for_job(model_field): + if model_field == "root_permission": + return settings.OUTPUT_DEFAULT_PERMISSION + elif model_field == "output_uid": + return settings.OUTPUT_DEFAULT_UID + elif model_field == "output_gid": + return settings.OUTPUT_DEFAULT_GID + + class Status(IntEnum): CREATED = 0 PREPARED = 1 @@ -166,9 +176,10 @@ class Job(BaseModel): external_id = models.CharField(max_length=50, 
null=True, blank=True) base_dir = models.CharField(max_length=1000) root_dir = models.CharField(max_length=1000) - root_permission = models.CharField(default=settings.OUTPUT_DEFAULT_PERMISSION, max_length=3) - output_uid = models.IntegerField(default=settings.OUTPUT_DEFAULT_UID, editable=True) - output_gid = models.IntegerField(default=settings.OUTPUT_DEFAULT_GID, editable=True) + root_permission = models.CharField(default=get_default_for_job("root_permission"), max_length=4) + user = models.CharField(default=getuser(), max_length=100) + output_uid = models.IntegerField(default=get_default_for_job("output_gid"), editable=True) + output_gid = models.IntegerField(default=get_default_for_job("output_gid"), editable=True) job_store_location = models.CharField(max_length=1000, null=True, blank=True) resume_job_store_location = models.CharField(max_length=1000, null=True, blank=True) working_dir = models.CharField(max_length=1000, null=True, blank=True) @@ -193,13 +204,15 @@ class Job(BaseModel): metadata = JSONField(blank=True, null=True, default=dict) def job_prepared(self, job_store_dir, job_work_dir, job_output_dir, log_path, log_prefix): + from batch_systems.batch_system import get_batch_system + self.status = Status.PREPARED self.job_store_location = job_store_dir self.working_dir = job_work_dir self.output_directory = job_output_dir self.log_dir = log_path self.log_prefix = log_prefix - self.message["log"] = os.path.join(job_work_dir, "lsf.log") + self.message["log"] = os.path.join(job_work_dir, get_batch_system().logfileName) self.save( update_fields=[ "status", @@ -266,7 +279,7 @@ class CommandLineToolJob(BaseModel): submitted = models.DateTimeField(blank=True, null=True) finished = models.DateTimeField(blank=True, null=True) job_name = models.CharField(max_length=100) - job_id = models.CharField(max_length=20) + job_id = models.CharField(max_length=50) details = JSONField(blank=True, null=True) def get_aware_datetime(self, date_str): diff --git 
a/orchestrator/tasks.py b/orchestrator/tasks.py index 6d4b1efd..8cc0280b 100644 --- a/orchestrator/tasks.py +++ b/orchestrator/tasks.py @@ -3,6 +3,8 @@ import shutil import logging import tempfile +from pathlib import Path +from getpass import getuser from datetime import timedelta from celery import shared_task from django.conf import settings @@ -13,12 +15,13 @@ from submitter.factory import JobSubmitterFactory from orchestrator.scheduler import Scheduler from orchestrator.commands import Command, CommandType -from batch_systems.lsf_client.lsf_client import LSFClient +from batch_systems.batch_system import get_batch_system from orchestrator.exceptions import ( RetryException, StopException, FetchStatusException, ) +from submitter.userswitcher import userswitch logger = logging.getLogger(__name__) @@ -26,14 +29,13 @@ def get_job_info_path(job_id): job = Job.objects.get(id=job_id) - work_dir = os.path.join( - settings.PIPELINE_CONFIG.get(job.metadata["pipeline_name"], "NA")["WORK_DIR_ROOT"], str(job_id) - ) + work_dir = job.working_dir job_info_path = os.path.join(work_dir, ".run.info") return job_info_path -def save_job_info(job_id, external_id, job_store_location, working_dir, output_directory, metadata={}): +@userswitch +def save_job_info(job, external_id, job_store_location, working_dir, output_directory, metadata={}): if os.path.exists(working_dir): job_info = { "external_id": external_id, @@ -42,7 +44,7 @@ def save_job_info(job_id, external_id, job_store_location, working_dir, output_d "output_directory": output_directory, } job_info.update(metadata) - job_info_path = get_job_info_path(job_id) + job_info_path = get_job_info_path(job.id) with open(job_info_path, "w") as job_info_file: json.dump({"meta": "run_info"}, job_info_file) job_info_file.write("\n") @@ -54,8 +56,7 @@ def save_job_info(job_id, external_id, job_store_location, working_dir, output_d def suspend_job(job): if Status(job.status).transition(Status.SUSPENDED): - lsf_client = LSFClient() - 
job_suspended = lsf_client.suspend(str(job.id)) + job_suspended = get_batch_system(job.user).suspend(str(job.id)) if not job_suspended: raise RetryException("Failed to suspend job: %s" % str(job.id)) job.update_status(Status.SUSPENDED) @@ -74,9 +75,9 @@ def resume_job(job): log_dir=job.log_dir, log_prefix=job.log_prefix, app_name=job.metadata["pipeline_name"], + user=job.user, ) - lsf_client = LSFClient() - job_resumed = lsf_client.resume(submitter.job_id) + job_resumed = get_batch_system(job.user).resume(submitter.job_id) if not job_resumed: raise RetryException("Failed to resume job: %s" % str(job.id)) job.update_status(Status.RUNNING) @@ -107,7 +108,7 @@ def process_jobs(): for job_id in status_jobs: # Send CHECK_STATUS commands for Jobs - command_processor.delay(Command(CommandType.CHECK_STATUS_ON_LSF, str(job_id)).to_dict()) + command_processor.delay(Command(CommandType.CHECK_STATUS_ON_BATCH_SYSTEM, str(job_id)).to_dict()) jobs = Scheduler.get_jobs_to_submit() @@ -134,10 +135,10 @@ def command_processor(self, command_dict): logger.debug("PREPARE command for job %s" % command.job_id) prepare_job(job) elif command.command_type == CommandType.SUBMIT: - logger.debug("SUBMIT command for job %s" % command.job_id) - submit_job_to_lsf(job, self.request.retries) - elif command.command_type == CommandType.CHECK_STATUS_ON_LSF: - logger.debug("CHECK_STATUS_ON_LSF command for job %s" % command.job_id) + logger.info("SUBMIT command for job %s" % command.job_id) + submit_job_to_batch_system(job, self.request.retries) + elif command.command_type == CommandType.CHECK_STATUS_ON_BATCH_SYSTEM: + logger.info("CHECK_STATUS_ON_BATCH_SYSTEM command for job %s" % command.job_id) check_job_status(job) elif command.command_type == CommandType.CHECK_COMMAND_LINE_STATUS: logger.debug("CHECK_COMMAND_LINE_STATUS command for job %s" % command.job_id) @@ -166,13 +167,16 @@ def command_processor(self, command_dict): logger.error(f"Command {str(command.command_type)} failed. Not retrying. 
Exception {str(e)}") if command.command_type == CommandType.SUBMIT: reset_job_to_created(command.job_id) + elif command.command_type == CommandType.PREPARE: + job = Job.objects.get(id=command.job_id) + _fail(job, str(e)) def reset_job_to_created(job_id): job = Job.objects.get(id=job_id) - clean_directory(job.job_store_location) - clean_directory(job.working_dir) - clean_directory(job.log_dir) + clean_directory(job, job.job_store_location) + clean_directory(job, job.working_dir) + clean_directory(job, job.log_dir) job.job_store_location = "" job.working_dir = "" job.status = Status.CREATED @@ -205,19 +209,21 @@ def prepare_job(job): log_dir=job.log_dir, log_prefix=job.log_prefix, app_name=job.metadata["pipeline_name"], + user=job.user, ) try: job_store_dir, job_work_dir, job_output_dir, log_dir, log_prefix = submitter.prepare_to_submit() except Exception as e: - raise RetryException(f"Failed to fetch status for job {str(job.id)} {e}") + if job.resume_job_store_location and not os.path.exists(job.resume_job_store_location): + raise StopException(f"Stopping job preparation, {e}") + raise RetryException(f"Failed to prepare the job {str(job.id)} {e}") else: job.job_prepared(job_store_dir, job_work_dir, job_output_dir, log_dir, log_prefix) -def submit_job_to_lsf(job, retries=0): +def submit_job_to_batch_system(job, retries=0): if Status(job.status).transition(Status.SUBMITTED): - logger.info(f"Submitting job {str(job.id)} to lsf. Try {retries}") - lsf_client = LSFClient() + logger.info(f"Submitting job {str(job.id)} to {get_batch_system().name}. 
Try {retries}") submitter = JobSubmitterFactory.factory( job.type, str(job.id), @@ -231,10 +237,11 @@ def submit_job_to_lsf(job, retries=0): log_dir=job.log_dir, log_prefix=job.log_prefix, app_name=job.metadata["pipeline_name"], + user=job.user, ) try: - command_line, args, log_path, job_id, env = submitter.get_submit_command() - external_job_id = lsf_client.submit(command_line, args, log_path, job_id, env) + command_line, args, log_path, job_id, partition, env = submitter.get_submit_command() + external_job_id = get_batch_system(job.user).submit(command_line, args, log_path, job_id, partition, env) except Exception as f: if retries < 5: logger.exception(str(f)) @@ -243,11 +250,11 @@ def submit_job_to_lsf(job, retries=0): logger.exception(str(f)) raise StopException(f"Failed to submit job to scheduler {str(job.id)} no more retries") else: - logger.info(f"Job {str(job.id)} submitted to lsf with id: {external_job_id}") + logger.info(f"Job {str(job.id)} submitted to {get_batch_system().name} with id: {external_job_id}") job.submitted_to_scheduler(external_job_id) # Keeping this for debugging purposes save_job_info( - str(job.id), + job, external_job_id, submitter.job_store_dir, submitter.job_work_dir, @@ -307,26 +314,25 @@ def check_job_status(job): ): return try: - lsf_client = LSFClient() - lsf_status, lsf_message = lsf_client.status(str(job.external_id)) + batch_system_status, batch_system_message = get_batch_system(job.user).status(str(job.external_id)) except FetchStatusException as e: - # If failed to check status on LSF retry + # If failed to check status on batch system retry logger.exception(e) raise RetryException("Failed to fetch status for job %s" % (str(job.id))) - if Status(job.status).transition(lsf_status): - if lsf_status in ( + if Status(job.status).transition(batch_system_status): + if batch_system_status in ( Status.SUBMITTED, Status.PENDING, Status.RUNNING, Status.UNKNOWN, ): - job.update_status(lsf_status) + job.update_status(batch_system_status) 
- # if lsf_status in (Status.RUNNING,): - # command_processor.delay(Command(CommandType.CHECK_HANGING, str(job.id)).to_dict()) - # command_processor.delay(Command(CommandType.CHECK_COMMAND_LINE_STATUS, str(job.id)).to_dict()) + if batch_system_status in (Status.RUNNING,): + command_processor.delay(Command(CommandType.CHECK_HANGING, str(job.id)).to_dict()) + command_processor.delay(Command(CommandType.CHECK_COMMAND_LINE_STATUS, str(job.id)).to_dict()) - elif lsf_status in (Status.COMPLETED,): + elif batch_system_status in (Status.COMPLETED,): submitter = JobSubmitterFactory.factory( job.type, str(job.id), @@ -337,6 +343,7 @@ def check_job_status(job): log_dir=job.log_dir, log_prefix=job.log_prefix, app_name=job.metadata["pipeline_name"], + user=job.user, ) outputs, error_message = submitter.get_outputs() if outputs: @@ -345,11 +352,12 @@ def check_job_status(job): else: _fail(job, error_message) command_processor.delay(Command(CommandType.CHECK_COMMAND_LINE_STATUS, str(job.id)).to_dict()) + elif batch_system_status in (Status.FAILED,): + _fail(job, batch_system_message) - elif lsf_status in (Status.FAILED,): - _fail(job, lsf_message) + command_processor.delay(Command(CommandType.CHECK_COMMAND_LINE_STATUS, str(job.id)).to_dict()) else: - raise StopException("Invalid transition %s to %s" % (Status(job.status).name, Status(lsf_status).name)) + raise StopException("Invalid transition %s to %s" % (Status(job.status).name, Status(batch_system_status).name)) def _add_alert(job, alert_obj): @@ -467,19 +475,17 @@ def terminate_job(job): Status.SUSPENDED, Status.UNKNOWN, ): - lsf_client = LSFClient() - job_killed = lsf_client.terminate(str(job.id)) + job_killed = get_batch_system(job.user).terminate(str(job.id)) if not job_killed: raise RetryException("Failed to TERMINATE job %s" % str(job.id)) job.terminate() +@userswitch def set_permission(job): failed_to_set = None dirs = job.root_dir.replace(job.base_dir, "").split("/") permission_str = job.root_permission - uid = 
job.output_uid - gid = job.output_gid permissions_dir = job.base_dir for d in dirs: failed_to_set = False @@ -489,22 +495,23 @@ def set_permission(job): except Exception: raise TypeError("Could not convert %s to permission octal" % str(permission_str)) try: - os.chmod(permissions_dir, permission_octal) + if Path(permissions_dir).owner() == getuser(): + os.chmod(permissions_dir, permission_octal) + else: + logger.debug(f"Skipping permission change for {permissions_dir} as it is not owned by {getuser()}") for root, dirs, files in os.walk(permissions_dir): for single_dir in dirs: if oct(os.lstat(os.path.join(root, single_dir)).st_mode)[-3:] != permission_octal: logger.debug(f"Setting permissions for {os.path.join(root, single_dir)}") path = os.path.join(root, single_dir) os.chmod(path, permission_octal) - os.chown(path, uid=uid, gid=gid) for single_file in files: if oct(os.lstat(os.path.join(root, single_file)).st_mode)[-3:] != permission_octal: path = os.path.join(root, single_file) logger.debug(f"Setting permissions for {path}") os.chmod(path, permission_octal) - os.chown(path, uid=uid, gid=gid) except Exception: - logger.error(f"Failed to set permissions for directory {permissions_dir}") + logger.exception(f"Failed to set permissions for directory {permissions_dir}") failed_to_set = True continue else: @@ -525,17 +532,21 @@ def full_cleanup_jobs(self): @shared_task(bind=True) def cleanup_completed_jobs(self): - cleanup_jobs(Status.COMPLETED, settings.CLEANUP_COMPLETED_JOBS, exclude=["input.json", "lsf.log"]) + cleanup_jobs( + Status.COMPLETED, settings.CLEANUP_COMPLETED_JOBS, exclude=["input.json", get_batch_system().logfileName] + ) @shared_task(bind=True) def cleanup_failed_jobs(self): - cleanup_jobs(Status.FAILED, settings.CLEANUP_FAILED_JOBS, exclude=["input.json", "lsf.log"]) + cleanup_jobs(Status.FAILED, settings.CLEANUP_FAILED_JOBS, exclude=["input.json", get_batch_system().logfileName]) @shared_task(bind=True) def cleanup_terminated_jobs(self): - 
cleanup_jobs(Status.TERMINATED, settings.CLEANUP_TERMINATED_JOBS, exclude=["input.json", "lsf.log"]) + cleanup_jobs( + Status.TERMINATED, settings.CLEANUP_TERMINATED_JOBS, exclude=["input.json", get_batch_system().logfileName] + ) def cleanup_jobs(status, time_delta, exclude=[]): @@ -559,15 +570,18 @@ def cleanup_folders(self, job_id, exclude, job_store=True, work_dir=True): logger.error("Job with id:%s not found" % job_id) return if job_store: - if clean_directory(job.job_store_location): + if clean_directory(job, job.job_store_location): job.job_store_clean_up = now() if work_dir: - if clean_directory(job.working_dir, exclude=exclude): + if clean_directory(job, job.working_dir, exclude=exclude): job.working_dir_clean_up = now() job.save() -def clean_directory(path, exclude=[]): +@userswitch +def clean_directory(job, path, exclude=[]): + if not path: + return False with tempfile.TemporaryDirectory() as tmpdirname: for f in exclude: src = os.path.join(path, f) @@ -616,6 +630,7 @@ def update_command_line_jobs(command_line_jobs, root): ) +@userswitch def check_status_of_command_line_jobs(job): submitter = JobSubmitterFactory.factory( job.type, @@ -627,6 +642,7 @@ def check_status_of_command_line_jobs(job): log_dir=job.log_dir, log_prefix=job.log_prefix, app_name=job.metadata["pipeline_name"], + user=job.user, ) track_cache_str = job.track_cache command_line_status = submitter.get_commandline_status(track_cache_str) diff --git a/orchestrator/tests/test_tasks.py b/orchestrator/tests/test_tasks.py index 4a91ef2c..226e863a 100644 --- a/orchestrator/tests/test_tasks.py +++ b/orchestrator/tests/test_tasks.py @@ -410,7 +410,7 @@ def test_process_jobs(self, check_leader_not_running, command_processor, add, de process_jobs() calls = [ - call(Command(CommandType.CHECK_STATUS_ON_LSF, str(job_pending_1.id)).to_dict()), + call(Command(CommandType.CHECK_STATUS_ON_BATCH_SYSTEM, str(job_pending_1.id)).to_dict()), call(Command(CommandType.PREPARE, str(job_created_1.id)).to_dict()), ] 
@@ -441,7 +441,7 @@ def _raise_retryable_exception(job_id): delete.return_value = True status.side_effect = _raise_retryable_exception with self.assertRaises(RetryException): - command_processor(Command(CommandType.CHECK_STATUS_ON_LSF, str(job_pending_1.id)).to_dict()) + command_processor(Command(CommandType.CHECK_STATUS_ON_BATCH_SYSTEM, str(job_pending_1.id)).to_dict()) @patch("django.core.cache.cache.delete") @patch("django.core.cache.cache.add") @@ -554,12 +554,13 @@ def test_get_job_info_path(self): } }, external_id="ext_id", + working_dir="/toil/work/dir/root", status=Status.SUBMITTED, metadata={"pipeline_name": "TEST"}, ) with self.settings(PIPELINE_CONFIG=PIPELINE_CONFIG): res = get_job_info_path(str(job.id)) - self.assertEqual(res, f"/toil/work/dir/root/{str(job.id)}/.run.info") + self.assertEqual(res, f"{str(job.working_dir)}/.run.info") def test_permission(self): with tempfile.TemporaryDirectory() as temp_path: diff --git a/requirements-toil.txt b/requirements-toil.txt index aeca5b77..8d4a12af 100644 --- a/requirements-toil.txt +++ b/requirements-toil.txt @@ -1,9 +1 @@ -schema-salad<8,>=7 -rdflib<4.3.0,>=4.2.2 -typing-extensions>=4.1.0 -mock==4.0.2 -pytest==4.3.1 -pytest-cov==2.6.1 -pytest-timeout==1.3.3 -git+https://github.com/mskcc/toil.git@5.4.3#egg=toil[cwl] -cwltest +toil[cwl] @ git+https://github.com/DataBiosphere/toil.git@releases/8.0.0 diff --git a/requirements.txt b/requirements.txt index 4cc67d1e..083cd3b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ django-pymemcache==1.0.0 coverage==5.5 flake8==3.9.0 black==22.3.0 -elastic-apm==6.1.3 ddtrace==1.7.3 django-extensions==3.1.1 urllib3==1.26.6 +ipython==8.37.0 diff --git a/ridgeback/__init__.py b/ridgeback/__init__.py index c53ad1ad..8c0d5d5b 100644 --- a/ridgeback/__init__.py +++ b/ridgeback/__init__.py @@ -1 +1 @@ -__version__ = "1.40.0" +__version__ = "2.0.0" diff --git a/ridgeback/settings.py b/ridgeback/settings.py index 82fbe5b5..666e1ae5 100644 --- 
a/ridgeback/settings.py +++ b/ridgeback/settings.py @@ -39,17 +39,6 @@ SESSION_COOKIE_NAME = os.environ.get("RIDGEBACK_COOKIE_SESSION_NAME", "ridgeback_prod_session") - -ELASTIC_APM = { - # Set the required service name. Allowed characters: - # a-z, A-Z, 0-9, -, _, and space - "SERVICE_NAME": "ridgeback", - # Set the custom APM Server URL (default: http://localhost:8200) - "SERVER_URL": "http://bic-dockerapp01.mskcc.org:8200/", - # Set the service environment - "ENVIRONMENT": ENVIRONMENT, -} - # Application definition INSTALLED_APPS = [ @@ -63,12 +52,10 @@ "orchestrator.apps.OrchestratorConfig", "rest_framework", "drf_yasg", - "elasticapm", "django_extensions", ] MIDDLEWARE = [ - "elasticapm.contrib.django.middleware.TracingMiddleware", "django.middleware.security.SecurityMiddleware", "django.contrib.sessions.middleware.SessionMiddleware", "django.middleware.common.CommonMiddleware", @@ -118,7 +105,7 @@ "PORT": DB_PORT, } } - +MEMCACHED_HOST = os.environ.get("RIDGEBACK_MEMCACHED_HOST", "127.0.0.1") MEMCACHED_PORT = os.environ.get("RIDGEBACK_MEMCACHED_PORT", 11211) if ENVIRONMENT == "dev": @@ -132,7 +119,7 @@ CACHES = { "default": { "BACKEND": "djpymemcache.backend.PyMemcacheCache", - "LOCATION": "127.0.0.1:%s" % MEMCACHED_PORT, + "LOCATION": "%s:%s" % (MEMCACHED_HOST, MEMCACHED_PORT), "OPTIONS": { # see https://pymemcache.readthedocs.io/en/latest/apidoc/pymemcache.client.base.html "default_noreply": False @@ -203,6 +190,7 @@ RABBITMQ_USERNAME = os.environ.get("RIDGEBACK_RABBITMQ_USERNAME", "guest") RABBITMQ_PASSWORD = os.environ.get("RIDGEBACK_RABBITMQ_PASSWORD", "guest") RABBITMQ_URL = os.environ.get("RIDGEBACK_RABBITMQ_URL", "localhost") +ENABLE_USER_SWITCH = True if os.environ.get("RIDGEBACK_ENABLE_USER_SWITCH", "true") != "false" else False CELERY_BROKER_URL = os.environ.get( "CELERY_BROKER_URL", @@ -254,42 +242,62 @@ "JOB_STORE_ROOT": os.environ["ARGOS_JOB_STORE_ROOT"], "WORK_DIR_ROOT": os.environ["ARGOS_WORK_DIR_ROOT"], "TMP_DIR_ROOT": 
os.environ["ARGOS_TMP_DIR_ROOT"], + "PARTITION": os.environ.get("ARGOS_PARTITION", None), }, "TEMPO": { "JOB_STORE_ROOT": os.environ["TEMPO_JOB_STORE_ROOT"], "WORK_DIR_ROOT": os.environ["TEMPO_WORK_DIR_ROOT"], "TMP_DIR_ROOT": os.environ["TEMPO_TMP_DIR_ROOT"], + "PARTITION": os.environ.get("TEMPO_PARTITION", None), }, "ACCESS": { "JOB_STORE_ROOT": os.environ["ACCESS_JOB_STORE_ROOT"], "WORK_DIR_ROOT": os.environ["ACCESS_WORK_DIR_ROOT"], "TMP_DIR_ROOT": os.environ["ACCESS_TMP_DIR_ROOT"], + "PARTITION": os.environ.get("ACCESS_PARTITION", None), }, "CMO-CH": { "JOB_STORE_ROOT": os.environ["CMO_CH_JOB_STORE_ROOT"], "WORK_DIR_ROOT": os.environ["CMO_CH_WORK_DIR_ROOT"], "TMP_DIR_ROOT": os.environ["CMO_CH_TMP_DIR_ROOT"], + "PARTITION": os.environ.get("CMO_CH_PARTITION", None), }, "ACCESS_HEME": { "JOB_STORE_ROOT": os.environ["ACCESS_HEME_JOB_STORE_ROOT"], "WORK_DIR_ROOT": os.environ["ACCESS_HEME_WORK_DIR_ROOT"], "TMP_DIR_ROOT": os.environ["ACCESS_HEME_TMP_DIR_ROOT"], + "PARTITION": os.environ.get("ACCESS_HEME_PARTITION", None), + }, + "MICROBIOME": { + "JOB_STORE_ROOT": os.environ["MICROBIOME_JOB_STORE_ROOT"], + "WORK_DIR_ROOT": os.environ["MICROBIOME_WORK_DIR_ROOT"], + "TMP_DIR_ROOT": os.environ["MICROBIOME_TMP_DIR_ROOT"], + "PARTITION": os.environ.get("MICROBIOME_PARTITION", None), }, "NA": { "JOB_STORE_ROOT": os.environ["DEFAULT_JOB_STORE_ROOT"], "WORK_DIR_ROOT": os.environ["DEFAULT_WORK_DIR_ROOT"], "TMP_DIR_ROOT": os.environ["DEFAULT_TMP_DIR_ROOT"], + "PARTITION": os.environ.get("DEFAULT_PARTITION", None), }, } +# Batch System settings -# Toil settings +BATCH_SYSTEM = os.environ.get("RIDGEBACK_BATCH_SYSTEM", "LSF") + +# LSF settings LSF_WALLTIME = os.environ["RIDGEBACK_LSF_WALLTIME"] LSF_SLA = os.environ.get("RIDGEBACK_LSF_SLA", None) + +# Toil settings + CWLTOIL = os.environ.get("RIDGEBACK_TOIL", "toil-cwl-runner") TOIL_STATE_POLLING_WAIT = os.environ.get("TOIL_STATE_POLLING_WAIT", 60) -TOIL_MAX_CORES = os.environ.get("RIDGEBACK_TOIL_MAX_CORES", "24") +TOIL_MAX_CORES = 
os.environ.get("RIDGEBACK_TOIL_MAX_CORES", "40") TOIL_DEFAULT_MEMORY = os.environ.get("RIDGEBACK_TOIL_DEFAULT_MEMORY", "8G") +SINGLE_MACHINE_CORES = os.environ.get("RIDGEBACK_SINGLE_MACHINE_CORES", 16) +SINGLE_MACHINE_MEMORY = os.environ.get("RIDGEBACK_SINGLE_MACHINE_MEMORY", 25) # Nextflow settings @@ -319,3 +327,8 @@ # ACCESS LEGACY INFO ACCESS_LEGACY_APP = os.environ.get("ACCESS_LEGACY_APP", "access-pipeline") +ACCESS_LEGACY_CONDA_ENV = os.environ.get( + "ACCESS_LEGACY_CONDA_ENV", "/usersoftware/core005/access/production/V1/micromamba/envs/ACCESS-voyager/bin" +) + +SHELL_PLUS = "ipython" diff --git a/setup.cfg b/setup.cfg index 9834cf7d..3aad9c22 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [flake8] exclude = .git,*migrations* max-line-length = 120 -ignore = E203 +ignore = E203,W503 per-file-ignores = __init__.py:F401 diff --git a/submitter/factory.py b/submitter/factory.py index 019696a3..1209f591 100644 --- a/submitter/factory.py +++ b/submitter/factory.py @@ -1,5 +1,6 @@ from orchestrator.models import PipelineType from submitter import NextflowJobSubmitter, ToilJobSubmitter +from getpass import getuser class JobSubmitterFactory(object): @@ -17,6 +18,7 @@ def factory( log_dir=None, log_prefix="", app_name="NA", + user=getuser(), ): if type == PipelineType.CWL: return ToilJobSubmitter( @@ -31,6 +33,7 @@ def factory( log_dir, log_prefix, app_name, + user=user, ) elif type == PipelineType.NEXTFLOW: return NextflowJobSubmitter( @@ -45,4 +48,5 @@ def factory( log_dir, log_prefix, app_name, + user=user, ) diff --git a/submitter/jobsubmitter.py b/submitter/jobsubmitter.py index 4e56bef7..21d54440 100644 --- a/submitter/jobsubmitter.py +++ b/submitter/jobsubmitter.py @@ -1,5 +1,10 @@ +import os +import shutil from submitter.app import App from django.conf import settings +from submitter.userswitcher import userswitch +from getpass import getuser +from batch_systems.batch_system import get_batch_system class JobSubmitter(object): @@ -8,6 +13,8 @@ def 
__init__( job_id, app, inputs, + root_dir, + resume_jobstore, walltime, tool_walltime, memlimit, @@ -15,6 +22,7 @@ def __init__( log_prefix="", app_name="NA", root_permissions=settings.OUTPUT_DEFAULT_PERMISSION, + user=getuser(), ): self.app = App.factory(app) self.job_id = job_id @@ -26,6 +34,22 @@ def __init__( self.log_prefix = log_prefix self.app_name = app_name self.root_permissions = root_permissions + self.user = user + self.pipeline_config = None + self.partition_isolated = None + pipeline_config = settings.PIPELINE_CONFIG.get(self.app_name) + if not pipeline_config: + pipeline_config = settings.PIPELINE_CONFIG["NA"] + self.resume_jobstore = resume_jobstore + if resume_jobstore: + self.job_store_dir = resume_jobstore + else: + self.job_store_dir = os.path.join(pipeline_config["JOB_STORE_ROOT"], self.job_id) + self.partition = pipeline_config["PARTITION"] + self.job_work_dir = os.path.join(pipeline_config["WORK_DIR_ROOT"], self.job_id) + self.job_outputs_dir = root_dir + self.job_tmp_dir = os.path.join(pipeline_config["TMP_DIR_ROOT"], self.job_id) + self.batch_system = get_batch_system() def prepare_to_submit(self): """ @@ -35,7 +59,7 @@ def prepare_to_submit(self): def get_submit_command(self): """ - return: command_line, args, log_path, job_id, env_map + return: command_line, args, log_path, job_id, partition, env_map """ pass @@ -55,12 +79,34 @@ def _dump_app_inputs(self): :return: app location, inputs, location """ + @userswitch def _prepare_directories(self): """ Prepare execution directories :return: """ + if not os.path.exists(self.job_work_dir): + os.mkdir(self.job_work_dir) + if self.user: + shutil.chown(self.job_work_dir, user=self.user) + + if os.path.exists(self.job_store_dir) and not self.resume_jobstore: + + shutil.rmtree(self.job_store_dir) + + if self.resume_jobstore: + if not os.path.exists(self.resume_jobstore): + raise Exception("The jobstore indicated to be resumed could not be found") + + if not os.path.exists(self.job_tmp_dir): + 
os.mkdir(self.job_tmp_dir) + + if self.log_dir: + if not os.path.exists(self.log_dir): + mode_int = int(self.root_permissions, 8) + os.makedirs(self.log_dir, mode=mode_int, exist_ok=True) + def _job_args(self): pass diff --git a/submitter/nextflow_submitter/nextflow_jobsubmitter.py b/submitter/nextflow_submitter/nextflow_jobsubmitter.py index 91c0d996..73a112ce 100755 --- a/submitter/nextflow_submitter/nextflow_jobsubmitter.py +++ b/submitter/nextflow_submitter/nextflow_jobsubmitter.py @@ -1,9 +1,10 @@ import os -import shutil import hashlib import json from django.conf import settings from submitter import JobSubmitter +from submitter.userswitcher import userswitch +from getpass import getuser class NextflowJobSubmitter(JobSubmitter): @@ -21,6 +22,7 @@ def __init__( log_prefix="", app_name="NA", root_permissions=settings.OUTPUT_DEFAULT_PERMISSION, + user=getuser(), ): """ :param job_id: @@ -52,45 +54,39 @@ def __init__( job_id, app, inputs, + root_dir, + resume_jobstore, walltime, tool_walltime, memlimit, log_dir, log_prefix, app_name, - root_permissions, + root_permissions=root_permissions, + user=user, ) - self.resume_jobstore = resume_jobstore - dir_config = settings.PIPELINE_CONFIG.get(self.app_name) if self.app.nfcore_template: self.cli_output_name = "--outdir" else: self.cli_output_name = "--outDir" - if not dir_config: - dir_config = settings.PIPELINE_CONFIG["NA"] - if resume_jobstore: - self.job_store_dir = resume_jobstore - else: - self.job_store_dir = os.path.join(dir_config["JOB_STORE_ROOT"], self.job_id) - self.job_work_dir = os.path.join(dir_config["WORK_DIR_ROOT"], self.job_id) - self.job_outputs_dir = root_dir - self.job_tmp_dir = os.path.join(dir_config["TMP_DIR_ROOT"], self.job_id) def prepare_to_submit(self): self._prepare_directories() self._dump_app_inputs() return self.job_store_dir, self.job_work_dir, self.job_outputs_dir, self.log_dir, self.log_prefix + @userswitch def get_submit_command(self): command_line = self._command_line() - log_path 
= os.path.join(self.job_work_dir, "lsf.log") + log_path = os.path.join(self.job_work_dir, self.batch_system.logfileName) env = dict() env["NXF_OPTS"] = settings.NEXTFLOW_NXF_OPTS env["JAVA_HOME"] = settings.NEXTFLOW_JAVA_HOME env["PATH"] = env["JAVA_HOME"] + "bin:" + os.environ["PATH"] env["TMPDIR"] = self.job_tmp_dir env["NXF_CACHE_DIR"] = self.job_store_dir - return command_line, self._leader_args(), log_path, self.job_id, env + env["NXF_SLURM_PARTITION"] = self.partition + return command_line, self._leader_args(), log_path, self.job_id, self.partition, env def _leader_args(self): args = self._walltime() @@ -98,10 +94,10 @@ def _leader_args(self): return args def _walltime(self): - return ["-W", str(self.walltime)] if self.walltime else [] + return self.batch_system.set_walltime(None, self.walltime) def _memlimit(self): - return ["-M", self.memlimit] if self.memlimit else ["-M", "20"] + return self.batch_system.set_memlimit(self.memlimit, default="20") def _sha1(self, path, buffersize=1024 * 1024): try: @@ -154,6 +150,7 @@ def _output_construct(self, path): } return file_obj + @userswitch def get_outputs(self): error_message = None result = list() @@ -198,13 +195,18 @@ def inputs_location(self): inputs = self.inputs.get("inputs", []) input_map = dict() for i in inputs: - input_map[i["name"]] = os.path.join(self.job_work_dir, i["name"]) + i.get("extension", "") + file_path = os.path.join(self.job_work_dir, i["name"]) + extension = i.get("extension", "") + if extension: + file_path = file_path + extension + input_map[i["name"]] = file_path return input_map @property def config_location(self): return os.path.join(self.job_work_dir, "nf.config") + @userswitch def _dump_app_inputs(self): input_map = dict() inputs = self.inputs.get("inputs", []) @@ -216,7 +218,9 @@ def _dump_app_inputs(self): self._dump_config(config) def _dump_input(self, name, content, extension, root_dir): - file_path = os.path.join(root_dir, name) + extension + file_path = os.path.join(root_dir, 
name) + if extension: + file_path = file_path + extension with open(file_path, "w") as f: f.write(content) return file_path @@ -227,27 +231,11 @@ def _dump_config(self, config): f.write(config) return file_path - def _prepare_directories(self): - if not os.path.exists(self.job_work_dir): - os.mkdir(self.job_work_dir) - - if os.path.exists(self.job_store_dir) and not self.resume_jobstore: - shutil.rmtree(self.job_store_dir) - - if self.resume_jobstore: - if not os.path.exists(self.resume_jobstore): - raise Exception("The jobstore indicated to be resumed could not be found") - - if not os.path.exists(self.job_tmp_dir): - os.mkdir(self.job_tmp_dir) - - if self.log_dir: - if not os.path.exists(self.log_dir): - mode_int = int(self.root_permissions, 8) - os.makedirs(self.log_dir, mode=mode_int, exist_ok=True) - def _command_line(self): profile = self.inputs["profile"] + config = self.inputs.get("config") + if not profile: + profile = "''" params = self.inputs.get("params", {}) command_line = [ settings.NEXTFLOW, @@ -264,7 +252,7 @@ def _command_line(self): ] for k, v in self.inputs_location.items(): command_line.extend(["--%s" % k, v]) - if self.config_location: + if config: command_line.extend(["-c", self.config_location]) if params: for k, v in params.items(): diff --git a/submitter/toil_submitter/toil_jobsubmitter.py b/submitter/toil_submitter/toil_jobsubmitter.py index 3b72cb08..bffa2ace 100644 --- a/submitter/toil_submitter/toil_jobsubmitter.py +++ b/submitter/toil_submitter/toil_jobsubmitter.py @@ -1,13 +1,14 @@ import os +import re import json -import shutil import copy from django.conf import settings from django.core.serializers.json import DjangoJSONEncoder from orchestrator.models import Status from submitter import JobSubmitter from .toil_track_utils import ToilTrack, ToolStatus -from batch_systems.lsf_client.lsf_client import format_lsf_job_id +from submitter.userswitcher import userswitch +from getpass import getuser def 
translate_toil_to_model_status(status): @@ -39,53 +40,58 @@ def __init__( log_prefix="", app_name="NA", root_permissions=settings.OUTPUT_DEFAULT_PERMISSION, + user=getuser(), ): JobSubmitter.__init__( self, job_id, app, inputs, + root_dir, + resume_jobstore, walltime, tool_walltime, memlimit, log_dir, log_prefix, app_name, - root_permissions, + root_permissions=root_permissions, + user=user, ) - dir_config = settings.PIPELINE_CONFIG.get(self.app_name) - if not dir_config: - dir_config = settings.PIPELINE_CONFIG["NA"] - self.resume_jobstore = resume_jobstore - if resume_jobstore: - self.job_store_dir = resume_jobstore - else: - self.job_store_dir = os.path.join(dir_config["JOB_STORE_ROOT"], self.job_id) - self.job_work_dir = os.path.join(dir_config["WORK_DIR_ROOT"], self.job_id) - self.job_outputs_dir = root_dir - self.job_tmp_dir = os.path.join(dir_config["TMP_DIR_ROOT"], self.job_id) + + self.batch_system_args_env = None + self.single_machine_mode_workflows = ["nucleo_qc", "argos-qc"] + if settings.BATCH_SYSTEM == "LSF": + self.batch_system_args_env = "TOIL_LSF_ARGS" + elif settings.BATCH_SYSTEM == "SLURM": + self.batch_system_args_env = "TOIL_SLURM_ARGS" def prepare_to_submit(self): self._prepare_directories() self._dump_app_inputs() - self.app.resolve(self.job_work_dir) + self.resolve_app() return self.job_store_dir, self.job_work_dir, self.job_outputs_dir, self.log_dir, self.log_prefix + @userswitch + def resolve_app(self): + self.app.resolve(self.job_work_dir) + def get_submit_command(self): command_line = self._command_line() - log_path = os.path.join(self.job_work_dir, "lsf.log") + log_path = os.path.join(self.job_work_dir, self.batch_system.logfileName) env = dict() - if settings.LSF_SLA: - toil_lsf_args = "-sla %s %s %s" % ( - settings.LSF_SLA, - " ".join(self._job_group()), - " ".join(self._tool_args()), - ) - else: - toil_lsf_args = "%s %s" % (" ".join(self._job_group()), " ".join(self._tool_args())) + toil_batch_system_args = "%s %s %s" % ( + " 
".join(self._service_queue()), + " ".join(self._job_group()), + " ".join(self._tool_args()), + ) env["JAVA_HOME"] = None - env["TOIL_LSF_ARGS"] = toil_lsf_args - return command_line, self._leader_args(), log_path, self.job_id, env + env["TMP"] = self.job_tmp_dir + env["TMPDIR"] = self.job_tmp_dir + env[self.batch_system_args_env] = toil_batch_system_args.strip() + if settings.ACCESS_LEGACY_APP in self.app.github.lower(): + env["PATH"] = f"{settings.ACCESS_LEGACY_CONDA_ENV}:{os.environ.get('PATH')}" + return command_line, self._leader_args(), log_path, self.job_id, self.partition, env def get_commandline_status(self, cache): """ @@ -136,21 +142,31 @@ def get_commandline_status(self, cache): return job_safe, track_cache_safe + @userswitch def get_outputs(self): error_message = None result_json = None - lsf_log_path = os.path.join(self.job_work_dir, "lsf.log") + log_path = os.path.join(self.job_work_dir, self.batch_system.logfileName) try: - with open(lsf_log_path, "r") as f: + with open(log_path, "r") as f: data = f.readlines() data = "".join(data) substring = data.split("\n{")[1] - result = ("{" + substring).split("-----------")[0] + substring = re.sub(r"is.*mskcc\.org.*Successfully deleted the job store.*\)\r?\n", "", substring) + # Keep original opening brace + result = "{" + substring + if "-----------" in substring: + # Handle the special marker case + result = result[: result.rfind("-----------")] + else: + last_brace_idx = result.rfind("}") + result = result[: last_brace_idx + 1] + # Now it's safe to parse result_json = json.loads(result) except (IndexError, ValueError): - error_message = "Could not parse json from %s" % lsf_log_path + error_message = "Could not parse json from %s" % log_path except FileNotFoundError: - error_message = "Could not find %s" % lsf_log_path + error_message = "Could not find %s" % log_path if self.log_dir: output_log_name = f"{self.log_prefix}.output.json" if self.log_prefix else "output.json" @@ -168,6 +184,7 @@ def 
app_location(self): def inputs_location(self): return os.path.join(self.job_work_dir, "input.json") + @userswitch def _dump_app_inputs(self): inputs_location = self.inputs_location with open(inputs_location, "w") as f: @@ -178,27 +195,13 @@ def _dump_app_inputs(self): with open(inputs_log_location, "w") as f: json.dump(self.inputs, f) - def _prepare_directories(self): - if not os.path.exists(self.job_work_dir): - os.mkdir(self.job_work_dir) - - if self.log_dir: - if not os.path.exists(self.log_dir): - mode_int = int(self.root_permissions, 8) - os.makedirs(self.log_dir, mode=mode_int, exist_ok=True) - - if os.path.exists(self.job_store_dir) and not self.resume_jobstore: - shutil.rmtree(self.job_store_dir) - - if self.resume_jobstore: - if not os.path.exists(self.resume_jobstore): - raise Exception("The job_store indicated to be resumed could not be found") - - if not os.path.exists(self.job_tmp_dir): - os.mkdir(self.job_tmp_dir) - def _leader_args(self): + single_machine = any([w in self.app.github.lower() for w in self.single_machine_mode_workflows]) args = self._walltime() + if single_machine: + args.extend(self._numtasks(int(settings.SINGLE_MACHINE_CORES))) + if not self.memlimit: + self.memlimit = settings.SINGLE_MACHINE_MEMORY args.extend(self._memlimit()) return args @@ -207,69 +210,27 @@ def _tool_args(self): if self.tool_walltime: expected_limit = max(1, int(self.tool_walltime / 3)) hard_limit = self.tool_walltime - args = ["-We", str(expected_limit), "-W", str(hard_limit)] - args.extend(self._memlimit()) + args = self.batch_system.set_walltime(expected_limit, hard_limit) return args + def _service_queue(self): + return self.batch_system.set_service_queue(self.partition) + def _walltime(self): - return ["-W", str(self.walltime)] if self.walltime else [] + return self.batch_system.set_walltime(None, self.walltime) def _memlimit(self): - return ["-M", self.memlimit] if self.memlimit else [] + return self.batch_system.set_memlimit(self.memlimit, default="5") + 
+ def _numtasks(self, num_tasks): + return self.batch_system.set_num_tasks(num_tasks) def _job_group(self): - return ["-g", format_lsf_job_id(self.job_id)] + return self.batch_system.set_group(self.job_id) def _command_line(self): - single_machine_mode_workflows = ["nucleo_qc", "argos-qc"] - single_machine = any([w in self.app.github.lower() for w in single_machine_mode_workflows]) - if settings.ACCESS_LEGACY_APP in self.app.github.lower(): - """ - Start ACCESS-specific code - """ - access_path = "PATH=/home/accessbot/miniconda3/envs/ACCESS_cmplx_geno_test/bin:{}" - path = access_path.format(os.environ.get("PATH")) - command_line = [ - path, - "toil-cwl-runner", - "--no-container", - "--logFile", - "toil_log.log", - "--batchSystem", - "lsf", - "--logLevel", - "DEBUG", - "--stats", - "--cleanWorkDir", - "onSuccess", - "--disableCaching", - "--defaultMemory", - "10G", - "--retryCount", - "2", - "--disableChaining", - "--preserve-environment", - "PATH", - "TMPDIR", - "TOIL_LSF_ARGS", - "CWL_SINGULARITY_CACHE", - "PWD", - "_JAVA_OPTIONS", - "PYTHONPATH", - "TEMP", - "--jobStore", - self.job_store_dir, - "--tmpdir-prefix", - self.job_tmp_dir, - "--workDir", - self.job_work_dir, - "--outdir", - self.job_outputs_dir, - ] - """ - End ACCESS-specific code - """ - elif single_machine: + single_machine = any([w in self.app.github.lower() for w in self.single_machine_mode_workflows]) + if single_machine: command_line = [ settings.CWLTOIL, "--singularity", @@ -282,17 +243,15 @@ def _command_line(self): str(settings.TOIL_STATE_POLLING_WAIT), "--disable-user-provenance", "--disable-host-provenance", - "--stats", "--cleanWorkDir", "onSuccess", - "--debug", "--disableProgress", "--doubleMem", + "True", "--disableCaching", "--preserve-environment", "PATH", "TMPDIR", - "TOIL_LSF_ARGS", "CWL_SINGULARITY_CACHE", "SINGULARITYENV_LC_ALL", "PWD", @@ -300,13 +259,14 @@ def _command_line(self): settings.TOIL_DEFAULT_MEMORY, "--maxCores", settings.TOIL_MAX_CORES, - "--maxDisk", - "128G", + 
"--jobStoreTimeout", + "600", "--maxMemory", "256G", "--not-strict", "--runCwlInternalJobsOnWorkers", "--realTimeLogging", + "True", "--jobStore", self.job_store_dir, "--tmpdir-prefix", @@ -317,6 +277,8 @@ def _command_line(self): self.job_outputs_dir, "--maxLocalJobs", "500", + "--no-prepull", + "--reference-inputs", ] else: command_line = [ @@ -326,22 +288,21 @@ def _command_line(self): "--logFile", "toil_log.log", "--batchSystem", - "lsf", + self.batch_system.name, "--statePollingWait", str(settings.TOIL_STATE_POLLING_WAIT), "--disable-user-provenance", "--disable-host-provenance", - "--stats", "--cleanWorkDir", "onSuccess", - "--debug", "--disableProgress", "--doubleMem", + "True", "--disableCaching", "--preserve-environment", "PATH", "TMPDIR", - "TOIL_LSF_ARGS", + self.batch_system_args_env, "CWL_SINGULARITY_CACHE", "SINGULARITYENV_LC_ALL", "PWD", @@ -349,13 +310,14 @@ def _command_line(self): settings.TOIL_DEFAULT_MEMORY, "--maxCores", settings.TOIL_MAX_CORES, - "--maxDisk", - "128G", + "--jobStoreTimeout", + "600", "--maxMemory", "256G", "--not-strict", "--runCwlInternalJobsOnWorkers", "--realTimeLogging", + "True", "--jobStore", self.job_store_dir, "--tmpdir-prefix", @@ -366,6 +328,8 @@ def _command_line(self): self.job_outputs_dir, "--maxLocalJobs", "500", + "--no-prepull", + "--reference-inputs", ] if self.resume_jobstore: command_line.extend(["--restart", self.app_location]) diff --git a/submitter/toil_submitter/toil_track_utils.py b/submitter/toil_submitter/toil_track_utils.py index 12c5fcc9..4c4a16a6 100755 --- a/submitter/toil_submitter/toil_track_utils.py +++ b/submitter/toil_submitter/toil_track_utils.py @@ -11,13 +11,14 @@ import copy import json import glob +import inspect from enum import IntEnum from datetime import datetime from json.decoder import JSONDecodeError from orchestrator.exceptions import StopException from toil.jobStores.fileJobStore import FileJobStore +from toil.version import baseVersion from toil.toilState import ToilState as 
toil_state -from toil.cwl.cwltoil import CWL_INTERNAL_JOBS from toil.jobStores.abstractJobStore import NoSuchJobException, NoSuchJobStoreException, JobException @@ -27,6 +28,13 @@ JITTER = 5 ATTEMPTS = 5 +TOIL_MAJOR_VERSION = int(baseVersion[0]) + +if TOIL_MAJOR_VERSION >= 8: + from toil.bus import replay_message_bus +else: + from toil.cwl.cwltoil import CWL_INTERNAL_JOBS + def _get_method(file_job_store, method): """ @@ -42,25 +50,32 @@ def _get_method(file_job_store, method): def _check_job_method(file_job_store): """ TOIL Adapter function to check for job existence using method under diffrent - names from TOIL 5.4 and 3.21 + names from TOIL versions 8.0, 5.4, and 3.21 """ - check_function = _get_method(file_job_store, "_checkJobStoreIdExists") - if check_function: - return check_function + default_check_function = _get_method(file_job_store, "_wait_for_exists") - fallback_function = _get_method(file_job_store, "_checkJobStoreId") + if default_check_function: + return default_check_function + + fallback_function = _get_method(file_job_store, "_checkJobStoreIdExists") if fallback_function: return fallback_function + # What about second fallback? 
Ref - https://knowyourmeme.com/memes/second-breakfast + second_fallback_function = _get_method(file_job_store, "_checkJobStoreId") + + if second_fallback_function: + return second_fallback_function + raise Exception("Unable to check jobs, possible incompatibility with the current TOIL version") def _check_retry_count(job): """ TOIL Adapter function to check for the job retry count using attributes under - diffrent names from TOIL 5.4 and 3.21 + diffrent names from TOIL versions 8.0, 5.4, and 3.21 """ if hasattr(job, "_remainingTryCount"): return getattr(job, "_remainingTryCount") @@ -77,7 +92,7 @@ def _check_retry_count(job): def _check_job_stats(job): """ TOIL Adapter function to check for job status using - the proper attributes from TOIL 5.4 and 3.21 + the proper attributes from TOIL versions 8.0, 5.4, and 3.21 """ disk = None memory = None @@ -108,6 +123,17 @@ def _get_job_stream_path(text): return None +def _get_job_id(text): + """ + TOIL helper function to parse the + job id path from text + """ + job_id = re.search(r"kind\S*", text) + if job_id: + return job_id[0] + return None + + def _read_stats_file(stats_path): """ TOIL Adapter function to read the stats file @@ -124,11 +150,16 @@ def _read_stats_file(stats_path): jobs = stats_json.get("jobs", []) for single_worker, single_job in zip(worker_logs, jobs): worker_text = single_worker.get("text", "") - job_stream = _get_job_stream_path(worker_text) + job_stream = None + job_id = None + if TOIL_MAJOR_VERSION >= 8: + job_id = _get_job_id(worker_text) + else: + job_stream = _get_job_stream_path(worker_text) job_mem = single_job.get("memory") job_cpu = single_job.get("clock") - if job_stream and job_mem and job_cpu: - stats_info.append((job_stream, job_mem, job_cpu)) + if (job_stream or job_id) and job_mem and job_cpu: + stats_info.append((job_id, job_stream, job_mem, job_cpu)) return stats_info @@ -154,10 +185,13 @@ def _resume_job_store(job_store_path, total_attempts): raise Exception("Job store path %s not 
found" % job_store_path) read_only_job_store_obj = ReadOnlyFileJobStore(job_store_path, total_attempts) read_only_job_store_obj.resume() - read_only_job_store_obj.set_job_cache() - job_store_cache = read_only_job_store_obj.job_cache - root_job = _clean_job_store(read_only_job_store_obj, job_store_cache) - return read_only_job_store_obj, root_job + if TOIL_MAJOR_VERSION >= 8: + return read_only_job_store_obj, read_only_job_store_obj.load_root_job() + else: + read_only_job_store_obj.set_job_cache() + job_store_cache = read_only_job_store_obj.job_cache + root_job = _clean_job_store(read_only_job_store_obj, job_store_cache) + return read_only_job_store_obj, root_job def _load_job_store(job_store, root_job): @@ -166,9 +200,12 @@ def _load_job_store(job_store, root_job): into a TOIL state object and avoid random filesystem issues """ - job_store_cache = job_store.job_cache - toil_state_obj = toil_state(job_store, root_job, jobCache=job_store_cache) - return toil_state_obj + if TOIL_MAJOR_VERSION >= 8: + return None + else: + job_store_cache = job_store.job_cache + toil_state_obj = toil_state(job_store, root_job, jobCache=job_store_cache) + return toil_state_obj def _check_job_state(work_log_path, jobs_path): @@ -191,6 +228,15 @@ def _check_job_state(work_log_path, jobs_path): return None +def get_job_id_from_worker_log(worker_log_path): + if os.path.exists(worker_log_path): + with open(worker_log_path) as worker_log: + for single_line in worker_log: + if "Working on job" in single_line: + return _get_job_id(single_line) + return None + + def _check_worker_logs(work_dir, work_log_to_job_id, jobs_path): """ Check the work directory for worker logs and report @@ -205,7 +251,10 @@ def _check_worker_logs(work_dir, work_log_to_job_id, jobs_path): if single_worker_log in work_log_to_job_id: job_id = work_log_to_job_id[single_worker_log] else: - job_id = _check_job_state(single_worker_log, jobs_path) + if TOIL_MAJOR_VERSION >= 8: + job_id = 
get_job_id_from_worker_log(single_worker_log) + else: + job_id = _check_job_state(single_worker_log, jobs_path) if job_id: worker_dict[job_id] = (single_worker_log, last_modified) return worker_dict @@ -223,25 +272,11 @@ def _get_file_modification_time(file_path): return None -def _get_current_jobs(toil_state_obj): +def _get_bus_path(job_store): """ - TOIL Adapter function to get updated jobs - from the toil_state_obj + TOIL helper function to get leader bus path, only supported for TOIL >= 8.0.0 """ - updated_jobs = toil_state_obj.updatedJobs - if not updated_jobs: - return [] - - job_list = [] - if isinstance(updated_jobs, set): - for single_job in updated_jobs: - job_list.append(single_job[0]) - elif isinstance(updated_jobs, dict): - for single_job in updated_jobs.values(): - job_list.append(single_job[0]) - else: - raise Exception("Unable to check TOIL state, possible incompatibility with the current TOIL version") - return job_list + return job_store.config.write_messages def _get_job_display_name(job): @@ -249,27 +284,42 @@ def _get_job_display_name(job): TOIL adapter to get the display name of the job from TOIL job. 
Use the field job_name or display_name depending on the TOIL version Example: - job_name: file:///Users/kumarn1/work/ridgeback/tests/test_jobstores/ - test_cwl/sleep.cwl#simpleWorkflow/sleep/sleep + job_name: file:///Users/kumarn1/cwl/test_jobstore/sleep.cwl#simpleWorkflow/sleep/sleep returns "sleep" When id is not specified in the cwl it will return the name of the cwl Example: - job_name: file:///Users/kumarn1/work/ridgeback/tests/test_jobstores/ - test_cwl/sleep.cwl + job_name: file:///Users/kumarn1/cwl/test_jobstore/sleep.cwl returns "sleep" """ - job_name = job.jobName - display_name = job.displayName - cwl_path = None - if "cwl" in job_name: - cwl_path = job_name - elif "cwl" in display_name: - cwl_path = display_name + if TOIL_MAJOR_VERSION >= 8: + display_name = None + unit_name = job.unitName + if not unit_name: + unit_name = job.displayName + if ".cwl" in unit_name: + display_name = unit_name.split(".")[-2] + elif "." in unit_name: + split_list = unit_name.split(".") + if split_list[-1].startswith("_"): + display_name = "".join(split_list[-2::]) + else: + display_name = unit_name.split(".")[-1] + else: + display_name = unit_name + return display_name else: - raise Exception("Could not find name in possible values %s %s" % (job_name, display_name)) - job_basename = os.path.basename(cwl_path) - display_name = os.path.splitext(job_basename)[0] - return display_name + job_name = job.jobName + display_name = job.displayName + cwl_path = None + if "cwl" in job_name: + cwl_path = job_name + elif "cwl" in display_name: + cwl_path = display_name + else: + raise Exception("Could not find name in possible values %s %s" % (job_name, display_name)) + job_basename = os.path.basename(cwl_path) + display_name = os.path.splitext(job_basename)[0] + return display_name class ReadOnlyFileJobStore(FileJobStore): @@ -292,8 +342,13 @@ def check_if_job_exists(self, job_store_id): Check if the job exists in the job store """ check_function = _check_job_method(self) + max_tries = 
0 + args = inspect.getfullargspec(check_function).args try: - check_function(job_store_id) + if "maxTries" in args: + check_function(job_store_id, maxTries=max_tries) + else: + check_function(job_store_id) return True except NoSuchJobException: return False @@ -301,11 +356,17 @@ def check_if_job_exists(self, job_store_id): def load(self, jobStoreID): if jobStoreID in self.job_cache: return self.job_cache[jobStoreID] - self.check_if_job_exists(jobStoreID) - job_file = self._getJobFileName(jobStoreID) - with open(job_file, "rb") as file_handle: - job = pickle.load(file_handle) - return job + if self.check_if_job_exists(jobStoreID): + if TOIL_MAJOR_VERSION >= 8: + job_file = self._get_job_file_name(jobStoreID) + else: + job_file = self._getJobFileName(jobStoreID) + try: + with open(job_file, "rb") as file_handle: + job = pickle.load(file_handle) + except OSError: + return None + return job def set_job_cache(self): """ @@ -384,7 +445,6 @@ def writeFileStream(self, jobStoreID=None, cleanup=False, basename=None, encodin pass # pylint: enable=too-many-arguments - def writeSharedFileStream(self, sharedFileName, isProtected=None, encoding=None, errors=None): pass @@ -406,6 +466,9 @@ class ToolStatus(IntEnum): FAILED = 4 UNKNOWN = 5 + def __str__(self): + return self.name + class ToilTrack: """ @@ -433,7 +496,6 @@ def __init__( self.show_cwl_internal = show_cwl_internal # pylint: enable=too-many-instance-attributes - def create_job_id(self, job_store_id, id_prefix_param=None, id_suffix_param=None): """ Create a job id using the Id in the TOIL jobstore with @@ -464,12 +526,12 @@ def create_job_id(self, job_store_id, id_prefix_param=None, id_suffix_param=None id_string = "%s-%s-%s" % (id_prefix, job_id, id_suffix) return id_string - def mark_job_as_failed(self, job_id, job_name, job): + def mark_job_as_failed(self, job_id, job): """ Mark a job as failed """ job_dict = self.jobs - if job_name not in CWL_INTERNAL_JOBS or self.show_cwl_internal: + if not self.is_local_job(job): 
if job_id in job_dict: job_dict[job_id]["status"] = ToolStatus.FAILED if not job_dict[job_id]["finished"]: @@ -494,6 +556,36 @@ def mark_job_as_failed(self, job_id, job_name, job): } job_dict[job_id] = new_job + def mark_job_as_completed(self, job_id, job): + """ + Mark a job as completed + """ + job_dict = self.jobs + if not self.is_local_job(job): + if job_id in job_dict: + job_dict[job_id]["status"] = ToolStatus.COMPLETED + if not job_dict[job_id]["finished"]: + job_dict[job_id]["finished"] = datetime.now() + else: + cores, disk, memory = _check_job_stats(job) + display_name = _get_job_display_name(job) + new_job = { + "name": display_name, + "disk": disk, + "status": ToolStatus.COMPLETED, + "job_stream": None, + "memory_req": memory, + "cores_req": cores, + "cpu_usage": [], + "mem_usage": [], + "started": datetime.now(), + "submitted": datetime.now(), + "last_modified": datetime.now(), + "log_path": None, + "finished": datetime.now(), + } + job_dict[job_id] = new_job + def _update_job_stats(self, job_key, job_mem, job_cpu): """ Parse and update job stats @@ -526,10 +618,13 @@ def check_stats(self, job_store_obj): stats_file_list = job_store_obj.get_stats_files() for single_stats_path in stats_file_list: stats_info = _read_stats_file(single_stats_path) - for job_stream, job_mem, job_cpu in stats_info: - job_key = jobs_path.get(job_stream) - if job_key: - self._update_job_stats(job_key, job_mem, job_cpu) + for job_id, job_stream, job_mem, job_cpu in stats_info: + if job_id: + self._update_job_stats(job_id, job_mem, job_cpu) + else: + job_key = jobs_path.get(job_stream) + if job_key: + self._update_job_stats(job_key, job_mem, job_cpu) def handle_failed_jobs(self, job_store): """ @@ -541,9 +636,11 @@ def handle_failed_jobs(self, job_store): if retry_count is not None: previous_suffix = self.total_attempts - (retry_count + 1) jobstore_id = single_job.jobStoreID - job_id = self.create_job_id(jobstore_id, id_suffix_param=previous_suffix) - job_name = 
single_job.jobName - self.mark_job_as_failed(job_id, job_name, single_job) + if TOIL_MAJOR_VERSION >= 8: + job_id = jobstore_id + else: + job_id = self.create_job_id(jobstore_id, id_suffix_param=previous_suffix) + self.mark_job_as_failed(job_id, single_job) def handle_restarted_jobs(self, job_store): """ @@ -557,32 +654,92 @@ def handle_restarted_jobs(self, job_store): jobstore_id = single_job.jobStoreID previous_retry_count = self.total_attempts - (retry_count + 1) retry_job_ids[jobstore_id] = previous_retry_count - job_id = self.create_job_id(jobstore_id, id_suffix_param=previous_retry_count) + if TOIL_MAJOR_VERSION >= 8: + job_id = jobstore_id + else: + job_id = self.create_job_id(jobstore_id, id_suffix_param=previous_retry_count) if job_id in job_dict and job_dict[job_id]["status"] != ToolStatus.FAILED: - job_name = single_job.jobName - self.mark_job_as_failed(job_id, job_name, single_job) + self.mark_job_as_failed(job_id, single_job) + + def set_job_stream(self, single_job, job_id): + job_stream = None + if single_job.command: + job_stream = _get_job_stream_path(single_job.command) + if not job_stream: + logger.debug("Could not find job_stream for job %s [%s]", single_job.job_name, job_id) + self.jobs_path[job_stream] = job_id + return job_stream + + def is_local_job(self, job): + if TOIL_MAJOR_VERSION >= 8: + return job.local and self.show_cwl_internal + else: + return job.jobName in CWL_INTERNAL_JOBS and self.show_cwl_internal - def handle_current_jobs(self, toil_state_obj): + def _get_current_jobs(self, toil_state_obj, job_store): + """ + TOIL Adapter function to get updated jobs + from the toil_state_obj + """ + job_list = [] + + if TOIL_MAJOR_VERSION >= 8: + message_bus = _get_bus_path(job_store) + if message_bus and os.path.exists(message_bus): + all_job_statuses = replay_message_bus(message_bus) + for job_status in all_job_statuses.values(): + if job_status.is_running(): + single_job = job_store.load(job_status.job_store_id) + if single_job: + 
job_list.append(single_job) + elif job_status.exit_code == 0: + single_job = job_store.load(job_status.job_store_id) + if single_job: + self.mark_job_as_completed(job_status.job_store_id, single_job) + elif job_status.exit_code > 0: + single_job = job_store.load(job_status.job_store_id) + if single_job: + self.mark_job_as_failed(job_status.job_store_id, single_job) + else: + pass + + return job_list + else: + updated_jobs = toil_state_obj.updatedJobs + if not updated_jobs: + return [] + if isinstance(updated_jobs, set): + for single_job in updated_jobs: + job_list.append(single_job[0]) + elif isinstance(updated_jobs, dict): + for single_job in updated_jobs.values(): + job_list.append(single_job[0]) + else: + raise Exception("Unable to check TOIL state, possible incompatibility with the current TOIL version") + return job_list + + def handle_current_jobs(self, toil_state_obj, job_store): """ Check TOIL jobstore for current/new jobs, add new jobs, and collect stats on new jobs """ jobs_dict = self.jobs current_jobs = [] - for single_job in _get_current_jobs(toil_state_obj): - job_name = single_job.jobName - if job_name not in CWL_INTERNAL_JOBS or self.show_cwl_internal: + for single_job in self._get_current_jobs(toil_state_obj, job_store): + if not self.is_local_job(single_job): cores, disk, memory = _check_job_stats(single_job) jobstore_id = single_job.jobStoreID - job_id = self.create_job_id(jobstore_id) + if TOIL_MAJOR_VERSION >= 8: + job_id = jobstore_id + else: + job_id = self.create_job_id(jobstore_id) current_jobs.append(job_id) job_stream = None - if single_job.command: - job_stream = _get_job_stream_path(single_job.command) - if not job_stream: - logger.debug("Could not find job_stream for job %s [%s]", job_name, job_id) - if job_stream and job_id not in jobs_dict: - self.jobs_path[job_stream] = job_id + if TOIL_MAJOR_VERSION < 8: + job_stream = self.set_job_stream(single_job, job_store) + if not job_stream: + continue + if job_id not in jobs_dict: 
display_name = _get_job_display_name(single_job) new_job = { "name": display_name, @@ -623,16 +780,17 @@ def handle_running_jobs(self): worker_info = _check_worker_logs(self.work_dir, worker_log_to_job_dict, self.jobs_path) for single_job_id in worker_info: worker_log, last_modified = worker_info[single_job_id] - job_obj = job_dict[single_job_id] - if job_obj["status"] == ToolStatus.PENDING or job_obj["status"] == ToolStatus.UNKNOWN: - job_obj["status"] = ToolStatus.RUNNING - if not job_obj["started"]: - job_obj["started"] = datetime.now() - if last_modified: - job_obj["last_modified"] = last_modified - job_obj["log_path"] = worker_log - if worker_log not in worker_log_to_job_dict: - worker_log_to_job_dict[worker_log] = single_job_id + if single_job_id in job_dict: + job_obj = job_dict[single_job_id] + if job_obj["status"] == ToolStatus.PENDING or job_obj["status"] == ToolStatus.UNKNOWN: + job_obj["status"] = ToolStatus.RUNNING + if not job_obj["started"]: + job_obj["started"] = datetime.now() + if last_modified: + job_obj["last_modified"] = last_modified + job_obj["log_path"] = worker_log + if worker_log not in worker_log_to_job_dict: + worker_log_to_job_dict[worker_log] = single_job_id def check_status(self): """ @@ -646,6 +804,8 @@ def check_status(self): except JobException: logger.warning("No job has been set as the root in this job store") return + except Exception: + raise StopException("This job was run by a version of TOIL that is not supported") if not root_job: logger.warning("RootJob couldn't be fetched") raise StopException("RootJob couldn't be fetched") @@ -653,11 +813,11 @@ def check_status(self): if not job_store.check_if_job_exists(root_job_id): logger.warning("Jobstore root not found, toil job may be finished or just starting") toil_state_obj = _load_job_store(job_store, root_job) - if not toil_state_obj: + if not toil_state_obj and TOIL_MAJOR_VERSION < 8: logger.warning("TOIL state is unexpectedly empty") self.handle_failed_jobs(job_store) 
self.handle_restarted_jobs(job_store) - current_jobs = self.handle_current_jobs(toil_state_obj) + current_jobs = self.handle_current_jobs(toil_state_obj, job_store) self.handle_finished_jobs(current_jobs) self.handle_running_jobs() self.check_stats(job_store) @@ -678,6 +838,9 @@ def script_track_status(toil_track_obj): jobs_path = toil_track_obj.jobs_path jobs = toil_track_obj.jobs work_log_to_job_id = toil_track_obj.work_log_to_job_id + for single_job_id in jobs: + single_job = jobs[single_job_id] + single_job["status"] = str(single_job["status"]) print(json.dumps(jobs, indent=4, sort_keys=True, default=str)) time.sleep(4) @@ -701,10 +864,10 @@ def main(): """ usage_str = """ - USAGE: - toil_track_utils.py track [job_store_path] [work_dir_path] - toil_track_utils.py snapshot [job_store_path_1] [work_dir_path_1] [job_store_path_2] [work_dir_path_2] - """ + USAGE: + toil_track_utils.py track [job_store_path] [work_dir_path] + toil_track_utils.py snapshot [job_store_path_1] [work_dir_path_1] [job_store_path_2] [work_dir_path_2] + """ if len(sys.argv) not in [4, 6]: print(usage_str) diff --git a/submitter/userswitcher.py b/submitter/userswitcher.py new file mode 100755 index 00000000..bf355bd7 --- /dev/null +++ b/submitter/userswitcher.py @@ -0,0 +1,109 @@ +import os +import sys +import subprocess +import dill +import contextlib +import io +import logging +from pathlib import Path +from functools import wraps +from getpass import getuser +import django +import tempfile +from django.conf import settings + +log = logging.getLogger(__name__) + + +def userscript(): + stdout_buffer = io.StringIO() + stderr_buffer = io.StringIO() + exception_raised = False + output = None + with contextlib.redirect_stdout(stdout_buffer), contextlib.redirect_stderr(stderr_buffer): + try: + env_path = sys.argv[1] + with open(env_path, "rb") as env_file: + env_dict = dill.load(env_file) + for single_env in env_dict: + if single_env == "PATH": + os.environ[single_env] = env_dict[single_env] + 
else: + os.environ.setdefault(single_env, env_dict[single_env]) + ridgeback_path = env_dict["RIDGEBACK_PATH"] + sys.path.append(ridgeback_path) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "ridgeback.settings") + django.setup() + func_data = sys.stdin.buffer.read() + func, args, kwargs = dill.loads(func_data) + output = func(*args, **kwargs) + except Exception: + log.exception("Exception when running the function as another user") + exception_raised = True + script_tuple = (output, stdout_buffer.getvalue().encode()) + sys.stderr.buffer.write(stderr_buffer.getvalue().encode()) + sys.stdout.buffer.write(dill.dumps(script_tuple)) + if exception_raised: + sys.exit(1) + + +def userswitch(func): + @wraps(func) + def dzdo_wrapper(*args, **kwargs): + # jobsubmitter/batchsystem objects will have the user attribute in self + user = args[0].user + current_env = {} + if user == getuser() or not settings.ENABLE_USER_SWITCH: + return func(*args, **kwargs) + else: + for key, value in os.environ.items(): + current_env[key] = value + proc_command = ["dzdo", "--login", "-u", f"{user}", sys.executable, Path(__file__).absolute()] + try: + job_func = dill.dumps((func, args, kwargs)) + with tempfile.NamedTemporaryFile(mode="wb", dir="/tmp") as tmp_env_file: + os.chmod(tmp_env_file.name, 0o755) + dill.dump(current_env, tmp_env_file) + dzdo_process = subprocess.run( + proc_command + [tmp_env_file.name], input=job_func, check=True, capture_output=True, env=current_env + ) + output, stdout = dill.loads(dzdo_process.stdout) + func_stdout = stdout.decode().strip() + func_stderr = dzdo_process.stderr.decode().strip() + if func_stdout: + log.info(func_stdout) + if func_stderr: + log.error(func_stderr) + return output + except subprocess.CalledProcessError as e: + stdout_str = "" + stderr_str = "" + try: + stderr = e.stderr + if stderr: + stderr_str = stderr.decode().strip() + output, stdout = dill.loads(e.output) + if stdout: + stdout_str = stdout.decode().strip() + except Exception: + 
stdout_str = "NA" + exception_message = f""" + Error while userswitching: + USER: {user} + Return Code: {e.returncode} + Output: {stdout_str} + Error: {stderr_str} + """ + raise Exception(exception_message) + except FileNotFoundError as e: + exception_message = f""" + Error, command not found while userswitching: + {e.filename} not found. + """ + raise Exception(exception_message) + + return dzdo_wrapper + + +if __name__ == "__main__": + userscript() diff --git a/tests/data/toil_8.0.0.tar.gz b/tests/data/toil_8.0.0.tar.gz new file mode 100644 index 00000000..e46d1032 Binary files /dev/null and b/tests/data/toil_8.0.0.tar.gz differ diff --git a/tests/test_api.py b/tests/test_api.py index 8e0b7092..0537f571 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -45,7 +45,7 @@ def test_404_read(self): response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) - @patch("orchestrator.tasks.submit_job_to_lsf") + @patch("orchestrator.tasks.submit_job_to_batch_system") def test_create(self, submit_jobs_mock): ddtrace.tracer.enabled = False url = self.api_root + "jobs/" @@ -80,7 +80,7 @@ def test_delete_authorized(self): response = self.client.delete(url) self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) - @patch("orchestrator.tasks.submit_job_to_lsf") + @patch("orchestrator.tasks.submit_job_to_batch_system") def test_resume(self, submit_jobs_mock): ddtrace.tracer.enabled = False url = "{}jobs/{}/resume/".format(self.api_root, self.example_job.id) diff --git a/tests/test_commandline.py b/tests/test_commandline.py index 335ef142..38b92571 100644 --- a/tests/test_commandline.py +++ b/tests/test_commandline.py @@ -5,6 +5,7 @@ import os from shutil import unpack_archive, copytree, copy import tempfile +from mock import patch from django.test import TestCase, override_settings import toil from orchestrator.models import Job, Status, PipelineType, CommandLineToolJob @@ -33,7 +34,7 @@ def get_toil_mock(self, toil_version): 
def setUp(self): Job.objects.all().delete() self.toil_version = toil.version.baseVersion - if self.toil_version not in ["3.21.0", "5.4.0a1"]: + if self.toil_version not in ["3.21.0", "5.4.0a1", "8.0.0"]: raise Exception("TOIL version: %s not supported" % self.toil_version) self.mock_dir = tempfile.TemporaryDirectory() self.job = Job( @@ -55,17 +56,24 @@ def mock_track(self, run_type): """ Mock track using TOIL snapshots """ + mock_data_path = os.path.join(self.mock_full_path, run_type) first_jobstore = os.path.join(mock_data_path, "0", "jobstore") first_work = os.path.join(mock_data_path, "0", "work") - second_jobstore = os.path.join(mock_data_path, "1", "jobstore") - second_work = os.path.join(mock_data_path, "1", "work") - with tempfile.TemporaryDirectory() as tmpdir: - self.check_status(first_jobstore, first_work, tmpdir) - with tempfile.TemporaryDirectory() as tmpdir: - self.check_status(second_jobstore, second_work, tmpdir) + if self.toil_version == "8.0.0": + with tempfile.TemporaryDirectory() as tmpdir: + bus_path = os.path.join(mock_data_path, "0", "bus") + self.check_status(first_jobstore, first_work, tmpdir, bus_path) + else: + second_jobstore = os.path.join(mock_data_path, "1", "jobstore") + second_work = os.path.join(mock_data_path, "1", "work") + with tempfile.TemporaryDirectory() as tmpdir: + self.check_status(first_jobstore, first_work, tmpdir, None) + with tempfile.TemporaryDirectory() as tmpdir: + self.check_status(second_jobstore, second_work, tmpdir, None) - def check_status(self, jobstore, work_dir, tmp_dir): + @patch("submitter.toil_submitter.toil_track_utils._get_bus_path") + def check_status(self, jobstore, work_dir, tmp_dir, bus_path, get_bus_path): """ Check status of command line jobs """ @@ -74,11 +82,20 @@ def check_status(self, jobstore, work_dir, tmp_dir): tmp_jobstore = os.path.join(tmp_dir, "jobstore") new_work_dir = os.path.join(tmp_work_dir, job_id) new_jobstore = os.path.join(tmp_jobstore, job_id) + if bus_path: + new_bus_path = 
os.path.join(tmp_dir, "bus") + copy(bus_path, new_bus_path) + get_bus_path.return_value = new_bus_path copytree(jobstore, new_jobstore) copytree(work_dir, new_work_dir) with override_settings( PIPELINE_CONFIG={ - "NA": {"JOB_STORE_ROOT": tmp_jobstore, "WORK_DIR_ROOT": tmp_work_dir, "TMP_DIR_ROOT": "/tmp"} + "NA": { + "JOB_STORE_ROOT": tmp_jobstore, + "WORK_DIR_ROOT": tmp_work_dir, + "TMP_DIR_ROOT": "/tmp", + "PARTITION": "test_partition", + } } ): check_status_of_command_line_jobs(self.job) @@ -90,7 +107,10 @@ def test_running(self): self.mock_track("running") mock_num_completed = 0 mock_num_running = 0 - if self.toil_version == "3.21.0": + if self.toil_version == "8.0.0": + mock_num_completed = 1 + mock_num_running = 2 + elif self.toil_version == "3.21.0": mock_num_completed = 1 mock_num_running = 2 elif self.toil_version == "5.4.0a1": @@ -107,24 +127,27 @@ def test_failed(self): Test if failed jobs are properly parsed """ self.mock_track("failed") - mock_num_failed = 2 + if self.toil_version == "8.0.0": + mock_num_failed = 3 + else: + mock_num_failed = 2 num_failed = CommandLineToolJob.objects.filter(status=(Status.FAILED)).count() self.assertEqual(num_failed, mock_num_failed) def test_details_set(self): """ - Test if the metadata is being set for commandLineJObs + Test if the metadata is being set for commandLineJobs """ self.mock_track("running") first_running_job = CommandLineToolJob.objects.filter(status=(Status.RUNNING)).first() details = first_running_job.details self.assertIsNotNone(details) self.assertIsNotNone(details["cores_req"]) - self.assertIsNotNone(details["cpu_usage"]) - self.assertIsNotNone(details["job_stream"]) + self.assertTrue("cpu_usage" in details) + self.assertTrue("job_stream" in details) self.assertIsNotNone(details["last_modified"]) - self.assertIsNotNone(details["log_path"]) - self.assertIsNotNone(details["mem_usage"]) + self.assertTrue("log_path" in details) + self.assertTrue("mem_usage" in details) 
self.assertIsNotNone(details["memory_req"]) def test_hanging_toil_leader_not_running(self): @@ -249,10 +272,11 @@ def test_hanging_message_for_tool_running(self): single_job.save() first_command = CommandLineToolJob.objects.first() first_command.status = Status.RUNNING + example_log = "path/to/log.log" + first_command.details["log_path"] = example_log first_command.save() - command_log_path = first_command.details["log_path"] with override_settings(MAX_HANGING_HOURS=0): check_job_hanging(self.job) self.job.refresh_from_db() self.assertIsNotNone(self.job.message["alerts"][0]) - self.assertTrue(command_log_path in self.job.message["alerts"][0]["message"]) + self.assertTrue(example_log in self.job.message["alerts"][0]["message"]) diff --git a/tests/test_lsf_client.py b/tests/test_lsf_client.py index 2cf59985..be44e0c4 100644 --- a/tests/test_lsf_client.py +++ b/tests/test_lsf_client.py @@ -68,10 +68,26 @@ def test_submit(self, submit_process): submit_process_obj.stdout = self.submit_response submit_process_obj.returncode = 0 submit_process.return_value = submit_process_obj + partition = "test_partition" + lsf_id = self.lsf_client.submit(command, args, stdout_file, self.example_job_id, partition, {}) + expected_command = ["bsub", "-sla", partition, "-g", self.example_lsf_id, "-oo", stdout_file] + args + command + self.assertEqual(lsf_id, self.example_id) + self.assertEqual(submit_process.call_args[0][0], expected_command) + + @patch("subprocess.run") + def test_submit_no_sla(self, submit_process): + """ + Test LSF submit with no sla + """ + command = ["ls"] + args = [] + stdout_file = "stdout.txt" + submit_process_obj = Mock() + submit_process_obj.stdout = self.submit_response + submit_process_obj.returncode = 0 + submit_process.return_value = submit_process_obj lsf_id = self.lsf_client.submit(command, args, stdout_file, self.example_job_id, {}) - expected_command = ( - ["bsub", "-sla", settings.LSF_SLA, "-g", self.example_lsf_id, "-oo", stdout_file] + args + command - 
) + expected_command = ["bsub", "-g", self.example_lsf_id, "-oo", stdout_file] + args + command self.assertEqual(lsf_id, self.example_id) self.assertEqual(submit_process.call_args[0][0], expected_command) @@ -87,7 +103,7 @@ def test_submit_slow_lsf(self, submit_process): submit_process_obj.stdout = self.submit_response_please_wait submit_process_obj.returncode = 0 submit_process.return_value = submit_process_obj - lsf_id = self.lsf_client.submit(command, args, stdout_file, self.example_job_id, {}) + lsf_id = self.lsf_client.submit(command, args, stdout_file, self.example_job_id, settings.LSF_SLA, {}) self.assertEqual(lsf_id, self.example_id) @patch("subprocess.run") @@ -103,6 +119,32 @@ def test_terminate(self, terminate_process): self.assertEqual(terminate_process.call_args[0][0], expected_command) self.assertEqual(terminated, True) + @patch("subprocess.run") + def test_suspend(self, suspend_process): + """ + Test LSF suspend + """ + suspend_process_obj = Mock() + suspend_process_obj.returncode = 0 + suspend_process.return_value = suspend_process_obj + expected_command = ["bstop", "-g", self.example_lsf_id, "0"] + suspended = self.lsf_client.suspend(self.example_job_id) + self.assertEqual(suspend_process.call_args[0][0], expected_command) + self.assertEqual(suspended, True) + + @patch("subprocess.run") + def test_resume(self, resume_process): + """ + Test LSF resume + """ + resume_process_obj = Mock() + resume_process_obj.returncode = 0 + resume_process.return_value = resume_process_obj + expected_command = ["bresume", "-g", self.example_lsf_id, "0"] + resumed = self.lsf_client.resume(self.example_job_id) + self.assertEqual(resume_process.call_args[0][0], expected_command) + self.assertEqual(resumed, True) + @patch("subprocess.run") def test_failed_status(self, status_process): """ diff --git a/tests/test_slurm_client.py b/tests/test_slurm_client.py new file mode 100644 index 00000000..3ca39669 --- /dev/null +++ b/tests/test_slurm_client.py @@ -0,0 +1,166 @@ +from 
django.test import TestCase +from mock import patch, Mock +from batch_systems.slurm_client.slurm_client import SLURMClient +from orchestrator.models import Status + + +class TestSLURMClient(TestCase): + """ + Test LSF Client + """ + + def setUp(self): + self.example_id = "12345678" + self.example_job_id = "d736e17e-d67a-4897-901b-decab9942398" + self.submit_response = "Submitted batch job {}".format(self.example_id) + self.slurm_client = SLURMClient() + self.example_partion = "partition1" + self.status_completed_response = """ + {}|COMPLETED|0:0 + {}.batch|COMPLETED|0:0 + {}.extern|COMPLETED|0:0 + """.format( + self.example_id, self.example_id, self.example_id + ) + self.status_failed_response = """ + {}|FAILED|1:0 + {}.batch|FAILED|1:0 + {}.extern|COMPLETED|0:0 + """.format( + self.example_id, self.example_id, self.example_id + ) + self.status_pend_response = """ + {}|PENDING|0:0 + """.format( + self.example_id + ) + self.exit_reason = "FAILED, tool exit code: 0, batchsystem exit code: 1" + self.pend_reason = None + + @patch("subprocess.run") + def test_submit(self, submit_process): + """ + Test SLURM submit + """ + command = "ls" + args = [] + stdout_file = f"{self.slurm_client.logfileName}" + submit_process_obj = Mock() + submit_process_obj.stdout = self.submit_response + submit_process_obj.returncode = 0 + submit_process.return_value = submit_process_obj + slurm_id = self.slurm_client.submit([command], args, stdout_file, self.example_job_id, self.example_partion, {}) + expected_command = ( + [ + "sbatch", + f"--partition={self.example_partion}", + f"--wckey={self.example_job_id}", + f"--output={self.slurm_client.logfileName}", + ] + + args + + [f"--wrap=exec {command}"] + ) + self.assertEqual(f"{slurm_id}", self.example_id) + self.assertEqual(submit_process.call_args[0][0], expected_command) + + @patch("subprocess.run") + def test_submit_with_args(self, submit_process): + """ + Test SLURM submit with mem and walltime args + """ + command = "ls" + args = [] + 
stdout_file = f"{self.slurm_client.logfileName}" + submit_process_obj = Mock() + submit_process_obj.stdout = self.submit_response + submit_process_obj.returncode = 0 + submit_process.return_value = submit_process_obj + expected_limit = 10 + mem_limit = 8 + args = self.slurm_client.set_walltime(expected_limit, None) + args.extend(self.slurm_client.set_memlimit(mem_limit)) + + slurm_id = self.slurm_client.submit([command], args, stdout_file, self.example_job_id, self.example_partion, {}) + expected_command = ( + [ + "sbatch", + f"--partition={self.example_partion}", + f"--wckey={self.example_job_id}", + f"--output={self.slurm_client.logfileName}", + ] + + args + + [f"--wrap=exec {command}"] + ) + self.assertEqual(f"{slurm_id}", self.example_id) + self.assertEqual(submit_process.call_args[0][0], expected_command) + + @patch("subprocess.run") + def test_terminate(self, terminate_process): + """ + Test SLURM terminate + """ + terminate_process_obj = Mock() + terminate_process_obj.returncode = 0 + terminate_process.return_value = terminate_process_obj + expected_command = ["scancel", f"--wckey={self.example_job_id}"] + terminated = self.slurm_client.terminate(self.example_job_id) + self.assertEqual(terminate_process.call_args[0][0], expected_command) + self.assertEqual(terminated, True) + + @patch("subprocess.run") + def test_suspend(self, suspend_process): + """ + Test SLURM suspend + """ + sacct_process_obj = Mock() + sacct_process_obj.stdout = f"{self.example_id}" + sacct_process_obj.returncode = 0 + scontrol_process_obj = Mock() + scontrol_process_obj.returncode = 0 + suspend_process.side_effect = [sacct_process_obj, scontrol_process_obj] + expected_command = ["scontrol", "suspend", f"{self.example_id}"] + suspended = self.slurm_client.suspend(self.example_job_id) + self.assertEqual(suspend_process.call_args[0][0], expected_command) + self.assertEqual(suspended, True) + + @patch("subprocess.run") + def test_resume(self, resume_process): + """ + Test SLURM resume + 
""" + sacct_process_obj = Mock() + sacct_process_obj.stdout = f"{self.example_id}" + sacct_process_obj.returncode = 0 + scontrol_process_obj = Mock() + scontrol_process_obj.returncode = 0 + resume_process.side_effect = [sacct_process_obj, scontrol_process_obj] + expected_command = ["scontrol", "resume", f"{self.example_id}"] + resumed = self.slurm_client.resume(self.example_job_id) + self.assertEqual(resume_process.call_args[0][0], expected_command) + self.assertEqual(resumed, True) + + @patch("subprocess.run") + def test_failed_status(self, status_process): + """ + Test SLURM failed status + """ + status_process_obj = Mock() + status_process_obj.returncode = 0 + status_process_obj.stdout = self.status_failed_response + status_process.return_value = status_process_obj + status = self.slurm_client.status(self.example_id) + expected_status = Status.FAILED, self.exit_reason + self.assertEqual(status, expected_status) + + @patch("subprocess.run") + def test_pend_status(self, status_process): + """ + Test SLURM pending status + """ + status_process_obj = Mock() + status_process_obj.returncode = 0 + status_process_obj.stdout = self.status_pend_response + status_process.return_value = status_process_obj + status = self.slurm_client.status(self.example_id) + expected_status = Status.PENDING, self.pend_reason + self.assertEqual(status, expected_status) diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 94b9019e..c953261c 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -1,9 +1,9 @@ from unittest import skip -from django.test import TestCase +from django.test import TestCase, override_settings from orchestrator.models import Job, Status, PipelineType from orchestrator.tasks import ( prepare_job, - submit_job_to_lsf, + submit_job_to_batch_system, process_jobs, cleanup_completed_jobs, cleanup_failed_jobs, @@ -13,10 +13,10 @@ from mock import patch, call import uuid from batch_systems.lsf_client.lsf_client import format_lsf_job_id - from 
submitter.toil_submitter import ToilJobSubmitter MAX_RUNNING_JOBS = 3 +DEFAULT_MEMLIMIT = 5 class TestTasks(TestCase): @@ -28,24 +28,46 @@ def setUp(self): self.submitting_job = Job.objects.filter(status=Status.SUBMITTING).first() @patch("submitter.toil_submitter.toil_jobsubmitter.ToilJobSubmitter.__init__") - @patch("orchestrator.tasks.submit_job_to_lsf") + @patch("orchestrator.tasks.submit_job_to_batch_system") @patch("batch_systems.lsf_client.lsf_client.LSFClient.submit") @skip("Need to mock memcached lock") - def test_submit_polling(self, job_submitter, submit_job_to_lsf, init): - init.return_value = None - job_submitter.return_value = ( - self.current_job.external_id, - self.current_job.job_store_location, - self.current_job.working_dir, - self.current_job.output_directory, - ) - submit_job_to_lsf.return_value = None - created_jobs = len(Job.objects.filter(status=Status.CREATED)) - process_jobs() - self.assertEqual(submit_job_to_lsf.delay.call_count, created_jobs) - submit_job_to_lsf.reset_mock() - process_jobs() - self.assertEqual(submit_job_to_lsf.delay.call_count, 0) + def test_submit_polling_lsf(self, job_submitter, submit_job_to_batch_system, init): + with override_settings(BATCH_SYSTEM="LSF"): + init.return_value = None + job_submitter.return_value = ( + self.current_job.external_id, + self.current_job.job_store_location, + self.current_job.working_dir, + self.current_job.output_directory, + ) + submit_job_to_batch_system.return_value = None + created_jobs = len(Job.objects.filter(status=Status.CREATED)) + process_jobs() + self.assertEqual(submit_job_to_batch_system.delay.call_count, created_jobs) + submit_job_to_batch_system.reset_mock() + process_jobs() + self.assertEqual(submit_job_to_batch_system.delay.call_count, 0) + + @patch("submitter.toil_submitter.toil_jobsubmitter.ToilJobSubmitter.__init__") + @patch("orchestrator.tasks.submit_job_to_batch_system") + @patch("batch_systems.slurm_client.slurm_client.SLURMClient.submit") + @skip("Need to mock 
memcached lock") + def test_submit_polling_slurm(self, job_submitter, submit_job_to_batch_system, init): + with override_settings(BATCH_SYSTEM="SLURM"): + init.return_value = None + job_submitter.return_value = ( + self.current_job.external_id, + self.current_job.job_store_location, + self.current_job.working_dir, + self.current_job.output_directory, + ) + submit_job_to_batch_system.return_value = None + created_jobs = len(Job.objects.filter(status=Status.CREATED)) + process_jobs() + self.assertEqual(submit_job_to_batch_system.delay.call_count, created_jobs) + submit_job_to_batch_system.reset_mock() + process_jobs() + self.assertEqual(submit_job_to_batch_system.delay.call_count, 0) @patch("submitter.toil_submitter.toil_jobsubmitter.ToilJobSubmitter.prepare_to_submit") def test_prepare_job(self, prepare_to_submit): @@ -63,216 +85,463 @@ def test_prepare_job(self, prepare_to_submit): @patch("batch_systems.lsf_client.lsf_client.LSFClient.submit") @patch("orchestrator.tasks.save_job_info") - def test_submit(self, save_job_info, submit): - save_job_info.return_value = None - submit.return_value = self.submitting_job.external_id - submit_job_to_lsf(self.submitting_job) - self.submitting_job.refresh_from_db() - self.assertEqual(self.submitting_job.finished, None) - self.assertEqual(self.submitting_job.status, Status.SUBMITTED) - - def test_job_args(self): - job_id = str(uuid.uuid4()) - app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} - root_dir = "test_root" - resume_jobstore = None - walltime = None - tool_walltime = None - memlimit = None - inputs = {} - expected_job_group = "-g {}".format(format_lsf_job_id(job_id)) - jobsubmitterObject = ToilJobSubmitter( - job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit - ) - job_group = " ".join(jobsubmitterObject._job_group()) - self.assertEqual(job_group, expected_job_group) - - def test_job_args_walltime(self): - job_id = str(uuid.uuid4()) - app = {"github": 
{"repository": "awesome_repo", "entrypoint": "test.cwl"}} - root_dir = "test_root" - resume_jobstore = None - walltime = 7200 - tool_walltime = 24 - memlimit = None - inputs = {} - expected_job_args = "-W {}".format(walltime) - jobsubmitterObject = ToilJobSubmitter( - job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit - ) - leader_args_list = jobsubmitterObject._leader_args() - leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) - self.assertEqual(leader_args, expected_job_args) - - def test_job_args_tool_walltime(self): - job_id = str(uuid.uuid4()) - app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} - root_dir = "test_root" - resume_jobstore = None - walltime = 7200 - tool_walltime = 24 - walltime_hard = 24 - walltime_expected = 8 - memlimit = None - inputs = {} - expected_tool_args = "-We {} -W {}".format(walltime_expected, walltime_hard) - jobsubmitterObject = ToilJobSubmitter( - job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit - ) - tool_args_list = jobsubmitterObject._tool_args() - tool_args = " ".join([str(single_arg) for single_arg in tool_args_list]) - self.assertEqual(tool_args, expected_tool_args) - - def test_job_args_memlimit(self): - job_id = str(uuid.uuid4()) - app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} - root_dir = "test_root" - resume_jobstore = None - walltime = None - tool_walltime = None - memlimit = 10 - inputs = {} - expected_leader_args = "-M {}".format(memlimit) - jobsubmitterObject = ToilJobSubmitter( - job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit - ) - leader_args_list = jobsubmitterObject._leader_args() - leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) - self.assertEqual(leader_args, expected_leader_args) - - def test_job_args_all_options(self): - job_id = str(uuid.uuid4()) - app = {"github": {"repository": "awesome_repo", 
"entrypoint": "test.cwl"}} - root_dir = "test_root" - resume_jobstore = None - walltime = 7200 - tool_walltime = 24 - tool_walltime = 24 - walltime_hard = 24 - walltime_expected = 8 - memlimit = 10 - inputs = {} - expected_leader_args = "-W {} -M {}".format(walltime, memlimit) - expected_job_group = "-g {}".format(format_lsf_job_id(job_id)) - expected_tool_args = "-We {} -W {} -M {}".format(walltime_expected, walltime_hard, memlimit) - jobsubmitterObject = ToilJobSubmitter( - job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit - ) - leader_args_list = jobsubmitterObject._leader_args() - leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) - job_group = " ".join(jobsubmitterObject._job_group()) - tool_args_list = jobsubmitterObject._tool_args() - tool_args = " ".join([str(single_arg) for single_arg in tool_args_list]) - self.assertEqual(leader_args, expected_leader_args) - self.assertEqual(job_group, expected_job_group) - self.assertEqual(tool_args, expected_tool_args) + def test_submit_lsf(self, save_job_info, submit): + with override_settings(BATCH_SYSTEM="LSF"): + save_job_info.return_value = None + submit.return_value = self.submitting_job.external_id + submit_job_to_batch_system(self.submitting_job) + self.submitting_job.refresh_from_db() + self.assertEqual(self.submitting_job.finished, None) + self.assertEqual(self.submitting_job.status, Status.SUBMITTED) + + @patch("batch_systems.slurm_client.slurm_client.SLURMClient.submit") + @patch("orchestrator.tasks.save_job_info") + def test_submit_slurm(self, save_job_info, submit): + with override_settings(BATCH_SYSTEM="SLURM"): + save_job_info.return_value = None + submit.return_value = self.submitting_job.external_id + submit_job_to_batch_system(self.submitting_job) + self.submitting_job.refresh_from_db() + self.assertEqual(self.submitting_job.finished, None) + self.assertEqual(self.submitting_job.status, Status.SUBMITTED) + + def test_job_args_lsf(self): + with 
override_settings(BATCH_SYSTEM="LSF"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = None + tool_walltime = None + memlimit = None + inputs = {} + expected_job_group = "-g {}".format(format_lsf_job_id(job_id)) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + job_group = " ".join(jobsubmitterObject._job_group()) + self.assertEqual(job_group, expected_job_group) + + def test_job_args_slurm(self): + with override_settings(BATCH_SYSTEM="SLURM"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = None + tool_walltime = None + memlimit = None + inputs = {} + expected_job_group = "--wckey={}".format(job_id) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + job_group = " ".join(jobsubmitterObject._job_group()) + self.assertEqual(job_group, expected_job_group) + + def test_job_args_walltime_lsf(self): + with override_settings(BATCH_SYSTEM="LSF"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = 7200 + tool_walltime = 24 + memlimit = None + inputs = {} + expected_job_args = "-W {} -M {}".format(walltime, DEFAULT_MEMLIMIT) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + leader_args_list = jobsubmitterObject._leader_args() + leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) + self.assertEqual(leader_args, expected_job_args) + + def test_job_args_walltime_slurm(self): + with override_settings(BATCH_SYSTEM="SLURM"): + job_id = str(uuid.uuid4()) + app = 
{"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = 7200 + tool_walltime = 24 + memlimit = None + inputs = {} + expected_job_args = "--time={} --mem={}G".format(walltime, DEFAULT_MEMLIMIT) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + leader_args_list = jobsubmitterObject._leader_args() + leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) + self.assertEqual(leader_args, expected_job_args) + + def test_job_args_tool_walltime_lsf(self): + with override_settings(BATCH_SYSTEM="LSF"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = 7200 + tool_walltime = 24 + walltime_hard = 24 + walltime_expected = 8 + memlimit = None + inputs = {} + expected_tool_args = "-We {} -W {}".format(walltime_expected, walltime_hard) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + tool_args_list = jobsubmitterObject._tool_args() + tool_args = " ".join([str(single_arg) for single_arg in tool_args_list]) + self.assertEqual(tool_args, expected_tool_args) + + def test_job_args_tool_walltime_slurm(self): + with override_settings(BATCH_SYSTEM="SLURM"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = 7200 + tool_walltime = 24 + walltime_hard = 24 + memlimit = None + inputs = {} + expected_tool_args = "--time={}".format(walltime_hard) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + tool_args_list = jobsubmitterObject._tool_args() + tool_args = " ".join([str(single_arg) for single_arg in tool_args_list]) + 
self.assertEqual(tool_args, expected_tool_args) + + def test_job_args_default_memlimit_lsf(self): + with override_settings(BATCH_SYSTEM="LSF"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = None + tool_walltime = None + memlimit = None + inputs = {} + expected_leader_args = "-M {}".format(DEFAULT_MEMLIMIT) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + leader_args_list = jobsubmitterObject._leader_args() + leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) + self.assertEqual(leader_args, expected_leader_args) + + def test_job_args_memlimit_lsf(self): + with override_settings(BATCH_SYSTEM="LSF"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = None + tool_walltime = None + memlimit = 10 + inputs = {} + expected_leader_args = "-M {}".format(memlimit) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + leader_args_list = jobsubmitterObject._leader_args() + leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) + self.assertEqual(leader_args, expected_leader_args) + + def test_job_args_memlimit_slurm(self): + with override_settings(BATCH_SYSTEM="SLURM"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = None + tool_walltime = None + memlimit = 10 + inputs = {} + expected_leader_args = "--mem={}G".format(memlimit) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + leader_args_list = jobsubmitterObject._leader_args() + 
leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) + self.assertEqual(leader_args, expected_leader_args) + + def test_job_args_default_memlimit_slurm(self): + with override_settings(BATCH_SYSTEM="SLURM"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = None + tool_walltime = None + memlimit = None + inputs = {} + expected_leader_args = "--mem={}G".format(DEFAULT_MEMLIMIT) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + leader_args_list = jobsubmitterObject._leader_args() + leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) + self.assertEqual(leader_args, expected_leader_args) + + def test_job_args_all_options_lsf(self): + with override_settings(BATCH_SYSTEM="LSF"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = 7200 + tool_walltime = 24 + walltime_hard = 24 + walltime_expected = 8 + memlimit = 10 + inputs = {} + expected_leader_args = "-W {} -M {}".format(walltime, memlimit) + expected_job_group = "-g {}".format(format_lsf_job_id(job_id)) + expected_tool_args = "-We {} -W {}".format(walltime_expected, walltime_hard) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + leader_args_list = jobsubmitterObject._leader_args() + leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) + job_group = " ".join(jobsubmitterObject._job_group()) + tool_args_list = jobsubmitterObject._tool_args() + tool_args = " ".join([str(single_arg) for single_arg in tool_args_list]) + self.assertEqual(leader_args, expected_leader_args) + self.assertEqual(job_group, expected_job_group) + self.assertEqual(tool_args, 
expected_tool_args) + + def test_job_args_all_options_slurm(self): + with override_settings(BATCH_SYSTEM="SLURM"): + job_id = str(uuid.uuid4()) + app = {"github": {"repository": "awesome_repo", "entrypoint": "test.cwl"}} + root_dir = "test_root" + resume_jobstore = None + walltime = 7200 + tool_walltime = 24 + walltime_hard = 24 + memlimit = 10 + inputs = {} + expected_leader_args = "--time={} --mem={}G".format(walltime, memlimit) + expected_job_group = "--wckey={}".format(job_id) + expected_tool_args = "--time={}".format(walltime_hard) + jobsubmitterObject = ToilJobSubmitter( + job_id, app, inputs, root_dir, resume_jobstore, walltime, tool_walltime, memlimit + ) + leader_args_list = jobsubmitterObject._leader_args() + leader_args = " ".join([str(single_arg) for single_arg in leader_args_list]) + job_group = " ".join(jobsubmitterObject._job_group()) + tool_args_list = jobsubmitterObject._tool_args() + tool_args = " ".join([str(single_arg) for single_arg in tool_args_list]) + self.assertEqual(leader_args, expected_leader_args) + self.assertEqual(job_group, expected_job_group) + self.assertEqual(tool_args, expected_tool_args) @patch("orchestrator.tasks.command_processor.delay") @patch("orchestrator.tasks.get_job_info_path") @patch("batch_systems.lsf_client.lsf_client.LSFClient.status") @patch("submitter.toil_submitter.ToilJobSubmitter.get_outputs") @patch("orchestrator.tasks.set_permissions_job.delay") - def test_complete(self, permission, get_outputs, status, get_job_info_path, command_processor): - self.current_job.status = Status.PENDING - self.current_job.save() - permission.return_value = None - command_processor.return_value = True - get_outputs.return_value = {"outputs": True}, None - get_job_info_path.return_value = "sample/job/path" - status.return_value = Status.COMPLETED, None - check_job_status(self.current_job) - self.current_job.refresh_from_db() - self.assertEqual(self.current_job.status, Status.SET_PERMISSIONS) - 
self.assertNotEqual(self.current_job.finished, None) + def test_complete_lsf(self, permission, get_outputs, status, get_job_info_path, command_processor): + with override_settings(BATCH_SYSTEM="LSF"): + self.current_job.status = Status.PENDING + self.current_job.save() + permission.return_value = None + command_processor.return_value = True + get_outputs.return_value = {"outputs": True}, None + get_job_info_path.return_value = "sample/job/path" + status.return_value = Status.COMPLETED, None + check_job_status(self.current_job) + self.current_job.refresh_from_db() + self.assertEqual(self.current_job.status, Status.SET_PERMISSIONS) + self.assertNotEqual(self.current_job.finished, None) + + @patch("orchestrator.tasks.command_processor.delay") + @patch("orchestrator.tasks.get_job_info_path") + @patch("batch_systems.slurm_client.slurm_client.SLURMClient.status") + @patch("submitter.toil_submitter.ToilJobSubmitter.get_outputs") + @patch("orchestrator.tasks.set_permissions_job.delay") + def test_complete_slurm(self, permission, get_outputs, status, get_job_info_path, command_processor): + with override_settings(BATCH_SYSTEM="SLURM"): + self.current_job.status = Status.PENDING + self.current_job.save() + permission.return_value = None + command_processor.return_value = True + get_outputs.return_value = {"outputs": True}, None + get_job_info_path.return_value = "sample/job/path" + status.return_value = Status.COMPLETED, None + check_job_status(self.current_job) + self.current_job.refresh_from_db() + self.assertEqual(self.current_job.status, Status.SET_PERMISSIONS) + self.assertNotEqual(self.current_job.finished, None) @patch("orchestrator.tasks.command_processor.delay") @patch("orchestrator.tasks.get_job_info_path") @patch("batch_systems.lsf_client.lsf_client.LSFClient.status") - def test_fail(self, status, get_job_info_path, command_processor): - self.current_job.status = Status.PENDING - self.current_job.save() - command_processor.return_value = True - 
get_job_info_path.return_value = "sample/job/path" - status.return_value = Status.FAILED, "submitter reason" - check_job_status(self.current_job) - self.current_job.refresh_from_db() - self.assertEqual(self.current_job.status, Status.FAILED) - self.assertNotEqual(self.current_job.finished, None) - info_message = self.current_job.message["info"] - failed_jobs = self.current_job.message["failed_jobs"] - unknown_jobs = self.current_job.message["unknown_jobs"] - expected_failed_jobs = { - "failed_job_1": ["failed_job_1_id"], - "failed_job_2": ["failed_job_2_id"], - "running_job": ["running_job_id"], - } - expected_unknown_jobs = {"unknown_job": ["unknown_job_id_1", "unknown_job_id_2"]} - self.assertEqual(info_message, "submitter reason") - self.assertEqual(failed_jobs, expected_failed_jobs) - self.assertEqual(unknown_jobs, expected_unknown_jobs) + def test_fail_lsf(self, status, get_job_info_path, command_processor): + with override_settings(BATCH_SYSTEM="LSF"): + self.current_job.status = Status.PENDING + self.current_job.save() + command_processor.return_value = True + get_job_info_path.return_value = "sample/job/path" + status.return_value = Status.FAILED, "submitter reason" + check_job_status(self.current_job) + self.current_job.refresh_from_db() + self.assertEqual(self.current_job.status, Status.FAILED) + self.assertNotEqual(self.current_job.finished, None) + info_message = self.current_job.message["info"] + failed_jobs = self.current_job.message["failed_jobs"] + unknown_jobs = self.current_job.message["unknown_jobs"] + expected_failed_jobs = { + "failed_job_1": ["failed_job_1_id"], + "failed_job_2": ["failed_job_2_id"], + "running_job": ["running_job_id"], + } + expected_unknown_jobs = {"unknown_job": ["unknown_job_id_1", "unknown_job_id_2"]} + self.assertEqual(info_message, "submitter reason") + self.assertEqual(failed_jobs, expected_failed_jobs) + self.assertEqual(unknown_jobs, expected_unknown_jobs) + + @patch("orchestrator.tasks.command_processor.delay") + 
@patch("orchestrator.tasks.get_job_info_path") + @patch("batch_systems.slurm_client.slurm_client.SLURMClient.status") + def test_fail_slurm(self, status, get_job_info_path, command_processor): + with override_settings(BATCH_SYSTEM="SLURM"): + self.current_job.status = Status.PENDING + self.current_job.save() + command_processor.return_value = True + get_job_info_path.return_value = "sample/job/path" + status.return_value = Status.FAILED, "submitter reason" + check_job_status(self.current_job) + self.current_job.refresh_from_db() + self.assertEqual(self.current_job.status, Status.FAILED) + self.assertNotEqual(self.current_job.finished, None) + info_message = self.current_job.message["info"] + failed_jobs = self.current_job.message["failed_jobs"] + unknown_jobs = self.current_job.message["unknown_jobs"] + expected_failed_jobs = { + "failed_job_1": ["failed_job_1_id"], + "failed_job_2": ["failed_job_2_id"], + "running_job": ["running_job_id"], + } + expected_unknown_jobs = {"unknown_job": ["unknown_job_id_1", "unknown_job_id_2"]} + self.assertEqual(info_message, "submitter reason") + self.assertEqual(failed_jobs, expected_failed_jobs) + self.assertEqual(unknown_jobs, expected_unknown_jobs) @patch("orchestrator.tasks.command_processor.delay") @patch("orchestrator.tasks.get_job_info_path") @patch("batch_systems.lsf_client.lsf_client.LSFClient.status") - def test_running(self, status, get_job_info_path, command_processor): - self.current_job.status = Status.PENDING - self.current_job.save() - command_processor.return_value = True - get_job_info_path.return_value = "sample/job/path" - status.return_value = Status.RUNNING, None - check_job_status(self.current_job) - self.current_job.refresh_from_db() - self.assertEqual(self.current_job.status, Status.RUNNING) - self.assertNotEqual(self.current_job.started, None) - self.assertEqual(self.current_job.finished, None) + def test_running_lsf(self, status, get_job_info_path, command_processor): + with 
override_settings(BATCH_SYSTEM="LSF"): + self.current_job.status = Status.PENDING + self.current_job.save() + command_processor.return_value = True + get_job_info_path.return_value = "sample/job/path" + status.return_value = Status.RUNNING, None + check_job_status(self.current_job) + self.current_job.refresh_from_db() + self.assertEqual(self.current_job.status, Status.RUNNING) + self.assertNotEqual(self.current_job.started, None) + self.assertEqual(self.current_job.finished, None) + + @patch("orchestrator.tasks.command_processor.delay") + @patch("orchestrator.tasks.get_job_info_path") + @patch("batch_systems.slurm_client.slurm_client.SLURMClient.status") + def test_running_slurm(self, status, get_job_info_path, command_processor): + with override_settings(BATCH_SYSTEM="SLURM"): + self.current_job.status = Status.PENDING + self.current_job.save() + command_processor.return_value = True + get_job_info_path.return_value = "sample/job/path" + status.return_value = Status.RUNNING, None + check_job_status(self.current_job) + self.current_job.refresh_from_db() + self.assertEqual(self.current_job.status, Status.RUNNING) + self.assertNotEqual(self.current_job.started, None) + self.assertEqual(self.current_job.finished, None) @patch("orchestrator.tasks.command_processor.delay") @patch("batch_systems.lsf_client.lsf_client.LSFClient.status") @skip("We are no longer failing tests on pending status, and instead letting the task fail it") - def test_fail_not_submitted(self, status, command_processor): - command_processor.return_value = True - status.return_value = Status.PENDING, None - self.current_job.status = Status.PENDING - self.current_job.external_id = None - self.current_job.save() - check_job_status(self.current_job) - self.current_job.refresh_from_db() - self.assertEqual(self.current_job.status, Status.FAILED) - self.assertNotEqual(self.current_job.finished, None) - info_message = self.current_job.message["info"] - failed_jobs = self.current_job.message["failed_jobs"] - 
unknown_jobs = self.current_job.message["unknown_jobs"] - expected_failed_jobs = {} - expected_unknown_jobs = {} - self.assertTrue("External id not provided" in info_message) - self.assertEqual(failed_jobs, expected_failed_jobs) - self.assertEqual(unknown_jobs, expected_unknown_jobs) + def test_fail_not_submitted_lsf(self, status, command_processor): + with override_settings(BATCH_SYSTEM="LSF"): + command_processor.return_value = True + status.return_value = Status.PENDING, None + self.current_job.status = Status.PENDING + self.current_job.external_id = None + self.current_job.save() + check_job_status(self.current_job) + self.current_job.refresh_from_db() + self.assertEqual(self.current_job.status, Status.FAILED) + self.assertNotEqual(self.current_job.finished, None) + info_message = self.current_job.message["info"] + failed_jobs = self.current_job.message["failed_jobs"] + unknown_jobs = self.current_job.message["unknown_jobs"] + expected_failed_jobs = {} + expected_unknown_jobs = {} + self.assertTrue("External id not provided" in info_message) + self.assertEqual(failed_jobs, expected_failed_jobs) + self.assertEqual(unknown_jobs, expected_unknown_jobs) @patch("orchestrator.tasks.cleanup_folders") - def test_cleanup(self, cleanup_folders): - Job.objects.create( - type=PipelineType.CWL, - app={"app": "link"}, - status=Status.COMPLETED, - created_date=datetime.now() - timedelta(days=1), - finished=datetime.now() - timedelta(days=1), - ) - testtime = datetime.now() - timedelta(days=32) - with patch("django.utils.timezone.now") as mock_now: - mock_now.return_value = testtime - job_old_completed = Job.objects.create( - type=PipelineType.CWL, app={"app": "link"}, status=Status.COMPLETED, finished=testtime + def test_cleanup_lsf(self, cleanup_folders): + with override_settings(BATCH_SYSTEM="LSF"): + Job.objects.create( + type=PipelineType.CWL, + app={"app": "link"}, + status=Status.COMPLETED, + created_date=datetime.now() - timedelta(days=1), + finished=datetime.now() - 
timedelta(days=1), ) - job_old_failed = Job.objects.create( - type=PipelineType.CWL, app={"app": "link"}, status=Status.FAILED, finished=testtime + testtime = datetime.now() - timedelta(days=32) + with patch("django.utils.timezone.now") as mock_now: + mock_now.return_value = testtime + job_old_completed = Job.objects.create( + type=PipelineType.CWL, app={"app": "link"}, status=Status.COMPLETED, finished=testtime + ) + job_old_failed = Job.objects.create( + type=PipelineType.CWL, app={"app": "link"}, status=Status.FAILED, finished=testtime + ) + + cleanup_completed_jobs() + cleanup_failed_jobs() + + calls = [ + call(str(job_old_completed.id), exclude=["input.json", "lsf.log"]), + call(str(job_old_failed.id), exclude=["input.json", "lsf.log"]), + ] + + cleanup_folders.delay.assert_has_calls(calls, any_order=True) + + @patch("orchestrator.tasks.cleanup_folders") + def test_cleanup_slurm(self, cleanup_folders): + with override_settings(BATCH_SYSTEM="SLURM"): + Job.objects.create( + type=PipelineType.CWL, + app={"app": "link"}, + status=Status.COMPLETED, + created_date=datetime.now() - timedelta(days=1), + finished=datetime.now() - timedelta(days=1), ) + testtime = datetime.now() - timedelta(days=32) + with patch("django.utils.timezone.now") as mock_now: + mock_now.return_value = testtime + job_old_completed = Job.objects.create( + type=PipelineType.CWL, app={"app": "link"}, status=Status.COMPLETED, finished=testtime + ) + job_old_failed = Job.objects.create( + type=PipelineType.CWL, app={"app": "link"}, status=Status.FAILED, finished=testtime + ) - cleanup_completed_jobs() - cleanup_failed_jobs() + cleanup_completed_jobs() + cleanup_failed_jobs() - calls = [ - call(str(job_old_completed.id), exclude=["input.json", "lsf.log"]), - call(str(job_old_failed.id), exclude=["input.json", "lsf.log"]), - ] + calls = [ + call(str(job_old_completed.id), exclude=["input.json", "slurm.log"]), + call(str(job_old_failed.id), exclude=["input.json", "slurm.log"]), + ] - 
cleanup_folders.delay.assert_has_calls(calls, any_order=True) + cleanup_folders.delay.assert_has_calls(calls, any_order=True) diff --git a/travis_env.sh b/travis_env.sh index 262606c6..777aef30 100644 --- a/travis_env.sh +++ b/travis_env.sh @@ -23,26 +23,35 @@ export RIDGEBACK_RABBITMQ_PASSWORD=sample_password export ARGOS_JOB_STORE_ROOT=/sample_path export ARGOS_WORK_DIR_ROOT=/sample_path export ARGOS_TMP_DIR_ROOT=/sample_path +export ARGOS_PARTITION=sample_SLA export TEMPO_JOB_STORE_ROOT=/sample_path export TEMPO_WORK_DIR_ROOT=/sample_path export TEMPO_TMP_DIR_ROOT=/sample_path +export TEMPO_PARTITION=sample_SLA export ACCESS_JOB_STORE_ROOT=/sample_path export ACCESS_WORK_DIR_ROOT=/sample_path export ACCESS_TMP_DIR_ROOT=/sample_path +export ACCESS_PARTITION=sample_SLA export CMO_CH_JOB_STORE_ROOT=/sample_path export CMO_CH_WORK_DIR_ROOT=/sample_path export CMO_CH_TMP_DIR_ROOT=/sample_path +export CMO_CH_PARTITION=sample_SLA export ACCESS_HEME_JOB_STORE_ROOT=/sample_path export ACCESS_HEME_WORK_DIR_ROOT=/sample_path export ACCESS_HEME_TMP_DIR_ROOT=/sample_path +export ACCESS_HEME_PARTITION=sample_SLA +export MICROBIOME_JOB_STORE_ROOT=/sample_path +export MICROBIOME_WORK_DIR_ROOT=/sample_path +export MICROBIOME_TMP_DIR_ROOT=/sample_path +export MICROBIOME_PARTITION=sample_SLA export DEFAULT_JOB_STORE_ROOT=/sample_path export DEFAULT_WORK_DIR_ROOT=/sample_path export DEFAULT_TMP_DIR_ROOT=/sample_path +export DEFAULT_PARTITION=sample_SLA ### Set the LSF env variable export RIDGEBACK_LSF_WALLTIME=10:00 -export RIDGEBACK_LSF_SLA=sample_SLA ### Set the celery env variable