From 53b70f027bcb2e2474d9f512b1a44f3a8c007371 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Sun, 1 Mar 2026 20:06:26 +0530 Subject: [PATCH 01/11] Added retry for unreachable iDRACs --- utils/roles/idrac_pxe_boot/tasks/main.yml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/utils/roles/idrac_pxe_boot/tasks/main.yml b/utils/roles/idrac_pxe_boot/tasks/main.yml index 70fb4dde2c..7c153f4a24 100644 --- a/utils/roles/idrac_pxe_boot/tasks/main.yml +++ b/utils/roles/idrac_pxe_boot/tasks/main.yml @@ -19,11 +19,14 @@ idrac_password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" validate_certs: false register: lc_check_status - ignore_errors: true - -- name: Set reboot type - ansible.builtin.set_fact: - reboot_type: "{{ 'ForceRestart' if force_restart else 'GracefulRestart' }}" + until: + - lc_check_status.lc_status_info.LCReady is defined + - lc_check_status.lc_status_info.LCReady + retries: 3 + delay: 5 + # ignore_errors: true + # free strategy should be used to avoid blocking on failed hosts + # but issue will be summarizing/syncing summary report of all at the end - name: IDRAC ops when ready when: @@ -50,7 +53,7 @@ username: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" validate_certs: false - reset_type: ForceRestart + reset_type: "{{ 'ForceRestart' if force_restart else 'GracefulRestart' }}" when: restart_host register: restart_op failed_when: false From 8a6d40bb053b2d10f32d7386a8bc618d45fe3901 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Mon, 2 Mar 2026 09:08:01 +0000 Subject: [PATCH 02/11] multi-node mpi jobs firewall permission --- ...-group-login_compiler_node_aarch64.yaml.j2 | 32 +++++++++++++++++++ ...i-group-login_compiler_node_x86_64.yaml.j2 | 32 +++++++++++++++++++ .../ci-group-login_node_aarch64.yaml.j2 | 32 +++++++++++++++++++ .../ci-group-login_node_x86_64.yaml.j2 | 32 
+++++++++++++++++++ .../ci-group-slurm_node_aarch64.yaml.j2 | 30 +++++++++++++++++ .../ci-group-slurm_node_x86_64.yaml.j2 | 30 +++++++++++++++++ .../roles/slurm_config/defaults/main.yml | 2 +- 7 files changed, 189 insertions(+), 1 deletion(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 36332685ac..fe6966c4be 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -315,6 +315,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 51121a2e82..1ee1fce5e1 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -317,6 +317,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index d6071116ac..cdea0cd340 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -171,6 +171,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 524553bd55..b744859381 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -174,6 +174,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 0f25c81307..981b283a99 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -386,6 +386,36 @@ firewall-cmd --permanent --add-service=ssh firewall-cmd --permanent --add-port="${SRUN_RANGE}"/tcp firewall-cmd --permanent --add-port="${SLURMD_PORT}"/tcp + + # Add PXE network to trusted zone for ORTE communication + echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + # Calculate PXE subnet using admin IP and netmask bits + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + firewall-cmd --reload echo "[INFO] Unmounting controller slurm.conf directory from $CTLD_SLURM_DIR_MNT" diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index d21fcf9c5c..b914abdecd 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -403,6 +403,36 @@ firewall-cmd --permanent --add-service=ssh firewall-cmd --permanent --add-port="${SRUN_RANGE}"/tcp firewall-cmd --permanent --add-port="${SLURMD_PORT}"/tcp + + # Add PXE network to trusted zone for ORTE communication + echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + # Calculate PXE subnet using admin IP and netmask bits + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + firewall-cmd --reload echo "[INFO] Unmounting controller slurm.conf directory from $CTLD_SLURM_DIR_MNT" diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index edee927fc9..68770c386c 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -19,7 +19,7 @@ slurmctld_service_default_path: '/usr/lib/systemd/system/slurmctld.service' slurmd_service_default_path: '/usr/lib/systemd/system/slurmd.service' slurmdbd_service_default_path: '/usr/lib/systemd/system/slurmdbd.service' sys_env_path: '/etc/environment' -default_real_memory: 884736 +default_real_memory: 864 default_threadspercore: 1 default_corespersocket: 72 default_sockets: 2 From 884011da830f7d28c07df8d83b3ffdfbe1cb5c5e Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Mon, 2 Mar 2026 15:45:07 +0530 Subject: [PATCH 03/11] Updated gpu default for slurm conf pxe boot with retries added hwloc for slurm packages --- .../roles/slurm_config/defaults/main.yml | 10 +++--- .../slurm_config/tasks/build_slurm_conf.yml | 6 ---- discovery/roles/slurm_config/tasks/confs.yml | 31 +++++++---------- discovery/roles/slurm_config/vars/main.yml | 5 --- .../aarch64/rhel/10.0/slurm_custom.json | 5 ++- .../config/x86_64/rhel/10.0/slurm_custom.json | 5 ++- 
utils/roles/idrac_pxe_boot/tasks/main.yml | 33 ++++++++++++------- utils/roles/idrac_pxe_boot/vars/main.yml | 3 ++ utils/set_pxe_boot.yml | 16 ++++++++- 9 files changed, 64 insertions(+), 50 deletions(-) diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index edee927fc9..aa256ac085 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -35,6 +35,7 @@ __default_config: ConstrainDevices: true ConstrainRAMSpace: true ConstrainSwapSpace: true + slurm: SlurmUser: "{{ slurm_user }}" SlurmctldPort: 6817 @@ -51,7 +52,10 @@ __default_config: PrologFlags: contain JobAcctGatherType: jobacct_gather/linux JobAcctGatherFrequency: 30 - SelectType: select/linear + SelectType: select/cons_tres + GresTypes: gpu + SelectTypeParameters: CR_Core_Memory + SlurmdParameters: l3cache_as_socket # Requires hwloc v2. SlurmctldLogFile: "/var/log/slurm/slurmctld.log" SlurmdLogFile: "/var/log/slurm/slurmd.log" SlurmctldPidFile: /var/run/slurmctld.pid @@ -62,6 +66,7 @@ __default_config: SlurmdTimeout: 300 Epilog: "/etc/slurm/epilog.d/logout_user.sh" PluginDir: "{{ plugin_slurm_dir }}" + MaxNodeCount: 65000 NodeSet: - NodeSet: "{{ slurm_partition_name }}" Feature: "{{ slurm_partition_name }}" @@ -73,9 +78,6 @@ __default_config: Nodes: ALL MaxTime: INFINITE State: UP - # S_P_ARRAY type paramater to be provided this way - # Epilog: - # - Epilog: "/etc/slurm/epilog.d/logout_user.sh" slurmdbd: AuthType: auth/munge LogFile: "/var/log/slurm/slurmdbd.log" diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index e322c4f3f1..d930f64418 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -46,12 +46,6 @@ when: node_params is defined and node_params no_log: "{{ _no_log }}" -- name: Add gpu parameters to slurm conf - 
ansible.builtin.set_fact: - apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" - when: gpu_params is defined and gpu_params - no_log: "{{ _no_log }}" - - name: Add dbd parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(dbd_slurm_conf))}) }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 1601a81334..77a7427a75 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -384,26 +384,17 @@ - name: Handle user choice - ABORT ansible.builtin.fail: - msg: | - ================================================================================ - PLAYBOOK ABORTED BY USER - ================================================================================ - - You chose to abort the playbook (Option A). - - Next Steps: - 1. Cancel running jobs manually: - {% for node in busy_nodes.keys() %} - scancel -w {{ node }} - {% endfor %} - - 2. Or wait for jobs to complete naturally - - 3. Re-run this playbook to remove the nodes - - Idle nodes (if any) have already been removed from the cluster. - - ================================================================================ + msg: + - "===============================================================================" + - "PLAYBOOK ABORTED BY USER" + - "===============================================================================" + - "You chose to abort the playbook (Option A)." + - "Next Steps:" + - "1. Cancel running jobs manually: 'scancel -w '" + - "2. Or wait for jobs to complete naturally" + - "3. Re-run this playbook to remove the nodes" + - "Idle nodes (if any) have already been removed from the cluster." 
+ - "===============================================================================" when: - busy_nodes is defined - busy_nodes | length > 0 diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 37e6ff9869..fca080a65e 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -61,11 +61,6 @@ db_dir: cmpt_dir: - /etc/slurm/epilog.d -gpu_slurm_conf: - GresTypes: gpu - SelectType: select/cons_tres - SelectTypeParameters: CR_Core_Memory - SlurmdParameters: l3cache_as_socket innodb_buffer_pool_size: 4G innodb_lock_wait_timeout: 900 conf_server: "--conf-server {{ ctld_list | join(',') }}" diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index db95f2f5fb..79380a7f80 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -7,9 +7,12 @@ {"package": "python3-firewall", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, + {"package": "hwloc", "type": "rpm", "repo_name": "aarch64_appstream"}, + {"package": "hwloc-libs", "type": "rpm", "repo_name": "aarch64_appstream"}, + {"package": "slurm", "type": "rpm", "repo_name": "aarch64_slurm_custom"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 2b33b0de90..f9f6983e86 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -5,9 +5,12 
@@ {"package": "firewalld", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, + {"package": "hwloc", "type": "rpm", "repo_name": "x86_64_appstream"}, + {"package": "hwloc-libs", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + {"package": "slurm", "type": "rpm", "repo_name": "x86_64_slurm_custom"}, + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/utils/roles/idrac_pxe_boot/tasks/main.yml b/utils/roles/idrac_pxe_boot/tasks/main.yml index 7c153f4a24..79f1ec3957 100644 --- a/utils/roles/idrac_pxe_boot/tasks/main.yml +++ b/utils/roles/idrac_pxe_boot/tasks/main.yml @@ -33,7 +33,7 @@ - lc_check_status is success - lc_check_status.lc_status_info.LCReady block: - - name: Set boot from pxe + - name: Set boot option from pxe dellemc.openmanage.idrac_boot: idrac_ip: "{{ inventory_hostname }}" idrac_user: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" @@ -72,37 +72,46 @@ - not (restart_op is changed) - name: Check LC availibility - ansible.builtin.fail: - msg: "{{ lc_check_fail_msg }}" - when: lc_check_status is failed or not (lc_check_status.lc_status_info.LCReady | default(false)) + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "{{ lc_check_fail_msg }}" + when: lc_check_status is unreachable or lc_check_status is failed or not (lc_check_status.lc_status_info.LCReady | default(false)) - name: Fail if PXE provisioning failed - ansible.builtin.fail: - msg: "{{ pxe_provisioning_fail_msg }}" + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "{{ pxe_provisioning_fail_msg }}" when: + - not reboot_failed - 
pxe_provisioning is defined - pxe_provisioning is failed - name: Fail if PXE provisioning target is unreachable - ansible.builtin.fail: - msg: "{{ unreachable_idrac_msg }}" + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "{{ unreachable_idrac_msg }}" when: + - not reboot_failed - pxe_provisioning is defined - pxe_provisioning is unreachable - name: Fail if power operation failed - ansible.builtin.fail: - msg: "Power operation failed on {{ inventory_hostname }}. Failed to restart server." + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "Power operation failed on {{ inventory_hostname }}. Failed to restart server." when: + - not reboot_failed - restart_host - not (restart_op is defined and restart_op is changed) - not (power_on_op is defined and power_on_op is changed) - name: Summarize PXE boot and power operation results - ansible.builtin.debug: - msg: >- + ansible.builtin.set_fact: + reboot_failed: false + reboot_status: >- PXE Boot: {{ 'OK' if pxe_provisioning is success else ('UNREACHABLE' if pxe_provisioning is unreachable else 'FAILED') }} | Power: {{ 'Restart OK' if (restart_op is defined and restart_op is changed) else ('On OK' if (power_on_op is defined and power_on_op is changed) else ('Skipped (no restart)' if not restart_host else 'FAILED')) }} + when: not reboot_failed diff --git a/utils/roles/idrac_pxe_boot/vars/main.yml b/utils/roles/idrac_pxe_boot/vars/main.yml index 53de8aa0e9..4bf99d7bfe 100644 --- a/utils/roles/idrac_pxe_boot/vars/main.yml +++ b/utils/roles/idrac_pxe_boot/vars/main.yml @@ -18,6 +18,9 @@ restart_host: true # Change to true for forceful reboot. by default graceful will happen force_restart: true +reboot_status: "PXE boot initiated but not completed." +reboot_failed: false + # Set boot source override mode. 
Valid values are once, continuous, or disabled boot_source_override_enabled: continuous diff --git a/utils/set_pxe_boot.yml b/utils/set_pxe_boot.yml index a46ebd7091..16e11534b0 100644 --- a/utils/set_pxe_boot.yml +++ b/utils/set_pxe_boot.yml @@ -54,13 +54,27 @@ # This configures Dell iDRAC BMCs to boot a host from PXE (network) and optionally reboots the server. # This will set the boot mode to pxe -# Note: Restart will not happen if the server is powered off, only pxe mode will be set. - name: Reboot Host via PXE hosts: bmc connection: local + strategy: host_pinned gather_facts: false roles: - role: idrac_pxe_boot # vars: # restart_host: false # By default restart will be true, set to false not to restart # force_restart: true # By default graceful_restart will happen, set to true to force restart + +- name: Synchronized Reporting + hosts: bmc + connection: local + gather_facts: false + tasks: + - name: Fail if reboot function failed + ansible.builtin.fail: + msg: "{{ reboot_status }}" + when: reboot_failed + + - name: Show passed iDRACs + ansible.builtin.debug: + msg: "{{ inventory_hostname }}: {{ reboot_status }}" From 748e34758db914ac5a428321d16649f99df364dd Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Mon, 2 Mar 2026 15:49:26 +0530 Subject: [PATCH 04/11] Pxe mapping x86 update --- examples/pxe_mapping_file.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index 4d1c4775ed..819ed1a738 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,6 +1,6 @@ FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 
+slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 login_node_x86_64,grp9,ABFG78,,login-compiler-node2,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 From d5a0bd7fde1f73bcf281e1564f8c25f507b8b918 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:51:51 +0530 Subject: [PATCH 05/11] Update main.yml Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> --- discovery/roles/slurm_config/defaults/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index aa256ac085..23879f67b6 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -35,7 +35,6 @@ __default_config: ConstrainDevices: true ConstrainRAMSpace: true ConstrainSwapSpace: true - slurm: SlurmUser: "{{ slurm_user }}" SlurmctldPort: 6817 From 2bfbbf5d32b078ff2e012a0280dfaec74ca0be52 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:01:52 +0530 Subject: [PATCH 06/11] Update pxe_mapping_file.csv Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> --- examples/pxe_mapping_file.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index 819ed1a738..f4c50517ac 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,6 +1,6 @@ FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP 
slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 login_node_x86_64,grp9,ABFG78,,login-compiler-node2,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 @@ -8,4 +8,4 @@ service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 From d192280167847ac8a270dc52e437688b30d9ef87 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:02:39 +0530 Subject: [PATCH 07/11] Update pxe_mapping_file.csv Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> From 2d6583fbee560ec76324f0152116e5911b11bed0 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:03:20 +0530 Subject: [PATCH 08/11] Update 
slurm_custom.json Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> --- input/config/aarch64/rhel/10.0/slurm_custom.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 79380a7f80..b0477f2a95 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -7,9 +7,6 @@ {"package": "python3-firewall", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, - {"package": "hwloc", "type": "rpm", "repo_name": "aarch64_appstream"}, - {"package": "hwloc-libs", "type": "rpm", "repo_name": "aarch64_appstream"}, - {"package": "slurm", "type": "rpm", "repo_name": "aarch64_slurm_custom"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } From fff42d3e9a9b4ff39c20d964c08315074f406f13 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:03:41 +0530 Subject: [PATCH 09/11] Update slurm_custom.json Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> --- input/config/x86_64/rhel/10.0/slurm_custom.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index f9f6983e86..e7962f4723 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -5,11 +5,8 @@ {"package": "firewalld", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": 
"x86_64_appstream"}, - {"package": "hwloc", "type": "rpm", "repo_name": "x86_64_appstream"}, - {"package": "hwloc-libs", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "slurm", "type": "rpm", "repo_name": "x86_64_slurm_custom"}, {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, From 75f2602e0669982bed96c214b06e561d05ac0b0d Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 3 Mar 2026 00:50:00 +0530 Subject: [PATCH 10/11] Delete when mix of busy and idle nodes, scenario for abort --- discovery/roles/slurm_config/tasks/confs.yml | 8 ++++++++ discovery/roles/slurm_config/tasks/remove_node.yml | 12 ++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 77a7427a75..2378682b7d 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -321,6 +321,14 @@ - busy_nodes | length > 0 - user_input.user_input | default('') | trim | upper == 'A' +- name: Empty Busy nodes for Force removal + ansible.builtin.set_fact: + busy_nodes: {} + when: + - busy_nodes is defined + - busy_nodes | length > 0 + - user_input.user_input | default('') | trim | upper == 'F' + - name: Remove nodes ansible.builtin.include_tasks: remove_node.yml when: diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index da6b2a72ae..ceb766898b 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -47,11 +47,19 @@ - "'slurm' in conf_merge_dict" - filtered_nodenames is defined +- name: Set partition nodes exactly as cmpt_list minus removed nodes + ansible.builtin.set_fact: + partition_nodes: "{{ cmpt_list | 
difference(nodes_in_normal_not_in_cmpt) | union(busy_nodes.keys() | default([]) | list) }}" + when: + - "'slurm' in conf_merge_dict" + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': ((cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | join(',')) - if (cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | length > 0 else slurm_partition_name}) + + [item | combine({'Nodes': (partition_nodes | join(',')) + if partition_nodes | length > 0 else slurm_partition_name}) if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: From b133a9d1241fdb44b0baad9be6959284901be753 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 3 Mar 2026 11:50:40 +0530 Subject: [PATCH 11/11] initialize --- discovery/roles/slurm_config/vars/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index fca080a65e..3cc8bdd6af 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -126,6 +126,7 @@ partition_params: MaxTime: "INFINITE" State: "UP" Default: "YES" +busy_nodes: {} openldap_dir_name: "openldap/" software_config_file: "{{ input_project_dir }}/software_config.json" omnia_run_tags: "{{ hostvars['localhost']['omnia_run_tags'] }}"