From 325552116b6204d8184827c8fdfbeff36c2f91ca Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Tue, 24 Feb 2026 13:09:53 +0000 Subject: [PATCH 01/41] User given slurm node info --- .../common_utils/data_validation.py | 7 + .../input_validation/schema/omnia_config.json | 44 ++++++ .../validation_flows/common_validation.py | 25 ++- .../slurm_config/tasks/build_slurm_conf.yml | 5 - discovery/roles/slurm_config/tasks/confs.yml | 119 +++++++++++++- .../tasks/read_node_homogeneous.yml | 46 ++++++ .../tasks/read_node_idrac_group.yml | 147 ++++++++++++++++++ .../tasks/read_slurm_hostnames.yml | 10 ++ 8 files changed, 395 insertions(+), 8 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/read_node_homogeneous.yml create mode 100644 discovery/roles/slurm_config/tasks/read_node_idrac_group.yml diff --git a/common/library/module_utils/input_validation/common_utils/data_validation.py b/common/library/module_utils/input_validation/common_utils/data_validation.py index fc60ba0dd7..065a2fbb1f 100644 --- a/common/library/module_utils/input_validation/common_utils/data_validation.py +++ b/common/library/module_utils/input_validation/common_utils/data_validation.py @@ -58,6 +58,13 @@ def schema(config): error_bucket.append("input data reading failed.") return error_bucket + # Normalize case-sensitive fields for omnia_config.yml + if "omnia_config" in input_file_path: + if "slurm_cluster" in input_data: + for cluster in input_data["slurm_cluster"]: + if "node_discovery_mode" in cluster and isinstance(cluster["node_discovery_mode"], str): + cluster["node_discovery_mode"] = cluster["node_discovery_mode"].lower() + # Load schema with open(schema_file_path, "r", encoding="utf-8") as schema_file: j_schema = json.load(schema_file) diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index ca7266124c..def60d51e4 100644 --- 
a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -23,6 +23,50 @@ "type": "boolean", "description": "Variable indicates whether a specific configuration file path under config_sources should be used as-is without merging" }, + "node_discovery_mode": { + "type": "string", + "enum": ["homogeneous", "heterogeneous"], + "description": "Node hardware discovery mode. 'homogeneous' for group-based discovery, 'heterogeneous' for individual node discovery. Default: heterogeneous" + }, + "node_hardware_defaults": { + "type": "object", + "description": "Hardware specifications for homogeneous node groups. Key is group name (grp0-grp100), value is hardware specs.", + "patternProperties": { + "^grp([0-9]|[1-9][0-9]|100)$": { + "type": "object", + "properties": { + "sockets": { + "type": "integer", + "minimum": 1, + "description": "Number of CPU sockets per node" + }, + "cores_per_socket": { + "type": "integer", + "minimum": 1, + "description": "Number of CPU cores per socket" + }, + "threads_per_core": { + "type": "integer", + "minimum": 1, + "description": "Number of CPU threads per core" + }, + "real_memory": { + "type": "integer", + "minimum": 1, + "description": "Memory in MB (exact value to use in Slurm)" + }, + "gres": { + "type": "string", + "pattern": "^gpu:[0-9]+$", + "description": "GPU resources in format 'gpu:N' (optional)" + } + }, + "required": ["sockets", "cores_per_socket", "threads_per_core", "real_memory"], + "additionalProperties": false + } + }, + "additionalProperties": false + }, "config_sources": { "type": "object", "description": "Config can be a file path or inline mapping", diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 36f55130d4..024189e967 100644 --- 
a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1074,7 +1074,30 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - + + # Validate node_hardware_defaults requires node_discovery_mode=homogeneous + for clst in data.get('slurm_cluster', []): + node_hardware_defaults = clst.get('node_hardware_defaults') + node_discovery_mode = clst.get('node_discovery_mode') + + # Normalize mode to lowercase for case-insensitive comparison + if node_discovery_mode and isinstance(node_discovery_mode, str): + node_discovery_mode = node_discovery_mode.lower() + + if node_hardware_defaults and len(node_hardware_defaults) > 0: + if not node_discovery_mode or node_discovery_mode != 'homogeneous': + group_names = list(node_hardware_defaults.keys()) + errors.append( + create_error_msg( + input_file_path, + "slurm_cluster configuration inconsistency", + f"'node_hardware_defaults' is specified for groups {group_names}, but 'node_discovery_mode' is not set to 'homogeneous'. " + f"Current mode: {node_discovery_mode if node_discovery_mode else 'not set (defaults to heterogeneous)'}. " + f"Either set 'node_discovery_mode: \"homogeneous\"' to use the hardware specifications, " + f"or remove 'node_hardware_defaults' to use heterogeneous discovery." 
+ )) + + cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] skip_merge_list = [clst.get('skip_merge', False) for clst in data.get('slurm_cluster')] diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index 84bb493442..e322c4f3f1 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -12,11 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -- name: Read NodeName parameters from iDRAC - ansible.builtin.include_tasks: read_node_idrac.yml - when: cmpt_list - loop: "{{ cmpt_list }}" - - name: Append node_params list into NodeName list ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 0db74bc166..2418c47680 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -24,6 +24,121 @@ - skip_merge | default(false) - configs_input is defined +- name: Initialize node_params collection + ansible.builtin.set_fact: + node_params: [] + +- name: Set discovery mode and get groups with specs + ansible.builtin.set_fact: + discovery_mode: "{{ (slurm_cluster[0].node_discovery_mode | default('heterogeneous')) | lower }}" + groups_with_specs: "{{ slurm_cluster[0].node_hardware_defaults | default({}) | list }}" + +- name: DEBUG - Show discovery configuration + ansible.builtin.debug: + msg: + - "Discovery Mode: {{ discovery_mode }}" + - "Groups with specs: {{ groups_with_specs }}" + - "Total compute nodes: {{ cmpt_list | length }}" + - "Node-to-hardware-group mapping: {{ name_hardware_group_map 
}}" + +- name: Categorize nodes for processing (homogeneous mode) + ansible.builtin.set_fact: + homogeneous_nodes: [] + sample_idrac_groups: {} + when: discovery_mode == 'homogeneous' + +- name: DEBUG - Show node group checking details + ansible.builtin.debug: + msg: + - "Node: {{ item }}" + - "Hardware Group: {{ name_hardware_group_map.get(item, 'NOT_FOUND') }}" + - "Groups with specs: {{ groups_with_specs }}" + - "Is in specs: {{ name_hardware_group_map.get(item, '') in groups_with_specs }}" + loop: "{{ cmpt_list }}" + when: + - discovery_mode == 'homogeneous' + - name_hardware_group_map | length > 0 + +- name: Build homogeneous nodes list (groups with specs) + ansible.builtin.set_fact: + homogeneous_nodes: >- + {{ + homogeneous_nodes + [item] + if (name_hardware_group_map.get(item, '') in groups_with_specs) + else homogeneous_nodes + }} + loop: "{{ cmpt_list }}" + when: + - discovery_mode == 'homogeneous' + - name_hardware_group_map | length > 0 + +- name: DEBUG - Show homogeneous nodes categorization + ansible.builtin.debug: + msg: + - "Homogeneous nodes (with user specs): {{ homogeneous_nodes | default([]) }}" + - "Nodes count: {{ homogeneous_nodes | default([]) | length }}" + - "Will process homogeneous task: {{ (discovery_mode == 'homogeneous' and homogeneous_nodes | length > 0) }}" + when: discovery_mode == 'homogeneous' + +- name: DEBUG - About to process homogeneous nodes + ansible.builtin.debug: + msg: "Processing {{ homogeneous_nodes | length }} nodes with user specs: {{ homogeneous_nodes }}" + when: + - discovery_mode == 'homogeneous' + - homogeneous_nodes | length > 0 + +- name: Process homogeneous groups with user specs (no iDRAC) + ansible.builtin.include_tasks: read_node_homogeneous.yml + loop: "{{ homogeneous_nodes }}" + loop_control: + loop_var: item + when: + - discovery_mode == 'homogeneous' + - homogeneous_nodes | length > 0 + +- name: Build sample iDRAC groups mapping (groups without specs) + ansible.builtin.set_fact: + sample_idrac_groups: 
>- + {{ + sample_idrac_groups | default({}) | + combine({ + name_hardware_group_map.get(item, ''): sample_idrac_groups.get(name_hardware_group_map.get(item, ''), []) + [item] + }) + }} + loop: "{{ cmpt_list }}" + when: + - discovery_mode == 'homogeneous' + - name_hardware_group_map | length > 0 + - name_hardware_group_map.get(item, '') not in groups_with_specs + +- name: Process homogeneous groups with user specs (no iDRAC) + ansible.builtin.include_tasks: read_node_homogeneous.yml + loop: "{{ homogeneous_nodes }}" + when: + - discovery_mode == 'homogeneous' + - homogeneous_nodes | length > 0 + +- name: Process homogeneous groups without specs (group iDRAC) + ansible.builtin.include_tasks: read_node_idrac_group.yml + loop: "{{ sample_idrac_groups | dict2items }}" + loop_control: + loop_var: group_item + when: + - discovery_mode == 'homogeneous' + - sample_idrac_groups | default({}) | length > 0 + +- name: Process heterogeneous nodes (individual iDRAC) + ansible.builtin.include_tasks: read_node_idrac.yml + loop: "{{ cmpt_list }}" + when: + - discovery_mode == 'heterogeneous' + +- name: DEBUG - Show final node_params before building slurm.conf + ansible.builtin.debug: + msg: + - "Total node_params entries: {{ node_params | length }}" + - "node_params: {{ node_params }}" + - name: Build slurm.conf ansible.builtin.include_tasks: build_slurm_conf.yml when: "'slurm' in conf_files" @@ -83,8 +198,8 @@ conf_merge_dict | default({}) | combine({ existing_conf_set.item.1: ( - [apply_config[existing_conf_set.item.1]] - + ([existing_conf_set.stat.path] if existing_conf_set.stat.exists else []) + ([existing_conf_set.stat.path] if existing_conf_set.stat.exists else []) + + [apply_config[existing_conf_set.item.1]] + ([parsed_configs_input.get(existing_conf_set.item.1)] if parsed_configs_input is defined and parsed_configs_input.get(existing_conf_set.item.1) else []) ) diff --git a/discovery/roles/slurm_config/tasks/read_node_homogeneous.yml 
b/discovery/roles/slurm_config/tasks/read_node_homogeneous.yml new file mode 100644 index 0000000000..16b9f77a26 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/read_node_homogeneous.yml @@ -0,0 +1,46 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Get group name and hardware specs for homogeneous node + ansible.builtin.set_fact: + group_name: "{{ name_hardware_group_map[item] }}" + group_specs: "{{ slurm_cluster[0].node_hardware_defaults[name_hardware_group_map[item]] }}" + +- name: DEBUG - Show group specs being applied + ansible.builtin.debug: + msg: + - "Node: {{ item }}" + - "Group: {{ group_name }}" + - "Sockets: {{ group_specs.sockets }}" + - "CoresPerSocket: {{ group_specs.cores_per_socket }}" + - "ThreadsPerCore: {{ group_specs.threads_per_core }}" + - "RealMemory: {{ group_specs.real_memory }}" + +- name: Build node parameters from user specs (no iDRAC) + ansible.builtin.set_fact: + proc_params: "{{ {'NodeName': item} + | combine({'Sockets': group_specs.sockets}) + | combine({'CoresPerSocket': group_specs.cores_per_socket}) + | combine({'ThreadsPerCore': group_specs.threads_per_core}) + | combine({'RealMemory': group_specs.real_memory}) + | combine({'Gres': group_specs.gres} if group_specs.gres is defined else {}) }}" + +- name: DEBUG - Show built proc_params + ansible.builtin.debug: + msg: "proc_params: {{ proc_params }}" + +- name: Add to Nodeparam dict + 
ansible.builtin.set_fact: + node_params: "{{ node_params + [proc_params] }}" + gpu_params: "{{ gpu_params | default({}) | combine({item: []} if group_specs.gres is defined else {}) }}" diff --git a/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml b/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml new file mode 100644 index 0000000000..d2353e3e76 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml @@ -0,0 +1,147 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Initialize group discovery variables + ansible.builtin.set_fact: + group_name: "{{ group_item.key }}" + group_nodes: "{{ group_item.value }}" + discovered_specs: {} + responsive_node: "" + +- name: Try each node in group until iDRAC responds + ansible.builtin.uri: + url: "https://{{ bmc_ip_map[item] }}/redfish/v1/Systems/System.Embedded.1" + user: "{{ bmc_username }}" + password: "{{ bmc_password }}" + method: GET + force_basic_auth: true + validate_certs: false + return_content: true + body_format: json + timeout: 60 + headers: + Accept: "application/json" + Content-Type: "application/json" + OData-Version: "4.0" + status_code: + - 200 + register: idrac_test + loop: "{{ group_nodes }}" + when: responsive_node == "" + register: idrac_results + +- name: Set responsive node from successful iDRAC call + ansible.builtin.set_fact: + responsive_node: "{{ item.item }}" + loop: "{{ idrac_results.results }}" + when: + - responsive_node == "" + - item.status == 200 + +- name: Read Processor information from responsive node + ansible.builtin.uri: + url: "https://{{ bmc_ip_map[responsive_node] }}/redfish/v1/Systems/System.Embedded.1/Processors?$expand=*($levels=1)" + user: "{{ bmc_username }}" + password: "{{ bmc_password }}" + method: GET + force_basic_auth: true + validate_certs: false + return_content: true + body_format: json + timeout: 60 + headers: + Accept: "application/json" + Content-Type: "application/json" + OData-Version: "4.0" + status_code: + - 200 + register: proc_info + when: responsive_node != "" + +- name: Read Memory information from responsive node + ansible.builtin.uri: + url: "https://{{ bmc_ip_map[responsive_node] }}/redfish/v1/Systems/System.Embedded.1" + user: "{{ bmc_username }}" + password: "{{ bmc_password }}" + method: GET + force_basic_auth: true + validate_certs: false + return_content: true + body_format: json + timeout: 60 + headers: + Accept: "application/json" + Content-Type: "application/json" + OData-Version: "4.0" + 
status_code: + - 200 + register: mem_info + when: responsive_node != "" + +- name: Extract CPU and GPU information + ansible.builtin.set_fact: + cpus: "{{ proc_info.json.Members | default([]) | selectattr('ProcessorType', 'equalto', 'CPU') | list }}" + gpus: "{{ proc_info.json.Members | default([]) + | selectattr('ProcessorType', 'equalto', 'GPU') + | selectattr('Manufacturer', 'search', '(?i)nvidia') | list }}" + when: responsive_node != "" + +- name: Calculate total memory in MB (GiB → MB) + ansible.builtin.set_fact: + total_memory_mb: "{{ (mem_info.json.MemorySummary.TotalSystemMemoryGiB | default(default_real_memory)) * 1024 | int }}" + when: responsive_node != "" + +- name: Calculate 90% of real memory + ansible.builtin.set_fact: + real_memory: "{{ ((total_memory_mb | float) * 0.90) | int }}" + when: responsive_node != "" + +- name: Build discovered hardware specs + ansible.builtin.set_fact: + discovered_specs: "{{ { + 'sockets': (1 if (cpus | length == 0) else (cpus | length)), + 'cores_per_socket': (cpus[0].TotalEnabledCores | default(default_corespersocket)), + 'threads_per_core': ((cpus[0].TotalThreads | default(default_threadspercore)) // (cpus[0].TotalCores | default(1))), + 'real_memory': real_memory | default(default_real_memory), + 'gres': ('gpu:' ~ (gpus | default([1]) | length | string) if (gpus | default([])) else '') + } }}" + when: responsive_node != "" + +- name: Apply discovered specs to all nodes in group + ansible.builtin.set_fact: + node_params: "{{ node_params + [{ + 'NodeName': hostname, + 'Sockets': discovered_specs.sockets, + 'CoresPerSocket': discovered_specs.cores_per_socket, + 'ThreadsPerCore': discovered_specs.threads_per_core, + 'RealMemory': discovered_specs.real_memory + } | combine({'Gres': discovered_specs.gres} if discovered_specs.gres != '' else {})] }}" + loop: "{{ group_nodes }}" + loop_control: + loop_var: hostname + when: discovered_specs | length > 0 + +- name: Use default values for all nodes in group (iDRAC failed) + 
ansible.builtin.set_fact: + node_params: "{{ node_params + [{ + 'NodeName': hostname, + 'Sockets': default_corespersocket, + 'CoresPerSocket': default_corespersocket, + 'ThreadsPerCore': default_threadspercore, + 'RealMemory': default_real_memory + }] }}" + loop: "{{ group_nodes }}" + loop_control: + loop_var: hostname + when: discovered_specs | length == 0 diff --git a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml index 0f7b3a16b2..a4ff37b27c 100644 --- a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml +++ b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml @@ -21,6 +21,16 @@ ansible.builtin.set_fact: node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}" +- name: Build hostname to GROUP_NAME mapping from existing PXE mapping + ansible.builtin.set_fact: + name_hardware_group_map: "{{ dict(hostvars['localhost']['pxe_mapping_dict'].dict.values() | map(attribute='HOSTNAME') | list | zip(hostvars['localhost']['pxe_mapping_dict'].dict.values() | map(attribute='GROUP_NAME') | list)) }}" + when: hostvars['localhost']['pxe_mapping_dict'] is defined + +- name: Set empty mapping if PXE mapping not available + ansible.builtin.set_fact: + name_hardware_group_map: {} + when: hostvars['localhost']['pxe_mapping_dict'] is not defined + - name: Read the node name group ansible.builtin.set_fact: name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}" From 6b584edfb7eddc09e1e044db431b3ffdf5123bc6 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Tue, 24 Feb 2026 13:18:57 +0000 Subject: [PATCH 02/41] fallback scenario and input dir --- .../tasks/read_node_idrac_group.yml | 86 +++++++++++++++++++ input/omnia_config.yml | 54 ++++++++++++ 2 files changed, 140 insertions(+) diff --git a/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml b/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml index d2353e3e76..f1113b397c 100644 --- 
a/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml +++ b/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml @@ -97,6 +97,92 @@ | selectattr('Manufacturer', 'search', '(?i)nvidia') | list }}" when: responsive_node != "" +- name: Fallback - Get PCIe devices for GPU detection + ansible.builtin.uri: + url: "https://{{ bmc_ip_map[responsive_node] }}/redfish/v1/Systems/System.Embedded.1/PCIeDevices" + user: "{{ bmc_username }}" + password: "{{ bmc_password }}" + method: GET + force_basic_auth: true + validate_certs: false + return_content: true + body_format: json + timeout: 60 + headers: + Accept: "application/json" + Content-Type: "application/json" + OData-Version: "4.0" + status_code: + - 200 + register: pcie_devices + failed_when: false + when: responsive_node != "" and gpus | length == 0 + +- name: Fallback - Extract PCIe device URLs + ansible.builtin.set_fact: + pcie_device_urls: "{{ pcie_devices.json.Members | default([]) | json_query('[*].\"@odata.id\"') }}" + when: responsive_node != "" and gpus | length == 0 + +- name: Fallback - Get PCIe Device details for GPU detection + ansible.builtin.uri: + url: "https://{{ bmc_ip_map[responsive_node] }}{{ item }}" + user: "{{ bmc_username }}" + password: "{{ bmc_password }}" + method: GET + force_basic_auth: true + validate_certs: false + return_content: true + body_format: json + timeout: 60 + headers: + Accept: "application/json" + Content-Type: "application/json" + OData-Version: "4.0" + status_code: + - 200 + register: pcie_device_details + loop: "{{ pcie_device_urls | default([]) }}" + loop_control: + label: "{{ item }}" + failed_when: false + when: responsive_node != "" and gpus | length == 0 and pcie_device_urls is defined and pcie_device_urls | length > 0 + +- name: Fallback - Detect GPUs from PCIe devices + ansible.builtin.set_fact: + fallback_gpus: "{{ pcie_device_details.results | default([]) + | selectattr('json', 'defined') + | map(attribute='json') + | selectattr('ClassCode', 
'defined') + | selectattr('VendorId', 'defined') + | selectattr('ClassCode', 'equalto', '0x0300') | list }}" + when: responsive_node != "" and gpus | length == 0 + +- name: Fallback - Detect GPUs from PCIe devices (additional criteria) + ansible.builtin.set_fact: + fallback_gpus_additional: "{{ pcie_device_details.results | default([]) + | selectattr('json', 'defined') + | map(attribute='json') + | selectattr('ClassCode', 'defined') + | selectattr('VendorId', 'defined') + | selectattr('ClassCode', 'equalto', '0x0302') | list }}" + when: responsive_node != "" and gpus | length == 0 and fallback_gpus | default([]) | length == 0 + +- name: Fallback - Detect GPUs from Manufacturer/Name (NVIDIA only) + ansible.builtin.set_fact: + fallback_gpus_manufacturer: "{{ pcie_device_details.results | default([]) + | selectattr('json', 'defined') + | map(attribute='json') + | selectattr('Manufacturer', 'defined') + | selectattr('Name', 'defined') + | selectattr('Manufacturer', 'search', '(?i)NVIDIA') + | selectattr('Name', 'search', '(?i)GPU|RTX|TESLA|A100|H100|L40|GB') | list }}" + when: responsive_node != "" and gpus | length == 0 and fallback_gpus | default([]) | length == 0 and fallback_gpus_additional | default([]) | length == 0 + +- name: Fallback - Update GPUs list if PCIe detection found GPUs + ansible.builtin.set_fact: + gpus: "{{ (fallback_gpus | default([])) or (fallback_gpus_additional | default([])) or (fallback_gpus_manufacturer | default([])) }}" + when: responsive_node != "" and gpus | length == 0 + - name: Calculate total memory in MB (GiB → MB) ansible.builtin.set_fact: total_memory_mb: "{{ (mem_info.json.MemorySummary.TotalSystemMemoryGiB | default(default_real_memory)) * 1024 | int }}" diff --git a/input/omnia_config.yml b/input/omnia_config.yml index 943d70e530..5a0ebdd89f 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -36,6 +36,42 @@ # It accepts true and false values # Default value is false +# node_discovery_mode +# Controls how 
hardware specifications are discovered for Slurm compute nodes +# Options: "heterogeneous" or "homogeneous" +# - heterogeneous: Discovers each node individually via iDRAC (1 call per node) +# Best for: Mixed hardware environments with different node configurations +# - homogeneous: Groups nodes by hardware type for optimized discovery +# Best for: Standardized hardware groups (grp0-grp100 in pxe_mapping_file.csv) +# Performance: 0 iDRAC calls (with specs) or 1 call per group (without specs) +# Default value is heterogeneous + +# node_hardware_defaults +# Optional: Pre-define hardware specifications for homogeneous node groups +# Only used when node_discovery_mode is set to "homogeneous" +# Key: GROUP_NAME from pxe_mapping_file.csv (e.g., grp0, grp1, grp2, etc.) +# Value: Hardware specifications for all nodes in that group +# - sockets: Number of CPU sockets per node (integer, minimum 1) +# - cores_per_socket: Number of CPU cores per socket (integer, minimum 1) +# - threads_per_core: Number of CPU threads per core (integer, minimum 1) +# - real_memory: Memory in MB (integer, minimum 1) +# - gres: Optional GPU resources in format "gpu:N" (e.g., "gpu:4") +# If a group is not listed here, one node from that group will be discovered via iDRAC +# and the specs will be applied to all nodes in the group +# Example: +# node_hardware_defaults: +# grp1: +# sockets: 2 +# cores_per_socket: 64 +# threads_per_core: 2 +# real_memory: 512000 +# gres: "gpu:4" +# grp2: +# sockets: 2 +# cores_per_socket: 32 +# threads_per_core: 2 +# real_memory: 256000 + # config_sources # defines how the Slurm configuration files are provided to the cluster. 
# : @@ -60,6 +96,24 @@ slurm_cluster: - cluster_name: slurm_cluster nfs_storage_name: nfs_slurm # skip_merge: true + + # Uncomment to enable homogeneous discovery mode + # node_discovery_mode: "homogeneous" + + # Uncomment to provide hardware specs for homogeneous groups + # node_hardware_defaults: + # grp1: + # sockets: 2 + # cores_per_socket: 64 + # threads_per_core: 2 + # real_memory: 512000 + # gres: "gpu:4" + # grp2: + # sockets: 2 + # cores_per_socket: 32 + # threads_per_core: 2 + # real_memory: 256000 + # config_sources: # slurm: # SlurmctldTimeout: 60 From e196a706d7c87e57371c213b16dd401e38cfa7ff Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Tue, 24 Feb 2026 13:30:23 +0000 Subject: [PATCH 03/41] lint issues fixed --- discovery/roles/slurm_config/tasks/confs.yml | 14 +++++++------- .../slurm_config/tasks/read_node_homogeneous.yml | 2 +- .../slurm_config/tasks/read_node_idrac_group.yml | 3 +-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 2418c47680..8aff4e0e21 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -55,7 +55,7 @@ - "Groups with specs: {{ groups_with_specs }}" - "Is in specs: {{ name_hardware_group_map.get(item, '') in groups_with_specs }}" loop: "{{ cmpt_list }}" - when: + when: - discovery_mode == 'homogeneous' - name_hardware_group_map | length > 0 @@ -68,7 +68,7 @@ else homogeneous_nodes }} loop: "{{ cmpt_list }}" - when: + when: - discovery_mode == 'homogeneous' - name_hardware_group_map | length > 0 @@ -92,7 +92,7 @@ loop: "{{ homogeneous_nodes }}" loop_control: loop_var: item - when: + when: - discovery_mode == 'homogeneous' - homogeneous_nodes | length > 0 @@ -106,7 +106,7 @@ }) }} loop: "{{ cmpt_list }}" - when: + when: - discovery_mode == 'homogeneous' - name_hardware_group_map | length > 0 - name_hardware_group_map.get(item, '') not in groups_with_specs @@ 
-114,7 +114,7 @@ - name: Process homogeneous groups with user specs (no iDRAC) ansible.builtin.include_tasks: read_node_homogeneous.yml loop: "{{ homogeneous_nodes }}" - when: + when: - discovery_mode == 'homogeneous' - homogeneous_nodes | length > 0 @@ -123,14 +123,14 @@ loop: "{{ sample_idrac_groups | dict2items }}" loop_control: loop_var: group_item - when: + when: - discovery_mode == 'homogeneous' - sample_idrac_groups | default({}) | length > 0 - name: Process heterogeneous nodes (individual iDRAC) ansible.builtin.include_tasks: read_node_idrac.yml loop: "{{ cmpt_list }}" - when: + when: - discovery_mode == 'heterogeneous' - name: DEBUG - Show final node_params before building slurm.conf diff --git a/discovery/roles/slurm_config/tasks/read_node_homogeneous.yml b/discovery/roles/slurm_config/tasks/read_node_homogeneous.yml index 16b9f77a26..7c5b100b8e 100644 --- a/discovery/roles/slurm_config/tasks/read_node_homogeneous.yml +++ b/discovery/roles/slurm_config/tasks/read_node_homogeneous.yml @@ -29,7 +29,7 @@ - name: Build node parameters from user specs (no iDRAC) ansible.builtin.set_fact: - proc_params: "{{ {'NodeName': item} + proc_params: "{{ {'NodeName': item} | combine({'Sockets': group_specs.sockets}) | combine({'CoresPerSocket': group_specs.cores_per_socket}) | combine({'ThreadsPerCore': group_specs.threads_per_core}) diff --git a/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml b/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml index f1113b397c..c121d7ed28 100644 --- a/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml +++ b/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml @@ -36,7 +36,6 @@ OData-Version: "4.0" status_code: - 200 - register: idrac_test loop: "{{ group_nodes }}" when: responsive_node == "" register: idrac_results @@ -45,7 +44,7 @@ ansible.builtin.set_fact: responsive_node: "{{ item.item }}" loop: "{{ idrac_results.results }}" - when: + when: - responsive_node == "" - item.status == 200 From 
6a60aa57b4e629012ccbfb6bcd4900d499ebec04 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Tue, 24 Feb 2026 13:45:55 +0000 Subject: [PATCH 04/41] lint errors fixed --- .../tasks/read_node_idrac_group.yml | 17 ++++++++++------- .../slurm_config/tasks/read_slurm_hostnames.yml | 8 +++++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml b/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml index c121d7ed28..2303b1debc 100644 --- a/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml +++ b/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml @@ -219,13 +219,16 @@ - name: Use default values for all nodes in group (iDRAC failed) ansible.builtin.set_fact: - node_params: "{{ node_params + [{ - 'NodeName': hostname, - 'Sockets': default_corespersocket, - 'CoresPerSocket': default_corespersocket, - 'ThreadsPerCore': default_threadspercore, - 'RealMemory': default_real_memory - }] }}" + node_params: >- + {{ + node_params + [{ + 'NodeName': hostname, + 'Sockets': default_corespersocket, + 'CoresPerSocket': default_corespersocket, + 'ThreadsPerCore': default_threadspercore, + 'RealMemory': default_real_memory + }] + }} loop: "{{ group_nodes }}" loop_control: loop_var: hostname diff --git a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml index a4ff37b27c..c61e8d92a9 100644 --- a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml +++ b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml @@ -23,7 +23,13 @@ - name: Build hostname to GROUP_NAME mapping from existing PXE mapping ansible.builtin.set_fact: - name_hardware_group_map: "{{ dict(hostvars['localhost']['pxe_mapping_dict'].dict.values() | map(attribute='HOSTNAME') | list | zip(hostvars['localhost']['pxe_mapping_dict'].dict.values() | map(attribute='GROUP_NAME') | list)) }}" + name_hardware_group_map: >- + {{ + dict( + 
hostvars['localhost']['pxe_mapping_dict'].dict.values() | map(attribute='HOSTNAME') | list + | zip(hostvars['localhost']['pxe_mapping_dict'].dict.values() | map(attribute='GROUP_NAME') | list) + ) + }} when: hostvars['localhost']['pxe_mapping_dict'] is defined - name: Set empty mapping if PXE mapping not available From 3fc791da34b0c0bd5165ba9069ead9ce1ae3fcc0 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Wed, 25 Feb 2026 02:17:51 +0000 Subject: [PATCH 05/41] warnings of jinja spacing --- .../tasks/read_node_idrac_group.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml b/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml index 2303b1debc..dd6582923d 100644 --- a/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml +++ b/discovery/roles/slurm_config/tasks/read_node_idrac_group.yml @@ -205,13 +205,17 @@ - name: Apply discovered specs to all nodes in group ansible.builtin.set_fact: - node_params: "{{ node_params + [{ - 'NodeName': hostname, - 'Sockets': discovered_specs.sockets, - 'CoresPerSocket': discovered_specs.cores_per_socket, - 'ThreadsPerCore': discovered_specs.threads_per_core, - 'RealMemory': discovered_specs.real_memory - } | combine({'Gres': discovered_specs.gres} if discovered_specs.gres != '' else {})] }}" + node_params: >- + {{ + node_params + [ + {'NodeName': hostname, + 'Sockets': discovered_specs.sockets, + 'CoresPerSocket': discovered_specs.cores_per_socket, + 'ThreadsPerCore': discovered_specs.threads_per_core, + 'RealMemory': discovered_specs.real_memory} + | combine({'Gres': discovered_specs.gres} if discovered_specs.gres != '' else {}) + ] + }} loop: "{{ group_nodes }}" loop_control: loop_var: hostname From c594d09665d365a5be12ba0659b19eb0782ccd71 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Wed, 25 Feb 2026 16:52:34 +0530 Subject: [PATCH 06/41] Update omnia.sh --- omnia.sh | 177 
++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 157 insertions(+), 20 deletions(-) diff --git a/omnia.sh b/omnia.sh index e380de2745..e9d48b1fe8 100755 --- a/omnia.sh +++ b/omnia.sh @@ -30,6 +30,81 @@ GREEN='\033[0;32m' BLUE='\033[0;34m' NC='\033[0m' # No Color YELLOW='\033[0;33m' + +# Function to get version from git tag +get_version_from_git_tag() { + local tag_version + local script_dir + local git_root + + # First try to get script directory + if [ -L "${BASH_SOURCE[0]}" ]; then + # If script is a symlink, resolve it + script_dir="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)" + else + script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + fi + + # Find git repository by traversing up from script directory + git_root="$script_dir" + while [ "$git_root" != "/" ] && [ ! -d "$git_root/.git" ]; do + git_root="$(dirname "$git_root")" + done + + # If we found a git repository, run git command + if [ "$git_root" != "/" ] && [ -d "$git_root/.git" ]; then + tag_version=$(cd "$git_root" && git tag --points-at HEAD 2>/dev/null | head -n 1) + else + tag_version="" + fi + + if [ -z "$tag_version" ]; then + echo "" + return 1 + fi + + # If tag starts with 'v', strip it and return the rest + if [[ "$tag_version" =~ ^v(.+)$ ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + fi + + # Tag doesn't start with 'v', return as-is + echo "$tag_version" + return 0 +} + +# Function to validate version string format +validate_version_string() { + local version="$1" + + # Check if version is empty + if [ -z "$version" ]; then + return 1 + fi + + # Basic version format validation: X.Y.Z.W or X.Y.Z.W-rcN or X.Y.Z.W-suffix + if [[ "$version" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9]+)?$ ]]; then + return 0 + fi + + return 1 +} + +# Function to get version for metadata (from git tag or default) +get_metadata_version() { + local default_version="${1:-$omnia_release}" + local git_tag_version + + git_tag_version=$(get_version_from_git_tag) + + 
if [ -n "$git_tag_version" ] && validate_version_string "$git_tag_version"; then + echo "$git_tag_version" + else + echo "$default_version" + fi +} + omnia_release=2.1.0.0 core_container_status=false @@ -103,11 +178,15 @@ get_available_upgrade_versions() { # Function to get available rollback versions (lower than current) get_available_rollback_versions() { local current_version="$1" + local normalized_current_version="${current_version%%-rc*}" + if [ -z "$normalized_current_version" ]; then + normalized_current_version="$current_version" + fi local available_versions=() # Find versions lower than current for version in "${ALL_OMNIA_VERSIONS[@]}"; do - if [ "$version" = "$current_version" ]; then + if [ "$version" = "$normalized_current_version" ]; then break fi available_versions+=("$version") @@ -136,7 +215,9 @@ rollback_same_tag() { return 1 fi - echo "[INFO] [ROLLBACK] Updating metadata to version $target_version" + # Get version from git tag or use target version + local metadata_version=$(get_metadata_version "$target_version") + echo "[INFO] [ROLLBACK] Updating metadata to version $metadata_version" # Update version metadata if ! 
podman exec -u root omnia_core bash -c " @@ -146,9 +227,9 @@ rollback_same_tag() { exit 1 fi if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then - sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE' + sed -i 's/^omnia_version:.*/omnia_version: $metadata_version/' '$CONTAINER_METADATA_FILE' else - echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE' + echo 'omnia_version: $metadata_version' >> '$CONTAINER_METADATA_FILE' fi "; then echo "[ERROR] [ROLLBACK] Failed to update metadata version" @@ -189,14 +270,14 @@ rollback_same_tag() { # Verify version update local updated_version=$(get_current_omnia_version) - if [ "$updated_version" != "$target_version" ]; then + if [ "$updated_version" != "$metadata_version" ]; then echo "[ERROR] [ROLLBACK] Version update verification failed" - echo "[ERROR] [ROLLBACK] Expected: $target_version, Found: $updated_version" + echo "[ERROR] [ROLLBACK] Expected: $metadata_version, Found: $updated_version" return 1 fi echo "[INFO] [ROLLBACK] Same-tag rollback completed successfully" - echo "[INFO] [ROLLBACK] Version rolled back to: $target_version" + echo "[INFO] [ROLLBACK] Version rolled back to: $metadata_version" return 0 } @@ -247,6 +328,37 @@ get_current_omnia_version() { fi } +# Update metadata with git tag version from inside container +update_metadata_with_git_tag() { + local default_version="${1:-$omnia_release}" + + podman exec -u root omnia_core bash -c ' + set -e + + cd /omnia || exit 0 + git_tag_version=$(git tag --points-at HEAD 2>/dev/null | head -n 1 || true) + + if [[ "$git_tag_version" =~ ^v(.+)$ ]]; then + git_tag_version="${BASH_REMATCH[1]}" + fi + + if [[ "$git_tag_version" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9]+)?$ ]]; then + metadata_version="$git_tag_version" + else + metadata_version="'"$default_version"'" + fi + + if [ -f '"'$CONTAINER_METADATA_FILE'"' ]; then + if grep -q "^omnia_version:" '"'$CONTAINER_METADATA_FILE'"'; then + sed -i 
"s/^omnia_version:.*/omnia_version: $metadata_version/" '"'$CONTAINER_METADATA_FILE'"' + else + echo "omnia_version: $metadata_version" >> '"'$CONTAINER_METADATA_FILE'"' + fi + echo "[INFO] Updated omnia_version to: $metadata_version" + fi + ' || true +} + show_post_upgrade_instructions() { local upgraded_version="$1" @@ -1036,12 +1148,15 @@ EOF oim_metadata_file="$OMNIA_METADATA_FILE" + # Get version from git tag or use default + local metadata_version=$(get_metadata_version "$omnia_release") + if [ ! -f "$oim_metadata_file" ]; then echo -e "${GREEN} Creating oim_metadata file${NC}" { echo "oim_crt: \"podman\"" echo "oim_shared_path: $omnia_path" - echo "omnia_version: $omnia_release" + echo "omnia_version: $metadata_version" echo "oim_hostname: $(hostname)" echo "oim_node_name: $(hostname -s)" echo "domain_name: $domain_name" @@ -1059,9 +1174,9 @@ EOF else sed -i '/^upgrade_backup_dir:/d' "$oim_metadata_file" >/dev/null 2>&1 || true if grep -q '^omnia_version:' "$oim_metadata_file"; then - sed -i "s/^omnia_version:.*/omnia_version: $omnia_release/" "$oim_metadata_file" >/dev/null 2>&1 || true + sed -i "s/^omnia_version:.*/omnia_version: $metadata_version/" "$oim_metadata_file" >/dev/null 2>&1 || true else - echo "omnia_version: $omnia_release" >> "$oim_metadata_file" + echo "omnia_version: $metadata_version" >> "$oim_metadata_file" fi fi @@ -1204,6 +1319,9 @@ start_container_session() { init_ssh_config + # Update metadata with git tag version from inside container + update_metadata_with_git_tag "$omnia_release" + # Entering Omnia-core container ssh omnia_core } @@ -1560,7 +1678,9 @@ phase4_same_tag_upgrade() { return 1 fi - echo "[INFO] [ORCHESTRATOR] Updating metadata to version $target_version" + # Get version from git tag or use target version + local metadata_version=$(get_metadata_version "$target_version") + echo "[INFO] [ORCHESTRATOR] Updating metadata to version $metadata_version" # Update version metadata if ! 
podman exec -u root omnia_core bash -c " @@ -1570,9 +1690,9 @@ phase4_same_tag_upgrade() { exit 1 fi if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then - sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE' + sed -i 's/^omnia_version:.*/omnia_version: $metadata_version/' '$CONTAINER_METADATA_FILE' else - echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE' + echo 'omnia_version: $metadata_version' >> '$CONTAINER_METADATA_FILE' fi "; then echo "[ERROR] [ORCHESTRATOR] Failed to update metadata version" @@ -1613,14 +1733,17 @@ phase4_same_tag_upgrade() { # Verify version update local updated_version=$(get_current_omnia_version) - if [ "$updated_version" != "$target_version" ]; then + if [ "$updated_version" != "$metadata_version" ]; then echo "[ERROR] [ORCHESTRATOR] Version update verification failed" - echo "[ERROR] [ORCHESTRATOR] Expected: $target_version, Found: $updated_version" + echo "[ERROR] [ORCHESTRATOR] Expected: $metadata_version, Found: $updated_version" return 1 fi echo "[INFO] [ORCHESTRATOR] Same-tag upgrade completed successfully" - echo "[INFO] [ORCHESTRATOR] Version updated to: $target_version" + echo "[INFO] [ORCHESTRATOR] Version updated to: $metadata_version" + + # Update metadata with git tag version from inside container + update_metadata_with_git_tag "$target_version" show_post_upgrade_instructions "$target_version" @@ -1698,7 +1821,7 @@ phase4_container_swap() { sleep 1 done - if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + if ! podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG container failed health check" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." 
@@ -1706,7 +1829,9 @@ phase4_container_swap() { return 1 fi - echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to $TARGET_OMNIA_VERSION" + # Get version from git tag or use target version + local metadata_version=$(get_metadata_version "$TARGET_OMNIA_VERSION") + echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to $metadata_version" if ! podman exec -u root omnia_core bash -c " set -e if [ ! -f '$CONTAINER_METADATA_FILE' ]; then @@ -1714,9 +1839,9 @@ phase4_container_swap() { exit 1 fi if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then - sed -i 's/^omnia_version:.*/omnia_version: $TARGET_OMNIA_VERSION/' '$CONTAINER_METADATA_FILE' + sed -i 's/^omnia_version:.*/omnia_version: $metadata_version/' '$CONTAINER_METADATA_FILE' else - echo 'omnia_version: $TARGET_OMNIA_VERSION' >> '$CONTAINER_METADATA_FILE' + echo 'omnia_version: $metadata_version' >> '$CONTAINER_METADATA_FILE' fi "; then echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version" @@ -1727,6 +1852,8 @@ phase4_container_swap() { fi echo "[INFO] [ORCHESTRATOR] Phase 4: Container swap completed" + # Update metadata with git tag version from inside container + update_metadata_with_git_tag "$TARGET_OMNIA_VERSION" return 0 } @@ -1744,6 +1871,13 @@ upgrade_omnia_core() { exit 1 fi + # Block upgrades when running from an RC build + if [[ "$OMNIA_VERSION" == *-rc* ]]; then + echo -e "${RED}Upgrade is not supported for release-candidate builds ($OMNIA_VERSION).${NC}" + echo -e "${YELLOW}Please install a GA version before attempting an upgrade.${NC}" + exit 1 + fi + # Get current container tag OMNIA_CORE_CONTAINER_TAG=$(get_container_tag_from_version "$OMNIA_VERSION") @@ -2305,6 +2439,9 @@ rollback_omnia_core() { echo -e "${GREEN}✓ Configuration restored from backup${NC}" echo "" + # Update metadata with git tag version from inside container + update_metadata_with_git_tag "$selected_version" + # Clean up lock file before starting long-running ssh session rm -f 
"$lock_file" >/dev/null 2>&1 || true echo "[INFO] Rollback lock file removed before starting container session" From 327f929a6107878ff68510d25460dbf62c4040b8 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 26 Feb 2026 12:43:17 +0530 Subject: [PATCH 07/41] Update image not found error in omnia.sh --- omnia.sh | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/omnia.sh b/omnia.sh index e9d48b1fe8..f0852d9c86 100755 --- a/omnia.sh +++ b/omnia.sh @@ -289,16 +289,29 @@ validate_container_image() { echo -e "${BLUE}Validating target container image: omnia_core:$target_container_tag${NC}" if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then - echo -e "${RED}ERROR: Target image missing locally: omnia_core:$target_container_tag${NC}" - echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" - echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-$target_version" - echo -e "2. Navigate to the repository directory:" - echo -e " cd omnia-artifactory" - echo -e "3. 
Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core core_tag=$target_container_tag omnia_branch=$target_version" - echo -e "Then re-run:" - echo -e " ./omnia.sh --$operation" + echo "" + echo -e "${RED}================================================================================${NC}" + echo -e "${RED}ERROR: Target container image not found locally${NC}" + echo -e "${RED}================================================================================${NC}" + echo -e "${YELLOW}Required image:${NC} omnia_core:$target_container_tag" + echo "" + echo -e "${YELLOW}Omnia does not pull images from Docker Hub.${NC}" + echo -e "${YELLOW}You must build or load the container image locally before proceeding.${NC}" + echo "" + echo -e "${BLUE}Build the required image using the following commands:${NC}" + echo "" + echo -e "git clone https://github.com/dell/omnia-artifactory.git -b omnia-container-" + echo -e "${YELLOW}Note: Replace with the target Omnia version (e.g., v2.1.0.0)${NC}" + echo "" + echo -e "cd omnia-artifactory" + echo "" + echo -e "./build_images.sh core core_tag=$target_container_tag omnia_branch=" + echo -e "${YELLOW}Note: Replace with the target Omnia branch (e.g., v2.1.0.0)${NC}" + echo "" + echo -e "${BLUE}After the image is built successfully, re-run:${NC}" + echo -e "./omnia.sh --$operation" + echo "" + echo -e "${RED}================================================================================${NC}" return 1 fi From c6c60ed6bdc55a8ebb4e8608d265719d12fe6bd2 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 26 Feb 2026 14:27:21 +0530 Subject: [PATCH 08/41] Update omnia.sh --- omnia.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/omnia.sh b/omnia.sh index f0852d9c86..f3c0c87e37 100755 --- a/omnia.sh +++ b/omnia.sh @@ -305,8 +305,9 @@ validate_container_image() { echo "" echo -e "cd omnia-artifactory" echo "" - echo -e "./build_images.sh core 
core_tag=$target_container_tag omnia_branch=" + echo -e "./build_images.sh core core_tag= omnia_branch=" echo -e "${YELLOW}Note: Replace with the target Omnia branch (e.g., v2.1.0.0)${NC}" + echo -e "${YELLOW}Note: core_tag will be the first 2 digits of the version (e.g., 2.1 for v2.1.0.0)${NC}" echo "" echo -e "${BLUE}After the image is built successfully, re-run:${NC}" echo -e "./omnia.sh --$operation" From abf03fb37c614d44d49f926adf6143d2cc3873bf Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 26 Feb 2026 14:30:16 +0530 Subject: [PATCH 09/41] Update omnia.sh --- omnia.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omnia.sh b/omnia.sh index f3c0c87e37..3d537347f5 100755 --- a/omnia.sh +++ b/omnia.sh @@ -307,7 +307,7 @@ validate_container_image() { echo "" echo -e "./build_images.sh core core_tag= omnia_branch=" echo -e "${YELLOW}Note: Replace with the target Omnia branch (e.g., v2.1.0.0)${NC}" - echo -e "${YELLOW}Note: core_tag will be the first 2 digits of the version (e.g., 2.1 for v2.1.0.0)${NC}" + echo -e "${YELLOW}Note: core_tag will be the first 2 digits of the target Omnia version (e.g., 2.1 for v2.1.0.0)${NC}" echo "" echo -e "${BLUE}After the image is built successfully, re-run:${NC}" echo -e "./omnia.sh --$operation" From 1d807f979ef310cbc3fbc920265535771ec79bce Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Thu, 26 Feb 2026 15:32:32 +0530 Subject: [PATCH 10/41] Updating validation for user_repo_url repo names Signed-off-by: pullan1 --- .../common_utils/en_us_validation_msg.py | 4 ++++ .../validation_flows/local_repo_validation.py | 22 +++++++++++++++++++ input/local_repo_config.yml | 3 ++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 15a8537ac5..737637a252 100644 --- 
a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -166,6 +166,10 @@ LDMS_REQUIRES_SLURM_MSG = ( "requires Slurm package 'slurm_custom' to be present in the 'softwares' list in software_config.json." ) +USER_REPO_NAME_PREFIX_FAIL_MSG = ( + "Repository name '{repo_name}' in {repo_key} must start with '{expected_prefix}'. " + "Please update the name to '{expected_prefix}{repo_name}'." +) # omnia_config.yml INVALID_PASSWORD_MSG = ("Provided password is invalid. Password must meet the specified " diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index 447bd33c8d..1a025754ab 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -20,6 +20,7 @@ import re from ansible.module_utils.input_validation.common_utils import validation_utils from ansible.module_utils.input_validation.common_utils import config +from ansible.module_utils.input_validation.common_utils import en_us_validation_msg from ansible.module_utils.local_repo.software_utils import load_yaml, load_json file_names = config.files @@ -128,6 +129,27 @@ def validate_local_repo_config(input_file_path, data, if key_path and not os.path.exists(key_path): errors.append(create_error_msg(local_repo_yml, "user_registry", f"Key file not found: {key_path}")) + + # Validate user_repo_url name prefixes + user_repo_prefix_map = { + "user_repo_url_x86_64": "x86_64_", + "user_repo_url_aarch64": "aarch64_", + } + for repo_key, expected_prefix in user_repo_prefix_map.items(): + user_repos = data.get(repo_key) + if user_repos: + for repo in user_repos: + repo_name = repo.get("name", "") + if repo_name and not 
repo_name.startswith(expected_prefix): + errors.append(create_error_msg( + local_repo_yml, repo_key, + en_us_validation_msg.USER_REPO_NAME_PREFIX_FAIL_MSG.format( + repo_name=repo_name, + repo_key=repo_key, + expected_prefix=expected_prefix + ) + )) + repo_names = {} sub_result = check_subscription_status(logger) logger.info(f"validate_local_repo_config: Subscription status: {sub_result}") diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index f81d62640c..394cff3a6c 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -39,7 +39,7 @@ # Fields: # url : Base URL of the repository # gpgkey : GPG key URL (leave empty to disable gpgcheck; Omnia will trust this repo and user is responsible for its security) -# name : Name of the repository +# name : Name of the repository (must start with 'x86_64_', e.g., 'x86_64_my_repo') # sslcacert : Path to SSL CA certificate (if using SSL) # sslclientkey: Path to SSL client key (if using SSL) # sslclientcert: Path to SSL client certificate (if using SSL) @@ -55,6 +55,7 @@ # 3. user_repo_url_aarch64 #--------------------------- # Same as above but for aarch64 architecture. +# Note: name must start with 'aarch64_' (e.g., 'aarch64_my_repo'). # # 4. 
rhel_os_url_x86_64 #----------------------------- From e9d3d2f4ca5f5cdf7d0b1d7e7631a1e9f565c8ef Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Thu, 26 Feb 2026 04:45:10 -0600 Subject: [PATCH 11/41] upgrade to 2.1 --- .../templates/omnia_config.j2 | 17 +++++++++++++++++ upgrade/roles/upgrade_cluster/tasks/main.yml | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 index eff82ee1c5..2566b6bed5 100644 --- a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 @@ -108,6 +108,23 @@ slurm_cluster: {% endfor %} {% endif %} +# node_discovery_mode: "Homogeneous" +# node_discovery_mode: "heterogeneous" +# # Specify hardware specs for groups (optional - will use group iDRAC if not specified) + # node_hardware_defaults: + # grp12: + # sockets: 5 + # cores_per_socket: 6 + # threads_per_core: 3 + # real_memory: 117968 + # # gres: "gpu:4" # optional + + # grp13: + # sockets: 13 + # cores_per_socket: 13 + # threads_per_core: 13 + # real_memory: 117913 + # ----------------------------SERVICE K8S------------------------------------------------------ # For service k8s cluster below parameters are required,(List) # - cluster_name is required field diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index ada4408f2e..d35cdd8957 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -55,6 +55,23 @@ - New variable: skip_merge (set to true to skip merging configs during upgrade when using external bundles) + - New variable: node_discovery_mode: 'Homogeneous' or 'Heterogeneous' + + - New variable: node_hardware_defaults: + + | Mode | What happens | iDRAC calls for 500 nodes | + + |---|---|---| + + Heterogeneous (default) | Each node queried 
individually 500 + + Homogeneous with specs provided | Specs applied directly, no querying 0 + + Homogeneous without specs | One node per group queried, specs shared 1 per group + + also provide node_hardware_defaults for groups where you want to apply specs + + Optional: NFS cleanup (only if you are reprovisioning the cluster) From 47b7d9ec820aaec946971651ad536c8416364cbd Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Thu, 26 Feb 2026 06:23:17 -0600 Subject: [PATCH 12/41] upgrade to 2.1 fixed spacing --- .../templates/omnia_config.j2 | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 index 2566b6bed5..7259706efa 100644 --- a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 @@ -108,22 +108,21 @@ slurm_cluster: {% endfor %} {% endif %} -# node_discovery_mode: "Homogeneous" -# node_discovery_mode: "heterogeneous" -# # Specify hardware specs for groups (optional - will use group iDRAC if not specified) + # node_discovery_mode: "homogeneous" + # # Specify hardware specs for groups (optional - will use group iDRAC if not specified) # node_hardware_defaults: - # grp12: - # sockets: 5 - # cores_per_socket: 6 - # threads_per_core: 3 - # real_memory: 117968 - # # gres: "gpu:4" # optional - - # grp13: - # sockets: 13 - # cores_per_socket: 13 - # threads_per_core: 13 - # real_memory: 117913 + # grp12: + # sockets: 5 + # cores_per_socket: 6 + # threads_per_core: 3 + # real_memory: 117968 + # gres: "gpu:4" # optional + + # grp13: + # sockets: 13 + # cores_per_socket: 13 + # threads_per_core: 13 + # real_memory: 117913 # ----------------------------SERVICE K8S------------------------------------------------------ # For service k8s cluster below parameters are required,(List) From a8426b9bb4b587b6930ac6c5bdcb8bbeff637b24 Mon Sep 
17 00:00:00 2001 From: Nagachandan-P Date: Thu, 26 Feb 2026 06:27:53 -0600 Subject: [PATCH 13/41] upgrade lint fixed spacing --- upgrade/roles/upgrade_cluster/tasks/main.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index d35cdd8957..d0761874a9 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -66,13 +66,11 @@ Heterogeneous (default) | Each node queried individually 500 Homogeneous with specs provided | Specs applied directly, no querying 0 - + Homogeneous without specs | One node per group queried, specs shared 1 per group also provide node_hardware_defaults for groups where you want to apply specs - - Optional: NFS cleanup (only if you are reprovisioning the cluster) If you choose to reprovision the cluster and your setup uses an NFS share for Kubernetes and/or Slurm, you may optionally perform an NFS From 994b72c213cf3bc81e62eea9d5c1f5fa3ce63d5a Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Thu, 26 Feb 2026 06:35:31 -0600 Subject: [PATCH 14/41] upgrade lint --- upgrade/roles/upgrade_cluster/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index d0761874a9..92f7ad01a3 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -57,7 +57,7 @@ - New variable: node_discovery_mode: 'Homogeneous' or 'Heterogeneous' - - New variable: node_hardware_defaults: + - New variable: node_hardware_defaults: | Mode | What happens | iDRAC calls for 500 nodes | From 2b8f5046b3be74b278ff019741cc4c9b584e3ff3 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Fri, 27 Feb 2026 02:45:13 +0530 Subject: [PATCH 15/41] Delete node when busy with choice to abort --- .../roles/slurm_config/defaults/main.yml | 3 +
.../roles/slurm_config/tasks/backup_conf.yml | 69 +++++++++ discovery/roles/slurm_config/tasks/confs.yml | 144 +++++++++++------- .../slurm_config/tasks/detect_busy_nodes.yml | 63 ++++++++ .../tasks/drain_and_remove_node.yml | 35 ----- .../roles/slurm_config/tasks/remove_node.yml | 2 +- .../slurm_config/tasks/update_hosts_munge.yml | 53 +------ discovery/roles/slurm_config/vars/main.yml | 2 +- 8 files changed, 229 insertions(+), 142 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/backup_conf.yml create mode 100644 discovery/roles/slurm_config/tasks/detect_busy_nodes.yml diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index 955e4c2a37..730081a80b 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -61,6 +61,9 @@ __default_config: SlurmdTimeout: 300 Epilog: "/etc/slurm/epilog.d/logout_user.sh" PluginDir: "{{ plugin_slurm_dir }}" + NodeSet: + - NodeSet: "{{ slurm_partition_name }}" + Feature: "{{ slurm_partition_name }}" NodeName: - NodeName: DEFAULT State: UNKNOWN diff --git a/discovery/roles/slurm_config/tasks/backup_conf.yml b/discovery/roles/slurm_config/tasks/backup_conf.yml new file mode 100644 index 0000000000..eeb4cfc6ce --- /dev/null +++ b/discovery/roles/slurm_config/tasks/backup_conf.yml @@ -0,0 +1,69 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Backup Slurm configuration files when changed + when: + - ctld_conf_files is changed + - ctld_list is defined + - ctld_list | length > 0 + block: + - name: Set backup timestamp + ansible.builtin.set_fact: + backup_timestamp: "{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" + backup_base_name: "auto_backup_discovery" + + - name: Set backup name suffix + ansible.builtin.set_fact: + backup_name_suffix: "{{ backup_base_name ~ '_' ~ backup_timestamp }}" + + - name: Set backup directories + ansible.builtin.set_fact: + slurm_backups_root: "{{ share_path }}/slurm_backups" + backup_dir: "{{ share_path }}/slurm_backups/{{ backup_base_name ~ '_' ~ backup_timestamp }}" + + - name: Ensure slurm backups root exists + ansible.builtin.file: + path: "{{ slurm_backups_root }}" + state: directory + mode: '0755' + + - name: Create backup directory + ansible.builtin.file: + path: "{{ backup_dir }}" + state: directory + mode: '0755' + + - name: Create backup config directories + ansible.builtin.file: + path: "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}" + state: directory + mode: '0755' + loop: + - etc/slurm + - etc/munge + - etc/my.cnf.d + + - name: Backup controller config directories + ansible.builtin.command: >- + cp -a "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/." 
"{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}/" + loop: + - etc/slurm + - etc/munge + - etc/my.cnf.d + changed_when: true + failed_when: false + + - name: Display backup location + ansible.builtin.debug: + msg: "Slurm config backup created at: {{ backup_dir }}/{{ ctld_list[0] }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 0db74bc166..0569080d08 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -146,6 +146,66 @@ - normal_partition is defined - normal_partition | length > 0 +- name: Detect busy nodes + ansible.builtin.include_tasks: detect_busy_nodes.yml + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - not force_scancel_node + - "'slurm' in conf_merge_dict" + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + +- name: Prompt for user input when jobs are running + ansible.builtin.pause: + prompt: | + ================================================================================ + WARNING: ACTIVE JOBS DETECTED ON THE FOLLOWING NODES + ================================================================================ + + Busy Nodes: + {% for node, jobs in busy_nodes.items() %} + - {{ node }}: {{ jobs }} running job(s) + {% endfor %} + + These nodes has active running jobs. + All other nodes without jobs will be removed. + + To view job details, run: + {% for node in busy_nodes.keys() %} + squeue -w {{ node }} + {% endfor %} + + Available Options: + A. ABORT (Recommended) + - Remove the IDLE nodes from the cluster and abort this playbook + - Manually cancel jobs or wait for them to complete + - Then re-run this playbook to remove these nodes + + F. 
FORCE REMOVAL (Destructive) + - All running jobs will be forcefully terminated + - Users will lose any unsaved work + + ================================================================================ + Enter 'A' to abort, or 'F' to force remove: + register: user_input + until: user_input.user_input | default('') | trim | upper in ['A', 'F'] + retries: 10 + delay: 1 + when: + - busy_nodes is defined + - busy_nodes | length > 0 + - not force_scancel_node + +- name: Remove busy_nodes from nodes_in_normal_not_in_cmpt + ansible.builtin.set_fact: + nodes_in_normal_not_in_cmpt: "{{ nodes_in_normal_not_in_cmpt | difference(busy_nodes.keys() | list) }}" + when: + - busy_nodes is defined + - busy_nodes | length > 0 + - user_input.user_input | default('') | trim | upper == 'A' + - name: Remove nodes ansible.builtin.include_tasks: remove_node.yml when: @@ -196,61 +256,7 @@ loop_var: extra_conf - name: Backup Slurm configuration files when changed - when: - - ctld_conf_files is changed - - ctld_list is defined - - ctld_list | length > 0 - block: - - name: Set backup timestamp - ansible.builtin.set_fact: - backup_timestamp: "{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" - backup_base_name: "auto_backup_discovery" - - - name: Set backup name suffix - ansible.builtin.set_fact: - backup_name_suffix: "{{ backup_base_name ~ '_' ~ backup_timestamp }}" - - - name: Set backup directories - ansible.builtin.set_fact: - slurm_backups_root: "{{ share_path }}/slurm_backups" - backup_dir: "{{ share_path }}/slurm_backups/{{ backup_base_name ~ '_' ~ backup_timestamp }}" - - - name: Ensure slurm backups root exists - ansible.builtin.file: - path: "{{ slurm_backups_root }}" - state: directory - mode: '0755' - - - name: Create backup directory - ansible.builtin.file: - path: "{{ backup_dir }}" - state: directory - mode: '0755' - - - name: Create backup config directories - ansible.builtin.file: - path: "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}" - state: 
directory - mode: '0755' - loop: - - etc/slurm - - etc/munge - - etc/my.cnf.d - - - name: Backup controller config directories - ansible.builtin.command: >- - cp -a "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/." "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}/" - loop: - - etc/slurm - - etc/munge - - etc/my.cnf.d - changed_when: true - failed_when: false - - - name: Display backup location - ansible.builtin.debug: - msg: "Slurm config backup created at: {{ backup_dir }}/{{ ctld_list[0] }}" - + ansible.builtin.include_tasks: backup_conf.yml - name: Check if cluster running ansible.builtin.include_tasks: check_ctld_running.yml @@ -260,3 +266,31 @@ loop: "{{ ctld_list }}" loop_control: loop_var: ctld + +- name: Handle user choice - ABORT + ansible.builtin.fail: + msg: | + ================================================================================ + PLAYBOOK ABORTED BY USER + ================================================================================ + + You chose to abort the playbook (Option A). + + Next Steps: + 1. Cancel running jobs manually: + {% for node in busy_nodes.keys() %} + scancel -w {{ node }} + {% endfor %} + + 2. Or wait for jobs to complete naturally + + 3. Re-run this playbook to remove the nodes + + Idle nodes (if any) have already been removed from the cluster. + + ================================================================================ + when: + - busy_nodes is defined + - busy_nodes | length > 0 + - not force_scancel_node + - user_input.user_input | default('') | trim | upper in ['A'] diff --git a/discovery/roles/slurm_config/tasks/detect_busy_nodes.yml b/discovery/roles/slurm_config/tasks/detect_busy_nodes.yml new file mode 100644 index 0000000000..f88a19b440 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/detect_busy_nodes.yml @@ -0,0 +1,63 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Check if node exists in Slurm cluster + ansible.builtin.command: scontrol show node {{ node_to_remove }} + register: node_exists_check + failed_when: false + ignore_unreachable: true + changed_when: false + delegate_to: "{{ ctld_list[0] }}" + +- name: Skip if node does not exist + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} not found in cluster, skipping removal" + when: + - node_exists_check is reachable + - node_exists_check.rc != 0 + +- name: Process node removal + when: + - node_exists_check is reachable + - node_exists_check.rc == 0 + ignore_unreachable: true + block: + - name: Get current job count on node + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l + register: current_jobs + changed_when: false + delegate_to: "{{ ctld_list[0] }}" + + - name: Display job information + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)" + + - name: Populate busy_nodes list + ansible.builtin.set_fact: + busy_nodes: "{{ (busy_nodes | default({})).combine({node_to_remove: current_jobs.stdout | int}) }}" + when: + - current_jobs.stdout | int > 0 + + - name: Display busy_nodes list + ansible.builtin.debug: + var: busy_nodes + + rescue: + - name: Failure to detect busy nodes + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} busy node detection failed from slurm cluster, + as task {{ 
ansible_failed_task.name }} failed." diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index cf62b156aa..15d09bf3f3 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -46,41 +46,6 @@ ansible.builtin.debug: msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)" - - name: Prompt for user input when jobs are running - ansible.builtin.pause: - prompt: | - ================================================================================ - WARNING: ACTIVE JOBS DETECTED ON NODE {{ node_to_remove }} - ================================================================================ - - Current Status: - - Node: {{ node_to_remove }} - - Running Jobs: {{ current_jobs.stdout }} - - Node State: Will be set to DOWN and removed from cluster - - Impact: All running jobs on this node will be terminated - - To view job details, run: - squeue -w {{ node_to_remove }} - - Available Options: - 1. ABORT AND CANCEL MANUALLY (Recommended) - - Press Ctrl+C, then 'A' to abort this playbook - - Manually cancel jobs: scancel -w {{ node_to_remove }} - - Or wait for jobs to complete naturally - - Then re-run this playbook - - 2. 
FORCE REMOVAL (Destructive) - - Press Enter to proceed with immediate node removal - - All {{ current_jobs.stdout }} job(s) will be forcefully terminated - - Users will lose any unsaved work - - Job data may be incomplete or corrupted - - ================================================================================ - Your choice (Ctrl+C then 'A' to abort, or Enter to force remove): - when: - - current_jobs.stdout | int > 0 - - not force_scancel_node - - name: Force cancel jobs on the node to be removed from cluster ansible.builtin.command: scancel -f -w {{ node_to_remove }} # Safe does not fail if no jobs are running changed_when: true diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index eecf6d4f1b..b1acab6de8 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -50,7 +50,7 @@ - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': (cmpt_list | join(',')) if cmpt_list | length > 0 else 'ALL'}) if item.PartitionName == slurm_partition_name else item] }}" + + [item | combine({'Nodes': ((cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | join(',')) if cmpt_list | length > 0 else slurm_partition_name}) if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: - "'slurm' in conf_merge_dict" diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml index 147f1b484c..29683159ad 100644 --- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -57,60 +57,13 @@ munge_key_changed: "{{ munge_key_copy.results | default([]) | rekey_on_member('item') }}" when: munge_key_copy is defined -# TODO: Clean unreachable 
handling - name: Block when munge key changed + ansible.builtin.debug: + msg: "Munge key updates detected on NFS for {{ slurmhost_ip }}\n. + Please restart munge service on {{ slurmhost_ip }} followed by the dependent slurm services on them\n." when: - munge_key_changed is defined - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false) - restart_slurm_services - delegate_to: "{{ slurmhost_ip }}" no_log: "{{ _no_log }}" ignore_unreachable: true - block: - - name: Update munge key permissions - ansible.builtin.file: - path: /etc/munge/munge.key - owner: munge - group: munge - mode: '0600' - register: munge_key_permissions_result - - - name: Restart munge service if key changed - ansible.builtin.service: - name: munge - state: restarted - register: munge_restart_result - when: - - munge_key_permissions_result is defined - - munge_key_permissions_result is success - - - name: Restart slurmctld if munge restarted - ansible.builtin.service: - name: slurmctld - state: restarted - when: - - name_ip_map[slurmhost_ip] in ctld_list - - munge_restart_result is defined - - munge_restart_result is success - - - name: Restart slurmd if munge restarted - ansible.builtin.service: - name: slurmd - state: restarted - when: - - name_ip_map[slurmhost_ip] in (cmpt_list + login_list + compiler_login_list) - - munge_restart_result is defined - - munge_restart_result is success - - - name: Restart slurmdbd if munge restarted - ansible.builtin.service: - name: slurmdbd - state: restarted - when: - - name_ip_map[slurmhost_ip] in dbd_list - - munge_restart_result is defined - - munge_restart_result is success - rescue: - - name: Handle munge restart failure - ansible.builtin.debug: - msg: "Failed task {{ ansible_failed_task.name }} on {{ slurmhost_ip }}" diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index cc57a984da..37e6ff9869 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ 
b/discovery/roles/slurm_config/vars/main.yml @@ -127,7 +127,7 @@ dbd_slurm_conf: AccountingStorageType: accounting_storage/slurmdbd partition_params: PartitionName: "{{ slurm_partition_name }}" - Nodes: "{{ cmpt_list | join(',') if cmpt_list else 'ALL' }}" + Nodes: "{{ cmpt_list | join(',') if cmpt_list else slurm_partition_name }}" MaxTime: "INFINITE" State: "UP" Default: "YES" From 9183463a3a4a2521a8a93c939fc560ff6c1ee1be Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Fri, 27 Feb 2026 03:08:23 +0530 Subject: [PATCH 16/41] bmc inv validation --- utils/set_pxe_boot.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/utils/set_pxe_boot.yml b/utils/set_pxe_boot.yml index f6cf1258c7..09e72322a0 100644 --- a/utils/set_pxe_boot.yml +++ b/utils/set_pxe_boot.yml @@ -26,6 +26,17 @@ # boot image must be reachable by the target nodes once the iDRAC # is set to PXE mode. # ------------------------------------------------------------------------- +- name: Validate BMC group exists in inventory + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Check if bmc group exists in inventory + ansible.builtin.fail: + msg: "Failed. 'bmc' group not found in inventory or has no hosts. + Please ensure the inventory file contains a 'bmc' group with at least one BMC IP address." + when: groups['bmc'] is not defined or groups['bmc'] | length | int == 0 + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local @@ -47,11 +58,6 @@ hosts: bmc connection: local gather_facts: false - pre_tasks: - - name: Validate bmc group - ansible.builtin.assert: - that: groups['bmc'] | length | int >= 1 - fail_msg: "Failed. bmc group in inventory must have atleast one bmc ip." 
roles: - role: idrac_pxe_boot # vars: From 056c3747cb243a6aa0df01c0fbe0b32e975a76b0 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Fri, 27 Feb 2026 10:58:14 +0530 Subject: [PATCH 17/41] Lint fix --- discovery/roles/slurm_config/tasks/remove_node.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index b1acab6de8..da6b2a72ae 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -50,7 +50,9 @@ - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': ((cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | join(',')) if cmpt_list | length > 0 else slurm_partition_name}) if item.PartitionName == slurm_partition_name else item] }}" + + [item | combine({'Nodes': ((cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | join(',')) + if (cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | length > 0 else slurm_partition_name}) + if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: - "'slurm' in conf_merge_dict" From c729c2e61eed867658f1f2344b432dd05060b544 Mon Sep 17 00:00:00 2001 From: Rohith Ravut Date: Fri, 27 Feb 2026 11:08:27 +0530 Subject: [PATCH 18/41] Update ssh ci-group-login_compiler_node_aarch64.yaml.j2 Signed-off-by: Rohith Ravut --- .../cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 8918f03050..36332685ac 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ 
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -241,6 +241,8 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) + - mkdir -p {{ client_mount_path }}/slurm/ssh - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab From 84bbc0e5c646bfa217d3a2d5d18d595d2d20e791 Mon Sep 17 00:00:00 2001 From: Rohith Ravut Date: Fri, 27 Feb 2026 11:09:06 +0530 Subject: [PATCH 19/41] Update ssh ci-group-login_node_aarch64.yaml.j2 Signed-off-by: Rohith Ravut --- .../cloud_init/ci-group-login_node_aarch64.yaml.j2 | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 4aacc2222d..d6071116ac 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -73,6 +73,13 @@ done fi + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa + IdentitiesOnly yes + {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf owner: root:root @@ -121,7 +128,8 @@ runcmd: - 
/usr/local/bin/set-ssh.sh - + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) + - mkdir -p {{ client_mount_path }}/slurm/ssh - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -130,6 +138,7 @@ - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab From 9722f8b504b0fa0450b1f4528893c495bba09256 Mon Sep 17 00:00:00 2001 From: Rohith Ravut Date: Fri, 27 Feb 2026 11:13:39 +0530 Subject: [PATCH 20/41] Update ssh ci-group-slurm_node_aarch64.yaml.j2 Signed-off-by: Rohith Ravut --- .../templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 
dacade639b..0f25c81307 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -243,7 +243,7 @@ exec > >(tee -a "$LOGFILE") 2>&1 echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge (aarch64) =====" - + mkdir -p {{ client_mount_path }}/slurm/ssh echo "[INFO] Creating base directories for Slurm and Munge" mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts From 092aa3d5d20c3ed3244083f1855ba1ef597ba696 Mon Sep 17 00:00:00 2001 From: Rohith Ravut Date: Fri, 27 Feb 2026 11:14:30 +0530 Subject: [PATCH 21/41] Update ssh main.yml Signed-off-by: Rohith Ravut --- discovery/roles/passwordless_ssh/vars/main.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/discovery/roles/passwordless_ssh/vars/main.yml b/discovery/roles/passwordless_ssh/vars/main.yml index edd72e1e90..b48fcb833a 100644 --- a/discovery/roles/passwordless_ssh/vars/main.yml +++ b/discovery/roles/passwordless_ssh/vars/main.yml @@ -26,6 +26,9 @@ slurm_functional_groups: - slurm_node_x86_64 - login_node_x86_64 - login_compiler_node_x86_64 + - slurm_node_aarch64 + - login_node_aarch64 + - login_compiler_node_aarch64 # Nodes.yaml group completeness checks omnia_required_groups_from_nodes_yaml: @@ -41,5 +44,8 @@ omnia_optional_groups_from_nodes_yaml: - service_kube_control_plane_first_aarch64 - service_kube_control_plane_aarch64 - service_kube_node_aarch64 + - slurm_node_aarch64 + - login_node_aarch64 + - login_compiler_node_aarch64 ssh_private_key_path: /root/.ssh/config From 944ba28781304f28b2f873470accd2eb8712b4e0 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: 
Fri, 27 Feb 2026 13:22:35 +0530 Subject: [PATCH 22/41] Updated pxe boot to handle poweroff case --- utils/roles/idrac_pxe_boot/tasks/main.yml | 99 +++++++++++++++++------ utils/set_pxe_boot.yml | 7 +- 2 files changed, 77 insertions(+), 29 deletions(-) diff --git a/utils/roles/idrac_pxe_boot/tasks/main.yml b/utils/roles/idrac_pxe_boot/tasks/main.yml index 0a9078f667..70fb4dde2c 100644 --- a/utils/roles/idrac_pxe_boot/tasks/main.yml +++ b/utils/roles/idrac_pxe_boot/tasks/main.yml @@ -19,40 +19,87 @@ idrac_password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" validate_certs: false register: lc_check_status - -- name: Check LC availibility - ansible.builtin.fail: - msg: "{{ lc_check_fail_msg }}" - when: not lc_check_status.lc_status_info.LCReady + ignore_errors: true - name: Set reboot type ansible.builtin.set_fact: - reboot_type: "{{ 'none' if not restart_host else ('force_restart' if force_restart else 'graceful_restart') }}" + reboot_type: "{{ 'ForceRestart' if force_restart else 'GracefulRestart' }}" -- name: Set boot from pxe - dellemc.openmanage.idrac_boot: - idrac_ip: "{{ inventory_hostname }}" - idrac_user: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" - idrac_password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" - validate_certs: false - boot_source_override_mode: uefi - boot_source_override_target: "{{ boot_source_override_target }}" - boot_source_override_enabled: "{{ boot_source_override_enabled }}" - reset_type: "{{ reboot_type }}" - register: pxe_provisioning - ignore_errors: true - ignore_unreachable: true +- name: IDRAC ops when ready + when: + - lc_check_status is success + - lc_check_status.lc_status_info.LCReady + block: + - name: Set boot from pxe + dellemc.openmanage.idrac_boot: + idrac_ip: "{{ inventory_hostname }}" + idrac_user: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" + idrac_password: "{{ bmc_password | 
default(hostvars['localhost']['bmc_password']) }}" + validate_certs: false + boot_source_override_mode: uefi + boot_source_override_target: "{{ boot_source_override_target }}" + boot_source_override_enabled: "{{ boot_source_override_enabled }}" + reset_type: "none" # Dont Restart here as to Handle poweroff case + register: pxe_provisioning + ignore_errors: true + ignore_unreachable: true -- name: OS provisioning failed using PXE + - name: Try ForceRestart + dellemc.openmanage.redfish_powerstate: + baseuri: "{{ inventory_hostname }}" + username: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" + password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" + validate_certs: false + reset_type: ForceRestart + when: restart_host + register: restart_op + failed_when: false + + - name: Try On if ForceRestart did not change + dellemc.openmanage.redfish_powerstate: + baseuri: "{{ inventory_hostname }}" + username: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" + password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" + validate_certs: false + reset_type: "On" + register: power_on_op + failed_when: false + when: + - restart_host + - not (restart_op is changed) + +- name: Check LC availibility + ansible.builtin.fail: + msg: "{{ lc_check_fail_msg }}" + when: lc_check_status is failed or not (lc_check_status.lc_status_info.LCReady | default(false)) + +- name: Fail if PXE provisioning failed ansible.builtin.fail: msg: "{{ pxe_provisioning_fail_msg }}" - when: pxe_provisioning is failed + when: + - pxe_provisioning is defined + - pxe_provisioning is failed -- name: IDRAC might be unreachable during OS provisioning - ansible.builtin.debug: +- name: Fail if PXE provisioning target is unreachable + ansible.builtin.fail: msg: "{{ unreachable_idrac_msg }}" - when: pxe_provisioning is unreachable + when: + - pxe_provisioning is defined + - pxe_provisioning is unreachable + +- name: Fail if power 
operation failed + ansible.builtin.fail: + msg: "Power operation failed on {{ inventory_hostname }}. Failed to restart server." + when: + - restart_host + - not (restart_op is defined and restart_op is changed) + - not (power_on_op is defined and power_on_op is changed) -- name: Provision OS status +- name: Summarize PXE boot and power operation results ansible.builtin.debug: - msg: "{{ provision_os_msg }}" + msg: >- + PXE Boot: {{ 'OK' if pxe_provisioning is success else ('UNREACHABLE' if pxe_provisioning is unreachable else 'FAILED') }} | + Power: {{ 'Restart OK' if (restart_op is defined and restart_op is changed) + else ('On OK' if (power_on_op is defined and power_on_op is changed) + else ('Skipped (no restart)' if not restart_host + else 'FAILED')) }} diff --git a/utils/set_pxe_boot.yml b/utils/set_pxe_boot.yml index 09e72322a0..a46ebd7091 100644 --- a/utils/set_pxe_boot.yml +++ b/utils/set_pxe_boot.yml @@ -16,11 +16,12 @@ # PXE PREREQUISITES # ------------------------------------------------------------------------- # 1. Dell iDRAC BMCs must be reachable from the Ansible controller -# 2. PXE (Pre‑boot eXecution Environment) support – the NIC's +# 2. The PXE order must be set in the BIOS/UEFI settings +# 3. PXE (Pre‑boot eXecution Environment) support – the NIC's # firmware must implement the PXE option and must be enabled. -# 3. The `dellemc.openmanage` Ansible collection must be installed: +# 4. The `dellemc.openmanage` Ansible collection must be installed: # ansible-galaxy collection install dellemc.openmanage -# 4. iDRAC firmware version must support the 'Boot Source Override' +# 5. iDRAC firmware version must support the 'Boot Source Override' # API (most modern iDRAC9/10 firmware do). # 5. 
The TFTP/NFS/HTTP server that provides the PXE # boot image must be reachable by the target nodes once the iDRAC From 25e9684fcf0c63f61acb030f0dac21248e545bba Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 27 Feb 2026 14:54:35 +0530 Subject: [PATCH 23/41] Error update for openLDAP container not running in discovery.yml --- discovery/discovery.yml | 6 +++ .../tasks/validate_openldap_container.yml | 38 +++++++++++++++++++ .../roles/discovery_validations/vars/main.yml | 20 ++++++++++ 3 files changed, 64 insertions(+) create mode 100644 discovery/roles/discovery_validations/tasks/validate_openldap_container.yml diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 40fd00123c..e09f89119c 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -94,6 +94,12 @@ hosts: oim connection: ssh tasks: + - name: Validate OpenLDAP container is running + ansible.builtin.include_role: + name: discovery_validations + tasks_from: validate_openldap_container.yml + when: hostvars['localhost']['openldap_support'] + - name: Image validation ansible.builtin.include_role: name: discovery_validations diff --git a/discovery/roles/discovery_validations/tasks/validate_openldap_container.yml b/discovery/roles/discovery_validations/tasks/validate_openldap_container.yml new file mode 100644 index 0000000000..d94b74daea --- /dev/null +++ b/discovery/roles/discovery_validations/tasks/validate_openldap_container.yml @@ -0,0 +1,38 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Validate OpenLDAP container is running + block: + - name: Check if OpenLDAP container exists and is running + containers.podman.podman_container_info: + name: omnia_auth + register: openldap_container_check + failed_when: false + - name: Set OpenLDAP container status + ansible.builtin.set_fact: + openldap_container_missing: >- + {{ (openldap_container_check.containers | default([]) | length == 0) or + (openldap_container_check.containers | default([]) | length > 0 and + openldap_container_check.containers[0].State.Status != 'running') }} + + - name: Display OpenLDAP container error + ansible.builtin.pause: + prompt: "{{ openldap_container_missing_msg }}" + seconds: 1 + when: openldap_container_missing | bool + + - name: Fail if OpenLDAP container is not running + ansible.builtin.fail: + msg: "OpenLDAP container is not running. See error details above." + when: openldap_container_missing | bool diff --git a/discovery/roles/discovery_validations/vars/main.yml b/discovery/roles/discovery_validations/vars/main.yml index b7caee6f1b..f0191f23aa 100644 --- a/discovery/roles/discovery_validations/vars/main.yml +++ b/discovery/roles/discovery_validations/vars/main.yml @@ -80,3 +80,23 @@ oim_timezone_changed_warning_msg: | inconsistent behavior. oim_metadata_file_path: "/opt/omnia/.data/oim_metadata.yml" + +# Usage: validate_openldap_container.yml +openldap_container_missing_msg: "{{ '\x1b[31m' }}\ + ==================================================================================\n\ + ERROR: OpenLDAP container is not running.\n\ + ==================================================================================\n\ + \n\ + OpenLDAP support is enabled but the OpenLDAP container was not found.\n\ + This typically happens when prepare_oim.yml was run without the OpenLDAP\n\ + package in software_config.json.\n\ + \n\ + To resolve this issue:\n\ + 1. 
Ensure the 'openldap' package is added to software_config.json\n\ + 2. Run local_repo.yml to download OpenLDAP packages\n\ + 3. Re-run prepare_oim.yml to start the OpenLDAP container and generate certificates\n\ + 4. Then re-run discovery.yml\n\ + \n\ + For more information, refer to the Omnia documentation on OpenLDAP configuration.\n\ + ==================================================================================\ + {{ '\x1b[0m' }}" From 833776c8d32bd5cbfc875d159781389f59064bdf Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 27 Feb 2026 15:01:26 +0530 Subject: [PATCH 24/41] Update discovery.yml --- discovery/discovery.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/discovery.yml b/discovery/discovery.yml index e09f89119c..2a1c74e010 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -99,7 +99,7 @@ name: discovery_validations tasks_from: validate_openldap_container.yml when: hostvars['localhost']['openldap_support'] - + - name: Image validation ansible.builtin.include_role: name: discovery_validations From 1e89503f8889dd53e9c0624fcc71414aa45f8a41 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Fri, 27 Feb 2026 15:05:11 +0530 Subject: [PATCH 25/41] localrepo pulp cleanup for all types Signed-off-by: pranavkumar7498 --- .../library/module_utils/local_repo/config.py | 10 +- .../local_repo/download_common.py | 29 +- common/library/modules/parallel_tasks.py | 2 +- common/library/modules/pulp_cleanup.py | 500 ++++++++++++------ local_repo/pulp_cleanup.yml | 10 +- .../pulp/templates/settings_template.j2 | 2 +- 6 files changed, 374 insertions(+), 179 deletions(-) diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index d8e5593778..085f272ec3 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -120,10 +120,10 @@ } CLI_FILE_PATH = "/root/.config/pulp/cli.toml" -POST_TIMEOUT = 3600 # 
seconds -TAR_POLL_VAL = 45 # minutes -FILE_POLL_VAL = 1 # minutes -ISO_POLL_VAL = 45 # minutes +TAR_TIMEOUT_MIN = 45 # minutes +FILE_TIMEOUT_MIN = 1 # minutes +ISO_TIMEOUT_MIN = 45 # minutes +TASK_POLL_INTERVAL = 10 # seconds FILE_URI = "/pulp/api/v3/content/file/files/" PULP_SSL_CA_CERT = "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" # ---------------------------- @@ -160,7 +160,7 @@ "get_repo_version": "pulp container repository show --href %s", "list_tags_by_version": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s", "rename_repository": "pulp container repository update --name %s --new-name %s", - "orphan_cleanup": "pulp orphan cleanup", + "orphan_cleanup": "pulp orphan cleanup --protection-time 0", "container_distribution_show": "pulp container distribution show --name %s | jq .repository", "show_repository_version": "pulp container repository show --href %s | jq .latest_version_href", "list_image_tags": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s" diff --git a/common/library/module_utils/local_repo/download_common.py b/common/library/module_utils/local_repo/download_common.py index 892725b207..58185c2844 100644 --- a/common/library/module_utils/local_repo/download_common.py +++ b/common/library/module_utils/local_repo/download_common.py @@ -37,10 +37,10 @@ pulp_file_commands, pulp_rpm_commands, CLI_FILE_PATH, - POST_TIMEOUT, - ISO_POLL_VAL, - TAR_POLL_VAL, - FILE_POLL_VAL, + ISO_TIMEOUT_MIN, + TAR_TIMEOUT_MIN, + FILE_TIMEOUT_MIN, + TASK_POLL_INTERVAL, FILE_URI, PULP_SSL_CA_CERT ) @@ -201,7 +201,7 @@ def wait_for_task(task_href, base_url, username, password, logger, timeout=3600, logger.error("Timeout waiting for task to complete") return False -def handle_file_upload(repository_name, relative_path, file_url, poll_interval, logger): +def handle_file_upload(repository_name, relative_path, file_url, timeout_minutes, logger): """ Ensure repository exists, then POST a file to Pulp and wait for 
the task to complete. @@ -209,7 +209,7 @@ def handle_file_upload(repository_name, relative_path, file_url, poll_interval, repository_name (str): Name of the repository. relative_path (str): Relative path for the file in the repository. file_url (str): URL of the file to upload. - poll_interval: Polling time + timeout_minutes (int): Maximum time in minutes to wait for task completion. logger (logging.Logger): Logger instance. Returns: @@ -262,9 +262,10 @@ def handle_file_upload(repository_name, relative_path, file_url, poll_interval, return "Failed" # Wait for task completion - logger.info(f"Waiting for task {task_href} to complete...") + timeout_seconds = timeout_minutes * 60 + logger.info(f"Waiting for task {task_href} to complete (timeout: {timeout_minutes} min)...") task_result = wait_for_task(task_href, base_url, config["username"], passcode, - logger, timeout=POST_TIMEOUT, interval=poll_interval) + logger, timeout=timeout_seconds, interval=TASK_POLL_INTERVAL) if task_result: logger.info(f"File successfully uploaded to repository '{repository_name}'.") return "Success" @@ -272,7 +273,7 @@ def handle_file_upload(repository_name, relative_path, file_url, poll_interval, logger.error(f"Task {task_href} failed or timed out. File upload to repository '{repository_name}' failed.") return "Failed" -def handle_post_request(repository_name, relative_path, base_path, file_url, poll_interval,logger): +def handle_post_request(repository_name, relative_path, base_path, file_url, timeout_minutes,logger): """ Handles the full Pulp upload and distribution process for a given repository and file. Args: @@ -280,13 +281,13 @@ def handle_post_request(repository_name, relative_path, base_path, file_url, pol relative_path (str): Path where the file should be stored inside the repository. base_path (str): The base path for the distribution. file_url (str): URL of the file to be uploaded. 
- poll_interval: Interval for polling + timeout_minutes (int): Maximum time in minutes to wait for upload task completion. logger (logging.Logger): Logger for logging messages and errors. Returns: str: "Success" if the operation completes successfully, "Failed" otherwise. """ - result = handle_file_upload(repository_name, relative_path, file_url, poll_interval,logger) + result = handle_file_upload(repository_name, relative_path, file_url, timeout_minutes,logger) if result =="Success": distribution_name = repository_name logger.info("Creating publication...") @@ -483,7 +484,7 @@ def process_manifest(file,repo_store_path, status_file_path, cluster_os_type, cl relative_path = output_file base_path = manifest_directory.strip("/") status = handle_post_request(repository_name, relative_path, - base_path, url, FILE_POLL_VAL, logger) + base_path, url, FILE_TIMEOUT_MIN, logger) except Exception as e: logger.error(f"Error processing manifest: {e}") status= "Failed" @@ -775,7 +776,7 @@ def process_tarball(package, repo_store_path, status_file_path, version_variable if url: try: status = handle_post_request(repository_name, relative_path, - base_path, url, TAR_POLL_VAL,logger) + base_path, url, TAR_TIMEOUT_MIN,logger) except Exception as e: logger.error(f"Error processing tarball: {e}") status = "Failed" @@ -882,7 +883,7 @@ def process_iso(package, repo_store_path, status_file_path, # non-zero for failure) subprocess.run(['wget', '-q', '--spider', '--tries=1', url], check=True) status = handle_post_request(repository_name, relative_path, - base_path, url, ISO_POLL_VAL,logger) + base_path, url, ISO_TIMEOUT_MIN,logger) except subprocess.CalledProcessError as e: logger.error(f"Error executing iso commands: {e}") status = "Failed" diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py index 17c14cf51f..e9d3d26c9c 100644 --- a/common/library/modules/parallel_tasks.py +++ b/common/library/modules/parallel_tasks.py @@ -106,7 +106,7 @@ def 
update_status_csv(csv_dir, software, overall_status,slogger): # Transform the new status. transformed_status = re.sub(r'failure', 'failed', overall_status.lower()) - transformed_status = re.sub(r'partial', 'failed', overall_status.lower()) + transformed_status = re.sub(r'timeout', 'failed', transformed_status) # Update or add the entry for each given software. if isinstance(software, list): diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index a97f6d28d2..6edf16e294 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -35,7 +35,6 @@ from ansible.module_utils.local_repo.standard_logger import setup_standard_logger from ansible.module_utils.local_repo.config import ( CLEANUP_BASE_PATH_DEFAULT, - CLEANUP_STATUS_FILE_PATH_DEFAULT, CLEANUP_FILE_TYPES, pulp_rpm_commands, pulp_container_commands, @@ -49,12 +48,6 @@ # PRETTY TABLE FORMATTING # ============================================================================= -# ANSI color codes -GREEN = '\033[92m' -RED = '\033[91m' -YELLOW = '\033[93m' -RESET = '\033[0m' - def format_pretty_table(results: List[Dict[str, Any]]) -> str: """Format cleanup results into a pretty table.""" if not results: @@ -82,7 +75,6 @@ def format_pretty_table(results: List[Dict[str, Any]]) -> str: f" {str(r.get('name', '')).ljust(widths[0])} ", f" {str(r.get('type', '')).ljust(widths[1])} ", f" {str(r.get('status', '')).ljust(widths[2])} ", - #f" {colored_status}{status_padding} ", f" {msg.ljust(widths[3])} " ]) + "|" lines.append(row) @@ -230,25 +222,45 @@ def container_exists(name: str, logger) -> bool: def file_exists_in_status(name: str, base_path: str, logger) -> bool: """Check if file artifact exists in status files.""" try: - for status_file in glob.glob(f"{base_path}/x86_64/*/status.csv"): - with open(status_file, 'r', encoding='utf-8') as f: - if name in f.read(): - return True + for arch in ARCH_SUFFIXES: + for status_file in 
glob.glob(f"{base_path}/{arch}/*/status.csv"): + with open(status_file, 'r', encoding='utf-8') as f: + if name in f.read(): + return True return False - except Exception: + except OSError: return False -def get_all_repositories(logger) -> List[str]: - """Get all RPM repository names from Pulp.""" - cmd = pulp_rpm_commands["list_repositories"] +def _list_pulp_repos(cmd: str, label: str, logger) -> List[str]: + """List repository names from Pulp using the given command.""" result = run_cmd(cmd, logger) if result["rc"] != 0: - logger.error(f"Failed to list repositories: {result['stderr']}") + logger.error(f"Failed to list {label}: {result['stderr']}") return [] repos = safe_json_parse(result["stdout"]) return [r.get('name', '') for r in repos if r.get('name')] +def get_all_repositories(logger) -> List[str]: + """Get all RPM repository names from Pulp.""" + return _list_pulp_repos(pulp_rpm_commands["list_repositories"], "repositories", logger) + + +def get_all_containers(logger) -> List[str]: + """Get all container repository names from Pulp.""" + return _list_pulp_repos(pulp_container_commands["list_repositories"], "container repositories", logger) + + +def get_all_file_repositories(logger) -> List[str]: + """Get all file repository names from Pulp.""" + return _list_pulp_repos(pulp_file_commands["list_repositories"], "file repositories", logger) + + +def get_all_python_repositories(logger) -> List[str]: + """Get all Python repository names from Pulp.""" + return _list_pulp_repos(pulp_python_commands["list_repositories"], "Python repositories", logger) + + # ============================================================================= # CLEANUP FUNCTIONS # ============================================================================= @@ -305,17 +317,23 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any] Args: user_input: User-provided image name (e.g., registry.k8s.io/pause) + or Pulp repository name (e.g., 
container_repo_registry.k8s.io_pause) + when called from cleanup_containers=all """ result = {"name": user_input, "type": "container", "status": "Failed", "message": ""} - # Validate format - is_valid, error_msg = validate_container_format(user_input) - if not is_valid: - result["message"] = error_msg - return result + # Check if input is already a Pulp repository name (from get_all_containers) + if user_input.startswith('container_repo_'): + pulp_name = user_input + else: + # Validate format + is_valid, error_msg = validate_container_format(user_input) + if not is_valid: + result["message"] = error_msg + return result - # Convert to Pulp naming convention - pulp_name = convert_to_pulp_container_name(user_input) + # Convert to Pulp naming convention + pulp_name = convert_to_pulp_container_name(user_input) # Check existence if not container_exists(pulp_name, logger): @@ -351,87 +369,20 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any] return result -def file_exists_in_pulp(name: str, logger) -> Tuple[bool, str, str]: - """Check if file content exists in Pulp file repository. 
- - Returns: - Tuple of (exists, repo_name, content_href) - """ - try: - # List file repositories and search for the content - repo_list = run_cmd(pulp_file_commands["list_repositories"], logger) - if repo_list["rc"] != 0: - return False, "", "" - - repos = safe_json_parse(repo_list["stdout"]) - for repo in repos: - repo_name = repo.get('name', '') - # Check if this repo contains our file - content_list = run_cmd( - f"pulp file content list --repository {repo_name} --relative-path '{name}'", - logger - ) - if content_list["rc"] == 0: - contents = safe_json_parse(content_list["stdout"]) - if contents: - return True, repo_name, contents[0].get('pulp_href', '') - - return False, "", "" - except (OSError, ValueError): - return False, "", "" - - -def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) -> Tuple[bool, str]: - """Delete file content from Pulp. - - Returns: - Tuple of (success, message) - """ - try: - messages = [] - - # 1. Remove content from repository - if content_href: - remove_result = run_cmd( - f"pulp file repository content remove --repository {repo_name} " - f"--href {content_href}", - logger - ) - if remove_result["rc"] == 0: - messages.append("Content removed from repository") - else: - # Try alternative: modify repository to remove content - run_cmd( - f"pulp file repository content modify --repository {repo_name} " - f"--remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'", - logger - ) - - # 2. Delete distribution if exists - dist_result = run_cmd(pulp_file_commands["list_distributions"], logger) - if dist_result["rc"] == 0: - dists = safe_json_parse(dist_result["stdout"]) - for d in dists: - if d.get('name', '') == name or name in d.get('name', ''): - run_cmd(pulp_file_commands["delete_distribution"] % d.get('name', ''), logger) - messages.append("Distribution deleted") - - # 3. 
Try to delete the file repository if it's named after the artifact - repo_del = run_cmd(pulp_file_commands["delete_repository"] % name, logger) - if repo_del["rc"] == 0: - messages.append("Repository deleted") - - return True, "; ".join(messages) if messages else "Removed from Pulp" - - except Exception as e: - return False, f"Pulp deletion error: {str(e)}" - - -def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: +def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger, + pulp_repo_name: str = None) -> Dict[str, Any]: """Cleanup a pip module from Pulp Python repository. Pip modules are stored as: pip_module== e.g., pip_modulecffi==1.17.1 + + Args: + name: Content name (e.g., 'cffi==1.17.1') used for status files and filesystem + base_path: Base path for status files + repo_store_path: Root store path for filesystem cleanup + logger: Logger instance + pulp_repo_name: Optional Pulp repo name override (from cleanup_files=all). + If None, derived from name. 
""" result = {"name": name, "type": "pip_module", "status": "Failed", "message": ""} messages = [] @@ -439,12 +390,12 @@ def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger) content_removed = False try: - # Pulp Python repo name format: pip_module - # User input could be "cffi==1.17.1" or "pip_modulecffi==1.17.1" - if name.startswith("pip_module"): - pulp_repo_name = name - else: - pulp_repo_name = f"pip_module{name}" + # Use provided Pulp repo name or derive from content name + if not pulp_repo_name: + if name.startswith("pip_module"): + pulp_repo_name = name + else: + pulp_repo_name = f"pip_module{name}" logger.info(f"Looking for Python repository: {pulp_repo_name}") @@ -530,11 +481,21 @@ def get_pulp_file_repo_name(name: str, file_type: str) -> str: return name -def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: +def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_store_path: str, logger, + pulp_repo_name: str = None) -> Dict[str, Any]: """Cleanup artifact from Pulp File repository. Handles: tarball, git, manifest, ansible_galaxy_collection All use 'pulp file' repository type with type-specific naming conventions. + + Args: + name: Content name (e.g., 'calico-v3.30.3') used for status files and filesystem + file_type: Artifact type (e.g., 'manifest', 'tarball') + base_path: Base path for status files + repo_store_path: Root store path for filesystem cleanup + logger: Logger instance + pulp_repo_name: Optional Pulp repo name override (from cleanup_files=all). + If None, derived from name + file_type. 
""" result = {"name": name, "type": file_type, "status": "Failed", "message": ""} messages = [] @@ -543,8 +504,9 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_stor content_removed = False try: - # Get the expected Pulp repository name - pulp_repo_name = get_pulp_file_repo_name(name, file_type) + # Use provided Pulp repo name or derive from content name + if not pulp_repo_name: + pulp_repo_name = get_pulp_file_repo_name(name, file_type) logger.info(f"Looking for {file_type} repository: {pulp_repo_name}") # Check if repository exists directly @@ -620,21 +582,66 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_stor return result +def parse_pulp_file_repo_name(repo_name: str) -> Tuple[str, str, str]: + """Parse a Pulp file/python repository name into components. + + Pulp repo names from cleanup_files=all follow the format: + {arch}_{type}{content_name} + e.g.: + x86_64_manifestcalico-v3.30.3 -> ('x86_64', 'manifest', 'calico-v3.30.3') + x86_64_pip_modulecffi==1.17.1 -> ('x86_64', 'pip_module', 'cffi==1.17.1') + aarch64_isocuda-run -> ('aarch64', 'iso', 'cuda-run') + + Returns: + Tuple of (arch, file_type, content_name). + Returns (None, None, repo_name) if parsing fails. + """ + for arch in ARCH_SUFFIXES: + prefix = f"{arch}_" + if repo_name.startswith(prefix): + remainder = repo_name[len(prefix):] + # Try longest type prefixes first to avoid partial matches + # e.g., 'ansible_galaxy_collection' before 'pip_module' before 'iso' + for file_type in sorted(CLEANUP_FILE_TYPES, key=len, reverse=True): + if remainder.startswith(file_type): + content_name = remainder[len(file_type):] + return arch, file_type, content_name + return arch, None, remainder + return None, None, repo_name + + def cleanup_file(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: """Cleanup a file artifact. 
Routes to appropriate handler: - pip_module: Pulp Python repository - tarball, git, manifest, ansible_galaxy_collection: Pulp File repository + + When called from cleanup_files=all, names are Pulp repo names like + 'x86_64_manifestcalico-v3.30.3'. These are parsed to extract the type + and content name for correct routing, status updates, and filesystem cleanup. """ - file_type = detect_file_type(name) + # Try parsing as a Pulp repo name (from cleanup_files=all) + arch, parsed_type, content_name = parse_pulp_file_repo_name(name) + + if parsed_type: + # Name is a Pulp repo name — use parsed type and content name + file_type = parsed_type + pulp_name = name + artifact_name = content_name + logger.info(f"Parsed Pulp repo name: arch={arch}, type={file_type}, content={artifact_name}") + else: + # Name is a user-provided content name — detect type from filesystem + file_type = detect_file_type(name) + pulp_name = None + artifact_name = name # Handle pip modules separately - they use Python repositories if file_type == "pip_module": - return cleanup_pip_module(name, base_path, repo_store_path, logger) + return cleanup_pip_module(artifact_name, base_path, repo_store_path, logger, pulp_repo_name=pulp_name) # All other file types use Pulp File repository - return cleanup_file_repository(name, file_type, base_path, repo_store_path, logger) + return cleanup_file_repository(artifact_name, file_type, base_path, repo_store_path, logger, pulp_repo_name=pulp_name) # ============================================================================= @@ -670,6 +677,10 @@ def cleanup_content_directory(content_name: str, content_type: str, repo_store_p logger.warning(result["message"]) return result + # If content_type is None (e.g., from cleanup_files=all when detect_file_type fails), + # search all known type directories to find and delete the content + types_to_search = [content_type] if content_type else CLEANUP_FILE_TYPES + try: for arch in ARCH_SUFFIXES: # Walk version directories 
(e.g., rhel/10.0) @@ -678,21 +689,27 @@ def cleanup_content_directory(content_name: str, content_type: str, repo_store_p continue for version_dir in glob.glob(f"{arch_path}/rhel/*/"): - content_dir = os.path.join(version_dir, content_type, content_name) - if os.path.exists(content_dir): - logger.info(f"Removing content directory: {content_dir}") - if os.path.isdir(content_dir): - shutil.rmtree(content_dir) - else: - os.remove(content_dir) - removed_dirs.append(content_dir) + for search_type in types_to_search: + content_dir = os.path.join(version_dir, search_type, content_name) + if os.path.exists(content_dir): + logger.info(f"Removing content directory: {content_dir}") + if os.path.isdir(content_dir): + shutil.rmtree(content_dir) + else: + os.remove(content_dir) + removed_dirs.append(content_dir) + # Remove parent type directory if now empty + type_dir = os.path.join(version_dir, search_type) + if os.path.isdir(type_dir) and not os.listdir(type_dir): + os.rmdir(type_dir) + logger.info(f"Removed empty directory: {type_dir}") if removed_dirs: result["status"] = "Success" result["message"] = f"Removed content: {', '.join(removed_dirs)}" else: result["message"] = (f"No filesystem content found for " - f"'{content_name}' under {content_type}") + f"'{content_name}' under {types_to_search}") logger.info(result["message"]) except Exception as e: @@ -702,6 +719,71 @@ def cleanup_content_directory(content_name: str, content_type: str, repo_store_p return result +def cleanup_all_file_content_directories(repo_store_path: str, logger) -> Dict[str, Any]: + """Remove all file-type content directories from the filesystem. + + Called during cleanup_files=all to ensure all locally stored files + under /offline_repo/cluster are deleted. + + Walks through all architectures, OS versions, and file-type directories, + removing all content within each file-type folder. 
+ + Args: + repo_store_path: Root store path (e.g., '/opt/omnia') + logger: Logger instance + + Returns: + Dict with status and message + """ + result = {"name": "all_file_content", "type": "filesystem_bulk", + "status": "Failed", "message": ""} + removed_dirs = [] + + cluster_path = os.path.join(repo_store_path, "offline_repo", "cluster") + if not os.path.exists(cluster_path): + result["message"] = f"Content store path not found: {cluster_path}" + logger.warning(result["message"]) + return result + + try: + for arch in ARCH_SUFFIXES: + arch_path = os.path.join(cluster_path, arch) + if not os.path.isdir(arch_path): + continue + + for version_dir in glob.glob(f"{arch_path}/rhel/*/"): + for file_type in CLEANUP_FILE_TYPES: + type_dir = os.path.join(version_dir, file_type) + if os.path.isdir(type_dir): + # Remove all content within this type directory + for item in os.listdir(type_dir): + item_path = os.path.join(type_dir, item) + logger.info(f"Removing: {item_path}") + if os.path.isdir(item_path): + shutil.rmtree(item_path) + else: + os.remove(item_path) + removed_dirs.append(item_path) + # Remove the empty type directory itself + if not os.listdir(type_dir): + os.rmdir(type_dir) + logger.info(f"Removed empty directory: {type_dir}") + + if removed_dirs: + result["status"] = "Success" + result["message"] = f"Removed {len(removed_dirs)} content items from filesystem" + logger.info(result["message"]) + else: + result["message"] = "No file content found on filesystem" + logger.info(result["message"]) + + except Exception as e: + result["message"] = f"Bulk filesystem cleanup error: {str(e)}" + logger.error(f"Failed bulk filesystem cleanup: {e}") + + return result + + # ============================================================================= # STATUS FILE UPDATES # ============================================================================= @@ -710,7 +792,6 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> Dict[ """Remove RPMs that belong 
to a specific repository from status files. Uses the repo_name column in status.csv to accurately identify RPMs from the repository. - Now that all repo_names include architecture prefixes, the logic is simplified. Args: repo_name: Repository name (e.g., 'x86_64_appstream', 'aarch64_epel') @@ -756,14 +837,10 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> Dict[ row_type = row.get('type', '') rpm_repo = row.get('repo_name', '') - logger.info(f"Processing row: {row}") # For RPMs, check if they belong to the deleted repository - if row_type in ('rpm', 'rpm_repo', 'rpm_file'): - if has_repo_column and rpm_repo == repo_name: - removed = True - logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)") - else: - rows.append(row) + if row_type in ('rpm', 'rpm_repo', 'rpm_file') and has_repo_column and rpm_repo == repo_name: + removed = True + logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)") else: rows.append(row) @@ -784,11 +861,14 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> Dict[ return {} def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> Dict[str, List[str]]: - """Remove artifact from status.csv files and return affected software names by architecture. + """Remove artifact entries from status.csv files and return affected software names. + + Removes entries so they are re-downloaded on next local_repo run + (local_repo processes packages that are Failed or not present in status.csv). 
Args: artifact_name: Name of the artifact to remove - artifact_type: Type of artifact (git, tarball, pip_module) + artifact_type: Type of artifact (git, tarball, pip_module, image) base_path: Base path for status files logger: Logger instance @@ -807,7 +887,6 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: fieldnames = reader.fieldnames for row in reader: name = row.get('name', '') - row_type = row.get('type', '') # Match logic based on type should_remove = False if artifact_type == 'image': @@ -861,8 +940,6 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty # Normalize input: convert to arch_software_map if needed if isinstance(affected_software, list): - # Legacy list input - this should not happen with new remove_rpms_from_repository - # but we keep it for backward compatibility logger.warning("Received list input to mark_software_partial, applying to all architectures (legacy behavior)") arch_software_map = {arch: affected_software for arch in ARCH_SUFFIXES} else: @@ -900,17 +977,18 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty except OSError as e: logger.error(f"Failed to update software.csv: {e}") -def software_has_rpms(software_name: str, arch: str, base_path: str, logger) -> bool: - """Check if a software has any RPM dependencies in its status.csv. +def software_has_type(software_name: str, arch: str, base_path: str, logger, type_values: tuple) -> bool: + """Check if a software has entries of given types in its status.csv. 
Args: software_name: Name of the software arch: Architecture (x86_64 or aarch64) base_path: Base path for status files logger: Logger instance + type_values: Tuple of type strings to check for (e.g., ('rpm', 'rpm_repo')) Returns: - True if software has RPM entries, False otherwise + True if software has matching entries, False otherwise """ status_file = f"{base_path}/{arch}/{software_name}/status.csv" if not os.path.exists(status_file): @@ -920,32 +998,30 @@ def software_has_rpms(software_name: str, arch: str, base_path: str, logger) -> with open(status_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: - if row.get('type', '').lower() in ('rpm', 'rpm_repo'): + if row.get('type', '').lower() in type_values: return True return False except OSError as e: - logger.error(f"Error checking RPMs for {software_name}: {e}") + logger.error(f"Error checking {type_values} for {software_name}: {e}") return False -def mark_all_software_partial(base_path: str, logger): +def mark_all_software_partial_by_type(base_path: str, logger, type_values: tuple, type_label: str): """Mark software entries as partial in software.csv for all architectures. - This is called when cleanup_repos=all to mark software as partial - since all RPM repositories are being deleted. - Only marks software that actually has RPM dependencies. + Only marks software that actually has dependencies of the given types. 
Args: base_path: Base path for software.csv files logger: Logger instance + type_values: Tuple of type strings to check (e.g., ('rpm', 'rpm_repo')) + type_label: Human-readable label for logging (e.g., 'RPM', 'container') """ - logger.info("Marking software with RPM dependencies as partial (cleanup_repos=all)") + logger.info(f"Marking software with {type_label} dependencies as partial") try: for arch in ARCH_SUFFIXES: software_file = f"{base_path}/{arch}/software.csv" - logger.info( - f"Processing software file: {software_file}" - ) + logger.info(f"Processing software file: {software_file}") if not os.path.exists(software_file): logger.info(f"Software file not found: {software_file}") @@ -959,13 +1035,12 @@ def mark_all_software_partial(base_path: str, logger): for row in reader: software_name = row.get('name', '') if row.get('status') == 'success': - # Only mark as partial if software has RPM dependencies - if software_has_rpms(software_name, arch, base_path, logger): + if software_has_type(software_name, arch, base_path, logger, type_values): row['status'] = 'partial' updated = True - logger.info(f"Marked '{software_name}' as partial in {arch}/software.csv (has RPM deps)") + logger.info(f"Marked '{software_name}' as partial in {arch}/software.csv (has {type_label} deps)") else: - logger.info(f"Skipping '{software_name}' - no RPM dependencies") + logger.info(f"Skipping '{software_name}' - no {type_label} dependencies") rows.append(row) if fieldnames and rows and updated: @@ -975,7 +1050,60 @@ def mark_all_software_partial(base_path: str, logger): writer.writerows(rows) logger.info(f"Successfully updated {software_file}") except OSError as e: - logger.error(f"Failed to mark all software as partial: {e}") + logger.error(f"Failed to mark all software as partial ({type_label}): {e}") + + +def remove_all_from_status_files(artifact_type: str, base_path: str, logger) -> Dict[str, List[str]]: + """Remove all entries of a given type from status.csv files. 
+ + Used by cleanup_containers=all and cleanup_files=all to bulk-remove + all entries of a specific artifact type from all status.csv files. + Removed entries will be re-downloaded on next local_repo run. + + Args: + artifact_type: Type of artifact to remove (e.g., 'image', 'tarball', 'git', 'pip_module') + base_path: Base path for status files + logger: Logger instance + + Returns: + Dict mapping architecture to list of affected software names + """ + affected_software = {} + try: + for arch in ARCH_SUFFIXES: + arch_affected = [] + for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"): + rows = [] + removed = False + with open(status_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + if row.get('type', '') == artifact_type: + removed = True + logger.info(f"Removing '{row.get('name', '')}' ({artifact_type}) from {status_file}") + else: + rows.append(row) + + if removed and fieldnames: + with open(status_file, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + software_name = os.path.basename(os.path.dirname(status_file)) + if software_name not in arch_affected: + arch_affected.append(software_name) + + if arch_affected: + affected_software[arch] = arch_affected + + logger.info(f"remove_all_from_status_files({artifact_type}) returning: {affected_software}") + return affected_software + except OSError as e: + logger.error(f"Failed to remove all {artifact_type} from status files: {e}") + return {} + def write_cleanup_status(results: List[Dict], base_path: str): """Write cleanup results to status file.""" @@ -1022,12 +1150,11 @@ def run_module(): os.makedirs(base_path, exist_ok=True) logger = setup_standard_logger(log_dir) - # Handle 'all' keyword for repositories only + # Handle 'all' keyword for repositories cleanup_all_repos = ( cleanup_repos and len(cleanup_repos) == 1 and 
cleanup_repos[0].lower() == 'all' ) - #if cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all': if cleanup_all_repos: logger.info("cleanup_repos='all' - fetching all repositories from Pulp") cleanup_repos = get_all_repositories(logger) @@ -1038,6 +1165,38 @@ def run_module(): ) logger.info(f"Found {len(cleanup_repos)} repositories to cleanup: {cleanup_repos}") + # Handle 'all' keyword for containers + cleanup_all_containers = ( + cleanup_containers and len(cleanup_containers) == 1 and + cleanup_containers[0].lower() == 'all' + ) + if cleanup_all_containers: + logger.info("cleanup_containers='all' - fetching all container repositories from Pulp") + cleanup_containers = get_all_containers(logger) + if not cleanup_containers: + module.fail_json( + msg="Failed to retrieve container repository list from Pulp. " + "Please check if Pulp services are running." + ) + logger.info(f"Found {len(cleanup_containers)} containers to cleanup: {cleanup_containers}") + + # Handle 'all' keyword for files + cleanup_all_files = ( + cleanup_files and len(cleanup_files) == 1 and + cleanup_files[0].lower() == 'all' + ) + if cleanup_all_files: + logger.info("cleanup_files='all' - fetching all file and Python repositories from Pulp") + file_repos = get_all_file_repositories(logger) + python_repos = get_all_python_repositories(logger) + cleanup_files = file_repos + python_repos + if not cleanup_files: + module.fail_json( + msg="Failed to retrieve file/Python repository list from Pulp. " + "Please check if Pulp services are running." 
+ ) + logger.info(f"Found {len(cleanup_files)} file repos to cleanup: {cleanup_files}") + logger.info( f"Starting cleanup - repos: {cleanup_repos}, " f"containers: {cleanup_containers}, files: {cleanup_files}" @@ -1053,20 +1212,49 @@ def run_module(): # If cleanup_repos=all, mark software with RPM dependencies as partial if cleanup_all_repos and any(r['status'] == 'Success' for r in all_results if r['type'] == 'repository'): - mark_all_software_partial(base_path, logger) + mark_all_software_partial_by_type(base_path, logger, ('rpm', 'rpm_repo'), 'RPM') # Process containers + container_cleanup_success = False for container in cleanup_containers: result = cleanup_container(container, base_path, logger) all_results.append(result) + if result['status'] == 'Success': + container_cleanup_success = True logger.info(f"Container {container}: {result['status']} - {result['message']}") + # If cleanup_containers=all, bulk-remove all image entries from status files and mark software partial + if cleanup_all_containers and container_cleanup_success: + remove_all_from_status_files('image', base_path, logger) + mark_all_software_partial_by_type(base_path, logger, ('image',), 'container') + # Process files + file_cleanup_success = False for file in cleanup_files: result = cleanup_file(file, base_path, repo_store_path, logger) all_results.append(result) + if result['status'] == 'Success': + file_cleanup_success = True logger.info(f"File {file}: {result['status']} - {result['message']}") + # If cleanup_files=all, bulk-remove all file-type entries from status files, + # clean all local file content directories, and mark software partial + if cleanup_all_files and file_cleanup_success: + for ftype in CLEANUP_FILE_TYPES: + remove_all_from_status_files(ftype, base_path, logger) + cleanup_all_file_content_directories(repo_store_path, logger) + mark_all_software_partial_by_type(base_path, logger, tuple(CLEANUP_FILE_TYPES), 'file') + + # Run orphan cleanup once after all deletions to 
reclaim disk space + any_success = any(r['status'] == 'Success' for r in all_results) + if any_success: + logger.info("Running global orphan cleanup to reclaim disk space...") + orphan_result = run_cmd(pulp_rpm_commands["orphan_cleanup"], logger) + if orphan_result["rc"] == 0: + logger.info("Orphan cleanup completed successfully") + else: + logger.warning(f"Orphan cleanup warning: {orphan_result['stderr']}") + # Write status file status_file = write_cleanup_status(all_results, base_path) diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml index 6f54e5f45f..97489cb3c8 100644 --- a/local_repo/pulp_cleanup.yml +++ b/local_repo/pulp_cleanup.yml @@ -18,9 +18,15 @@ # # Repository cleanup (include architecture prefix) # ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel,aarch64_epel" # ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_appstream" -# ansible-playbook pulp_cleanup.yml -e "cleanup_containers=nginx,redis" +# ansible-playbook pulp_cleanup.yml -e "cleanup_containers=registry.k8s.io/pause,docker.io/library/nginx" # ansible-playbook pulp_cleanup.yml -e "cleanup_files=git,chart-0.48.0" -# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel -e cleanup_containers=nginx -e force=true" +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel -e cleanup_containers=docker.io/library/nginx -e force=true" +# +# # Cleanup ALL artifacts of a type: +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=all" +# ansible-playbook pulp_cleanup.yml -e "cleanup_containers=all" +# ansible-playbook pulp_cleanup.yml -e "cleanup_files=all" +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=all" -e "cleanup_containers=all" -e "cleanup_files=all" # # # Examples: x86_64_epel, aarch64_epel, x86_64_appstream, aarch64_baseos # # Note: Use architecture prefix (x86_64_ or aarch64_) for repository names diff --git a/prepare_oim/roles/deploy_containers/pulp/templates/settings_template.j2 
b/prepare_oim/roles/deploy_containers/pulp/templates/settings_template.j2 index 63936346f0..67e07fe52f 100644 --- a/prepare_oim/roles/deploy_containers/pulp/templates/settings_template.j2 +++ b/prepare_oim/roles/deploy_containers/pulp/templates/settings_template.j2 @@ -11,4 +11,4 @@ TOKEN_AUTH_DISABLED=True {% endfor -%} {% set random_key = ''.join(random_key) -%} SECRET_KEY='{{ random_key }}' - +REMOTE_CONTENT_DOWNLOAD_TIMEOUT=2700 From 334dbf076e8dbb9e1118a6c3e1c403e83fa6679e Mon Sep 17 00:00:00 2001 From: pullan1 Date: Fri, 27 Feb 2026 15:15:17 +0530 Subject: [PATCH 26/41] updated copy right info Signed-off-by: pranavkumar7498 --- common/library/module_utils/local_repo/download_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/library/module_utils/local_repo/download_common.py b/common/library/module_utils/local_repo/download_common.py index 58185c2844..89bef2f0f8 100644 --- a/common/library/module_utils/local_repo/download_common.py +++ b/common/library/module_utils/local_repo/download_common.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -1177,4 +1177,4 @@ def process_rpm_file(package, repo_store_path, status_file_path, cluster_os_type # Write the status to the file write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) logger.info("#" * 30 + f" {process_rpm_file.__name__} end " + "#" * 30) - return status \ No newline at end of file + return status From fcb069fbc136d05bddcd0a773ed9229ab2662632 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 27 Feb 2026 15:47:03 +0530 Subject: [PATCH 27/41] Update main.yml --- discovery/roles/discovery_validations/vars/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discovery/roles/discovery_validations/vars/main.yml b/discovery/roles/discovery_validations/vars/main.yml index f0191f23aa..d25a5a5708 100644 --- a/discovery/roles/discovery_validations/vars/main.yml +++ b/discovery/roles/discovery_validations/vars/main.yml @@ -93,8 +93,8 @@ openldap_container_missing_msg: "{{ '\x1b[31m' }}\ \n\ To resolve this issue:\n\ 1. Ensure the 'openldap' package is added to software_config.json\n\ - 2. Run local_repo.yml to download OpenLDAP packages\n\ - 3. Re-run prepare_oim.yml to start the OpenLDAP container and generate certificates\n\ + 2. Re-run prepare_oim.yml to start the OpenLDAP container and generate certificates\n\ + 3. Run local_repo.yml to download OpenLDAP packages\n\ 4. 
Then re-run discovery.yml\n\ \n\ For more information, refer to the Omnia documentation on OpenLDAP configuration.\n\ From 9d4fa52fc0c57d0198d9bdcb627a0ccff563ccb7 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 27 Feb 2026 16:28:23 +0530 Subject: [PATCH 28/41] Update main.yml --- upgrade/roles/upgrade_cluster/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_cluster/vars/main.yml b/upgrade/roles/upgrade_cluster/vars/main.yml index fc50eacddb..6e141b6538 100644 --- a/upgrade/roles/upgrade_cluster/vars/main.yml +++ b/upgrade/roles/upgrade_cluster/vars/main.yml @@ -14,5 +14,5 @@ --- storage_config_path: "/opt/omnia/input/project_default/storage_config.yml" storage_content: "{{ lookup('file', storage_config_path, errors='ignore') | default('') }}" -storage_yaml: "{{ storage_content | length > 0 | ternary(storage_content | from_yaml, {}) }}" +storage_yaml: "{{ storage_content | from_yaml | default({}) }}" nfs_params: "{{ storage_yaml.nfs_client_params | default([]) }}" From a4129fff2133cd10ffb41cb449b2c84b39b63fd8 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Fri, 27 Feb 2026 16:46:12 +0530 Subject: [PATCH 29/41] Delete scenario fix --- discovery/roles/slurm_config/tasks/detect_busy_nodes.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/slurm_config/tasks/detect_busy_nodes.yml b/discovery/roles/slurm_config/tasks/detect_busy_nodes.yml index f88a19b440..613fba6bb0 100644 --- a/discovery/roles/slurm_config/tasks/detect_busy_nodes.yml +++ b/discovery/roles/slurm_config/tasks/detect_busy_nodes.yml @@ -48,7 +48,7 @@ - name: Populate busy_nodes list ansible.builtin.set_fact: - busy_nodes: "{{ (busy_nodes | default({})).combine({node_to_remove: current_jobs.stdout | int}) }}" + busy_nodes: "{{ (busy_nodes | default({})) | combine({node_to_remove: current_jobs.stdout | int}) }}" when: - current_jobs.stdout | int > 0 From 66df2ca5d25571bfb3312bbf5facff3af9a295dc Mon Sep 17 
00:00:00 2001 From: Nagachandan-P Date: Fri, 27 Feb 2026 12:40:09 +0000 Subject: [PATCH 30/41] default values for slurm node info --- discovery/roles/slurm_config/defaults/main.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index 955e4c2a37..3207c0d703 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -19,9 +19,10 @@ slurmctld_service_default_path: '/usr/lib/systemd/system/slurmctld.service' slurmd_service_default_path: '/usr/lib/systemd/system/slurmd.service' slurmdbd_service_default_path: '/usr/lib/systemd/system/slurmdbd.service' sys_env_path: '/etc/environment' -default_real_memory: 4 # This is the minimum default memory in GiB +default_real_memory: 884736 default_threadspercore: 1 -default_corespersocket: 1 +default_corespersocket: 72 +default_sockets: 2 share_prefix: "/" conf_path_items: {} conf_dict_items: {} From 53b70f027bcb2e2474d9f512b1a44f3a8c007371 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Sun, 1 Mar 2026 20:06:26 +0530 Subject: [PATCH 31/41] Added retry for unreachable iDRACs --- utils/roles/idrac_pxe_boot/tasks/main.yml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/utils/roles/idrac_pxe_boot/tasks/main.yml b/utils/roles/idrac_pxe_boot/tasks/main.yml index 70fb4dde2c..7c153f4a24 100644 --- a/utils/roles/idrac_pxe_boot/tasks/main.yml +++ b/utils/roles/idrac_pxe_boot/tasks/main.yml @@ -19,11 +19,14 @@ idrac_password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" validate_certs: false register: lc_check_status - ignore_errors: true - -- name: Set reboot type - ansible.builtin.set_fact: - reboot_type: "{{ 'ForceRestart' if force_restart else 'GracefulRestart' }}" + until: + - lc_check_status.lc_status_info.LCReady is defined + - lc_check_status.lc_status_info.LCReady + retries: 3 + delay: 5 + # 
ignore_errors: true + # free strategy should be used to avoid blocking on failed hosts + # but issue will be summarizing/syncing summary report of all at the end - name: IDRAC ops when ready when: @@ -50,7 +53,7 @@ username: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" validate_certs: false - reset_type: ForceRestart + reset_type: "{{ 'ForceRestart' if force_restart else 'GracefulRestart' }}" when: restart_host register: restart_op failed_when: false From 8a6d40bb053b2d10f32d7386a8bc618d45fe3901 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Mon, 2 Mar 2026 09:08:01 +0000 Subject: [PATCH 32/41] multi-node mpi jobs firewall permission --- ...-group-login_compiler_node_aarch64.yaml.j2 | 32 +++++++++++++++++++ ...i-group-login_compiler_node_x86_64.yaml.j2 | 32 +++++++++++++++++++ .../ci-group-login_node_aarch64.yaml.j2 | 32 +++++++++++++++++++ .../ci-group-login_node_x86_64.yaml.j2 | 32 +++++++++++++++++++ .../ci-group-slurm_node_aarch64.yaml.j2 | 30 +++++++++++++++++ .../ci-group-slurm_node_x86_64.yaml.j2 | 30 +++++++++++++++++ .../roles/slurm_config/defaults/main.yml | 2 +- 7 files changed, 189 insertions(+), 1 deletion(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 36332685ac..fe6966c4be 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -315,6 +315,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for
ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. + read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 51121a2e82..1ee1fce5e1 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -317,6 +317,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + 
NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. + read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index d6071116ac..cdea0cd340 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -171,6 +171,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 524553bd55..b744859381 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -174,6 +174,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 0f25c81307..981b283a99 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -386,6 +386,36 @@ firewall-cmd --permanent --add-service=ssh firewall-cmd --permanent --add-port="${SRUN_RANGE}"/tcp firewall-cmd --permanent --add-port="${SLURMD_PORT}"/tcp + + # Add PXE network to trusted zone for ORTE communication + echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + # Calculate PXE subnet using admin IP and netmask bits + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + firewall-cmd --reload echo "[INFO] Unmounting controller slurm.conf directory from $CTLD_SLURM_DIR_MNT" diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index d21fcf9c5c..b914abdecd 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -403,6 +403,36 @@ firewall-cmd --permanent --add-service=ssh firewall-cmd --permanent --add-port="${SRUN_RANGE}"/tcp firewall-cmd --permanent --add-port="${SLURMD_PORT}"/tcp + + # Add PXE network to trusted zone for ORTE communication + echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + # Calculate PXE subnet using admin IP and netmask bits + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + firewall-cmd --reload echo "[INFO] Unmounting controller slurm.conf directory from $CTLD_SLURM_DIR_MNT" diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index edee927fc9..68770c386c 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -19,7 +19,7 @@ slurmctld_service_default_path: '/usr/lib/systemd/system/slurmctld.service' slurmd_service_default_path: '/usr/lib/systemd/system/slurmd.service' slurmdbd_service_default_path: '/usr/lib/systemd/system/slurmdbd.service' sys_env_path: '/etc/environment' -default_real_memory: 884736 +default_real_memory: 864 default_threadspercore: 1 default_corespersocket: 72 default_sockets: 2 From 884011da830f7d28c07df8d83b3ffdfbe1cb5c5e Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Mon, 2 Mar 2026 15:45:07 +0530 Subject: [PATCH 33/41] Updated gpu default for slurm conf pxe boot with retries added hwloc for slurm packages --- .../roles/slurm_config/defaults/main.yml | 10 +++--- .../slurm_config/tasks/build_slurm_conf.yml | 6 ---- discovery/roles/slurm_config/tasks/confs.yml | 31 +++++++---------- discovery/roles/slurm_config/vars/main.yml | 5 --- .../aarch64/rhel/10.0/slurm_custom.json | 5 ++- .../config/x86_64/rhel/10.0/slurm_custom.json | 5 ++- 
utils/roles/idrac_pxe_boot/tasks/main.yml | 33 ++++++++++++------- utils/roles/idrac_pxe_boot/vars/main.yml | 3 ++ utils/set_pxe_boot.yml | 16 ++++++++- 9 files changed, 64 insertions(+), 50 deletions(-) diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index edee927fc9..aa256ac085 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -35,6 +35,7 @@ __default_config: ConstrainDevices: true ConstrainRAMSpace: true ConstrainSwapSpace: true + slurm: SlurmUser: "{{ slurm_user }}" SlurmctldPort: 6817 @@ -51,7 +52,10 @@ __default_config: PrologFlags: contain JobAcctGatherType: jobacct_gather/linux JobAcctGatherFrequency: 30 - SelectType: select/linear + SelectType: select/cons_tres + GresTypes: gpu + SelectTypeParameters: CR_Core_Memory + SlurmdParameters: l3cache_as_socket # Requires hwloc v2. SlurmctldLogFile: "/var/log/slurm/slurmctld.log" SlurmdLogFile: "/var/log/slurm/slurmd.log" SlurmctldPidFile: /var/run/slurmctld.pid @@ -62,6 +66,7 @@ __default_config: SlurmdTimeout: 300 Epilog: "/etc/slurm/epilog.d/logout_user.sh" PluginDir: "{{ plugin_slurm_dir }}" + MaxNodeCount: 65000 NodeSet: - NodeSet: "{{ slurm_partition_name }}" Feature: "{{ slurm_partition_name }}" @@ -73,9 +78,6 @@ __default_config: Nodes: ALL MaxTime: INFINITE State: UP - # S_P_ARRAY type paramater to be provided this way - # Epilog: - # - Epilog: "/etc/slurm/epilog.d/logout_user.sh" slurmdbd: AuthType: auth/munge LogFile: "/var/log/slurm/slurmdbd.log" diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index e322c4f3f1..d930f64418 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -46,12 +46,6 @@ when: node_params is defined and node_params no_log: "{{ _no_log }}" -- name: Add gpu parameters to slurm conf - 
ansible.builtin.set_fact: - apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" - when: gpu_params is defined and gpu_params - no_log: "{{ _no_log }}" - - name: Add dbd parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(dbd_slurm_conf))}) }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 1601a81334..77a7427a75 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -384,26 +384,17 @@ - name: Handle user choice - ABORT ansible.builtin.fail: - msg: | - ================================================================================ - PLAYBOOK ABORTED BY USER - ================================================================================ - - You chose to abort the playbook (Option A). - - Next Steps: - 1. Cancel running jobs manually: - {% for node in busy_nodes.keys() %} - scancel -w {{ node }} - {% endfor %} - - 2. Or wait for jobs to complete naturally - - 3. Re-run this playbook to remove the nodes - - Idle nodes (if any) have already been removed from the cluster. - - ================================================================================ + msg: + - "===============================================================================" + - "PLAYBOOK ABORTED BY USER" + - "===============================================================================" + - "You chose to abort the playbook (Option A)." + - "Next Steps:" + - "1. Cancel running jobs manually: 'scancel -w '" + - "2. Or wait for jobs to complete naturally" + - "3. Re-run this playbook to remove the nodes" + - "Idle nodes (if any) have already been removed from the cluster." 
+ - "===============================================================================" when: - busy_nodes is defined - busy_nodes | length > 0 diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 37e6ff9869..fca080a65e 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -61,11 +61,6 @@ db_dir: cmpt_dir: - /etc/slurm/epilog.d -gpu_slurm_conf: - GresTypes: gpu - SelectType: select/cons_tres - SelectTypeParameters: CR_Core_Memory - SlurmdParameters: l3cache_as_socket innodb_buffer_pool_size: 4G innodb_lock_wait_timeout: 900 conf_server: "--conf-server {{ ctld_list | join(',') }}" diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index db95f2f5fb..79380a7f80 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -7,9 +7,12 @@ {"package": "python3-firewall", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, + {"package": "hwloc", "type": "rpm", "repo_name": "aarch64_appstream"}, + {"package": "hwloc-libs", "type": "rpm", "repo_name": "aarch64_appstream"}, + {"package": "slurm", "type": "rpm", "repo_name": "aarch64_slurm_custom"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 2b33b0de90..f9f6983e86 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -5,9 +5,12 
@@ {"package": "firewalld", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, + {"package": "hwloc", "type": "rpm", "repo_name": "x86_64_appstream"}, + {"package": "hwloc-libs", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + {"package": "slurm", "type": "rpm", "repo_name": "x86_64_slurm_custom"}, + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/utils/roles/idrac_pxe_boot/tasks/main.yml b/utils/roles/idrac_pxe_boot/tasks/main.yml index 7c153f4a24..79f1ec3957 100644 --- a/utils/roles/idrac_pxe_boot/tasks/main.yml +++ b/utils/roles/idrac_pxe_boot/tasks/main.yml @@ -33,7 +33,7 @@ - lc_check_status is success - lc_check_status.lc_status_info.LCReady block: - - name: Set boot from pxe + - name: Set boot option from pxe dellemc.openmanage.idrac_boot: idrac_ip: "{{ inventory_hostname }}" idrac_user: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" @@ -72,37 +72,46 @@ - not (restart_op is changed) - name: Check LC availibility - ansible.builtin.fail: - msg: "{{ lc_check_fail_msg }}" - when: lc_check_status is failed or not (lc_check_status.lc_status_info.LCReady | default(false)) + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "{{ lc_check_fail_msg }}" + when: lc_check_status is unreachable or lc_check_status is failed or not (lc_check_status.lc_status_info.LCReady | default(false)) - name: Fail if PXE provisioning failed - ansible.builtin.fail: - msg: "{{ pxe_provisioning_fail_msg }}" + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "{{ pxe_provisioning_fail_msg }}" when: + - not reboot_failed - 
pxe_provisioning is defined - pxe_provisioning is failed - name: Fail if PXE provisioning target is unreachable - ansible.builtin.fail: - msg: "{{ unreachable_idrac_msg }}" + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "{{ unreachable_idrac_msg }}" when: + - not reboot_failed - pxe_provisioning is defined - pxe_provisioning is unreachable - name: Fail if power operation failed - ansible.builtin.fail: - msg: "Power operation failed on {{ inventory_hostname }}. Failed to restart server." + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "Power operation failed on {{ inventory_hostname }}. Failed to restart server." when: + - not reboot_failed - restart_host - not (restart_op is defined and restart_op is changed) - not (power_on_op is defined and power_on_op is changed) - name: Summarize PXE boot and power operation results - ansible.builtin.debug: - msg: >- + ansible.builtin.set_fact: + reboot_failed: false + reboot_status: >- PXE Boot: {{ 'OK' if pxe_provisioning is success else ('UNREACHABLE' if pxe_provisioning is unreachable else 'FAILED') }} | Power: {{ 'Restart OK' if (restart_op is defined and restart_op is changed) else ('On OK' if (power_on_op is defined and power_on_op is changed) else ('Skipped (no restart)' if not restart_host else 'FAILED')) }} + when: not reboot_failed diff --git a/utils/roles/idrac_pxe_boot/vars/main.yml b/utils/roles/idrac_pxe_boot/vars/main.yml index 53de8aa0e9..4bf99d7bfe 100644 --- a/utils/roles/idrac_pxe_boot/vars/main.yml +++ b/utils/roles/idrac_pxe_boot/vars/main.yml @@ -18,6 +18,9 @@ restart_host: true # Change to true for forceful reboot. by default graceful will happen force_restart: true +reboot_status: "PXE boot initiated but not completed." +reboot_failed: false + # Set boot source override mode. 
Valid values are once, continuous, or disabled boot_source_override_enabled: continuous diff --git a/utils/set_pxe_boot.yml b/utils/set_pxe_boot.yml index a46ebd7091..16e11534b0 100644 --- a/utils/set_pxe_boot.yml +++ b/utils/set_pxe_boot.yml @@ -54,13 +54,27 @@ # This configures Dell iDRAC BMCs to boot a host from PXE (network) and optionally reboots the server. # This will set the boot mode to pxe -# Note: Restart will not happen if the server is powered off, only pxe mode will be set. - name: Reboot Host via PXE hosts: bmc connection: local + strategy: host_pinned gather_facts: false roles: - role: idrac_pxe_boot # vars: # restart_host: false # By default restart will be true, set to false not to restart # force_restart: true # By default graceful_restart will happen, set to true to force restart + +- name: Synchronized Reporting + hosts: bmc + connection: local + gather_facts: false + tasks: + - name: Fail if reboot function failed + ansible.builtin.fail: + msg: "{{ reboot_status }}" + when: reboot_failed + + - name: Show passed iDRACs + ansible.builtin.debug: + msg: "{{ inventory_hostname }}: {{ reboot_status }}" From 748e34758db914ac5a428321d16649f99df364dd Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Mon, 2 Mar 2026 15:49:26 +0530 Subject: [PATCH 34/41] Pxe mapping x86 update --- examples/pxe_mapping_file.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index 4d1c4775ed..819ed1a738 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,6 +1,6 @@ FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 
+slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 login_node_x86_64,grp9,ABFG78,,login-compiler-node2,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 From d5a0bd7fde1f73bcf281e1564f8c25f507b8b918 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:51:51 +0530 Subject: [PATCH 35/41] Update main.yml Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> --- discovery/roles/slurm_config/defaults/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index aa256ac085..23879f67b6 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -35,7 +35,6 @@ __default_config: ConstrainDevices: true ConstrainRAMSpace: true ConstrainSwapSpace: true - slurm: SlurmUser: "{{ slurm_user }}" SlurmctldPort: 6817 From 2bfbbf5d32b078ff2e012a0280dfaec74ca0be52 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:01:52 +0530 Subject: [PATCH 36/41] Update pxe_mapping_file.csv Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> --- examples/pxe_mapping_file.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index 819ed1a738..f4c50517ac 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,6 +1,6 @@ FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP 
slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 login_node_x86_64,grp9,ABFG78,,login-compiler-node2,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 @@ -8,4 +8,4 @@ service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 From d192280167847ac8a270dc52e437688b30d9ef87 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:02:39 +0530 Subject: [PATCH 37/41] Update pxe_mapping_file.csv Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> From 2d6583fbee560ec76324f0152116e5911b11bed0 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:03:20 +0530 Subject: [PATCH 38/41] Update 
slurm_custom.json Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> --- input/config/aarch64/rhel/10.0/slurm_custom.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 79380a7f80..b0477f2a95 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -7,9 +7,6 @@ {"package": "python3-firewall", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, - {"package": "hwloc", "type": "rpm", "repo_name": "aarch64_appstream"}, - {"package": "hwloc-libs", "type": "rpm", "repo_name": "aarch64_appstream"}, - {"package": "slurm", "type": "rpm", "repo_name": "aarch64_slurm_custom"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } From fff42d3e9a9b4ff39c20d964c08315074f406f13 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:03:41 +0530 Subject: [PATCH 39/41] Update slurm_custom.json Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> --- input/config/x86_64/rhel/10.0/slurm_custom.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index f9f6983e86..e7962f4723 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -5,11 +5,8 @@ {"package": "firewalld", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": 
"x86_64_appstream"}, - {"package": "hwloc", "type": "rpm", "repo_name": "x86_64_appstream"}, - {"package": "hwloc-libs", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "slurm", "type": "rpm", "repo_name": "x86_64_slurm_custom"}, {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, From 75f2602e0669982bed96c214b06e561d05ac0b0d Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 3 Mar 2026 00:50:00 +0530 Subject: [PATCH 40/41] Delete when mix of busy and idle nodes, scenario for abort --- discovery/roles/slurm_config/tasks/confs.yml | 8 ++++++++ discovery/roles/slurm_config/tasks/remove_node.yml | 12 ++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 77a7427a75..2378682b7d 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -321,6 +321,14 @@ - busy_nodes | length > 0 - user_input.user_input | default('') | trim | upper == 'A' +- name: Empty Busy nodes for Force removal + ansible.builtin.set_fact: + busy_nodes: {} + when: + - busy_nodes is defined + - busy_nodes | length > 0 + - user_input.user_input | default('') | trim | upper == 'F' + - name: Remove nodes ansible.builtin.include_tasks: remove_node.yml when: diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index da6b2a72ae..ceb766898b 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -47,11 +47,19 @@ - "'slurm' in conf_merge_dict" - filtered_nodenames is defined +- name: Set partition nodes exactly as cmpt_list minus removed nodes + ansible.builtin.set_fact: + partition_nodes: "{{ cmpt_list | 
difference(nodes_in_normal_not_in_cmpt) | union(busy_nodes.keys() | default([]) | list) }}" + when: + - "'slurm' in conf_merge_dict" + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': ((cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | join(',')) - if (cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | length > 0 else slurm_partition_name}) + + [item | combine({'Nodes': (partition_nodes | join(',')) + if partition_nodes | length > 0 else slurm_partition_name}) if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: From b133a9d1241fdb44b0baad9be6959284901be753 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 3 Mar 2026 11:50:40 +0530 Subject: [PATCH 41/41] initialize --- discovery/roles/slurm_config/vars/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index fca080a65e..3cc8bdd6af 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -126,6 +126,7 @@ partition_params: MaxTime: "INFINITE" State: "UP" Default: "YES" +busy_nodes: {} openldap_dir_name: "openldap/" software_config_file: "{{ input_project_dir }}/software_config.json" omnia_run_tags: "{{ hostvars['localhost']['omnia_run_tags'] }}"