From bb2126e6bc4909d769ae765e75eecbf6b681d252 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735
Date: Tue, 17 Mar 2026 15:26:40 +0530
Subject: [PATCH 1/3] Update create_slurm_dir.yml

Signed-off-by: sakshi-singla-1735
---
 .../slurm_config/tasks/create_slurm_dir.yml       | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index b68bcbbded..16af760f9d 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -46,6 +46,22 @@
     - slurm_custom_aarch64.slurm_node is defined
     - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
 
+- name: Extract NVHPC package name for x86_64 from slurm_custom.json
+  ansible.builtin.set_fact:
+    nvhpc_pkg_name_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_x86_64_cuda_.*') | first).package }}"
+  when:
+    - slurm_custom_x86_64 is defined
+    - slurm_custom_x86_64.slurm_node is defined
+    - slurm_custom_x86_64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_x86_64_cuda_.*') | list | length > 0
+
+- name: Extract NVHPC package name for aarch64 from slurm_custom.json
+  ansible.builtin.set_fact:
+    nvhpc_pkg_name_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_aarch64_cuda_.*') | first).package }}"
+  when:
+    - slurm_custom_aarch64 is defined
+    - slurm_custom_aarch64.slurm_node is defined
+    - slurm_custom_aarch64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_aarch64_cuda_.*') | list | length > 0
+
 - name: Set facts for slurm
   ansible.builtin.set_fact:
     nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"

From e92a051e5283ac840aff045c8996c8d301fd71ad Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735
Date: Tue, 17 Mar 2026 15:27:24 +0530
Subject: [PATCH 2/3] Update main.yml

Signed-off-by: sakshi-singla-1735
---
 discovery/roles/slurm_config/vars/main.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 622416c7b8..9176e25356 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -152,10 +152,9 @@ offline_path_aarch64: []
 ssh_private_key_path: /root/.ssh/oim_rsa
 
 # nvidia sdk vars
-# Fully resolved tarball relative paths (no nested Jinja2)
-# nvidia sdk vars
-nvhpc_pkg_name_x86_64: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0"
-nvhpc_pkg_name_aarch64: "nvhpc_2025_2511_Linux_aarch64_cuda_13.0"
+# Dynamically extracted from slurm_custom.json
+nvhpc_pkg_name_x86_64: "{{ hostvars['oim']['nvhpc_pkg_name_x86_64'] }}"
+nvhpc_pkg_name_aarch64: "{{ hostvars['oim']['nvhpc_pkg_name_aarch64'] }}"
 nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/{{ nvhpc_pkg_name_x86_64 }}/{{ nvhpc_pkg_name_x86_64 }}.tar.gz" # noqa: yaml[line-length]
 nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/{{ nvhpc_pkg_name_aarch64 }}/{{ nvhpc_pkg_name_aarch64 }}.tar.gz" # noqa: yaml[line-length]
 

From ef107a9de905ba6ba7d09dbe9619743028ce135a Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735
Date: Tue, 17 Mar 2026 15:36:27 +0530
Subject: [PATCH 3/3] lint fixes

Signed-off-by: sakshi-singla-1735
---
 discovery/roles/slurm_config/tasks/create_slurm_dir.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index 16af760f9d..920c80b863 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -48,19 +48,19 @@
 
 - name: Extract NVHPC package name for x86_64 from slurm_custom.json
   ansible.builtin.set_fact:
-    nvhpc_pkg_name_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_x86_64_cuda_.*') | first).package }}"
+    nvhpc_pkg_name_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_x86_64_cuda_.*') | first).package }}" # noqa: yaml[line-length]
   when:
     - slurm_custom_x86_64 is defined
     - slurm_custom_x86_64.slurm_node is defined
-    - slurm_custom_x86_64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_x86_64_cuda_.*') | list | length > 0
+    - slurm_custom_x86_64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_x86_64_cuda_.*') | list | length > 0 # noqa: yaml[line-length]
 
 - name: Extract NVHPC package name for aarch64 from slurm_custom.json
   ansible.builtin.set_fact:
-    nvhpc_pkg_name_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_aarch64_cuda_.*') | first).package }}"
+    nvhpc_pkg_name_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_aarch64_cuda_.*') | first).package }}" # noqa: yaml[line-length]
   when:
     - slurm_custom_aarch64 is defined
     - slurm_custom_aarch64.slurm_node is defined
-    - slurm_custom_aarch64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_aarch64_cuda_.*') | list | length > 0
+    - slurm_custom_aarch64.slurm_node.cluster | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'match', '^nvhpc_.*_Linux_aarch64_cuda_.*') | list | length > 0 # noqa: yaml[line-length]
 
 - name: Set facts for slurm
   ansible.builtin.set_fact: