diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index b68bcbbded..920c80b863 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -46,6 +46,39 @@
     - slurm_custom_aarch64.slurm_node is defined
     - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
 
+# Read the NVHPC tarball entry for x86_64 from slurm_custom.json so the
+# package name is no longer pinned in vars/main.yml. The fact stays unset
+# (task skipped) when the cluster list has no matching tarball entry.
+- name: Extract NVHPC package name for x86_64 from slurm_custom.json
+  vars:
+    # Only referenced by the last `when` entry and the fact below.
+    nvhpc_entries_x86_64: >-
+      {{ slurm_custom_x86_64.slurm_node.cluster
+      | selectattr('type', 'equalto', 'tarball')
+      | selectattr('package', 'match', '^nvhpc_.*_Linux_x86_64_cuda_.*')
+      | list }}
+  ansible.builtin.set_fact:
+    nvhpc_pkg_name_x86_64: "{{ (nvhpc_entries_x86_64 | first).package }}"
+  when:
+    - slurm_custom_x86_64 is defined
+    - slurm_custom_x86_64.slurm_node is defined
+    - nvhpc_entries_x86_64 | length > 0
+
+# Same lookup against the aarch64 cluster list.
+- name: Extract NVHPC package name for aarch64 from slurm_custom.json
+  vars:
+    nvhpc_entries_aarch64: >-
+      {{ slurm_custom_aarch64.slurm_node.cluster
+      | selectattr('type', 'equalto', 'tarball')
+      | selectattr('package', 'match', '^nvhpc_.*_Linux_aarch64_cuda_.*')
+      | list }}
+  ansible.builtin.set_fact:
+    nvhpc_pkg_name_aarch64: "{{ (nvhpc_entries_aarch64 | first).package }}"
+  when:
+    - slurm_custom_aarch64 is defined
+    - slurm_custom_aarch64.slurm_node is defined
+    - nvhpc_entries_aarch64 | length > 0
+
 - name: Set facts for slurm
   ansible.builtin.set_fact:
     nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 622416c7b8..9176e25356 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -152,10 +152,16 @@ offline_path_aarch64: []
 ssh_private_key_path: /root/.ssh/oim_rsa
 
 # nvidia sdk vars
-# Fully resolved tarball relative paths (no nested Jinja2)
-# nvidia sdk vars
-nvhpc_pkg_name_x86_64: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0"
-nvhpc_pkg_name_aarch64: "nvhpc_2025_2511_Linux_aarch64_cuda_13.0"
+# NVHPC package names are read from the facts that tasks/create_slurm_dir.yml
+# extracts from slurm_custom.json (looked up on host 'oim'). Those tasks are
+# skipped when no NVHPC tarball entry matches, so fall back to the previously
+# pinned release rather than failing on an undefined fact.
+nvhpc_pkg_name_x86_64: >-
+  {{ hostvars['oim']['nvhpc_pkg_name_x86_64']
+  | default('nvhpc_2025_2511_Linux_x86_64_cuda_13.0') }}
+nvhpc_pkg_name_aarch64: >-
+  {{ hostvars['oim']['nvhpc_pkg_name_aarch64']
+  | default('nvhpc_2025_2511_Linux_aarch64_cuda_13.0') }}
 nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/{{ nvhpc_pkg_name_x86_64 }}/{{ nvhpc_pkg_name_x86_64 }}.tar.gz"  # noqa: yaml[line-length]
 nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/{{ nvhpc_pkg_name_aarch64 }}/{{ nvhpc_pkg_name_aarch64 }}.tar.gz"  # noqa: yaml[line-length]
 