From 56880aa3d534a913ad3794c4e447e5cb26cced8d Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Thu, 8 Jul 2021 11:23:06 -0600 Subject: [PATCH 01/22] code to link SmartRedis tools --- config/cesm/machines/config_machines.xml | 50 ++++++++++++------------ scripts/Tools/Makefile | 15 ++++--- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/config/cesm/machines/config_machines.xml b/config/cesm/machines/config_machines.xml index 7d9b619fa70..95b0dfa5167 100644 --- a/config/cesm/machines/config_machines.xml +++ b/config/cesm/machines/config_machines.xml @@ -337,7 +337,7 @@ This allows using a different mpirun command to launch unit tests module - lang/python/3.6.5 + lang/python/3.6.5 compiler/gnu/8.2.0 @@ -563,6 +563,7 @@ This allows using a different mpirun command to launch unit tests ncarenv/1.3 python/3.7.9 cmake + python intel/19.1.1 @@ -643,7 +644,7 @@ This allows using a different mpirun command to launch unit tests mpt/2.21 - netcdf-mpi/4.7.3 + netcdf-mpi/4.7.4 pnetcdf/1.12.1 @@ -661,8 +662,9 @@ This allows using a different mpirun command to launch unit tests netcdf-mpi/4.7.4 - openmpi/3.1.4 - netcdf/4.7.1 + openmpi/4.0.5 + netcdf-mpi/4.7.4 + pnetcdf/1.12.2 ncarcompilers/0.5.0 @@ -1636,14 +1638,14 @@ This allows using a different mpirun command to launch unit tests + can be overridden in environment or $HOME/.cime/config --> + can be overridden in environment or $HOME/.cime/config --> /DFS-L/SCRATCH/moore/$USER/cesm_runs /DFS-L/DATA/moore/cesm/inputdata @@ -1664,13 +1666,13 @@ This allows using a different mpirun command to launch unit tests mpirun - -np {{ total_tasks }} + -np {{ total_tasks }} mpirun - -np {{ total_tasks }} + -np {{ total_tasks }} @@ -1683,15 +1685,15 @@ This allows using a different mpirun command to launch unit tests module module - + - intel/2018.3 - netcdf/4.7.0 + intel/2018.3 + netcdf/4.7.0 - openmpi/3.1.6 - pnetcdf/1.10.0 + openmpi/3.1.6 + pnetcdf/1.10.0 @@ -1716,14 +1718,14 @@ This allows using a different mpirun command to launch unit tests + can be overridden in environment or $HOME/.cime/config --> + can be overridden in environment or $HOME/.cime/config --> /DFS-L/SCRATCH/moore/$USER/cesm_runs /DFS-L/DATA/moore/cesm/inputdata @@ -1744,13 +1746,13 @@ This allows using a different mpirun command to launch unit tests mpirun - -np {{ total_tasks }} + -np {{ total_tasks }} mpirun - -np {{ total_tasks }} + -np {{ total_tasks }} @@ -1763,15 +1765,15 @@ This allows using a different mpirun command to launch unit tests module module - + - intel/2018.3 - netcdf/4.7.0 + intel/2018.3 + netcdf/4.7.0 - openmpi/3.1.6 - pnetcdf/1.10.0 + openmpi/3.1.6 + pnetcdf/1.10.0 diff --git a/scripts/Tools/Makefile b/scripts/Tools/Makefile index 2e699eec736..0d165c4a63c 100644 --- a/scripts/Tools/Makefile +++ b/scripts/Tools/Makefile @@ -77,6 +77,11 @@ CPPDEFS := $(USER_CPPDEFS) -D$(OS) include $(CASEROOT)/Macros.make +ifeq ($(strip $(USE_SMARTSIM)), TRUE) + CPPDEFS += -DSMARTREDIS + SLIBS += -L$(REDIS_HOME)/install/lib -lsmartredis -Wl,-rpath $(REDIS_HOME)/install/lib +endif + # Unless DEBUG mode is enabled, use NDEBUG to turn off assert statements. ifeq ($(strip $(DEBUG)),TRUE) # e3sm still has components that cannot build with -DDEBUG @@ -360,11 +365,11 @@ endif #=============================================================================== ifeq ($(strip $(MPILIB)), mpi-serial) - CC := $(SCC) - FC := $(SFC) - CXX := $(SCXX) - MPIFC := $(SFC) - MPICC := $(SCC) + CC := $(SCC) + FC := $(SFC) + CXX := $(SCXX) + MPIFC := $(SFC) + MPICC := $(SCC) MPICXX := $(SCXX) CONFIG_ARGS += MCT_PATH=$(SHAREDLIBROOT)/$(SHAREDPATH)/mct/mpi-serial else From e567cf23a3f910daff6c74ca8cb3afb31bd95c3e Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Fri, 16 Jul 2021 10:53:02 -0600 Subject: [PATCH 02/22] add a SmartSim test to xatm component --- config/cesm/machines/config_batch.xml | 5 ++ config/cesm/machines/config_compilers.xml | 14 ++++ config/cesm/machines/config_machines.xml | 75 ++++++++++--------- scripts/lib/CIME/buildlib.py | 5 ++ .../xatm/src/atm_comp_nuopc.F90 | 35 ++++++++- 5 files changed, 98 insertions(+), 36 deletions(-) diff --git a/config/cesm/machines/config_batch.xml b/config/cesm/machines/config_batch.xml index 687ef843740..a57bb7bbe48 100644 --- a/config/cesm/machines/config_batch.xml +++ b/config/cesm/machines/config_batch.xml @@ -246,6 +246,11 @@ -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=700GB:ngpus={{ ngpus_per_node }} + + -S {{ shell }} + -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }} + + casper diff --git a/config/cesm/machines/config_compilers.xml b/config/cesm/machines/config_compilers.xml index 659b14c6e36..00e94d2bb3d 100644 --- a/config/cesm/machines/config_compilers.xml +++ b/config/cesm/machines/config_compilers.xml @@ -925,6 +925,20 @@ using a fortran linker. $ENV{PNETCDF} + + + + + + + + + + + + -DNO_MPIMOD diff --git a/config/cesm/machines/config_machines.xml b/config/cesm/machines/config_machines.xml index 95b0dfa5167..adce35826e8 100644 --- a/config/cesm/machines/config_machines.xml +++ b/config/cesm/machines/config_machines.xml @@ -337,7 +337,7 @@ This allows using a different mpirun command to launch unit tests module - lang/python/3.6.5 + lang/python/3.6.5 compiler/gnu/8.2.0 @@ -358,7 +358,7 @@ This allows using a different mpirun command to launch unit tests NCAR GPU platform, os is Linux, 36 pes/node, batch system is pbs casper* LINUX - pgi,intel,nvhpc,pgi-gpu,nvhpc-gpu + pgi,intel,nvhpc,pgi-gpu,nvhpc-gpu,gnu openmpi /glade/scratch/$USER $ENV{CESMDATAROOT}/inputdata @@ -397,6 +397,7 @@ This allows using a different mpirun command to launch unit tests ncarenv/1.3 cmake/3.18.2 + python pgi/20.4 @@ -410,6 +411,9 @@ This allows using a different mpirun command to launch unit tests nvhpc/20.9 + + gnu/9.1.0 + intel/19.1.1 mkl/2020.0.1 @@ -422,23 +426,26 @@ This allows using a different mpirun command to launch unit tests /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/ esmf-8.2.0b11_casper-ncdfio-openmpi-O - + + /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/11.1.0/ + esmf-8.2.0b12_casper-ncdfio-openmpi-g + + + /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/11.1.0/ + esmf-8.2.0b12_casper-ncdfio-openmpi-O + + openmpi/4.1.0 netcdf-mpi/4.8.0 pnetcdf/1.12.2 - + netcdf/4.8.0 - openmpi/4.1.0 - netcdf-mpi/4.7.4 - pnetcdf/1.12.2 + netcdf-mpi/4.7.4 cuda/11.0.3 - - netcdf/4.7.4 - openmpi/4.1.0 netcdf-mpi/4.7.4 @@ -563,7 +570,6 @@ This allows using a different mpirun command to launch unit tests ncarenv/1.3 python/3.7.9 cmake - python intel/19.1.1 @@ -644,7 +650,7 @@ This allows using a different mpirun command to launch unit tests mpt/2.21 - netcdf-mpi/4.7.4 + netcdf-mpi/4.7.3 pnetcdf/1.12.1 @@ -662,9 +668,8 @@ This allows using a different mpirun command to launch unit tests netcdf-mpi/4.7.4 - openmpi/4.0.5 - netcdf-mpi/4.7.4 - pnetcdf/1.12.2 + openmpi/3.1.4 + netcdf/4.7.1 ncarcompilers/0.5.0 @@ -1638,14 +1643,14 @@ This allows using a different mpirun command to launch unit tests + can be overridden in environment or $HOME/.cime/config --> + can be overridden in environment or $HOME/.cime/config --> /DFS-L/SCRATCH/moore/$USER/cesm_runs /DFS-L/DATA/moore/cesm/inputdata @@ -1666,13 +1671,13 @@ This allows using a different mpirun command to launch unit tests mpirun - -np {{ total_tasks }} + -np {{ total_tasks }} mpirun - -np {{ total_tasks }} + -np {{ total_tasks }} @@ -1685,15 +1690,15 @@ This allows using a different mpirun command to launch unit tests module module - + - intel/2018.3 - netcdf/4.7.0 + intel/2018.3 + netcdf/4.7.0 - openmpi/3.1.6 - pnetcdf/1.10.0 + openmpi/3.1.6 + pnetcdf/1.10.0 @@ -1718,14 +1723,14 @@ This allows using a different mpirun command to launch unit tests + can be overridden in environment or $HOME/.cime/config --> + can be overridden in environment or $HOME/.cime/config --> /DFS-L/SCRATCH/moore/$USER/cesm_runs /DFS-L/DATA/moore/cesm/inputdata @@ -1746,13 +1751,13 @@ This allows using a different mpirun command to launch unit tests mpirun - -np {{ total_tasks }} + -np {{ total_tasks }} mpirun - -np {{ total_tasks }} + -np {{ total_tasks }} @@ -1765,15 +1770,15 @@ This allows using a different mpirun command to launch unit tests module module - + - intel/2018.3 - netcdf/4.7.0 + intel/2018.3 + netcdf/4.7.0 - openmpi/3.1.6 - pnetcdf/1.10.0 + openmpi/3.1.6 + pnetcdf/1.10.0 diff --git a/scripts/lib/CIME/buildlib.py b/scripts/lib/CIME/buildlib.py index 9b75726e94c..2fdeb97517c 100644 --- a/scripts/lib/CIME/buildlib.py +++ b/scripts/lib/CIME/buildlib.py @@ -99,6 +99,11 @@ def run_gmake(case, compclass, compname, libroot, bldroot, libname="", user_cppd if user_cppdefs: cmd = cmd + "USER_CPPDEFS='{}'".format(user_cppdefs ) + use_smartsim = case.get_value("USE_SMARTSIM") + if use_smartsim: + cmd = cmd + " USE_SMARTSIM=TRUE " + print("Building with smartredis library") + stat, out, err = run_cmd(cmd, combine_output=True) print(out) if stat: diff --git a/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 b/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 index f8480b58683..3394692fed3 100644 --- a/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 +++ b/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 @@ -356,7 +356,11 @@ end subroutine InitializeRealize !=============================================================================== subroutine ModelAdvance(gcomp, rc) - +#ifdef SMARTREDIS + use nuopc_shr_methods, only : sr_client + use iso_c_binding, only : c_double + use ESMF, only : ESMF_VMGetCurrent, ESMF_VMGet +#endif ! input/output variables type(ESMF_GridComp) :: gcomp integer, intent(out) :: rc @@ -366,6 +370,17 @@ subroutine ModelAdvance(gcomp, rc) type(ESMF_State) :: exportState real(r8) :: nextsw_cday integer :: shrlogunit ! original log unit +#ifdef SMARTREDIS + type(ESMF_VM) :: vm + integer :: mytask + integer, parameter :: dim1 = 1 + integer, parameter :: dim2 = 2 + integer, parameter :: dim3 = 3 + character(len=9) :: key_prefix + real(r8), dimension(dim1, dim2, dim3) :: recv_array_real_64 + + real(kind=c_double), dimension(dim1, dim2, dim3) :: true_array_real_64 +#endif character(len=*),parameter :: subname=trim(modName)//':(ModelAdvance) ' !------------------------------------------------------------------------------- @@ -396,6 +411,24 @@ subroutine ModelAdvance(gcomp, rc) !-------------------------------- ! diagnostics !-------------------------------- +#ifdef SMARTREDIS + call ESMF_VMGetCurrent(vm, rc=rc) + if (chkerr(rc,__LINE__,u_FILE_u)) return + call ESMF_VMGet(vm, localPet=mytask, rc=rc) + if (chkerr(rc,__LINE__,u_FILE_u)) return + + call random_number(true_array_real_64) + call random_number(recv_array_real_64) + write(key_prefix, "(A,I6.6)") "pe_",mytask + if(mastertask) write(logunit, *) 'putting tenser to smartsim database ' + call sr_client%put_tensor(key_prefix//"true_array_real_64", true_array_real_64, shape(true_array_real_64)) + if(mastertask) write(logunit, *) 'retreveing tenser from database ' + call sr_client%unpack_tensor(key_prefix//"true_array_real_64", recv_array_real_64, shape(recv_array_real_64)) + if (.not. all(true_array_real_64 == recv_array_real_64)) then + call shr_sys_abort('true_array_real_64: FAILED') + endif +#endif + if (dbug > 1) then call state_diagnose(exportState,subname//':ES',rc=rc) From ebcec816a47c8d4456322435f282ca44856e1640 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Fri, 16 Jul 2021 15:21:27 -0600 Subject: [PATCH 03/22] add smartsim scripts --- tools/MakeRedsky.sh | 18 --- tools/smartsim/launch.py | 94 ++++++++++++++ .../smartsim/launch_database_cluster.template | 118 ++++++++++++++++++ tools/smartsim/resv_job.template | 42 +++++++ tools/smartsim/utils.py | 85 +++++++++++++ 5 files changed, 339 insertions(+), 18 deletions(-) delete mode 100755 tools/MakeRedsky.sh create mode 100755 tools/smartsim/launch.py create mode 100644 tools/smartsim/launch_database_cluster.template create mode 100644 tools/smartsim/resv_job.template create mode 100644 tools/smartsim/utils.py diff --git a/tools/MakeRedsky.sh b/tools/MakeRedsky.sh deleted file mode 100755 index 663959de26a..00000000000 --- a/tools/MakeRedsky.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/csh - -source /usr/share/Modules/init/csh -module purge -module load intel/13.0 -module load openmpi-intel/1.6 - -setenv NETCDFROOT /projects/ccsm/yellowstone/netcdf-4.3.2-intel-13.0-openmpi-1.6 -setenv PATH $NETCDFROOT/bin:$PATH -setenv LD_LIBRARY_PATH $NETCDFROOT/lib:$LD_LIBRARY_PATH -setenv NETCDF_INCLUDES $NETCDFROOT/include -setenv NETCDF_LIBS $NETCDFROOT/lib - -setenv PNETCDFROOT $NETCDFROOT - -setenv USER_FC ifort - -gmake LIB_NETCDF=$NETCDF_LIBS INC_NETCDF=$NETCDF_INCLUDES NETCDF=$NETCDFROOT LDFLAGS="-L$NETCDF_LIBS -lnetcdff -lnetcdf"#! /bin/csh -f diff --git a/tools/smartsim/launch.py b/tools/smartsim/launch.py new file mode 100755 index 00000000000..209268684cf --- /dev/null +++ b/tools/smartsim/launch.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +import os, sys + +CESM_ROOT = os.getenv("CESM_ROOT") +if not CESM_ROOT: + raise SystemExit("ERROR: CESM_ROOT must be defined in environment") + +_LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","Tools") +sys.path.append(_LIBDIR) +_LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","lib") +sys.path.append(_LIBDIR) + + +import argparse, subprocess +from string import Template +from CIME.utils import run_cmd, expect +from CIME.case import Case + +def parse_command_line(args, description): + parser = argparse.ArgumentParser(description=description, + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("--db-nodes", default=1, + help="Number of nodes for the SmartSim database, default=1") + parser.add_argument("--ngpus-per-node", default=0, + help="Number of gpus per SmartSim database node, default=0") + parser.add_argument("--walltime", default="00:30:00", + help="Total walltime for submitted job, default=00:30:00") + parser.add_argument("--member-nodes", default=1, + help="Number of nodes per ensemble member, default=1") + parser.add_argument("--account", default="P93300606", + help="Account ID") + parser.add_argument("--db-port", default=6780, + help="db port, default=6780") + parser.add_argument("--caseroots" , default=[os.getcwd()],nargs="*", + help="Case directory to reference.\n" + "Default is current directory.") + + args = parser.parse_args(args[1:]) + ngpus = "" + if int(args.ngpus_per_node) > 0: + ngpus = ":ngpus="+args.ngpus_per_node + + return {"db_nodes":args.db_nodes, + "caseroots" : ' '.join('"%s"' % x for x in args.caseroots), + "ngpus": ngpus, + "walltime": args.walltime, + "account" : args.account, + "db_port": args.db_port, + "cesmroot": CESM_ROOT, + "python_sys_path": sys.path}, args.caseroots + +def create_submit_files(templatevars): + template_files = ["resv_job.template", "launch_database_cluster.template"] + rootdir = os.path.dirname(os.path.realpath(__file__)) + for template in template_files: + with open(os.path.join(rootdir,template)) as f: + src = Template(f.read()) + result = src.safe_substitute(templatevars) + result_file = template.replace("template","sh") + with open(result_file, "w") as f: + f.write(result) + +def check_cases(caseroots): + """ + Assume all caseroots use the same number of nodes + """ + prevcasepes = -1 + for caseroot in caseroots: + with Case(caseroot) as case: + expect(case.get_value("BUILD_COMPLETE"),"ERROR: case build not complete for {}".format(caseroot)) + casepes = case.get_value("TOTALPES") + if prevcasepes > 0: + expect(prevcasepes == casepes, "Case {} pe layout mismatch".format(caseroot)) + else: + prevcasepes = casepes + member_nodes = case.num_nodes + + return member_nodes + +def _main_func(desc): + templatevars, caseroots = parse_command_line(sys.argv, desc) + print("caseroots are {}".format(caseroots)) + print("t caseroots are {}".format(templatevars["caseroots"])) + templatevars["member_nodes"] = check_cases(caseroots) + templatevars["ensemble_size"] = len(caseroots) + templatevars["client_nodes"] = int(templatevars["member_nodes"])*len(caseroots) + create_submit_files(templatevars) + run_cmd("qsub resv_job.sh", verbose=True) + + + + +if __name__ == "__main__": + _main_func(__doc__) diff --git a/tools/smartsim/launch_database_cluster.template b/tools/smartsim/launch_database_cluster.template new file mode 100644 index 00000000000..4a6469bfb86 --- /dev/null +++ b/tools/smartsim/launch_database_cluster.template @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +#PBS -N smartsimtest +#PBS -r n +#PBS -j oe +#PBS -V +#PBS -l walltime=$walltime +#PBS -A $account +##PBS -q regular +#PBS -V +#PBS -l select=$db_nodes:ncpus=1:ompthreads=1:mpiprocs=1$ngpus + +import os, sys, time + +# The python environment is not passed properly to submitted jobs on casper +_LIBDIR = $python_sys_path +sys.path.extend(_LIBDIR) +CESM_ROOT = "$cesmroot" +if not os.path.isdir(CESM_ROOT): + raise SystemExit("ERROR: CESM_ROOT must be defined in environment") + +_LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","Tools") +sys.path.append(_LIBDIR) +_LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","lib") +sys.path.append(_LIBDIR) + + +import socket, subprocess +import numpy as np +from CIME.utils import run_cmd, expect +from smartsim import Experiment, constants +from smartsim.database import PBSOrchestrator + +""" +Launch a distributed, in memory database cluster and use the +SmartRedis python client to send and recieve some numpy arrays. + +This example runs in an interactive allocation with at least three +nodes and 1 processor per node. + +i.e. qsub -l select=3:ncpus=1 -l walltime=00:10:00 -A -q premium -I +""" + +def collect_db_hosts(num_hosts): + """A simple method to collect hostnames because we are using + openmpi. (not needed for aprun(ALPS), Slurm, etc. + """ + + hosts = [] + if "PBS_NODEFILE" in os.environ: + node_file = os.environ["PBS_NODEFILE"] + with open(node_file, "r") as f: + for line in f.readlines(): + host = line.split(".")[0] + hosts.append(host) + else: + raise Exception("could not parse interactive allocation nodes from PBS_NODEFILE") + + # account for mpiprocs causing repeats in PBS_NODEFILE + hosts = list(set(hosts)) + if len(hosts) >= num_hosts: + return hosts[:num_hosts] + else: + raise Exception("PBS_NODEFILE {} had {} hosts, not {}".format(node_file, len(hosts),num_hosts)) + + +def launch_cluster_orc(exp, db_hosts, port): + """Just spin up a database cluster, check the status + and tear it down""" + + print(f"Starting Orchestrator on hosts: {db_hosts}") + # batch = False to launch on existing allocation + db = PBSOrchestrator(port=port, db_nodes=len(db_hosts), batch=False, + run_command="mpirun", hosts=db_hosts) + + # generate directories for output files + # pass in objects to make dirs for + exp.generate(db, overwrite=True) + + # start the database on interactive allocation + exp.start(db, block=True) + + # get the status of the database + statuses = exp.get_status(db) + print(f"Status of all database nodes: {statuses}") + + return db + +def monitor_client_jobs(rsvname): + jobs_done=False + while not jobs_done: + s, o, e = run_cmd("qstat -q {}".format(rsvname), verbose=True) + jobs_left = o.split()[-2:] + print("Jobs left: Running {} Queued {}".format(int(jobs_left[0]),int(jobs_left[1]))) + if int(jobs_left[0]) + int(jobs_left[1]) == 1: + jobs_done = True + else: + time.sleep(60) + + + + + +# create the experiment and specify PBS because cheyenne is a PBS system +exp = Experiment("launch_cluster_db", launcher="pbs") + +db_port = $db_port +db_hosts = collect_db_hosts($db_nodes) +# start the database +db = launch_cluster_orc(exp, db_hosts, db_port) + +rsvname = os.environ["RSVNAME"] +# stay alive until client jobs have completed +monitor_client_jobs(rsvname) + +# shutdown the database because we don't need it anymore +exp.stop(db) +# delete the job reservation +run_cmd("pbs_rdel {}".format(rsvname)) diff --git a/tools/smartsim/resv_job.template b/tools/smartsim/resv_job.template new file mode 100644 index 00000000000..97f497f3bc7 --- /dev/null +++ b/tools/smartsim/resv_job.template @@ -0,0 +1,42 @@ +#!/bin/bash -x +#PBS -N resv_job +#PBS -l select=$db_nodes:ncpus=1:mpiprocs=1$ngpus+$client_nodes:ncpus=36:mpiprocs=36 +#PBS -l gpu_type=v100 +#PBS -l walltime=$walltime +#PBS -W create_resv_from_job=true +#PBS -j oe +#PBS -k oed +#PBS -q casper +#PBS -A $account + +for rsv in $(qstat -Q|awk '$1 ~ /^R/{print $1}') +do + parent_job=$(pbs_rstat -F $rsv|awk '$1 ~ /^reserve_job/{print $3}') + if [[ "${PBS_JOBID}" == "${parent_job}" ]] ; then + rsvname=$rsv + break + fi +done +if [ -z $rsvname ]; then echo "rsv is unset"; exit -1; else echo "rsv name is set to '$rsvname'"; fi + +me=$(whoami) +pbs_ralter -U $me $rsvname + +db_jobid=$(qsub -q $rsvname -vRSVNAME=$rsvname launch_database_cluster.sh) + +head_host=$(qstat -f $PBS_JOBID|awk '$1 ~ /^exec_host$/{print $3}'|cut -d\/ -f1-1) +# This gets the ib network +SSDB="$(getent hosts ${head_host}-ib|awk '{print $1}'):$db_port" +# This gets the external network +#SSDB="$(getent hosts ${head_host}.ucar.edu |awk '{print $1}'):$db_port" +export SSDB + +for caseroot in $caseroots; +do + ./xmlchange JOB_QUEUE=$rsvname --subgroup case.test --force + ./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.test + ./case.submit +done + + + diff --git a/tools/smartsim/utils.py b/tools/smartsim/utils.py new file mode 100644 index 00000000000..faf24b609cf --- /dev/null +++ b/tools/smartsim/utils.py @@ -0,0 +1,85 @@ +import subprocess, os, io + +def _convert_to_fd(filearg, from_dir, mode="a"): + filearg = _get_path(filearg, from_dir) + + return open(filearg, mode) + +_hack=object() + +def run_cmd(cmd, input_str=None, from_dir=None, verbose=None, + arg_stdout=_hack, arg_stderr=_hack, env=None, + combine_output=False, timeout=None, executable=None): + """ + Wrapper around subprocess to make it much more convenient to run shell commands + + >>> run_cmd('ls file_i_hope_doesnt_exist')[0] != 0 + True + """ + + # Real defaults for these value should be subprocess.PIPE + if arg_stdout is _hack: + arg_stdout = subprocess.PIPE + elif isinstance(arg_stdout, str): + arg_stdout = _convert_to_fd(arg_stdout, from_dir) + + if arg_stderr is _hack: + arg_stderr = subprocess.STDOUT if combine_output else subprocess.PIPE + elif isinstance(arg_stderr, str): + arg_stderr = _convert_to_fd(arg_stdout, from_dir) + + if verbose: + print("RUN: {}\nFROM: {}".format(cmd, os.getcwd() if from_dir is None else from_dir)) + + if (input_str is not None): + stdin = subprocess.PIPE + else: + stdin = None + + proc = subprocess.Popen(cmd, + shell=True, + stdout=arg_stdout, + stderr=arg_stderr, + stdin=stdin, + cwd=from_dir, + executable=executable, + env=env) + + output, errput = proc.communicate(input_str) + + # In Python3, subprocess.communicate returns bytes. We want to work with strings + # as much as possible, so we convert bytes to string (which is unicode in py3) via + # decode. + if output is not None: + try: + output = output.decode('utf-8', errors='ignore') + except AttributeError: + pass + if errput is not None: + try: + errput = errput.decode('utf-8', errors='ignore') + except AttributeError: + pass + + # Always strip outputs + if output: + output = output.strip() + if errput: + errput = errput.strip() + + stat = proc.wait() + if isinstance(arg_stdout, io.IOBase): + arg_stdout.close() # pylint: disable=no-member + if isinstance(arg_stderr, io.IOBase) and arg_stderr is not arg_stdout: + arg_stderr.close() # pylint: disable=no-member + + + if verbose: + if stat != 0: + print(" stat: {:d}\n".format(stat)) + if output: + print(" output: {}\n".format(output)) + if errput: + print(" errput: {}\n".format(errput)) + + return stat, output, errput From cdc4fb1556e79d8613f7b70d0d3db9ea8d7435de Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Mon, 19 Jul 2021 12:02:01 -0600 Subject: [PATCH 04/22] add a readme --- tools/smartsim/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tools/smartsim/README.md diff --git a/tools/smartsim/README.md b/tools/smartsim/README.md new file mode 100644 index 00000000000..93d984a7440 --- /dev/null +++ b/tools/smartsim/README.md @@ -0,0 +1,11 @@ +Running CESM with the CrayLabs SmartSim tool. + +The tools provided here can be used with PBS version 2021 or newer. +PBS must support the create_resv_from_job option and the user must be +enabled to use that option. + +It allows you to submit multiple jobs to the queue and +assure that all of the jobs will start and run concurrently. + +If --ngpus-per-node > 0 then the SmartSim database will be submitted +to gpu nodes while the CESM case(s) will run on cpu nodes. \ No newline at end of file From 2fdc56d3c9f604f1ac70dfc09df9eceb21da4cfa Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Mon, 19 Jul 2021 14:17:28 -0600 Subject: [PATCH 05/22] remove ifdefs from xatm --- .../xatm/src/atm_comp_nuopc.F90 | 75 +++++++++++-------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 b/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 index 3394692fed3..cbea2898590 100644 --- a/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 +++ b/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 @@ -356,11 +356,7 @@ end subroutine InitializeRealize !=============================================================================== subroutine ModelAdvance(gcomp, rc) -#ifdef SMARTREDIS - use nuopc_shr_methods, only : sr_client - use iso_c_binding, only : c_double - use ESMF, only : ESMF_VMGetCurrent, ESMF_VMGet -#endif + use nuopc_shr_methods, only : use_smartredis ! input/output variables type(ESMF_GridComp) :: gcomp integer, intent(out) :: rc @@ -370,17 +366,6 @@ subroutine ModelAdvance(gcomp, rc) type(ESMF_State) :: exportState real(r8) :: nextsw_cday integer :: shrlogunit ! original log unit -#ifdef SMARTREDIS - type(ESMF_VM) :: vm - integer :: mytask - integer, parameter :: dim1 = 1 - integer, parameter :: dim2 = 2 - integer, parameter :: dim3 = 3 - character(len=9) :: key_prefix - real(r8), dimension(dim1, dim2, dim3) :: recv_array_real_64 - - real(kind=c_double), dimension(dim1, dim2, dim3) :: true_array_real_64 -#endif character(len=*),parameter :: subname=trim(modName)//':(ModelAdvance) ' !------------------------------------------------------------------------------- @@ -411,24 +396,9 @@ subroutine ModelAdvance(gcomp, rc) !-------------------------------- ! diagnostics !-------------------------------- -#ifdef SMARTREDIS - call ESMF_VMGetCurrent(vm, rc=rc) - if (chkerr(rc,__LINE__,u_FILE_u)) return - call ESMF_VMGet(vm, localPet=mytask, rc=rc) - if (chkerr(rc,__LINE__,u_FILE_u)) return - - call random_number(true_array_real_64) - call random_number(recv_array_real_64) - write(key_prefix, "(A,I6.6)") "pe_",mytask - if(mastertask) write(logunit, *) 'putting tenser to smartsim database ' - call sr_client%put_tensor(key_prefix//"true_array_real_64", true_array_real_64, shape(true_array_real_64)) - if(mastertask) write(logunit, *) 'retreveing tenser from database ' - call sr_client%unpack_tensor(key_prefix//"true_array_real_64", recv_array_real_64, shape(recv_array_real_64)) - if (.not. all(true_array_real_64 == recv_array_real_64)) then - call shr_sys_abort('true_array_real_64: FAILED') + if(use_smartredis) then + call srtest() endif -#endif - if (dbug > 1) then call state_diagnose(exportState,subname//':ES',rc=rc) @@ -560,4 +530,43 @@ subroutine ModelFinalize(gcomp, rc) end if end subroutine ModelFinalize +! Test the smartredis interface + subroutine srtest() + use nuopc_shr_methods, only : sr_client + use iso_c_binding, only : c_double + use ESMF, only : ESMF_VMGetCurrent, ESMF_VMGet + + + type(ESMF_VM) :: vm + integer :: mytask + integer, parameter :: dim1 = 10 + integer, parameter :: dim2 = 20 + integer, parameter :: dim3 = 30 + character(len=9) :: key_prefix + real(r8), dimension(dim1, dim2, dim3) :: recv_array_real_64 + real(kind=c_double), dimension(dim1, dim2, dim3) :: true_array_real_64 + integer :: rc + + + call ESMF_VMGetCurrent(vm, rc=rc) + if (chkerr(rc,__LINE__,u_FILE_u)) return + call ESMF_VMGet(vm, localPet=mytask, rc=rc) + if (chkerr(rc,__LINE__,u_FILE_u)) return + + call random_number(true_array_real_64) + call random_number(recv_array_real_64) + write(key_prefix, "(A,I6.6)") "pe_",mytask + if(mastertask) write(logunit, *) 'putting tensor to smartsim database ' + call sr_client%put_tensor(key_prefix//"true_array_real_64", true_array_real_64, & + shape(true_array_real_64)) + if(mastertask) write(logunit, *) 'retreveing tensor from database ' + call sr_client%unpack_tensor(key_prefix//"true_array_real_64", recv_array_real_64,& + shape(recv_array_real_64)) + if (.not. all(true_array_real_64 == recv_array_real_64)) then + call shr_sys_abort('true_array_real_64: FAILED') + endif + + + + end subroutine srtest end module atm_comp_nuopc From 7aa8d798764ec68110d31d6d4a00bbb8124ee3b9 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Mon, 19 Jul 2021 14:20:24 -0600 Subject: [PATCH 06/22] remove commented section --- config/cesm/machines/config_compilers.xml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/config/cesm/machines/config_compilers.xml b/config/cesm/machines/config_compilers.xml index 00e94d2bb3d..659b14c6e36 100644 --- a/config/cesm/machines/config_compilers.xml +++ b/config/cesm/machines/config_compilers.xml @@ -925,20 +925,6 @@ using a fortran linker. $ENV{PNETCDF} - - - - - - - - - - - - -DNO_MPIMOD From a51afac806d07ac4526ad703a6cad590361b58fd Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Mon, 19 Jul 2021 14:22:19 -0600 Subject: [PATCH 07/22] remove unused utils --- tools/smartsim/utils.py | 85 ----------------------------------------- 1 file changed, 85 deletions(-) delete mode 100644 tools/smartsim/utils.py diff --git a/tools/smartsim/utils.py b/tools/smartsim/utils.py deleted file mode 100644 index faf24b609cf..00000000000 --- a/tools/smartsim/utils.py +++ /dev/null @@ -1,85 +0,0 @@ -import subprocess, os, io - -def _convert_to_fd(filearg, from_dir, mode="a"): - filearg = _get_path(filearg, from_dir) - - return open(filearg, mode) - -_hack=object() - -def run_cmd(cmd, input_str=None, from_dir=None, verbose=None, - arg_stdout=_hack, arg_stderr=_hack, env=None, - combine_output=False, timeout=None, executable=None): - """ - Wrapper around subprocess to make it much more convenient to run shell commands - - >>> run_cmd('ls file_i_hope_doesnt_exist')[0] != 0 - True - """ - - # Real defaults for these value should be subprocess.PIPE - if arg_stdout is _hack: - arg_stdout = subprocess.PIPE - elif isinstance(arg_stdout, str): - arg_stdout = _convert_to_fd(arg_stdout, from_dir) - - if arg_stderr is _hack: - arg_stderr = subprocess.STDOUT if combine_output else subprocess.PIPE - elif isinstance(arg_stderr, str): - arg_stderr = _convert_to_fd(arg_stdout, from_dir) - - if verbose: - print("RUN: {}\nFROM: {}".format(cmd, os.getcwd() if from_dir is None else from_dir)) - - if (input_str is not None): - stdin = subprocess.PIPE - else: - stdin = None - - proc = subprocess.Popen(cmd, - shell=True, - stdout=arg_stdout, - stderr=arg_stderr, - stdin=stdin, - cwd=from_dir, - executable=executable, - env=env) - - output, errput = proc.communicate(input_str) - - # In Python3, subprocess.communicate returns bytes. We want to work with strings - # as much as possible, so we convert bytes to string (which is unicode in py3) via - # decode. - if output is not None: - try: - output = output.decode('utf-8', errors='ignore') - except AttributeError: - pass - if errput is not None: - try: - errput = errput.decode('utf-8', errors='ignore') - except AttributeError: - pass - - # Always strip outputs - if output: - output = output.strip() - if errput: - errput = errput.strip() - - stat = proc.wait() - if isinstance(arg_stdout, io.IOBase): - arg_stdout.close() # pylint: disable=no-member - if isinstance(arg_stderr, io.IOBase) and arg_stderr is not arg_stdout: - arg_stderr.close() # pylint: disable=no-member - - - if verbose: - if stat != 0: - print(" stat: {:d}\n".format(stat)) - if output: - print(" output: {}\n".format(output)) - if errput: - print(" errput: {}\n".format(errput)) - - return stat, output, errput From a9ccf7d09d2c96498cf58f388bddda1c1ab09f64 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Tue, 20 Jul 2021 16:20:22 -0600 Subject: [PATCH 08/22] use module for SmartRedis --- config/cesm/machines/config_batch.xml | 16 ++++---- config/cesm/machines/config_compilers.xml | 28 +++++++------ config/cesm/machines/config_machines.xml | 14 +++++++ scripts/Tools/Makefile | 3 +- tools/smartsim/launch.py | 49 +++++++++++------------ 5 files changed, 63 insertions(+), 47 deletions(-) diff --git a/config/cesm/machines/config_batch.xml b/config/cesm/machines/config_batch.xml index a57bb7bbe48..adfe9da9095 100644 --- a/config/cesm/machines/config_batch.xml +++ b/config/cesm/machines/config_batch.xml @@ -10,7 +10,7 @@ batch_redirect: Whether a redirect character is needed to submit jobs. batch_directive: The string that prepends a batch directive for the batch system. jobid_pattern: A perl regular expression used to filter out the returned job id from a - queue submission. + queue submission. =============================================================== batch_system @@ -243,7 +243,7 @@ -S {{ shell }} - -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=700GB:ngpus={{ ngpus_per_node }} + -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }} @@ -428,7 +428,7 @@ - ssh login1 cd $CASEROOT ; sbatch @@ -473,8 +473,8 @@ + We achieve this setting both queues as the default queue but limiting it + to jobs that use 8 tasks or fewer; other jobs will land in sib2.9 --> sib2.9 sib2.9,sky2.4 @@ -487,8 +487,8 @@ + We achieve this setting both queues as the default queue but limiting it + to jobs that use 8 tasks or fewer; other jobs will land in sky2.4 --> sky2.4 sib2.9,sky2.4 @@ -726,7 +726,7 @@ regular - + -env diff --git a/config/cesm/machines/config_compilers.xml b/config/cesm/machines/config_compilers.xml index 659b14c6e36..38b3950235a 100644 --- a/config/cesm/machines/config_compilers.xml +++ b/config/cesm/machines/config_compilers.xml @@ -77,7 +77,7 @@ using a fortran linker. - -I$(EXEROOT)/atm/obj/FMS + -I$(EXEROOT)/atm/obj/FMS $(FC_AUTO_R8) @@ -121,10 +121,10 @@ using a fortran linker. -fconvert=big-endian -ffree-line-length-none -ffixed-line-length-none -fopenmp + least with some versions of gfortran (confirmed with 5.4.0, 6.3.0 and + 7.1.0), gfortran's isnan (which is called in cime via the + CPRGNU-specific shr_infnan_isnan) causes a floating point exception + when called on a signaling NaN. --> -g -Wall -Og -fbacktrace -ffpe-trap=zero,overflow -fcheck=bounds -O @@ -207,10 +207,10 @@ using a fortran linker. -fconvert=big-endian -ffree-line-length-none -ffixed-line-length-none -fopenmp + least with some versions of gfortran (confirmed with 5.4.0, 6.3.0 and + 7.1.0), gfortran's isnan (which is called in cime via the + CPRGNU-specific shr_infnan_isnan) causes a floating point exception + when called on a signaling NaN. --> -g -Wall -Og -fbacktrace -ffpe-trap=zero,overflow -fcheck=bounds -O @@ -907,6 +907,10 @@ using a fortran linker. -qopt-report -xCORE_AVX2 -no-fma + + + -Wl,-rpath /glade/u/apps/dav/opt/gnu/9.1.0/lib64 -L/glade/u/apps/dav/opt/gnu/9.1.0/lib64 -lgcc + -DPIO_ENABLE_LOGGING=ON @@ -931,8 +935,8 @@ using a fortran linker. + -fallow-invalid-boz is needed for some components that have non-standard uses of BOZ literal constants with gfortran10. + See https://github.com/ESMCI/cime/issues/3846 for details. --> -fallow-argument-mismatch -fallow-invalid-boz @@ -1314,7 +1318,7 @@ using a fortran linker. + may require installation of the Apple Developer Tools. --> -framework Accelerate -Wl,-rpath $(NETCDF)/lib diff --git a/config/cesm/machines/config_machines.xml b/config/cesm/machines/config_machines.xml index adce35826e8..93ccbb14b47 100644 --- a/config/cesm/machines/config_machines.xml +++ b/config/cesm/machines/config_machines.xml @@ -398,6 +398,7 @@ This allows using a different mpirun command to launch unit tests ncarenv/1.3 cmake/3.18.2 python + /glade/p/cesmdata/cseg/PROGS/modulefiles/CrayLabs pgi/20.4 @@ -417,6 +418,7 @@ This allows using a different mpirun command to launch unit tests intel/19.1.1 mkl/2020.0.1 + /glade/p/cesmdata/cseg/PROGS/modulefiles/CrayLabs /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/ @@ -434,6 +436,14 @@ This allows using a different mpirun command to launch unit tests /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/11.1.0/ esmf-8.2.0b12_casper-ncdfio-openmpi-O + + /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ + esmf-8.2.0b13_casper-ncdfio-openmpi-g + + + /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ + esmf-8.2.0b13_casper-ncdfio-openmpi-O + openmpi/4.1.0 netcdf-mpi/4.8.0 @@ -467,6 +477,10 @@ This allows using a different mpirun command to launch unit tests openmpi/4.1.0 netcdf-mpi/4.7.4 pnetcdf/1.12.2 + SmartRedis/0.1.1 + + + SmartRedis/0.1.1 netcdf/4.7.4 diff --git a/scripts/Tools/Makefile b/scripts/Tools/Makefile index 0d165c4a63c..ea8e0f23dfd 100644 --- a/scripts/Tools/Makefile +++ b/scripts/Tools/Makefile @@ -78,8 +78,7 @@ CPPDEFS := $(USER_CPPDEFS) -D$(OS) include $(CASEROOT)/Macros.make ifeq ($(strip $(USE_SMARTSIM)), TRUE) - CPPDEFS += -DSMARTREDIS - SLIBS += -L$(REDIS_HOME)/install/lib -lsmartredis -Wl,-rpath $(REDIS_HOME)/install/lib + SLIBS += -L$(SMARTREDIS_LIB) -lsmartredis endif # Unless DEBUG mode is enabled, use NDEBUG to turn off assert statements. diff --git a/tools/smartsim/launch.py b/tools/smartsim/launch.py index 209268684cf..f5ca9452054 100755 --- a/tools/smartsim/launch.py +++ b/tools/smartsim/launch.py @@ -18,36 +18,36 @@ def parse_command_line(args, description): parser = argparse.ArgumentParser(description=description, - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--db-nodes", default=1, - help="Number of nodes for the SmartSim database, default=1") + help="Number of nodes for the SmartSim database, default=1") parser.add_argument("--ngpus-per-node", default=0, - help="Number of gpus per SmartSim database node, default=0") + help="Number of gpus per SmartSim database node, default=0") parser.add_argument("--walltime", default="00:30:00", - help="Total walltime for submitted job, default=00:30:00") + help="Total walltime for submitted job, default=00:30:00") parser.add_argument("--member-nodes", default=1, - help="Number of nodes per ensemble member, default=1") + help="Number of nodes per ensemble member, default=1") parser.add_argument("--account", default="P93300606", - help="Account ID") + help="Account ID") parser.add_argument("--db-port", default=6780, - help="db port, default=6780") + help="db port, default=6780") parser.add_argument("--caseroots" , default=[os.getcwd()],nargs="*", - help="Case directory to reference.\n" - "Default is current directory.") + help="Case directory to reference.\n" + "Default is current directory.") args = parser.parse_args(args[1:]) ngpus = "" if int(args.ngpus_per_node) > 0: ngpus = ":ngpus="+args.ngpus_per_node - return {"db_nodes":args.db_nodes, - "caseroots" : ' '.join('"%s"' % x for x in args.caseroots), - "ngpus": ngpus, - "walltime": args.walltime, - "account" : args.account, - "db_port": args.db_port, - "cesmroot": CESM_ROOT, - "python_sys_path": sys.path}, args.caseroots + return {"db_nodes":args.db_nodes, + "caseroots" : ' '.join('"%s"' % x for x in args.caseroots), + "ngpus": ngpus, + "walltime": args.walltime, + "account" : args.account, + "db_port": args.db_port, + "cesmroot": CESM_ROOT, + "python_sys_path": sys.path}, args.caseroots def create_submit_files(templatevars): template_files = ["resv_job.template", "launch_database_cluster.template"] @@ -56,32 +56,31 @@ def create_submit_files(templatevars): with open(os.path.join(rootdir,template)) as f: src = Template(f.read()) result = src.safe_substitute(templatevars) - result_file = template.replace("template","sh") + result_file = template.replace("template","sh") with open(result_file, "w") as f: f.write(result) -def check_cases(caseroots): +def check_cases(caseroots, db_nodes): """ Assume all caseroots use the same number of nodes """ prevcasepes = -1 for caseroot in caseroots: - with Case(caseroot) as case: + with Case(caseroot, read_only=False) as case: expect(case.get_value("BUILD_COMPLETE"),"ERROR: case build not complete for {}".format(caseroot)) casepes = case.get_value("TOTALPES") if prevcasepes > 0: - expect(prevcasepes == casepes, "Case {} pe layout mismatch".format(caseroot)) + expect(prevcasepes == casepes, "Case {} pe layout mismatch".format(caseroot)) else: prevcasepes = casepes member_nodes = case.num_nodes - + case.set_value("CREATE_SMARTSIM_CLUSTER", db_nodes > 1) + return member_nodes def _main_func(desc): templatevars, caseroots = parse_command_line(sys.argv, desc) - print("caseroots are {}".format(caseroots)) - print("t caseroots are {}".format(templatevars["caseroots"])) - templatevars["member_nodes"] = check_cases(caseroots) + templatevars["member_nodes"] = check_cases(caseroots, int(templatevars["db_nodes"])) templatevars["ensemble_size"] = len(caseroots) templatevars["client_nodes"] = int(templatevars["member_nodes"])*len(caseroots) create_submit_files(templatevars) From f8562aeade9402edc470ee46a9163f927976e1b2 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Wed, 21 Jul 2021 07:23:51 -0600 Subject: [PATCH 09/22] add rpath to smartsim --- scripts/Tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Tools/Makefile b/scripts/Tools/Makefile index ea8e0f23dfd..0cce88ae6e5 100644 --- a/scripts/Tools/Makefile +++ b/scripts/Tools/Makefile @@ -78,7 +78,7 @@ CPPDEFS := $(USER_CPPDEFS) -D$(OS) include $(CASEROOT)/Macros.make ifeq ($(strip $(USE_SMARTSIM)), TRUE) - SLIBS += -L$(SMARTREDIS_LIB) -lsmartredis + SLIBS += -L$(SMARTREDIS_LIB) -lsmartredis -Wl,-rpath $(SMARTREDIS_LIB) endif # Unless DEBUG mode is enabled, use NDEBUG to turn off assert statements. From 845bfb8bc20483ecbf4fc954101dfa693fd1e7cd Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Wed, 21 Jul 2021 14:46:06 -0600 Subject: [PATCH 10/22] add sleep to case run for smartsim db to start first --- scripts/Tools/case.submit | 8 ++- scripts/lib/CIME/case/case_run.py | 5 ++ tools/smartsim/launch.py | 55 ++++++++++++------- .../smartsim/launch_database_cluster.template | 10 +++- tools/smartsim/resv_job.template | 7 +-- 5 files changed, 55 insertions(+), 30 deletions(-) diff --git a/scripts/Tools/case.submit b/scripts/Tools/case.submit index 49979d86237..ebc9fe0b5fd 100755 --- a/scripts/Tools/case.submit +++ b/scripts/Tools/case.submit @@ -19,7 +19,7 @@ from standard_script_setup import * from CIME.case import Case from CIME.utils import expect from six.moves import configparser - +import time ############################################################################### def parse_command_line(args, description): ############################################################################### @@ -118,9 +118,13 @@ def _main_func(description, test_args=False): config.write(fd) elif os.path.exists(config_file): os.remove(config_file) - if not test_args: with Case(caseroot, read_only=False, record=True) as case: + use_smartsim = case.get_value("USE_SMARTSIM") + # Give the smartsim db time to spin up + if use_smartsim: + time.sleep(120) + case.submit(job=job, no_batch=no_batch, prereq=prereq, allow_fail=allow_fail, resubmit=resubmit, resubmit_immediate=resubmit_immediate, skip_pnl=skip_pnl, mail_user=mail_user, mail_type=mail_type, diff --git a/scripts/lib/CIME/case/case_run.py b/scripts/lib/CIME/case/case_run.py index 1ecd1715c92..b8f1cee21cc 100644 --- a/scripts/lib/CIME/case/case_run.py +++ b/scripts/lib/CIME/case/case_run.py @@ -199,6 +199,11 @@ def _run_model(case, lid, skip_pnl=False, da_cycle=0): jobid = batch_jobid() msg_func = lambda *args: jobid if jobid is not None else "" + if case.get_value("USE_SMARTSIM"): + logger.info("Give the SmartSim DB time to launch") + time.sleep(60) + logger.info("launching job") + return run_and_log_case_status(functor, "case.run", custom_starting_msg_functor=msg_func, custom_success_msg_functor=msg_func, diff --git a/tools/smartsim/launch.py b/tools/smartsim/launch.py index f5ca9452054..dd75671bb8c 100755 --- a/tools/smartsim/launch.py +++ b/tools/smartsim/launch.py @@ -18,36 +18,45 @@ def parse_command_line(args, description): parser = argparse.ArgumentParser(description=description, - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--db-nodes", default=1, - help="Number of nodes for the SmartSim database, default=1") + help="Number of nodes for the SmartSim database, default=1") parser.add_argument("--ngpus-per-node", default=0, - help="Number of gpus per SmartSim database node, default=0") + help="Number of gpus per SmartSim database node, default=0") parser.add_argument("--walltime", default="00:30:00", - help="Total walltime for submitted job, default=00:30:00") + help="Total walltime for submitted job, default=00:30:00") parser.add_argument("--member-nodes", default=1, - help="Number of nodes per ensemble member, default=1") + help="Number of nodes per ensemble member, default=1") parser.add_argument("--account", default="P93300606", - help="Account ID") + help="Account ID") parser.add_argument("--db-port", default=6780, - help="db port, default=6780") + help="db port, default=6780") parser.add_argument("--caseroots" , default=[os.getcwd()],nargs="*", - help="Case directory to reference.\n" - "Default is current directory.") + help="Case directory to reference.\n" + "Default is current directory.") + parser.add_argument("--logroot", default="/glade/scratch/{}".format(os.environ["USER"]), + help="Root directory under which SmartSimdb log files will be written") + parser.add_argument("--dryrun", action="store_true", + help="Create job scripts, but do not submit") + args = parser.parse_args(args[1:]) ngpus = "" if int(args.ngpus_per_node) > 0: ngpus = ":ngpus="+args.ngpus_per_node + expect(int(args.db_nodes) != 2, "db-nodes size of 2 not allowed, decrease to 1 or increase to 3 or more") + + return {"db_nodes":args.db_nodes, - "caseroots" : ' '.join('"%s"' % x for x in args.caseroots), - "ngpus": ngpus, - "walltime": args.walltime, - "account" : args.account, - "db_port": args.db_port, - "cesmroot": CESM_ROOT, - "python_sys_path": sys.path}, args.caseroots + "caseroots" : ' '.join('"%s"' % x for x in args.caseroots), + "ngpus": ngpus, + "walltime": args.walltime, + "account" : args.account, + "db_port": args.db_port, + "cesmroot": CESM_ROOT, + "logroot" : args.logroot, + "python_sys_path": sys.path}, args.caseroots, args.dryrun def create_submit_files(templatevars): template_files = ["resv_job.template", "launch_database_cluster.template"] @@ -79,15 +88,19 @@ def check_cases(caseroots, db_nodes): return member_nodes def _main_func(desc): - templatevars, caseroots = parse_command_line(sys.argv, desc) + templatevars, caseroots, dryrun = parse_command_line(sys.argv, desc) templatevars["member_nodes"] = check_cases(caseroots, int(templatevars["db_nodes"])) templatevars["ensemble_size"] = len(caseroots) templatevars["client_nodes"] = int(templatevars["member_nodes"])*len(caseroots) + print("Creating submit files") create_submit_files(templatevars) - run_cmd("qsub resv_job.sh", verbose=True) - - - + if not dryrun: + print("Submitting job") + _, o, e = run_cmd("qsub resv_job.sh", verbose=True) + if e: + print("ERROR: {}".format(e)) + if o: + print("{}".format(o)) if __name__ == "__main__": _main_func(__doc__) diff --git a/tools/smartsim/launch_database_cluster.template b/tools/smartsim/launch_database_cluster.template index 4a6469bfb86..0d4645685cc 100644 --- a/tools/smartsim/launch_database_cluster.template +++ b/tools/smartsim/launch_database_cluster.template @@ -8,6 +8,7 @@ ##PBS -q regular #PBS -V #PBS -l select=$db_nodes:ncpus=1:ompthreads=1:mpiprocs=1$ngpus +#PBS -l gpu_type=v100 import os, sys, time @@ -22,13 +23,14 @@ _LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","Tools") sys.path.append(_LIBDIR) _LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","lib") sys.path.append(_LIBDIR) - +from smartsim.utils.log import log_to_file import socket, subprocess import numpy as np from CIME.utils import run_cmd, expect from smartsim import Experiment, constants from smartsim.database import PBSOrchestrator +log_to_file("$logroot/db_debug_log") """ Launch a distributed, in memory database cluster and use the @@ -76,7 +78,7 @@ def launch_cluster_orc(exp, db_hosts, port): # pass in objects to make dirs for exp.generate(db, overwrite=True) - # start the database on interactive allocation + # start the database exp.start(db, block=True) # get the status of the database @@ -93,6 +95,7 @@ def monitor_client_jobs(rsvname): print("Jobs left: Running {} Queued {}".format(int(jobs_left[0]),int(jobs_left[1]))) if int(jobs_left[0]) + int(jobs_left[1]) == 1: jobs_done = True +# jobs_done = False else: time.sleep(60) @@ -101,7 +104,8 @@ def monitor_client_jobs(rsvname): # create the experiment and specify PBS because cheyenne is a PBS system -exp = Experiment("launch_cluster_db", launcher="pbs") +logdir = os.path.join("$logroot","SmartSimdb.{}".format(os.environ['PBS_JOBID'])) +exp = Experiment(logdir, launcher="pbs") db_port = $db_port db_hosts = collect_db_hosts($db_nodes) diff --git a/tools/smartsim/resv_job.template b/tools/smartsim/resv_job.template index 97f497f3bc7..1a296772f6c 100644 --- a/tools/smartsim/resv_job.template +++ b/tools/smartsim/resv_job.template @@ -1,7 +1,7 @@ -#!/bin/bash -x +#!/bin/bash #PBS -N resv_job #PBS -l select=$db_nodes:ncpus=1:mpiprocs=1$ngpus+$client_nodes:ncpus=36:mpiprocs=36 -#PBS -l gpu_type=v100 +##PBS -l gpu_type=v100 #PBS -l walltime=$walltime #PBS -W create_resv_from_job=true #PBS -j oe @@ -22,7 +22,7 @@ if [ -z $rsvname ]; then echo "rsv is unset"; exit -1; else echo "rsv name is se me=$(whoami) pbs_ralter -U $me $rsvname -db_jobid=$(qsub -q $rsvname -vRSVNAME=$rsvname launch_database_cluster.sh) +db_jobid=$(qsub -q $rsvname -vSMARTSIM_LOG_LEVEL=debug -vRSVNAME=$rsvname launch_database_cluster.sh) head_host=$(qstat -f $PBS_JOBID|awk '$1 ~ /^exec_host$/{print $3}'|cut -d\/ -f1-1) # This gets the ib network @@ -30,7 +30,6 @@ SSDB="$(getent hosts ${head_host}-ib|awk '{print $1}'):$db_port" # This gets the external network #SSDB="$(getent hosts ${head_host}.ucar.edu |awk '{print $1}'):$db_port" export SSDB - for caseroot in $caseroots; do ./xmlchange JOB_QUEUE=$rsvname --subgroup case.test --force From 20042154a7f8a559151f183b1cc4920833f77f13 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Thu, 22 Jul 2021 07:57:46 -0600 Subject: [PATCH 11/22] add gptl ifdefs to casper --- config/cesm/machines/config_compilers.xml | 4 ++++ tools/smartsim/resv_job.template | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/config/cesm/machines/config_compilers.xml b/config/cesm/machines/config_compilers.xml index 38b3950235a..c8671e0fe00 100644 --- a/config/cesm/machines/config_compilers.xml +++ b/config/cesm/machines/config_compilers.xml @@ -826,6 +826,10 @@ using a fortran linker. $ENV{NETCDF} gpfs $ENV{PNETCDF} + + + -DHAVE_NANOTIME -DBIT64 -DHAVE_VPRINTF -DHAVE_BACKTRACE -DHAVE_SLASHPROC -DHAVE_COMM_F2C -DHAVE_TIMES -DHAVE_GETTIMEOFDAY + diff --git a/tools/smartsim/resv_job.template b/tools/smartsim/resv_job.template index 1a296772f6c..73a52e10b78 100644 --- a/tools/smartsim/resv_job.template +++ b/tools/smartsim/resv_job.template @@ -1,7 +1,7 @@ #!/bin/bash #PBS -N resv_job #PBS -l select=$db_nodes:ncpus=1:mpiprocs=1$ngpus+$client_nodes:ncpus=36:mpiprocs=36 -##PBS -l gpu_type=v100 +#PBS -l gpu_type=v100 #PBS -l walltime=$walltime #PBS -W create_resv_from_job=true #PBS -j oe From 7b94f9a0d3726040eba26920db8d2c630f799a6d Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Thu, 22 Jul 2021 16:25:32 -0600 Subject: [PATCH 12/22] remove this sleep --- scripts/Tools/case.submit | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/scripts/Tools/case.submit b/scripts/Tools/case.submit index ebc9fe0b5fd..68424ff14ec 100755 --- a/scripts/Tools/case.submit +++ b/scripts/Tools/case.submit @@ -19,7 +19,7 @@ from standard_script_setup import * from CIME.case import Case from CIME.utils import expect from six.moves import configparser -import time + ############################################################################### def parse_command_line(args, description): ############################################################################### @@ -120,11 +120,6 @@ def _main_func(description, test_args=False): os.remove(config_file) if not test_args: with Case(caseroot, read_only=False, record=True) as case: - use_smartsim = case.get_value("USE_SMARTSIM") - # Give the smartsim db time to spin up - if use_smartsim: - time.sleep(120) - case.submit(job=job, no_batch=no_batch, prereq=prereq, allow_fail=allow_fail, resubmit=resubmit, resubmit_immediate=resubmit_immediate, skip_pnl=skip_pnl, mail_user=mail_user, mail_type=mail_type, From b1823ff24a2989e5194713c118b2668b556364ce Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Thu, 22 Jul 2021 16:30:13 -0600 Subject: [PATCH 13/22] add to README --- tools/smartsim/README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/smartsim/README.md b/tools/smartsim/README.md index 93d984a7440..7bde1f6ae8c 100644 --- a/tools/smartsim/README.md +++ b/tools/smartsim/README.md @@ -1,4 +1,4 @@ -Running CESM with the CrayLabs SmartSim tool. +Running CESM with the CrayLabs SmartSim tool. https://github.com/CrayLabs/SmartSim The tools provided here can be used with PBS version 2021 or newer. PBS must support the create_resv_from_job option and the user must be @@ -8,4 +8,12 @@ It allows you to submit multiple jobs to the queue and assure that all of the jobs will start and run concurrently. If --ngpus-per-node > 0 then the SmartSim database will be submitted -to gpu nodes while the CESM case(s) will run on cpu nodes. \ No newline at end of file +to gpu nodes while the CESM case(s) will run on cpu nodes. + +See also: +https://github.com/ESCOMP/CESM/wiki/Using-HPE-SmartSim-with-CESM + +The smartredis https://github.com/CrayLabs/SmartRedis interface is built in share +and available to all components. If USE_SMARTSIM is false then a stub library is built +so that no cpp ifdefs are required. + From 51d467654153e8e9b2ccd38f48263e35bc93ca17 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Thu, 22 Jul 2021 16:38:00 -0600 Subject: [PATCH 14/22] add documentation --- .../xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 b/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 index cbea2898590..709eb5718ba 100644 --- a/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 +++ b/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 @@ -356,6 +356,10 @@ end subroutine InitializeRealize !=============================================================================== subroutine ModelAdvance(gcomp, rc) + ! + ! Test for interface to the CrayLabs smartredis client + ! see cime/tools/smartsim/README.md for details + ! use nuopc_shr_methods, only : use_smartredis ! input/output variables type(ESMF_GridComp) :: gcomp @@ -530,7 +534,12 @@ subroutine ModelFinalize(gcomp, rc) end if end subroutine ModelFinalize -! Test the smartredis interface + ! + ! Test for interface to the CrayLabs smartredis client + ! see cime/tools/smartsim/README.md for details + ! this simply writes a tensor then reads it back and compares the two + ! if everything is working they will be the same + ! subroutine srtest() use nuopc_shr_methods, only : sr_client use iso_c_binding, only : c_double From 347313a4f2062d042acc658ebe6fc03a0d2ce204 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Fri, 23 Jul 2021 09:11:48 -0600 Subject: [PATCH 15/22] add launch wrapper to test scheduler for smartsim test --- scripts/lib/CIME/test_scheduler.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/scripts/lib/CIME/test_scheduler.py b/scripts/lib/CIME/test_scheduler.py index 665745f2c45..9670d2d05fd 100644 --- a/scripts/lib/CIME/test_scheduler.py +++ b/scripts/lib/CIME/test_scheduler.py @@ -793,25 +793,29 @@ def _run_phase(self, test): ########################################################################### test_dir = self._get_test_dir(test) - case_opts = parse_test_name(test)[1] + _,case_opts,_,_,_,_,testmods = parse_test_name(test) if case_opts is not None and "B" in case_opts: # pylint: disable=unsupported-membership-test self._log_output(test, "{} SKIPPED for test '{}'".format(RUN_PHASE, test)) self._update_test_status_file(test, SUBMIT_PHASE, TEST_PASS_STATUS) self._update_test_status_file(test, RUN_PHASE, TEST_PASS_STATUS) - return True, "SKIPPED" + return True, "SKIPPED" else: - cmd = "./case.submit" - if not self._allow_pnl: - cmd += " --skip-preview-namelist" - if self._no_batch: - cmd += " --no-batch" - if self._mail_user: - cmd += " --mail-user={}".format(self._mail_user) - if self._mail_type: - cmd += " -M={}".format(",".join(self._mail_type)) - if self._chksum: - cmd += " --chksum" + # SmartSim test needs a wrapper to submit + if "drv/smartsim" in testmods: + cmd = os.path.join(self._cime_root, "tools", "smartsim", "launch.py") + else: + cmd = "./case.submit" + if not self._allow_pnl: + cmd += " --skip-preview-namelist" + if self._no_batch: + cmd += " --no-batch" + if self._mail_user: + cmd += " --mail-user={}".format(self._mail_user) + if self._mail_type: + cmd += " -M={}".format(",".join(self._mail_type)) + if self._chksum: + cmd += " --chksum" return self._shell_cmd_for_phase(test, cmd, RUN_PHASE, from_dir=test_dir) From 85d41469fc2b5167d0c0ab58c663e4fc670b4f5e Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Mon, 2 Aug 2021 10:27:07 -0600 Subject: [PATCH 16/22] smartsim to cheyenne gnu --- config/cesm/machines/config_compilers.xml | 2 +- config/cesm/machines/config_machines.xml | 10 +++--- tools/smartsim/launch.py | 16 +++++++-- .../smartsim/launch_database_cluster.template | 2 +- tools/smartsim/resv_job.template | 36 +++++++++++++++---- 5 files changed, 51 insertions(+), 15 deletions(-) diff --git a/config/cesm/machines/config_compilers.xml b/config/cesm/machines/config_compilers.xml index c8671e0fe00..c50a145a8a3 100644 --- a/config/cesm/machines/config_compilers.xml +++ b/config/cesm/machines/config_compilers.xml @@ -941,7 +941,7 @@ using a fortran linker. - -fallow-argument-mismatch -fallow-invalid-boz + -ldl diff --git a/config/cesm/machines/config_machines.xml b/config/cesm/machines/config_machines.xml index 93ccbb14b47..d27847c420f 100644 --- a/config/cesm/machines/config_machines.xml +++ b/config/cesm/machines/config_machines.xml @@ -584,6 +584,7 @@ This allows using a different mpirun command to launch unit tests ncarenv/1.3 python/3.7.9 cmake + /glade/p/cesmdata/cseg/PROGS/modulefiles/CrayLabs intel/19.1.1 @@ -591,8 +592,8 @@ This allows using a different mpirun command to launch unit tests mkl - gnu/10.1.0 - openblas/0.3.9 + gnu/9.1.0 + openblas/0.3.6 pgi/20.4 @@ -682,8 +683,9 @@ This allows using a different mpirun command to launch unit tests netcdf-mpi/4.7.4 - openmpi/3.1.4 - netcdf/4.7.1 + openmpi/4.0.5 + netcdf-mpi/4.7.4 + SmartRedis/0.1.1 ncarcompilers/0.5.0 diff --git a/tools/smartsim/launch.py b/tools/smartsim/launch.py index dd75671bb8c..566fd539fdd 100755 --- a/tools/smartsim/launch.py +++ b/tools/smartsim/launch.py @@ -2,12 +2,14 @@ import os, sys CESM_ROOT = os.getenv("CESM_ROOT") -if not CESM_ROOT: - raise SystemExit("ERROR: CESM_ROOT must be defined in environment") _LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","Tools") +if not os.path.isdir(_LIBDIR): + raise SystemExit("ERROR: CESM_ROOT must be defined in environment {}".format(CESM_ROOT)) sys.path.append(_LIBDIR) _LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","lib") +if not os.path.isdir(_LIBDIR): + raise SystemExit("ERROR: CESM_ROOT must be defined in environment") sys.path.append(_LIBDIR) @@ -94,9 +96,17 @@ def _main_func(desc): templatevars["client_nodes"] = int(templatevars["member_nodes"])*len(caseroots) print("Creating submit files") create_submit_files(templatevars) + host = os.environ.get("NCAR_HOST") + if host == "cheyenne": + queue_name = "regular" + gpu_flag = "" + else: + queue_name = "casper" + gpu_flag = "-l gpu_type=vt100" + if not dryrun: print("Submitting job") - _, o, e = run_cmd("qsub resv_job.sh", verbose=True) + _, o, e = run_cmd("qsub -q {} {} resv_job.sh ".format(queue_name, gpu_flag), verbose=True) if e: print("ERROR: {}".format(e)) if o: diff --git a/tools/smartsim/launch_database_cluster.template b/tools/smartsim/launch_database_cluster.template index 0d4645685cc..89e9c5ed213 100644 --- a/tools/smartsim/launch_database_cluster.template +++ b/tools/smartsim/launch_database_cluster.template @@ -8,7 +8,7 @@ ##PBS -q regular #PBS -V #PBS -l select=$db_nodes:ncpus=1:ompthreads=1:mpiprocs=1$ngpus -#PBS -l gpu_type=v100 +##PBS -l gpu_type=v100 import os, sys, time diff --git a/tools/smartsim/resv_job.template b/tools/smartsim/resv_job.template index 73a52e10b78..8be29177afd 100644 --- a/tools/smartsim/resv_job.template +++ b/tools/smartsim/resv_job.template @@ -1,12 +1,11 @@ #!/bin/bash #PBS -N resv_job #PBS -l select=$db_nodes:ncpus=1:mpiprocs=1$ngpus+$client_nodes:ncpus=36:mpiprocs=36 -#PBS -l gpu_type=v100 +##PBS -l gpu_type=v100 #PBS -l walltime=$walltime #PBS -W create_resv_from_job=true #PBS -j oe #PBS -k oed -#PBS -q casper #PBS -A $account for rsv in $(qstat -Q|awk '$1 ~ /^R/{print $1}') @@ -22,18 +21,43 @@ if [ -z $rsvname ]; then echo "rsv is unset"; exit -1; else echo "rsv name is se me=$(whoami) pbs_ralter -U $me $rsvname -db_jobid=$(qsub -q $rsvname -vSMARTSIM_LOG_LEVEL=debug -vRSVNAME=$rsvname launch_database_cluster.sh) +if [[ "$NCAR_HOST" == "dav" ]]; then + gpu_type="-l gpu_type=vt100" +else + gpu_type="" +fi + +#db_jobid=$(qsub -q $rsvname $gpu_type -vSMARTSIM_LOG_LEVEL=debug -vRSVNAME=$rsvname launch_database_cluster.sh) +db_jobid=$(qsub -q $rsvname $gpu_type -vRSVNAME=$rsvname launch_database_cluster.sh) +if [ -z $db_jobid ]; then echo "db_jobid is unset"; exit -1; fi head_host=$(qstat -f $PBS_JOBID|awk '$1 ~ /^exec_host$/{print $3}'|cut -d\/ -f1-1) # This gets the ib network -SSDB="$(getent hosts ${head_host}-ib|awk '{print $1}'):$db_port" +if [[ "$NCAR_HOST" == "dav" ]]; then + host_name=${head_host}-ib +else + host_name=${head_host}-ib0 +fi +# This gets the ip over ib network and should be prefered +SSDB="$(getent hosts ${host_name}|awk '{print $1}'):$db_port" # This gets the external network #SSDB="$(getent hosts ${head_host}.ucar.edu |awk '{print $1}'):$db_port" + export SSDB +echo "gpu_type is $gpu_type, head_host is $head_host, db_jobid is $db_jobid, SSDB is $SSDB" for caseroot in $caseroots; do - ./xmlchange JOB_QUEUE=$rsvname --subgroup case.test --force - ./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.test + cd $caseroot + testrun=`./xmlquery --value TEST` + if [[ "$testrun" == "TRUE" ]]; then + echo "Starting Test $caseroot" + ./xmlchange JOB_QUEUE=$rsvname --subgroup case.test --force + ./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.test + else + echo "Starting run $caseroot" + ./xmlchange JOB_QUEUE=$rsvname --subgroup case.run --force + ./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.run + fi ./case.submit done From 192c418af145a7b60108721ae50ca5abb8bb7fc9 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Mon, 2 Aug 2021 15:17:17 -0600 Subject: [PATCH 17/22] smartsim setup --- config/cesm/machines/config_machines.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/config/cesm/machines/config_machines.xml b/config/cesm/machines/config_machines.xml index d27847c420f..091beadb158 100644 --- a/config/cesm/machines/config_machines.xml +++ b/config/cesm/machines/config_machines.xml @@ -687,6 +687,12 @@ This allows using a different mpirun command to launch unit tests netcdf-mpi/4.7.4 SmartRedis/0.1.1 + + openmpi/4.0.5 + netcdf-mpi/4.7.4 + pnetcdf/1.12.1 + SmartRedis/0.1.1 + ncarcompilers/0.5.0 From f5739d0900817f8056f287f38c84cab6273df7dd Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Tue, 3 Aug 2021 17:03:34 -0600 Subject: [PATCH 18/22] now working on cheyenne --- config/cesm/machines/config_compilers.xml | 32 ++++++++++++++--------- config/cesm/machines/config_machines.xml | 18 +++++++++---- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/config/cesm/machines/config_compilers.xml b/config/cesm/machines/config_compilers.xml index c50a145a8a3..3bc9ba093db 100644 --- a/config/cesm/machines/config_compilers.xml +++ b/config/cesm/machines/config_compilers.xml @@ -77,7 +77,7 @@ using a fortran linker. - -I$(EXEROOT)/atm/obj/FMS + -I$(EXEROOT)/atm/obj/FMS $(FC_AUTO_R8) @@ -121,10 +121,10 @@ using a fortran linker. -fconvert=big-endian -ffree-line-length-none -ffixed-line-length-none -fopenmp + least with some versions of gfortran (confirmed with 5.4.0, 6.3.0 and + 7.1.0), gfortran's isnan (which is called in cime via the + CPRGNU-specific shr_infnan_isnan) causes a floating point exception + when called on a signaling NaN. --> -g -Wall -Og -fbacktrace -ffpe-trap=zero,overflow -fcheck=bounds -O @@ -207,10 +207,10 @@ using a fortran linker. -fconvert=big-endian -ffree-line-length-none -ffixed-line-length-none -fopenmp + least with some versions of gfortran (confirmed with 5.4.0, 6.3.0 and + 7.1.0), gfortran's isnan (which is called in cime via the + CPRGNU-specific shr_infnan_isnan) causes a floating point exception + when called on a signaling NaN. --> -g -Wall -Og -fbacktrace -ffpe-trap=zero,overflow -fcheck=bounds -O @@ -913,7 +913,7 @@ using a fortran linker. - -Wl,-rpath /glade/u/apps/dav/opt/gnu/9.1.0/lib64 -L/glade/u/apps/dav/opt/gnu/9.1.0/lib64 -lgcc + -Wl,-rpath /glade/u/apps/dav/opt/gnu/9.1.0/lib64 -DPIO_ENABLE_LOGGING=ON @@ -939,8 +939,8 @@ using a fortran linker. + -fallow-invalid-boz is needed for some components that have non-standard uses of BOZ literal constants with gfortran10. + See https://github.com/ESMCI/cime/issues/3846 for details. --> @@ -962,8 +962,14 @@ using a fortran linker. $ENV{CESMDATAROOT}/tools/pFUnit/pFUnit3.2.8_cheyenne_Intel17.0.1_MPI_openMP TRUE + + + -std=c11 -gxx-name=/glade/u/apps/ch/opt/gnu/9.1.0/bin/g++ -Wl,-rpath,/glade/u/apps/ch/opt/gnu/9.1.0/lib64 -L/glade/u/apps/ch/opt/gnu/9.1.0/lib64 -lgcc + + + -llapack -lblas @@ -1322,7 +1328,7 @@ using a fortran linker. + may require installation of the Apple Developer Tools. --> -framework Accelerate -Wl,-rpath $(NETCDF)/lib diff --git a/config/cesm/machines/config_machines.xml b/config/cesm/machines/config_machines.xml index 091beadb158..efad4b202ac 100644 --- a/config/cesm/machines/config_machines.xml +++ b/config/cesm/machines/config_machines.xml @@ -598,14 +598,22 @@ This allows using a different mpirun command to launch unit tests pgi/20.4 - + /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ esmf-8.2.0b13-ncdfio-mpt-g - + /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ esmf-8.2.0b13-ncdfio-mpt-O + + /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ + esmf-8.2.0b14-ncdfio-openmpi-g + + + /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ + esmf-8.2.0b14-ncdfio-openmpi-O + /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ esmf-8.2.0b13-ncdfio-mpiuni-g @@ -688,9 +696,9 @@ This allows using a different mpirun command to launch unit tests SmartRedis/0.1.1 - openmpi/4.0.5 - netcdf-mpi/4.7.4 - pnetcdf/1.12.1 + openmpi/4.1.0 + netcdf-mpi/4.8.0 + pnetcdf/1.12.2 SmartRedis/0.1.1 From d5b122fa7e62fc402cad91fae68c5a7e5b1354f6 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Mon, 16 Aug 2021 10:42:43 -0600 Subject: [PATCH 19/22] fix error when testmods not defined --- scripts/lib/CIME/test_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lib/CIME/test_scheduler.py b/scripts/lib/CIME/test_scheduler.py index 9670d2d05fd..2dde7c6fd59 100644 --- a/scripts/lib/CIME/test_scheduler.py +++ b/scripts/lib/CIME/test_scheduler.py @@ -802,7 +802,7 @@ def _run_phase(self, test): return True, "SKIPPED" else: # SmartSim test needs a wrapper to submit - if "drv/smartsim" in testmods: + if testmods and "drv/smartsim" in testmods: cmd = os.path.join(self._cime_root, "tools", "smartsim", "launch.py") else: cmd = "./case.submit" From f7181f9a00f48452f5da78eb2dd721297646063f Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Fri, 10 Sep 2021 09:41:45 -0600 Subject: [PATCH 20/22] rebase to cime master --- config/cesm/machines/config_compilers.xml | 4 ++-- config/cesm/machines/config_machines.xml | 3 +++ scripts/lib/CIME/test_scheduler.py | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/config/cesm/machines/config_compilers.xml b/config/cesm/machines/config_compilers.xml index 3bc9ba093db..429c8042191 100644 --- a/config/cesm/machines/config_compilers.xml +++ b/config/cesm/machines/config_compilers.xml @@ -906,7 +906,7 @@ using a fortran linker. - -qopt-report -xCORE_AVX2 -no-fma + -qopt-report -xCORE_AVX2 -no-fma -qopt-report -xCORE_AVX2 -no-fma @@ -914,7 +914,7 @@ using a fortran linker. -Wl,-rpath /glade/u/apps/dav/opt/gnu/9.1.0/lib64 - + -DPIO_ENABLE_LOGGING=ON diff --git a/config/cesm/machines/config_machines.xml b/config/cesm/machines/config_machines.xml index efad4b202ac..6baac8fec55 100644 --- a/config/cesm/machines/config_machines.xml +++ b/config/cesm/machines/config_machines.xml @@ -496,6 +496,9 @@ This allows using a different mpirun command to launch unit tests /glade/p/cesmdata/cseg $ENV{NETCDF} + + /glade/u/apps/dav/opt/gnu/9.1.0/bin/:$ENV{PATH} + -1 diff --git a/scripts/lib/CIME/test_scheduler.py b/scripts/lib/CIME/test_scheduler.py index 2dde7c6fd59..b2b209a658a 100644 --- a/scripts/lib/CIME/test_scheduler.py +++ b/scripts/lib/CIME/test_scheduler.py @@ -803,6 +803,7 @@ def _run_phase(self, test): else: # SmartSim test needs a wrapper to submit if testmods and "drv/smartsim" in testmods: + self._log_output(test, "Runnning smartsim launch script") cmd = os.path.join(self._cime_root, "tools", "smartsim", "launch.py") else: cmd = "./case.submit" From 98fd5a9f963cb3cb5bb2d39d6cd93596c83c280a Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Thu, 11 Nov 2021 16:56:45 -0700 Subject: [PATCH 21/22] need to explicitly forward environment --- tools/smartsim/resv_job.template | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/smartsim/resv_job.template b/tools/smartsim/resv_job.template index 8be29177afd..d8bc9b03a42 100644 --- a/tools/smartsim/resv_job.template +++ b/tools/smartsim/resv_job.template @@ -7,6 +7,7 @@ #PBS -j oe #PBS -k oed #PBS -A $account +#PBS -V for rsv in $(qstat -Q|awk '$1 ~ /^R/{print $1}') do From febc3b3603288856f96b1161099b68a90db9d245 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Tue, 16 Nov 2021 11:18:49 -0700 Subject: [PATCH 22/22] updates to smartredis interface --- config/cesm/machines/config_machines.xml | 67 ++++++++++--------- scripts/lib/CIME/case/case_run.py | 2 +- .../xatm/src/atm_comp_nuopc.F90 | 30 +++++---- .../smartsim/launch_database_cluster.template | 8 ++- tools/smartsim/resv_job.template | 14 ++-- 5 files changed, 68 insertions(+), 53 deletions(-) diff --git a/config/cesm/machines/config_machines.xml b/config/cesm/machines/config_machines.xml index 6baac8fec55..21649dfec1c 100644 --- a/config/cesm/machines/config_machines.xml +++ b/config/cesm/machines/config_machines.xml @@ -585,7 +585,7 @@ This allows using a different mpirun command to launch unit tests ncarenv/1.3 - python/3.7.9 + conda/latest cmake /glade/p/cesmdata/cseg/PROGS/modulefiles/CrayLabs @@ -603,15 +603,15 @@ This allows using a different mpirun command to launch unit tests /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ - esmf-8.2.0b13-ncdfio-mpt-g + esmf-8.2.0b23-ncdfio-mpt-g /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ - esmf-8.2.0b13-ncdfio-mpt-O + esmf-8.2.0b23-ncdfio-mpt-O /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ - esmf-8.2.0b14-ncdfio-openmpi-g + esmf-8.2.0b23-ncdfio-openmpi-g /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ @@ -619,60 +619,60 @@ This allows using a different mpirun command to launch unit tests /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ - esmf-8.2.0b13-ncdfio-mpiuni-g + esmf-8.2.0b23-ncdfio-mpiuni-g /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/ - esmf-8.2.0b13-ncdfio-mpiuni-O + esmf-8.2.0b23-ncdfio-mpiuni-O /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/ - esmf-8.2.0b13-ncdfio-mpt-g + esmf-8.2.0b23-ncdfio-mpt-g /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/ - esmf-8.2.0b13-ncdfio-mpt-O + esmf-8.2.0b23-ncdfio-mpt-O /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/ - esmf-8.2.0b13-ncdfio-openmpi-g + esmf-8.2.0b23-ncdfio-openmpi-g /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/ - esmf-8.2.0b13-ncdfio-openmpi-O + esmf-8.2.0b23-ncdfio-openmpi-O /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/ - esmf-8.2.0b13-ncdfio-mpiuni-g + esmf-8.2.0b23-ncdfio-mpiuni-g /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/ - esmf-8.2.0b13-ncdfio-mpiuni-O + esmf-8.2.0b23-ncdfio-mpiuni-O /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/ - esmf-8.2.0b13-ncdfio-mpt-g + esmf-8.2.0b23-ncdfio-mpt-g /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/ - esmf-8.2.0b13-ncdfio-mpt-O + esmf-8.2.0b23-ncdfio-mpt-O /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/ - esmf-8.2.0b13-ncdfio-openmpi-g + esmf-8.2.0b23-ncdfio-openmpi-g /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/ - esmf-8.2.0b13-ncdfio-openmpi-O + esmf-8.2.0b23-ncdfio-openmpi-O /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/ - esmf-8.2.0b13-ncdfio-mpiuni-g + esmf-8.2.0b23-ncdfio-mpiuni-g /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/ - esmf-8.2.0b13-ncdfio-mpiuni-O + esmf-8.2.0b23-ncdfio-mpiuni-O mpt/2.21 @@ -696,13 +696,13 @@ This allows using a different mpirun command to launch unit tests openmpi/4.0.5 netcdf-mpi/4.7.4 - SmartRedis/0.1.1 + SmartRedis/0.2.0 openmpi/4.1.0 netcdf-mpi/4.8.0 pnetcdf/1.12.2 - SmartRedis/0.1.1 + SmartRedis/0.2.0 ncarcompilers/0.5.0 @@ -733,6 +733,9 @@ This allows using a different mpirun command to launch unit tests PASSIVE true + + /glade/u/apps/ch/opt/gnu/9.1.0/lib64:$ENV{LD_LIBRARY_PATH} + false @@ -2039,52 +2042,52 @@ This allows using a different mpirun command to launch unit tests /fs/cgd/data0/modules/modulefiles/esmfpkgs/gfortran/9.3.0 - esmf-8.2.0b13-ncdfio-mvapich2-O + esmf-8.2.0b23-ncdfio-mvapich2-O /fs/cgd/data0/modules/modulefiles/esmfpkgs/gfortran/9.3.0 - esmf-8.2.0b13-ncdfio-mvapich2-g + esmf-8.2.0b23-ncdfio-mvapich2-g /fs/cgd/data0/modules/modulefiles/esmfpkgs/gfortran/9.3.0 - esmf-8.2.0b13-ncdfio-mpiuni-O + esmf-8.2.0b23-ncdfio-mpiuni-O /fs/cgd/data0/modules/modulefiles/esmfpkgs/gfortran/9.3.0 - esmf-8.2.0b13-ncdfio-mpiuni-g + esmf-8.2.0b23-ncdfio-mpiuni-g /fs/cgd/data0/modules/modulefiles/esmfpkgs/nag/6.2 - esmf-8.2.0b13-ncdfio-mvapich2-O + esmf-8.2.0b23-ncdfio-mvapich2-O /fs/cgd/data0/modules/modulefiles/esmfpkgs/nag/6.2 - esmf-8.2.0b13-ncdfio-mvapich2-g + esmf-8.2.0b23-ncdfio-mvapich2-g /fs/cgd/data0/modules/modulefiles/esmfpkgs/nag/6.2 - esmf-8.2.0b13-ncdfio-mpiuni-g + esmf-8.2.0b23-ncdfio-mpiuni-g /fs/cgd/data0/modules/modulefiles/esmfpkgs/nag/6.2 - esmf-8.2.0b13-ncdfio-mpiuni-O + esmf-8.2.0b23-ncdfio-mpiuni-O /fs/cgd/data0/modules/modulefiles/esmfpkgs/intel/20.0.1 - esmf-8.2.0b13-ncdfio-mpiuni-g + esmf-8.2.0b23-ncdfio-mpiuni-g /fs/cgd/data0/modules/modulefiles/esmfpkgs/intel/20.0.1 - esmf-8.2.0b13-ncdfio-mpiuni-O + esmf-8.2.0b23-ncdfio-mpiuni-O /fs/cgd/data0/modules/modulefiles/esmfpkgs/intel/20.0.1 - esmf-8.2.0b13-ncdfio-mvapich2-g + esmf-8.2.0b23-ncdfio-mvapich2-g /fs/cgd/data0/modules/modulefiles/esmfpkgs/intel/20.0.1 - esmf-8.2.0b13-ncdfio-mvapich2-O + esmf-8.2.0b23-ncdfio-mvapich2-O /fs/cgd/data0/modules/modulefiles/esmfpkgs/pgi/20.1 diff --git a/scripts/lib/CIME/case/case_run.py b/scripts/lib/CIME/case/case_run.py index b8f1cee21cc..3a140b770d7 100644 --- a/scripts/lib/CIME/case/case_run.py +++ b/scripts/lib/CIME/case/case_run.py @@ -201,7 +201,7 @@ def _run_model(case, lid, skip_pnl=False, da_cycle=0): if case.get_value("USE_SMARTSIM"): logger.info("Give the SmartSim DB time to launch") - time.sleep(60) + time.sleep(10) logger.info("launching job") return run_and_log_case_status(functor, "case.run", diff --git a/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 b/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 index 709eb5718ba..65fc4282fb0 100644 --- a/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 +++ b/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 @@ -561,21 +561,23 @@ subroutine srtest() if (chkerr(rc,__LINE__,u_FILE_u)) return call ESMF_VMGet(vm, localPet=mytask, rc=rc) if (chkerr(rc,__LINE__,u_FILE_u)) return - - call random_number(true_array_real_64) - call random_number(recv_array_real_64) - write(key_prefix, "(A,I6.6)") "pe_",mytask - if(mastertask) write(logunit, *) 'putting tensor to smartsim database ' - call sr_client%put_tensor(key_prefix//"true_array_real_64", true_array_real_64, & - shape(true_array_real_64)) - if(mastertask) write(logunit, *) 'retreveing tensor from database ' - call sr_client%unpack_tensor(key_prefix//"true_array_real_64", recv_array_real_64,& - shape(recv_array_real_64)) - if (.not. all(true_array_real_64 == recv_array_real_64)) then - call shr_sys_abort('true_array_real_64: FAILED') + + if (sr_client%isinitialized()) then + call random_number(true_array_real_64) + call random_number(recv_array_real_64) + write(key_prefix, "(A,I6.6)") "pe_",mytask + if(mastertask) write(logunit, *) 'putting tensor to smartsim database ' + call sr_client%put_tensor(key_prefix//"true_array_real_64", true_array_real_64, & + shape(true_array_real_64)) + if(mastertask) write(logunit, *) 'retreveing tensor from database ' + call sr_client%unpack_tensor(key_prefix//"true_array_real_64", recv_array_real_64,& + shape(recv_array_real_64)) + if (.not. all(true_array_real_64 == recv_array_real_64)) then + call shr_sys_abort('true_array_real_64: FAILED') + endif + else + call shr_sys_abort('Could not detect smartsim database') endif - - end subroutine srtest end module atm_comp_nuopc diff --git a/tools/smartsim/launch_database_cluster.template b/tools/smartsim/launch_database_cluster.template index 89e9c5ed213..df003b97316 100644 --- a/tools/smartsim/launch_database_cluster.template +++ b/tools/smartsim/launch_database_cluster.template @@ -68,10 +68,14 @@ def collect_db_hosts(num_hosts): def launch_cluster_orc(exp, db_hosts, port): """Just spin up a database cluster, check the status and tear it down""" - + ld_library_path = os.getenv("LD_LIBRARY_PATH") + os.environ["LD_LIBRARY_PATH"]="/glade/u/apps/ch/opt/gnu/9.1.0/lib64" + if ld_library_path: + os.environ["LD_LIBRARY_PATH"]="/glade/u/apps/ch/opt/gnu/9.1.0/lib64:{}".format(ld_library_path) + print(f"Starting Orchestrator on hosts: {db_hosts}") # batch = False to launch on existing allocation - db = PBSOrchestrator(port=port, db_nodes=len(db_hosts), batch=False, + db = PBSOrchestrator(port=port, db_nodes=len(db_hosts), batch=False, interface="ib0", run_command="mpirun", hosts=db_hosts) # generate directories for output files diff --git a/tools/smartsim/resv_job.template b/tools/smartsim/resv_job.template index d8bc9b03a42..db0306f9194 100644 --- a/tools/smartsim/resv_job.template +++ b/tools/smartsim/resv_job.template @@ -14,10 +14,12 @@ do parent_job=$(pbs_rstat -F $rsv|awk '$1 ~ /^reserve_job/{print $3}') if [[ "${PBS_JOBID}" == "${parent_job}" ]] ; then rsvname=$rsv + head_host=$(pbs_rstat -F $rsv|awk '$1 ~ /^resv_nodes/{print $3}' | cut -d: -f1-1) + head_host="${head_host:1}" break fi done -if [ -z $rsvname ]; then echo "rsv is unset"; exit -1; else echo "rsv name is set to '$rsvname'"; fi +if [ -z $rsvname ]; then echo "rsv is unset"; exit -1; else echo "rsv name is set to '$rsvname' head_host is $head_host"; fi me=$(whoami) pbs_ralter -U $me $rsvname @@ -27,22 +29,26 @@ if [[ "$NCAR_HOST" == "dav" ]]; then else gpu_type="" fi - -#db_jobid=$(qsub -q $rsvname $gpu_type -vSMARTSIM_LOG_LEVEL=debug -vRSVNAME=$rsvname launch_database_cluster.sh) db_jobid=$(qsub -q $rsvname $gpu_type -vRSVNAME=$rsvname launch_database_cluster.sh) if [ -z $db_jobid ]; then echo "db_jobid is unset"; exit -1; fi -head_host=$(qstat -f $PBS_JOBID|awk '$1 ~ /^exec_host$/{print $3}'|cut -d\/ -f1-1) +#qstat -n $PBS_JOBID +#head_host=$(qstat -n $PBS_JOBID | awk '$1 ~ /chadmi/{print $12}'|cut -d\/ -f1-1) + +if [ -z $head_host ]; then echo "head_host is unset, PBS_JOBID=$PBS_JOBID"; qdel $PBS_JOBID; exit -1; fi + # This gets the ib network if [[ "$NCAR_HOST" == "dav" ]]; then host_name=${head_host}-ib else host_name=${head_host}-ib0 fi +echo "HOST NAME is $host_name, head_host is $head_host" # This gets the ip over ib network and should be prefered SSDB="$(getent hosts ${host_name}|awk '{print $1}'):$db_port" # This gets the external network #SSDB="$(getent hosts ${head_host}.ucar.edu |awk '{print $1}'):$db_port" +#db_jobid=$(qsub -q $rsvname $gpu_type -vSMARTSIM_LOG_LEVEL=debug -vRSVNAME=$rsvname launch_database_cluster.sh) export SSDB echo "gpu_type is $gpu_type, head_host is $head_host, db_jobid is $db_jobid, SSDB is $SSDB"