We'd like to speed up our post-processing.
Currently reading just the top layer of a 3D output forces loading the whole volume because each record is a single HDF5 chunk. (Related to @pmav99's #219
diff --git a/sample_inputs/param.nml b/sample_inputs/param.nml
index 46f69c9d..78cbb806 100644
--- a/sample_inputs/param.nml
+++ b/sample_inputs/param.nml
@@ -846,6 +846,18 @@
!-----------------------------------------------------------------------
iof_ugrid = 0
+!-----------------------------------------------------------------------
+! Vertical chunking for 3D netCDF outputs (scribed I/O).
+! Controls the NF90 chunk shape used for every 3D variable written by
+! nc_writeout3D (salinity, temperature, velocity, tracers, etc.).
+! nchunk_vrt = 0 : default, one chunk = full (nvrt,nhoriz,1) slab
+! nchunk_vrt = N>0: N layers per chunk, i.e. chunks = (N,nhoriz,1).
+! N=1 gives the cheapest level-wise reads for
+! xarray/ParaView/etc. and keeps each chunk well
+! under the 4 GB netCDF-4 limit on large meshes.
+!-----------------------------------------------------------------------
+ nchunk_vrt = 0
+
!-----------------------------------------------------------------------
! Option for hotstart outputs
!-----------------------------------------------------------------------
diff --git a/src/Core/schism_glbl.F90 b/src/Core/schism_glbl.F90
index d4c35dcd..81ba30d4 100644
--- a/src/Core/schism_glbl.F90
+++ b/src/Core/schism_glbl.F90
@@ -94,7 +94,7 @@ module schism_glbl
&nstep_ice,niter_shap,iunder_deep,flag_fib,ielm_transport,max_subcyc, &
&itransport_only,iloadtide,nc_out,nu_sum_mult,iprecip_off_bnd, &
&iof_ugrid,model_type_pahm,iof_icm_sav,iof_icm_marsh,iof_icm_sfm,iof_icm_ba,&
- &iof_icm_clam,nbins_veg_vert,niter_hdif,nmarsh_types,istemp
+ &iof_icm_clam,nbins_veg_vert,niter_hdif,nmarsh_types,istemp,nchunk_vrt
integer,save :: ntrs(natrm),nnu_pts(natrm),mnu_pts,lev_tr_source(natrm)
integer,save,dimension(:),allocatable :: iof_hydro,iof_wwm,iof_gen,iof_age,iof_sed,iof_eco, &
&iof_icm,iof_icm_core,iof_icm_silica,iof_icm_zb,iof_icm_ph,iof_icm_srm,iof_cos,iof_fib, &
diff --git a/src/Core/scribe_io.F90 b/src/Core/scribe_io.F90
index 8606dd0a..6416ad0a 100644
--- a/src/Core/scribe_io.F90
+++ b/src/Core/scribe_io.F90
@@ -54,7 +54,8 @@
integer,save :: ifile,ihfskip,nspool,nc_out,nvrt,nproc_compute,np_global,ne_global,ns_global, &
&np_max,ne_max,ns_max,ncount_2dnode,ncount_2delem,ncount_2dside,ncount_3dnode,ncount_3delem,ncount_3dside, &
- &iths0,ncid_schism_2d,ncid_schism_3d,istart_sed_3dnode,start_year,start_month,start_day, ics,iof_ugrid
+ &iths0,ncid_schism_2d,ncid_schism_3d,istart_sed_3dnode,start_year,start_month,start_day, ics,iof_ugrid, &
+ &nchunk_vrt
!Output flag dim must be same as schism_init!
integer,save :: ntrs(natrm),iof_hydro(40),iof_wwm(40),iof_cos(20),iof_fib(5), &
&iof_sed2d(14),iof_ice(10),iof_ana(20),iof_marsh(2),counter_out_name,nout_icm_3d(2)
@@ -141,6 +142,7 @@
#endif
call mpi_recv(ics,1,itype,0,146,comm_schism,rrqst,ierr)
call mpi_recv(iof_ugrid,1,itype,0,147,comm_schism,rrqst,ierr)
+ call mpi_recv(nchunk_vrt,1,itype,0,148,comm_schism,rrqst,ierr)
if(myrank_scribe==0) then
write(16,*)'Scribe ',myrank_scribe,myrank_schism,nproc_scribe,nproc_compute
@@ -971,10 +973,21 @@
endif
!Define chunk size (contiguous block for access) for deflation: each chunk
- !must be < 4GB in size
- !TODO: add a scale for chunks(2) for large meshes
- chunks(1)=idim1; chunks(2)=idim2; chunks(3)=1
- !Outputs are in 4 bytes; 4GB is in binary not decimal - not precise
+ !must be < 4GB in size.
+ !nchunk_vrt (param.nml, SCHOUT) controls vertical chunking:
+ ! =0 : default behaviour, one big chunk for the whole 3D mesh loaded at one time step
+ ! >0 : group nchunk_vrt layers per chunk along the vertical dimension
+ ! (example: 1 = one layer per chunk; best for level-wise reads)
+ if(nchunk_vrt<=0) then
+ chunks(1)=idim1; chunks(2)=idim2; chunks(3)=1
+ else
+ chunks(1)=min(nchunk_vrt,idim1); chunks(2)=idim2; chunks(3)=1
+ endif
+ !Outputs are in 4 bytes; 4GB is in binary not decimal - not precise.
+ !Auto-shrink horizontal chunk on huge meshes instead of aborting.
+ do while(4.d0*chunks(1)*chunks(2)*chunks(3)>3.d9.and.chunks(2)>1)
+ chunks(2)=max(1,chunks(2)/2)
+ enddo
if(4.d0*chunks(1)*chunks(2)*chunks(3)>3.d9) call parallel_abort('nc_writeout3D: chunk size')
if(mod(it-nspool,ihfskip)==0) then
diff --git a/src/Hydro/schism_init.F90 b/src/Hydro/schism_init.F90
index 6a71bbcb..923664b8 100644
--- a/src/Hydro/schism_init.F90
+++ b/src/Hydro/schism_init.F90
@@ -220,7 +220,7 @@
namelist /SCHOUT/nc_out,iof_hydro,iof_wwm,iof_gen,iof_age,iof_sed,iof_eco,iof_icm_core, &
&iof_icm_silica,iof_icm_zb,iof_icm_ph,iof_icm_srm,iof_icm_sav,iof_icm_marsh,iof_icm_sfm, &
&iof_icm_ba,iof_icm_clam,iof_cos,iof_fib,iof_sed2d,iof_ice,iof_mice,iof_ana,iof_marsh,iof_dvd, &
- &nhot,nhot_write,iout_sta,nspool_sta,iof_ugrid
+ &nhot,nhot_write,iout_sta,nspool_sta,iof_ugrid,nchunk_vrt
!-------------------------------------------------------------------------------
!-------------------------------------------------------------------------------
@@ -533,6 +533,7 @@
iof_icm_core=0; iof_icm_silica=0; iof_icm_zb=0; iof_icm_ph=0; iof_icm_srm=0; iof_icm_sav=0
iof_icm_marsh=0; iof_icm_sfm=0; iof_icm_ba=0; iof_icm_clam=0; iof_cos=0; iof_fib=0; iof_sed2d=0
iof_ice=0; iof_mice=0; iof_ana=0; iof_marsh=0; nhot=0; nhot_write=8640; iout_sta=0; nspool_sta=10; iof_ugrid=0
+ nchunk_vrt=0 !default: whole-volume chunk (>0 = N layers per chunk)
read(15,nml=OPT)
read(15,nml=SCHOUT)
@@ -7160,6 +7161,7 @@
#endif
call mpi_send(ics,1,itype,nproc_schism-i,146,comm_schism,ierr)
call mpi_send(iof_ugrid,1,itype,nproc_schism-i,147,comm_schism,ierr)
+ call mpi_send(nchunk_vrt,1,itype,nproc_schism-i,148,comm_schism,ierr)
enddo !i
endif !myrank=0
Hi Joseph,
We'd like to speed up our post-processing.
Currently reading just the top layer of a 3D output forces loading the whole volume because each record is a single HDF5 chunk. (Related to @pmav99's #219
This patch adds a new SCHOUT switch
nchunk_vrtand uses it innc_writeout3D(scribed I/O) to chunk 3D outputs along the vertical dimension:nchunk_vrt= 0 (default): behaviour, no change for existing runs.nchunk_vrt= N > 0: N layers per chunk (e.g. 1: one layer per chunk, ideal for level-wise reads with xarray/ParaView).