From 8afad2e9b03a17dfa09fb7f077faf89aa63cff90 Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@umd.edu>
Date: Thu, 29 Jun 2023 17:53:06 -0400
Subject: [PATCH 01/23] Add start of Kokkos implementation

---
 kokkos/GridInit.cpp       | 180 ++++++++++++++
 kokkos/Main.cpp           | 120 +++++++++
 kokkos/Makefile           | 113 +++++++++
 kokkos/Materials.cpp      | 118 +++++++++
 kokkos/Simulation.cpp     | 401 +++++++++++++++++++++++++++++++
 kokkos/XSbench_header.hpp | 144 +++++++++++
 kokkos/XSutils.cpp        |  48 ++++
 kokkos/io.cpp             | 494 ++++++++++++++++++++++++++++++++++++++
 8 files changed, 1618 insertions(+)
 create mode 100644 kokkos/GridInit.cpp
 create mode 100644 kokkos/Main.cpp
 create mode 100644 kokkos/Makefile
 create mode 100644 kokkos/Materials.cpp
 create mode 100644 kokkos/Simulation.cpp
 create mode 100644 kokkos/XSbench_header.hpp
 create mode 100644 kokkos/XSutils.cpp
 create mode 100644 kokkos/io.cpp

diff --git a/kokkos/GridInit.cpp b/kokkos/GridInit.cpp
new file mode 100644
index 00000000..79c992f1
--- /dev/null
+++ b/kokkos/GridInit.cpp
@@ -0,0 +1,180 @@
+#include "XSbench_header.h"
+
+SimulationData grid_init_do_not_profile( Inputs in, int mype )
+{
+	// Structure to hold all allocated simuluation data arrays
+	SimulationData SD;
+
+	// Keep track of how much data we're allocating
+	size_t nbytes = 0;
+
+	// Set the initial seed value
+	uint64_t seed = 42;	
+
+	////////////////////////////////////////////////////////////////////
+	// Initialize Nuclide Grids
+	////////////////////////////////////////////////////////////////////
+	
+	if(mype == 0) printf("Intializing nuclide grids...\n");
+
+	// First, we need to initialize our nuclide grid. This comes in the form
+	// of a flattened 2D array that hold all the information we need to define
+	// the cross sections for all isotopes in the simulation. 
+	// The grid is composed of "NuclideGridPoint" structures, which hold the
+	// energy level of the grid point and all associated XS data at that level.
+	// An array of structures (AOS) is used instead of
+	// a structure of arrays, as the grid points themselves are accessed in 
+	// a random order, but all cross section interaction channels and the
+	// energy level are read whenever the gridpoint is accessed, meaning the
+	// AOS is more cache efficient.
+
+        // Josh: need to create unmanaged View of Host memory and then deep copy
+        // it to the device. Layouts need to be the same, figure out left or right
+        
+	// Initialize Nuclide Grid
+	SD.length_nuclide_grid = in.n_isotopes * in.n_gridpoints;
+	SD.nuclide_grid     = (NuclideGridPoint *) malloc( SD.length_nuclide_grid * sizeof(NuclideGridPoint));
+	assert(SD.nuclide_grid != NULL);
+	nbytes += SD.length_nuclide_grid * sizeof(NuclideGridPoint);
+	for( int i = 0; i < SD.length_nuclide_grid; i++ )
+	{
+		SD.nuclide_grid[i].energy        = LCG_random_double(&seed);
+		SD.nuclide_grid[i].total_xs      = LCG_random_double(&seed);
+		SD.nuclide_grid[i].elastic_xs    = LCG_random_double(&seed);
+		SD.nuclide_grid[i].absorbtion_xs = LCG_random_double(&seed);
+		SD.nuclide_grid[i].fission_xs    = LCG_random_double(&seed);
+		SD.nuclide_grid[i].nu_fission_xs = LCG_random_double(&seed);
+	}
+
+	// Sort so that each nuclide has data stored in ascending energy order.
+	for( int i = 0; i < in.n_isotopes; i++ )
+		qsort( &SD.nuclide_grid[i*in.n_gridpoints], in.n_gridpoints, sizeof(NuclideGridPoint), NGP_compare);
+	
+	// error debug check
+	/*
+	for( int i = 0; i < in.n_isotopes; i++ )
+	{
+		printf("NUCLIDE %d ==============================\n", i);
+		for( int j = 0; j < in.n_gridpoints; j++ )
+			printf("E%d = %lf\n", j, SD.nuclide_grid[i * in.n_gridpoints + j].energy);
+	}
+	*/
+	
+
+	////////////////////////////////////////////////////////////////////
+	// Initialize Acceleration Structure
+	////////////////////////////////////////////////////////////////////
+	
+	if( in.grid_type == NUCLIDE )
+	{
+		SD.length_unionized_energy_array = 0;
+		SD.length_index_grid = 0;
+	}
+	
+	if( in.grid_type == UNIONIZED )
+	{
+		if(mype == 0) printf("Intializing unionized grid...\n");
+
+		// Allocate space to hold the union of all nuclide energy data
+		SD.length_unionized_energy_array = in.n_isotopes * in.n_gridpoints;
+		SD.unionized_energy_array = (double *) malloc( SD.length_unionized_energy_array * sizeof(double));
+		assert(SD.unionized_energy_array != NULL );
+		nbytes += SD.length_unionized_energy_array * sizeof(double);
+
+		// Copy energy data over from the nuclide energy grid
+		for( int i = 0; i < SD.length_unionized_energy_array; i++ )
+			SD.unionized_energy_array[i] = SD.nuclide_grid[i].energy;
+
+		// Sort unionized energy array
+		qsort( SD.unionized_energy_array, SD.length_unionized_energy_array, sizeof(double), double_compare);
+
+		// Allocate space to hold the acceleration grid indices
+		SD.length_index_grid = SD.length_unionized_energy_array * in.n_isotopes;
+		SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
+		assert(SD.index_grid != NULL);
+		nbytes += SD.length_index_grid * sizeof(int);
+
+		// Generates the double indexing grid
+		int * idx_low = (int *) calloc( in.n_isotopes, sizeof(int));
+		assert(idx_low != NULL );
+		double * energy_high = (double *) malloc( in.n_isotopes * sizeof(double));
+		assert(energy_high != NULL );
+
+		for( int i = 0; i < in.n_isotopes; i++ )
+			energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + 1].energy;
+
+		for( long e = 0; e < SD.length_unionized_energy_array; e++ )
+		{
+			double unionized_energy = SD.unionized_energy_array[e];
+			for( long i = 0; i < in.n_isotopes; i++ )
+			{
+				if( unionized_energy < energy_high[i]  )
+					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
+				else if( idx_low[i] == in.n_gridpoints - 2 )
+					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
+				else
+				{
+					idx_low[i]++;
+					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
+					energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + idx_low[i] + 1].energy;	
+				}
+			}
+		}
+
+		free(idx_low);
+		free(energy_high);
+	}
+
+	if( in.grid_type == HASH )
+	{
+		if(mype == 0) printf("Intializing hash grid...\n");
+		SD.length_unionized_energy_array = 0;
+		SD.length_index_grid  = in.hash_bins * in.n_isotopes;
+		SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int)); 
+		assert(SD.index_grid != NULL);
+		nbytes += SD.length_index_grid * sizeof(int);
+
+		double du = 1.0 / in.hash_bins;
+
+		// For each energy level in the hash table
+		#pragma omp parallel for
+		for( long e = 0; e < in.hash_bins; e++ )
+		{
+			double energy = e * du;
+
+			// We need to determine the bounding energy levels for all isotopes
+			for( long i = 0; i < in.n_isotopes; i++ )
+			{
+				SD.index_grid[e * in.n_isotopes + i] = grid_search_nuclide( in.n_gridpoints, energy, SD.nuclide_grid + i * in.n_gridpoints, 0, in.n_gridpoints-1);
+			}
+		}
+	}
+
+	////////////////////////////////////////////////////////////////////
+	// Initialize Materials and Concentrations
+	////////////////////////////////////////////////////////////////////
+	if(mype == 0) printf("Intializing material data...\n");
+	
+	// Set the number of nuclides in each material
+	SD.num_nucs  = load_num_nucs(in.n_isotopes);
+	SD.length_num_nucs = 12; // There are always 12 materials in XSBench
+
+	// Intialize the flattened 2D grid of material data. The grid holds
+	// a list of nuclide indices for each of the 12 material types. The
+	// grid is allocated as a full square grid, even though not all
+	// materials have the same number of nuclides.
+	SD.mats = load_mats(SD.num_nucs, in.n_isotopes, &SD.max_num_nucs);
+	SD.length_mats = SD.length_num_nucs * SD.max_num_nucs;
+
+	// Intialize the flattened 2D grid of nuclide concentration data. The grid holds
+	// a list of nuclide concentrations for each of the 12 material types. The
+	// grid is allocated as a full square grid, even though not all
+	// materials have the same number of nuclides.
+	SD.concs = load_concs(SD.num_nucs, SD.max_num_nucs);
+	SD.length_concs = SD.length_mats;
+
+	if(mype == 0) printf("Intialization complete. Allocated %.0lf MB of data.\n", nbytes/1024.0/1024.0 );
+
+	return SD;
+
+}
diff --git a/kokkos/Main.cpp b/kokkos/Main.cpp
new file mode 100644
index 00000000..cb10de9c
--- /dev/null
+++ b/kokkos/Main.cpp
@@ -0,0 +1,120 @@
+#include "XSbench_header.h"
+
+#ifdef MPI
+#include<mpi.h>
+#endif
+
+int main( int argc, char* argv[] )
+{
+	// =====================================================================
+	// Initialization & Command Line Read-In
+	// =====================================================================
+	int version = 20;
+	int mype = 0;
+	double omp_start, omp_end;
+	int nprocs = 1;
+	unsigned long long verification;
+
+	#ifdef MPI
+	MPI_Status stat;
+	MPI_Init(&argc, &argv);
+	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+	MPI_Comm_rank(MPI_COMM_WORLD, &mype);
+	#endif
+
+        // Start Kokkos
+        Kokkos::initialize();
+
+	// Process CLI Fields -- store in "Inputs" structure
+	Inputs in = read_CLI( argc, argv );
+
+	// Set number of OpenMP Threads
+	//omp_set_num_threads(in.nthreads); 
+
+	// Print-out of Input Summary
+	if( mype == 0 )
+		print_inputs( in, nprocs, version );
+
+	// =====================================================================
+	// Prepare Nuclide Energy Grids, Unionized Energy Grid, & Material Data
+	// This is not reflective of a real Monte Carlo simulation workload,
+	// therefore, do not profile this region!
+	// =====================================================================
+	
+	SimulationData SD;
+
+	// If read from file mode is selected, skip initialization and load
+	// all simulation data structures from file instead
+	if( in.binary_mode == READ )
+		SD = binary_read(in);
+	else
+		SD = grid_init_do_not_profile( in, mype );
+
+	// If writing from file mode is selected, write all simulation data
+	// structures to file
+	if( in.binary_mode == WRITE && mype == 0 )
+		binary_write(in, SD);
+
+
+	// =====================================================================
+	// Cross Section (XS) Parallel Lookup Simulation
+	// This is the section that should be profiled, as it reflects a 
+	// realistic continuous energy Monte Carlo macroscopic cross section
+	// lookup kernel.
+	// =====================================================================
+
+	if( mype == 0 )
+	{
+		printf("\n");
+		border_print();
+		center_print("SIMULATION", 79);
+		border_print();
+	}
+
+	// Start Simulation Timer
+	omp_start = omp_get_wtime();
+
+	// Run simulation
+	if( in.simulation_method == EVENT_BASED )
+	{
+		if( in.kernel_id == 0 )
+			verification = run_event_based_simulation(in, SD, mype);
+		else
+		{
+			printf("Error: No kernel ID %d found!\n", in.kernel_id);
+			exit(1);
+		}
+	}
+	else
+	{
+		printf("History-based simulation not implemented in OpenMP offload code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
+		exit(1);
+	}
+
+	if( mype == 0)	
+	{	
+		printf("\n" );
+		printf("Simulation complete.\n" );
+	}
+
+	// End Simulation Timer
+	omp_end = omp_get_wtime();
+
+	// =====================================================================
+	// Output Results & Finalize
+	// =====================================================================
+
+	// Final Hash Step
+	verification = verification % 999983;
+
+	// Print / Save Results and Exit
+	int is_invalid_result = print_results( in, mype, omp_end-omp_start, nprocs, verification );
+
+        Kokkos::finalize();
+        
+	#ifdef MPI
+	MPI_Finalize();
+	#endif
+
+	return is_invalid_result;
+}
diff --git a/kokkos/Makefile b/kokkos/Makefile
new file mode 100644
index 00000000..5fbce917
--- /dev/null
+++ b/kokkos/Makefile
@@ -0,0 +1,113 @@
+#===============================================================================
+# User Options
+#===============================================================================
+
+COMPILER    = llvm
+OPTIMIZE    = yes
+DEBUG       = no
+PROFILE     = no
+MPI         = no
+
+#===============================================================================
+# Program name & source code list
+#===============================================================================
+
+program = XSBench
+
+source = \
+Main.cpp \
+io.cpp \
+Simulation.cpp \
+GridInit.cpp \
+XSutils.cpp \
+Materials.cpp
+
+obj = $(source:.cpp=.o)
+
+#===============================================================================
+# Sets Flags
+#===============================================================================
+
+# Standard Flags
+CFLAGS := -std=gnu99 -Wall
+
+# Linker Flags
+LDFLAGS = -lm
+
+# Regular gcc Compiler
+ifeq ($(COMPILER),gnu)
+  CC = gcc
+  CFLAGS += -fopenmp -flto
+endif
+
+# Intel Compiler
+ifeq ($(COMPILER),intel)
+  CC = icx 
+  CFLAGS += -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ 
+endif
+
+# LLVM Compiler Targeting A100 -- Change SM Level to Target Other GPUs
+ifeq ($(COMPILER),llvm)
+  CC = clang
+  CFLAGS += -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_80
+endif
+
+# IBM XL Compiler
+ifeq ($(COMPILER),ibm)
+  CC = xlc_r
+  CFLAGS += -qsmp=omp -qoffload
+endif
+
+# NVIDIA Compiler Targeting A100 -- Change SM Level to Target Other GPUs
+ifeq ($(COMPILER),nvidia)
+  CC = nvc
+  CFLAGS += -mp=gpu -Minfo=mp -gpu=cc80
+endif
+
+# AOMP Targeting MI100 -- Change march to Target Other GPUs
+ifeq ($(COMPILER),amd)
+  CC = clang
+  CFLAGS += -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
+endif
+
+# Debug Flags
+ifeq ($(DEBUG),yes)
+  CFLAGS += -g
+  LDFLAGS  += -g
+endif
+
+# Profiling Flags
+ifeq ($(PROFILE),yes)
+  CFLAGS += -pg
+  LDFLAGS  += -pg
+endif
+
+# Optimization Flags
+ifeq ($(OPTIMIZE),yes)
+  CFLAGS += -O3
+endif
+
+# MPI
+ifeq ($(MPI),yes)
+  CC = mpicc
+  CFLAGS += -DMPI
+endif
+
+#===============================================================================
+# Targets to Build
+#===============================================================================
+
+$(program): $(obj) XSbench_header.hpp Makefile
+	$(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
+
+%.o: %.cpp XSbench_header.hpp Makefile
+	$(CC) $(CFLAGS) -c $< -o $@
+
+clean:
+	rm -rf $(program) $(obj)
+
+edit:
+	vim -p $(source) XSbench_header.hpp
+
+run:
+	./$(program)
diff --git a/kokkos/Materials.cpp b/kokkos/Materials.cpp
new file mode 100644
index 00000000..69f1c7f7
--- /dev/null
+++ b/kokkos/Materials.cpp
@@ -0,0 +1,118 @@
+// Material data is hard coded into the functions in this file.
+// Note that there are 12 materials present in H-M (large or small)
+
+#include "XSbench_header.h"
+
+// num_nucs represents the number of nuclides that each material contains
+int * load_num_nucs(long n_isotopes)
+{
+	int * num_nucs = (int*)malloc(12*sizeof(int));
+	
+	// Material 0 is a special case (fuel). The H-M small reactor uses
+	// 34 nuclides, while H-M larges uses 300.
+	if( n_isotopes == 68 )
+		num_nucs[0]  = 34; // HM Small is 34, H-M Large is 321
+	else
+		num_nucs[0]  = 321; // HM Small is 34, H-M Large is 321
+
+	num_nucs[1]  = 5;
+	num_nucs[2]  = 4;
+	num_nucs[3]  = 4;
+	num_nucs[4]  = 27;
+	num_nucs[5]  = 21;
+	num_nucs[6]  = 21;
+	num_nucs[7]  = 21;
+	num_nucs[8]  = 21;
+	num_nucs[9]  = 21;
+	num_nucs[10] = 9;
+	num_nucs[11] = 9;
+
+	return num_nucs;
+}
+
+// Assigns an array of nuclide ID's to each material
+int * load_mats( int * num_nucs, long n_isotopes, int * max_num_nucs )
+{
+	*max_num_nucs = 0;
+	int num_mats = 12;
+	for( int m = 0; m < num_mats; m++ )
+	{
+		if( num_nucs[m] > *max_num_nucs )
+			*max_num_nucs = num_nucs[m];
+	}
+	int * mats = (int *) malloc( num_mats * (*max_num_nucs) * sizeof(int) );
+
+	// Small H-M has 34 fuel nuclides
+	int mats0_Sml[] =  { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
+	                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53, 
+	                 54, 55, 56, 18, 23, 41 }; //fuel
+	// Large H-M has 300 fuel nuclides
+	int mats0_Lrg[321] =  { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
+	                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53,
+	                 54, 55, 56, 18, 23, 41 }; //fuel
+	for( int i = 0; i < 321-34; i++ )
+		mats0_Lrg[34+i] = 68 + i; // H-M large adds nuclides to fuel only
+	
+	// These are the non-fuel materials	
+	int mats1[] =  { 63, 64, 65, 66, 67 }; // cladding
+	int mats2[] =  { 24, 41, 4, 5 }; // cold borated water
+	int mats3[] =  { 24, 41, 4, 5 }; // hot borated water
+	int mats4[] =  { 19, 20, 21, 22, 35, 36, 37, 38, 39, 25, 27, 28, 29,
+	                 30, 31, 32, 26, 49, 50, 51, 11, 12, 13, 14, 6, 16,
+	                 17 }; // RPV
+	int mats5[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // lower radial reflector
+	int mats6[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // top reflector / plate
+	int mats7[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // bottom plate
+	int mats8[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // bottom nozzle
+	int mats9[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // top nozzle
+	int mats10[] = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // top of FA's
+	int mats11[] = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // bottom FA's
+	
+	// H-M large v small dependency
+	if( n_isotopes == 68 )
+		memcpy( mats,  mats0_Sml,  num_nucs[0]  * sizeof(int) );	
+	else
+		memcpy( mats,  mats0_Lrg,  num_nucs[0]  * sizeof(int) );
+	
+	// Copy other materials
+	memcpy( mats + *max_num_nucs * 1,  mats1,  num_nucs[1]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 2,  mats2,  num_nucs[2]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 3,  mats3,  num_nucs[3]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 4,  mats4,  num_nucs[4]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 5,  mats5,  num_nucs[5]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 6,  mats6,  num_nucs[6]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 7,  mats7,  num_nucs[7]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 8,  mats8,  num_nucs[8]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 9,  mats9,  num_nucs[9]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 10, mats10, num_nucs[10] * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 11, mats11, num_nucs[11] * sizeof(int) );	
+
+
+	return mats;
+}
+
+// Randomizes the concentrations of all nuclides in a variety of materials
+double * load_concs( int * num_nucs, int max_num_nucs )
+{
+	uint64_t seed = STARTING_SEED * STARTING_SEED;
+	double * concs = (double *) malloc( 12 * max_num_nucs * sizeof( double ) );
+	
+	for( int i = 0; i < 12; i++ )
+		for( int j = 0; j < num_nucs[i]; j++ )
+			concs[i * max_num_nucs + j] = LCG_random_double(&seed);
+
+	// test
+	/*
+	for( int i = 0; i < 12; i++ )
+		for( int j = 0; j < num_nucs[i]; j++ )
+			printf("concs[%d][%d] = %lf\n", i, j, concs[i][j] );
+	*/
+
+	return concs;
+}
+
diff --git a/kokkos/Simulation.cpp b/kokkos/Simulation.cpp
new file mode 100644
index 00000000..b38053bc
--- /dev/null
+++ b/kokkos/Simulation.cpp
@@ -0,0 +1,401 @@
+#include "XSbench_header.h"
+
+////////////////////////////////////////////////////////////////////////////////////
+// BASELINE FUNCTIONS
+////////////////////////////////////////////////////////////////////////////////////
+// All "baseline" code is at the top of this file. The baseline code is a simple
+// implementation of the algorithm, with only minor CPU optimizations in place.
+// Following these functions are a number of optimized variants,
+// which each deploy a different combination of optimizations strategies. By
+// default, XSBench will only run the baseline implementation. Optimized variants
+// are not yet implemented for this OpenMP targeting offload port.
+////////////////////////////////////////////////////////////////////////////////////
+
+unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int mype)
+{
+	if( mype == 0)	
+		printf("Beginning event based simulation...\n");
+	
+	////////////////////////////////////////////////////////////////////////////////
+	// SUMMARY: Simulation Data Structure Manifest for "SD" Object
+	// Here we list all heap arrays (and lengths) in SD that would need to be
+	// offloaded manually if using an accelerator with a seperate memory space
+	////////////////////////////////////////////////////////////////////////////////
+	// int * num_nucs;                     // Length = length_num_nucs;
+	// double * concs;                     // Length = length_concs
+	// int * mats;                         // Length = length_mats
+	// double * unionized_energy_array;    // Length = length_unionized_energy_array
+	// int * index_grid;                   // Length = length_index_grid
+	// NuclideGridPoint * nuclide_grid;    // Length = length_nuclide_grid
+	// 
+	// Note: "unionized_energy_array" and "index_grid" can be of zero length
+	//        depending on lookup method.
+	//
+	// Note: "Lengths" are given as the number of objects in the array, not the
+	//       number of bytes.
+	////////////////////////////////////////////////////////////////////////////////
+
+
+	////////////////////////////////////////////////////////////////////////////////
+	// Begin Actual Simulation Loop 
+	////////////////////////////////////////////////////////////////////////////////
+	unsigned long long * verification = (unsigned long long *) malloc(in.lookups * sizeof(unsigned long long));
+
+	#pragma omp target teams distribute parallel for\
+	map(to: SD.max_num_nucs)\
+	map(to: SD.num_nucs[:SD.length_num_nucs])\
+	map(to: SD.concs[:SD.length_concs])\
+	map(to: SD.mats[:SD.length_mats])\
+	map(to: SD.unionized_energy_array[:SD.length_unionized_energy_array])\
+	map(to: SD.index_grid[:SD.length_index_grid])\
+	map(to: SD.nuclide_grid[:SD.length_nuclide_grid])\
+  map(from: verification[:in.lookups])
+	for( int i = 0; i < in.lookups; i++ )
+	{
+		// Set the initial seed value
+		uint64_t seed = STARTING_SEED;	
+
+		// Forward seed to lookup index (we need 2 samples per lookup)
+		seed = fast_forward_LCG(seed, 2*i);
+
+		// Randomly pick an energy and material for the particle
+		double p_energy = LCG_random_double(&seed);
+		int mat         = pick_mat(&seed); 
+
+		// debugging
+		//printf("E = %lf mat = %d\n", p_energy, mat);
+
+		double macro_xs_vector[5] = {0};
+		
+		// Perform macroscopic Cross Section Lookup
+		calculate_macro_xs(
+				p_energy,        // Sampled neutron energy (in lethargy)
+				mat,             // Sampled material type index neutron is in
+				in.n_isotopes,   // Total number of isotopes in simulation
+				in.n_gridpoints, // Number of gridpoints per isotope in simulation
+				SD.num_nucs,     // 1-D array with number of nuclides per material
+				SD.concs,        // Flattened 2-D array with concentration of each nuclide in each material
+				SD.unionized_energy_array, // 1-D Unionized energy array
+				SD.index_grid,   // Flattened 2-D grid holding indices into nuclide grid for each unionized energy level
+				SD.nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
+				SD.mats,         // Flattened 2-D array with nuclide indices defining composition of each type of material
+				macro_xs_vector, // 1-D array with result of the macroscopic cross section (5 different reaction channels)
+				in.grid_type,    // Lookup type (nuclide, hash, or unionized)
+				in.hash_bins,    // Number of hash bins used (if using hash lookup type)
+				SD.max_num_nucs  // Maximum number of nuclides present in any material
+				);
+
+		// For verification, and to prevent the compiler from optimizing
+		// all work out, we interrogate the returned macro_xs_vector array
+		// to find its maximum value index, then increment the verification
+		// value by that index. In this implementation, we prevent thread
+		// contention by using an OMP reduction on the verification value.
+		// For accelerators, a different approach might be required
+		// (e.g., atomics, reduction of thread-specific values in large
+		// array via CUDA thrust, etc).
+		double max = -1.0;
+		int max_idx = 0;
+		for(int j = 0; j < 5; j++ )
+		{
+			if( macro_xs_vector[j] > max )
+			{
+				max = macro_xs_vector[j];
+				max_idx = j;
+			}
+		}
+		verification[i] = max_idx+1;
+	}
+
+  // Reduce validation hash on the host
+  unsigned long long validation_hash = 0;
+	for( int i = 0; i < in.lookups; i++ )
+    validation_hash += verification[i];
+
+	return validation_hash;
+}
+
+// Calculates the microscopic cross section for a given nuclide & energy
+void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
+                           long n_gridpoints,
+                           double *  egrid, int *  index_data,
+                           NuclideGridPoint *  nuclide_grids,
+                           long idx, double *  xs_vector, int grid_type, int hash_bins ){
+	// Variables
+	double f;
+	NuclideGridPoint * low, * high;
+
+	// If using only the nuclide grid, we must perform a binary search
+	// to find the energy location in this particular nuclide's grid.
+	if( grid_type == NUCLIDE )
+	{
+		// Perform binary search on the Nuclide Grid to find the index
+		idx = grid_search_nuclide( n_gridpoints, p_energy, &nuclide_grids[nuc*n_gridpoints], 0, n_gridpoints-1);
+
+		// pull ptr from nuclide grid and check to ensure that
+		// we're not reading off the end of the nuclide's grid
+		if( idx == n_gridpoints - 1 )
+			low = &nuclide_grids[nuc*n_gridpoints + idx - 1];
+		else
+			low = &nuclide_grids[nuc*n_gridpoints + idx];
+	}
+	else if( grid_type == UNIONIZED) // Unionized Energy Grid - we already know the index, no binary search needed.
+	{
+		// pull ptr from energy grid and check to ensure that
+		// we're not reading off the end of the nuclide's grid
+		if( index_data[idx * n_isotopes + nuc] == n_gridpoints - 1 )
+			low = &nuclide_grids[nuc*n_gridpoints + index_data[idx * n_isotopes + nuc] - 1];
+		else
+			low = &nuclide_grids[nuc*n_gridpoints + index_data[idx * n_isotopes + nuc]];
+	}
+	else // Hash grid
+	{
+		// load lower bounding index
+		int u_low = index_data[idx * n_isotopes + nuc];
+
+		// Determine higher bounding index
+		int u_high;
+		if( idx == hash_bins - 1 )
+			u_high = n_gridpoints - 1;
+		else
+			u_high = index_data[(idx+1)*n_isotopes + nuc] + 1;
+
+		// Check edge cases to make sure energy is actually between these
+		// Then, if things look good, search for gridpoint in the nuclide grid
+		// within the lower and higher limits we've calculated.
+		double e_low  = nuclide_grids[nuc*n_gridpoints + u_low].energy;
+		double e_high = nuclide_grids[nuc*n_gridpoints + u_high].energy;
+		int lower;
+		if( p_energy <= e_low )
+			lower = 0;
+		else if( p_energy >= e_high )
+			lower = n_gridpoints - 1;
+		else
+			lower = grid_search_nuclide( n_gridpoints, p_energy, &nuclide_grids[nuc*n_gridpoints], u_low, u_high);
+
+		if( lower == n_gridpoints - 1 )
+			low = &nuclide_grids[nuc*n_gridpoints + lower - 1];
+		else
+			low = &nuclide_grids[nuc*n_gridpoints + lower];
+	}
+	
+	high = low + 1;
+	
+	// calculate the re-useable interpolation factor
+	f = (high->energy - p_energy) / (high->energy - low->energy);
+
+	// Total XS
+	xs_vector[0] = high->total_xs - f * (high->total_xs - low->total_xs);
+	
+	// Elastic XS
+	xs_vector[1] = high->elastic_xs - f * (high->elastic_xs - low->elastic_xs);
+	
+	// Absorbtion XS
+	xs_vector[2] = high->absorbtion_xs - f * (high->absorbtion_xs - low->absorbtion_xs);
+	
+	// Fission XS
+	xs_vector[3] = high->fission_xs - f * (high->fission_xs - low->fission_xs);
+	
+	// Nu Fission XS
+	xs_vector[4] = high->nu_fission_xs - f * (high->nu_fission_xs - low->nu_fission_xs);
+	
+	//test
+	/*
+	if( omp_get_thread_num() == 0 )
+	{
+		printf("Lookup: Energy = %lf, nuc = %d\n", p_energy, nuc);
+		printf("e_h = %lf e_l = %lf\n", high->energy , low->energy);
+		printf("xs_h = %lf xs_l = %lf\n", high->elastic_xs, low->elastic_xs);
+		printf("total_xs = %lf\n\n", xs_vector[1]);
+	}
+	*/
+	
+}
+
+// Calculates macroscopic cross section based on a given material & energy 
+void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
+                         long n_gridpoints, int *  num_nucs,
+                         double *  concs,
+                         double *  egrid, int *  index_data,
+                         NuclideGridPoint *  nuclide_grids,
+                         int *  mats,
+                         double *  macro_xs_vector, int grid_type, int hash_bins, int max_num_nucs ){
+	int p_nuc; // the nuclide we are looking up
+	long idx = -1;	
+	double conc; // the concentration of the nuclide in the material
+
+	// cleans out macro_xs_vector
+	for( int k = 0; k < 5; k++ )
+		macro_xs_vector[k] = 0;
+
+	// If we are using the unionized energy grid (UEG), we only
+	// need to perform 1 binary search per macroscopic lookup.
+	// If we are using the nuclide grid search, it will have to be
+	// done inside of the "calculate_micro_xs" function for each different
+	// nuclide in the material.
+	if( grid_type == UNIONIZED )
+		idx = grid_search( n_isotopes * n_gridpoints, p_energy, egrid);	
+	else if( grid_type == HASH )
+	{
+		double du = 1.0 / hash_bins;
+		idx = p_energy / du;
+	}
+	
+	// Once we find the pointer array on the UEG, we can pull the data
+	// from the respective nuclide grids, as well as the nuclide
+	// concentration data for the material
+	// Each nuclide from the material needs to have its micro-XS array
+	// looked up & interpolatied (via calculate_micro_xs). Then, the
+	// micro XS is multiplied by the concentration of that nuclide
+	// in the material, and added to the total macro XS array.
+	// (Independent -- though if parallelizing, must use atomic operations
+	//  or otherwise control access to the xs_vector and macro_xs_vector to
+	//  avoid simulataneous writing to the same data structure)
+	for( int j = 0; j < num_nucs[mat]; j++ )
+	{
+		double xs_vector[5];
+		p_nuc = mats[mat*max_num_nucs + j];
+		conc = concs[mat*max_num_nucs + j];
+		calculate_micro_xs( p_energy, p_nuc, n_isotopes,
+		                    n_gridpoints, egrid, index_data,
+		                    nuclide_grids, idx, xs_vector, grid_type, hash_bins );
+		for( int k = 0; k < 5; k++ )
+			macro_xs_vector[k] += xs_vector[k] * conc;
+	}
+	
+	//test
+	/*
+	for( int k = 0; k < 5; k++ )
+		printf("Energy: %lf, Material: %d, XSVector[%d]: %lf\n",
+		       p_energy, mat, k, macro_xs_vector[k]);
+			   */
+}
+
+
+// binary search for energy on unionized energy grid
+// returns lower index
+long grid_search( long n, double quarry, double *  A)
+{
+	long lowerLimit = 0;
+	long upperLimit = n-1;
+	long examinationPoint;
+	long length = upperLimit - lowerLimit;
+
+	while( length > 1 )
+	{
+		examinationPoint = lowerLimit + ( length / 2 );
+		
+		if( A[examinationPoint] > quarry )
+			upperLimit = examinationPoint;
+		else
+			lowerLimit = examinationPoint;
+		
+		length = upperLimit - lowerLimit;
+	}
+	
+	return lowerLimit;
+}
+
+// binary search for energy on nuclide energy grid
+long grid_search_nuclide( long n, double quarry, NuclideGridPoint * A, long low, long high)
+{
+	long lowerLimit = low;
+	long upperLimit = high;
+	long examinationPoint;
+	long length = upperLimit - lowerLimit;
+
+	while( length > 1 )
+	{
+		examinationPoint = lowerLimit + ( length / 2 );
+		
+		if( A[examinationPoint].energy > quarry )
+			upperLimit = examinationPoint;
+		else
+			lowerLimit = examinationPoint;
+		
+		length = upperLimit - lowerLimit;
+	}
+	
+	return lowerLimit;
+}
+
+// picks a material based on a probabilistic distribution
+int pick_mat( uint64_t * seed )
+{
+	// I have a nice spreadsheet supporting these numbers. They are
+	// the fractions (by volume) of material in the core. Not a 
+	// *perfect* approximation of where XS lookups are going to occur,
+	// but this will do a good job of biasing the system nonetheless.
+
+	// Also could be argued that doing fractions by weight would be 
+	// a better approximation, but volume does a good enough job for now.
+
+	double dist[12];
+	dist[0]  = 0.140;	// fuel
+	dist[1]  = 0.052;	// cladding
+	dist[2]  = 0.275;	// cold, borated water
+	dist[3]  = 0.134;	// hot, borated water
+	dist[4]  = 0.154;	// RPV
+	dist[5]  = 0.064;	// Lower, radial reflector
+	dist[6]  = 0.066;	// Upper reflector / top plate
+	dist[7]  = 0.055;	// bottom plate
+	dist[8]  = 0.008;	// bottom nozzle
+	dist[9]  = 0.015;	// top nozzle
+	dist[10] = 0.025;	// top of fuel assemblies
+	dist[11] = 0.013;	// bottom of fuel assemblies
+	
+	double roll = LCG_random_double(seed);
+
+	// makes a pick based on the distro
+	for( int i = 0; i < 12; i++ )
+	{
+		double running = 0;
+		for( int j = i; j > 0; j-- )
+			running += dist[j];
+		if( roll < running )
+			return i;
+	}
+
+	return 0;
+}
+
+double LCG_random_double(uint64_t * seed)
+{
+	// LCG parameters
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	const uint64_t a = 2806196910506780709ULL;
+	const uint64_t c = 1ULL;
+	*seed = (a * (*seed) + c) % m;
+	return (double) (*seed) / (double) m;
+	//return ldexp(*seed, -63);
+
+}	
+
+uint64_t fast_forward_LCG(uint64_t seed, uint64_t n)
+{
+	// LCG parameters
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	uint64_t a = 2806196910506780709ULL;
+	uint64_t c = 1ULL;
+
+	n = n % m;
+
+	uint64_t a_new = 1;
+	uint64_t c_new = 0;
+
+	while(n > 0) 
+	{
+		if(n & 1)
+		{
+			a_new *= a;
+			c_new = c_new * a + c;
+		}
+		c *= (a + 1);
+		a *= a;
+
+		n >>= 1;
+	}
+
+	return (a_new * seed + c_new) % m;
+
+}
+
diff --git a/kokkos/XSbench_header.hpp b/kokkos/XSbench_header.hpp
new file mode 100644
index 00000000..db5e9c01
--- /dev/null
+++ b/kokkos/XSbench_header.hpp
@@ -0,0 +1,144 @@
+// -*- c-basic-offset: 8; tab-width: 8; indent-tabs-mode: t; -*-
+#ifndef __XSBENCH_HEADER_H__
+#define __XSBENCH_HEADER_H__
+
+#include<stdio.h>
+#include<stdlib.h>
+#include<time.h>
+#include<string.h>
+#include<strings.h>
+#include<math.h>
+#include<omp.h>
+#include<unistd.h>
+#include<sys/time.h>
+#include<assert.h>
+#include<stdint.h>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_View.hpp>
+#include <Kokkos_Sort.hpp>
+
+// Papi Header
+#ifdef PAPI
+#include "papi.h"
+#endif
+
+// Grid types
+#define UNIONIZED 0
+#define NUCLIDE 1
+#define HASH 2
+
+// Simulation types
+#define HISTORY_BASED 1
+#define EVENT_BASED 2
+
+// Binary Mode Type
+#define NONE 0
+#define READ 1
+#define WRITE 2
+
+// Starting Seed
+#define STARTING_SEED 1070
+
+// Structures
+typedef struct{
+	double energy;
+	double total_xs;
+	double elastic_xs;
+	double absorbtion_xs;
+	double fission_xs;
+	double nu_fission_xs;
+} NuclideGridPoint;
+
+typedef struct{
+	int nthreads;
+	long n_isotopes;
+	long n_gridpoints;
+	int lookups;
+	char * HM;
+	int grid_type; // 0: Unionized Grid (default)    1: Nuclide Grid
+	int hash_bins;
+	int particles;
+	int simulation_method;
+	int binary_mode;
+	int kernel_id;
+} Inputs;
+
+typedef Kokkos::View<int*> IntView;
+typedef Kokkos::View<double*> DoubleView;
+typedef Kokkos::View<NuclideGridPoint*> PointView;
+
+typedef struct{
+	IntView* d_num_nucs;				// Length = length_num_nucs;
+	DoubleView* d_concs;				// Length = length_concs
+	IntView* d_mats;				// Length = length_mats
+	DoubleView* d_unionized_energy_array;		// Length = length_unionized_energy_array
+	IntView* d_index_grid;				// Length = length_index_grid
+	PointView* d_nuclide_grid;			// Length = length_nuclide_grid
+	int * num_nucs;
+	double * concs;
+	int * mats;
+	double * unionized_energy_array;
+	int * index_grid;
+	NuclideGridPoint * nuclide_grid;
+	int length_num_nucs;
+	int length_concs;
+	int length_mats;
+	int length_unionized_energy_array;
+	long length_index_grid;
+	int length_nuclide_grid;
+	int max_num_nucs;
+	double * p_energy_samples;
+	int length_p_energy_samples;
+	int * mat_samples;
+	int length_mat_samples;
+} SimulationData;
+
+// io.c
+void logo(int version);
+void center_print(const char *s, int width);
+void border_print(void);
+void fancy_int(long a);
+Inputs read_CLI( int argc, char * argv[] );
+void print_CLI_error(void);
+void print_inputs(Inputs in, int nprocs, int version);
+int print_results( Inputs in, int mype, double runtime, int nprocs, unsigned long long vhash );
+void binary_write( Inputs in, SimulationData SD );
+SimulationData binary_read( Inputs in );
+
+// Simulation.c
+unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int mype);
+unsigned long long run_history_based_simulation(Inputs in, SimulationData SD, int mype);
+void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
+                           long n_gridpoints,
+                           double *  egrid, int *  index_data,
+                           NuclideGridPoint *  nuclide_grids,
+                           long idx, double *  xs_vector, int grid_type, int hash_bins );
+void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
+                         long n_gridpoints, int *  num_nucs,
+                         double *  concs,
+                         double *  egrid, int *  index_data,
+                         NuclideGridPoint *  nuclide_grids,
+                         int *  mats,
+                         double *  macro_xs_vector, int grid_type, int hash_bins, int max_num_nucs );
+long grid_search( long n, double quarry, double *  A);
+long grid_search_nuclide( long n, double quarry, NuclideGridPoint * A, long low, long high);
+int pick_mat( uint64_t * seed );
+double LCG_random_double(uint64_t * seed);
+uint64_t fast_forward_LCG(uint64_t seed, uint64_t n);
+unsigned long long run_event_based_simulation_optimization_1(Inputs in, SimulationData SD, int mype);
+
+// GridInit.c
+SimulationData grid_init_do_not_profile( Inputs in, int mype );
+
+// XSutils.c
+int NGP_compare( const void * a, const void * b );
+int double_compare(const void * a, const void * b);
+size_t estimate_mem_usage( Inputs in );
+
+// Materials.c
+int * load_num_nucs(long n_isotopes);
+int * load_mats( int * num_nucs, long n_isotopes, int * max_num_nucs );
+double * load_concs( int * num_nucs, int max_num_nucs );
+#endif
diff --git a/kokkos/XSutils.cpp b/kokkos/XSutils.cpp
new file mode 100644
index 00000000..d433ecdc
--- /dev/null
+++ b/kokkos/XSutils.cpp
@@ -0,0 +1,48 @@
+#include "XSbench_header.h"
+
+int double_compare(const void * a, const void * b)
+{
+	double A = *((double *) a);
+	double B = *((double *) b);
+
+	if( A > B )
+		return 1;
+	else if( A < B )
+		return -1;
+	else
+		return 0;
+}
+
+int NGP_compare(const void * a, const void * b)
+{
+	NuclideGridPoint A = *((NuclideGridPoint *) a);
+	NuclideGridPoint B = *((NuclideGridPoint *) b);
+
+	if( A.energy > B.energy )
+		return 1;
+	else if( A.energy < B.energy )
+		return -1;
+	else
+		return 0;
+}
+
+
+size_t estimate_mem_usage( Inputs in )
+{
+	size_t single_nuclide_grid = in.n_gridpoints * sizeof( NuclideGridPoint );
+	size_t all_nuclide_grids   = in.n_isotopes * single_nuclide_grid;
+	size_t size_UEG            = in.n_isotopes*in.n_gridpoints*sizeof(double) + in.n_isotopes*in.n_gridpoints*in.n_isotopes*sizeof(int);
+	size_t size_hash_grid      = in.hash_bins * in.n_isotopes * sizeof(int);
+	size_t memtotal;
+
+	if( in.grid_type == UNIONIZED )
+		memtotal          = all_nuclide_grids + size_UEG;
+	else if( in.grid_type == NUCLIDE )
+		memtotal          = all_nuclide_grids;
+	else
+		memtotal          = all_nuclide_grids + size_hash_grid;
+
+	memtotal          = ceil(memtotal / (1024.0*1024.0));
+	return memtotal;
+}
+
diff --git a/kokkos/io.cpp b/kokkos/io.cpp
new file mode 100644
index 00000000..17b6d32c
--- /dev/null
+++ b/kokkos/io.cpp
@@ -0,0 +1,494 @@
+#include "XSbench_header.h"
+
+#ifdef MPI
+#include<mpi.h>
+#endif
+
+// Prints program logo
+void logo(int version)
+{
+	border_print();
+	printf(
+	"                   __   __ ___________                 _                        \n"
+	"                   \\ \\ / //  ___| ___ \\               | |                       \n"
+	"                    \\ V / \\ `--.| |_/ / ___ _ __   ___| |__                     \n"
+	"                    /   \\  `--. \\ ___ \\/ _ \\ '_ \\ / __| '_ \\                    \n"
+	"                   / /^\\ \\/\\__/ / |_/ /  __/ | | | (__| | | |                   \n"
+	"                   \\/   \\/\\____/\\____/ \\___|_| |_|\\___|_| |_|                   \n\n"
+    );
+	border_print();
+	center_print("Developed at Argonne National Laboratory", 79);
+	char v[100];
+	sprintf(v, "Version: %d", version);
+	center_print(v, 79);
+	border_print();
+}
+
+// Prints Section titles in center of 80 char terminal
+void center_print(const char *s, int width)
+{
+	int length = strlen(s);
+	int i;
+	for (i=0; i<=(width-length)/2; i++) {
+		fputs(" ", stdout);
+	}
+	fputs(s, stdout);
+	fputs("\n", stdout);
+}
+
+int print_results( Inputs in, int mype, double runtime, int nprocs,
+	unsigned long long vhash )
+{
+	// Calculate Lookups per sec
+	int lookups = 0;
+	if( in.simulation_method == HISTORY_BASED )
+		lookups = in.lookups * in.particles;
+	else if( in.simulation_method == EVENT_BASED )
+		lookups = in.lookups;
+	int lookups_per_sec = (int) ((double) lookups / runtime);
+	
+	// If running in MPI, reduce timing statistics and calculate average
+	#ifdef MPI
+	int total_lookups = 0;
+	MPI_Barrier(MPI_COMM_WORLD);
+	MPI_Reduce(&lookups_per_sec, &total_lookups, 1, MPI_INT,
+	           MPI_SUM, 0, MPI_COMM_WORLD);
+	#endif
+
+	int is_invalid_result = 1;
+	
+	// Print output
+	if( mype == 0 )
+	{
+		border_print();
+		center_print("RESULTS", 79);
+		border_print();
+
+		// Print the results
+    printf("NOTE: Timings are estimated -- use nvprof/nsys/iprof/rocprof for formal analysis\n");
+		#ifdef MPI
+		printf("MPI ranks:   %d\n", nprocs);
+		#endif
+		#ifdef MPI
+		printf("Total Lookups/s:            ");
+		fancy_int(total_lookups);
+		printf("Avg Lookups/s per MPI rank: ");
+		fancy_int(total_lookups / nprocs);
+		#else
+		printf("Runtime:     %.3lf seconds\n", runtime);
+		printf("Lookups:     "); fancy_int(lookups);
+		printf("Lookups/s:   ");
+		fancy_int(lookups_per_sec);
+		#endif
+	}
+
+	unsigned long long large = 0;
+	unsigned long long small = 0; 
+	if( in.simulation_method == EVENT_BASED )
+	{
+		small = 945990;
+		large = 952131;
+	}
+	else if( in.simulation_method == HISTORY_BASED )
+	{
+		small = 941535;
+		large = 954318; 
+	}
+	if( strcmp(in.HM, "large") == 0 )
+	{
+		if( vhash == large )
+			is_invalid_result = 0;
+	}
+	else if( strcmp(in.HM, "small") == 0 )
+	{
+		if( vhash == small )
+			is_invalid_result = 0;
+	}
+
+	if(mype == 0 )
+	{
+		if( is_invalid_result )
+			printf("Verification checksum: %llu (WARNING - INAVALID CHECKSUM!)\n", vhash);
+		else
+			printf("Verification checksum: %llu (Valid)\n", vhash);
+		border_print();
+	}
+
+	return is_invalid_result;
+}
+
+void print_inputs(Inputs in, int nprocs, int version )
+{
+	// Calculate Estimate of Memory Usage
+	int mem_tot = estimate_mem_usage( in );
+	logo(version);
+	center_print("INPUT SUMMARY", 79);
+	border_print();
+	printf("Programming Model:            Kokkos\n");
+	if( in.simulation_method == EVENT_BASED )
+		printf("Simulation Method:            Event Based\n");
+	else
+		printf("Simulation Method:            History Based\n");
+	if( in.grid_type == NUCLIDE )
+		printf("Grid Type:                    Nuclide Grid\n");
+	else if( in.grid_type == UNIONIZED )
+		printf("Grid Type:                    Unionized Grid\n");
+	else
+		printf("Grid Type:                    Hash\n");
+
+	printf("Materials:                    %d\n", 12);
+	printf("H-M Benchmark Size:           %s\n", in.HM);
+	printf("Total Nuclides:               %ld\n", in.n_isotopes);
+	printf("Gridpoints (per Nuclide):     ");
+	fancy_int(in.n_gridpoints);
+	if( in.grid_type == HASH )
+	{
+		printf("Hash Bins:                    ");
+		fancy_int(in.hash_bins);
+	}
+	if( in.grid_type == UNIONIZED )
+	{
+		printf("Unionized Energy Gridpoints:  ");
+		fancy_int(in.n_isotopes*in.n_gridpoints);
+	}
+	if( in.simulation_method == HISTORY_BASED )
+	{
+		printf("Particle Histories:           "); fancy_int(in.particles);
+		printf("XS Lookups per Particle:      "); fancy_int(in.lookups);
+	}
+	printf("Total XS Lookups:             "); fancy_int(in.lookups);
+	#ifdef MPI
+	printf("MPI Ranks:                    %d\n", nprocs);
+	printf("Mem Usage per MPI Rank (MB):  "); fancy_int(mem_tot);
+	#else
+	printf("Est. Memory Usage (MB):       "); fancy_int(mem_tot);
+	#endif
+	printf("Binary File Mode:             ");
+	if( in.binary_mode == NONE )
+		printf("Off\n");
+	else if( in.binary_mode == READ)
+		printf("Read\n");
+	else
+		printf("Write\n");
+	border_print();
+	center_print("INITIALIZATION - DO NOT PROFILE", 79);
+	border_print();
+}
+
+void border_print(void)
+{
+	printf(
+	"==================================================================="
+	"=============\n");
+}
+
+// Prints comma separated integers - for ease of reading
+void fancy_int( long a )
+{
+    if( a < 1000 )
+        printf("%ld\n",a);
+
+    else if( a >= 1000 && a < 1000000 )
+        printf("%ld,%03ld\n", a / 1000, a % 1000);
+
+    else if( a >= 1000000 && a < 1000000000 )
+        printf("%ld,%03ld,%03ld\n",a / 1000000,(a % 1000000) / 1000,a % 1000 );
+
+    else if( a >= 1000000000 )
+        printf("%ld,%03ld,%03ld,%03ld\n",
+               a / 1000000000,
+               (a % 1000000000) / 1000000,
+               (a % 1000000) / 1000,
+               a % 1000 );
+    else
+        printf("%ld\n",a);
+}
+
+void print_CLI_error(void)
+{
+	printf("Usage: ./XSBench <options>\n");
+	printf("Options include:\n");
+	printf("  -m <simulation method>   Simulation method (history, event)\n");
+	printf("  -s <size>                Size of H-M Benchmark to run (small, large, XL, XXL)\n");
+	printf("  -g <gridpoints>          Number of gridpoints per nuclide (overrides -s defaults)\n");
+	printf("  -G <grid type>           Grid search type (unionized, nuclide, hash). Defaults to unionized.\n");
+	printf("  -p <particles>           Number of particle histories\n");
+	printf("  -l <lookups>             History Based: Number of Cross-section (XS) lookups per particle. Event Based: Total number of XS lookups.\n");
+	printf("  -h <hash bins>           Number of hash bins (only relevant when used with \"-G hash\")\n");
+	printf("  -b <binary mode>         Read or write all data structures to file. If reading, this will skip initialization phase. (read, write)\n");
+	printf("  -k <kernel ID>           Specifies which kernel to run. 0 is baseline, 1, 2, etc are optimized variants. (0 is default.)\n");
+	printf("Default is equivalent to: -m history -s large -l 34 -p 500000 -G unionized\n");
+	printf("See readme for full description of default run values\n");
+	exit(4);
+}
+
+Inputs read_CLI( int argc, char * argv[] )
+{
+	Inputs input;
+
+	// defaults to the history based simulation method
+	input.simulation_method = HISTORY_BASED;
+	
+	// defaults to max threads on the system	
+	input.nthreads = omp_get_num_procs();
+	
+	// defaults to 355 (corresponding to H-M Large benchmark)
+	input.n_isotopes = 355;
+	
+	// defaults to 11303 (corresponding to H-M Large benchmark)
+	input.n_gridpoints = 11303;
+
+	// defaults to 500,000
+	input.particles = 500000;
+	
+	// defaults to 34
+	input.lookups = 34;
+	
+	// default to unionized grid
+	input.grid_type = UNIONIZED;
+
+	// default to unionized grid
+	input.hash_bins = 10000;
+
+	// default to no binary read/write
+	input.binary_mode = NONE;
+	
+	// defaults to baseline kernel
+	input.kernel_id = 0;
+	
+	// defaults to H-M Large benchmark
+	input.HM = (char *) malloc( 6 * sizeof(char) );
+	input.HM[0] = 'l' ; 
+	input.HM[1] = 'a' ; 
+	input.HM[2] = 'r' ; 
+	input.HM[3] = 'g' ; 
+	input.HM[4] = 'e' ; 
+	input.HM[5] = '\0';
+	
+	// Check if user sets these
+	int user_g = 0;
+
+	int default_lookups = 1;
+	int default_particles = 1;
+	
+	// Collect Raw Input
+	for( int i = 1; i < argc; i++ )
+	{
+		char * arg = argv[i];
+
+		// n_gridpoints (-g)
+		if( strcmp(arg, "-g") == 0 )
+		{	
+			if( ++i < argc )
+			{
+				user_g = 1;
+				input.n_gridpoints = atol(argv[i]);
+			}
+			else
+				print_CLI_error();
+		}
+		// Simulation Method (-m)
+		else if( strcmp(arg, "-m") == 0 )
+		{
+			char * sim_type;
+			if( ++i < argc )
+				sim_type = argv[i];
+			else
+				print_CLI_error();
+
+			if( strcmp(sim_type, "history") == 0 )
+				input.simulation_method = HISTORY_BASED;
+			else if( strcmp(sim_type, "event") == 0 )
+			{
+				input.simulation_method = EVENT_BASED;
+				// Also resets default # of lookups
+				if( default_lookups && default_particles )
+				{
+					input.lookups =  input.lookups * input.particles;
+					input.particles = 0;
+				}
+			}
+			else
+				print_CLI_error();
+		}
+		// lookups (-l)
+		else if( strcmp(arg, "-l") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.lookups = atoi(argv[i]);
+				default_lookups = 0;
+			}
+			else
+				print_CLI_error();
+		}
+		// hash bins (-h)
+		else if( strcmp(arg, "-h") == 0 )
+		{
+			if( ++i < argc )
+				input.hash_bins = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// particles (-p)
+		else if( strcmp(arg, "-p") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.particles = atoi(argv[i]);
+				default_particles = 0;
+			}
+			else
+				print_CLI_error();
+		}
+		// HM (-s)
+		else if( strcmp(arg, "-s") == 0 )
+		{	
+			if( ++i < argc )
+				input.HM = argv[i];
+			else
+				print_CLI_error();
+		}
+		// grid type (-G)
+		else if( strcmp(arg, "-G") == 0 )
+		{
+			char * grid_type;
+			if( ++i < argc )
+				grid_type = argv[i];
+			else
+				print_CLI_error();
+
+			if( strcmp(grid_type, "unionized") == 0 )
+				input.grid_type = UNIONIZED;
+			else if( strcmp(grid_type, "nuclide") == 0 )
+				input.grid_type = NUCLIDE;
+			else if( strcmp(grid_type, "hash") == 0 )
+				input.grid_type = HASH;
+			else
+				print_CLI_error();
+		}
+		// binary mode (-b)
+		else if( strcmp(arg, "-b") == 0 )
+		{
+			char * binary_mode;
+			if( ++i < argc )
+				binary_mode = argv[i];
+			else
+				print_CLI_error();
+
+			if( strcmp(binary_mode, "read") == 0 )
+				input.binary_mode = READ;
+			else if( strcmp(binary_mode, "write") == 0 )
+				input.binary_mode = WRITE;
+			else
+				print_CLI_error();
+		}
+		// kernel optimization selection (-k)
+		else if( strcmp(arg, "-k") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.kernel_id = atoi(argv[i]);
+			}
+			else
+				print_CLI_error();
+		}
+		else
+			print_CLI_error();
+	}
+
+	// Validate Input
+	
+	// Validate nthreads
+	if( input.nthreads < 1 )
+		print_CLI_error();
+	
+	// Validate n_isotopes
+	if( input.n_isotopes < 1 )
+		print_CLI_error();
+	
+	// Validate n_gridpoints
+	if( input.n_gridpoints < 1 )
+		print_CLI_error();
+
+	// Validate lookups
+	if( input.lookups < 1 )
+		print_CLI_error();
+
+	// Validate Hash Bins 
+	if( input.hash_bins < 1 )
+		print_CLI_error();
+	
+	// Validate HM size
+	if( strcasecmp(input.HM, "small") != 0 &&
+		strcasecmp(input.HM, "large") != 0 &&
+		strcasecmp(input.HM, "XL") != 0 &&
+		strcasecmp(input.HM, "XXL") != 0 )
+		print_CLI_error();
+	
+	// Set HM size specific parameters
+	// (defaults to large)
+	if( strcasecmp(input.HM, "small") == 0 )
+		input.n_isotopes = 68;
+	else if( strcasecmp(input.HM, "XL") == 0 && user_g == 0 )
+		input.n_gridpoints = 238847; // sized to make 120 GB XS data
+	else if( strcasecmp(input.HM, "XXL") == 0 && user_g == 0 )
+		input.n_gridpoints = 238847 * 2.1; // 252 GB XS data
+
+	// Return input struct
+	return input;
+}
+
+void binary_write( Inputs in, SimulationData SD )
+{
+	char * fname = "XS_data.dat";
+	printf("Writing all data structures to binary file %s...\n", fname);
+	FILE * fp = fopen(fname, "w");
+
+	// Write SimulationData Object. Include pointers, even though we won't be using them.
+	fwrite(&SD, sizeof(SimulationData), 1, fp);
+
+	// Write heap arrays in SimulationData Object
+	fwrite(SD.num_nucs,       sizeof(int), SD.length_num_nucs, fp);
+	fwrite(SD.concs,          sizeof(double), SD.length_concs, fp);
+	fwrite(SD.mats,           sizeof(int), SD.length_mats, fp);
+	fwrite(SD.nuclide_grid,   sizeof(NuclideGridPoint), SD.length_nuclide_grid, fp); 
+	fwrite(SD.index_grid, sizeof(int), SD.length_index_grid, fp);
+	fwrite(SD.unionized_energy_array, sizeof(double), SD.length_unionized_energy_array, fp);
+
+	fclose(fp);
+}
+
+SimulationData binary_read( Inputs in )
+{
+	SimulationData SD;
+	
+	char * fname = "XS_data.dat";
+	printf("Reading all data structures from binary file %s...\n", fname);
+
+	FILE * fp = fopen(fname, "r");
+	assert(fp != NULL);
+
+	// Read SimulationData Object. Include pointers, even though we won't be using them.
+	fread(&SD, sizeof(SimulationData), 1, fp);
+
+	// Allocate space for arrays on heap
+	SD.num_nucs = (int *) malloc(SD.length_num_nucs * sizeof(int));
+	SD.concs = (double *) malloc(SD.length_concs * sizeof(double));
+	SD.mats = (int *) malloc(SD.length_mats * sizeof(int));
+	SD.nuclide_grid = (NuclideGridPoint *) malloc(SD.length_nuclide_grid * sizeof(NuclideGridPoint));
+	SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
+	SD.unionized_energy_array = (double *) malloc( SD.length_unionized_energy_array * sizeof(double));
+
+	// Read heap arrays into SimulationData Object
+	fread(SD.num_nucs,       sizeof(int), SD.length_num_nucs, fp);
+	fread(SD.concs,          sizeof(double), SD.length_concs, fp);
+	fread(SD.mats,           sizeof(int), SD.length_mats, fp);
+	fread(SD.nuclide_grid,   sizeof(NuclideGridPoint), SD.length_nuclide_grid, fp); 
+	fread(SD.index_grid, sizeof(int), SD.length_index_grid, fp);
+	fread(SD.unionized_energy_array, sizeof(double), SD.length_unionized_energy_array, fp);
+
+	fclose(fp);
+
+	return SD;
+}

From fc50bcd83ab16c21cedbb96c9c75e169d9f92ebf Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@umd.edu>
Date: Wed, 5 Jul 2023 22:53:43 -0400
Subject: [PATCH 02/23] Add compiling Kokkos port, verification fails

---
 kokkos/GridInit.cpp       |  45 +++++++++--
 kokkos/Main.cpp           |   3 +-
 kokkos/Makefile           |  71 ++++++-----------
 kokkos/Materials.cpp      |   2 +-
 kokkos/Simulation.cpp     | 160 +++++++++++++++++++++++++-------------
 kokkos/XSbench_header.hpp |  33 ++++----
 kokkos/XSutils.cpp        |   2 +-
 kokkos/io.cpp             |   2 +-
 8 files changed, 195 insertions(+), 123 deletions(-)

diff --git a/kokkos/GridInit.cpp b/kokkos/GridInit.cpp
index 79c992f1..faad5871 100644
--- a/kokkos/GridInit.cpp
+++ b/kokkos/GridInit.cpp
@@ -1,4 +1,5 @@
-#include "XSbench_header.h"
+// -*- c-basic-offset: 8; tab-width: 8; indent-tabs-mode: t; -*-
+#include "XSbench_header.hpp"
 
 SimulationData grid_init_do_not_profile( Inputs in, int mype )
 {
@@ -28,9 +29,6 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 	// energy level are read whenever the gridpoint is accessed, meaning the
 	// AOS is more cache efficient.
 
-        // Josh: need to create unmanaged View of Host memory and then deep copy
-        // it to the device. Layouts need to be the same, figure out left or right
-        
 	// Initialize Nuclide Grid
 	SD.length_nuclide_grid = in.n_isotopes * in.n_gridpoints;
 	SD.nuclide_grid     = (NuclideGridPoint *) malloc( SD.length_nuclide_grid * sizeof(NuclideGridPoint));
@@ -145,7 +143,7 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 			// We need to determine the bounding energy levels for all isotopes
 			for( long i = 0; i < in.n_isotopes; i++ )
 			{
-				SD.index_grid[e * in.n_isotopes + i] = grid_search_nuclide( in.n_gridpoints, energy, SD.nuclide_grid + i * in.n_gridpoints, 0, in.n_gridpoints-1);
+				SD.index_grid[e * in.n_isotopes + i] = grid_search_nuclide_old( in.n_gridpoints, energy, SD.nuclide_grid + i * in.n_gridpoints, 0, in.n_gridpoints-1);
 			}
 		}
 	}
@@ -173,6 +171,43 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 	SD.concs = load_concs(SD.num_nucs, SD.max_num_nucs);
 	SD.length_concs = SD.length_mats;
 
+        Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+			u_num_nucs(SD.num_nucs, SD.length_num_nucs);
+        SD.d_num_nucs = new IntView("d_num_nucs", SD.length_num_nucs);
+        Kokkos::deep_copy(*SD.d_num_nucs, u_num_nucs);
+        
+        Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+			u_concs(SD.concs, SD.length_concs);
+        SD.d_concs = new DoubleView("d_concs", SD.length_concs);
+        Kokkos::deep_copy(*SD.d_concs, u_concs);
+        
+        Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+			u_mats(SD.mats, SD.length_mats);
+        SD.d_mats = new IntView("d_mats", SD.length_mats);
+        Kokkos::deep_copy(*SD.d_mats, u_mats);
+        
+        Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+			u_unionized_energy_array(SD.unionized_energy_array, SD.length_unionized_energy_array);
+        SD.d_unionized_energy_array = new DoubleView("d_unionized_energy_array",
+						     SD.length_unionized_energy_array);
+        Kokkos::deep_copy(*SD.d_unionized_energy_array, u_unionized_energy_array);
+        
+        Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+            u_index_grid(SD.index_grid, SD.length_index_grid);
+        SD.d_index_grid = new IntView("d_index_grid", SD.length_index_grid);
+        Kokkos::deep_copy(*SD.d_index_grid, u_index_grid);
+        
+        Kokkos::View<NuclideGridPoint*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+            u_nuclide_grid(SD.nuclide_grid, SD.length_nuclide_grid);
+        SD.d_nuclide_grid = new PointView("d_nuclide_grid", SD.length_nuclide_grid);
+        Kokkos::deep_copy(*SD.d_nuclide_grid, u_nuclide_grid);
+        
 	if(mype == 0) printf("Intialization complete. Allocated %.0lf MB of data.\n", nbytes/1024.0/1024.0 );
 
 	return SD;
diff --git a/kokkos/Main.cpp b/kokkos/Main.cpp
index cb10de9c..ddc13d23 100644
--- a/kokkos/Main.cpp
+++ b/kokkos/Main.cpp
@@ -1,4 +1,5 @@
-#include "XSbench_header.h"
+// -*- c-basic-offset: 8; tab-width: 8; indent-tabs-mode: t; -*-
+#include "XSbench_header.hpp"
 
 #ifdef MPI
 #include<mpi.h>
diff --git a/kokkos/Makefile b/kokkos/Makefile
index 5fbce917..24121519 100644
--- a/kokkos/Makefile
+++ b/kokkos/Makefile
@@ -2,11 +2,16 @@
 # User Options
 #===============================================================================
 
-COMPILER    = llvm
+KOKKOS_PATH = ${HOME}/repos/perf-port/kokkos
+CUDA_PATH   = /usr/local/cuda-11.6
+COMPILER    = nvidia
 OPTIMIZE    = yes
-DEBUG       = no
+DEBUG       = yes
 PROFILE     = no
 MPI         = no
+KOKKOS_DEVICES = OpenMP,Cuda
+KOKKOS_ARCH = Zen3,Ampere86
+SM_VERSION = 86
 
 #===============================================================================
 # Program name & source code list
@@ -29,79 +34,53 @@ obj = $(source:.cpp=.o)
 #===============================================================================
 
 # Standard Flags
-CFLAGS := -std=gnu99 -Wall
+CXXFLAGS := -Wall
 
 # Linker Flags
 LDFLAGS = -lm
 
-# Regular gcc Compiler
-ifeq ($(COMPILER),gnu)
-  CC = gcc
-  CFLAGS += -fopenmp -flto
-endif
-
-# Intel Compiler
-ifeq ($(COMPILER),intel)
-  CC = icx 
-  CFLAGS += -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ 
-endif
-
-# LLVM Compiler Targeting A100 -- Change SM Level to Target Other GPUs
-ifeq ($(COMPILER),llvm)
-  CC = clang
-  CFLAGS += -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_80
-endif
-
-# IBM XL Compiler
-ifeq ($(COMPILER),ibm)
-  CC = xlc_r
-  CFLAGS += -qsmp=omp -qoffload
-endif
-
-# NVIDIA Compiler Targeting A100 -- Change SM Level to Target Other GPUs
+# NVIDIA Compiler
 ifeq ($(COMPILER),nvidia)
-  CC = nvc
-  CFLAGS += -mp=gpu -Minfo=mp -gpu=cc80
-endif
-
-# AOMP Targeting MI100 -- Change march to Target Other GPUs
-ifeq ($(COMPILER),amd)
-  CC = clang
-  CFLAGS += -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
+  CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+  CXXFLAGS += -Xcompiler -Wall -Xcompiler -O3 -arch=sm_$(SM_VERSION)
 endif
 
 # Debug Flags
 ifeq ($(DEBUG),yes)
-  CFLAGS += -g
-  LDFLAGS  += -g
+  CXXFLAGS += -g -G
+  LDFLAGS  += -g -G
 endif
 
 # Profiling Flags
 ifeq ($(PROFILE),yes)
-  CFLAGS += -pg
+  CXXFLAGS += -pg
   LDFLAGS  += -pg
 endif
 
 # Optimization Flags
 ifeq ($(OPTIMIZE),yes)
-  CFLAGS += -O3
+  CXXFLAGS += -O3
 endif
 
 # MPI
 ifeq ($(MPI),yes)
-  CC = mpicc
-  CFLAGS += -DMPI
+  CXX = mpicxx
+  CXXFLAGS += -DMPI
 endif
 
 #===============================================================================
 # Targets to Build
 #===============================================================================
 
-$(program): $(obj) XSbench_header.hpp Makefile
-	$(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
+default: $(program)
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+$(program): $(obj) XSbench_header.hpp Makefile $(KOKKOS_LINK_DEPENDS)
+	$(CXX) $(CXXFLAGS) $(KOKKOS_LDFLAGS) $(obj) $(KOKKOS_LIBS) -o $@ $(LDFLAGS)
 
-%.o: %.cpp XSbench_header.hpp Makefile
-	$(CC) $(CFLAGS) -c $< -o $@
+%.o: %.cpp XSbench_header.hpp Makefile $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $< -o $@
 
 clean:
 	rm -rf $(program) $(obj)
diff --git a/kokkos/Materials.cpp b/kokkos/Materials.cpp
index 69f1c7f7..450b970d 100644
--- a/kokkos/Materials.cpp
+++ b/kokkos/Materials.cpp
@@ -1,7 +1,7 @@
 // Material data is hard coded into the functions in this file.
 // Note that there are 12 materials present in H-M (large or small)
 
-#include "XSbench_header.h"
+#include "XSbench_header.hpp"
 
 // num_nucs represents the number of nuclides that each material contains
 int * load_num_nucs(long n_isotopes)
diff --git a/kokkos/Simulation.cpp b/kokkos/Simulation.cpp
index b38053bc..f6d450b4 100644
--- a/kokkos/Simulation.cpp
+++ b/kokkos/Simulation.cpp
@@ -1,4 +1,5 @@
-#include "XSbench_header.h"
+// -*- c-basic-offset: 8; tab-width: 8; indent-tabs-mode: t; -*-
+#include "XSbench_header.hpp"
 
 ////////////////////////////////////////////////////////////////////////////////////
 // BASELINE FUNCTIONS
@@ -39,8 +40,12 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	////////////////////////////////////////////////////////////////////////////////
 	// Begin Actual Simulation Loop 
 	////////////////////////////////////////////////////////////////////////////////
-	unsigned long long * verification = (unsigned long long *) malloc(in.lookups * sizeof(unsigned long long));
+	//unsigned long long * verification = (unsigned long long *) malloc(in.lookups * sizeof(unsigned long long));
+	Kokkos::View<unsigned long long*> d_verification("d_ver", in.lookups);
+	Kokkos::View<unsigned long long*>::HostMirror verification =
+			Kokkos::create_mirror_view(d_verification);
 
+	/*
 	#pragma omp target teams distribute parallel for\
 	map(to: SD.max_num_nucs)\
 	map(to: SD.num_nucs[:SD.length_num_nucs])\
@@ -49,9 +54,11 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	map(to: SD.unionized_energy_array[:SD.length_unionized_energy_array])\
 	map(to: SD.index_grid[:SD.length_index_grid])\
 	map(to: SD.nuclide_grid[:SD.length_nuclide_grid])\
-  map(from: verification[:in.lookups])
+	map(from: verification[:in.lookups])
 	for( int i = 0; i < in.lookups; i++ )
 	{
+	*/
+	Kokkos::parallel_for("Simulation", in.lookups, KOKKOS_LAMBDA (int i) {
 		// Set the initial seed value
 		uint64_t seed = STARTING_SEED;	
 
@@ -73,12 +80,12 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 				mat,             // Sampled material type index neutron is in
 				in.n_isotopes,   // Total number of isotopes in simulation
 				in.n_gridpoints, // Number of gridpoints per isotope in simulation
-				SD.num_nucs,     // 1-D array with number of nuclides per material
-				SD.concs,        // Flattened 2-D array with concentration of each nuclide in each material
-				SD.unionized_energy_array, // 1-D Unionized energy array
-				SD.index_grid,   // Flattened 2-D grid holding indices into nuclide grid for each unionized energy level
-				SD.nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
-				SD.mats,         // Flattened 2-D array with nuclide indices defining composition of each type of material
+				SD.d_num_nucs,     // 1-D array with number of nuclides per material
+				SD.d_concs,        // Flattened 2-D array with concentration of each nuclide in each material
+				SD.d_unionized_energy_array, // 1-D Unionized energy array
+				SD.d_index_grid,   // Flattened 2-D grid holding indices into nuclide grid for each unionized energy level
+				SD.d_nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
+				SD.d_mats,         // Flattened 2-D array with nuclide indices defining composition of each type of material
 				macro_xs_vector, // 1-D array with result of the macroscopic cross section (5 different reaction channels)
 				in.grid_type,    // Lookup type (nuclide, hash, or unionized)
 				in.hash_bins,    // Number of hash bins used (if using hash lookup type)
@@ -103,82 +110,97 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 				max_idx = j;
 			}
 		}
-		verification[i] = max_idx+1;
-	}
+		d_verification(i) = max_idx+1;
+	});
 
-  // Reduce validation hash on the host
-  unsigned long long validation_hash = 0;
+	Kokkos::deep_copy(verification, d_verification);
+	Kokkos::fence();
+		
+	// Reduce validation hash on the host
+	unsigned long long validation_hash = 0;
 	for( int i = 0; i < in.lookups; i++ )
-    validation_hash += verification[i];
-
+		validation_hash += verification[i];
+	
 	return validation_hash;
 }
 
 // Calculates the microscopic cross section for a given nuclide & energy
+KOKKOS_INLINE_FUNCTION
 void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
-                           long n_gridpoints,
-                           double *  egrid, int *  index_data,
-                           NuclideGridPoint *  nuclide_grids,
-                           long idx, double *  xs_vector, int grid_type, int hash_bins ){
+			   long n_gridpoints,
+			   DoubleView*  egrid, IntView*  index_data,
+			   PointView *  nuclide_grids,
+			   long idx, double *  xs_vector, int grid_type, int hash_bins ){
 	// Variables
 	double f;
 	NuclideGridPoint * low, * high;
+	long low_i, high_i;
 
 	// If using only the nuclide grid, we must perform a binary search
 	// to find the energy location in this particular nuclide's grid.
 	if( grid_type == NUCLIDE )
 	{
 		// Perform binary search on the Nuclide Grid to find the index
-		idx = grid_search_nuclide( n_gridpoints, p_energy, &nuclide_grids[nuc*n_gridpoints], 0, n_gridpoints-1);
+		idx = grid_search_nuclide( n_gridpoints, p_energy, nuclide_grids, nuc*n_gridpoints, 0, n_gridpoints-1);
 
 		// pull ptr from nuclide grid and check to ensure that
 		// we're not reading off the end of the nuclide's grid
-		if( idx == n_gridpoints - 1 )
-			low = &nuclide_grids[nuc*n_gridpoints + idx - 1];
-		else
-			low = &nuclide_grids[nuc*n_gridpoints + idx];
+		if( idx == n_gridpoints - 1 ) {
+			low_i = nuc*n_gridpoints + idx - 1;
+			low = &(*nuclide_grids)(low_i);
+		} else {
+			low_i = nuc*n_gridpoints + idx;
+			low = &(*nuclide_grids)(low_i);
+		}
 	}
 	else if( grid_type == UNIONIZED) // Unionized Energy Grid - we already know the index, no binary search needed.
 	{
 		// pull ptr from energy grid and check to ensure that
 		// we're not reading off the end of the nuclide's grid
-		if( index_data[idx * n_isotopes + nuc] == n_gridpoints - 1 )
-			low = &nuclide_grids[nuc*n_gridpoints + index_data[idx * n_isotopes + nuc] - 1];
-		else
-			low = &nuclide_grids[nuc*n_gridpoints + index_data[idx * n_isotopes + nuc]];
+		if( (*index_data)(idx * n_isotopes + nuc) == n_gridpoints - 1 ) {
+			low_i = nuc*n_gridpoints + (*index_data)(idx * n_isotopes + nuc) - 1;
+			low = &(*nuclide_grids)(low_i);
+		} else {
+			low_i = nuc*n_gridpoints + (*index_data)(idx * n_isotopes + nuc);
+			low = &(*nuclide_grids)(low_i);
+		}
 	}
 	else // Hash grid
 	{
 		// load lower bounding index
-		int u_low = index_data[idx * n_isotopes + nuc];
+		int u_low = (*index_data)(idx * n_isotopes + nuc);
 
 		// Determine higher bounding index
 		int u_high;
 		if( idx == hash_bins - 1 )
 			u_high = n_gridpoints - 1;
 		else
-			u_high = index_data[(idx+1)*n_isotopes + nuc] + 1;
+			u_high = (*index_data)((idx+1)*n_isotopes + nuc) + 1;
 
 		// Check edge cases to make sure energy is actually between these
 		// Then, if things look good, search for gridpoint in the nuclide grid
 		// within the lower and higher limits we've calculated.
-		double e_low  = nuclide_grids[nuc*n_gridpoints + u_low].energy;
-		double e_high = nuclide_grids[nuc*n_gridpoints + u_high].energy;
+		double e_low  = (*nuclide_grids)(nuc*n_gridpoints + u_low).energy;
+		double e_high = (*nuclide_grids)(nuc*n_gridpoints + u_high).energy;
 		int lower;
 		if( p_energy <= e_low )
 			lower = 0;
 		else if( p_energy >= e_high )
 			lower = n_gridpoints - 1;
 		else
-			lower = grid_search_nuclide( n_gridpoints, p_energy, &nuclide_grids[nuc*n_gridpoints], u_low, u_high);
-
-		if( lower == n_gridpoints - 1 )
-			low = &nuclide_grids[nuc*n_gridpoints + lower - 1];
-		else
-			low = &nuclide_grids[nuc*n_gridpoints + lower];
+			lower = grid_search_nuclide( n_gridpoints, p_energy, nuclide_grids, nuc*n_gridpoints, u_low, u_high);
+
+		if( lower == n_gridpoints - 1 ) {
+			low_i = nuc*n_gridpoints + lower - 1;
+			low = &(*nuclide_grids)(low_i);
+		} else {
+			low_i = nuc*n_gridpoints + lower;
+			low = &(*nuclide_grids)(low_i);
+		}
 	}
-	
-	high = low + 1;
+
+	high_i = low_i + 1;
+	high = &(*nuclide_grids)(high_i);
 	
 	// calculate the re-useable interpolation factor
 	f = (high->energy - p_energy) / (high->energy - low->energy);
@@ -211,13 +233,14 @@ void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
 	
 }
 
-// Calculates macroscopic cross section based on a given material & energy 
+// Calculates macroscopic cross section based on a given material & energy
+KOKKOS_INLINE_FUNCTION
 void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
-                         long n_gridpoints, int *  num_nucs,
-                         double *  concs,
-                         double *  egrid, int *  index_data,
-                         NuclideGridPoint *  nuclide_grids,
-                         int *  mats,
+                         long n_gridpoints, IntView*  num_nucs,
+                         DoubleView*  concs,
+                         DoubleView*  egrid, IntView*  index_data,
+                         PointView*  nuclide_grids,
+                         IntView*  mats,
                          double *  macro_xs_vector, int grid_type, int hash_bins, int max_num_nucs ){
 	int p_nuc; // the nuclide we are looking up
 	long idx = -1;	
@@ -233,7 +256,7 @@ void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
 	// done inside of the "calculate_micro_xs" function for each different
 	// nuclide in the material.
 	if( grid_type == UNIONIZED )
-		idx = grid_search( n_isotopes * n_gridpoints, p_energy, egrid);	
+		idx = grid_search( n_isotopes * n_gridpoints, p_energy, egrid, 0);	
 	else if( grid_type == HASH )
 	{
 		double du = 1.0 / hash_bins;
@@ -250,11 +273,11 @@ void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
 	// (Independent -- though if parallelizing, must use atomic operations
 	//  or otherwise control access to the xs_vector and macro_xs_vector to
 	//  avoid simulataneous writing to the same data structure)
-	for( int j = 0; j < num_nucs[mat]; j++ )
+	for( int j = 0; j < (*num_nucs)(mat); j++ )
 	{
 		double xs_vector[5];
-		p_nuc = mats[mat*max_num_nucs + j];
-		conc = concs[mat*max_num_nucs + j];
+		p_nuc = (*mats)(mat*max_num_nucs + j);
+		conc = (*concs)(mat*max_num_nucs + j);
 		calculate_micro_xs( p_energy, p_nuc, n_isotopes,
 		                    n_gridpoints, egrid, index_data,
 		                    nuclide_grids, idx, xs_vector, grid_type, hash_bins );
@@ -273,7 +296,8 @@ void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
 
 // binary search for energy on unionized energy grid
 // returns lower index
-long grid_search( long n, double quarry, double *  A)
+KOKKOS_INLINE_FUNCTION
+long grid_search( long n, double quarry, DoubleView* A, long off)
 {
 	long lowerLimit = 0;
 	long upperLimit = n-1;
@@ -284,7 +308,7 @@ long grid_search( long n, double quarry, double *  A)
 	{
 		examinationPoint = lowerLimit + ( length / 2 );
 		
-		if( A[examinationPoint] > quarry )
+		if( (*A)(examinationPoint + off) > quarry )
 			upperLimit = examinationPoint;
 		else
 			lowerLimit = examinationPoint;
@@ -296,7 +320,30 @@ long grid_search( long n, double quarry, double *  A)
 }
 
 // binary search for energy on nuclide energy grid
-long grid_search_nuclide( long n, double quarry, NuclideGridPoint * A, long low, long high)
+KOKKOS_INLINE_FUNCTION
+long grid_search_nuclide( long n, double quarry, PointView* A, long off, long low, long high)
+{
+	long lowerLimit = low;
+	long upperLimit = high;
+	long examinationPoint;
+	long length = upperLimit - lowerLimit;
+
+	while( length > 1 )
+	{
+		examinationPoint = lowerLimit + ( length / 2 );
+		
+		if( (*A)(examinationPoint + off).energy > quarry )
+			upperLimit = examinationPoint;
+		else
+			lowerLimit = examinationPoint;
+		
+		length = upperLimit - lowerLimit;
+	}
+	
+	return lowerLimit;
+}
+
+long grid_search_nuclide_old( long n, double quarry, NuclideGridPoint* A, long low, long high)
 {
 	long lowerLimit = low;
 	long upperLimit = high;
@@ -318,7 +365,9 @@ long grid_search_nuclide( long n, double quarry, NuclideGridPoint * A, long low,
 	return lowerLimit;
 }
 
+
 // picks a material based on a probabilistic distribution
+KOKKOS_INLINE_FUNCTION
 int pick_mat( uint64_t * seed )
 {
 	// I have a nice spreadsheet supporting these numbers. They are
@@ -358,6 +407,8 @@ int pick_mat( uint64_t * seed )
 	return 0;
 }
 
+// This won't compile if this is inline for some reason
+KOKKOS_FUNCTION
 double LCG_random_double(uint64_t * seed)
 {
 	// LCG parameters
@@ -368,8 +419,9 @@ double LCG_random_double(uint64_t * seed)
 	return (double) (*seed) / (double) m;
 	//return ldexp(*seed, -63);
 
-}	
+}
 
+KOKKOS_INLINE_FUNCTION
 uint64_t fast_forward_LCG(uint64_t seed, uint64_t n)
 {
 	// LCG parameters
diff --git a/kokkos/XSbench_header.hpp b/kokkos/XSbench_header.hpp
index db5e9c01..f71e8ab6 100644
--- a/kokkos/XSbench_header.hpp
+++ b/kokkos/XSbench_header.hpp
@@ -15,9 +15,6 @@
 #include<stdint.h>
 
 #include <Kokkos_Core.hpp>
-#include <Kokkos_Parallel.hpp>
-#include <Kokkos_View.hpp>
-#include <Kokkos_Sort.hpp>
 
 // Papi Header
 #ifdef PAPI
@@ -110,22 +107,30 @@ SimulationData binary_read( Inputs in );
 // Simulation.c
 unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int mype);
 unsigned long long run_history_based_simulation(Inputs in, SimulationData SD, int mype);
+KOKKOS_INLINE_FUNCTION
 void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
-                           long n_gridpoints,
-                           double *  egrid, int *  index_data,
-                           NuclideGridPoint *  nuclide_grids,
-                           long idx, double *  xs_vector, int grid_type, int hash_bins );
+			   long n_gridpoints,
+			   DoubleView*  egrid, IntView*  index_data,
+			   PointView *  nuclide_grids,
+			   long idx, double *  xs_vector, int grid_type, int hash_bins );
+KOKKOS_INLINE_FUNCTION
 void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
-                         long n_gridpoints, int *  num_nucs,
-                         double *  concs,
-                         double *  egrid, int *  index_data,
-                         NuclideGridPoint *  nuclide_grids,
-                         int *  mats,
+                         long n_gridpoints, IntView*  num_nucs,
+                         DoubleView*  concs,
+                         DoubleView*  egrid, IntView*  index_data,
+                         PointView*  nuclide_grids,
+                         IntView*  mats,
                          double *  macro_xs_vector, int grid_type, int hash_bins, int max_num_nucs );
-long grid_search( long n, double quarry, double *  A);
-long grid_search_nuclide( long n, double quarry, NuclideGridPoint * A, long low, long high);
+KOKKOS_INLINE_FUNCTION
+long grid_search( long n, double quarry, DoubleView* A, long off);
+KOKKOS_INLINE_FUNCTION
+long grid_search_nuclide( long n, double quarry, PointView* A, long off, long low, long high);
+long grid_search_nuclide_old( long n, double quarry, NuclideGridPoint* A, long low, long high);
+KOKKOS_INLINE_FUNCTION
 int pick_mat( uint64_t * seed );
+KOKKOS_FUNCTION
 double LCG_random_double(uint64_t * seed);
+KOKKOS_INLINE_FUNCTION
 uint64_t fast_forward_LCG(uint64_t seed, uint64_t n);
 unsigned long long run_event_based_simulation_optimization_1(Inputs in, SimulationData SD, int mype);
 
diff --git a/kokkos/XSutils.cpp b/kokkos/XSutils.cpp
index d433ecdc..a2f64654 100644
--- a/kokkos/XSutils.cpp
+++ b/kokkos/XSutils.cpp
@@ -1,4 +1,4 @@
-#include "XSbench_header.h"
+#include "XSbench_header.hpp"
 
 int double_compare(const void * a, const void * b)
 {
diff --git a/kokkos/io.cpp b/kokkos/io.cpp
index 17b6d32c..2bd4b5d8 100644
--- a/kokkos/io.cpp
+++ b/kokkos/io.cpp
@@ -1,4 +1,4 @@
-#include "XSbench_header.h"
+#include "XSbench_header.hpp"
 
 #ifdef MPI
 #include<mpi.h>

From fb67e698fe069a214867946c558039c69be8abf8 Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@umd.edu>
Date: Mon, 10 Jul 2023 12:36:18 -0400
Subject: [PATCH 03/23] Small changes to Simulation in Kokkos for easier
 debugging

---
 kokkos/Simulation.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kokkos/Simulation.cpp b/kokkos/Simulation.cpp
index f6d450b4..948fb6e3 100644
--- a/kokkos/Simulation.cpp
+++ b/kokkos/Simulation.cpp
@@ -44,6 +44,7 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	Kokkos::View<unsigned long long*> d_verification("d_ver", in.lookups);
 	Kokkos::View<unsigned long long*>::HostMirror verification =
 			Kokkos::create_mirror_view(d_verification);
+	//Kokkos::deep_copy(d_verification, verification);
 
 	/*
 	#pragma omp target teams distribute parallel for\
@@ -58,7 +59,9 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	for( int i = 0; i < in.lookups; i++ )
 	{
 	*/
-	Kokkos::parallel_for("Simulation", in.lookups, KOKKOS_LAMBDA (int i) {
+	Kokkos::parallel_for("Simulation",
+			     Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0, in.lookups),
+			     KOKKOS_LAMBDA (int i) {
 		// Set the initial seed value
 		uint64_t seed = STARTING_SEED;	
 
@@ -119,7 +122,7 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	// Reduce validation hash on the host
 	unsigned long long validation_hash = 0;
 	for( int i = 0; i < in.lookups; i++ )
-		validation_hash += verification[i];
+		validation_hash += verification(i);
 	
 	return validation_hash;
 }

From b1e6fbd39e7b932f309444d296c5a2660ce08256 Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@umd.edu>
Date: Mon, 10 Jul 2023 14:20:40 -0400
Subject: [PATCH 04/23] Changes to support CPU only compilation

---
 kokkos/Makefile | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/kokkos/Makefile b/kokkos/Makefile
index 24121519..46a7143c 100644
--- a/kokkos/Makefile
+++ b/kokkos/Makefile
@@ -10,7 +10,9 @@ DEBUG       = yes
 PROFILE     = no
 MPI         = no
 KOKKOS_DEVICES = OpenMP,Cuda
+#KOKKOS_DEVICES = OpenMP
 KOKKOS_ARCH = Zen3,Ampere86
+#KOKKOS_ARCH = Zen3
 SM_VERSION = 86
 
 #===============================================================================
@@ -45,10 +47,23 @@ ifeq ($(COMPILER),nvidia)
   CXXFLAGS += -Xcompiler -Wall -Xcompiler -O3 -arch=sm_$(SM_VERSION)
 endif
 
+# Clang Compiler
+ifeq ($(COMPILER),llvm)
+  CXX = clang++
+  CXXFLAGS += -flto -fopenmp -DOPENMP
+endif
+
+# GCC Compiler
+ifeq ($(COMPILER),gnu)
+  CXX = g++
+  CXXFLAGS += -flto -fopenmp -DOPENMP
+endif
+
+
 # Debug Flags
 ifeq ($(DEBUG),yes)
-  CXXFLAGS += -g -G
-  LDFLAGS  += -g -G
+  CXXFLAGS += -g
+  LDFLAGS  += -g
 endif
 
 # Profiling Flags

From 91d542f4c0d9cede76dec5d2d2b948aebf65cf3a Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@umd.edu>
Date: Mon, 10 Jul 2023 14:27:12 -0400
Subject: [PATCH 05/23] Change max num nucs to 0d View

---
 kokkos/GridInit.cpp       | 6 ++++++
 kokkos/Simulation.cpp     | 3 ++-
 kokkos/XSbench_header.hpp | 1 +
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/kokkos/GridInit.cpp b/kokkos/GridInit.cpp
index faad5871..32dbcc14 100644
--- a/kokkos/GridInit.cpp
+++ b/kokkos/GridInit.cpp
@@ -171,6 +171,12 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 	SD.concs = load_concs(SD.num_nucs, SD.max_num_nucs);
 	SD.length_concs = SD.length_mats;
 
+	Kokkos::View<int, Kokkos::LayoutLeft, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+			u_max_num_nucs(&SD.max_num_nucs);
+        SD.d_max_num_nucs = new Kokkos::View<int>("d_max_num_nucs");
+        Kokkos::deep_copy(*SD.d_max_num_nucs, u_max_num_nucs);
+
         Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>
 			u_num_nucs(SD.num_nucs, SD.length_num_nucs);
diff --git a/kokkos/Simulation.cpp b/kokkos/Simulation.cpp
index 948fb6e3..c627654b 100644
--- a/kokkos/Simulation.cpp
+++ b/kokkos/Simulation.cpp
@@ -74,6 +74,7 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 
 		// debugging
 		//printf("E = %lf mat = %d\n", p_energy, mat);
+		//if (i == 0 || i == 1) printf("d_max_num_nucs = %d\n", (*SD.d_max_num_nucs)());
 
 		double macro_xs_vector[5] = {0};
 		
@@ -92,7 +93,7 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 				macro_xs_vector, // 1-D array with result of the macroscopic cross section (5 different reaction channels)
 				in.grid_type,    // Lookup type (nuclide, hash, or unionized)
 				in.hash_bins,    // Number of hash bins used (if using hash lookup type)
-				SD.max_num_nucs  // Maximum number of nuclides present in any material
+				(*SD.d_max_num_nucs)()  // Maximum number of nuclides present in any material
 				);
 
 		// For verification, and to prevent the compiler from optimizing
diff --git a/kokkos/XSbench_header.hpp b/kokkos/XSbench_header.hpp
index f71e8ab6..b0ed55af 100644
--- a/kokkos/XSbench_header.hpp
+++ b/kokkos/XSbench_header.hpp
@@ -85,6 +85,7 @@ typedef struct{
 	int length_unionized_energy_array;
 	long length_index_grid;
 	int length_nuclide_grid;
+	Kokkos::View<int>* d_max_num_nucs;
 	int max_num_nucs;
 	double * p_energy_samples;
 	int length_p_energy_samples;

From a40652b6b73424fae79eb41a521fa0c4e01f0c85 Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@umd.edu>
Date: Mon, 10 Jul 2023 18:34:09 -0400
Subject: [PATCH 06/23] Fix improper copying of host pointer to device,
 validation passing

---
 kokkos/GridInit.cpp       | 19 ++++++--
 kokkos/Makefile           |  6 ++-
 kokkos/Simulation.cpp     | 96 ++++++++++++++++++---------------------
 kokkos/XSbench_header.hpp | 18 ++++----
 4 files changed, 73 insertions(+), 66 deletions(-)

diff --git a/kokkos/GridInit.cpp b/kokkos/GridInit.cpp
index 32dbcc14..32d7ca97 100644
--- a/kokkos/GridInit.cpp
+++ b/kokkos/GridInit.cpp
@@ -171,11 +171,20 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 	SD.concs = load_concs(SD.num_nucs, SD.max_num_nucs);
 	SD.length_concs = SD.length_mats;
 
-	Kokkos::View<int, Kokkos::LayoutLeft, Kokkos::HostSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-			u_max_num_nucs(&SD.max_num_nucs);
-        SD.d_max_num_nucs = new Kokkos::View<int>("d_max_num_nucs");
-        Kokkos::deep_copy(*SD.d_max_num_nucs, u_max_num_nucs);
+	int length_max_num_nucs = 1;
+
+	//Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+        //             Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+	//		u_max_num_nucs(&SD.max_num_nucs, 1);
+
+        SD.d_max_num_nucs = new IntView("d_max_num_nucs", length_max_num_nucs);
+
+	IntView::HostMirror* u_max_num_nucs = new IntView::HostMirror();
+	*u_max_num_nucs = Kokkos::create_mirror_view(*SD.d_max_num_nucs);
+	for (int i = 0; i < length_max_num_nucs; i++)
+		(*u_max_num_nucs)(0) = SD.max_num_nucs;
+
+        Kokkos::deep_copy(*SD.d_max_num_nucs, *u_max_num_nucs);
 
         Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>
diff --git a/kokkos/Makefile b/kokkos/Makefile
index 46a7143c..3f2f4903 100644
--- a/kokkos/Makefile
+++ b/kokkos/Makefile
@@ -6,7 +6,7 @@ KOKKOS_PATH = ${HOME}/repos/perf-port/kokkos
 CUDA_PATH   = /usr/local/cuda-11.6
 COMPILER    = nvidia
 OPTIMIZE    = yes
-DEBUG       = yes
+DEBUG       = no
 PROFILE     = no
 MPI         = no
 KOKKOS_DEVICES = OpenMP,Cuda
@@ -64,6 +64,10 @@ endif
 ifeq ($(DEBUG),yes)
   CXXFLAGS += -g
   LDFLAGS  += -g
+ifeq ($(COMPILER),nvidia)
+  CXXFLAGS += -G
+  LDFLAGS  += -G
+endif
 endif
 
 # Profiling Flags
diff --git a/kokkos/Simulation.cpp b/kokkos/Simulation.cpp
index c627654b..437bbda0 100644
--- a/kokkos/Simulation.cpp
+++ b/kokkos/Simulation.cpp
@@ -44,21 +44,16 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	Kokkos::View<unsigned long long*> d_verification("d_ver", in.lookups);
 	Kokkos::View<unsigned long long*>::HostMirror verification =
 			Kokkos::create_mirror_view(d_verification);
-	//Kokkos::deep_copy(d_verification, verification);
-
-	/*
-	#pragma omp target teams distribute parallel for\
-	map(to: SD.max_num_nucs)\
-	map(to: SD.num_nucs[:SD.length_num_nucs])\
-	map(to: SD.concs[:SD.length_concs])\
-	map(to: SD.mats[:SD.length_mats])\
-	map(to: SD.unionized_energy_array[:SD.length_unionized_energy_array])\
-	map(to: SD.index_grid[:SD.length_index_grid])\
-	map(to: SD.nuclide_grid[:SD.length_nuclide_grid])\
-	map(from: verification[:in.lookups])
-	for( int i = 0; i < in.lookups; i++ )
-	{
-	*/
+	Kokkos::deep_copy(d_verification, verification);
+
+	IntView num_nucs(*SD.d_num_nucs);
+	DoubleView concs(*SD.d_concs);
+	IntView mats(*SD.d_mats);
+	DoubleView unionized_energy_array(*SD.d_unionized_energy_array);
+	IntView index_grid(*SD.d_index_grid);
+	PointView nuclide_grid(*SD.d_nuclide_grid);
+	IntView max_num_nucs(*SD.d_max_num_nucs);
+	
 	Kokkos::parallel_for("Simulation",
 			     Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0, in.lookups),
 			     KOKKOS_LAMBDA (int i) {
@@ -74,7 +69,6 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 
 		// debugging
 		//printf("E = %lf mat = %d\n", p_energy, mat);
-		//if (i == 0 || i == 1) printf("d_max_num_nucs = %d\n", (*SD.d_max_num_nucs)());
 
 		double macro_xs_vector[5] = {0};
 		
@@ -84,16 +78,16 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 				mat,             // Sampled material type index neutron is in
 				in.n_isotopes,   // Total number of isotopes in simulation
 				in.n_gridpoints, // Number of gridpoints per isotope in simulation
-				SD.d_num_nucs,     // 1-D array with number of nuclides per material
-				SD.d_concs,        // Flattened 2-D array with concentration of each nuclide in each material
-				SD.d_unionized_energy_array, // 1-D Unionized energy array
-				SD.d_index_grid,   // Flattened 2-D grid holding indices into nuclide grid for each unionized energy level
-				SD.d_nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
-				SD.d_mats,         // Flattened 2-D array with nuclide indices defining composition of each type of material
+				num_nucs,     // 1-D array with number of nuclides per material
+				concs,        // Flattened 2-D array with concentration of each nuclide in each material
+				unionized_energy_array, // 1-D Unionized energy array
+				index_grid,   // Flattened 2-D grid holding indices into nuclide grid for each unionized energy level
+				nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
+				mats,         // Flattened 2-D array with nuclide indices defining composition of each type of material
 				macro_xs_vector, // 1-D array with result of the macroscopic cross section (5 different reaction channels)
 				in.grid_type,    // Lookup type (nuclide, hash, or unionized)
 				in.hash_bins,    // Number of hash bins used (if using hash lookup type)
-				(*SD.d_max_num_nucs)()  // Maximum number of nuclides present in any material
+				max_num_nucs(0)  // Maximum number of nuclides present in any material
 				);
 
 		// For verification, and to prevent the compiler from optimizing
@@ -132,8 +126,8 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 KOKKOS_INLINE_FUNCTION
 void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
 			   long n_gridpoints,
-			   DoubleView*  egrid, IntView*  index_data,
-			   PointView *  nuclide_grids,
+			   DoubleView  egrid, IntView  index_data,
+			   PointView  nuclide_grids,
 			   long idx, double *  xs_vector, int grid_type, int hash_bins ){
 	// Variables
 	double f;
@@ -151,41 +145,41 @@ void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
 		// we're not reading off the end of the nuclide's grid
 		if( idx == n_gridpoints - 1 ) {
 			low_i = nuc*n_gridpoints + idx - 1;
-			low = &(*nuclide_grids)(low_i);
+			low = &nuclide_grids(low_i);
 		} else {
 			low_i = nuc*n_gridpoints + idx;
-			low = &(*nuclide_grids)(low_i);
+			low = &nuclide_grids(low_i);
 		}
 	}
 	else if( grid_type == UNIONIZED) // Unionized Energy Grid - we already know the index, no binary search needed.
 	{
 		// pull ptr from energy grid and check to ensure that
 		// we're not reading off the end of the nuclide's grid
-		if( (*index_data)(idx * n_isotopes + nuc) == n_gridpoints - 1 ) {
-			low_i = nuc*n_gridpoints + (*index_data)(idx * n_isotopes + nuc) - 1;
-			low = &(*nuclide_grids)(low_i);
+		if( index_data(idx * n_isotopes + nuc) == n_gridpoints - 1 ) {
+			low_i = nuc*n_gridpoints + index_data(idx * n_isotopes + nuc) - 1;
+			low = &nuclide_grids(low_i);
 		} else {
-			low_i = nuc*n_gridpoints + (*index_data)(idx * n_isotopes + nuc);
-			low = &(*nuclide_grids)(low_i);
+			low_i = nuc*n_gridpoints + index_data(idx * n_isotopes + nuc);
+			low = &nuclide_grids(low_i);
 		}
 	}
 	else // Hash grid
 	{
 		// load lower bounding index
-		int u_low = (*index_data)(idx * n_isotopes + nuc);
+		int u_low = index_data(idx * n_isotopes + nuc);
 
 		// Determine higher bounding index
 		int u_high;
 		if( idx == hash_bins - 1 )
 			u_high = n_gridpoints - 1;
 		else
-			u_high = (*index_data)((idx+1)*n_isotopes + nuc) + 1;
+			u_high = index_data((idx+1)*n_isotopes + nuc) + 1;
 
 		// Check edge cases to make sure energy is actually between these
 		// Then, if things look good, search for gridpoint in the nuclide grid
 		// within the lower and higher limits we've calculated.
-		double e_low  = (*nuclide_grids)(nuc*n_gridpoints + u_low).energy;
-		double e_high = (*nuclide_grids)(nuc*n_gridpoints + u_high).energy;
+		double e_low  = nuclide_grids(nuc*n_gridpoints + u_low).energy;
+		double e_high = nuclide_grids(nuc*n_gridpoints + u_high).energy;
 		int lower;
 		if( p_energy <= e_low )
 			lower = 0;
@@ -196,15 +190,15 @@ void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
 
 		if( lower == n_gridpoints - 1 ) {
 			low_i = nuc*n_gridpoints + lower - 1;
-			low = &(*nuclide_grids)(low_i);
+			low = &nuclide_grids(low_i);
 		} else {
 			low_i = nuc*n_gridpoints + lower;
-			low = &(*nuclide_grids)(low_i);
+			low = &nuclide_grids(low_i);
 		}
 	}
 
 	high_i = low_i + 1;
-	high = &(*nuclide_grids)(high_i);
+	high = &nuclide_grids(high_i);
 	
 	// calculate the re-useable interpolation factor
 	f = (high->energy - p_energy) / (high->energy - low->energy);
@@ -240,11 +234,11 @@ void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
 // Calculates macroscopic cross section based on a given material & energy
 KOKKOS_INLINE_FUNCTION
 void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
-                         long n_gridpoints, IntView*  num_nucs,
-                         DoubleView*  concs,
-                         DoubleView*  egrid, IntView*  index_data,
-                         PointView*  nuclide_grids,
-                         IntView*  mats,
+                         long n_gridpoints, IntView  num_nucs,
+                         DoubleView  concs,
+                         DoubleView  egrid, IntView  index_data,
+                         PointView  nuclide_grids,
+                         IntView  mats,
                          double *  macro_xs_vector, int grid_type, int hash_bins, int max_num_nucs ){
 	int p_nuc; // the nuclide we are looking up
 	long idx = -1;	
@@ -277,11 +271,11 @@ void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
 	// (Independent -- though if parallelizing, must use atomic operations
 	//  or otherwise control access to the xs_vector and macro_xs_vector to
 	//  avoid simulataneous writing to the same data structure)
-	for( int j = 0; j < (*num_nucs)(mat); j++ )
+	for( int j = 0; j < num_nucs(mat); j++ )
 	{
 		double xs_vector[5];
-		p_nuc = (*mats)(mat*max_num_nucs + j);
-		conc = (*concs)(mat*max_num_nucs + j);
+		p_nuc = mats(mat*max_num_nucs + j);
+		conc = concs(mat*max_num_nucs + j);
 		calculate_micro_xs( p_energy, p_nuc, n_isotopes,
 		                    n_gridpoints, egrid, index_data,
 		                    nuclide_grids, idx, xs_vector, grid_type, hash_bins );
@@ -301,7 +295,7 @@ void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
 // binary search for energy on unionized energy grid
 // returns lower index
 KOKKOS_INLINE_FUNCTION
-long grid_search( long n, double quarry, DoubleView* A, long off)
+long grid_search( long n, double quarry, DoubleView A, long off)
 {
 	long lowerLimit = 0;
 	long upperLimit = n-1;
@@ -312,7 +306,7 @@ long grid_search( long n, double quarry, DoubleView* A, long off)
 	{
 		examinationPoint = lowerLimit + ( length / 2 );
 		
-		if( (*A)(examinationPoint + off) > quarry )
+		if( A(examinationPoint + off) > quarry )
 			upperLimit = examinationPoint;
 		else
 			lowerLimit = examinationPoint;
@@ -325,7 +319,7 @@ long grid_search( long n, double quarry, DoubleView* A, long off)
 
 // binary search for energy on nuclide energy grid
 KOKKOS_INLINE_FUNCTION
-long grid_search_nuclide( long n, double quarry, PointView* A, long off, long low, long high)
+long grid_search_nuclide( long n, double quarry, PointView A, long off, long low, long high)
 {
 	long lowerLimit = low;
 	long upperLimit = high;
@@ -336,7 +330,7 @@ long grid_search_nuclide( long n, double quarry, PointView* A, long off, long lo
 	{
 		examinationPoint = lowerLimit + ( length / 2 );
 		
-		if( (*A)(examinationPoint + off).energy > quarry )
+		if( A(examinationPoint + off).energy > quarry )
 			upperLimit = examinationPoint;
 		else
 			lowerLimit = examinationPoint;
diff --git a/kokkos/XSbench_header.hpp b/kokkos/XSbench_header.hpp
index b0ed55af..46dc43d1 100644
--- a/kokkos/XSbench_header.hpp
+++ b/kokkos/XSbench_header.hpp
@@ -85,7 +85,7 @@ typedef struct{
 	int length_unionized_energy_array;
 	long length_index_grid;
 	int length_nuclide_grid;
-	Kokkos::View<int>* d_max_num_nucs;
+	IntView* d_max_num_nucs;
 	int max_num_nucs;
 	double * p_energy_samples;
 	int length_p_energy_samples;
@@ -111,21 +111,21 @@ unsigned long long run_history_based_simulation(Inputs in, SimulationData SD, in
 KOKKOS_INLINE_FUNCTION
 void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
 			   long n_gridpoints,
-			   DoubleView*  egrid, IntView*  index_data,
+			   DoubleView  egrid, IntView  index_data,
 			   PointView *  nuclide_grids,
 			   long idx, double *  xs_vector, int grid_type, int hash_bins );
 KOKKOS_INLINE_FUNCTION
 void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
-                         long n_gridpoints, IntView*  num_nucs,
-                         DoubleView*  concs,
-                         DoubleView*  egrid, IntView*  index_data,
-                         PointView*  nuclide_grids,
-                         IntView*  mats,
+                         long n_gridpoints, IntView  num_nucs,
+                         DoubleView  concs,
+                         DoubleView  egrid, IntView  index_data,
+                         PointView  nuclide_grids,
+                         IntView  mats,
                          double *  macro_xs_vector, int grid_type, int hash_bins, int max_num_nucs );
 KOKKOS_INLINE_FUNCTION
-long grid_search( long n, double quarry, DoubleView* A, long off);
+long grid_search( long n, double quarry, DoubleView A, long off);
 KOKKOS_INLINE_FUNCTION
-long grid_search_nuclide( long n, double quarry, PointView* A, long off, long low, long high);
+long grid_search_nuclide( long n, double quarry, PointView A, long off, long low, long high);
 long grid_search_nuclide_old( long n, double quarry, NuclideGridPoint* A, long low, long high);
 KOKKOS_INLINE_FUNCTION
 int pick_mat( uint64_t * seed );

From c139c2e2e8e8cf47f2a4d12f5a98e1f59b539781 Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@umd.edu>
Date: Mon, 10 Jul 2023 18:39:26 -0400
Subject: [PATCH 07/23] Restore max num nucs initialization style

---
 kokkos/GridInit.cpp | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/kokkos/GridInit.cpp b/kokkos/GridInit.cpp
index 32d7ca97..32497504 100644
--- a/kokkos/GridInit.cpp
+++ b/kokkos/GridInit.cpp
@@ -173,18 +173,11 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 
 	int length_max_num_nucs = 1;
 
-	//Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
-        //             Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-	//		u_max_num_nucs(&SD.max_num_nucs, 1);
-
+	Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+			u_max_num_nucs(&SD.max_num_nucs, 1);
         SD.d_max_num_nucs = new IntView("d_max_num_nucs", length_max_num_nucs);
-
-	IntView::HostMirror* u_max_num_nucs = new IntView::HostMirror();
-	*u_max_num_nucs = Kokkos::create_mirror_view(*SD.d_max_num_nucs);
-	for (int i = 0; i < length_max_num_nucs; i++)
-		(*u_max_num_nucs)(0) = SD.max_num_nucs;
-
-        Kokkos::deep_copy(*SD.d_max_num_nucs, *u_max_num_nucs);
+        Kokkos::deep_copy(*SD.d_max_num_nucs, u_max_num_nucs);
 
         Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>

From 7aeaa649cdc16c2a02fb89efe78029e2417bda1a Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@umd.edu>
Date: Tue, 18 Jul 2023 09:40:21 -0400
Subject: [PATCH 08/23] Update clean rule

---
 kokkos/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kokkos/Makefile b/kokkos/Makefile
index 3f2f4903..0c5144dc 100644
--- a/kokkos/Makefile
+++ b/kokkos/Makefile
@@ -102,7 +102,7 @@ $(program): $(obj) XSbench_header.hpp Makefile $(KOKKOS_LINK_DEPENDS)
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $< -o $@
 
 clean:
-	rm -rf $(program) $(obj)
+	rm -rf $(program) $(obj) *.tmp Kokkos* desul/ libkokkos.a Lock_Array_CUDA.o
 
 edit:
 	vim -p $(source) XSbench_header.hpp

From e9f4c89ba9ee9d516acec4ef85bc05ac175fdc69 Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@udel.edu>
Date: Mon, 10 Jul 2023 19:47:03 -0400
Subject: [PATCH 09/23] Adjust Kokkos Makefile to default to Summit options

---
 kokkos/Makefile | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/kokkos/Makefile b/kokkos/Makefile
index 0c5144dc..7668679e 100644
--- a/kokkos/Makefile
+++ b/kokkos/Makefile
@@ -2,18 +2,16 @@
 # User Options
 #===============================================================================
 
-KOKKOS_PATH = ${HOME}/repos/perf-port/kokkos
-CUDA_PATH   = /usr/local/cuda-11.6
+KOKKOS_PATH = /gpfs/alpine/csc452/proj-shared/perf_port_builds/kokkos
+CUDA_PATH   = /sw/summit/cuda/11.5.2
 COMPILER    = nvidia
 OPTIMIZE    = yes
 DEBUG       = no
 PROFILE     = no
 MPI         = no
 KOKKOS_DEVICES = OpenMP,Cuda
-#KOKKOS_DEVICES = OpenMP
-KOKKOS_ARCH = Zen3,Ampere86
-#KOKKOS_ARCH = Zen3
-SM_VERSION = 86
+KOKKOS_ARCH = Power9,Volta70
+SM_VERSION = 70
 
 #===============================================================================
 # Program name & source code list

From 1d849f3e2e30a4a3570f25ca04b0e27a8270eb8c Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@udel.edu>
Date: Wed, 12 Jul 2023 13:20:31 -0400
Subject: [PATCH 10/23] Clean some whitespace, typedef unmanaged views for
 brevity

---
 kokkos/GridInit.cpp       | 62 +++++++++++++++------------------------
 kokkos/Main.cpp           | 12 ++++----
 kokkos/XSbench_header.hpp |  6 ++++
 3 files changed, 36 insertions(+), 44 deletions(-)

diff --git a/kokkos/GridInit.cpp b/kokkos/GridInit.cpp
index 32497504..164c1ba0 100644
--- a/kokkos/GridInit.cpp
+++ b/kokkos/GridInit.cpp
@@ -10,21 +10,21 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 	size_t nbytes = 0;
 
 	// Set the initial seed value
-	uint64_t seed = 42;	
+	uint64_t seed = 42;
 
 	////////////////////////////////////////////////////////////////////
 	// Initialize Nuclide Grids
 	////////////////////////////////////////////////////////////////////
-	
+
 	if(mype == 0) printf("Intializing nuclide grids...\n");
 
 	// First, we need to initialize our nuclide grid. This comes in the form
 	// of a flattened 2D array that hold all the information we need to define
-	// the cross sections for all isotopes in the simulation. 
+	// the cross sections for all isotopes in the simulation.
 	// The grid is composed of "NuclideGridPoint" structures, which hold the
 	// energy level of the grid point and all associated XS data at that level.
 	// An array of structures (AOS) is used instead of
-	// a structure of arrays, as the grid points themselves are accessed in 
+	// a structure of arrays, as the grid points themselves are accessed in
 	// a random order, but all cross section interaction channels and the
 	// energy level are read whenever the gridpoint is accessed, meaning the
 	// AOS is more cache efficient.
@@ -47,7 +47,7 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 	// Sort so that each nuclide has data stored in ascending energy order.
 	for( int i = 0; i < in.n_isotopes; i++ )
 		qsort( &SD.nuclide_grid[i*in.n_gridpoints], in.n_gridpoints, sizeof(NuclideGridPoint), NGP_compare);
-	
+
 	// error debug check
 	/*
 	for( int i = 0; i < in.n_isotopes; i++ )
@@ -57,18 +57,18 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 			printf("E%d = %lf\n", j, SD.nuclide_grid[i * in.n_gridpoints + j].energy);
 	}
 	*/
-	
+
 
 	////////////////////////////////////////////////////////////////////
 	// Initialize Acceleration Structure
 	////////////////////////////////////////////////////////////////////
-	
+
 	if( in.grid_type == NUCLIDE )
 	{
 		SD.length_unionized_energy_array = 0;
 		SD.length_index_grid = 0;
 	}
-	
+
 	if( in.grid_type == UNIONIZED )
 	{
 		if(mype == 0) printf("Intializing unionized grid...\n");
@@ -114,7 +114,7 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 				{
 					idx_low[i]++;
 					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
-					energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + idx_low[i] + 1].energy;	
+					energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + idx_low[i] + 1].energy;
 				}
 			}
 		}
@@ -128,7 +128,7 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 		if(mype == 0) printf("Intializing hash grid...\n");
 		SD.length_unionized_energy_array = 0;
 		SD.length_index_grid  = in.hash_bins * in.n_isotopes;
-		SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int)); 
+		SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
 		assert(SD.index_grid != NULL);
 		nbytes += SD.length_index_grid * sizeof(int);
 
@@ -152,7 +152,7 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 	// Initialize Materials and Concentrations
 	////////////////////////////////////////////////////////////////////
 	if(mype == 0) printf("Intializing material data...\n");
-	
+
 	// Set the number of nuclides in each material
 	SD.num_nucs  = load_num_nucs(in.n_isotopes);
 	SD.length_num_nucs = 12; // There are always 12 materials in XSBench
@@ -173,49 +173,35 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 
 	int length_max_num_nucs = 1;
 
-	Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-			u_max_num_nucs(&SD.max_num_nucs, 1);
+	UIntView u_max_num_nucs(&SD.max_num_nucs, 1);
         SD.d_max_num_nucs = new IntView("d_max_num_nucs", length_max_num_nucs);
         Kokkos::deep_copy(*SD.d_max_num_nucs, u_max_num_nucs);
 
-        Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-			u_num_nucs(SD.num_nucs, SD.length_num_nucs);
+        UIntView u_num_nucs(SD.num_nucs, SD.length_num_nucs);
         SD.d_num_nucs = new IntView("d_num_nucs", SD.length_num_nucs);
         Kokkos::deep_copy(*SD.d_num_nucs, u_num_nucs);
-        
-        Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::HostSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-			u_concs(SD.concs, SD.length_concs);
+
+        UDoubleView u_concs(SD.concs, SD.length_concs);
         SD.d_concs = new DoubleView("d_concs", SD.length_concs);
         Kokkos::deep_copy(*SD.d_concs, u_concs);
-        
-        Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-			u_mats(SD.mats, SD.length_mats);
+
+        UIntView u_mats(SD.mats, SD.length_mats);
         SD.d_mats = new IntView("d_mats", SD.length_mats);
         Kokkos::deep_copy(*SD.d_mats, u_mats);
-        
-        Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::HostSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-			u_unionized_energy_array(SD.unionized_energy_array, SD.length_unionized_energy_array);
+
+        UDoubleView u_unionized_energy_array(SD.unionized_energy_array, SD.length_unionized_energy_array);
         SD.d_unionized_energy_array = new DoubleView("d_unionized_energy_array",
 						     SD.length_unionized_energy_array);
         Kokkos::deep_copy(*SD.d_unionized_energy_array, u_unionized_energy_array);
-        
-        Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-            u_index_grid(SD.index_grid, SD.length_index_grid);
+
+        UIntView u_index_grid(SD.index_grid, SD.length_index_grid);
         SD.d_index_grid = new IntView("d_index_grid", SD.length_index_grid);
         Kokkos::deep_copy(*SD.d_index_grid, u_index_grid);
-        
-        Kokkos::View<NuclideGridPoint*, Kokkos::LayoutLeft, Kokkos::HostSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-            u_nuclide_grid(SD.nuclide_grid, SD.length_nuclide_grid);
+
+        UPointView u_nuclide_grid(SD.nuclide_grid, SD.length_nuclide_grid);
         SD.d_nuclide_grid = new PointView("d_nuclide_grid", SD.length_nuclide_grid);
         Kokkos::deep_copy(*SD.d_nuclide_grid, u_nuclide_grid);
-        
+
 	if(mype == 0) printf("Intialization complete. Allocated %.0lf MB of data.\n", nbytes/1024.0/1024.0 );
 
 	return SD;
diff --git a/kokkos/Main.cpp b/kokkos/Main.cpp
index ddc13d23..d208df3c 100644
--- a/kokkos/Main.cpp
+++ b/kokkos/Main.cpp
@@ -30,7 +30,7 @@ int main( int argc, char* argv[] )
 	Inputs in = read_CLI( argc, argv );
 
 	// Set number of OpenMP Threads
-	//omp_set_num_threads(in.nthreads); 
+	//omp_set_num_threads(in.nthreads);
 
 	// Print-out of Input Summary
 	if( mype == 0 )
@@ -41,7 +41,7 @@ int main( int argc, char* argv[] )
 	// This is not reflective of a real Monte Carlo simulation workload,
 	// therefore, do not profile this region!
 	// =====================================================================
-	
+
 	SimulationData SD;
 
 	// If read from file mode is selected, skip initialization and load
@@ -59,7 +59,7 @@ int main( int argc, char* argv[] )
 
 	// =====================================================================
 	// Cross Section (XS) Parallel Lookup Simulation
-	// This is the section that should be profiled, as it reflects a 
+	// This is the section that should be profiled, as it reflects a
 	// realistic continuous energy Monte Carlo macroscopic cross section
 	// lookup kernel.
 	// =====================================================================
@@ -92,8 +92,8 @@ int main( int argc, char* argv[] )
 		exit(1);
 	}
 
-	if( mype == 0)	
-	{	
+	if( mype == 0)
+	{
 		printf("\n" );
 		printf("Simulation complete.\n" );
 	}
@@ -112,7 +112,7 @@ int main( int argc, char* argv[] )
 	int is_invalid_result = print_results( in, mype, omp_end-omp_start, nprocs, verification );
 
         Kokkos::finalize();
-        
+
 	#ifdef MPI
 	MPI_Finalize();
 	#endif
diff --git a/kokkos/XSbench_header.hpp b/kokkos/XSbench_header.hpp
index 46dc43d1..62ebda0b 100644
--- a/kokkos/XSbench_header.hpp
+++ b/kokkos/XSbench_header.hpp
@@ -65,6 +65,12 @@ typedef struct{
 typedef Kokkos::View<int*> IntView;
 typedef Kokkos::View<double*> DoubleView;
 typedef Kokkos::View<NuclideGridPoint*> PointView;
+typedef Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+		     Kokkos::MemoryTraits<Kokkos::Unmanaged>> UIntView;
+typedef Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+		     Kokkos::MemoryTraits<Kokkos::Unmanaged>> UDoubleView;
+typedef Kokkos::View<NuclideGridPoint*, Kokkos::LayoutLeft, Kokkos::HostSpace,
+		     Kokkos::MemoryTraits<Kokkos::Unmanaged>> UPointView;
 
 typedef struct{
 	IntView* d_num_nucs;				// Length = length_num_nucs;

From 7bc64e41d328493aa94d6ab7a07cc16355010397 Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@udel.edu>
Date: Wed, 12 Jul 2023 13:30:16 -0400
Subject: [PATCH 11/23] Modify timers to align workload in Kokkos

---
 kokkos/GridInit.cpp       |  31 ----------
 kokkos/Main.cpp           |   7 +--
 kokkos/Simulation.cpp     | 119 ++++++++++++++++++++++++--------------
 kokkos/XSbench_header.hpp |   2 +-
 4 files changed, 79 insertions(+), 80 deletions(-)

diff --git a/kokkos/GridInit.cpp b/kokkos/GridInit.cpp
index 164c1ba0..c9515756 100644
--- a/kokkos/GridInit.cpp
+++ b/kokkos/GridInit.cpp
@@ -171,37 +171,6 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 	SD.concs = load_concs(SD.num_nucs, SD.max_num_nucs);
 	SD.length_concs = SD.length_mats;
 
-	int length_max_num_nucs = 1;
-
-	UIntView u_max_num_nucs(&SD.max_num_nucs, 1);
-        SD.d_max_num_nucs = new IntView("d_max_num_nucs", length_max_num_nucs);
-        Kokkos::deep_copy(*SD.d_max_num_nucs, u_max_num_nucs);
-
-        UIntView u_num_nucs(SD.num_nucs, SD.length_num_nucs);
-        SD.d_num_nucs = new IntView("d_num_nucs", SD.length_num_nucs);
-        Kokkos::deep_copy(*SD.d_num_nucs, u_num_nucs);
-
-        UDoubleView u_concs(SD.concs, SD.length_concs);
-        SD.d_concs = new DoubleView("d_concs", SD.length_concs);
-        Kokkos::deep_copy(*SD.d_concs, u_concs);
-
-        UIntView u_mats(SD.mats, SD.length_mats);
-        SD.d_mats = new IntView("d_mats", SD.length_mats);
-        Kokkos::deep_copy(*SD.d_mats, u_mats);
-
-        UDoubleView u_unionized_energy_array(SD.unionized_energy_array, SD.length_unionized_energy_array);
-        SD.d_unionized_energy_array = new DoubleView("d_unionized_energy_array",
-						     SD.length_unionized_energy_array);
-        Kokkos::deep_copy(*SD.d_unionized_energy_array, u_unionized_energy_array);
-
-        UIntView u_index_grid(SD.index_grid, SD.length_index_grid);
-        SD.d_index_grid = new IntView("d_index_grid", SD.length_index_grid);
-        Kokkos::deep_copy(*SD.d_index_grid, u_index_grid);
-
-        UPointView u_nuclide_grid(SD.nuclide_grid, SD.length_nuclide_grid);
-        SD.d_nuclide_grid = new PointView("d_nuclide_grid", SD.length_nuclide_grid);
-        Kokkos::deep_copy(*SD.d_nuclide_grid, u_nuclide_grid);
-
 	if(mype == 0) printf("Intialization complete. Allocated %.0lf MB of data.\n", nbytes/1024.0/1024.0 );
 
 	return SD;
diff --git a/kokkos/Main.cpp b/kokkos/Main.cpp
index d208df3c..860eea08 100644
--- a/kokkos/Main.cpp
+++ b/kokkos/Main.cpp
@@ -79,7 +79,7 @@ int main( int argc, char* argv[] )
 	if( in.simulation_method == EVENT_BASED )
 	{
 		if( in.kernel_id == 0 )
-			verification = run_event_based_simulation(in, SD, mype);
+			verification = run_event_based_simulation(in, SD, mype, &omp_end);
 		else
 		{
 			printf("Error: No kernel ID %d found!\n", in.kernel_id);
@@ -88,7 +88,7 @@ int main( int argc, char* argv[] )
 	}
 	else
 	{
-		printf("History-based simulation not implemented in OpenMP offload code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
+		printf("History-based simulation not implemented in Kokkos code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
 		exit(1);
 	}
 
@@ -98,9 +98,6 @@ int main( int argc, char* argv[] )
 		printf("Simulation complete.\n" );
 	}
 
-	// End Simulation Timer
-	omp_end = omp_get_wtime();
-
 	// =====================================================================
 	// Output Results & Finalize
 	// =====================================================================
diff --git a/kokkos/Simulation.cpp b/kokkos/Simulation.cpp
index 437bbda0..87d0329e 100644
--- a/kokkos/Simulation.cpp
+++ b/kokkos/Simulation.cpp
@@ -12,11 +12,11 @@
 // are not yet implemented for this OpenMP targeting offload port.
 ////////////////////////////////////////////////////////////////////////////////////
 
-unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int mype)
+unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int mype, double* end)
 {
-	if( mype == 0)	
+	if( mype == 0)
 		printf("Beginning event based simulation...\n");
-	
+
 	////////////////////////////////////////////////////////////////////////////////
 	// SUMMARY: Simulation Data Structure Manifest for "SD" Object
 	// Here we list all heap arrays (and lengths) in SD that would need to be
@@ -28,7 +28,7 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	// double * unionized_energy_array;    // Length = length_unionized_energy_array
 	// int * index_grid;                   // Length = length_index_grid
 	// NuclideGridPoint * nuclide_grid;    // Length = length_nuclide_grid
-	// 
+	//
 	// Note: "unionized_energy_array" and "index_grid" can be of zero length
 	//        depending on lookup method.
 	//
@@ -36,15 +36,37 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	//       number of bytes.
 	////////////////////////////////////////////////////////////////////////////////
 
+	// Data movment and setup
+	int length_max_num_nucs = 1;
 
-	////////////////////////////////////////////////////////////////////////////////
-	// Begin Actual Simulation Loop 
-	////////////////////////////////////////////////////////////////////////////////
-	//unsigned long long * verification = (unsigned long long *) malloc(in.lookups * sizeof(unsigned long long));
-	Kokkos::View<unsigned long long*> d_verification("d_ver", in.lookups);
-	Kokkos::View<unsigned long long*>::HostMirror verification =
-			Kokkos::create_mirror_view(d_verification);
-	Kokkos::deep_copy(d_verification, verification);
+	UIntView u_max_num_nucs(&SD.max_num_nucs, 1);
+        SD.d_max_num_nucs = new IntView("d_max_num_nucs", length_max_num_nucs);
+        Kokkos::deep_copy(*SD.d_max_num_nucs, u_max_num_nucs);
+
+        UIntView u_num_nucs(SD.num_nucs, SD.length_num_nucs);
+        SD.d_num_nucs = new IntView("d_num_nucs", SD.length_num_nucs);
+        Kokkos::deep_copy(*SD.d_num_nucs, u_num_nucs);
+
+        UDoubleView u_concs(SD.concs, SD.length_concs);
+        SD.d_concs = new DoubleView("d_concs", SD.length_concs);
+        Kokkos::deep_copy(*SD.d_concs, u_concs);
+
+        UIntView u_mats(SD.mats, SD.length_mats);
+        SD.d_mats = new IntView("d_mats", SD.length_mats);
+        Kokkos::deep_copy(*SD.d_mats, u_mats);
+
+        UDoubleView u_unionized_energy_array(SD.unionized_energy_array, SD.length_unionized_energy_array);
+        SD.d_unionized_energy_array = new DoubleView("d_unionized_energy_array",
+						     SD.length_unionized_energy_array);
+        Kokkos::deep_copy(*SD.d_unionized_energy_array, u_unionized_energy_array);
+
+        UIntView u_index_grid(SD.index_grid, SD.length_index_grid);
+        SD.d_index_grid = new IntView("d_index_grid", SD.length_index_grid);
+        Kokkos::deep_copy(*SD.d_index_grid, u_index_grid);
+
+        UPointView u_nuclide_grid(SD.nuclide_grid, SD.length_nuclide_grid);
+        SD.d_nuclide_grid = new PointView("d_nuclide_grid", SD.length_nuclide_grid);
+        Kokkos::deep_copy(*SD.d_nuclide_grid, u_nuclide_grid);
 
 	IntView num_nucs(*SD.d_num_nucs);
 	DoubleView concs(*SD.d_concs);
@@ -53,25 +75,34 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	IntView index_grid(*SD.d_index_grid);
 	PointView nuclide_grid(*SD.d_nuclide_grid);
 	IntView max_num_nucs(*SD.d_max_num_nucs);
-	
+
+	////////////////////////////////////////////////////////////////////////////////
+	// Begin Actual Simulation Loop
+	////////////////////////////////////////////////////////////////////////////////
+	//unsigned long long * verification = (unsigned long long *) malloc(in.lookups * sizeof(unsigned long long));
+	Kokkos::View<unsigned long long*> d_verification("d_ver", in.lookups);
+	Kokkos::View<unsigned long long*>::HostMirror verification =
+			Kokkos::create_mirror_view(d_verification);
+	Kokkos::deep_copy(d_verification, verification);
+
 	Kokkos::parallel_for("Simulation",
 			     Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0, in.lookups),
 			     KOKKOS_LAMBDA (int i) {
 		// Set the initial seed value
-		uint64_t seed = STARTING_SEED;	
+		uint64_t seed = STARTING_SEED;
 
 		// Forward seed to lookup index (we need 2 samples per lookup)
 		seed = fast_forward_LCG(seed, 2*i);
 
 		// Randomly pick an energy and material for the particle
 		double p_energy = LCG_random_double(&seed);
-		int mat         = pick_mat(&seed); 
+		int mat         = pick_mat(&seed);
 
 		// debugging
 		//printf("E = %lf mat = %d\n", p_energy, mat);
 
 		double macro_xs_vector[5] = {0};
-		
+
 		// Perform macroscopic Cross Section Lookup
 		calculate_macro_xs(
 				p_energy,        // Sampled neutron energy (in lethargy)
@@ -113,12 +144,15 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 
 	Kokkos::deep_copy(verification, d_verification);
 	Kokkos::fence();
-		
+
+	// End Simulation Timer
+	*end = omp_get_wtime();
+
 	// Reduce validation hash on the host
 	unsigned long long validation_hash = 0;
 	for( int i = 0; i < in.lookups; i++ )
 		validation_hash += verification(i);
-	
+
 	return validation_hash;
 }
 
@@ -199,25 +233,25 @@ void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
 
 	high_i = low_i + 1;
 	high = &nuclide_grids(high_i);
-	
+
 	// calculate the re-useable interpolation factor
 	f = (high->energy - p_energy) / (high->energy - low->energy);
 
 	// Total XS
 	xs_vector[0] = high->total_xs - f * (high->total_xs - low->total_xs);
-	
+
 	// Elastic XS
 	xs_vector[1] = high->elastic_xs - f * (high->elastic_xs - low->elastic_xs);
-	
+
 	// Absorbtion XS
 	xs_vector[2] = high->absorbtion_xs - f * (high->absorbtion_xs - low->absorbtion_xs);
-	
+
 	// Fission XS
 	xs_vector[3] = high->fission_xs - f * (high->fission_xs - low->fission_xs);
-	
+
 	// Nu Fission XS
 	xs_vector[4] = high->nu_fission_xs - f * (high->nu_fission_xs - low->nu_fission_xs);
-	
+
 	//test
 	/*
 	if( omp_get_thread_num() == 0 )
@@ -228,7 +262,7 @@ void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
 		printf("total_xs = %lf\n\n", xs_vector[1]);
 	}
 	*/
-	
+
 }
 
 // Calculates macroscopic cross section based on a given material & energy
@@ -241,7 +275,7 @@ void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
                          IntView  mats,
                          double *  macro_xs_vector, int grid_type, int hash_bins, int max_num_nucs ){
 	int p_nuc; // the nuclide we are looking up
-	long idx = -1;	
+	long idx = -1;
 	double conc; // the concentration of the nuclide in the material
 
 	// cleans out macro_xs_vector
@@ -254,13 +288,13 @@ void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
 	// done inside of the "calculate_micro_xs" function for each different
 	// nuclide in the material.
 	if( grid_type == UNIONIZED )
-		idx = grid_search( n_isotopes * n_gridpoints, p_energy, egrid, 0);	
+		idx = grid_search( n_isotopes * n_gridpoints, p_energy, egrid, 0);
 	else if( grid_type == HASH )
 	{
 		double du = 1.0 / hash_bins;
 		idx = p_energy / du;
 	}
-	
+
 	// Once we find the pointer array on the UEG, we can pull the data
 	// from the respective nuclide grids, as well as the nuclide
 	// concentration data for the material
@@ -282,7 +316,7 @@ void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
 		for( int k = 0; k < 5; k++ )
 			macro_xs_vector[k] += xs_vector[k] * conc;
 	}
-	
+
 	//test
 	/*
 	for( int k = 0; k < 5; k++ )
@@ -305,15 +339,15 @@ long grid_search( long n, double quarry, DoubleView A, long off)
 	while( length > 1 )
 	{
 		examinationPoint = lowerLimit + ( length / 2 );
-		
+
 		if( A(examinationPoint + off) > quarry )
 			upperLimit = examinationPoint;
 		else
 			lowerLimit = examinationPoint;
-		
+
 		length = upperLimit - lowerLimit;
 	}
-	
+
 	return lowerLimit;
 }
 
@@ -329,15 +363,15 @@ long grid_search_nuclide( long n, double quarry, PointView A, long off, long low
 	while( length > 1 )
 	{
 		examinationPoint = lowerLimit + ( length / 2 );
-		
+
 		if( A(examinationPoint + off).energy > quarry )
 			upperLimit = examinationPoint;
 		else
 			lowerLimit = examinationPoint;
-		
+
 		length = upperLimit - lowerLimit;
 	}
-	
+
 	return lowerLimit;
 }
 
@@ -351,15 +385,15 @@ long grid_search_nuclide_old( long n, double quarry, NuclideGridPoint* A, long l
 	while( length > 1 )
 	{
 		examinationPoint = lowerLimit + ( length / 2 );
-		
+
 		if( A[examinationPoint].energy > quarry )
 			upperLimit = examinationPoint;
 		else
 			lowerLimit = examinationPoint;
-		
+
 		length = upperLimit - lowerLimit;
 	}
-	
+
 	return lowerLimit;
 }
 
@@ -369,11 +403,11 @@ KOKKOS_INLINE_FUNCTION
 int pick_mat( uint64_t * seed )
 {
 	// I have a nice spreadsheet supporting these numbers. They are
-	// the fractions (by volume) of material in the core. Not a 
+	// the fractions (by volume) of material in the core. Not a
 	// *perfect* approximation of where XS lookups are going to occur,
 	// but this will do a good job of biasing the system nonetheless.
 
-	// Also could be argued that doing fractions by weight would be 
+	// Also could be argued that doing fractions by weight would be
 	// a better approximation, but volume does a good enough job for now.
 
 	double dist[12];
@@ -389,7 +423,7 @@ int pick_mat( uint64_t * seed )
 	dist[9]  = 0.015;	// top nozzle
 	dist[10] = 0.025;	// top of fuel assemblies
 	dist[11] = 0.013;	// bottom of fuel assemblies
-	
+
 	double roll = LCG_random_double(seed);
 
 	// makes a pick based on the distro
@@ -432,7 +466,7 @@ uint64_t fast_forward_LCG(uint64_t seed, uint64_t n)
 	uint64_t a_new = 1;
 	uint64_t c_new = 0;
 
-	while(n > 0) 
+	while(n > 0)
 	{
 		if(n & 1)
 		{
@@ -448,4 +482,3 @@ uint64_t fast_forward_LCG(uint64_t seed, uint64_t n)
 	return (a_new * seed + c_new) % m;
 
 }
-
diff --git a/kokkos/XSbench_header.hpp b/kokkos/XSbench_header.hpp
index 62ebda0b..eef51284 100644
--- a/kokkos/XSbench_header.hpp
+++ b/kokkos/XSbench_header.hpp
@@ -112,7 +112,7 @@ void binary_write( Inputs in, SimulationData SD );
 SimulationData binary_read( Inputs in );
 
 // Simulation.c
-unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int mype);
+unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int mype, double* end);
 unsigned long long run_history_based_simulation(Inputs in, SimulationData SD, int mype);
 KOKKOS_INLINE_FUNCTION
 void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,

From 903f884e9d3a99d925f2ed24139f55072dd80848 Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@udel.edu>
Date: Wed, 9 Aug 2023 08:43:52 -0700
Subject: [PATCH 12/23] Some makefile changes for Kokkos

---
 kokkos/Makefile | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/kokkos/Makefile b/kokkos/Makefile
index 7668679e..f839664b 100644
--- a/kokkos/Makefile
+++ b/kokkos/Makefile
@@ -2,16 +2,16 @@
 # User Options
 #===============================================================================
 
-KOKKOS_PATH = /gpfs/alpine/csc452/proj-shared/perf_port_builds/kokkos
-CUDA_PATH   = /sw/summit/cuda/11.5.2
-COMPILER    = nvidia
-OPTIMIZE    = yes
-DEBUG       = no
-PROFILE     = no
-MPI         = no
-KOKKOS_DEVICES = OpenMP,Cuda
-KOKKOS_ARCH = Power9,Volta70
-SM_VERSION = 70
+KOKKOS_PATH ?= /global/homes/j/jhdavis/XSBench/kokkos/kokkos
+CUDA_PATH   ?= ${CUDA_HOME}
+COMPILER    ?= gnu
+OPTIMIZE    ?= yes
+DEBUG       ?= no
+PROFILE     ?= no
+MPI         ?= no
+KOKKOS_DEVICES ?= OpenMP,Cuda
+KOKKOS_ARCH ?= Zen3,Ampere80
+SM_VERSION ?= 80
 
 #===============================================================================
 # Program name & source code list
@@ -42,7 +42,7 @@ LDFLAGS = -lm
 # NVIDIA Compiler
 ifeq ($(COMPILER),nvidia)
   CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
-  CXXFLAGS += -Xcompiler -Wall -Xcompiler -O3 -arch=sm_$(SM_VERSION)
+  CXXFLAGS += -ccbin nvc++ -Xcompiler -Wall -Xcompiler -O3 -arch=sm_$(SM_VERSION)
 endif
 
 # Clang Compiler
@@ -53,8 +53,13 @@ endif
 
 # GCC Compiler
 ifeq ($(COMPILER),gnu)
-  CXX = g++
-  CXXFLAGS += -flto -fopenmp -DOPENMP
+  ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+    CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+    CXXFLAGS += -ccbin g++ -Xcompiler -Wall -Xcompiler -O3 -arch=sm_$(SM_VERSION) -Xcompiler -fopenmp -DOPENMP
+  else
+    CXX = g++
+    CXXFLAGS += -flto -fopenmp -DOPENMP
+  endif
 endif
 
 

From 3eeb92f794148ac46d8f900a1ccea47fe7af6a7d Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Tue, 15 Aug 2023 18:18:21 +0530
Subject: [PATCH 13/23] feat: add cmake for kokkos port

---
 kokkos/CMakeLists.txt | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 kokkos/CMakeLists.txt

diff --git a/kokkos/CMakeLists.txt b/kokkos/CMakeLists.txt
new file mode 100644
index 00000000..4dabf3fd
--- /dev/null
+++ b/kokkos/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.16)
+
+project(
+    XSBench_Kokkos
+    VERSION 1.0
+    LANGUAGES CXX
+)
+
+set(CMAKE_CXX_STANDARD 17)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# set(CMAKE_CXX_FLAGS "-Wall -Wextra")
+set(CMAKE_CXX_FLAGS_DEBUG "-g")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+
+find_package(Kokkos REQUIRED)
+
+set(SOURCE Main.cpp io.cpp Simulation.cpp GridInit.cpp XSutils.cpp Materials.cpp)
+
+add_executable(xsbench ${SOURCE})
+target_link_libraries(xsbench Kokkos::kokkos)

From c93442e289d8f1ea6e94b357ded4fd1ac9d6bb8d Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+PerryStyle@users.noreply.github.com>
Date: Tue, 15 Aug 2023 05:56:46 -0700
Subject: [PATCH 14/23] refactor: rename xsbench excutable to XSBench

---
 kokkos/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kokkos/CMakeLists.txt b/kokkos/CMakeLists.txt
index 4dabf3fd..6810c088 100644
--- a/kokkos/CMakeLists.txt
+++ b/kokkos/CMakeLists.txt
@@ -12,7 +12,7 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release)
 endif()
 
-# set(CMAKE_CXX_FLAGS "-Wall -Wextra")
+set(CMAKE_CXX_FLAGS "-Wall -Wextra")
 set(CMAKE_CXX_FLAGS_DEBUG "-g")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 
@@ -20,5 +20,5 @@ find_package(Kokkos REQUIRED)
 
 set(SOURCE Main.cpp io.cpp Simulation.cpp GridInit.cpp XSutils.cpp Materials.cpp)
 
-add_executable(xsbench ${SOURCE})
-target_link_libraries(xsbench Kokkos::kokkos)
+add_executable(XSBench ${SOURCE})
+target_link_libraries(XSBench Kokkos::kokkos)

From 29e08a735e4504d3eb00669303d8c6429724f189 Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Tue, 15 Aug 2023 18:59:41 +0530
Subject: [PATCH 15/23] refactor: rename *.hip to *.cpp

Building XSBench with HIP using the CUDA backend does not recognize
the file type .hip. Need to change to .cpp to compile.

From 30666713dd40b69639fcc32ae2ca059b38d03191 Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@udel.edu>
Date: Wed, 23 Aug 2023 15:54:44 -0400
Subject: [PATCH 16/23] Add missing install description for kokkos CMake

---
 kokkos/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kokkos/CMakeLists.txt b/kokkos/CMakeLists.txt
index 6810c088..50370f37 100644
--- a/kokkos/CMakeLists.txt
+++ b/kokkos/CMakeLists.txt
@@ -22,3 +22,5 @@ set(SOURCE Main.cpp io.cpp Simulation.cpp GridInit.cpp XSutils.cpp Materials.cpp
 
 add_executable(XSBench ${SOURCE})
 target_link_libraries(XSBench Kokkos::kokkos)
+
+install(TARGETS XSBench DESTINATION PROJECT_BINARY_DIR)

From cacfead7f4d480f81e60b34ac7414b945037210c Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@udel.edu>
Date: Wed, 23 Aug 2023 16:01:14 -0400
Subject: [PATCH 17/23] Adjust install dir to fix spack compatibility

---
 kokkos/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kokkos/CMakeLists.txt b/kokkos/CMakeLists.txt
index 50370f37..d9c6affd 100644
--- a/kokkos/CMakeLists.txt
+++ b/kokkos/CMakeLists.txt
@@ -23,4 +23,4 @@ set(SOURCE Main.cpp io.cpp Simulation.cpp GridInit.cpp XSutils.cpp Materials.cpp
 add_executable(XSBench ${SOURCE})
 target_link_libraries(XSBench Kokkos::kokkos)
 
-install(TARGETS XSBench DESTINATION PROJECT_BINARY_DIR)
+install(TARGETS XSBench DESTINATION bin)

From 8239f53673a09f0bbe4ee7856bf303b5170cc260 Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Sun, 24 Sep 2023 14:49:43 -0400
Subject: [PATCH 18/23] refactor: use Kokkos timer instead of openmp to remove
 openmp dep

---
 kokkos/Main.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kokkos/Main.cpp b/kokkos/Main.cpp
index 860eea08..e2e5fe41 100644
--- a/kokkos/Main.cpp
+++ b/kokkos/Main.cpp
@@ -12,7 +12,6 @@ int main( int argc, char* argv[] )
 	// =====================================================================
 	int version = 20;
 	int mype = 0;
-	double omp_start, omp_end;
 	int nprocs = 1;
 	unsigned long long verification;
 
@@ -73,7 +72,8 @@ int main( int argc, char* argv[] )
 	}
 
 	// Start Simulation Timer
-	omp_start = omp_get_wtime();
+    Kokkos::Timer start;
+	start.reset()
 
 	// Run simulation
 	if( in.simulation_method == EVENT_BASED )
@@ -106,7 +106,7 @@ int main( int argc, char* argv[] )
 	verification = verification % 999983;
 
 	// Print / Save Results and Exit
-	int is_invalid_result = print_results( in, mype, omp_end-omp_start, nprocs, verification );
+	int is_invalid_result = print_results( in, mype, start.seconds(), nprocs, verification );
 
         Kokkos::finalize();
 

From 7e6161f699b3d4014606832dc571e9b148f6bbe9 Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Sun, 24 Sep 2023 14:58:54 -0400
Subject: [PATCH 19/23] fix: change where timer starts

---
 kokkos/Main.cpp       | 10 +++++-----
 kokkos/Simulation.cpp |  6 ++++--
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/kokkos/Main.cpp b/kokkos/Main.cpp
index e2e5fe41..a8caa9d1 100644
--- a/kokkos/Main.cpp
+++ b/kokkos/Main.cpp
@@ -72,14 +72,14 @@ int main( int argc, char* argv[] )
 	}
 
 	// Start Simulation Timer
-    Kokkos::Timer start;
-	start.reset()
-
+    
+    double elapsed_time = 0;
+    
 	// Run simulation
 	if( in.simulation_method == EVENT_BASED )
 	{
 		if( in.kernel_id == 0 )
-			verification = run_event_based_simulation(in, SD, mype, &omp_end);
+			verification = run_event_based_simulation(in, SD, mype, &elapsed_time);
 		else
 		{
 			printf("Error: No kernel ID %d found!\n", in.kernel_id);
@@ -106,7 +106,7 @@ int main( int argc, char* argv[] )
 	verification = verification % 999983;
 
 	// Print / Save Results and Exit
-	int is_invalid_result = print_results( in, mype, start.seconds(), nprocs, verification );
+	int is_invalid_result = print_results( in, mype, elapsed_time, nprocs, verification );
 
         Kokkos::finalize();
 
diff --git a/kokkos/Simulation.cpp b/kokkos/Simulation.cpp
index 87d0329e..6ac060ab 100644
--- a/kokkos/Simulation.cpp
+++ b/kokkos/Simulation.cpp
@@ -16,7 +16,6 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 {
 	if( mype == 0)
 		printf("Beginning event based simulation...\n");
-
 	////////////////////////////////////////////////////////////////////////////////
 	// SUMMARY: Simulation Data Structure Manifest for "SD" Object
 	// Here we list all heap arrays (and lengths) in SD that would need to be
@@ -39,6 +38,9 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	// Data movment and setup
 	int length_max_num_nucs = 1;
 
+    Kokkos::Timer start;
+    start.reset();
+
 	UIntView u_max_num_nucs(&SD.max_num_nucs, 1);
         SD.d_max_num_nucs = new IntView("d_max_num_nucs", length_max_num_nucs);
         Kokkos::deep_copy(*SD.d_max_num_nucs, u_max_num_nucs);
@@ -146,7 +148,7 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	Kokkos::fence();
 
 	// End Simulation Timer
-	*end = omp_get_wtime();
+	*end = start.seconds();
 
 	// Reduce validation hash on the host
 	unsigned long long validation_hash = 0;

From 275174bc8bc3b8cc1554dc6185963061b66d8ede Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Sun, 24 Sep 2023 15:02:39 -0400
Subject: [PATCH 20/23] fix: set default thread count to 1

---
 kokkos/io.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kokkos/io.cpp b/kokkos/io.cpp
index 2bd4b5d8..5447f167 100644
--- a/kokkos/io.cpp
+++ b/kokkos/io.cpp
@@ -230,7 +230,7 @@ Inputs read_CLI( int argc, char * argv[] )
 	input.simulation_method = HISTORY_BASED;
 	
 	// defaults to max threads on the system	
-	input.nthreads = omp_get_num_procs();
+	input.nthreads = 1;
 	
 	// defaults to 355 (corresponding to H-M Large benchmark)
 	input.n_isotopes = 355;

From 87c10de1f5344cf1816989dc26ea2799169ad125 Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Thu, 19 Oct 2023 16:06:43 -0400
Subject: [PATCH 21/23] refactor: adjust KOKKOS_PATH in Makefile

---
 kokkos/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kokkos/Makefile b/kokkos/Makefile
index f839664b..24d31a31 100644
--- a/kokkos/Makefile
+++ b/kokkos/Makefile
@@ -2,8 +2,8 @@
 # User Options
 #===============================================================================
 
-KOKKOS_PATH ?= /global/homes/j/jhdavis/XSBench/kokkos/kokkos
-CUDA_PATH   ?= ${CUDA_HOME}
+MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST))))
+KOKKOS_PATH ?= $(MAKEFILE_PATH)kokkos
 COMPILER    ?= gnu
 OPTIMIZE    ?= yes
 DEBUG       ?= no

From e0b7bffebe7b3ee707a7c4ef330f57dfe280d667 Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Thu, 19 Oct 2023 16:09:56 -0400
Subject: [PATCH 22/23] fix: adjust timer start and end

---
 kokkos/Simulation.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kokkos/Simulation.cpp b/kokkos/Simulation.cpp
index 6ac060ab..45e09c48 100644
--- a/kokkos/Simulation.cpp
+++ b/kokkos/Simulation.cpp
@@ -38,9 +38,6 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	// Data movment and setup
 	int length_max_num_nucs = 1;
 
-    Kokkos::Timer start;
-    start.reset();
-
 	UIntView u_max_num_nucs(&SD.max_num_nucs, 1);
         SD.d_max_num_nucs = new IntView("d_max_num_nucs", length_max_num_nucs);
         Kokkos::deep_copy(*SD.d_max_num_nucs, u_max_num_nucs);
@@ -87,6 +84,9 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 			Kokkos::create_mirror_view(d_verification);
 	Kokkos::deep_copy(d_verification, verification);
 
+    Kokkos::Timer start;
+    start.reset();
+
 	Kokkos::parallel_for("Simulation",
 			     Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0, in.lookups),
 			     KOKKOS_LAMBDA (int i) {
@@ -148,13 +148,14 @@ unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int
 	Kokkos::fence();
 
 	// End Simulation Timer
-	*end = start.seconds();
 
 	// Reduce validation hash on the host
 	unsigned long long validation_hash = 0;
 	for( int i = 0; i < in.lookups; i++ )
 		validation_hash += verification(i);
 
+    *end = start.seconds();
+
 	return validation_hash;
 }
 

From 3aa0caf25ee3efd6284a0ad55e931188a858ff4a Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Thu, 19 Oct 2023 16:38:24 -0400
Subject: [PATCH 23/23] feat: add kokkos model description to README

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index da71ae6a..afe8f327 100644
--- a/README.md
+++ b/README.md
@@ -50,6 +50,9 @@ This version of XSBench is written in SYCL, and can be used for CPU, GPU, FPGA,
 5. **XSBench/hip**
 This version of XSBench is written in HIP for use with GPU architectures. This version is derived from CUDA using an automatic conversion tool with only a few small manual changes.
 
+6. XSBench/kokkos 
+This version of XSBench is written using the Kokkos programming model allowing it to execute on multiple different GPUs or CPUs. We have provided both Makefile and CMake build system options for convenience
+
 ## Compilation
 
 To compile XSBench with default settings, navigate to your selected source directory and use the following command: