diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml
index f7c660f3bd..bfd1e971f9 100644
--- a/.github/workflows/benchmarks-reusable.yml
+++ b/.github/workflows/benchmarks-reusable.yml
@@ -237,7 +237,7 @@ jobs:
- name: Upload HTML report
if: ${{ always() && inputs.upload_report }}
- uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+ uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
with:
path: ur-repo/benchmark_results.html
key: benchmark-results-${{ matrix.adapter.str_name }}-${{ github.run_id }}
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index b4c40334d4..fbd4ffefef 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -52,7 +52,7 @@ jobs:
- name: Download benchmark HTML
id: download-bench-html
- uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+ uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
with:
path: ur-repo/benchmark_results.html
key: benchmark-results-
diff --git a/Testing/Temporary/CTestCostData.txt b/Testing/Temporary/CTestCostData.txt
new file mode 100644
index 0000000000..ed97d539c0
--- /dev/null
+++ b/Testing/Temporary/CTestCostData.txt
@@ -0,0 +1 @@
+---
diff --git a/ft.sh b/ft.sh
new file mode 100644
index 0000000000..f7f73081f5
--- /dev/null
+++ b/ft.sh
@@ -0,0 +1,21 @@
+# add_test([=[enqueue-adapter_level_zero_v2]=] "/home/mateuszpn/pr2532/build/bin/test-enqueue" "--gtest_filter=*Level_Zero*")
+# set_tests_properties([=[enqueue-adapter_level_zero_v2]=] PROPERTIES ENVIRONMENT
+# "UR_ADAPTERS_FORCE_LOAD=\"/home/mateuszpn/pr2532/build/lib/libur_adapter_level_zero_v2.so.0.12.0\""
+# "LABELS "conformance;adapter_level_zero_v2"
+# WORKING_DIRECTORY "/home/mateuszpn/pr2532/build/test/conformance/enqueue" _BACKTRACE_TRIPLES "/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;22;add_test;/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;32;do_add_test;/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;67;add_test_adapter;/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;78;add_conformance_test;/home/mateuszpn/pr2532/test/conformance/enqueue/CMakeLists.txt;6;add_conformance_test_with_kernels_environment;/home/mateuszpn/pr2532/test/conformance/enqueue/CMakeLists.txt;0;")
+
+# Set environment variable
+export UR_ADAPTERS_FORCE_LOAD="/home/mateuszpn/pr2532/build/lib/libur_adapter_level_zero_v2.so.0.12.0"
+
+# Set working directory
+#cd /home/mateuszpn/pr2532/build/test/conformance/$1
+
+# Run the test with the specified filter
+#/home/mateuszpn/pr2532/build/bin/test-$1 --gtest_filter=*Level_Zero*
+
+# Set working directory
+cd /home/mateuszpn/pr2532/build/test/conformance/enqueue
+
+# Run the test with the specified filter
+#/home/mateuszpn/pr2532/build/bin/test-enqueue --gtest_filter=urEnqueueMemBufferMapTestWithParam.MapSignalEvent*Level_Zero*
+/home/mateuszpn/pr2532/build/bin/test-enqueue --gtest_filter=*Level_Zero*
\ No newline at end of file
diff --git a/scripts/benchmarks/output_markdown.py b/scripts/benchmarks/output_markdown.py
index 13df68d45e..fc3b65507b 100644
--- a/scripts/benchmarks/output_markdown.py
+++ b/scripts/benchmarks/output_markdown.py
@@ -27,7 +27,7 @@ def generate_markdown_details(results: list[Result]):
markdown_sections.append(f"""
-Benchmark details - environment, command, output...
+Benchmark details - environment, command...
""")
for res in results:
@@ -42,9 +42,6 @@ def generate_markdown_details(results: list[Result]):
#### Command:
{' '.join(res.command)}
-#### Output:
-{res.stdout}
-
""")
markdown_sections.append(f"""
diff --git a/scripts/templates/queue_api.cpp.mako b/scripts/templates/queue_api.cpp.mako
index 89f857e007..14def952ac 100644
--- a/scripts/templates/queue_api.cpp.mako
+++ b/scripts/templates/queue_api.cpp.mako
@@ -20,6 +20,8 @@ from templates import helper as th
*
*/
+// Do not edit. This file is auto generated from a template: scripts/templates/queue_api.cpp.mako
+
#include "queue_api.hpp"
#include "ur_util.hpp"
diff --git a/scripts/templates/queue_api.hpp.mako b/scripts/templates/queue_api.hpp.mako
index 352abbeb43..46ed74ed33 100644
--- a/scripts/templates/queue_api.hpp.mako
+++ b/scripts/templates/queue_api.hpp.mako
@@ -20,9 +20,12 @@ from templates import helper as th
*
*/
+// Do not edit. This file is auto generated from a template: scripts/templates/queue_api.hpp.mako
+
#pragma once
#include
+#include
struct ur_queue_handle_t_ {
virtual ~ur_queue_handle_t_();
@@ -32,4 +35,8 @@ struct ur_queue_handle_t_ {
%for obj in th.get_queue_related_functions(specs, n, tags):
virtual ${x}_result_t ${th.transform_queue_related_function_name(n, tags, obj, format=["type"])} = 0;
%endfor
+
+ virtual ur_result_t
+ enqueueCommandBuffer(ze_command_list_handle_t, ur_event_handle_t *,
+ uint32_t, const ur_event_handle_t *) = 0;
};
diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index 05c20a6614..37018dde6c 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -523,7 +523,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
ThreadsPerBlock, BlocksPerGrid));
// Set node param structure with the kernel related data
- auto &ArgIndices = hKernel->getArgIndices();
+ auto &ArgPointers = hKernel->getArgPointers();
CUDA_KERNEL_NODE_PARAMS NodeParams = {};
NodeParams.func = CuFunc;
NodeParams.gridDimX = BlocksPerGrid[0];
@@ -533,7 +533,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
NodeParams.blockDimY = ThreadsPerBlock[1];
NodeParams.blockDimZ = ThreadsPerBlock[2];
NodeParams.sharedMemBytes = LocalSize;
- NodeParams.kernelParams = const_cast(ArgIndices.data());
+ NodeParams.kernelParams = const_cast(ArgPointers.data());
// Create and add an new kernel node to the Cuda graph
UR_CHECK_ERROR(cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph,
@@ -1398,7 +1398,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
Params.blockDimZ = ThreadsPerBlock[2];
Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize();
Params.kernelParams =
- const_cast(KernelCommandHandle->Kernel->getArgIndices().data());
+ const_cast(KernelCommandHandle->Kernel->getArgPointers().data());
CUgraphNode Node = KernelCommandHandle->Node;
CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec;
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 2a4a2cf54f..540ebb86fa 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -492,11 +492,11 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
UR_CHECK_ERROR(RetImplEvent->start());
}
- auto &ArgIndices = hKernel->getArgIndices();
+ auto &ArgPointers = hKernel->getArgPointers();
UR_CHECK_ERROR(cuLaunchKernel(
CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize,
- CuStream, const_cast(ArgIndices.data()), nullptr));
+ CuStream, const_cast(ArgPointers.data()), nullptr));
if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record());
@@ -680,7 +680,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
UR_CHECK_ERROR(RetImplEvent->start());
}
- auto &ArgIndices = hKernel->getArgIndices();
+ auto &ArgPointers = hKernel->getArgPointers();
CUlaunchConfig launch_config;
launch_config.gridDimX = BlocksPerGrid[0];
@@ -696,7 +696,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
launch_config.numAttrs = launch_attribute.size();
UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
- const_cast(ArgIndices.data()),
+ const_cast(ArgPointers.data()),
nullptr));
if (phEvent) {
diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp
index c11a85b293..87570e3b45 100644
--- a/source/adapters/cuda/image.cpp
+++ b/source/adapters/cuda/image.cpp
@@ -533,8 +533,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
image_res_desc.resType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
image_res_desc.res.mipmap.hMipmappedArray = (CUmipmappedArray)hImageMem;
}
- } else if (mem_type == CU_MEMORYTYPE_DEVICE) {
- // We have a USM pointer
+ } else if (mem_type == CU_MEMORYTYPE_DEVICE ||
+ mem_type == CU_MEMORYTYPE_HOST) {
+ // We have a USM pointer.
+ // Images may be created from device or host USM.
if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
image_res_desc.resType = CU_RESOURCE_TYPE_LINEAR;
image_res_desc.res.linear.devPtr = (CUdeviceptr)hImageMem;
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index d1b3b61244..f299714b02 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -66,8 +66,10 @@ struct ur_kernel_handle_t_ {
args_t Storage;
/// Aligned size of each parameter, including padding.
args_size_t ParamSizes;
- /// Byte offset into /p Storage allocation for each parameter.
- args_index_t Indices;
+ /// Byte offset into /p Storage allocation for each argument.
+ args_index_t ArgPointers;
+ /// Position in the Storage array where the next argument should added.
+ size_t InsertPos = 0;
/// Aligned size in bytes for each local memory parameter after padding has
/// been added. Zero if the argument at the index isn't a local memory
/// argument.
@@ -90,33 +92,43 @@ struct ur_kernel_handle_t_ {
std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0};
arguments() {
- // Place the implicit offset index at the end of the indicies collection
- Indices.emplace_back(&ImplicitOffsetArgs);
+ // Place the implicit offset index at the end of the ArgPointers
+ // collection.
+ ArgPointers.emplace_back(&ImplicitOffsetArgs);
}
/// Add an argument to the kernel.
/// If the argument existed before, it is replaced.
/// Otherwise, it is added.
/// Gaps are filled with empty arguments.
- /// Implicit offset argument is kept at the back of the indices collection.
+ /// Implicit offset argument is kept at the back of the ArgPointers
+ /// collection.
void addArg(size_t Index, size_t Size, const void *Arg,
size_t LocalSize = 0) {
- if (Index + 2 > Indices.size()) {
+ // Expand storage to accommodate this Index if needed.
+ if (Index + 2 > ArgPointers.size()) {
// Move implicit offset argument index with the end
- Indices.resize(Index + 2, Indices.back());
+ ArgPointers.resize(Index + 2, ArgPointers.back());
// Ensure enough space for the new argument
ParamSizes.resize(Index + 1);
AlignedLocalMemSize.resize(Index + 1);
OriginalLocalMemSize.resize(Index + 1);
}
- ParamSizes[Index] = Size;
- // calculate the insertion point on the array
- size_t InsertPos = std::accumulate(std::begin(ParamSizes),
- std::begin(ParamSizes) + Index, 0);
- // Update the stored value for the argument
- std::memcpy(&Storage[InsertPos], Arg, Size);
- Indices[Index] = &Storage[InsertPos];
- AlignedLocalMemSize[Index] = LocalSize;
+
+ // Copy new argument to storage if it hasn't been added before.
+ if (ParamSizes[Index] == 0) {
+ ParamSizes[Index] = Size;
+ std::memcpy(&Storage[InsertPos], Arg, Size);
+ ArgPointers[Index] = &Storage[InsertPos];
+ AlignedLocalMemSize[Index] = LocalSize;
+ InsertPos += Size;
+ }
+ // Otherwise, update the existing argument.
+ else {
+ std::memcpy(ArgPointers[Index], Arg, Size);
+ AlignedLocalMemSize[Index] = LocalSize;
+ assert(Size == ParamSizes[Index]);
+ }
}
/// Returns the padded size and offset of a local memory argument.
@@ -128,7 +140,7 @@ struct ur_kernel_handle_t_ {
std::pair calcAlignedLocalArgument(size_t Index,
size_t Size) {
// Store the unpadded size of the local argument
- if (Index + 2 > Indices.size()) {
+ if (Index + 2 > ArgPointers.size()) {
AlignedLocalMemSize.resize(Index + 1);
OriginalLocalMemSize.resize(Index + 1);
}
@@ -158,10 +170,11 @@ struct ur_kernel_handle_t_ {
return std::make_pair(AlignedLocalSize, AlignedLocalOffset);
}
- // Iterate over all existing local argument which follows StartIndex
+ // Iterate over each existing local argument which follows StartIndex
// index, update the offset and pointer into the kernel local memory.
void updateLocalArgOffset(size_t StartIndex) {
- const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg
+ const size_t NumArgs =
+ ArgPointers.size() - 1; // Accounts for implicit arg
for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) {
const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex];
if (OriginalLocalSize == 0) {
@@ -177,10 +190,7 @@ struct ur_kernel_handle_t_ {
AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize;
// Store new offset into local data
- const size_t InsertPos =
- std::accumulate(std::begin(ParamSizes),
- std::begin(ParamSizes) + SuccIndex, size_t{0});
- std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset,
+ std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset,
sizeof(size_t));
}
}
@@ -228,7 +238,7 @@ struct ur_kernel_handle_t_ {
std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
}
- const args_index_t &getIndices() const noexcept { return Indices; }
+ const args_index_t &getArgPointers() const noexcept { return ArgPointers; }
uint32_t getLocalSize() const {
return std::accumulate(std::begin(AlignedLocalMemSize),
@@ -299,7 +309,7 @@ struct ur_kernel_handle_t_ {
/// real one required by the kernel, since this cannot be queried from
/// the CUDA Driver API
uint32_t getNumArgs() const noexcept {
- return static_cast(Args.Indices.size() - 1);
+ return static_cast(Args.ArgPointers.size() - 1);
}
void setKernelArg(int Index, size_t Size, const void *Arg) {
@@ -314,8 +324,8 @@ struct ur_kernel_handle_t_ {
return Args.setImplicitOffset(Size, ImplicitOffset);
}
- const arguments::args_index_t &getArgIndices() const {
- return Args.getIndices();
+ const arguments::args_index_t &getArgPointers() const {
+ return Args.getArgPointers();
}
void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); }
diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp
index 09c59bb9f7..887eb75287 100644
--- a/source/adapters/hip/command_buffer.cpp
+++ b/source/adapters/hip/command_buffer.cpp
@@ -378,7 +378,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));
// Set node param structure with the kernel related data
- auto &ArgIndices = hKernel->getArgIndices();
+ auto &ArgPointers = hKernel->getArgPointers();
hipKernelNodeParams NodeParams;
NodeParams.func = HIPFunc;
NodeParams.gridDim.x = BlocksPerGrid[0];
@@ -388,7 +388,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
NodeParams.blockDim.y = ThreadsPerBlock[1];
NodeParams.blockDim.z = ThreadsPerBlock[2];
NodeParams.sharedMemBytes = LocalSize;
- NodeParams.kernelParams = const_cast(ArgIndices.data());
+ NodeParams.kernelParams = const_cast(ArgPointers.data());
NodeParams.extra = nullptr;
// Create and add an new kernel node to the HIP graph
@@ -1098,7 +1098,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
Params.blockDim.z = ThreadsPerBlock[2];
Params.sharedMemBytes = hCommand->Kernel->getLocalSize();
Params.kernelParams =
- const_cast(hCommand->Kernel->getArgIndices().data());
+ const_cast(hCommand->Kernel->getArgPointers().data());
hipGraphNode_t Node = hCommand->Node;
hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec;
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
index 8c7c1c617d..849369de4b 100644
--- a/source/adapters/hip/enqueue.cpp
+++ b/source/adapters/hip/enqueue.cpp
@@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
}
- auto ArgIndices = hKernel->getArgIndices();
+ auto ArgPointers = hKernel->getArgPointers();
// If migration of mem across buffer is needed, an event must be associated
// with this command, implicitly if phEvent is nullptr
@@ -322,7 +322,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
UR_CHECK_ERROR(hipModuleLaunchKernel(
HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2],
- hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr));
+ hKernel->getLocalSize(), HIPStream, ArgPointers.data(), nullptr));
if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record());
diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp
index c6d30e81ad..5ec51e7fa4 100644
--- a/source/adapters/hip/kernel.hpp
+++ b/source/adapters/hip/kernel.hpp
@@ -61,8 +61,10 @@ struct ur_kernel_handle_t_ {
args_t Storage;
/// Aligned size of each parameter, including padding.
args_size_t ParamSizes;
- /// Byte offset into /p Storage allocation for each parameter.
- args_index_t Indices;
+ /// Byte offset into /p Storage allocation for each argument.
+ args_index_t ArgPointers;
+ /// Position in the Storage array where the next argument should added.
+ size_t InsertPos = 0;
/// Aligned size in bytes for each local memory parameter after padding has
/// been added. Zero if the argument at the index isn't a local memory
/// argument.
@@ -85,32 +87,41 @@ struct ur_kernel_handle_t_ {
arguments() {
// Place the implicit offset index at the end of the indicies collection
- Indices.emplace_back(&ImplicitOffsetArgs);
+ ArgPointers.emplace_back(&ImplicitOffsetArgs);
}
/// Add an argument to the kernel.
/// If the argument existed before, it is replaced.
/// Otherwise, it is added.
/// Gaps are filled with empty arguments.
- /// Implicit offset argument is kept at the back of the indices collection.
+ /// Implicit offset argument is kept at the back of the ArgPointers
+ /// collection.
void addArg(size_t Index, size_t Size, const void *Arg,
size_t LocalSize = 0) {
- if (Index + 2 > Indices.size()) {
- // Move implicit offset argument Index with the end
- Indices.resize(Index + 2, Indices.back());
+ // Expand storage to accommodate this Index if needed.
+ if (Index + 2 > ArgPointers.size()) {
+ // Move implicit offset argument index with the end
+ ArgPointers.resize(Index + 2, ArgPointers.back());
// Ensure enough space for the new argument
ParamSizes.resize(Index + 1);
AlignedLocalMemSize.resize(Index + 1);
OriginalLocalMemSize.resize(Index + 1);
}
- ParamSizes[Index] = Size;
- // calculate the insertion point on the array
- size_t InsertPos = std::accumulate(std::begin(ParamSizes),
- std::begin(ParamSizes) + Index, 0);
- // Update the stored value for the argument
- std::memcpy(&Storage[InsertPos], Arg, Size);
- Indices[Index] = &Storage[InsertPos];
- AlignedLocalMemSize[Index] = LocalSize;
+
+ // Copy new argument to storage if it hasn't been added before.
+ if (ParamSizes[Index] == 0) {
+ ParamSizes[Index] = Size;
+ std::memcpy(&Storage[InsertPos], Arg, Size);
+ ArgPointers[Index] = &Storage[InsertPos];
+ AlignedLocalMemSize[Index] = LocalSize;
+ InsertPos += Size;
+ }
+ // Otherwise, update the existing argument.
+ else {
+ std::memcpy(ArgPointers[Index], Arg, Size);
+ AlignedLocalMemSize[Index] = LocalSize;
+ assert(Size == ParamSizes[Index]);
+ }
}
/// Returns the padded size and offset of a local memory argument.
@@ -122,7 +133,7 @@ struct ur_kernel_handle_t_ {
std::pair calcAlignedLocalArgument(size_t Index,
size_t Size) {
// Store the unpadded size of the local argument
- if (Index + 2 > Indices.size()) {
+ if (Index + 2 > ArgPointers.size()) {
AlignedLocalMemSize.resize(Index + 1);
OriginalLocalMemSize.resize(Index + 1);
}
@@ -151,20 +162,12 @@ struct ur_kernel_handle_t_ {
return std::make_pair(AlignedLocalSize, AlignedLocalOffset);
}
- void addLocalArg(size_t Index, size_t Size) {
- // Get the aligned argument size and offset into local data
- auto [AlignedLocalSize, AlignedLocalOffset] =
- calcAlignedLocalArgument(Index, Size);
-
- // Store argument details
- addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
- AlignedLocalSize);
-
- // For every existing local argument which follows at later argument
- // indices, update the offset and pointer into the kernel local memory.
- // Required as padding will need to be recalculated.
- const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg
- for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) {
+ // Iterate over each existing local argument which follows StartIndex
+ // index, update the offset and pointer into the kernel local memory.
+ void updateLocalArgOffset(size_t StartIndex) {
+ const size_t NumArgs =
+ ArgPointers.size() - 1; // Accounts for implicit arg
+ for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) {
const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex];
if (OriginalLocalSize == 0) {
// Skip if successor argument isn't a local memory arg
@@ -179,14 +182,26 @@ struct ur_kernel_handle_t_ {
AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize;
// Store new offset into local data
- const size_t InsertPos =
- std::accumulate(std::begin(ParamSizes),
- std::begin(ParamSizes) + SuccIndex, size_t{0});
- std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset,
+ std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset,
sizeof(size_t));
}
}
+ void addLocalArg(size_t Index, size_t Size) {
+ // Get the aligned argument size and offset into local data
+ auto [AlignedLocalSize, AlignedLocalOffset] =
+ calcAlignedLocalArgument(Index, Size);
+
+ // Store argument details
+ addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
+ AlignedLocalSize);
+
+ // For every existing local argument which follows at later argument
+ // indices, update the offset and pointer into the kernel local memory.
+ // Required as padding will need to be recalculated.
+ updateLocalArgOffset(Index + 1);
+ }
+
void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
assert(hMem && "Invalid mem handle");
// To avoid redundancy we are not storing mem obj with index i at index
@@ -206,7 +221,7 @@ struct ur_kernel_handle_t_ {
std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
}
- const args_index_t &getIndices() const noexcept { return Indices; }
+ const args_index_t &getArgPointers() const noexcept { return ArgPointers; }
uint32_t getLocalSize() const {
return std::accumulate(std::begin(AlignedLocalMemSize),
@@ -263,7 +278,7 @@ struct ur_kernel_handle_t_ {
/// offset. Note this only returns the current known number of arguments,
/// not the real one required by the kernel, since this cannot be queried
/// from the HIP Driver API
- uint32_t getNumArgs() const noexcept { return Args.Indices.size() - 1; }
+ uint32_t getNumArgs() const noexcept { return Args.ArgPointers.size() - 1; }
void setKernelArg(int Index, size_t Size, const void *Arg) {
Args.addArg(Index, Size, Arg);
@@ -277,8 +292,8 @@ struct ur_kernel_handle_t_ {
return Args.setImplicitOffset(Size, ImplicitOffset);
}
- const arguments::args_index_t &getArgIndices() const {
- return Args.getIndices();
+ const arguments::args_index_t &getArgPointers() const {
+ return Args.getArgPointers();
}
uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }
diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt
index 39031a700d..c75c870be7 100644
--- a/source/adapters/level_zero/CMakeLists.txt
+++ b/source/adapters/level_zero/CMakeLists.txt
@@ -145,7 +145,9 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp
# v2-only sources
+ ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_manager.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/context.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.hpp
@@ -159,7 +161,9 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/api.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_manager.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/context.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.cpp
diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp
index 058f92f8ca..902da42d2c 100644
--- a/source/adapters/level_zero/command_buffer.cpp
+++ b/source/adapters/level_zero/command_buffer.cpp
@@ -1535,15 +1535,17 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer,
* @param CommandList The command-list to append the QueryKernelTimestamps
* command to.
* @param SignalEvent The event that must be signaled after the profiling is
- * finished. This event will contain the profiling information.
+ * finished.
* @param WaitEvent The event that must be waited on before starting the
* profiling.
+ * @param ProfilingEvent The event that will contain the profiling data.
* @return UR_RESULT_SUCCESS or an error code on failure.
*/
ur_result_t appendProfilingQueries(ur_exp_command_buffer_handle_t CommandBuffer,
ze_command_list_handle_t CommandList,
ur_event_handle_t SignalEvent,
- ur_event_handle_t WaitEvent) {
+ ur_event_handle_t WaitEvent,
+ ur_event_handle_t ProfilingEvent) {
// Multiple submissions of a command buffer implies that we need to save
// the event timestamps before resubmiting the command buffer. We
// therefore copy these timestamps in a dedicated USM memory section
@@ -1556,12 +1558,17 @@ ur_result_t appendProfilingQueries(ur_exp_command_buffer_handle_t CommandBuffer,
Profiling->Timestamps =
new ze_kernel_timestamp_result_t[Profiling->NumEvents];
+ uint32_t NumWaitEvents = WaitEvent ? 1 : 0;
+ ze_event_handle_t *ZeWaitEventList =
+ WaitEvent ? &(WaitEvent->ZeEvent) : nullptr;
+ ze_event_handle_t ZeSignalEvent =
+ SignalEvent ? SignalEvent->ZeEvent : nullptr;
ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps,
(CommandList, CommandBuffer->ZeEventsList.size(),
CommandBuffer->ZeEventsList.data(), (void *)Profiling->Timestamps,
- 0, SignalEvent->ZeEvent, 1, &(WaitEvent->ZeEvent)));
+ 0, ZeSignalEvent, NumWaitEvents, ZeWaitEventList));
- SignalEvent->CommandData = static_cast(Profiling);
+ ProfilingEvent->CommandData = static_cast(Profiling);
return UR_RESULT_SUCCESS;
}
@@ -1615,8 +1622,8 @@ ur_result_t enqueueImmediateAppendPath(
if (DoProfiling) {
UR_CALL(appendProfilingQueries(CommandBuffer, CommandListHelper->first,
- *Event,
- CommandBuffer->ComputeFinishedEvent));
+ *Event, CommandBuffer->ComputeFinishedEvent,
+ *Event));
}
// When the current execution is finished, signal ExecutionFinishedEvent to
@@ -1694,10 +1701,15 @@ ur_result_t enqueueWaitEventPath(ur_exp_command_buffer_handle_t CommandBuffer,
(ZeCopyCommandQueue, 1, &CommandBuffer->ZeCopyCommandList, nullptr));
}
+ ZE2UR_CALL(zeCommandListAppendBarrier,
+ (SignalCommandList->first, nullptr, 1,
+ &(CommandBuffer->ExecutionFinishedEvent->ZeEvent)));
+
// Reset the wait-event for the UR command-buffer that is signaled when its
// submission dependencies have been satisfied.
ZE2UR_CALL(zeCommandListAppendEventReset,
(SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent));
+
// Reset the all-reset-event for the UR command-buffer that is signaled when
// all events of the main command-list have been reset.
ZE2UR_CALL(zeCommandListAppendEventReset,
@@ -1705,14 +1717,12 @@ ur_result_t enqueueWaitEventPath(ur_exp_command_buffer_handle_t CommandBuffer,
if (DoProfiling) {
UR_CALL(appendProfilingQueries(CommandBuffer, SignalCommandList->first,
- *Event,
- CommandBuffer->ExecutionFinishedEvent));
- } else {
- ZE2UR_CALL(zeCommandListAppendBarrier,
- (SignalCommandList->first, (*Event)->ZeEvent, 1,
- &(CommandBuffer->ExecutionFinishedEvent->ZeEvent)));
+ nullptr, nullptr, *Event));
}
+ ZE2UR_CALL(zeCommandListAppendBarrier,
+ (SignalCommandList->first, (*Event)->ZeEvent, 0, nullptr));
+
UR_CALL(Queue->executeCommandList(SignalCommandList, false /*IsBlocking*/,
false /*OKToBatchCommand*/));
diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp
index 9ae9bddcb9..edd9687445 100644
--- a/source/adapters/level_zero/v2/api.cpp
+++ b/source/adapters/level_zero/v2/api.cpp
@@ -239,47 +239,6 @@ ur_result_t urBindlessImagesReleaseExternalSemaphoreExp(
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
-ur_result_t
-urCommandBufferCreateExp(ur_context_handle_t hContext,
- ur_device_handle_t hDevice,
- const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
- ur_exp_command_buffer_handle_t *phCommandBuffer) {
- logger::error("{} function not implemented!", __FUNCTION__);
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
-}
-
-ur_result_t
-urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
- logger::error("{} function not implemented!", __FUNCTION__);
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
-}
-
-ur_result_t
-urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
- logger::error("{} function not implemented!", __FUNCTION__);
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
-}
-
-ur_result_t
-urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
- logger::error("{} function not implemented!", __FUNCTION__);
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
-}
-
-ur_result_t urCommandBufferAppendKernelLaunchExp(
- ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel,
- uint32_t workDim, const size_t *pGlobalWorkOffset,
- const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
- uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives,
- uint32_t numSyncPointsInWaitList,
- const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
- uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList,
- ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent,
- ur_exp_command_buffer_command_handle_t *phCommand) {
- logger::error("{} function not implemented!", __FUNCTION__);
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
-}
-
ur_result_t urCommandBufferAppendUSMMemcpyExp(
ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc,
size_t size, uint32_t numSyncPointsInWaitList,
@@ -415,14 +374,6 @@ ur_result_t urCommandBufferAppendUSMAdviseExp(
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
-ur_result_t urCommandBufferEnqueueExp(
- ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue,
- uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
- ur_event_handle_t *phEvent) {
- logger::error("{} function not implemented!", __FUNCTION__);
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
-}
-
ur_result_t urCommandBufferRetainCommandExp(
ur_exp_command_buffer_command_handle_t hCommand) {
logger::error("{} function not implemented!", __FUNCTION__);
@@ -443,15 +394,6 @@ ur_result_t urCommandBufferUpdateKernelLaunchExp(
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
-ur_result_t
-urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer,
- ur_exp_command_buffer_info_t propName,
- size_t propSize, void *pPropValue,
- size_t *pPropSizeRet) {
- logger::error("{} function not implemented!", __FUNCTION__);
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
-}
-
ur_result_t urCommandBufferUpdateSignalEventExp(
ur_exp_command_buffer_command_handle_t hCommand,
ur_event_handle_t *phEvent) {
diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp
new file mode 100644
index 0000000000..c35d97d76b
--- /dev/null
+++ b/source/adapters/level_zero/v2/command_buffer.cpp
@@ -0,0 +1,183 @@
+//===--------- command_buffer.cpp - Level Zero Adapter ---------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "command_buffer.hpp"
+#include "../helpers/kernel_helpers.hpp"
+#include "../ur_interface_loader.hpp"
+#include "logger/ur_logger.hpp"
+
+namespace {
+
+// Checks whether zeCommandListImmediateAppendCommandListsExp can be used for a
+// given context.
+void checkImmediateAppendSupport(ur_context_handle_t context) {
+ bool DriverSupportsImmediateAppend =
+ context->getPlatform()->ZeCommandListImmediateAppendExt.Supported;
+
+ if (!DriverSupportsImmediateAppend) {
+ logger::error("Adapter v2 is used but "
+ "the current driver does not support the "
+ "zeCommandListImmediateAppendCommandListsExp entrypoint.");
+ std::abort();
+ }
+}
+
+} // namespace
+
+ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
+ ur_context_handle_t context, ur_device_handle_t device,
+ v2::raii::command_list_unique_handle &&commandList,
+ const ur_exp_command_buffer_desc_t *desc)
+ : commandListManager(
+ context, device,
+ std::forward(commandList)),
+ isUpdatable(desc ? desc->isUpdatable : false) {}
+
+ur_result_t ur_exp_command_buffer_handle_t_::finalizeCommandBuffer() {
+ // It is not allowed to append to command list from multiple threads.
+ std::scoped_lock guard(this->Mutex);
+ UR_ASSERT(!isFinalized, UR_RESULT_ERROR_INVALID_OPERATION);
+ // Close the command lists and have them ready for dispatch.
+ ZE2UR_CALL(zeCommandListClose, (this->commandListManager.getZeCommandList()));
+ isFinalized = true;
+ return UR_RESULT_SUCCESS;
+}
+
+namespace ur::level_zero {
+
+ur_result_t
+urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device,
+ const ur_exp_command_buffer_desc_t *commandBufferDesc,
+ ur_exp_command_buffer_handle_t *commandBuffer) try {
+ checkImmediateAppendSupport(context);
+
+ if (!context->getPlatform()->ZeMutableCmdListExt.Supported) {
+ throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+ }
+
+ using queue_group_type = ur_device_handle_t_::queue_group_info_t::type;
+ uint32_t queueGroupOrdinal =
+ device->QueueGroup[queue_group_type::Compute].ZeOrdinal;
+ v2::raii::command_list_unique_handle zeCommandList =
+ context->commandListCache.getRegularCommandList(device->ZeDevice, true,
+ queueGroupOrdinal, true);
+
+ *commandBuffer = new ur_exp_command_buffer_handle_t_(
+ context, device, std::move(zeCommandList), commandBufferDesc);
+ return UR_RESULT_SUCCESS;
+
+} catch (...) {
+ return exceptionToResult(std::current_exception());
+}
+
+ur_result_t
+urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) try {
+ hCommandBuffer->RefCount.increment();
+ return UR_RESULT_SUCCESS;
+} catch (...) {
+ return exceptionToResult(std::current_exception());
+}
+
+ur_result_t
+urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) try {
+ if (!hCommandBuffer->RefCount.decrementAndTest())
+ return UR_RESULT_SUCCESS;
+
+ delete hCommandBuffer;
+ return UR_RESULT_SUCCESS;
+} catch (...) {
+ return exceptionToResult(std::current_exception());
+}
+
+ur_result_t
+urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) try {
+ UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+ UR_CALL(hCommandBuffer->finalizeCommandBuffer());
+ return UR_RESULT_SUCCESS;
+} catch (...) {
+ return exceptionToResult(std::current_exception());
+}
+
+ur_result_t urCommandBufferAppendKernelLaunchExp(
+ ur_exp_command_buffer_handle_t commandBuffer, ur_kernel_handle_t hKernel,
+ uint32_t workDim, const size_t *pGlobalWorkOffset,
+ const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+ uint32_t numKernelAlternatives, ur_kernel_handle_t *kernelAlternatives,
+ uint32_t numSyncPointsInWaitList,
+ const ur_exp_command_buffer_sync_point_t *syncPointWaitList,
+ uint32_t numEventsInWaitList, const ur_event_handle_t *eventWaitList,
+ ur_exp_command_buffer_sync_point_t *retSyncPoint, ur_event_handle_t *event,
+ ur_exp_command_buffer_command_handle_t *command)
+
+ try {
+ // Need to know semantics
+ // - should they be checked before kernel execution or before kernel
+ // appending to list if latter then it is easy fix, if former then TODO
+ std::ignore = numEventsInWaitList;
+ std::ignore = eventWaitList;
+ std::ignore = event;
+
+ // sync mechanic can be ignored, because all lists are in-order
+ std::ignore = numSyncPointsInWaitList;
+ std::ignore = syncPointWaitList;
+ std::ignore = retSyncPoint;
+
+ // TODO
+ std::ignore = numKernelAlternatives;
+ std::ignore = kernelAlternatives;
+ std::ignore = command;
+ UR_CALL(commandBuffer->commandListManager.appendKernelLaunch(
+ hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0,
+ nullptr, nullptr));
+ return UR_RESULT_SUCCESS;
+} catch (...) {
+ return exceptionToResult(std::current_exception());
+}
+
+ur_result_t urCommandBufferEnqueueExp(
+ ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue,
+ uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+ ur_event_handle_t *phEvent) try {
+ return hQueue->enqueueCommandBuffer(
+ hCommandBuffer->commandListManager.getZeCommandList(), phEvent,
+ numEventsInWaitList, phEventWaitList);
+} catch (...) {
+ return exceptionToResult(std::current_exception());
+}
+
+ur_result_t
+urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer,
+ ur_exp_command_buffer_info_t propName,
+ size_t propSize, void *pPropValue,
+ size_t *pPropSizeRet) try {
+ UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+
+ switch (propName) {
+ case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT:
+ return ReturnValue(uint32_t{hCommandBuffer->RefCount.load()});
+ case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: {
+ ur_exp_command_buffer_desc_t Descriptor{};
+ Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC;
+ Descriptor.pNext = nullptr;
+ Descriptor.isUpdatable = hCommandBuffer->isUpdatable;
+ Descriptor.isInOrder = true;
+ Descriptor.enableProfiling = hCommandBuffer->isProfilingEnabled;
+
+ return ReturnValue(Descriptor);
+ }
+ default:
+ assert(!"Command-buffer info request not implemented");
+ }
+ return UR_RESULT_ERROR_INVALID_ENUMERATION;
+} catch (...) {
+ return exceptionToResult(std::current_exception());
+}
+
+} // namespace ur::level_zero
diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp
new file mode 100644
index 0000000000..5e60d6537f
--- /dev/null
+++ b/source/adapters/level_zero/v2/command_buffer.hpp
@@ -0,0 +1,51 @@
+//===--------- command_buffer.hpp - Level Zero Adapter ---------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include "command_list_manager.hpp"
+#include "common.hpp"
+#include "context.hpp"
+#include "kernel.hpp"
+#include "queue_api.hpp"
+#include
+
+struct ur_exp_command_buffer_handle_t_ : public _ur_object {
+ ur_exp_command_buffer_handle_t_(
+ ur_context_handle_t context, ur_device_handle_t device,
+ v2::raii::command_list_unique_handle &&commandList,
+ const ur_exp_command_buffer_desc_t *desc);
+
+ ~ur_exp_command_buffer_handle_t_() = default;
+
+ ur_command_list_manager commandListManager;
+
+ ur_result_t finalizeCommandBuffer();
+ // Indicates if command-buffer commands can be updated after it is closed.
+ const bool isUpdatable = false;
+ // Command-buffer profiling is enabled.
+ const bool isProfilingEnabled = false;
+
+private:
+ // Indicates if command buffer was finalized.
+ bool isFinalized = false;
+};
+
+struct ur_exp_command_buffer_command_handle_t_ : public _ur_object {
+ ur_exp_command_buffer_command_handle_t_(ur_exp_command_buffer_handle_t,
+ uint64_t);
+
+private:
+ ~ur_exp_command_buffer_command_handle_t_();
+
+ // Command-buffer of this command.
+ ur_exp_command_buffer_handle_t commandBuffer;
+ // L0 command ID identifying this command
+ uint64_t commandId;
+};
diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp
new file mode 100644
index 0000000000..987cb462a3
--- /dev/null
+++ b/source/adapters/level_zero/v2/command_list_manager.cpp
@@ -0,0 +1,115 @@
+//===--------- command_list_cache.hpp - Level Zero Adapter ---------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "command_list_manager.hpp"
+#include "../helpers/kernel_helpers.hpp"
+#include "../ur_interface_loader.hpp"
+#include "context.hpp"
+#include "kernel.hpp"
+
+ur_command_list_manager::ur_command_list_manager(
+ ur_context_handle_t context, ur_device_handle_t device,
+ v2::raii::command_list_unique_handle &&commandList, v2::event_flags_t flags,
+ ur_queue_handle_t_ *queue)
+ : context(context), device(device),
+ eventPool(context->eventPoolCache.borrow(device->Id.value(), flags)),
+ zeCommandList(std::move(commandList)), queue(queue) {
+ UR_CALL_THROWS(ur::level_zero::urContextRetain(context));
+ UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device));
+}
+
+ur_command_list_manager::~ur_command_list_manager() {
+ ur::level_zero::urContextRelease(context);
+ ur::level_zero::urDeviceRelease(device);
+}
+
+std::pair
+ur_command_list_manager::getWaitListView(const ur_event_handle_t *phWaitEvents,
+ uint32_t numWaitEvents) {
+
+ waitList.resize(numWaitEvents);
+ for (uint32_t i = 0; i < numWaitEvents; i++) {
+ waitList[i] = phWaitEvents[i]->getZeEvent();
+ }
+
+ return {waitList.data(), static_cast(numWaitEvents)};
+}
+
+ze_event_handle_t
+ur_command_list_manager::getSignalEvent(ur_event_handle_t *hUserEvent,
+ ur_command_t commandType) {
+ if (hUserEvent && queue) {
+ *hUserEvent = eventPool->allocate();
+ (*hUserEvent)->resetQueueAndCommand(queue, commandType);
+ return (*hUserEvent)->getZeEvent();
+ } else {
+ return nullptr;
+ }
+}
+
+ur_result_t ur_command_list_manager::appendKernelLaunch(
+ ur_kernel_handle_t hKernel, uint32_t workDim,
+ const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+ const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+ TRACK_SCOPE_LATENCY("ur_command_list_manager::appendKernelLaunch");
+
+ UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+ UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+ UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+ UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+
+ ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(device);
+
+ std::scoped_lock Lock(this->Mutex,
+ hKernel->Mutex);
+
+ ze_group_count_t zeThreadGroupDimensions{1, 1, 1};
+ uint32_t WG[3]{};
+ UR_CALL(calculateKernelWorkDimensions(hZeKernel, device,
+ zeThreadGroupDimensions, WG, workDim,
+ pGlobalWorkSize, pLocalWorkSize));
+
+ auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH);
+
+ auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
+
+ bool memoryMigrated = false;
+ auto memoryMigrate = [&](void *src, void *dst, size_t size) {
+ ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
+ (zeCommandList.get(), dst, src, size, nullptr,
+ waitList.second, waitList.first));
+ memoryMigrated = true;
+ };
+
+ UR_CALL(hKernel->prepareForSubmission(context, device, pGlobalWorkOffset,
+ workDim, WG[0], WG[1], WG[2],
+ memoryMigrate));
+
+ if (memoryMigrated) {
+ // If memory was migrated, we don't need to pass the wait list to
+ // the copy command again.
+ waitList.first = nullptr;
+ waitList.second = 0;
+ }
+
+ TRACK_SCOPE_LATENCY(
+ "ur_command_list_manager::zeCommandListAppendLaunchKernel");
+ ZE2UR_CALL(zeCommandListAppendLaunchKernel,
+ (zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions,
+ zeSignalEvent, waitList.second, waitList.first));
+
+ return UR_RESULT_SUCCESS;
+}
+
+ze_command_list_handle_t ur_command_list_manager::getZeCommandList() {
+ return zeCommandList.get();
+}
diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp
new file mode 100644
index 0000000000..9e0049a130
--- /dev/null
+++ b/source/adapters/level_zero/v2/command_list_manager.hpp
@@ -0,0 +1,54 @@
+//===--------- command_list_cache.hpp - Level Zero Adapter ---------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include "command_list_cache.hpp"
+#include "common.hpp"
+#include "event_pool_cache.hpp"
+#include "queue_api.hpp"
+#include
+
+struct ur_command_list_manager : public _ur_object {
+
+ ur_command_list_manager(ur_context_handle_t context,
+ ur_device_handle_t device,
+ v2::raii::command_list_unique_handle &&commandList,
+ v2::event_flags_t flags = v2::EVENT_FLAGS_COUNTER,
+ ur_queue_handle_t_ *queue = nullptr);
+ ~ur_command_list_manager();
+
+ ur_result_t appendKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim,
+ const size_t *pGlobalWorkOffset,
+ const size_t *pGlobalWorkSize,
+ const size_t *pLocalWorkSize,
+ uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList,
+ ur_event_handle_t *phEvent);
+ ur_result_t appendCommandListImmediate(
+ ze_command_list_handle_t commandList, ur_event_handle_t *phEvent,
+ uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList);
+ ze_command_list_handle_t getZeCommandList();
+
+ std::pair
+ getWaitListView(const ur_event_handle_t *phWaitEvents,
+ uint32_t numWaitEvents);
+ ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent,
+ ur_command_t commandType);
+
+private:
+ // UR context associated with this command-buffer
+ ur_context_handle_t context;
+ // Device associated with this command buffer
+ ur_device_handle_t device;
+ v2::raii::cache_borrowed_event_pool eventPool;
+ v2::raii::command_list_unique_handle zeCommandList;
+ ur_queue_handle_t_ *queue;
+ std::vector waitList;
+};
diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp
index f4e2f47c09..28ff527413 100644
--- a/source/adapters/level_zero/v2/queue_api.cpp
+++ b/source/adapters/level_zero/v2/queue_api.cpp
@@ -11,6 +11,9 @@
*
*/
+// Do not edit. This file is auto generated from a template:
+// scripts/templates/queue_api.cpp.mako
+
#include "queue_api.hpp"
#include "ur_util.hpp"
diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp
index e9e98874e8..88d812bbba 100644
--- a/source/adapters/level_zero/v2/queue_api.hpp
+++ b/source/adapters/level_zero/v2/queue_api.hpp
@@ -11,9 +11,13 @@
*
*/
+// Do not edit. This file is auto generated from a template:
+// scripts/templates/queue_api.hpp.mako
+
#pragma once
#include
+#include
struct ur_queue_handle_t_ {
virtual ~ur_queue_handle_t_();
@@ -158,4 +162,8 @@ struct ur_queue_handle_t_ {
const ur_exp_enqueue_native_command_properties_t *,
uint32_t, const ur_event_handle_t *,
ur_event_handle_t *) = 0;
+
+ virtual ur_result_t enqueueCommandBuffer(ze_command_list_handle_t,
+ ur_event_handle_t *, uint32_t,
+ const ur_event_handle_t *) = 0;
};
diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
index af65df78a2..1c738edf50 100644
--- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
+++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
@@ -24,13 +24,7 @@ namespace v2 {
std::pair
ur_queue_immediate_in_order_t::getWaitListView(
const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents) {
-
- waitList.resize(numWaitEvents);
- for (uint32_t i = 0; i < numWaitEvents; i++) {
- waitList[i] = phWaitEvents[i]->getZeEvent();
- }
-
- return {waitList.data(), static_cast(numWaitEvents)};
+ return commandListManager.getWaitListView(phWaitEvents, numWaitEvents);
}
static int32_t getZeOrdinal(ur_device_handle_t hDevice) {
@@ -58,25 +52,6 @@ static ze_command_queue_priority_t getZePriority(ur_queue_flags_t flags) {
return ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
}
-ur_command_list_handler_t::ur_command_list_handler_t(
- ur_context_handle_t hContext, ur_device_handle_t hDevice,
- const ur_queue_properties_t *pProps)
- : commandList(hContext->commandListCache.getImmediateCommandList(
- hDevice->ZeDevice, true, getZeOrdinal(hDevice),
- true /* always enable copy offload */,
- ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
- getZePriority(pProps ? pProps->flags : ur_queue_flags_t{}),
- getZeIndex(pProps))) {}
-
-ur_command_list_handler_t::ur_command_list_handler_t(
- ze_command_list_handle_t hZeCommandList, bool ownZeHandle)
- : commandList(hZeCommandList,
- [ownZeHandle](ze_command_list_handle_t hZeCommandList) {
- if (ownZeHandle) {
- ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList));
- }
- }) {}
-
static event_flags_t eventFlagsFromQueueFlags(ur_queue_flags_t flags) {
event_flags_t eventFlags = EVENT_FLAGS_COUNTER;
if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE)
@@ -88,29 +63,35 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_queue_properties_t *pProps)
: hContext(hContext), hDevice(hDevice), flags(pProps ? pProps->flags : 0),
- eventPool(hContext->eventPoolCache.borrow(
- hDevice->Id.value(), eventFlagsFromQueueFlags(flags))),
- handler(hContext, hDevice, pProps) {}
+ commandListManager(
+ hContext, hDevice,
+ hContext->commandListCache.getImmediateCommandList(
+ hDevice->ZeDevice, true, getZeOrdinal(hDevice),
+ true /* always enable copy offload */,
+ ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+ getZePriority(pProps ? pProps->flags : ur_queue_flags_t{}),
+ getZeIndex(pProps)),
+ eventFlagsFromQueueFlags(flags), this) {}
ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_native_handle_t hNativeHandle, ur_queue_flags_t flags, bool ownZeQueue)
: hContext(hContext), hDevice(hDevice), flags(flags),
- eventPool(hContext->eventPoolCache.borrow(
- hDevice->Id.value(), eventFlagsFromQueueFlags(flags))),
- handler(reinterpret_cast(hNativeHandle),
- ownZeQueue) {}
-
-ur_event_handle_t
+ commandListManager(
+ hContext, hDevice,
+ raii::command_list_unique_handle(
+ reinterpret_cast(hNativeHandle),
+ [ownZeQueue](ze_command_list_handle_t hZeCommandList) {
+ if (ownZeQueue) {
+ ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList));
+ }
+ }),
+ eventFlagsFromQueueFlags(flags)) {}
+
+ze_event_handle_t
ur_queue_immediate_in_order_t::getSignalEvent(ur_event_handle_t *hUserEvent,
ur_command_t commandType) {
- if (hUserEvent) {
- *hUserEvent = eventPool->allocate();
- (*hUserEvent)->resetQueueAndCommand(this, commandType);
- return *hUserEvent;
- } else {
- return nullptr;
- }
+ return commandListManager.getSignalEvent(hUserEvent, commandType);
}
ur_result_t
@@ -133,7 +114,7 @@ ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName,
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
case UR_QUEUE_INFO_EMPTY: {
auto status = ZE_CALL_NOCHECK(zeCommandListHostSynchronize,
- (handler.commandList.get(), 0));
+ (commandListManager.getZeCommandList(), 0));
if (status == ZE_RESULT_SUCCESS) {
return ReturnValue(true);
} else if (status == ZE_RESULT_NOT_READY) {
@@ -175,8 +156,8 @@ void ur_queue_immediate_in_order_t::deferEventFree(ur_event_handle_t hEvent) {
ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) {
std::ignore = pDesc;
- *phNativeQueue =
- reinterpret_cast(this->handler.commandList.get());
+ *phNativeQueue = reinterpret_cast(
+ this->commandListManager.getZeCommandList());
return UR_RESULT_SUCCESS;
}
@@ -189,7 +170,7 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
TRACK_SCOPE_LATENCY(
"ur_queue_immediate_in_order_t::zeCommandListHostSynchronize");
ZE2UR_CALL(zeCommandListHostSynchronize,
- (handler.commandList.get(), UINT64_MAX));
+ (commandListManager.getZeCommandList(), UINT64_MAX));
// Free deferred events
for (auto &hEvent : deferredEvents) {
@@ -223,52 +204,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch(
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueKernelLaunch");
- UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
- UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER);
-
- UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
- UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
-
- ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice);
-
- std::scoped_lock Lock(this->Mutex,
- hKernel->Mutex);
-
- ze_group_count_t zeThreadGroupDimensions{1, 1, 1};
- uint32_t WG[3]{};
- UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice,
- zeThreadGroupDimensions, WG, workDim,
- pGlobalWorkSize, pLocalWorkSize));
-
- auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH);
-
- auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
-
- bool memoryMigrated = false;
- auto memoryMigrate = [&](void *src, void *dst, size_t size) {
- ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), dst, src, size, nullptr,
- waitList.second, waitList.first));
- memoryMigrated = true;
- };
-
- UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset,
- workDim, WG[0], WG[1], WG[2],
- memoryMigrate));
-
- if (memoryMigrated) {
- // If memory was migrated, we don't need to pass the wait list to
- // the copy command again.
- waitList.first = nullptr;
- waitList.second = 0;
- }
-
- TRACK_SCOPE_LATENCY(
- "ur_queue_immediate_in_order_t::zeCommandListAppendLaunchKernel");
- auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr;
- ZE2UR_CALL(zeCommandListAppendLaunchKernel,
- (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions,
- zeSignalEvent, waitList.second, waitList.first));
+ UR_CALL(commandListManager.appendKernelLaunch(
+ hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+ numEventsInWaitList, phEventWaitList, phEvent));
recordSubmittedKernel(hKernel);
@@ -287,20 +225,20 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait(
return UR_RESULT_SUCCESS;
}
- auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT);
+ auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT);
auto [pWaitEvents, numWaitEvents] =
getWaitListView(phEventWaitList, numEventsInWaitList);
if (numWaitEvents > 0) {
- ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
- (handler.commandList.get(), numWaitEvents, pWaitEvents));
+ ZE2UR_CALL(
+ zeCommandListAppendWaitOnEvents,
+ (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents));
}
- if (signalEvent) {
+ if (zeSignalEvent) {
ZE2UR_CALL(zeCommandListAppendSignalEvent,
- (handler.commandList.get(), signalEvent->getZeEvent()));
+ (commandListManager.getZeCommandList(), zeSignalEvent));
}
-
return UR_RESULT_SUCCESS;
}
@@ -317,13 +255,13 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierImpl(
return UR_RESULT_SUCCESS;
}
- auto signalEvent =
+ auto zeSignalEvent =
getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER);
auto [pWaitEvents, numWaitEvents] =
getWaitListView(phEventWaitList, numEventsInWaitList);
ZE2UR_CALL(zeCommandListAppendBarrier,
- (handler.commandList.get(), signalEvent->getZeEvent(),
+ (commandListManager.getZeCommandList(), zeSignalEvent,
numWaitEvents, pWaitEvents));
return UR_RESULT_SUCCESS;
@@ -358,7 +296,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked(
size_t dstOffset, size_t size, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent,
ur_command_t commandType) {
- auto signalEvent = getSignalEvent(phEvent, commandType);
+ auto zeSignalEvent = getSignalEvent(phEvent, commandType);
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
@@ -367,8 +305,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked(
hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, srcOffset,
size, [&](void *src, void *dst, size_t size) {
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), dst, src, size, nullptr,
- waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), dst, src,
+ size, nullptr, waitList.second, waitList.first));
memoryMigrated = true;
}));
@@ -376,8 +314,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked(
hDevice, ur_mem_handle_t_::device_access_mode_t::write_only, dstOffset,
size, [&](void *src, void *dst, size_t size) {
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), dst, src, size, nullptr,
- waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), dst, src,
+ size, nullptr, waitList.second, waitList.first));
memoryMigrated = true;
}));
@@ -388,14 +326,13 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked(
waitList.second = 0;
}
- auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr;
ZE2UR_CALL(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), pDst, pSrc, size, zeSignalEvent,
- waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), pDst, pSrc, size,
+ zeSignalEvent, waitList.second, waitList.first));
if (blocking) {
ZE2UR_CALL(zeCommandListHostSynchronize,
- (handler.commandList.get(), UINT64_MAX));
+ (commandListManager.getZeCommandList(), UINT64_MAX));
}
return UR_RESULT_SUCCESS;
@@ -447,7 +384,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked(
auto zeParams = ur2zeRegionParams(srcOrigin, dstOrigin, region, srcRowPitch,
dstRowPitch, srcSlicePitch, dstSlicePitch);
- auto signalEvent = getSignalEvent(phEvent, commandType);
+ auto zeSignalEvent = getSignalEvent(phEvent, commandType);
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
@@ -456,16 +393,16 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked(
hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, 0,
src->getSize(), [&](void *src, void *dst, size_t size) {
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), dst, src, size, nullptr,
- waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), dst, src,
+ size, nullptr, waitList.second, waitList.first));
memoryMigrated = true;
}));
auto pDst = ur_cast(dst->getDevicePtr(
hDevice, ur_mem_handle_t_::device_access_mode_t::write_only, 0,
dst->getSize(), [&](void *src, void *dst, size_t size) {
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), dst, src, size, nullptr,
- waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), dst, src,
+ size, nullptr, waitList.second, waitList.first));
memoryMigrated = true;
}));
@@ -476,16 +413,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked(
waitList.second = 0;
}
- auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr;
ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion,
- (handler.commandList.get(), pDst, &zeParams.dstRegion,
+ (commandListManager.getZeCommandList(), pDst, &zeParams.dstRegion,
zeParams.dstPitch, zeParams.dstSlicePitch, pSrc,
&zeParams.srcRegion, zeParams.srcPitch, zeParams.srcSlicePitch,
zeSignalEvent, waitList.second, waitList.first));
if (blocking) {
ZE2UR_CALL(zeCommandListHostSynchronize,
- (handler.commandList.get(), UINT64_MAX));
+ (commandListManager.getZeCommandList(), UINT64_MAX));
}
return UR_RESULT_SUCCESS;
@@ -649,7 +585,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap(
std::scoped_lock lock(this->Mutex,
hBuffer->getMutex());
- auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP);
+ auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP);
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
@@ -657,8 +593,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap(
auto pDst = ur_cast(hBuffer->mapHostPtr(
mapFlags, offset, size, [&](void *src, void *dst, size_t size) {
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), dst, src, size, nullptr,
- waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), dst, src,
+ size, nullptr, waitList.second, waitList.first));
memoryMigrated = true;
}));
*ppRetMap = pDst;
@@ -666,17 +602,18 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap(
if (!memoryMigrated && waitList.second) {
// If memory was not migrated, we need to wait on the events here.
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
- (handler.commandList.get(), waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), waitList.second,
+ waitList.first));
}
- if (signalEvent) {
+ if (zeSignalEvent) {
ZE2UR_CALL(zeCommandListAppendSignalEvent,
- (handler.commandList.get(), signalEvent->getZeEvent()));
+ (commandListManager.getZeCommandList(), zeSignalEvent));
}
if (blockingMap) {
ZE2UR_CALL(zeCommandListHostSynchronize,
- (handler.commandList.get(), UINT64_MAX));
+ (commandListManager.getZeCommandList(), UINT64_MAX));
}
return UR_RESULT_SUCCESS;
@@ -689,29 +626,28 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap(
std::scoped_lock lock(this->Mutex);
- auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP);
+ auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP);
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
// TODO: currently unmapHostPtr deallocates memory immediately,
// since the memory might be used by the user, we need to make sure
// all dependencies are completed.
- ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
- (handler.commandList.get(), waitList.second, waitList.first));
+ ZE2UR_CALL(
+ zeCommandListAppendWaitOnEvents,
+ (commandListManager.getZeCommandList(), waitList.second, waitList.first));
bool memoryMigrated = false;
hMem->unmapHostPtr(pMappedPtr, [&](void *src, void *dst, size_t size) {
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), dst, src, size, nullptr,
- waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), dst, src, size,
+ nullptr, waitList.second, waitList.first));
memoryMigrated = true;
});
-
- if (signalEvent) {
+ if (zeSignalEvent) {
ZE2UR_CALL(zeCommandListAppendSignalEvent,
- (handler.commandList.get(), signalEvent->getZeEvent()));
+ (commandListManager.getZeCommandList(), zeSignalEvent));
}
-
return UR_RESULT_SUCCESS;
}
@@ -721,7 +657,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked(
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent,
ur_command_t commandType) {
- auto signalEvent = getSignalEvent(phEvent, commandType);
+ auto zeSignalEvent = getSignalEvent(phEvent, commandType);
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
@@ -730,8 +666,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked(
hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, offset, size,
[&](void *src, void *dst, size_t size) {
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), dst, src, size, nullptr,
- waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), dst, src,
+ size, nullptr, waitList.second, waitList.first));
memoryMigrated = true;
}));
@@ -746,10 +682,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked(
// PatternSize must be a power of two for zeCommandListAppendMemoryFill.
// When it's not, the fill is emulated with zeCommandListAppendMemoryCopy.
- auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr;
ZE2UR_CALL(zeCommandListAppendMemoryFill,
- (handler.commandList.get(), pDst, pPattern, patternSize, size,
- zeSignalEvent, waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), pDst, pPattern,
+ patternSize, size, zeSignalEvent, waitList.second,
+ waitList.first));
return UR_RESULT_SUCCESS;
}
@@ -777,19 +713,18 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy(
std::scoped_lock lock(this->Mutex);
- auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY);
+ auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY);
auto [pWaitEvents, numWaitEvents] =
getWaitListView(phEventWaitList, numEventsInWaitList);
- auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr;
ZE2UR_CALL(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), pDst, pSrc, size, zeSignalEvent,
- numWaitEvents, pWaitEvents));
+ (commandListManager.getZeCommandList(), pDst, pSrc, size,
+ zeSignalEvent, numWaitEvents, pWaitEvents));
if (blocking) {
ZE2UR_CALL(zeCommandListHostSynchronize,
- (handler.commandList.get(), UINT64_MAX));
+ (commandListManager.getZeCommandList(), UINT64_MAX));
}
return UR_RESULT_SUCCESS;
@@ -805,22 +740,22 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch(
std::scoped_lock lock(this->Mutex);
- auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH);
+ auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH);
auto [pWaitEvents, numWaitEvents] =
getWaitListView(phEventWaitList, numEventsInWaitList);
if (pWaitEvents) {
- ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
- (handler.commandList.get(), numWaitEvents, pWaitEvents));
+ ZE2UR_CALL(
+ zeCommandListAppendWaitOnEvents,
+ (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents));
}
// TODO: figure out how to translate "flags"
ZE2UR_CALL(zeCommandListAppendMemoryPrefetch,
- (handler.commandList.get(), pMem, size));
-
- if (signalEvent) {
+ (commandListManager.getZeCommandList(), pMem, size));
+ if (zeSignalEvent) {
ZE2UR_CALL(zeCommandListAppendSignalEvent,
- (handler.commandList.get(), signalEvent->getZeEvent()));
+ (commandListManager.getZeCommandList(), zeSignalEvent));
}
return UR_RESULT_SUCCESS;
@@ -838,25 +773,25 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size,
auto zeAdvice = ur_cast(advice);
- auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE);
+ auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE);
auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0);
if (pWaitEvents) {
- ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
- (handler.commandList.get(), numWaitEvents, pWaitEvents));
+ ZE2UR_CALL(
+ zeCommandListAppendWaitOnEvents,
+ (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents));
}
// TODO: figure out how to translate "flags"
ZE2UR_CALL(zeCommandListAppendMemAdvise,
- (handler.commandList.get(), this->hDevice->ZeDevice, pMem, size,
- zeAdvice));
+ (commandListManager.getZeCommandList(), this->hDevice->ZeDevice,
+ pMem, size, zeAdvice));
- if (signalEvent) {
+ if (zeSignalEvent) {
ZE2UR_CALL(zeCommandListAppendSignalEvent,
- (handler.commandList.get(), signalEvent->getZeEvent()));
+ (commandListManager.getZeCommandList(), zeSignalEvent));
}
-
return UR_RESULT_SUCCESS;
}
@@ -1058,15 +993,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
zeThreadGroupDimensions, WG, workDim,
pGlobalWorkSize, pLocalWorkSize));
- auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH);
+ auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH);
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
bool memoryMigrated = false;
auto memoryMigrate = [&](void *src, void *dst, size_t size) {
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
- (handler.commandList.get(), dst, src, size, nullptr,
- waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), dst, src, size,
+ nullptr, waitList.second, waitList.first));
memoryMigrated = true;
};
@@ -1083,10 +1018,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::"
"zeCommandListAppendLaunchCooperativeKernel");
- auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr;
ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel,
- (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions,
- zeSignalEvent, waitList.second, waitList.first));
+ (commandListManager.getZeCommandList(), hZeKernel,
+ &zeThreadGroupDimensions, zeSignalEvent, waitList.second,
+ waitList.first));
recordSubmittedKernel(hKernel);
@@ -1101,33 +1036,56 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp(
std::scoped_lock lock(this->Mutex);
- auto signalEvent =
- getSignalEvent(phEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP);
-
- if (!signalEvent) {
+ if (!phEvent && !*phEvent) {
return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
}
-
+ getSignalEvent(phEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP);
auto [pWaitEvents, numWaitEvents] =
getWaitListView(phEventWaitList, numEventsInWaitList);
- signalEvent->recordStartTimestamp();
+ (*phEvent)->recordStartTimestamp();
auto [timestampPtr, zeSignalEvent] =
- signalEvent->getEventEndTimestampAndHandle();
+ (*phEvent)->getEventEndTimestampAndHandle();
ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp,
- (handler.commandList.get(), timestampPtr, zeSignalEvent,
- numWaitEvents, pWaitEvents));
+ (commandListManager.getZeCommandList(), timestampPtr,
+ zeSignalEvent, numWaitEvents, pWaitEvents));
if (blocking) {
ZE2UR_CALL(zeCommandListHostSynchronize,
- (handler.commandList.get(), UINT64_MAX));
+ (commandListManager.getZeCommandList(), UINT64_MAX));
}
return UR_RESULT_SUCCESS;
}
+ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp(
+ uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists,
+ ur_event_handle_t *phEvent, uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand) {
+
+ std::scoped_lock Lock(this->Mutex);
+ auto zeSignalEvent = getSignalEvent(phEvent, callerCommand);
+
+ auto [pWaitEvents, numWaitEvents] =
+ getWaitListView(phEventWaitList, numEventsInWaitList);
+
+ ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp,
+ (commandListManager.getZeCommandList(), numCommandLists,
+ phCommandLists, zeSignalEvent, numWaitEvents, pWaitEvents));
+
+ return UR_RESULT_SUCCESS;
+}
+
+ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBuffer(
+ ze_command_list_handle_t commandBufferCommandList,
+ ur_event_handle_t *phEvent, uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList) {
+ return enqueueGenericCommandListsExp(1, &commandBufferCommandList, phEvent,
+ numEventsInWaitList, phEventWaitList,
+ UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP);
+}
ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp(
ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
index e0d7f747b3..6cf8b0c51c 100644
--- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
+++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
@@ -19,33 +19,19 @@
#include "ur/ur.hpp"
+#include "command_list_manager.hpp"
+
namespace v2 {
using queue_group_type = ur_device_handle_t_::queue_group_info_t::type;
-struct ur_command_list_handler_t {
- ur_command_list_handler_t(ur_context_handle_t hContext,
- ur_device_handle_t hDevice,
- const ur_queue_properties_t *pProps);
-
- ur_command_list_handler_t(ze_command_list_handle_t hZeCommandList,
- bool ownZeHandle);
-
- raii::command_list_unique_handle commandList;
-};
-
struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
private:
ur_context_handle_t hContext;
ur_device_handle_t hDevice;
ur_queue_flags_t flags;
- raii::cache_borrowed_event_pool eventPool;
-
- ur_command_list_handler_t handler;
-
- std::vector waitList;
-
+ ur_command_list_manager commandListManager;
std::vector deferredEvents;
std::vector submittedKernels;
@@ -53,7 +39,7 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
getWaitListView(const ur_event_handle_t *phWaitEvents,
uint32_t numWaitEvents);
- ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent,
+ ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent,
ur_command_t commandType);
void deferEventFree(ur_event_handle_t hEvent) override;
@@ -78,6 +64,11 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent,
ur_command_t commandType);
+ ur_result_t enqueueGenericCommandListsExp(
+ uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists,
+ ur_event_handle_t *phEvent, uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand);
+
ur_result_t
enqueueEventsWaitWithBarrierImpl(uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
@@ -277,6 +268,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) override;
ur_result_t
+ enqueueCommandBuffer(ze_command_list_handle_t commandBufferCommandList,
+ ur_event_handle_t *phEvent, uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList) override;
+ ur_result_t
enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *,
uint32_t, const ur_mem_handle_t *,
const ur_exp_enqueue_native_command_properties_t *,
diff --git a/source/common/umf_pools/disjoint_pool_config_parser.cpp b/source/common/umf_pools/disjoint_pool_config_parser.cpp
index 8d5bc2066e..42c894b412 100644
--- a/source/common/umf_pools/disjoint_pool_config_parser.cpp
+++ b/source/common/umf_pools/disjoint_pool_config_parser.cpp
@@ -174,47 +174,36 @@ DisjointPoolAllConfigs parseDisjointPoolConfig(const std::string &config,
MemParser(Params, M);
};
- size_t MaxSize = (std::numeric_limits::max)();
-
// Update pool settings if specified in environment.
+ size_t MaxSize = (std::numeric_limits::max)();
size_t EnableBuffers = 1;
- if (config != "") {
- std::string Params = config;
- size_t Pos = Params.find(';');
- if (Pos != std::string::npos) {
- if (Pos > 0) {
- GetValue(Params, Pos, EnableBuffers);
+
+ bool EnableBuffersSet = false;
+ bool MaxSizeSet = false;
+ size_t Start = 0;
+ size_t End = config.find(';');
+ while (true) {
+ std::string Param = config.substr(Start, End - Start);
+ if (!EnableBuffersSet && (Param == "" || isdigit(Param[0]))) {
+ if (Param != "") {
+ GetValue(Param, Param.size(), EnableBuffers);
}
- Params.erase(0, Pos + 1);
- size_t Pos = Params.find(';');
- if (Pos != std::string::npos) {
- if (Pos > 0) {
- GetValue(Params, Pos, MaxSize);
- }
- Params.erase(0, Pos + 1);
- do {
- size_t Pos = Params.find(';');
- if (Pos != std::string::npos) {
- if (Pos > 0) {
- std::string MemParams = Params.substr(0, Pos);
- MemTypeParser(MemParams);
- }
- Params.erase(0, Pos + 1);
- if (Params.size() == 0) {
- break;
- }
- } else {
- MemTypeParser(Params);
- break;
- }
- } while (true);
- } else {
- // set MaxPoolSize for all configs
- GetValue(Params, Params.size(), MaxSize);
+ EnableBuffersSet = true;
+ } else if (!MaxSizeSet && (Param == "" || isdigit(Param[0]))) {
+ if (Param != "") {
+ GetValue(Param, Param.size(), MaxSize);
}
+ MaxSizeSet = true;
} else {
- GetValue(Params, Params.size(), EnableBuffers);
+ MemTypeParser(Param);
}
+
+ if (End == std::string::npos) {
+ break;
+ }
+
+ Start = End + 1;
+ End = config.find(';', Start);
}
AllConfigs.EnableBuffers = EnableBuffers;
diff --git a/test/adapters/cuda/kernel_tests.cpp b/test/adapters/cuda/kernel_tests.cpp
index 0f7f3351fe..7b83459c5f 100644
--- a/test/adapters/cuda/kernel_tests.cpp
+++ b/test/adapters/cuda/kernel_tests.cpp
@@ -153,7 +153,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSimple) {
int number = 10;
ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number));
- const auto &kernelArgs = kernel->getArgIndices();
+ const auto &kernelArgs = kernel->getArgPointers();
ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA);
int storedValue = *static_cast(kernelArgs[0]);
@@ -175,7 +175,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) {
int number = 10;
ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number));
- const auto &kernelArgs = kernel->getArgIndices();
+ const auto &kernelArgs = kernel->getArgPointers();
ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA);
int storedValue = *static_cast(kernelArgs[0]);
ASSERT_EQ(storedValue, number);
@@ -183,7 +183,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) {
int otherNumber = 934;
ASSERT_SUCCESS(
urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &otherNumber));
- const auto kernelArgs2 = kernel->getArgIndices();
+ const auto kernelArgs2 = kernel->getArgPointers();
ASSERT_EQ(kernelArgs2.size(), 1 + NumberOfImplicitArgsCUDA);
storedValue = *static_cast(kernelArgs2[0]);
ASSERT_EQ(storedValue, otherNumber);
diff --git a/test/adapters/level_zero/event_cache_tests.cpp b/test/adapters/level_zero/event_cache_tests.cpp
index 14466ab805..09fcff7373 100644
--- a/test/adapters/level_zero/event_cache_tests.cpp
+++ b/test/adapters/level_zero/event_cache_tests.cpp
@@ -30,7 +30,7 @@ static std::shared_ptr<_zel_tracer_handle_t> tracer = [] {
zel_core_callbacks_t prologue_callbacks{};
prologue_callbacks.Event.pfnCreateCb = OnEnterEventCreate;
prologue_callbacks.Event.pfnDestroyCb = OnEnterEventDestroy;
- return enableTracing(prologue_callbacks, {});
+ return enableTracing(std::move(prologue_callbacks), {});
}();
template auto combineFlags(std::tuple tuple) {
diff --git a/test/adapters/level_zero/multi_device_event_cache_tests.cpp b/test/adapters/level_zero/multi_device_event_cache_tests.cpp
index f0cc261bb4..bc88eb2a02 100644
--- a/test/adapters/level_zero/multi_device_event_cache_tests.cpp
+++ b/test/adapters/level_zero/multi_device_event_cache_tests.cpp
@@ -20,7 +20,7 @@ static std::shared_ptr<_zel_tracer_handle_t> tracer = [] {
zel_core_callbacks_t prologue_callbacks{};
prologue_callbacks.CommandList.pfnAppendWaitOnEventsCb =
OnAppendWaitOnEventsCb;
- return enableTracing(prologue_callbacks, {});
+ return enableTracing(std::move(prologue_callbacks), {});
}();
using urMultiQueueMultiDeviceEventCacheTest = uur::urAllDevicesTest;
diff --git a/test/adapters/level_zero/ze_tracer_common.hpp b/test/adapters/level_zero/ze_tracer_common.hpp
index bf93c71fbb..8aa93c7c13 100644
--- a/test/adapters/level_zero/ze_tracer_common.hpp
+++ b/test/adapters/level_zero/ze_tracer_common.hpp
@@ -11,9 +11,9 @@
#include
-std::shared_ptr<_zel_tracer_handle_t>
-enableTracing(zel_core_callbacks_t prologueCallbacks,
- zel_core_callbacks_t epilogueCallbacks) {
+inline std::shared_ptr<_zel_tracer_handle_t>
+enableTracing(zel_core_callbacks_t &&prologueCallbacks,
+ zel_core_callbacks_t &&epilogueCallbacks) {
EXPECT_EQ(zeInit(ZE_INIT_FLAG_GPU_ONLY), ZE_RESULT_SUCCESS);
zel_tracer_desc_t tracer_desc = {ZEL_STRUCTURE_TYPE_TRACER_EXP_DESC, nullptr,
diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
index 559a61e3ad..d55094a52c 100644
--- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp
+++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
@@ -1094,3 +1094,220 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) {
uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
Validate(new_output, new_X, new_Y, new_A, global_size, local_size);
}
+
+struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase {
+ virtual void SetUp() override {
+ program_name = "saxpy_usm_local_mem";
+ UUR_RETURN_ON_FATAL_FAILURE(
+ urUpdatableCommandBufferExpExecutionTest::SetUp());
+
+ if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) {
+ GTEST_SKIP()
+ << "Local memory argument update not supported on Level Zero.";
+ }
+
+ // HIP has extra args for local memory so we define an offset for arg
+ // indices here for updating
+ hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0;
+ ur_device_usm_access_capability_flags_t shared_usm_flags;
+ ASSERT_SUCCESS(
+ uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags));
+ if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) {
+ GTEST_SKIP() << "Shared USM is not supported.";
+ }
+
+ const size_t allocation_size = sizeof(uint32_t) * global_size * local_size;
+ for (auto &shared_ptr : shared_ptrs) {
+ ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr,
+ allocation_size, &shared_ptr));
+ ASSERT_NE(shared_ptr, nullptr);
+
+ std::vector pattern(allocation_size);
+ uur::generateMemFillPattern(pattern);
+ std::memcpy(shared_ptr, pattern.data(), allocation_size);
+ }
+
+ std::array index_order{};
+ if (backend != UR_PLATFORM_BACKEND_HIP) {
+ index_order = {3, 2, 4, 5, 1, 0};
+ } else {
+ index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+ }
+ size_t current_index = 0;
+
+ // Index 3 is A
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(A), nullptr, &A));
+ // Index 2 is output
+ ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++],
+ nullptr, shared_ptrs[0]));
+
+ // Index 4 is X
+ ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++],
+ nullptr, shared_ptrs[1]));
+ // Index 5 is Y
+ ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++],
+ nullptr, shared_ptrs[2]));
+
+ // Index 1 is local_mem_b arg
+ ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++],
+ local_mem_b_size, nullptr));
+ if (backend == UR_PLATFORM_BACKEND_HIP) {
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ }
+
+ // Index 0 is local_mem_a arg
+ ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++],
+ local_mem_a_size, nullptr));
+
+ // Hip has extra args for local mem at index 1-3
+ if (backend == UR_PLATFORM_BACKEND_HIP) {
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ }
+ }
+};
+
+struct LocalMemoryUpdateTestOutOfOrder : LocalMemoryUpdateTestBaseOutOfOrder {
+ void SetUp() override {
+ UUR_RETURN_ON_FATAL_FAILURE(LocalMemoryUpdateTestBaseOutOfOrder::SetUp());
+
+ // Append kernel command to command-buffer and close command-buffer
+ ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(
+ updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset,
+ &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, nullptr,
+ nullptr, &command_handle));
+ ASSERT_NE(command_handle, nullptr);
+
+ ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle));
+ }
+
+ void TearDown() override {
+ if (command_handle) {
+ EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle));
+ }
+
+ UUR_RETURN_ON_FATAL_FAILURE(
+ LocalMemoryUpdateTestBaseOutOfOrder::TearDown());
+ }
+
+ ur_exp_command_buffer_command_handle_t command_handle = nullptr;
+};
+
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder);
+
+// Test updating A,X,Y parameters to new values and local memory to larger
+// values when the kernel arguments were added out of order.
+TEST_P(LocalMemoryUpdateTestOutOfOrder, UpdateAllParameters) {
+ // Run command-buffer prior to update and verify output
+ ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+ nullptr, nullptr));
+ ASSERT_SUCCESS(urQueueFinish(queue));
+
+ uint32_t *output = (uint32_t *)shared_ptrs[0];
+ uint32_t *X = (uint32_t *)shared_ptrs[1];
+ uint32_t *Y = (uint32_t *)shared_ptrs[2];
+ Validate(output, X, Y, A, global_size, local_size);
+
+ // Update inputs
+ std::array
+ new_input_descs;
+ std::array new_value_descs;
+
+ size_t new_local_size = local_size * 4;
+ size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t);
+
+ // New local_mem_a at index 0
+ new_value_descs[0] = {
+ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+ nullptr, // pNext
+ 0, // argIndex
+ new_local_mem_a_size, // argSize
+ nullptr, // pProperties
+ nullptr, // hArgValue
+ };
+
+ // New local_mem_b at index 1
+ new_value_descs[1] = {
+ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+ nullptr, // pNext
+ 1 + hip_arg_offset, // argIndex
+ local_mem_b_size, // argSize
+ nullptr, // pProperties
+ nullptr, // hArgValue
+ };
+
+ // New A at index 3
+ uint32_t new_A = 33;
+ new_value_descs[2] = {
+ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+ nullptr, // pNext
+ 3 + (2 * hip_arg_offset), // argIndex
+ sizeof(new_A), // argSize
+ nullptr, // pProperties
+ &new_A, // hArgValue
+ };
+
+ // New X at index 4
+ new_input_descs[0] = {
+ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+ nullptr, // pNext
+ 4 + (2 * hip_arg_offset), // argIndex
+ nullptr, // pProperties
+ &shared_ptrs[3], // pArgValue
+ };
+
+ // New Y at index 5
+ new_input_descs[1] = {
+ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+ nullptr, // pNext
+ 5 + (2 * hip_arg_offset), // argIndex
+ nullptr, // pProperties
+ &shared_ptrs[4], // pArgValue
+ };
+
+ // Update kernel inputs
+ ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
+ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+ nullptr, // pNext
+ kernel, // hNewKernel
+ 0, // numNewMemObjArgs
+ new_input_descs.size(), // numNewPointerArgs
+ new_value_descs.size(), // numNewValueArgs
+ n_dimensions, // newWorkDim
+ nullptr, // pNewMemObjArgList
+ new_input_descs.data(), // pNewPointerArgList
+ new_value_descs.data(), // pNewValueArgList
+ nullptr, // pNewGlobalWorkOffset
+ nullptr, // pNewGlobalWorkSize
+ nullptr, // pNewLocalWorkSize
+ };
+
+ // Update kernel and enqueue command-buffer again
+ ASSERT_SUCCESS(
+ urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
+ ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+ nullptr, nullptr));
+ ASSERT_SUCCESS(urQueueFinish(queue));
+
+ // Verify that update occurred correctly
+ uint32_t *new_output = (uint32_t *)shared_ptrs[0];
+ uint32_t *new_X = (uint32_t *)shared_ptrs[3];
+ uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
+ Validate(new_output, new_X, new_Y, new_A, global_size, local_size);
+}
\ No newline at end of file
diff --git a/test/conformance/kernel/urKernelGetInfo.cpp b/test/conformance/kernel/urKernelGetInfo.cpp
index 4748f8f96a..65694b5bdd 100644
--- a/test/conformance/kernel/urKernelGetInfo.cpp
+++ b/test/conformance/kernel/urKernelGetInfo.cpp
@@ -121,7 +121,7 @@ TEST_P(urKernelGetInfoTest, SuccessAttributes) {
}
TEST_P(urKernelGetInfoTest, SuccessNumRegs) {
- UUR_KNOWN_FAILURE_ON(uur::HIP{}, uur::OpenCL{});
+ UUR_KNOWN_FAILURE_ON(uur::HIP{});
ur_kernel_info_t property_name = UR_KERNEL_INFO_NUM_REGS;
size_t property_size = 0;
diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp
index 688724ec09..f056d025bc 100644
--- a/test/conformance/kernel/urKernelSetArgLocal.cpp
+++ b/test/conformance/kernel/urKernelSetArgLocal.cpp
@@ -237,3 +237,102 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) {
Validate(output, X, Y, A, global_size, new_local_size);
}
+
+// Tests that adding arguments out of order (e.g. index 1 before index 0) works.
+struct urKernelSetArgLocalOutOfOrder : urKernelSetArgLocalMultiTest {
+ void SetUp() override {
+ program_name = "saxpy_usm_local_mem";
+ UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+
+ ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND,
+ sizeof(backend), &backend, nullptr));
+
+ // HIP has extra args for local memory so we define an offset for arg
+ // indices here for updating
+ hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0;
+ ur_device_usm_access_capability_flags_t shared_usm_flags;
+ ASSERT_SUCCESS(
+ uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags));
+ if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) {
+ GTEST_SKIP() << "Shared USM is not supported.";
+ }
+
+ const size_t allocation_size = sizeof(uint32_t) * global_size * local_size;
+ for (auto &shared_ptr : shared_ptrs) {
+ ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr,
+ allocation_size, &shared_ptr));
+ ASSERT_NE(shared_ptr, nullptr);
+
+ std::vector pattern(allocation_size);
+ uur::generateMemFillPattern(pattern);
+ std::memcpy(shared_ptr, pattern.data(), allocation_size);
+ }
+
+ std::array index_order{};
+ if (backend != UR_PLATFORM_BACKEND_HIP) {
+ index_order = {3, 2, 4, 5, 1, 0};
+ } else {
+ index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+ }
+ size_t current_index = 0;
+
+ // Index 3 is A
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(A), nullptr, &A));
+ // Index 2 is output
+ ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++],
+ nullptr, shared_ptrs[0]));
+
+ // Index 4 is X
+ ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++],
+ nullptr, shared_ptrs[1]));
+ // Index 5 is Y
+ ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++],
+ nullptr, shared_ptrs[2]));
+
+ // Index 1 is local_mem_b arg
+ ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++],
+ local_mem_b_size, nullptr));
+ if (backend == UR_PLATFORM_BACKEND_HIP) {
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ }
+
+ // Index 0 is local_mem_a arg
+ ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++],
+ local_mem_a_size, nullptr));
+
+ // Hip has extra args for local mem at index 1-3
+ if (backend == UR_PLATFORM_BACKEND_HIP) {
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
+ sizeof(hip_local_offset), nullptr,
+ &hip_local_offset));
+ }
+ }
+};
+
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelSetArgLocalOutOfOrder);
+TEST_P(urKernelSetArgLocalOutOfOrder, Success) {
+ ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
+ &global_offset, &global_size,
+ &local_size, 0, nullptr, nullptr));
+ ASSERT_SUCCESS(urQueueFinish(queue));
+
+ uint32_t *output = (uint32_t *)shared_ptrs[0];
+ uint32_t *X = (uint32_t *)shared_ptrs[1];
+ uint32_t *Y = (uint32_t *)shared_ptrs[2];
+ Validate(output, X, Y, A, global_size, local_size);
+}
diff --git a/test/usm/usmPoolManager.cpp b/test/usm/usmPoolManager.cpp
index ec52d00c5e..4e82196eef 100644
--- a/test/usm/usmPoolManager.cpp
+++ b/test/usm/usmPoolManager.cpp
@@ -4,6 +4,7 @@
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "umf_pools/disjoint_pool_config_parser.hpp"
#include "ur_pool_manager.hpp"
#include
@@ -18,6 +19,26 @@ auto createMockPoolHandle() {
[](umf_memory_pool_t *) {});
}
+bool compareConfig(const usm::umf_disjoint_pool_config_t &left,
+ usm::umf_disjoint_pool_config_t &right) {
+ return left.MaxPoolableSize == right.MaxPoolableSize &&
+ left.Capacity == right.Capacity &&
+ left.SlabMinSize == right.SlabMinSize;
+}
+
+bool compareConfigs(const usm::DisjointPoolAllConfigs &left,
+ usm::DisjointPoolAllConfigs &right) {
+ return left.EnableBuffers == right.EnableBuffers &&
+ compareConfig(left.Configs[usm::DisjointPoolMemType::Host],
+ right.Configs[usm::DisjointPoolMemType::Host]) &&
+ compareConfig(left.Configs[usm::DisjointPoolMemType::Device],
+ right.Configs[usm::DisjointPoolMemType::Device]) &&
+ compareConfig(left.Configs[usm::DisjointPoolMemType::Shared],
+ right.Configs[usm::DisjointPoolMemType::Shared]) &&
+ compareConfig(left.Configs[usm::DisjointPoolMemType::SharedReadOnly],
+ right.Configs[usm::DisjointPoolMemType::SharedReadOnly]);
+}
+
TEST_P(urUsmPoolDescriptorTest, poolIsPerContextTypeAndDevice) {
auto &devices = uur::DevicesEnvironment::instance->devices;
@@ -111,4 +132,49 @@ TEST_P(urUsmPoolManagerTest, poolManagerGetNonexistant) {
}
}
+TEST_P(urUsmPoolManagerTest, config) {
+ // Check default config
+ usm::DisjointPoolAllConfigs def;
+ usm::DisjointPoolAllConfigs parsed1 =
+ usm::parseDisjointPoolConfig("1;host:2M,4,64K;device:4M,4,64K;"
+ "shared:0,0,2M;read_only_shared:4M,4,2M",
+ 0);
+ ASSERT_EQ(compareConfigs(def, parsed1), true);
+
+ // Check partially set config
+ usm::DisjointPoolAllConfigs part1 =
+ usm::parseDisjointPoolConfig("1;device:4M;shared:0,0,2M", 0);
+ ASSERT_EQ(compareConfigs(def, part1), true);
+
+ // Check partially set config #2
+ usm::DisjointPoolAllConfigs part2 =
+ usm::parseDisjointPoolConfig(";device:4M;shared:0,0,2M", 0);
+ ASSERT_EQ(compareConfigs(def, part2), true);
+
+ // Check partially set config #3
+ usm::DisjointPoolAllConfigs part3 =
+ usm::parseDisjointPoolConfig(";shared:0,0,2M", 0);
+ ASSERT_EQ(compareConfigs(def, part3), true);
+
+ // Check partially set config #4
+ usm::DisjointPoolAllConfigs part4 =
+ usm::parseDisjointPoolConfig(";device:4M", 0);
+ ASSERT_EQ(compareConfigs(def, part4), true);
+
+ // Check partially set config #5
+ usm::DisjointPoolAllConfigs part5 =
+ usm::parseDisjointPoolConfig(";;device:4M,4,64K", 0);
+ ASSERT_EQ(compareConfigs(def, part5), true);
+
+ // Check non-default config
+ usm::DisjointPoolAllConfigs test(def);
+ test.Configs[usm::DisjointPoolMemType::Shared].MaxPoolableSize = 128 * 1024;
+ test.Configs[usm::DisjointPoolMemType::Shared].Capacity = 4;
+ test.Configs[usm::DisjointPoolMemType::Shared].SlabMinSize = 64 * 1024;
+
+ usm::DisjointPoolAllConfigs parsed3 =
+ usm::parseDisjointPoolConfig("1;shared:128K,4,64K", 0);
+ ASSERT_EQ(compareConfigs(test, parsed3), true);
+}
+
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urUsmPoolManagerTest);