diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml index f7c660f3bd..bfd1e971f9 100644 --- a/.github/workflows/benchmarks-reusable.yml +++ b/.github/workflows/benchmarks-reusable.yml @@ -237,7 +237,7 @@ jobs: - name: Upload HTML report if: ${{ always() && inputs.upload_report }} - uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 with: path: ur-repo/benchmark_results.html key: benchmark-results-${{ matrix.adapter.str_name }}-${{ github.run_id }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b4c40334d4..fbd4ffefef 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -52,7 +52,7 @@ jobs: - name: Download benchmark HTML id: download-bench-html - uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 with: path: ur-repo/benchmark_results.html key: benchmark-results- diff --git a/Testing/Temporary/CTestCostData.txt b/Testing/Temporary/CTestCostData.txt new file mode 100644 index 0000000000..ed97d539c0 --- /dev/null +++ b/Testing/Temporary/CTestCostData.txt @@ -0,0 +1 @@ +--- diff --git a/ft.sh b/ft.sh new file mode 100644 index 0000000000..f7f73081f5 --- /dev/null +++ b/ft.sh @@ -0,0 +1,21 @@ +# add_test([=[enqueue-adapter_level_zero_v2]=] "/home/mateuszpn/pr2532/build/bin/test-enqueue" "--gtest_filter=*Level_Zero*") +# set_tests_properties([=[enqueue-adapter_level_zero_v2]=] PROPERTIES ENVIRONMENT +# "UR_ADAPTERS_FORCE_LOAD=\"/home/mateuszpn/pr2532/build/lib/libur_adapter_level_zero_v2.so.0.12.0\"" +# "LABELS "conformance;adapter_level_zero_v2" +# WORKING_DIRECTORY "/home/mateuszpn/pr2532/build/test/conformance/enqueue" _BACKTRACE_TRIPLES "/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;22;add_test;/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;32;do_add_test;/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;67;add_test_adapter;/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;78;add_conformance_test;/home/mateuszpn/pr2532/test/conformance/enqueue/CMakeLists.txt;6;add_conformance_test_with_kernels_environment;/home/mateuszpn/pr2532/test/conformance/enqueue/CMakeLists.txt;0;") + +# Set environment variable +export UR_ADAPTERS_FORCE_LOAD="/home/mateuszpn/pr2532/build/lib/libur_adapter_level_zero_v2.so.0.12.0" + +# Set working directory +#cd /home/mateuszpn/pr2532/build/test/conformance/$1 + +# Run the test with the specified filter +#/home/mateuszpn/pr2532/build/bin/test-$1 --gtest_filter=*Level_Zero* + +# Set working directory +cd /home/mateuszpn/pr2532/build/test/conformance/enqueue + +# Run the test with the specified filter +#/home/mateuszpn/pr2532/build/bin/test-enqueue --gtest_filter=urEnqueueMemBufferMapTestWithParam.MapSignalEvent*Level_Zero* +/home/mateuszpn/pr2532/build/bin/test-enqueue --gtest_filter=*Level_Zero* \ No newline at end of file diff --git a/scripts/benchmarks/output_markdown.py b/scripts/benchmarks/output_markdown.py index 13df68d45e..fc3b65507b 100644 --- a/scripts/benchmarks/output_markdown.py +++ b/scripts/benchmarks/output_markdown.py @@ -27,7 +27,7 @@ def generate_markdown_details(results: list[Result]): markdown_sections.append(f"""
-Benchmark details - environment, command, output... +Benchmark details - environment, command... """) for res in results: @@ -42,9 +42,6 @@ def generate_markdown_details(results: list[Result]): #### Command: {' '.join(res.command)} -#### Output: -{res.stdout} -
""") markdown_sections.append(f""" diff --git a/scripts/templates/queue_api.cpp.mako b/scripts/templates/queue_api.cpp.mako index 89f857e007..14def952ac 100644 --- a/scripts/templates/queue_api.cpp.mako +++ b/scripts/templates/queue_api.cpp.mako @@ -20,6 +20,8 @@ from templates import helper as th * */ +// Do not edit. This file is auto generated from a template: scripts/templates/queue_api.cpp.mako + #include "queue_api.hpp" #include "ur_util.hpp" diff --git a/scripts/templates/queue_api.hpp.mako b/scripts/templates/queue_api.hpp.mako index 352abbeb43..46ed74ed33 100644 --- a/scripts/templates/queue_api.hpp.mako +++ b/scripts/templates/queue_api.hpp.mako @@ -20,9 +20,12 @@ from templates import helper as th * */ +// Do not edit. This file is auto generated from a template: scripts/templates/queue_api.hpp.mako + #pragma once #include +#include struct ur_queue_handle_t_ { virtual ~ur_queue_handle_t_(); @@ -32,4 +35,8 @@ struct ur_queue_handle_t_ { %for obj in th.get_queue_related_functions(specs, n, tags): virtual ${x}_result_t ${th.transform_queue_related_function_name(n, tags, obj, format=["type"])} = 0; %endfor + + virtual ur_result_t + enqueueCommandBuffer(ze_command_list_handle_t, ur_event_handle_t *, + uint32_t, const ur_event_handle_t *) = 0; }; diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 05c20a6614..37018dde6c 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -523,7 +523,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ThreadsPerBlock, BlocksPerGrid)); // Set node param structure with the kernel related data - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); CUDA_KERNEL_NODE_PARAMS NodeParams = {}; NodeParams.func = CuFunc; NodeParams.gridDimX = BlocksPerGrid[0]; @@ -533,7 +533,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( NodeParams.blockDimY = ThreadsPerBlock[1]; NodeParams.blockDimZ = ThreadsPerBlock[2]; NodeParams.sharedMemBytes = LocalSize; - NodeParams.kernelParams = const_cast(ArgIndices.data()); + NodeParams.kernelParams = const_cast(ArgPointers.data()); // Create and add an new kernel node to the Cuda graph UR_CHECK_ERROR(cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph, @@ -1398,7 +1398,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDimZ = ThreadsPerBlock[2]; Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize(); Params.kernelParams = - const_cast(KernelCommandHandle->Kernel->getArgIndices().data()); + const_cast(KernelCommandHandle->Kernel->getArgPointers().data()); CUgraphNode Node = KernelCommandHandle->Node; CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec; diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 2a4a2cf54f..540ebb86fa 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -492,11 +492,11 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, - CuStream, const_cast(ArgIndices.data()), nullptr)); + CuStream, const_cast(ArgPointers.data()), nullptr)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); @@ -680,7 +680,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); CUlaunchConfig launch_config; launch_config.gridDimX = BlocksPerGrid[0]; @@ -696,7 +696,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.numAttrs = launch_attribute.size(); UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, - const_cast(ArgIndices.data()), + const_cast(ArgPointers.data()), nullptr)); if (phEvent) { diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index c11a85b293..87570e3b45 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -533,8 +533,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( image_res_desc.resType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; image_res_desc.res.mipmap.hMipmappedArray = (CUmipmappedArray)hImageMem; } - } else if (mem_type == CU_MEMORYTYPE_DEVICE) { - // We have a USM pointer + } else if (mem_type == CU_MEMORYTYPE_DEVICE || + mem_type == CU_MEMORYTYPE_HOST) { + // We have a USM pointer. + // Images may be created from device or host USM. if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { image_res_desc.resType = CU_RESOURCE_TYPE_LINEAR; image_res_desc.res.linear.devPtr = (CUdeviceptr)hImageMem; diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index d1b3b61244..f299714b02 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -66,8 +66,10 @@ struct ur_kernel_handle_t_ { args_t Storage; /// Aligned size of each parameter, including padding. args_size_t ParamSizes; - /// Byte offset into /p Storage allocation for each parameter. - args_index_t Indices; + /// Byte offset into /p Storage allocation for each argument. + args_index_t ArgPointers; + /// Position in the Storage array where the next argument should added. + size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has /// been added. Zero if the argument at the index isn't a local memory /// argument. @@ -90,33 +92,43 @@ struct ur_kernel_handle_t_ { std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; arguments() { - // Place the implicit offset index at the end of the indicies collection - Indices.emplace_back(&ImplicitOffsetArgs); + // Place the implicit offset index at the end of the ArgPointers + // collection. + ArgPointers.emplace_back(&ImplicitOffsetArgs); } /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. + /// Implicit offset argument is kept at the back of the ArgPointers + /// collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { - if (Index + 2 > Indices.size()) { + // Expand storage to accommodate this Index if needed. + if (Index + 2 > ArgPointers.size()) { // Move implicit offset argument index with the end - Indices.resize(Index + 2, Indices.back()); + ArgPointers.resize(Index + 2, ArgPointers.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - AlignedLocalMemSize[Index] = LocalSize; + + // Copy new argument to storage if it hasn't been added before. + if (ParamSizes[Index] == 0) { + ParamSizes[Index] = Size; + std::memcpy(&Storage[InsertPos], Arg, Size); + ArgPointers[Index] = &Storage[InsertPos]; + AlignedLocalMemSize[Index] = LocalSize; + InsertPos += Size; + } + // Otherwise, update the existing argument. + else { + std::memcpy(ArgPointers[Index], Arg, Size); + AlignedLocalMemSize[Index] = LocalSize; + assert(Size == ParamSizes[Index]); + } } /// Returns the padded size and offset of a local memory argument. @@ -128,7 +140,7 @@ struct ur_kernel_handle_t_ { std::pair calcAlignedLocalArgument(size_t Index, size_t Size) { // Store the unpadded size of the local argument - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } @@ -158,10 +170,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - // Iterate over all existing local argument which follows StartIndex + // Iterate over each existing local argument which follows StartIndex // index, update the offset and pointer into the kernel local memory. void updateLocalArgOffset(size_t StartIndex) { - const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + const size_t NumArgs = + ArgPointers.size() - 1; // Accounts for implicit arg for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { @@ -177,10 +190,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - const size_t InsertPos = - std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + SuccIndex, size_t{0}); - std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } @@ -228,7 +238,7 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - const args_index_t &getIndices() const noexcept { return Indices; } + const args_index_t &getArgPointers() const noexcept { return ArgPointers; } uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), @@ -299,7 +309,7 @@ struct ur_kernel_handle_t_ { /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API uint32_t getNumArgs() const noexcept { - return static_cast(Args.Indices.size() - 1); + return static_cast(Args.ArgPointers.size() - 1); } void setKernelArg(int Index, size_t Size, const void *Arg) { @@ -314,8 +324,8 @@ struct ur_kernel_handle_t_ { return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); + const arguments::args_index_t &getArgPointers() const { + return Args.getArgPointers(); } void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); } diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 09c59bb9f7..887eb75287 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -378,7 +378,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid)); // Set node param structure with the kernel related data - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); hipKernelNodeParams NodeParams; NodeParams.func = HIPFunc; NodeParams.gridDim.x = BlocksPerGrid[0]; @@ -388,7 +388,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( NodeParams.blockDim.y = ThreadsPerBlock[1]; NodeParams.blockDim.z = ThreadsPerBlock[2]; NodeParams.sharedMemBytes = LocalSize; - NodeParams.kernelParams = const_cast(ArgIndices.data()); + NodeParams.kernelParams = const_cast(ArgPointers.data()); NodeParams.extra = nullptr; // Create and add an new kernel node to the HIP graph @@ -1098,7 +1098,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDim.z = ThreadsPerBlock[2]; Params.sharedMemBytes = hCommand->Kernel->getLocalSize(); Params.kernelParams = - const_cast(hCommand->Kernel->getArgIndices().data()); + const_cast(hCommand->Kernel->getArgPointers().data()); hipGraphNode_t Node = hCommand->Node; hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec; diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 8c7c1c617d..849369de4b 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } - auto ArgIndices = hKernel->getArgIndices(); + auto ArgPointers = hKernel->getArgPointers(); // If migration of mem across buffer is needed, an event must be associated // with this command, implicitly if phEvent is nullptr @@ -322,7 +322,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(hipModuleLaunchKernel( HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], - hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr)); + hKernel->getLocalSize(), HIPStream, ArgPointers.data(), nullptr)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index c6d30e81ad..5ec51e7fa4 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -61,8 +61,10 @@ struct ur_kernel_handle_t_ { args_t Storage; /// Aligned size of each parameter, including padding. args_size_t ParamSizes; - /// Byte offset into /p Storage allocation for each parameter. - args_index_t Indices; + /// Byte offset into /p Storage allocation for each argument. + args_index_t ArgPointers; + /// Position in the Storage array where the next argument should added. + size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has /// been added. Zero if the argument at the index isn't a local memory /// argument. @@ -85,32 +87,41 @@ struct ur_kernel_handle_t_ { arguments() { // Place the implicit offset index at the end of the indicies collection - Indices.emplace_back(&ImplicitOffsetArgs); + ArgPointers.emplace_back(&ImplicitOffsetArgs); } /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. + /// Implicit offset argument is kept at the back of the ArgPointers + /// collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { - if (Index + 2 > Indices.size()) { - // Move implicit offset argument Index with the end - Indices.resize(Index + 2, Indices.back()); + // Expand storage to accommodate this Index if needed. + if (Index + 2 > ArgPointers.size()) { + // Move implicit offset argument index with the end + ArgPointers.resize(Index + 2, ArgPointers.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - AlignedLocalMemSize[Index] = LocalSize; + + // Copy new argument to storage if it hasn't been added before. + if (ParamSizes[Index] == 0) { + ParamSizes[Index] = Size; + std::memcpy(&Storage[InsertPos], Arg, Size); + ArgPointers[Index] = &Storage[InsertPos]; + AlignedLocalMemSize[Index] = LocalSize; + InsertPos += Size; + } + // Otherwise, update the existing argument. + else { + std::memcpy(ArgPointers[Index], Arg, Size); + AlignedLocalMemSize[Index] = LocalSize; + assert(Size == ParamSizes[Index]); + } } /// Returns the padded size and offset of a local memory argument. @@ -122,7 +133,7 @@ struct ur_kernel_handle_t_ { std::pair calcAlignedLocalArgument(size_t Index, size_t Size) { // Store the unpadded size of the local argument - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } @@ -151,20 +162,12 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - void addLocalArg(size_t Index, size_t Size) { - // Get the aligned argument size and offset into local data - auto [AlignedLocalSize, AlignedLocalOffset] = - calcAlignedLocalArgument(Index, Size); - - // Store argument details - addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - AlignedLocalSize); - - // For every existing local argument which follows at later argument - // indices, update the offset and pointer into the kernel local memory. - // Required as padding will need to be recalculated. - const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg - for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + // Iterate over each existing local argument which follows StartIndex + // index, update the offset and pointer into the kernel local memory. + void updateLocalArgOffset(size_t StartIndex) { + const size_t NumArgs = + ArgPointers.size() - 1; // Accounts for implicit arg + for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { // Skip if successor argument isn't a local memory arg @@ -179,14 +182,26 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - const size_t InsertPos = - std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + SuccIndex, size_t{0}); - std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + auto [AlignedLocalSize, AlignedLocalOffset] = + calcAlignedLocalArgument(Index, Size); + + // Store argument details + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + AlignedLocalSize); + + // For every existing local argument which follows at later argument + // indices, update the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. + updateLocalArgOffset(Index + 1); + } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { assert(hMem && "Invalid mem handle"); // To avoid redundancy we are not storing mem obj with index i at index @@ -206,7 +221,7 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - const args_index_t &getIndices() const noexcept { return Indices; } + const args_index_t &getArgPointers() const noexcept { return ArgPointers; } uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), @@ -263,7 +278,7 @@ struct ur_kernel_handle_t_ { /// offset. Note this only returns the current known number of arguments, /// not the real one required by the kernel, since this cannot be queried /// from the HIP Driver API - uint32_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } + uint32_t getNumArgs() const noexcept { return Args.ArgPointers.size() - 1; } void setKernelArg(int Index, size_t Size, const void *Arg) { Args.addArg(Index, Size, Arg); @@ -277,8 +292,8 @@ struct ur_kernel_handle_t_ { return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); + const arguments::args_index_t &getArgPointers() const { + return Args.getArgPointers(); } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 39031a700d..c75c870be7 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -145,7 +145,9 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp # v2-only sources + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_manager.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.hpp @@ -159,7 +161,9 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/api.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_manager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.cpp diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 058f92f8ca..902da42d2c 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -1535,15 +1535,17 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer, * @param CommandList The command-list to append the QueryKernelTimestamps * command to. * @param SignalEvent The event that must be signaled after the profiling is - * finished. This event will contain the profiling information. + * finished. * @param WaitEvent The event that must be waited on before starting the * profiling. + * @param ProfilingEvent The event that will contain the profiling data. * @return UR_RESULT_SUCCESS or an error code on failure. */ ur_result_t appendProfilingQueries(ur_exp_command_buffer_handle_t CommandBuffer, ze_command_list_handle_t CommandList, ur_event_handle_t SignalEvent, - ur_event_handle_t WaitEvent) { + ur_event_handle_t WaitEvent, + ur_event_handle_t ProfilingEvent) { // Multiple submissions of a command buffer implies that we need to save // the event timestamps before resubmiting the command buffer. We // therefore copy these timestamps in a dedicated USM memory section @@ -1556,12 +1558,17 @@ ur_result_t appendProfilingQueries(ur_exp_command_buffer_handle_t CommandBuffer, Profiling->Timestamps = new ze_kernel_timestamp_result_t[Profiling->NumEvents]; + uint32_t NumWaitEvents = WaitEvent ? 1 : 0; + ze_event_handle_t *ZeWaitEventList = + WaitEvent ? &(WaitEvent->ZeEvent) : nullptr; + ze_event_handle_t ZeSignalEvent = + SignalEvent ? SignalEvent->ZeEvent : nullptr; ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps, (CommandList, CommandBuffer->ZeEventsList.size(), CommandBuffer->ZeEventsList.data(), (void *)Profiling->Timestamps, - 0, SignalEvent->ZeEvent, 1, &(WaitEvent->ZeEvent))); + 0, ZeSignalEvent, NumWaitEvents, ZeWaitEventList)); - SignalEvent->CommandData = static_cast(Profiling); + ProfilingEvent->CommandData = static_cast(Profiling); return UR_RESULT_SUCCESS; } @@ -1615,8 +1622,8 @@ ur_result_t enqueueImmediateAppendPath( if (DoProfiling) { UR_CALL(appendProfilingQueries(CommandBuffer, CommandListHelper->first, - *Event, - CommandBuffer->ComputeFinishedEvent)); + *Event, CommandBuffer->ComputeFinishedEvent, + *Event)); } // When the current execution is finished, signal ExecutionFinishedEvent to @@ -1694,10 +1701,15 @@ ur_result_t enqueueWaitEventPath(ur_exp_command_buffer_handle_t CommandBuffer, (ZeCopyCommandQueue, 1, &CommandBuffer->ZeCopyCommandList, nullptr)); } + ZE2UR_CALL(zeCommandListAppendBarrier, + (SignalCommandList->first, nullptr, 1, + &(CommandBuffer->ExecutionFinishedEvent->ZeEvent))); + // Reset the wait-event for the UR command-buffer that is signaled when its // submission dependencies have been satisfied. ZE2UR_CALL(zeCommandListAppendEventReset, (SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); + // Reset the all-reset-event for the UR command-buffer that is signaled when // all events of the main command-list have been reset. ZE2UR_CALL(zeCommandListAppendEventReset, @@ -1705,14 +1717,12 @@ ur_result_t enqueueWaitEventPath(ur_exp_command_buffer_handle_t CommandBuffer, if (DoProfiling) { UR_CALL(appendProfilingQueries(CommandBuffer, SignalCommandList->first, - *Event, - CommandBuffer->ExecutionFinishedEvent)); - } else { - ZE2UR_CALL(zeCommandListAppendBarrier, - (SignalCommandList->first, (*Event)->ZeEvent, 1, - &(CommandBuffer->ExecutionFinishedEvent->ZeEvent))); + nullptr, nullptr, *Event)); } + ZE2UR_CALL(zeCommandListAppendBarrier, + (SignalCommandList->first, (*Event)->ZeEvent, 0, nullptr)); + UR_CALL(Queue->executeCommandList(SignalCommandList, false /*IsBlocking*/, false /*OKToBatchCommand*/)); diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 9ae9bddcb9..edd9687445 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -239,47 +239,6 @@ ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t -urCommandBufferCreateExp(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - const ur_exp_command_buffer_desc_t *pCommandBufferDesc, - ur_exp_command_buffer_handle_t *phCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendKernelLaunchExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, - uint32_t workDim, const size_t *pGlobalWorkOffset, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferAppendUSMMemcpyExp( ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, size_t size, uint32_t numSyncPointsInWaitList, @@ -415,14 +374,6 @@ ur_result_t urCommandBufferAppendUSMAdviseExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urCommandBufferEnqueueExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferRetainCommandExp( ur_exp_command_buffer_command_handle_t hCommand) { logger::error("{} function not implemented!", __FUNCTION__); @@ -443,15 +394,6 @@ ur_result_t urCommandBufferUpdateKernelLaunchExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t -urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, - ur_exp_command_buffer_info_t propName, - size_t propSize, void *pPropValue, - size_t *pPropSizeRet) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferUpdateSignalEventExp( ur_exp_command_buffer_command_handle_t hCommand, ur_event_handle_t *phEvent) { diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp new file mode 100644 index 0000000000..c35d97d76b --- /dev/null +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -0,0 +1,183 @@ +//===--------- command_buffer.cpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "command_buffer.hpp" +#include "../helpers/kernel_helpers.hpp" +#include "../ur_interface_loader.hpp" +#include "logger/ur_logger.hpp" + +namespace { + +// Checks whether zeCommandListImmediateAppendCommandListsExp can be used for a +// given context. +void checkImmediateAppendSupport(ur_context_handle_t context) { + bool DriverSupportsImmediateAppend = + context->getPlatform()->ZeCommandListImmediateAppendExt.Supported; + + if (!DriverSupportsImmediateAppend) { + logger::error("Adapter v2 is used but " + "the current driver does not support the " + "zeCommandListImmediateAppendCommandListsExp entrypoint."); + std::abort(); + } +} + +} // namespace + +ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( + ur_context_handle_t context, ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, + const ur_exp_command_buffer_desc_t *desc) + : commandListManager( + context, device, + std::forward(commandList)), + isUpdatable(desc ? desc->isUpdatable : false) {} + +ur_result_t ur_exp_command_buffer_handle_t_::finalizeCommandBuffer() { + // It is not allowed to append to command list from multiple threads. + std::scoped_lock guard(this->Mutex); + UR_ASSERT(!isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); + // Close the command lists and have them ready for dispatch. + ZE2UR_CALL(zeCommandListClose, (this->commandListManager.getZeCommandList())); + isFinalized = true; + return UR_RESULT_SUCCESS; +} + +namespace ur::level_zero { + +ur_result_t +urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, + const ur_exp_command_buffer_desc_t *commandBufferDesc, + ur_exp_command_buffer_handle_t *commandBuffer) try { + checkImmediateAppendSupport(context); + + if (!context->getPlatform()->ZeMutableCmdListExt.Supported) { + throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; + uint32_t queueGroupOrdinal = + device->QueueGroup[queue_group_type::Compute].ZeOrdinal; + v2::raii::command_list_unique_handle zeCommandList = + context->commandListCache.getRegularCommandList(device->ZeDevice, true, + queueGroupOrdinal, true); + + *commandBuffer = new ur_exp_command_buffer_handle_t_( + context, device, std::move(zeCommandList), commandBufferDesc); + return UR_RESULT_SUCCESS; + +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { + hCommandBuffer->RefCount.increment(); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { + if (!hCommandBuffer->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + delete hCommandBuffer; + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { + UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_CALL(hCommandBuffer->finalizeCommandBuffer()); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendKernelLaunchExp( + ur_exp_command_buffer_handle_t commandBuffer, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *kernelAlternatives, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *syncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *eventWaitList, + ur_exp_command_buffer_sync_point_t *retSyncPoint, ur_event_handle_t *event, + ur_exp_command_buffer_command_handle_t *command) + + try { + // Need to know semantics + // - should they be checked before kernel execution or before kernel + // appending to list if latter then it is easy fix, if former then TODO + std::ignore = numEventsInWaitList; + std::ignore = eventWaitList; + std::ignore = event; + + // sync mechanic can be ignored, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = syncPointWaitList; + std::ignore = retSyncPoint; + + // TODO + std::ignore = numKernelAlternatives; + std::ignore = kernelAlternatives; + std::ignore = command; + UR_CALL(commandBuffer->commandListManager.appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, + nullptr, nullptr)); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferEnqueueExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) try { + return hQueue->enqueueCommandBuffer( + hCommandBuffer->commandListManager.getZeCommandList(), phEvent, + numEventsInWaitList, phEventWaitList); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t +urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, + ur_exp_command_buffer_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) try { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{hCommandBuffer->RefCount.load()}); + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { + ur_exp_command_buffer_desc_t Descriptor{}; + Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + Descriptor.pNext = nullptr; + Descriptor.isUpdatable = hCommandBuffer->isUpdatable; + Descriptor.isInOrder = true; + Descriptor.enableProfiling = hCommandBuffer->isProfilingEnabled; + + return ReturnValue(Descriptor); + } + default: + assert(!"Command-buffer info request not implemented"); + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp new file mode 100644 index 0000000000..5e60d6537f --- /dev/null +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -0,0 +1,51 @@ +//===--------- command_buffer.hpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include "command_list_manager.hpp" +#include "common.hpp" +#include "context.hpp" +#include "kernel.hpp" +#include "queue_api.hpp" +#include + +struct ur_exp_command_buffer_handle_t_ : public _ur_object { + ur_exp_command_buffer_handle_t_( + ur_context_handle_t context, ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, + const ur_exp_command_buffer_desc_t *desc); + + ~ur_exp_command_buffer_handle_t_() = default; + + ur_command_list_manager commandListManager; + + ur_result_t finalizeCommandBuffer(); + // Indicates if command-buffer commands can be updated after it is closed. + const bool isUpdatable = false; + // Command-buffer profiling is enabled. + const bool isProfilingEnabled = false; + +private: + // Indicates if command buffer was finalized. + bool isFinalized = false; +}; + +struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { + ur_exp_command_buffer_command_handle_t_(ur_exp_command_buffer_handle_t, + uint64_t); + +private: + ~ur_exp_command_buffer_command_handle_t_(); + + // Command-buffer of this command. + ur_exp_command_buffer_handle_t commandBuffer; + // L0 command ID identifying this command + uint64_t commandId; +}; diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp new file mode 100644 index 0000000000..987cb462a3 --- /dev/null +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -0,0 +1,115 @@ +//===--------- command_list_cache.hpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "command_list_manager.hpp" +#include "../helpers/kernel_helpers.hpp" +#include "../ur_interface_loader.hpp" +#include "context.hpp" +#include "kernel.hpp" + +ur_command_list_manager::ur_command_list_manager( + ur_context_handle_t context, ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, v2::event_flags_t flags, + ur_queue_handle_t_ *queue) + : context(context), device(device), + eventPool(context->eventPoolCache.borrow(device->Id.value(), flags)), + zeCommandList(std::move(commandList)), queue(queue) { + UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); + UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); +} + +ur_command_list_manager::~ur_command_list_manager() { + ur::level_zero::urContextRelease(context); + ur::level_zero::urDeviceRelease(device); +} + +std::pair +ur_command_list_manager::getWaitListView(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) { + + waitList.resize(numWaitEvents); + for (uint32_t i = 0; i < numWaitEvents; i++) { + waitList[i] = phWaitEvents[i]->getZeEvent(); + } + + return {waitList.data(), static_cast(numWaitEvents)}; +} + +ze_event_handle_t +ur_command_list_manager::getSignalEvent(ur_event_handle_t *hUserEvent, + ur_command_t commandType) { + if (hUserEvent && queue) { + *hUserEvent = eventPool->allocate(); + (*hUserEvent)->resetQueueAndCommand(queue, commandType); + return (*hUserEvent)->getZeEvent(); + } else { + return nullptr; + } +} + +ur_result_t ur_command_list_manager::appendKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendKernelLaunch"); + + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(device); + + std::scoped_lock Lock(this->Mutex, + hKernel->Mutex); + + ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]{}; + UR_CALL(calculateKernelWorkDimensions(hZeKernel, device, + zeThreadGroupDimensions, WG, workDim, + pGlobalWorkSize, pLocalWorkSize)); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); + + auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); + + bool memoryMigrated = false; + auto memoryMigrate = [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + }; + + UR_CALL(hKernel->prepareForSubmission(context, device, pGlobalWorkOffset, + workDim, WG[0], WG[1], WG[2], + memoryMigrate)); + + if (memoryMigrated) { + // If memory was migrated, we don't need to pass the wait list to + // the copy command again. + waitList.first = nullptr; + waitList.second = 0; + } + + TRACK_SCOPE_LATENCY( + "ur_command_list_manager::zeCommandListAppendLaunchKernel"); + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions, + zeSignalEvent, waitList.second, waitList.first)); + + return UR_RESULT_SUCCESS; +} + +ze_command_list_handle_t ur_command_list_manager::getZeCommandList() { + return zeCommandList.get(); +} diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp new file mode 100644 index 0000000000..9e0049a130 --- /dev/null +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -0,0 +1,54 @@ +//===--------- command_list_cache.hpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include "command_list_cache.hpp" +#include "common.hpp" +#include "event_pool_cache.hpp" +#include "queue_api.hpp" +#include + +struct ur_command_list_manager : public _ur_object { + + ur_command_list_manager(ur_context_handle_t context, + ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, + v2::event_flags_t flags = v2::EVENT_FLAGS_COUNTER, + ur_queue_handle_t_ *queue = nullptr); + ~ur_command_list_manager(); + + ur_result_t appendKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t appendCommandListImmediate( + ze_command_list_handle_t commandList, ur_event_handle_t *phEvent, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList); + ze_command_list_handle_t getZeCommandList(); + + std::pair + getWaitListView(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents); + ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ur_command_t commandType); + +private: + // UR context associated with this command-buffer + ur_context_handle_t context; + // Device associated with this command buffer + ur_device_handle_t device; + v2::raii::cache_borrowed_event_pool eventPool; + v2::raii::command_list_unique_handle zeCommandList; + ur_queue_handle_t_ *queue; + std::vector waitList; +}; diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp index f4e2f47c09..28ff527413 100644 --- a/source/adapters/level_zero/v2/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -11,6 +11,9 @@ * */ +// Do not edit. This file is auto generated from a template: +// scripts/templates/queue_api.cpp.mako + #include "queue_api.hpp" #include "ur_util.hpp" diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index e9e98874e8..88d812bbba 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -11,9 +11,13 @@ * */ +// Do not edit. This file is auto generated from a template: +// scripts/templates/queue_api.hpp.mako + #pragma once #include +#include struct ur_queue_handle_t_ { virtual ~ur_queue_handle_t_(); @@ -158,4 +162,8 @@ struct ur_queue_handle_t_ { const ur_exp_enqueue_native_command_properties_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; + + virtual ur_result_t enqueueCommandBuffer(ze_command_list_handle_t, + ur_event_handle_t *, uint32_t, + const ur_event_handle_t *) = 0; }; diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index af65df78a2..1c738edf50 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -24,13 +24,7 @@ namespace v2 { std::pair ur_queue_immediate_in_order_t::getWaitListView( const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents) { - - waitList.resize(numWaitEvents); - for (uint32_t i = 0; i < numWaitEvents; i++) { - waitList[i] = phWaitEvents[i]->getZeEvent(); - } - - return {waitList.data(), static_cast(numWaitEvents)}; + return commandListManager.getWaitListView(phWaitEvents, numWaitEvents); } static int32_t getZeOrdinal(ur_device_handle_t hDevice) { @@ -58,25 +52,6 @@ static ze_command_queue_priority_t getZePriority(ur_queue_flags_t flags) { return ZE_COMMAND_QUEUE_PRIORITY_NORMAL; } -ur_command_list_handler_t::ur_command_list_handler_t( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps) - : commandList(hContext->commandListCache.getImmediateCommandList( - hDevice->ZeDevice, true, getZeOrdinal(hDevice), - true /* always enable copy offload */, - ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - getZePriority(pProps ? pProps->flags : ur_queue_flags_t{}), - getZeIndex(pProps))) {} - -ur_command_list_handler_t::ur_command_list_handler_t( - ze_command_list_handle_t hZeCommandList, bool ownZeHandle) - : commandList(hZeCommandList, - [ownZeHandle](ze_command_list_handle_t hZeCommandList) { - if (ownZeHandle) { - ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList)); - } - }) {} - static event_flags_t eventFlagsFromQueueFlags(ur_queue_flags_t flags) { event_flags_t eventFlags = EVENT_FLAGS_COUNTER; if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) @@ -88,29 +63,35 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps) : hContext(hContext), hDevice(hDevice), flags(pProps ? pProps->flags : 0), - eventPool(hContext->eventPoolCache.borrow( - hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), - handler(hContext, hDevice, pProps) {} + commandListManager( + hContext, hDevice, + hContext->commandListCache.getImmediateCommandList( + hDevice->ZeDevice, true, getZeOrdinal(hDevice), + true /* always enable copy offload */, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + getZePriority(pProps ? pProps->flags : ur_queue_flags_t{}), + getZeIndex(pProps)), + eventFlagsFromQueueFlags(flags), this) {} ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_native_handle_t hNativeHandle, ur_queue_flags_t flags, bool ownZeQueue) : hContext(hContext), hDevice(hDevice), flags(flags), - eventPool(hContext->eventPoolCache.borrow( - hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), - handler(reinterpret_cast(hNativeHandle), - ownZeQueue) {} - -ur_event_handle_t + commandListManager( + hContext, hDevice, + raii::command_list_unique_handle( + reinterpret_cast(hNativeHandle), + [ownZeQueue](ze_command_list_handle_t hZeCommandList) { + if (ownZeQueue) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList)); + } + }), + eventFlagsFromQueueFlags(flags)) {} + +ze_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType) { - if (hUserEvent) { - *hUserEvent = eventPool->allocate(); - (*hUserEvent)->resetQueueAndCommand(this, commandType); - return *hUserEvent; - } else { - return nullptr; - } + return commandListManager.getSignalEvent(hUserEvent, commandType); } ur_result_t @@ -133,7 +114,7 @@ ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName, return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_QUEUE_INFO_EMPTY: { auto status = ZE_CALL_NOCHECK(zeCommandListHostSynchronize, - (handler.commandList.get(), 0)); + (commandListManager.getZeCommandList(), 0)); if (status == ZE_RESULT_SUCCESS) { return ReturnValue(true); } else if (status == ZE_RESULT_NOT_READY) { @@ -175,8 +156,8 @@ void ur_queue_immediate_in_order_t::deferEventFree(ur_event_handle_t hEvent) { ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle( ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) { std::ignore = pDesc; - *phNativeQueue = - reinterpret_cast(this->handler.commandList.get()); + *phNativeQueue = reinterpret_cast( + this->commandListManager.getZeCommandList()); return UR_RESULT_SUCCESS; } @@ -189,7 +170,7 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() { TRACK_SCOPE_LATENCY( "ur_queue_immediate_in_order_t::zeCommandListHostSynchronize"); ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); // Free deferred events for (auto &hEvent : deferredEvents) { @@ -223,52 +204,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueKernelLaunch"); - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); - - UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - - ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice); - - std::scoped_lock Lock(this->Mutex, - hKernel->Mutex); - - ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; - uint32_t WG[3]{}; - UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice, - zeThreadGroupDimensions, WG, workDim, - pGlobalWorkSize, pLocalWorkSize)); - - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); - - auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); - - bool memoryMigrated = false; - auto memoryMigrate = [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); - memoryMigrated = true; - }; - - UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset, - workDim, WG[0], WG[1], WG[2], - memoryMigrate)); - - if (memoryMigrated) { - // If memory was migrated, we don't need to pass the wait list to - // the copy command again. - waitList.first = nullptr; - waitList.second = 0; - } - - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::zeCommandListAppendLaunchKernel"); - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; - ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions, - zeSignalEvent, waitList.second, waitList.first)); + UR_CALL(commandListManager.appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent)); recordSubmittedKernel(hKernel); @@ -287,20 +225,20 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( return UR_RESULT_SUCCESS; } - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); if (numWaitEvents > 0) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), numWaitEvents, pWaitEvents)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } - if (signalEvent) { + if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + (commandListManager.getZeCommandList(), zeSignalEvent)); } - return UR_RESULT_SUCCESS; } @@ -317,13 +255,13 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierImpl( return UR_RESULT_SUCCESS; } - auto signalEvent = + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); ZE2UR_CALL(zeCommandListAppendBarrier, - (handler.commandList.get(), signalEvent->getZeEvent(), + (commandListManager.getZeCommandList(), zeSignalEvent, numWaitEvents, pWaitEvents)); return UR_RESULT_SUCCESS; @@ -358,7 +296,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t commandType) { - auto signalEvent = getSignalEvent(phEvent, commandType); + auto zeSignalEvent = getSignalEvent(phEvent, commandType); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -367,8 +305,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, srcOffset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -376,8 +314,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::write_only, dstOffset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -388,14 +326,13 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( waitList.second = 0; } - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), pDst, pSrc, size, zeSignalEvent, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), pDst, pSrc, size, + zeSignalEvent, waitList.second, waitList.first)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -447,7 +384,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( auto zeParams = ur2zeRegionParams(srcOrigin, dstOrigin, region, srcRowPitch, dstRowPitch, srcSlicePitch, dstSlicePitch); - auto signalEvent = getSignalEvent(phEvent, commandType); + auto zeSignalEvent = getSignalEvent(phEvent, commandType); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -456,16 +393,16 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, 0, src->getSize(), [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); auto pDst = ur_cast(dst->getDevicePtr( hDevice, ur_mem_handle_t_::device_access_mode_t::write_only, 0, dst->getSize(), [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -476,16 +413,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( waitList.second = 0; } - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, - (handler.commandList.get(), pDst, &zeParams.dstRegion, + (commandListManager.getZeCommandList(), pDst, &zeParams.dstRegion, zeParams.dstPitch, zeParams.dstSlicePitch, pSrc, &zeParams.srcRegion, zeParams.srcPitch, zeParams.srcSlicePitch, zeSignalEvent, waitList.second, waitList.first)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -649,7 +585,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( std::scoped_lock lock(this->Mutex, hBuffer->getMutex()); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -657,8 +593,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( auto pDst = ur_cast(hBuffer->mapHostPtr( mapFlags, offset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); *ppRetMap = pDst; @@ -666,17 +602,18 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( if (!memoryMigrated && waitList.second) { // If memory was not migrated, we need to wait on the events here. ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), waitList.second, + waitList.first)); } - if (signalEvent) { + if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + (commandListManager.getZeCommandList(), zeSignalEvent)); } if (blockingMap) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -689,29 +626,28 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( std::scoped_lock lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); // TODO: currently unmapHostPtr deallocates memory immediately, // since the memory might be used by the user, we need to make sure // all dependencies are completed. - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), waitList.second, waitList.first)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), waitList.second, waitList.first)); bool memoryMigrated = false; hMem->unmapHostPtr(pMappedPtr, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, size, + nullptr, waitList.second, waitList.first)); memoryMigrated = true; }); - - if (signalEvent) { + if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + (commandListManager.getZeCommandList(), zeSignalEvent)); } - return UR_RESULT_SUCCESS; } @@ -721,7 +657,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t commandType) { - auto signalEvent = getSignalEvent(phEvent, commandType); + auto zeSignalEvent = getSignalEvent(phEvent, commandType); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -730,8 +666,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, offset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -746,10 +682,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( // PatternSize must be a power of two for zeCommandListAppendMemoryFill. // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryFill, - (handler.commandList.get(), pDst, pPattern, patternSize, size, - zeSignalEvent, waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), pDst, pPattern, + patternSize, size, zeSignalEvent, waitList.second, + waitList.first)); return UR_RESULT_SUCCESS; } @@ -777,19 +713,18 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( std::scoped_lock lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), pDst, pSrc, size, zeSignalEvent, - numWaitEvents, pWaitEvents)); + (commandListManager.getZeCommandList(), pDst, pSrc, size, + zeSignalEvent, numWaitEvents, pWaitEvents)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -805,22 +740,22 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( std::scoped_lock lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); if (pWaitEvents) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), numWaitEvents, pWaitEvents)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } // TODO: figure out how to translate "flags" ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (handler.commandList.get(), pMem, size)); - - if (signalEvent) { + (commandListManager.getZeCommandList(), pMem, size)); + if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + (commandListManager.getZeCommandList(), zeSignalEvent)); } return UR_RESULT_SUCCESS; @@ -838,25 +773,25 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, auto zeAdvice = ur_cast(advice); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); if (pWaitEvents) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), numWaitEvents, pWaitEvents)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } // TODO: figure out how to translate "flags" ZE2UR_CALL(zeCommandListAppendMemAdvise, - (handler.commandList.get(), this->hDevice->ZeDevice, pMem, size, - zeAdvice)); + (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, + pMem, size, zeAdvice)); - if (signalEvent) { + if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + (commandListManager.getZeCommandList(), zeSignalEvent)); } - return UR_RESULT_SUCCESS; } @@ -1058,15 +993,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( zeThreadGroupDimensions, WG, workDim, pGlobalWorkSize, pLocalWorkSize)); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); bool memoryMigrated = false; auto memoryMigrate = [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, size, + nullptr, waitList.second, waitList.first)); memoryMigrated = true; }; @@ -1083,10 +1018,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::" "zeCommandListAppendLaunchCooperativeKernel"); - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, - (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions, - zeSignalEvent, waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), hZeKernel, + &zeThreadGroupDimensions, zeSignalEvent, waitList.second, + waitList.first)); recordSubmittedKernel(hKernel); @@ -1101,33 +1036,56 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( std::scoped_lock lock(this->Mutex); - auto signalEvent = - getSignalEvent(phEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP); - - if (!signalEvent) { + if (!phEvent && !*phEvent) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - + getSignalEvent(phEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); - signalEvent->recordStartTimestamp(); + (*phEvent)->recordStartTimestamp(); auto [timestampPtr, zeSignalEvent] = - signalEvent->getEventEndTimestampAndHandle(); + (*phEvent)->getEventEndTimestampAndHandle(); ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, - (handler.commandList.get(), timestampPtr, zeSignalEvent, - numWaitEvents, pWaitEvents)); + (commandListManager.getZeCommandList(), timestampPtr, + zeSignalEvent, numWaitEvents, pWaitEvents)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; } +ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand) { + + std::scoped_lock Lock(this->Mutex); + auto zeSignalEvent = getSignalEvent(phEvent, callerCommand); + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + (commandListManager.getZeCommandList(), numCommandLists, + phCommandLists, zeSignalEvent, numWaitEvents, pWaitEvents)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBuffer( + ze_command_list_handle_t commandBufferCommandList, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) { + return enqueueGenericCommandListsExp(1, &commandBufferCommandList, phEvent, + numEventsInWaitList, phEventWaitList, + UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); +} ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index e0d7f747b3..6cf8b0c51c 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -19,33 +19,19 @@ #include "ur/ur.hpp" +#include "command_list_manager.hpp" + namespace v2 { using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; -struct ur_command_list_handler_t { - ur_command_list_handler_t(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps); - - ur_command_list_handler_t(ze_command_list_handle_t hZeCommandList, - bool ownZeHandle); - - raii::command_list_unique_handle commandList; -}; - struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { private: ur_context_handle_t hContext; ur_device_handle_t hDevice; ur_queue_flags_t flags; - raii::cache_borrowed_event_pool eventPool; - - ur_command_list_handler_t handler; - - std::vector waitList; - + ur_command_list_manager commandListManager; std::vector deferredEvents; std::vector submittedKernels; @@ -53,7 +39,7 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents); - ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType); void deferEventFree(ur_event_handle_t hEvent) override; @@ -78,6 +64,11 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t commandType); + ur_result_t enqueueGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand); + ur_result_t enqueueEventsWaitWithBarrierImpl(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, @@ -277,6 +268,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; ur_result_t + enqueueCommandBuffer(ze_command_list_handle_t commandBufferCommandList, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) override; + ur_result_t enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, diff --git a/source/common/umf_pools/disjoint_pool_config_parser.cpp b/source/common/umf_pools/disjoint_pool_config_parser.cpp index 8d5bc2066e..42c894b412 100644 --- a/source/common/umf_pools/disjoint_pool_config_parser.cpp +++ b/source/common/umf_pools/disjoint_pool_config_parser.cpp @@ -174,47 +174,36 @@ DisjointPoolAllConfigs parseDisjointPoolConfig(const std::string &config, MemParser(Params, M); }; - size_t MaxSize = (std::numeric_limits::max)(); - // Update pool settings if specified in environment. + size_t MaxSize = (std::numeric_limits::max)(); size_t EnableBuffers = 1; - if (config != "") { - std::string Params = config; - size_t Pos = Params.find(';'); - if (Pos != std::string::npos) { - if (Pos > 0) { - GetValue(Params, Pos, EnableBuffers); + + bool EnableBuffersSet = false; + bool MaxSizeSet = false; + size_t Start = 0; + size_t End = config.find(';'); + while (true) { + std::string Param = config.substr(Start, End - Start); + if (!EnableBuffersSet && (Param == "" || isdigit(Param[0]))) { + if (Param != "") { + GetValue(Param, Param.size(), EnableBuffers); } - Params.erase(0, Pos + 1); - size_t Pos = Params.find(';'); - if (Pos != std::string::npos) { - if (Pos > 0) { - GetValue(Params, Pos, MaxSize); - } - Params.erase(0, Pos + 1); - do { - size_t Pos = Params.find(';'); - if (Pos != std::string::npos) { - if (Pos > 0) { - std::string MemParams = Params.substr(0, Pos); - MemTypeParser(MemParams); - } - Params.erase(0, Pos + 1); - if (Params.size() == 0) { - break; - } - } else { - MemTypeParser(Params); - break; - } - } while (true); - } else { - // set MaxPoolSize for all configs - GetValue(Params, Params.size(), MaxSize); + EnableBuffersSet = true; + } else if (!MaxSizeSet && (Param == "" || isdigit(Param[0]))) { + if (Param != "") { + GetValue(Param, Param.size(), MaxSize); } + MaxSizeSet = true; } else { - GetValue(Params, Params.size(), EnableBuffers); + MemTypeParser(Param); } + + if (End == std::string::npos) { + break; + } + + Start = End + 1; + End = config.find(';', Start); } AllConfigs.EnableBuffers = EnableBuffers; diff --git a/test/adapters/cuda/kernel_tests.cpp b/test/adapters/cuda/kernel_tests.cpp index 0f7f3351fe..7b83459c5f 100644 --- a/test/adapters/cuda/kernel_tests.cpp +++ b/test/adapters/cuda/kernel_tests.cpp @@ -153,7 +153,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSimple) { int number = 10; ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number)); - const auto &kernelArgs = kernel->getArgIndices(); + const auto &kernelArgs = kernel->getArgPointers(); ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); int storedValue = *static_cast(kernelArgs[0]); @@ -175,7 +175,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) { int number = 10; ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number)); - const auto &kernelArgs = kernel->getArgIndices(); + const auto &kernelArgs = kernel->getArgPointers(); ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); int storedValue = *static_cast(kernelArgs[0]); ASSERT_EQ(storedValue, number); @@ -183,7 +183,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) { int otherNumber = 934; ASSERT_SUCCESS( urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &otherNumber)); - const auto kernelArgs2 = kernel->getArgIndices(); + const auto kernelArgs2 = kernel->getArgPointers(); ASSERT_EQ(kernelArgs2.size(), 1 + NumberOfImplicitArgsCUDA); storedValue = *static_cast(kernelArgs2[0]); ASSERT_EQ(storedValue, otherNumber); diff --git a/test/adapters/level_zero/event_cache_tests.cpp b/test/adapters/level_zero/event_cache_tests.cpp index 14466ab805..09fcff7373 100644 --- a/test/adapters/level_zero/event_cache_tests.cpp +++ b/test/adapters/level_zero/event_cache_tests.cpp @@ -30,7 +30,7 @@ static std::shared_ptr<_zel_tracer_handle_t> tracer = [] { zel_core_callbacks_t prologue_callbacks{}; prologue_callbacks.Event.pfnCreateCb = OnEnterEventCreate; prologue_callbacks.Event.pfnDestroyCb = OnEnterEventDestroy; - return enableTracing(prologue_callbacks, {}); + return enableTracing(std::move(prologue_callbacks), {}); }(); template auto combineFlags(std::tuple tuple) { diff --git a/test/adapters/level_zero/multi_device_event_cache_tests.cpp b/test/adapters/level_zero/multi_device_event_cache_tests.cpp index f0cc261bb4..bc88eb2a02 100644 --- a/test/adapters/level_zero/multi_device_event_cache_tests.cpp +++ b/test/adapters/level_zero/multi_device_event_cache_tests.cpp @@ -20,7 +20,7 @@ static std::shared_ptr<_zel_tracer_handle_t> tracer = [] { zel_core_callbacks_t prologue_callbacks{}; prologue_callbacks.CommandList.pfnAppendWaitOnEventsCb = OnAppendWaitOnEventsCb; - return enableTracing(prologue_callbacks, {}); + return enableTracing(std::move(prologue_callbacks), {}); }(); using urMultiQueueMultiDeviceEventCacheTest = uur::urAllDevicesTest; diff --git a/test/adapters/level_zero/ze_tracer_common.hpp b/test/adapters/level_zero/ze_tracer_common.hpp index bf93c71fbb..8aa93c7c13 100644 --- a/test/adapters/level_zero/ze_tracer_common.hpp +++ b/test/adapters/level_zero/ze_tracer_common.hpp @@ -11,9 +11,9 @@ #include -std::shared_ptr<_zel_tracer_handle_t> -enableTracing(zel_core_callbacks_t prologueCallbacks, - zel_core_callbacks_t epilogueCallbacks) { +inline std::shared_ptr<_zel_tracer_handle_t> +enableTracing(zel_core_callbacks_t &&prologueCallbacks, + zel_core_callbacks_t &&epilogueCallbacks) { EXPECT_EQ(zeInit(ZE_INIT_FLAG_GPU_ONLY), ZE_RESULT_SUCCESS); zel_tracer_desc_t tracer_desc = {ZEL_STRUCTURE_TYPE_TRACER_EXP_DESC, nullptr, diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 559a61e3ad..d55094a52c 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -1094,3 +1094,220 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) { uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } + +struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase { + virtual void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpExecutionTest::SetUp()); + + if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { + GTEST_SKIP() + << "Local memory argument update not supported on Level Zero."; + } + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + } +}; + +struct LocalMemoryUpdateTestOutOfOrder : LocalMemoryUpdateTestBaseOutOfOrder { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(LocalMemoryUpdateTestBaseOutOfOrder::SetUp()); + + // Append kernel command to command-buffer and close command-buffer + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, nullptr, + nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + } + + void TearDown() override { + if (command_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + LocalMemoryUpdateTestBaseOutOfOrder::TearDown()); + } + + ur_exp_command_buffer_command_handle_t command_handle = nullptr; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder); + +// Test updating A,X,Y parameters to new values and local memory to larger +// values when the kernel arguments were added out of order. +TEST_P(LocalMemoryUpdateTestOutOfOrder, UpdateAllParameters) { + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + std::array + new_input_descs; + std::array new_value_descs; + + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + + // New local_mem_a at index 0 + new_value_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New local_mem_b at index 1 + new_value_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }; + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, local_size); +} \ No newline at end of file diff --git a/test/conformance/kernel/urKernelGetInfo.cpp b/test/conformance/kernel/urKernelGetInfo.cpp index 4748f8f96a..65694b5bdd 100644 --- a/test/conformance/kernel/urKernelGetInfo.cpp +++ b/test/conformance/kernel/urKernelGetInfo.cpp @@ -121,7 +121,7 @@ TEST_P(urKernelGetInfoTest, SuccessAttributes) { } TEST_P(urKernelGetInfoTest, SuccessNumRegs) { - UUR_KNOWN_FAILURE_ON(uur::HIP{}, uur::OpenCL{}); + UUR_KNOWN_FAILURE_ON(uur::HIP{}); ur_kernel_info_t property_name = UR_KERNEL_INFO_NUM_REGS; size_t property_size = 0; diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp index 688724ec09..f056d025bc 100644 --- a/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -237,3 +237,102 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { Validate(output, X, Y, A, global_size, new_local_size); } + +// Tests that adding arguments out of order (e.g. index 1 before index 0) works. +struct urKernelSetArgLocalOutOfOrder : urKernelSetArgLocalMultiTest { + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelSetArgLocalOutOfOrder); +TEST_P(urKernelSetArgLocalOutOfOrder, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); +} diff --git a/test/usm/usmPoolManager.cpp b/test/usm/usmPoolManager.cpp index ec52d00c5e..4e82196eef 100644 --- a/test/usm/usmPoolManager.cpp +++ b/test/usm/usmPoolManager.cpp @@ -4,6 +4,7 @@ // // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "umf_pools/disjoint_pool_config_parser.hpp" #include "ur_pool_manager.hpp" #include @@ -18,6 +19,26 @@ auto createMockPoolHandle() { [](umf_memory_pool_t *) {}); } +bool compareConfig(const usm::umf_disjoint_pool_config_t &left, + usm::umf_disjoint_pool_config_t &right) { + return left.MaxPoolableSize == right.MaxPoolableSize && + left.Capacity == right.Capacity && + left.SlabMinSize == right.SlabMinSize; +} + +bool compareConfigs(const usm::DisjointPoolAllConfigs &left, + usm::DisjointPoolAllConfigs &right) { + return left.EnableBuffers == right.EnableBuffers && + compareConfig(left.Configs[usm::DisjointPoolMemType::Host], + right.Configs[usm::DisjointPoolMemType::Host]) && + compareConfig(left.Configs[usm::DisjointPoolMemType::Device], + right.Configs[usm::DisjointPoolMemType::Device]) && + compareConfig(left.Configs[usm::DisjointPoolMemType::Shared], + right.Configs[usm::DisjointPoolMemType::Shared]) && + compareConfig(left.Configs[usm::DisjointPoolMemType::SharedReadOnly], + right.Configs[usm::DisjointPoolMemType::SharedReadOnly]); +} + TEST_P(urUsmPoolDescriptorTest, poolIsPerContextTypeAndDevice) { auto &devices = uur::DevicesEnvironment::instance->devices; @@ -111,4 +132,49 @@ TEST_P(urUsmPoolManagerTest, poolManagerGetNonexistant) { } } +TEST_P(urUsmPoolManagerTest, config) { + // Check default config + usm::DisjointPoolAllConfigs def; + usm::DisjointPoolAllConfigs parsed1 = + usm::parseDisjointPoolConfig("1;host:2M,4,64K;device:4M,4,64K;" + "shared:0,0,2M;read_only_shared:4M,4,2M", + 0); + ASSERT_EQ(compareConfigs(def, parsed1), true); + + // Check partially set config + usm::DisjointPoolAllConfigs part1 = + usm::parseDisjointPoolConfig("1;device:4M;shared:0,0,2M", 0); + ASSERT_EQ(compareConfigs(def, part1), true); + + // Check partially set config #2 + usm::DisjointPoolAllConfigs part2 = + usm::parseDisjointPoolConfig(";device:4M;shared:0,0,2M", 0); + ASSERT_EQ(compareConfigs(def, part2), true); + + // Check partially set config #3 + usm::DisjointPoolAllConfigs part3 = + usm::parseDisjointPoolConfig(";shared:0,0,2M", 0); + ASSERT_EQ(compareConfigs(def, part3), true); + + // Check partially set config #4 + usm::DisjointPoolAllConfigs part4 = + usm::parseDisjointPoolConfig(";device:4M", 0); + ASSERT_EQ(compareConfigs(def, part4), true); + + // Check partially set config #5 + usm::DisjointPoolAllConfigs part5 = + usm::parseDisjointPoolConfig(";;device:4M,4,64K", 0); + ASSERT_EQ(compareConfigs(def, part5), true); + + // Check non-default config + usm::DisjointPoolAllConfigs test(def); + test.Configs[usm::DisjointPoolMemType::Shared].MaxPoolableSize = 128 * 1024; + test.Configs[usm::DisjointPoolMemType::Shared].Capacity = 4; + test.Configs[usm::DisjointPoolMemType::Shared].SlabMinSize = 64 * 1024; + + usm::DisjointPoolAllConfigs parsed3 = + usm::parseDisjointPoolConfig("1;shared:128K,4,64K", 0); + ASSERT_EQ(compareConfigs(test, parsed3), true); +} + UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urUsmPoolManagerTest);