From 519c9c33296534888e32df0095e4e85d080acdef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Wed, 18 Dec 2024 13:06:33 +0000 Subject: [PATCH 01/46] Prepare ground for command_buffer in v2 --- source/adapters/level_zero/CMakeLists.txt | 2 + source/adapters/level_zero/v2/api.cpp | 70 ------------ .../adapters/level_zero/v2/command_buffer.cpp | 67 +++++++++++ .../adapters/level_zero/v2/command_buffer.hpp | 107 ++++++++++++++++++ 4 files changed, 176 insertions(+), 70 deletions(-) create mode 100644 source/adapters/level_zero/v2/command_buffer.cpp create mode 100644 source/adapters/level_zero/v2/command_buffer.hpp diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 39031a700d..46129f0ccf 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -145,6 +145,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp # v2-only sources + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.hpp @@ -159,6 +160,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/api.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.cpp diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index f774f9e263..3ceb53be2f 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -249,47 +249,6 @@ ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t 
-urCommandBufferCreateExp(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - const ur_exp_command_buffer_desc_t *pCommandBufferDesc, - ur_exp_command_buffer_handle_t *phCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendKernelLaunchExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, - uint32_t workDim, const size_t *pGlobalWorkOffset, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferAppendUSMMemcpyExp( ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, size_t size, uint32_t numSyncPointsInWaitList, @@ -425,26 +384,6 @@ ur_result_t urCommandBufferAppendUSMAdviseExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urCommandBufferEnqueueExp( - 
ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferRetainCommandExp( - ur_exp_command_buffer_command_handle_t hCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferReleaseCommandExp( - ur_exp_command_buffer_command_handle_t hCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferUpdateKernelLaunchExp( ur_exp_command_buffer_command_handle_t hCommand, const ur_exp_command_buffer_update_kernel_launch_desc_t @@ -453,15 +392,6 @@ ur_result_t urCommandBufferUpdateKernelLaunchExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t -urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, - ur_exp_command_buffer_info_t propName, - size_t propSize, void *pPropValue, - size_t *pPropSizeRet) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferUpdateSignalEventExp( ur_exp_command_buffer_command_handle_t hCommand, ur_event_handle_t *phEvent) { diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp new file mode 100644 index 0000000000..3a7ef85e75 --- /dev/null +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -0,0 +1,67 @@ +//===--------- command_buffer.cpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "command_buffer.hpp" +#include "../helpers/kernel_helpers.hpp" +#include "logger/ur_logger.hpp" +#include "../ur_interface_loader.hpp" + +namespace ur::level_zero { + +ur_result_t +urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, + const ur_exp_command_buffer_desc_t *CommandBufferDesc, + ur_exp_command_buffer_handle_t *CommandBuffer) { + return UR_RESULT_SUCCESS; +} + +ur_result_t urCommandBufferRetainCommandExp( + ur_exp_command_buffer_command_handle_t Command) { + return UR_RESULT_SUCCESS; +} + +ur_result_t +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { + return UR_RESULT_SUCCESS; +} + +ur_result_t +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { + return UR_RESULT_SUCCESS; +} + +ur_result_t urCommandBufferAppendKernelLaunchExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, + uint32_t WorkDim, const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *RetSyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + return UR_RESULT_SUCCESS; +} + +ur_result_t urCommandBufferEnqueueExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_t UrQueue, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_event_handle_t *Event) { + return UR_RESULT_SUCCESS; +} +ur_result_t +urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, + ur_exp_command_buffer_info_t propName, + size_t propSize, void 
*pPropValue, + size_t *pPropSizeRet) { + return UR_RESULT_SUCCESS; +} + +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp new file mode 100644 index 0000000000..6b963c3b8f --- /dev/null +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -0,0 +1,107 @@ +//===--------- command_buffer.hpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include +#include +#include +#include + +#include "common.hpp" + +#include "context.hpp" +#include "kernel.hpp" +#include "queue_api.hpp" + +struct command_buffer_profiling_t { + ur_exp_command_buffer_sync_point_t NumEvents; + ze_kernel_timestamp_result_t *Timestamps; +}; + +struct ur_exp_command_buffer_handle_t_ : public _ur_object { + ur_exp_command_buffer_handle_t_( + ur_context_handle_t Context, ur_device_handle_t Device, + ze_command_list_handle_t CommandList, + ur_event_handle_t ProcessingFinishedEvent, + const ur_exp_command_buffer_desc_t *Desc, + const bool IsInOrderCmdList + ); + void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, + ur_event_handle_t Event); + + ur_exp_command_buffer_sync_point_t getNextSyncPoint() const { + return NextSyncPoint; + } + + // Releases the resources associated with the command-buffer before the + // command-buffer object is destroyed. 
+ void cleanupCommandBufferResources(); + + // UR context associated with this command-buffer + ur_context_handle_t Context; + // Device associated with this command buffer + ur_device_handle_t Device; + ze_command_list_handle_t ZeCommandList; + // [ImmediateAppend Path Only] Event that is signalled after the copy engine + // command-list finishes executing. + ur_event_handle_t ProcessingFinishedEvent = nullptr; + // Map of sync_points to ur_events + std::unordered_map + SyncPoints; + // Next sync_point value (may need to consider ways to reuse values if 32-bits + // is not enough) + ur_exp_command_buffer_sync_point_t NextSyncPoint; + // List of Level Zero events associated with submitted commands. + std::vector ZeEventsList; + + // Indicates if command-buffer commands can be updated after it is closed. + bool IsUpdatable = false; + // Indicates if command buffer was finalized. + bool IsFinalized = false; + // Command-buffer profiling is enabled. + bool IsProfilingEnabled = false; + // Command-buffer can be submitted to an in-order command-list. + bool IsInOrderCmdList = false; + // This list is needed to release all kernels retained by the + // command_buffer. + std::vector KernelsList; +}; + +struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { + ur_exp_command_buffer_command_handle_t_(ur_exp_command_buffer_handle_t, + uint64_t); + + virtual ~ur_exp_command_buffer_command_handle_t_(); + + // Command-buffer of this command. 
+ ur_exp_command_buffer_handle_t CommandBuffer; + // L0 command ID identifying this command + uint64_t CommandId; +}; + +// struct kernel_command_handle : public ur_exp_command_buffer_command_handle_t_ { +// kernel_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, +// ur_kernel_handle_t Kernel, uint64_t CommandId, +// uint32_t WorkDim, bool UserDefinedLocalSize, +// uint32_t NumKernelAlternatives, +// ur_kernel_handle_t *KernelAlternatives); + +// ~kernel_command_handle(); + +// // Work-dimension the command was originally created with. +// uint32_t WorkDim; +// // Set to true if the user set the local work size on command creation. +// bool UserDefinedLocalSize; +// // Currently active kernel handle +// ur_kernel_handle_t Kernel; +// // Storage for valid kernel alternatives for this command. +// std::unordered_set ValidKernelHandles; +// }; From f87741ea6cd34185239106acef2061762dcafc57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Fri, 20 Dec 2024 13:41:08 +0000 Subject: [PATCH 02/46] Enforce in order list usage, and add initialization and destruction to buffer --- source/adapters/level_zero/v2/api.cpp | 12 + .../adapters/level_zero/v2/command_buffer.cpp | 249 +++++++++++++++++- .../adapters/level_zero/v2/command_buffer.hpp | 21 +- 3 files changed, 256 insertions(+), 26 deletions(-) diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 3ceb53be2f..c6079b91ce 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -406,6 +406,18 @@ ur_result_t urCommandBufferUpdateWaitEventsExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +ur_result_t urCommandBufferRetainCommandExp( + ur_exp_command_buffer_command_handle_t hCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferReleaseCommandExp( + ur_exp_command_buffer_command_handle_t hCommand) { + 
logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + ur_result_t urCommandBufferCommandGetInfoExp( ur_exp_command_buffer_command_handle_t hCommand, ur_exp_command_buffer_command_info_t propName, size_t propSize, diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 3a7ef85e75..d0a8d4d37f 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -13,27 +13,174 @@ #include "logger/ur_logger.hpp" #include "../ur_interface_loader.hpp" +namespace { + +// Checks whether zeCommandListImmediateAppendCommandListsExp can be used for a +// given Context and Device. +void checkImmediateAppendSupport(ur_context_handle_t Context, + ur_device_handle_t Device) { + // TODO The L0 driver is not reporting this extension yet. Once it does, + // switch to using the variable zeDriverImmediateCommandListAppendFound. + + // Minimum version that supports zeCommandListImmediateAppendCommandListsExp. + constexpr uint32_t MinDriverVersion = 30898; + bool DriverSupportsImmediateAppend = + Context->getPlatform()->isDriverVersionNewerOrSimilar(1, 3, + MinDriverVersion); + + // If this environment variable is: + // - Set to 1: the immediate append path will always be enabled as long the + // pre-requisites are met. + // - Set to 0: the immediate append path will always be disabled. + // - Not Defined: The default behaviour will be used which enables the + // immediate append path only for some devices when the pre-requisites are + // met. + const char *AppendEnvVarName = "UR_L0_CMD_BUFFER_USE_IMMEDIATE_APPEND_PATH"; + const char *UrRet = std::getenv(AppendEnvVarName); + + if (!Device->ImmCommandListUsed) { + logger::error("Adapter v2 is used but immediate command-lists are currently " + "disabled. 
Immediate command-lists are " + "required to use the adapter v2."); + std::abort(); + } + if (!DriverSupportsImmediateAppend) { + logger::error("Adapter v2 is used but " + "the current driver does not support the " + "zeCommandListImmediateAppendCommandListsExp entrypoint. A " + "driver version of at least {} is required to use the " + "immediate append path.", MinDriverVersion); + std::abort(); + } + + const bool EnableAppendPath = !UrRet || std::atoi(UrRet) == 1; + if (!Device->isPVC() && !EnableAppendPath) { + logger::error("Adapter v2 is used but " + "immediate append support is not enabled." + "Please set {}=1 to enable it.", AppendEnvVarName); + std::abort(); + } + +} + +} +ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( + ur_context_handle_t Context, ur_device_handle_t Device, + ze_command_list_handle_t CommandList, + const ur_exp_command_buffer_desc_t *Desc) + : Context(Context), Device(Device), ZeCommandList(CommandList), + IsUpdatable(Desc ? Desc->isUpdatable : false) { + ur::level_zero::urContextRetain(Context); + ur::level_zero::urDeviceRetain(Device); +} + +void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() { + // Release the memory allocated to the Context stored in the command_buffer + ur::level_zero::urContextRelease(Context); + + // Release the device + ur::level_zero::urDeviceRelease(Device); + + // Release the memory allocated to the CommandList stored in the + // command_buffer + if (ZeCommandList) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); + } + + for (auto &AssociatedKernel : KernelsList) { + ur::level_zero::urKernelRelease(AssociatedKernel); + } +} + namespace ur::level_zero { +/** + * Creates a L0 command list + * @param[in] Context The Context associated with the command-list + * @param[in] Device The Device associated with the command-list + * @param[in] IsUpdatable Whether the command-list should be mutable. + * @param[out] CommandList The L0 command-list created by this function. 
+ * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t createMainCommandList(ur_context_handle_t Context, + ur_device_handle_t Device, + bool IsUpdatable, + ze_command_list_handle_t &CommandList) { + + + using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; + // that should be call to queue getZeOrdinal, + // but queue is not available while constructing buffer + uint32_t QueueGroupOrdinal = Device->QueueGroup[queue_group_type::Compute].ZeOrdinal; + + ZeStruct ZeCommandListDesc; + ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + + ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER; + + ZeStruct ZeMutableCommandListDesc; + if (IsUpdatable) { + ZeMutableCommandListDesc.flags = 0; + ZeCommandListDesc.pNext = &ZeMutableCommandListDesc; + } + + ZE2UR_CALL(zeCommandListCreate, (Context->getZeHandle(), Device->ZeDevice, + &ZeCommandListDesc, &CommandList)); + + return UR_RESULT_SUCCESS; +} + ur_result_t urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, const ur_exp_command_buffer_desc_t *CommandBufferDesc, ur_exp_command_buffer_handle_t *CommandBuffer) { + bool IsUpdatable = CommandBufferDesc && CommandBufferDesc->isUpdatable; + checkImmediateAppendSupport(Context, Device); + + if (IsUpdatable) { + UR_ASSERT(Context->getPlatform()->ZeMutableCmdListExt.Supported, + UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + } + + ze_command_list_handle_t ZeCommandList = nullptr; + UR_CALL(createMainCommandList(Context, Device, IsUpdatable, ZeCommandList)); + try { + *CommandBuffer = new ur_exp_command_buffer_handle_t_( + Context, Device, ZeCommandList, CommandBufferDesc); + } catch (const std::bad_alloc &) { + return exceptionToResult(std::current_exception()); + } return UR_RESULT_SUCCESS; } - -ur_result_t urCommandBufferRetainCommandExp( - ur_exp_command_buffer_command_handle_t Command) { +ur_result_t +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + 
hCommandBuffer->RefCount.increment(); return UR_RESULT_SUCCESS; } ur_result_t -urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + if (!hCommandBuffer->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + hCommandBuffer->cleanupCommandBufferResources(); + delete hCommandBuffer; return UR_RESULT_SUCCESS; } ur_result_t -urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(!hCommandBuffer->IsFinalized, UR_RESULT_ERROR_INVALID_OPERATION); + + // It is not allowed to append to command list from multiple threads. + std::scoped_lock Guard(hCommandBuffer->Mutex); + + // Close the command lists and have them ready for dispatch. + ZE2UR_CALL(zeCommandListClose, (hCommandBuffer->ZeCommandList)); + + hCommandBuffer->IsFinalized = true; + return UR_RESULT_SUCCESS; } @@ -47,6 +194,77 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_exp_command_buffer_sync_point_t *RetSyncPoint, ur_event_handle_t *Event, ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + + auto hProgram = Kernel->getProgramHandle(); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); + // Command handles can only be obtained from updatable command-buffers + UR_ASSERT(!(Command && !CommandBuffer->IsUpdatable), + UR_RESULT_ERROR_INVALID_OPERATION); + + for (uint32_t i = 0; i < NumKernelAlternatives; ++i) { + UR_ASSERT(KernelAlternatives[i] != Kernel, UR_RESULT_ERROR_INVALID_VALUE); + } + + // Lock automatically releases when this goes out of scope. 
+ std::scoped_lock Lock( + Kernel->Mutex, hProgram->Mutex, CommandBuffer->Mutex); + + auto Device = CommandBuffer->Device; + ze_kernel_handle_t ZeKernel = Kernel->getZeHandle(Device); + + // if (GlobalWorkOffset != NULL) { + // UR_CALL(setKernelGlobalOffset(CommandBuffer->Context, ZeKernel, WorkDim, + // GlobalWorkOffset)); + // } + + // // If there are any pending arguments set them now. + // if (!Kernel->PendingArguments.empty()) { + // UR_CALL( + // setKernelPendingArguments(Device, Kernel->PendingArguments, ZeKernel)); + // } + + // ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; + // uint32_t WG[3]; + // UR_CALL(calculateKernelWorkDimensions(ZeKernel, Device, + // ZeThreadGroupDimensions, WG, WorkDim, + // GlobalWorkSize, LocalWorkSize)); + + // ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2])); + // ^^^ above correspond to api that enqueus kernel in queue_api, but we dont have access to queue + + // CommandBuffer->KernelsList.push_back(Kernel); + // for (size_t i = 0; i < NumKernelAlternatives; i++) { + // CommandBuffer->KernelsList.push_back(KernelAlternatives[i]); + // } + + // ur::level_zero::urKernelRetain(Kernel); + // // Retain alternative kernels if provided + // for (size_t i = 0; i < NumKernelAlternatives; i++) { + // ur::level_zero::urKernelRetain(KernelAlternatives[i]); + // } + + // if (Command) { + // UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, LocalWorkSize, + // NumKernelAlternatives, KernelAlternatives, + // *Command)); + // } + // ^^^ temporaly assume that command is null + + // std::vector ZeEventList; + // ze_event_handle_t ZeLaunchEvent = nullptr; + // UR_CALL(createSyncPointAndGetZeEvents( + // UR_COMMAND_KERNEL_LAUNCH, CommandBuffer, NumSyncPointsInWaitList, + // SyncPointWaitList, false, RetSyncPoint, ZeEventList, ZeLaunchEvent)); + // ^^^ this is not present, because this is in order + + // ZE2UR_CALL(zeCommandListAppendLaunchKernel, + // (CommandBuffer->ZeComputeCommandList, ZeKernel, + // 
&ZeThreadGroupDimensions, ZeLaunchEvent, ZeEventList.size(), + // getPointerFromVector(ZeEventList))); + // ^^ also part of queue_api return UR_RESULT_SUCCESS; } @@ -61,7 +279,26 @@ urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - return UR_RESULT_SUCCESS; + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{hCommandBuffer->RefCount.load()}); + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { + ur_exp_command_buffer_desc_t Descriptor{}; + Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + Descriptor.pNext = nullptr; + Descriptor.isUpdatable = hCommandBuffer->IsUpdatable; + Descriptor.isInOrder = true; + Descriptor.enableProfiling = hCommandBuffer->IsProfilingEnabled; + + return ReturnValue(Descriptor); + } + default: + assert(!"Command-buffer info request not implemented"); + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; } } // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index 6b963c3b8f..279f2591ec 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -30,17 +30,11 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_exp_command_buffer_handle_t_( ur_context_handle_t Context, ur_device_handle_t Device, ze_command_list_handle_t CommandList, - ur_event_handle_t ProcessingFinishedEvent, - const ur_exp_command_buffer_desc_t *Desc, - const bool IsInOrderCmdList + const ur_exp_command_buffer_desc_t *Desc ); void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, ur_event_handle_t Event); - ur_exp_command_buffer_sync_point_t getNextSyncPoint() const { - return NextSyncPoint; - } - // Releases the resources associated with the 
command-buffer before the // command-buffer object is destroyed. void cleanupCommandBufferResources(); @@ -50,17 +44,6 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { // Device associated with this command buffer ur_device_handle_t Device; ze_command_list_handle_t ZeCommandList; - // [ImmediateAppend Path Only] Event that is signalled after the copy engine - // command-list finishes executing. - ur_event_handle_t ProcessingFinishedEvent = nullptr; - // Map of sync_points to ur_events - std::unordered_map - SyncPoints; - // Next sync_point value (may need to consider ways to reuse values if 32-bits - // is not enough) - ur_exp_command_buffer_sync_point_t NextSyncPoint; - // List of Level Zero events associated with submitted commands. - std::vector ZeEventsList; // Indicates if command-buffer commands can be updated after it is closed. bool IsUpdatable = false; @@ -68,8 +51,6 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { bool IsFinalized = false; // Command-buffer profiling is enabled. bool IsProfilingEnabled = false; - // Command-buffer can be submitted to an in-order command-list. - bool IsInOrderCmdList = false; // This list is needed to release all kernels retained by the // command_buffer. 
std::vector KernelsList; From 159ebc8af9a9478113c49cd3892f6368266b8f69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Mon, 30 Dec 2024 15:45:26 +0000 Subject: [PATCH 03/46] Add initial support of command buffers to adapter v2 --- .../adapters/level_zero/v2/command_buffer.cpp | 166 +++++++++--------- .../adapters/level_zero/v2/command_buffer.hpp | 8 +- source/adapters/level_zero/v2/queue_api.hpp | 14 ++ .../v2/queue_immediate_in_order.cpp | 40 +++++ .../v2/queue_immediate_in_order.hpp | 15 ++ 5 files changed, 158 insertions(+), 85 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index d0a8d4d37f..dc916631fa 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -64,6 +64,19 @@ void checkImmediateAppendSupport(ur_context_handle_t Context, } } + +std::pair +ur_exp_command_buffer_handle_t_::getWaitListView( + const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents) { + + waitList.resize(numWaitEvents); + for (uint32_t i = 0; i < numWaitEvents; i++) { + waitList[i] = phWaitEvents[i]->getZeEvent(); + } + + return {waitList.data(), static_cast(numWaitEvents)}; +} + ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t Context, ur_device_handle_t Device, ze_command_list_handle_t CommandList, @@ -185,95 +198,82 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { } ur_result_t urCommandBufferAppendKernelLaunchExp( - ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - uint32_t WorkDim, const size_t *GlobalWorkOffset, - const size_t *GlobalWorkSize, const size_t *LocalWorkSize, - uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives, - uint32_t NumSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, - 
ur_exp_command_buffer_sync_point_t *RetSyncPoint, ur_event_handle_t *Event, - ur_exp_command_buffer_command_handle_t *Command) { - std::ignore = NumEventsInWaitList; - std::ignore = EventWaitList; - std::ignore = Event; - - auto hProgram = Kernel->getProgramHandle(); - UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); - // Command handles can only be obtained from updatable command-buffers - UR_ASSERT(!(Command && !CommandBuffer->IsUpdatable), - UR_RESULT_ERROR_INVALID_OPERATION); - - for (uint32_t i = 0; i < NumKernelAlternatives; ++i) { - UR_ASSERT(KernelAlternatives[i] != Kernel, UR_RESULT_ERROR_INVALID_VALUE); - } + ur_exp_command_buffer_handle_t commandBuffer, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *kernelAlternatives, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *syncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *eventWaitList, + ur_exp_command_buffer_sync_point_t *retSyncPoint, ur_event_handle_t *event, + ur_exp_command_buffer_command_handle_t *command) { + //Need to know semantics + // - should they be checked before kernel execution or before kernel appending to list + // if latter then it is easy fix, if former then TODO + std::ignore = numEventsInWaitList; + std::ignore = eventWaitList; + std::ignore = event; + + //sync mechanic can be removed, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = syncPointWaitList; + std::ignore = retSyncPoint; + + //TODO + std::ignore = numKernelAlternatives; + std::ignore = kernelAlternatives; + std::ignore = command; + + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, 
UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(commandBuffer->Device); + + std::scoped_lock Lock(commandBuffer->Mutex, + hKernel->Mutex); + + ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]{}; + UR_CALL(calculateKernelWorkDimensions(hZeKernel, commandBuffer->Device, + zeThreadGroupDimensions, WG, workDim, + pGlobalWorkSize, pLocalWorkSize)); + + auto waitList = commandBuffer->getWaitListView(nullptr, 0); + + bool memoryMigrated = false; + auto memoryMigrate = [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (commandBuffer->ZeCommandList, dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + }; + + UR_CALL(hKernel->prepareForSubmission(commandBuffer->Context, commandBuffer->Device, pGlobalWorkOffset, + workDim, WG[0], WG[1], WG[2], + memoryMigrate)); + + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (commandBuffer->ZeCommandList, hZeKernel, &zeThreadGroupDimensions, + nullptr, waitList.second, waitList.first)); - // Lock automatically releases when this goes out of scope. - std::scoped_lock Lock( - Kernel->Mutex, hProgram->Mutex, CommandBuffer->Mutex); - - auto Device = CommandBuffer->Device; - ze_kernel_handle_t ZeKernel = Kernel->getZeHandle(Device); - - // if (GlobalWorkOffset != NULL) { - // UR_CALL(setKernelGlobalOffset(CommandBuffer->Context, ZeKernel, WorkDim, - // GlobalWorkOffset)); - // } - - // // If there are any pending arguments set them now. 
- // if (!Kernel->PendingArguments.empty()) { - // UR_CALL( - // setKernelPendingArguments(Device, Kernel->PendingArguments, ZeKernel)); - // } - - // ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; - // uint32_t WG[3]; - // UR_CALL(calculateKernelWorkDimensions(ZeKernel, Device, - // ZeThreadGroupDimensions, WG, WorkDim, - // GlobalWorkSize, LocalWorkSize)); - - // ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2])); - // ^^^ above correspond to api that enqueus kernel in queue_api, but we dont have access to queue - - // CommandBuffer->KernelsList.push_back(Kernel); - // for (size_t i = 0; i < NumKernelAlternatives; i++) { - // CommandBuffer->KernelsList.push_back(KernelAlternatives[i]); - // } - - // ur::level_zero::urKernelRetain(Kernel); - // // Retain alternative kernels if provided - // for (size_t i = 0; i < NumKernelAlternatives; i++) { - // ur::level_zero::urKernelRetain(KernelAlternatives[i]); - // } - - // if (Command) { - // UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, LocalWorkSize, - // NumKernelAlternatives, KernelAlternatives, - // *Command)); - // } - // ^^^ temporaly assume that command is null - - // std::vector ZeEventList; - // ze_event_handle_t ZeLaunchEvent = nullptr; - // UR_CALL(createSyncPointAndGetZeEvents( - // UR_COMMAND_KERNEL_LAUNCH, CommandBuffer, NumSyncPointsInWaitList, - // SyncPointWaitList, false, RetSyncPoint, ZeEventList, ZeLaunchEvent)); - // ^^^ this is not present, because this is in order - - // ZE2UR_CALL(zeCommandListAppendLaunchKernel, - // (CommandBuffer->ZeComputeCommandList, ZeKernel, - // &ZeThreadGroupDimensions, ZeLaunchEvent, ZeEventList.size(), - // getPointerFromVector(ZeEventList))); - // ^^ also part of queue_api return UR_RESULT_SUCCESS; } ur_result_t urCommandBufferEnqueueExp( - ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_t UrQueue, - uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, - ur_event_handle_t *Event) { - return 
UR_RESULT_SUCCESS; + ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + try { + return hQueue->enqueueCommandBuffer( + hCommandBuffer->ZeCommandList, phEvent, numEventsInWaitList, phEventWaitList); + } catch (...) { + return exceptionToResult(std::current_exception()); + } } + ur_result_t urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index 279f2591ec..eda0f1caf0 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -32,9 +32,12 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ze_command_list_handle_t CommandList, const ur_exp_command_buffer_desc_t *Desc ); - void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, - ur_event_handle_t Event); + ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ur_command_t commandType); + std::pair + getWaitListView(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents); // Releases the resources associated with the command-buffer before the // command-buffer object is destroyed. void cleanupCommandBufferResources(); @@ -45,6 +48,7 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_device_handle_t Device; ze_command_list_handle_t ZeCommandList; + std::vector waitList; // Indicates if command-buffer commands can be updated after it is closed. bool IsUpdatable = false; // Indicates if command buffer was finalized. 
diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index c59f084fc4..487e1df01f 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -14,6 +14,16 @@ #include +#include "../common.hpp" +#include "../device.hpp" + +#include "context.hpp" +#include "event.hpp" +#include "event_pool_cache.hpp" +#include "queue_api.hpp" + +#include "ur/ur.hpp" + struct ur_queue_handle_t_ { virtual ~ur_queue_handle_t_(); @@ -151,6 +161,10 @@ struct ur_queue_handle_t_ { enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; + virtual ur_result_t + enqueueCommandBuffer(ze_command_list_handle_t, ur_event_handle_t *, + uint32_t, const ur_event_handle_t *) = 0; + virtual ur_result_t enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index d97b4e39f9..2e0086b68e 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -380,6 +380,33 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( return UR_RESULT_SUCCESS; } + +ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( + uint32_t numCommandLists, + ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_command_t callerCommand + ) { + + std::scoped_lock Lock(this->Mutex); + auto signalEvent = + getSignalEvent(phEvent, callerCommand); + + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; + + ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + (handler.commandList.get(), numCommandLists, phCommandLists, + zeSignalEvent, numWaitEvents, pWaitEvents)); + + return UR_RESULT_SUCCESS; + } + ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferRead( ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, @@ -1101,6 +1128,19 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( return UR_RESULT_SUCCESS; } +ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBuffer( + ze_command_list_handle_t commandBufferCommandList, + ur_event_handle_t *phEvent, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList + ) { + return enqueueGenericCommandListsExp(1, + &commandBufferCommandList, + phEvent, + numEventsInWaitList, + phEventWaitList, + UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); + } ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 03fdbe0075..73b4b9e13c 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -77,6 +77,15 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t commandType); + ur_result_t enqueueGenericCommandListsExp( + uint32_t numCommandLists, + ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_command_t callerCommand + ); + ur_result_t enqueueEventsWaitWithBarrierImpl(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, 
@@ -273,6 +282,12 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; + ur_result_t enqueueCommandBuffer( + ze_command_list_handle_t commandBufferCommandList, + ur_event_handle_t *phEvent, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList + ) override; ur_result_t enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, From bb90ee529b6e8f94eb983d5e7d5270e7798e11ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Wed, 8 Jan 2025 08:56:10 +0000 Subject: [PATCH 04/46] Update UR calls handling --- .../adapters/level_zero/v2/command_buffer.cpp | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index dc916631fa..9d70e1693a 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -83,16 +83,16 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( const ur_exp_command_buffer_desc_t *Desc) : Context(Context), Device(Device), ZeCommandList(CommandList), IsUpdatable(Desc ? 
Desc->isUpdatable : false) { - ur::level_zero::urContextRetain(Context); - ur::level_zero::urDeviceRetain(Device); + UR_CALL_THROWS(ur::level_zero::urContextRetain(Context)); + UR_CALL_THROWS(ur::level_zero::urDeviceRetain(Device)); } void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() { // Release the memory allocated to the Context stored in the command_buffer - ur::level_zero::urContextRelease(Context); + UR_CALL_THROWS(ur::level_zero::urContextRelease(Context)); // Release the device - ur::level_zero::urDeviceRelease(Device); + UR_CALL_THROWS(ur::level_zero::urDeviceRelease(Device)); // Release the memory allocated to the CommandList stored in the // command_buffer @@ -101,7 +101,7 @@ void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() { } for (auto &AssociatedKernel : KernelsList) { - ur::level_zero::urKernelRelease(AssociatedKernel); + UR_CALL_THROWS(ur::level_zero::urKernelRelease(AssociatedKernel)); } } @@ -149,7 +149,7 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, ur_exp_command_buffer_handle_t *CommandBuffer) { bool IsUpdatable = CommandBufferDesc && CommandBufferDesc->isUpdatable; checkImmediateAppendSupport(Context, Device); - + if (IsUpdatable) { UR_ASSERT(Context->getPlatform()->ZeMutableCmdListExt.Supported, UR_RESULT_ERROR_UNSUPPORTED_FEATURE); @@ -176,7 +176,12 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (!hCommandBuffer->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; - hCommandBuffer->cleanupCommandBufferResources(); + try { + hCommandBuffer->cleanupCommandBufferResources(); + } catch (...) 
{ + delete hCommandBuffer; + return exceptionToResult(std::current_exception()); + } delete hCommandBuffer; return UR_RESULT_SUCCESS; } @@ -214,7 +219,7 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( std::ignore = eventWaitList; std::ignore = event; - //sync mechanic can be removed, because all lists are in-order + //sync mechanic can be ignored, because all lists are in-order std::ignore = numSyncPointsInWaitList; std::ignore = syncPointWaitList; std::ignore = retSyncPoint; From 84ef0dfb59bb3d7c36d696b126498c8e5ca8a2a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Wed, 8 Jan 2025 11:33:59 +0000 Subject: [PATCH 05/46] Remove unnecessary comment --- .../adapters/level_zero/v2/command_buffer.hpp | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index eda0f1caf0..f706eb9efd 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -71,22 +71,3 @@ struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { // L0 command ID identifying this command uint64_t CommandId; }; - -// struct kernel_command_handle : public ur_exp_command_buffer_command_handle_t_ { -// kernel_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, -// ur_kernel_handle_t Kernel, uint64_t CommandId, -// uint32_t WorkDim, bool UserDefinedLocalSize, -// uint32_t NumKernelAlternatives, -// ur_kernel_handle_t *KernelAlternatives); - -// ~kernel_command_handle(); - -// // Work-dimension the command was originally created with. -// uint32_t WorkDim; -// // Set to true if the user set the local work size on command creation. -// bool UserDefinedLocalSize; -// // Currently active kernel handle -// ur_kernel_handle_t Kernel; -// // Storage for valid kernel alternatives for this command. 
-// std::unordered_set ValidKernelHandles; -// }; From 1716db3a4c414be3c05a200ddcf49a266bcb0693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Wed, 8 Jan 2025 11:49:09 +0000 Subject: [PATCH 06/46] Move not implemented command buffer commands to previous position --- source/adapters/level_zero/v2/api.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 099f5a1bb5..7b150b709a 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -384,6 +384,18 @@ ur_result_t urCommandBufferAppendUSMAdviseExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +ur_result_t urCommandBufferRetainCommandExp( + ur_exp_command_buffer_command_handle_t hCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferReleaseCommandExp( + ur_exp_command_buffer_command_handle_t hCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + ur_result_t urCommandBufferUpdateKernelLaunchExp( ur_exp_command_buffer_command_handle_t hCommand, const ur_exp_command_buffer_update_kernel_launch_desc_t @@ -406,18 +418,6 @@ ur_result_t urCommandBufferUpdateWaitEventsExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urCommandBufferRetainCommandExp( - ur_exp_command_buffer_command_handle_t hCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferReleaseCommandExp( - ur_exp_command_buffer_command_handle_t hCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferCommandGetInfoExp( ur_exp_command_buffer_command_handle_t hCommand, ur_exp_command_buffer_command_info_t 
propName, size_t propSize, From 7da53d8a7d500cfda5ce7eacc05df83180ed120c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Fri, 10 Jan 2025 14:19:19 +0000 Subject: [PATCH 07/46] Fix most issues with code --- scripts/templates/queue_api.cpp.mako | 2 + scripts/templates/queue_api.hpp.mako | 2 + .../adapters/level_zero/v2/command_buffer.cpp | 266 ++++++++---------- .../adapters/level_zero/v2/command_buffer.hpp | 30 +- source/adapters/level_zero/v2/queue_api.hpp | 7 - 5 files changed, 139 insertions(+), 168 deletions(-) diff --git a/scripts/templates/queue_api.cpp.mako b/scripts/templates/queue_api.cpp.mako index efb8e85e8e..f9387add04 100644 --- a/scripts/templates/queue_api.cpp.mako +++ b/scripts/templates/queue_api.cpp.mako @@ -19,6 +19,8 @@ from templates import helper as th * */ +// This file was generated basing on scripts/templates/queue_api.cpp.mako + #include "queue_api.hpp" #include "ur_util.hpp" diff --git a/scripts/templates/queue_api.hpp.mako b/scripts/templates/queue_api.hpp.mako index 69a9af328b..3ed073f4f3 100644 --- a/scripts/templates/queue_api.hpp.mako +++ b/scripts/templates/queue_api.hpp.mako @@ -19,6 +19,8 @@ from templates import helper as th * */ +// This file was generated basing on scripts/templates/queue_api.hpp.mako + #pragma once #include diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 9d70e1693a..8af6975465 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -16,48 +16,15 @@ namespace { // Checks whether zeCommandListImmediateAppendCommandListsExp can be used for a -// given Context and Device. -void checkImmediateAppendSupport(ur_context_handle_t Context, - ur_device_handle_t Device) { - // TODO The L0 driver is not reporting this extension yet. Once it does, - // switch to using the variable zeDriverImmediateCommandListAppendFound. 
- - // Minimum version that supports zeCommandListImmediateAppendCommandListsExp. - constexpr uint32_t MinDriverVersion = 30898; +// given context. +void checkImmediateAppendSupport(ur_context_handle_t context) { bool DriverSupportsImmediateAppend = - Context->getPlatform()->isDriverVersionNewerOrSimilar(1, 3, - MinDriverVersion); - - // If this environment variable is: - // - Set to 1: the immediate append path will always be enabled as long the - // pre-requisites are met. - // - Set to 0: the immediate append path will always be disabled. - // - Not Defined: The default behaviour will be used which enables the - // immediate append path only for some devices when the pre-requisites are - // met. - const char *AppendEnvVarName = "UR_L0_CMD_BUFFER_USE_IMMEDIATE_APPEND_PATH"; - const char *UrRet = std::getenv(AppendEnvVarName); - - if (!Device->ImmCommandListUsed) { - logger::error("Adapter v2 is used but immediate command-lists are currently " - "disabled. Immediate command-lists are " - "required to use the adapter v2."); - std::abort(); - } + context->getPlatform()->ZeCommandListImmediateAppendExt.Supported; + if (!DriverSupportsImmediateAppend) { logger::error("Adapter v2 is used but " "the current driver does not support the " - "zeCommandListImmediateAppendCommandListsExp entrypoint. A " - "driver version of at least {} is required to use the " - "immediate append path.", MinDriverVersion); - std::abort(); - } - - const bool EnableAppendPath = !UrRet || std::atoi(UrRet) == 1; - if (!Device->isPVC() && !EnableAppendPath) { - logger::error("Adapter v2 is used but " - "immediate append support is not enabled." 
- "Please set {}=1 to enable it.", AppendEnvVarName); + "zeCommandListImmediateAppendCommandListsExp entrypoint."); std::abort(); } @@ -78,30 +45,24 @@ ur_exp_command_buffer_handle_t_::getWaitListView( } ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( - ur_context_handle_t Context, ur_device_handle_t Device, - ze_command_list_handle_t CommandList, - const ur_exp_command_buffer_desc_t *Desc) - : Context(Context), Device(Device), ZeCommandList(CommandList), - IsUpdatable(Desc ? Desc->isUpdatable : false) { - UR_CALL_THROWS(ur::level_zero::urContextRetain(Context)); - UR_CALL_THROWS(ur::level_zero::urDeviceRetain(Device)); + ur_context_handle_t context, ur_device_handle_t device, + ze_command_list_handle_t commandList, + const ur_exp_command_buffer_desc_t *desc) + : context(context), device(device), zeCommandList(commandList), + isUpdatable(desc ? desc->isUpdatable : false) { + UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); + UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); } void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() { // Release the memory allocated to the Context stored in the command_buffer - UR_CALL_THROWS(ur::level_zero::urContextRelease(Context)); + UR_CALL_THROWS(ur::level_zero::urContextRelease(context)); // Release the device - UR_CALL_THROWS(ur::level_zero::urDeviceRelease(Device)); - - // Release the memory allocated to the CommandList stored in the - // command_buffer - if (ZeCommandList) { - ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); - } + UR_CALL_THROWS(ur::level_zero::urDeviceRelease(device)); - for (auto &AssociatedKernel : KernelsList) { - UR_CALL_THROWS(ur::level_zero::urKernelRelease(AssociatedKernel)); + for (auto &associatedKernel : kernelsList) { + UR_CALL_THROWS(ur::level_zero::urKernelRelease(associatedKernel)); } } @@ -109,57 +70,57 @@ namespace ur::level_zero { /** * Creates a L0 command list - * @param[in] Context The Context associated with the command-list - * 
@param[in] Device The Device associated with the command-list - * @param[in] IsUpdatable Whether the command-list should be mutable. - * @param[out] CommandList The L0 command-list created by this function. + * @param[in] context The Context associated with the command-list + * @param[in] device The Device associated with the command-list + * @param[in] isUpdatable Whether the command-list should be mutable. + * @param[out] commandList The L0 command-list created by this function. * @return UR_RESULT_SUCCESS or an error code on failure */ -ur_result_t createMainCommandList(ur_context_handle_t Context, - ur_device_handle_t Device, - bool IsUpdatable, - ze_command_list_handle_t &CommandList) { +ur_result_t createMainCommandList(ur_context_handle_t context, + ur_device_handle_t device, + bool isUpdatable, + ze_command_list_handle_t &commandList) { using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; // that should be call to queue getZeOrdinal, // but queue is not available while constructing buffer - uint32_t QueueGroupOrdinal = Device->QueueGroup[queue_group_type::Compute].ZeOrdinal; + uint32_t queueGroupOrdinal = device->QueueGroup[queue_group_type::Compute].ZeOrdinal; - ZeStruct ZeCommandListDesc; - ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + ZeStruct zeCommandListDesc; + zeCommandListDesc.commandQueueGroupOrdinal = queueGroupOrdinal; - ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER; + zeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER; - ZeStruct ZeMutableCommandListDesc; - if (IsUpdatable) { - ZeMutableCommandListDesc.flags = 0; - ZeCommandListDesc.pNext = &ZeMutableCommandListDesc; + ZeStruct zeMutableCommandListDesc; + if (isUpdatable) { + zeMutableCommandListDesc.flags = 0; + zeCommandListDesc.pNext = &zeMutableCommandListDesc; } - ZE2UR_CALL(zeCommandListCreate, (Context->getZeHandle(), Device->ZeDevice, - &ZeCommandListDesc, &CommandList)); + ZE2UR_CALL(zeCommandListCreate, (context->getZeHandle(), 
device->ZeDevice, + &zeCommandListDesc, &commandList)); return UR_RESULT_SUCCESS; } ur_result_t -urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, - const ur_exp_command_buffer_desc_t *CommandBufferDesc, - ur_exp_command_buffer_handle_t *CommandBuffer) { - bool IsUpdatable = CommandBufferDesc && CommandBufferDesc->isUpdatable; - checkImmediateAppendSupport(Context, Device); - - if (IsUpdatable) { - UR_ASSERT(Context->getPlatform()->ZeMutableCmdListExt.Supported, - UR_RESULT_ERROR_UNSUPPORTED_FEATURE); - } - - ze_command_list_handle_t ZeCommandList = nullptr; - UR_CALL(createMainCommandList(Context, Device, IsUpdatable, ZeCommandList)); +urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, + const ur_exp_command_buffer_desc_t *commandBufferDesc, + ur_exp_command_buffer_handle_t *commandBuffer) { try { - *CommandBuffer = new ur_exp_command_buffer_handle_t_( - Context, Device, ZeCommandList, CommandBufferDesc); + bool isUpdatable = commandBufferDesc && commandBufferDesc->isUpdatable; + checkImmediateAppendSupport(context); + + if (isUpdatable) { + UR_ASSERT(context->getPlatform()->ZeMutableCmdListExt.Supported, + UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + } + + ze_command_list_handle_t zeCommandList = nullptr; + UR_CALL(createMainCommandList(context, device, isUpdatable, zeCommandList)); + *commandBuffer = new ur_exp_command_buffer_handle_t_( + context, device, zeCommandList, commandBufferDesc); } catch (const std::bad_alloc &) { return exceptionToResult(std::current_exception()); } @@ -167,16 +128,20 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, } ur_result_t urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - hCommandBuffer->RefCount.increment(); + try { + hCommandBuffer->RefCount.increment(); + } catch (const std::bad_alloc &) { + return exceptionToResult(std::current_exception()); + } return UR_RESULT_SUCCESS; } ur_result_t 
urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - if (!hCommandBuffer->RefCount.decrementAndTest()) - return UR_RESULT_SUCCESS; - try { + if (!hCommandBuffer->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + hCommandBuffer->cleanupCommandBufferResources(); } catch (...) { delete hCommandBuffer; @@ -188,17 +153,20 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { ur_result_t urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(!hCommandBuffer->IsFinalized, UR_RESULT_ERROR_INVALID_OPERATION); - - // It is not allowed to append to command list from multiple threads. - std::scoped_lock Guard(hCommandBuffer->Mutex); + try { + UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(!hCommandBuffer->isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); - // Close the command lists and have them ready for dispatch. - ZE2UR_CALL(zeCommandListClose, (hCommandBuffer->ZeCommandList)); + // It is not allowed to append to command list from multiple threads. + std::scoped_lock guard(hCommandBuffer->Mutex); - hCommandBuffer->IsFinalized = true; + // Close the command lists and have them ready for dispatch. + ZE2UR_CALL(zeCommandListClose, (hCommandBuffer->zeCommandList.get())); + hCommandBuffer->isFinalized = true; + } catch (...) 
{ + return exceptionToResult(std::current_exception()); + } return UR_RESULT_SUCCESS; } @@ -228,41 +196,44 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( std::ignore = numKernelAlternatives; std::ignore = kernelAlternatives; std::ignore = command; + try { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); - - UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(commandBuffer->Device); + ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(commandBuffer->device); - std::scoped_lock Lock(commandBuffer->Mutex, - hKernel->Mutex); + std::scoped_lock lock(commandBuffer->Mutex, + hKernel->Mutex); - ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; - uint32_t WG[3]{}; - UR_CALL(calculateKernelWorkDimensions(hZeKernel, commandBuffer->Device, - zeThreadGroupDimensions, WG, workDim, - pGlobalWorkSize, pLocalWorkSize)); + ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; + uint32_t wg[3]{}; + UR_CALL(calculateKernelWorkDimensions(hZeKernel, commandBuffer->device, + zeThreadGroupDimensions, wg, workDim, + pGlobalWorkSize, pLocalWorkSize)); - auto waitList = commandBuffer->getWaitListView(nullptr, 0); + auto waitList = commandBuffer->getWaitListView(nullptr, 0); - bool memoryMigrated = false; - auto memoryMigrate = [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (commandBuffer->ZeCommandList, dst, src, size, nullptr, - waitList.second, waitList.first)); - memoryMigrated = true; - }; + bool memoryMigrated = false; + auto memoryMigrate = [&](void 
*src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (commandBuffer->zeCommandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + }; - UR_CALL(hKernel->prepareForSubmission(commandBuffer->Context, commandBuffer->Device, pGlobalWorkOffset, - workDim, WG[0], WG[1], WG[2], - memoryMigrate)); + UR_CALL(hKernel->prepareForSubmission(commandBuffer->context, commandBuffer->device, pGlobalWorkOffset, + workDim, wg[0], wg[1], wg[2], + memoryMigrate)); - ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (commandBuffer->ZeCommandList, hZeKernel, &zeThreadGroupDimensions, - nullptr, waitList.second, waitList.first)); + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (commandBuffer->zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions, + nullptr, waitList.second, waitList.first)); + } catch (...) { + return exceptionToResult(std::current_exception()); + } return UR_RESULT_SUCCESS; } @@ -273,7 +244,7 @@ ur_result_t urCommandBufferEnqueueExp( ur_event_handle_t *phEvent) { try { return hQueue->enqueueCommandBuffer( - hCommandBuffer->ZeCommandList, phEvent, numEventsInWaitList, phEventWaitList); + hCommandBuffer->zeCommandList.get(), phEvent, numEventsInWaitList, phEventWaitList); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -284,25 +255,28 @@ urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - - switch (propName) { - case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: - return ReturnValue(uint32_t{hCommandBuffer->RefCount.load()}); - case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { - ur_exp_command_buffer_desc_t Descriptor{}; - Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; - Descriptor.pNext = nullptr; - Descriptor.isUpdatable = hCommandBuffer->IsUpdatable; - Descriptor.isInOrder = true; - Descriptor.enableProfiling = hCommandBuffer->IsProfilingEnabled; - - return ReturnValue(Descriptor); - } - default: - assert(!"Command-buffer info request not implemented"); + try { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{hCommandBuffer->RefCount.load()}); + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { + ur_exp_command_buffer_desc_t Descriptor{}; + Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + Descriptor.pNext = nullptr; + Descriptor.isUpdatable = hCommandBuffer->isUpdatable; + Descriptor.isInOrder = true; + Descriptor.enableProfiling = hCommandBuffer->isProfilingEnabled; + + return ReturnValue(Descriptor); + } + default: + assert(!"Command-buffer info request not implemented"); + } + } catch (...) 
{ + return exceptionToResult(std::current_exception()); } - return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index f706eb9efd..020e466afc 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -22,15 +22,15 @@ #include "queue_api.hpp" struct command_buffer_profiling_t { - ur_exp_command_buffer_sync_point_t NumEvents; - ze_kernel_timestamp_result_t *Timestamps; + ur_exp_command_buffer_sync_point_t numEvents; + ze_kernel_timestamp_result_t *timestamps; }; struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_exp_command_buffer_handle_t_( - ur_context_handle_t Context, ur_device_handle_t Device, - ze_command_list_handle_t CommandList, - const ur_exp_command_buffer_desc_t *Desc + ur_context_handle_t context, ur_device_handle_t device, + ze_command_list_handle_t commandList, + const ur_exp_command_buffer_desc_t *desc ); ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType); @@ -43,31 +43,31 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { void cleanupCommandBufferResources(); // UR context associated with this command-buffer - ur_context_handle_t Context; + ur_context_handle_t context; // Device associated with this command buffer - ur_device_handle_t Device; - ze_command_list_handle_t ZeCommandList; + ur_device_handle_t device; + v2::raii::ze_command_list_handle_t zeCommandList; std::vector waitList; // Indicates if command-buffer commands can be updated after it is closed. - bool IsUpdatable = false; + bool isUpdatable = false; // Indicates if command buffer was finalized. - bool IsFinalized = false; + bool isFinalized = false; // Command-buffer profiling is enabled. - bool IsProfilingEnabled = false; + bool isProfilingEnabled = false; // This list is needed to release all kernels retained by the // command_buffer. 
- std::vector KernelsList; + std::vector kernelsList; }; struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { ur_exp_command_buffer_command_handle_t_(ur_exp_command_buffer_handle_t, uint64_t); - virtual ~ur_exp_command_buffer_command_handle_t_(); + ~ur_exp_command_buffer_command_handle_t_(); // Command-buffer of this command. - ur_exp_command_buffer_handle_t CommandBuffer; + ur_exp_command_buffer_handle_t commandBuffer; // L0 command ID identifying this command - uint64_t CommandId; + uint64_t commandId; }; diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index 487e1df01f..878e8f31dc 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -15,14 +15,7 @@ #include #include "../common.hpp" -#include "../device.hpp" -#include "context.hpp" -#include "event.hpp" -#include "event_pool_cache.hpp" -#include "queue_api.hpp" - -#include "ur/ur.hpp" struct ur_queue_handle_t_ { virtual ~ur_queue_handle_t_(); From 895f5c6460436f5fa6c9a53d54dc61853030c12f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Mon, 13 Jan 2025 10:56:06 +0000 Subject: [PATCH 08/46] Fix formatting and modify queue_api template --- scripts/templates/queue_api.hpp.mako | 8 +++ .../adapters/level_zero/v2/command_buffer.cpp | 59 ++++++++++--------- .../adapters/level_zero/v2/command_buffer.hpp | 13 ++-- source/adapters/level_zero/v2/queue_api.cpp | 2 + source/adapters/level_zero/v2/queue_api.hpp | 14 ++--- .../v2/queue_immediate_in_order.cpp | 40 +++++-------- .../v2/queue_immediate_in_order.hpp | 20 +++---- 7 files changed, 74 insertions(+), 82 deletions(-) diff --git a/scripts/templates/queue_api.hpp.mako b/scripts/templates/queue_api.hpp.mako index 3ed073f4f3..e540ed41d4 100644 --- a/scripts/templates/queue_api.hpp.mako +++ b/scripts/templates/queue_api.hpp.mako @@ -24,6 +24,7 @@ from templates import helper as th #pragma once #include +#include struct 
ur_queue_handle_t_ { virtual ~ur_queue_handle_t_(); @@ -33,4 +34,11 @@ struct ur_queue_handle_t_ { %for obj in th.get_queue_related_functions(specs, n, tags): virtual ${x}_result_t ${th.transform_queue_related_function_name(n, tags, obj, format=["type"])} = 0; %endfor + + virtual ur_result_t + enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, + uint32_t, const ur_mem_handle_t *, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t, const ur_event_handle_t *, + ur_event_handle_t *) = 0; }; diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 8af6975465..c5d8d38be2 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -10,8 +10,8 @@ #include "command_buffer.hpp" #include "../helpers/kernel_helpers.hpp" -#include "logger/ur_logger.hpp" #include "../ur_interface_loader.hpp" +#include "logger/ur_logger.hpp" namespace { @@ -27,10 +27,9 @@ void checkImmediateAppendSupport(ur_context_handle_t context) { "zeCommandListImmediateAppendCommandListsExp entrypoint."); std::abort(); } - } -} +} // namespace std::pair ur_exp_command_buffer_handle_t_::getWaitListView( @@ -46,8 +45,8 @@ ur_exp_command_buffer_handle_t_::getWaitListView( ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t context, ur_device_handle_t device, - ze_command_list_handle_t commandList, - const ur_exp_command_buffer_desc_t *desc) + ze_command_list_handle_t commandList, + const ur_exp_command_buffer_desc_t *desc) : context(context), device(device), zeCommandList(commandList), isUpdatable(desc ? 
desc->isUpdatable : false) { UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); @@ -77,15 +76,14 @@ namespace ur::level_zero { * @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t createMainCommandList(ur_context_handle_t context, - ur_device_handle_t device, - bool isUpdatable, + ur_device_handle_t device, bool isUpdatable, ze_command_list_handle_t &commandList) { - using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; - // that should be call to queue getZeOrdinal, + // that should be call to queue getZeOrdinal, // but queue is not available while constructing buffer - uint32_t queueGroupOrdinal = device->QueueGroup[queue_group_type::Compute].ZeOrdinal; + uint32_t queueGroupOrdinal = + device->QueueGroup[queue_group_type::Compute].ZeOrdinal; ZeStruct zeCommandListDesc; zeCommandListDesc.commandQueueGroupOrdinal = queueGroupOrdinal; @@ -116,7 +114,7 @@ urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, UR_ASSERT(context->getPlatform()->ZeMutableCmdListExt.Supported, UR_RESULT_ERROR_UNSUPPORTED_FEATURE); } - + ze_command_list_handle_t zeCommandList = nullptr; UR_CALL(createMainCommandList(context, device, isUpdatable, zeCommandList)); *commandBuffer = new ur_exp_command_buffer_handle_t_( @@ -152,7 +150,7 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { } ur_result_t -urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { try { UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(!hCommandBuffer->isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); @@ -180,33 +178,34 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( uint32_t numEventsInWaitList, const ur_event_handle_t *eventWaitList, ur_exp_command_buffer_sync_point_t *retSyncPoint, ur_event_handle_t *event, ur_exp_command_buffer_command_handle_t *command) { - //Need to know semantics - // - should 
they be checked before kernel execution or before kernel appending to list - // if latter then it is easy fix, if former then TODO + // Need to know semantics + // - should they be checked before kernel execution or before kernel + // appending to list if latter then it is easy fix, if former then TODO std::ignore = numEventsInWaitList; std::ignore = eventWaitList; std::ignore = event; - //sync mechanic can be ignored, because all lists are in-order + // sync mechanic can be ignored, because all lists are in-order std::ignore = numSyncPointsInWaitList; std::ignore = syncPointWaitList; std::ignore = retSyncPoint; - //TODO + // TODO std::ignore = numKernelAlternatives; std::ignore = kernelAlternatives; std::ignore = command; try { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hKernel->getProgramHandle(), + UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(commandBuffer->device); - std::scoped_lock lock(commandBuffer->Mutex, - hKernel->Mutex); + std::scoped_lock lock( + commandBuffer->Mutex, hKernel->Mutex); ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; uint32_t wg[3]{}; @@ -219,18 +218,19 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( bool memoryMigrated = false; auto memoryMigrate = [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (commandBuffer->zeCommandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandBuffer->zeCommandList.get(), dst, src, size, + nullptr, waitList.second, waitList.first)); memoryMigrated = true; }; - UR_CALL(hKernel->prepareForSubmission(commandBuffer->context, commandBuffer->device, pGlobalWorkOffset, - workDim, wg[0], wg[1], wg[2], - memoryMigrate)); + UR_CALL(hKernel->prepareForSubmission( 
+ commandBuffer->context, commandBuffer->device, pGlobalWorkOffset, + workDim, wg[0], wg[1], wg[2], memoryMigrate)); ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (commandBuffer->zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions, - nullptr, waitList.second, waitList.first)); + (commandBuffer->zeCommandList.get(), hZeKernel, + &zeThreadGroupDimensions, nullptr, waitList.second, + waitList.first)); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -243,8 +243,9 @@ ur_result_t urCommandBufferEnqueueExp( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { try { - return hQueue->enqueueCommandBuffer( - hCommandBuffer->zeCommandList.get(), phEvent, numEventsInWaitList, phEventWaitList); + return hQueue->enqueueCommandBuffer(hCommandBuffer->zeCommandList.get(), + phEvent, numEventsInWaitList, + phEventWaitList); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index 020e466afc..c4f5a7f78b 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -9,11 +9,11 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include #include #include #include -#include #include "common.hpp" @@ -27,13 +27,12 @@ struct command_buffer_profiling_t { }; struct ur_exp_command_buffer_handle_t_ : public _ur_object { - ur_exp_command_buffer_handle_t_( - ur_context_handle_t context, ur_device_handle_t device, - ze_command_list_handle_t commandList, - const ur_exp_command_buffer_desc_t *desc - ); + ur_exp_command_buffer_handle_t_(ur_context_handle_t context, + ur_device_handle_t device, + ze_command_list_handle_t commandList, + const ur_exp_command_buffer_desc_t *desc); ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, - ur_command_t commandType); + 
ur_command_t commandType); std::pair getWaitListView(const ur_event_handle_t *phWaitEvents, diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp index e4659b5f2c..62476e0f9c 100644 --- a/source/adapters/level_zero/v2/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -10,6 +10,8 @@ * */ +// This file was generated basing on scripts/templates/queue_api.cpp.mako + #include "queue_api.hpp" #include "ur_util.hpp" diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index 878e8f31dc..65ee7a3889 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -10,12 +10,12 @@ * */ +// This file was generated basing on scripts/templates/queue_api.hpp.mako + #pragma once #include - -#include "../common.hpp" - +#include struct ur_queue_handle_t_ { virtual ~ur_queue_handle_t_(); @@ -154,14 +154,14 @@ struct ur_queue_handle_t_ { enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; - virtual ur_result_t - enqueueCommandBuffer(ze_command_list_handle_t, ur_event_handle_t *, - uint32_t, const ur_event_handle_t *) = 0; - virtual ur_result_t enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; + + virtual ur_result_t enqueueCommandBuffer(ze_command_list_handle_t, + ur_event_handle_t *, uint32_t, + const ur_event_handle_t *) = 0; }; diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 2e0086b68e..5c308dc171 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -380,20 +380,13 @@ ur_result_t 
ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( return UR_RESULT_SUCCESS; } - ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( - uint32_t numCommandLists, - ze_command_list_handle_t *phCommandLists, - ur_event_handle_t *phEvent, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_command_t callerCommand - ) { - - std::scoped_lock Lock(this->Mutex); - auto signalEvent = - getSignalEvent(phEvent, callerCommand); + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand) { + std::scoped_lock Lock(this->Mutex); + auto signalEvent = getSignalEvent(phEvent, callerCommand); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -404,8 +397,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( (handler.commandList.get(), numCommandLists, phCommandLists, zeSignalEvent, numWaitEvents, pWaitEvents)); - return UR_RESULT_SUCCESS; - } + return UR_RESULT_SUCCESS; +} ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferRead( ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, @@ -1129,18 +1122,13 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( } ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBuffer( - ze_command_list_handle_t commandBufferCommandList, - ur_event_handle_t *phEvent, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList - ) { - return enqueueGenericCommandListsExp(1, - &commandBufferCommandList, - phEvent, - numEventsInWaitList, - phEventWaitList, - UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); - } + ze_command_list_handle_t commandBufferCommandList, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) { + return enqueueGenericCommandListsExp(1, &commandBufferCommandList, 
phEvent, + numEventsInWaitList, phEventWaitList, + UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); +} ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 73b4b9e13c..ee22ad6db9 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -78,13 +78,9 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { ur_command_t commandType); ur_result_t enqueueGenericCommandListsExp( - uint32_t numCommandLists, - ze_command_list_handle_t *phCommandLists, - ur_event_handle_t *phEvent, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_command_t callerCommand - ); + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand); ur_result_t enqueueEventsWaitWithBarrierImpl(uint32_t numEventsInWaitList, @@ -282,12 +278,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; - ur_result_t enqueueCommandBuffer( - ze_command_list_handle_t commandBufferCommandList, - ur_event_handle_t *phEvent, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList - ) override; + ur_result_t + enqueueCommandBuffer(ze_command_list_handle_t commandBufferCommandList, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) override; ur_result_t enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, uint32_t, 
const ur_mem_handle_t *, From 384326c88bfd82979716411367e9bd86431c5780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Mon, 13 Jan 2025 11:41:12 +0000 Subject: [PATCH 09/46] Move command buffer cleanup to destructor --- scripts/templates/queue_api.hpp.mako | 7 ++----- source/adapters/level_zero/v2/command_buffer.cpp | 12 +++++------- source/adapters/level_zero/v2/command_buffer.hpp | 4 +--- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/scripts/templates/queue_api.hpp.mako b/scripts/templates/queue_api.hpp.mako index e540ed41d4..6907d2ae32 100644 --- a/scripts/templates/queue_api.hpp.mako +++ b/scripts/templates/queue_api.hpp.mako @@ -36,9 +36,6 @@ struct ur_queue_handle_t_ { %endfor virtual ur_result_t - enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, - uint32_t, const ur_mem_handle_t *, - const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) = 0; + enqueueCommandBuffer(ze_command_list_handle_t, ur_event_handle_t *, + uint32_t, const ur_event_handle_t *) = 0; }; diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index c5d8d38be2..066e113409 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -53,15 +53,15 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); } -void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() { +ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { // Release the memory allocated to the Context stored in the command_buffer - UR_CALL_THROWS(ur::level_zero::urContextRelease(context)); + ur::level_zero::urContextRelease(context); // Release the device - UR_CALL_THROWS(ur::level_zero::urDeviceRelease(device)); + ur::level_zero::urDeviceRelease(device); for (auto &associatedKernel : kernelsList) { 
- UR_CALL_THROWS(ur::level_zero::urKernelRelease(associatedKernel)); + ur::level_zero::urKernelRelease(associatedKernel); } } @@ -140,12 +140,10 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (!hCommandBuffer->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; - hCommandBuffer->cleanupCommandBufferResources(); - } catch (...) { delete hCommandBuffer; + } catch (...) { return exceptionToResult(std::current_exception()); } - delete hCommandBuffer; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index c4f5a7f78b..8299b6cbde 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -31,15 +31,13 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_device_handle_t device, ze_command_list_handle_t commandList, const ur_exp_command_buffer_desc_t *desc); + ~ur_exp_command_buffer_handle_t_(); ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType); std::pair getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents); - // Releases the resources associated with the command-buffer before the - // command-buffer object is destroyed. 
- void cleanupCommandBufferResources(); // UR context associated with this command-buffer ur_context_handle_t context; From a1dd4287d28a3b52a37485005d484f36282236af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Mon, 13 Jan 2025 13:26:18 +0000 Subject: [PATCH 10/46] Use cached command lists instead of created ones --- .../adapters/level_zero/v2/command_buffer.cpp | 18 +++++++++++++----- .../adapters/level_zero/v2/command_buffer.hpp | 10 +++++----- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 066e113409..12c3c68baa 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -45,9 +45,11 @@ ur_exp_command_buffer_handle_t_::getWaitListView( ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t context, ur_device_handle_t device, - ze_command_list_handle_t commandList, + v2::raii::command_list_unique_handle &&commandList, const ur_exp_command_buffer_desc_t *desc) - : context(context), device(device), zeCommandList(commandList), + : context(context), device(device), + zeCommandList( + std::forward(commandList)), isUpdatable(desc ? 
desc->isUpdatable : false) { UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); @@ -115,10 +117,16 @@ urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, UR_RESULT_ERROR_UNSUPPORTED_FEATURE); } - ze_command_list_handle_t zeCommandList = nullptr; - UR_CALL(createMainCommandList(context, device, isUpdatable, zeCommandList)); + using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; + uint32_t queueGroupOrdinal = + device->QueueGroup[queue_group_type::Compute].ZeOrdinal; + v2::raii::command_list_unique_handle zeCommandList = + context->commandListCache.getRegularCommandList( + device->ZeDevice, true, queueGroupOrdinal, true); + *commandBuffer = new ur_exp_command_buffer_handle_t_( - context, device, zeCommandList, commandBufferDesc); + context, device, std::move(zeCommandList), commandBufferDesc); + } catch (const std::bad_alloc &) { return exceptionToResult(std::current_exception()); } diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index 8299b6cbde..9a56f30b70 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -27,10 +27,10 @@ struct command_buffer_profiling_t { }; struct ur_exp_command_buffer_handle_t_ : public _ur_object { - ur_exp_command_buffer_handle_t_(ur_context_handle_t context, - ur_device_handle_t device, - ze_command_list_handle_t commandList, - const ur_exp_command_buffer_desc_t *desc); + ur_exp_command_buffer_handle_t_( + ur_context_handle_t context, ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, + const ur_exp_command_buffer_desc_t *desc); ~ur_exp_command_buffer_handle_t_(); ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType); @@ -43,7 +43,7 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_context_handle_t context; // Device 
associated with this command buffer ur_device_handle_t device; - v2::raii::ze_command_list_handle_t zeCommandList; + v2::raii::command_list_unique_handle zeCommandList; std::vector waitList; // Indicates if command-buffer commands can be updated after it is closed. From d03e88e518313d5409f6a205add60f2c5d0deb05 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Thu, 16 Jan 2025 12:15:23 +0100 Subject: [PATCH 11/46] update GitHub Cache action to 4.2.0 The cache action is stopping support for anything but the very latest version. We need to update to 4.2.0 for the cache to continue functioning. https://github.com/actions/cache/discussions/1510 --- .github/workflows/benchmarks-reusable.yml | 2 +- .github/workflows/docs.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml index 83c05f896c..86b2040438 100644 --- a/.github/workflows/benchmarks-reusable.yml +++ b/.github/workflows/benchmarks-reusable.yml @@ -232,7 +232,7 @@ jobs: - name: Upload HTML report if: ${{ always() && inputs.upload_report }} - uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 with: path: ur-repo/benchmark_results.html key: benchmark-results-${{ matrix.adapter.str_name }}-${{ github.run_id }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b4c40334d4..fbd4ffefef 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -52,7 +52,7 @@ jobs: - name: Download benchmark HTML id: download-bench-html - uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 with: path: ur-repo/benchmark_results.html key: benchmark-results- From 4e3072afccf0bbe747c3dd974f1e3e78527640ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Mon, 20 Jan 2025 
09:27:47 +0000 Subject: [PATCH 12/46] Remove not needed function and change phrasing --- scripts/templates/queue_api.cpp.mako | 2 +- scripts/templates/queue_api.hpp.mako | 2 +- .../adapters/level_zero/v2/command_buffer.cpp | 40 +------------------ 3 files changed, 4 insertions(+), 40 deletions(-) diff --git a/scripts/templates/queue_api.cpp.mako b/scripts/templates/queue_api.cpp.mako index f9387add04..967326ad16 100644 --- a/scripts/templates/queue_api.cpp.mako +++ b/scripts/templates/queue_api.cpp.mako @@ -19,7 +19,7 @@ from templates import helper as th * */ -// This file was generated basing on scripts/templates/queue_api.cpp.mako +// Do not edit. This file is auto generated from a template: scripts/templates/queue_api.cpp.mako #include "queue_api.hpp" #include "ur_util.hpp" diff --git a/scripts/templates/queue_api.hpp.mako b/scripts/templates/queue_api.hpp.mako index 6907d2ae32..a75403a520 100644 --- a/scripts/templates/queue_api.hpp.mako +++ b/scripts/templates/queue_api.hpp.mako @@ -19,7 +19,7 @@ from templates import helper as th * */ -// This file was generated basing on scripts/templates/queue_api.hpp.mako +// Do not edit. This file is auto generated from a template: scripts/templates/queue_api.hpp.mako #pragma once diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 12c3c68baa..ed185d4fbb 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -69,41 +69,6 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { namespace ur::level_zero { -/** - * Creates a L0 command list - * @param[in] context The Context associated with the command-list - * @param[in] device The Device associated with the command-list - * @param[in] isUpdatable Whether the command-list should be mutable. - * @param[out] commandList The L0 command-list created by this function. 
- * @return UR_RESULT_SUCCESS or an error code on failure - */ -ur_result_t createMainCommandList(ur_context_handle_t context, - ur_device_handle_t device, bool isUpdatable, - ze_command_list_handle_t &commandList) { - - using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; - // that should be call to queue getZeOrdinal, - // but queue is not available while constructing buffer - uint32_t queueGroupOrdinal = - device->QueueGroup[queue_group_type::Compute].ZeOrdinal; - - ZeStruct zeCommandListDesc; - zeCommandListDesc.commandQueueGroupOrdinal = queueGroupOrdinal; - - zeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER; - - ZeStruct zeMutableCommandListDesc; - if (isUpdatable) { - zeMutableCommandListDesc.flags = 0; - zeCommandListDesc.pNext = &zeMutableCommandListDesc; - } - - ZE2UR_CALL(zeCommandListCreate, (context->getZeHandle(), device->ZeDevice, - &zeCommandListDesc, &commandList)); - - return UR_RESULT_SUCCESS; -} - ur_result_t urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, const ur_exp_command_buffer_desc_t *commandBufferDesc, @@ -112,9 +77,8 @@ urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, bool isUpdatable = commandBufferDesc && commandBufferDesc->isUpdatable; checkImmediateAppendSupport(context); - if (isUpdatable) { - UR_ASSERT(context->getPlatform()->ZeMutableCmdListExt.Supported, - UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + if (!context->getPlatform()->ZeMutableCmdListExt.Supported) { + throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; From cbfba5817e55f5cbaa612376be5501e8c79557d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Mon, 20 Jan 2025 15:30:43 +0000 Subject: [PATCH 13/46] Add initial implementation of command list manager --- source/adapters/level_zero/CMakeLists.txt | 2 + .../adapters/level_zero/v2/command_buffer.cpp | 75 ++------ 
.../adapters/level_zero/v2/command_buffer.hpp | 10 +- .../level_zero/v2/command_list_manager.cpp | 167 ++++++++++++++++++ .../level_zero/v2/command_list_manager.hpp | 62 +++++++ source/adapters/level_zero/v2/queue_api.cpp | 3 +- source/adapters/level_zero/v2/queue_api.hpp | 3 +- .../v2/queue_immediate_in_order.cpp | 22 ++- .../v2/queue_immediate_in_order.hpp | 8 + 9 files changed, 277 insertions(+), 75 deletions(-) create mode 100644 source/adapters/level_zero/v2/command_list_manager.cpp create mode 100644 source/adapters/level_zero/v2/command_list_manager.hpp diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 46129f0ccf..c75c870be7 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -147,6 +147,7 @@ if(UR_BUILD_ADAPTER_L0_V2) # v2-only sources ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_manager.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.hpp @@ -162,6 +163,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/api.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_manager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.cpp diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index ed185d4fbb..90d38cfffc 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -47,26 +47,12 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle 
&&commandList, const ur_exp_command_buffer_desc_t *desc) - : context(context), device(device), - zeCommandList( + : commandListManager( + context, device, std::forward(commandList)), - isUpdatable(desc ? desc->isUpdatable : false) { - UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); - UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); -} - -ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { - // Release the memory allocated to the Context stored in the command_buffer - ur::level_zero::urContextRelease(context); - - // Release the device - ur::level_zero::urDeviceRelease(device); - - for (auto &associatedKernel : kernelsList) { - ur::level_zero::urKernelRelease(associatedKernel); - } -} + isUpdatable(desc ? desc->isUpdatable : false) {} +ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {} namespace ur::level_zero { ur_result_t @@ -74,7 +60,6 @@ urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, const ur_exp_command_buffer_desc_t *commandBufferDesc, ur_exp_command_buffer_handle_t *commandBuffer) { try { - bool isUpdatable = commandBufferDesc && commandBufferDesc->isUpdatable; checkImmediateAppendSupport(context); if (!context->getPlatform()->ZeMutableCmdListExt.Supported) { @@ -124,12 +109,7 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { try { UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(!hCommandBuffer->isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); - - // It is not allowed to append to command list from multiple threads. - std::scoped_lock guard(hCommandBuffer->Mutex); - - // Close the command lists and have them ready for dispatch. - ZE2UR_CALL(zeCommandListClose, (hCommandBuffer->zeCommandList.get())); + hCommandBuffer->commandListManager.closeCommandList(); hCommandBuffer->isFinalized = true; } catch (...) 
{ @@ -165,42 +145,9 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( std::ignore = kernelAlternatives; std::ignore = command; try { - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->getProgramHandle(), - UR_RESULT_ERROR_INVALID_NULL_POINTER); - - UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - - ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(commandBuffer->device); - - std::scoped_lock lock( - commandBuffer->Mutex, hKernel->Mutex); - - ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; - uint32_t wg[3]{}; - UR_CALL(calculateKernelWorkDimensions(hZeKernel, commandBuffer->device, - zeThreadGroupDimensions, wg, workDim, - pGlobalWorkSize, pLocalWorkSize)); - - auto waitList = commandBuffer->getWaitListView(nullptr, 0); - - bool memoryMigrated = false; - auto memoryMigrate = [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (commandBuffer->zeCommandList.get(), dst, src, size, - nullptr, waitList.second, waitList.first)); - memoryMigrated = true; - }; - - UR_CALL(hKernel->prepareForSubmission( - commandBuffer->context, commandBuffer->device, pGlobalWorkOffset, - workDim, wg[0], wg[1], wg[2], memoryMigrate)); - - ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (commandBuffer->zeCommandList.get(), hZeKernel, - &zeThreadGroupDimensions, nullptr, waitList.second, - waitList.first)); + UR_CALL(commandBuffer->commandListManager.appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, + nullptr, nullptr)); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -213,9 +160,9 @@ ur_result_t urCommandBufferEnqueueExp( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { try { - return hQueue->enqueueCommandBuffer(hCommandBuffer->zeCommandList.get(), - phEvent, numEventsInWaitList, - phEventWaitList); + return hQueue->enqueueCommandBuffer( + hCommandBuffer->commandListManager.getZeCommandList(), phEvent, + numEventsInWaitList, phEventWaitList); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index 9a56f30b70..2d044c8c4c 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -21,6 +21,8 @@ #include "kernel.hpp" #include "queue_api.hpp" +#include "command_list_manager.hpp" + struct command_buffer_profiling_t { ur_exp_command_buffer_sync_point_t numEvents; ze_kernel_timestamp_result_t *timestamps; @@ -40,10 +42,7 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { uint32_t numWaitEvents); // UR context associated with this command-buffer - ur_context_handle_t context; - // Device associated with this command buffer - ur_device_handle_t device; - v2::raii::command_list_unique_handle zeCommandList; + ur_command_list_manager commandListManager; std::vector waitList; // Indicates if command-buffer commands can be updated after it is closed. @@ -52,9 +51,6 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { bool isFinalized = false; // Command-buffer profiling is enabled. bool isProfilingEnabled = false; - // This list is needed to release all kernels retained by the - // command_buffer. 
- std::vector kernelsList; }; struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp new file mode 100644 index 0000000000..4554c9a2e3 --- /dev/null +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -0,0 +1,167 @@ +//===--------- command_list_cache.hpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "command_list_manager.hpp" +#include "../helpers/kernel_helpers.hpp" + +#include "../ur_interface_loader.hpp" +#include "logger/ur_logger.hpp" + +#include "../common.hpp" +#include "../device.hpp" + +#include "context.hpp" +#include "event.hpp" +#include "event_pool_cache.hpp" +#include "queue_api.hpp" + +#include +#include + +#include +#include +#include + +#include "common.hpp" + +#include "context.hpp" +#include "kernel.hpp" +#include "queue_api.hpp" + +ur_command_list_manager::ur_command_list_manager( + ur_context_handle_t context, ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, v2::event_flags_t flags, + ur_queue_handle_t_ *queue) + : context(context), device(device), + eventPool(context->eventPoolCache.borrow(device->Id.value(), flags)), + zeCommandList( + std::forward(commandList)), + queue(queue) { + UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); + UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); +} + +ur_command_list_manager::~ur_command_list_manager() { + ur::level_zero::urContextRelease(context); + ur::level_zero::urDeviceRelease(device); +} + +std::pair +ur_command_list_manager::getWaitListView(const ur_event_handle_t *phWaitEvents, + uint32_t 
numWaitEvents) { + + waitList.resize(numWaitEvents); + for (uint32_t i = 0; i < numWaitEvents; i++) { + waitList[i] = phWaitEvents[i]->getZeEvent(); + } + + return {waitList.data(), static_cast(numWaitEvents)}; +} + +ur_event_handle_t +ur_command_list_manager::getSignalEvent(ur_event_handle_t *hUserEvent, + ur_command_t commandType) { + if (hUserEvent && queue) { + *hUserEvent = eventPool->allocate(); + (*hUserEvent)->resetQueueAndCommand(queue, commandType); + return *hUserEvent; + } else { + return nullptr; + } +} + +ur_result_t ur_command_list_manager::appendKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendKernelLaunch"); + + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(device); + + std::scoped_lock Lock(this->Mutex, + hKernel->Mutex); + + ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]{}; + UR_CALL(calculateKernelWorkDimensions(hZeKernel, device, + zeThreadGroupDimensions, WG, workDim, + pGlobalWorkSize, pLocalWorkSize)); + + auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); + + auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); + + bool memoryMigrated = false; + auto memoryMigrate = [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + }; + + UR_CALL(hKernel->prepareForSubmission(context, device, 
pGlobalWorkOffset, + workDim, WG[0], WG[1], WG[2], + memoryMigrate)); + + if (memoryMigrated) { + // If memory was migrated, we don't need to pass the wait list to + // the copy command again. + waitList.first = nullptr; + waitList.second = 0; + } + + TRACK_SCOPE_LATENCY( + "ur_command_list_manager::zeCommandListAppendLaunchKernel"); + auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions, + zeSignalEvent, waitList.second, waitList.first)); + + return UR_RESULT_SUCCESS; +} +ur_result_t ur_command_list_manager::enqueueGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand) { + + std::scoped_lock Lock(this->Mutex); + auto signalEvent = getSignalEvent(phEvent, callerCommand); + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; + + ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + (zeCommandList.get(), numCommandLists, phCommandLists, + zeSignalEvent, numWaitEvents, pWaitEvents)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::closeCommandList() { + // It is not allowed to append to command list from multiple threads. + std::scoped_lock guard(this->Mutex); + + // Close the command lists and have them ready for dispatch. 
+ ZE2UR_CALL(zeCommandListClose, (this->zeCommandList.get())); + return UR_RESULT_SUCCESS; +} + +ze_command_list_handle_t ur_command_list_manager::getZeCommandList() { + return zeCommandList.get(); +} diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp new file mode 100644 index 0000000000..7a79b416d0 --- /dev/null +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -0,0 +1,62 @@ +//===--------- command_list_cache.hpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include "command_list_cache.hpp" +#include "common.hpp" +#include "event.hpp" +#include "event_pool_cache.hpp" +#include "queue_api.hpp" +#include +#include +#include + +struct ur_command_list_manager : public _ur_object { + + ur_command_list_manager(ur_context_handle_t context, + ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, + v2::event_flags_t flags = v2::EVENT_FLAGS_COUNTER, + ur_queue_handle_t_ *queue = nullptr); + ~ur_command_list_manager(); + + ur_result_t appendKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t appendCommandListImmediate( + ze_command_list_handle_t commandList, ur_event_handle_t *phEvent, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList); + ur_result_t closeCommandList(); + ur_result_t enqueueGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + 
ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand); + ze_command_list_handle_t getZeCommandList(); + +private: + // UR context associated with this command-buffer + ur_context_handle_t context; + // Device associated with this command buffer + ur_device_handle_t device; + v2::raii::cache_borrowed_event_pool eventPool; + v2::raii::command_list_unique_handle zeCommandList; + ur_queue_handle_t_ *queue; + std::vector waitList; + + std::pair + getWaitListView(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents); + ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ur_command_t commandType); +}; diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp index 62476e0f9c..a61bb84c8f 100644 --- a/source/adapters/level_zero/v2/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -10,7 +10,8 @@ * */ -// This file was generated basing on scripts/templates/queue_api.cpp.mako +// Do not edit. This file is auto generated from a template: +// scripts/templates/queue_api.cpp.mako #include "queue_api.hpp" #include "ur_util.hpp" diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index 65ee7a3889..6ea340363d 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -10,7 +10,8 @@ * */ -// This file was generated basing on scripts/templates/queue_api.hpp.mako +// Do not edit. 
This file is auto generated from a template: +// scripts/templates/queue_api.hpp.mako #pragma once diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 5c308dc171..f33dbe183a 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -90,7 +90,16 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( : hContext(hContext), hDevice(hDevice), flags(pProps ? pProps->flags : 0), eventPool(hContext->eventPoolCache.borrow( hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), - handler(hContext, hDevice, pProps) {} + handler(hContext, hDevice, pProps), + listManager( + hContext, hDevice, + hContext->commandListCache.getImmediateCommandList( + hDevice->ZeDevice, true, getZeOrdinal(hDevice), + true /* always enable copy offload */, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + getZePriority(pProps ? pProps->flags : ur_queue_flags_t{}), + getZeIndex(pProps)), + eventFlagsFromQueueFlags(flags), this) {} ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, @@ -99,7 +108,16 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( eventPool(hContext->eventPoolCache.borrow( hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), handler(reinterpret_cast(hNativeHandle), - ownZeQueue) {} + ownZeQueue), + listManager(hContext, hDevice, + raii::command_list_unique_handle( + reinterpret_cast(hNativeHandle), + [ownZeQueue](ze_command_list_handle_t hZeCommandList) { + if (ownZeQueue) { + zeCommandListDestroy(hZeCommandList); + } + }), + eventFlagsFromQueueFlags(flags)) {} ur_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent(ur_event_handle_t *hUserEvent, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index ee22ad6db9..93023590c9 100644 --- 
a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -19,6 +19,8 @@ #include "ur/ur.hpp" +#include "command_list_manager.hpp" + namespace v2 { using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; @@ -36,16 +38,22 @@ struct ur_command_list_handler_t { struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { private: + // to remove after command_list_manager is complete ur_context_handle_t hContext; + // to remove after command_list_manager is complete ur_device_handle_t hDevice; ur_queue_flags_t flags; + // to remove after command_list_manager is complete raii::cache_borrowed_event_pool eventPool; + // to remove after command_list_manager is complete ur_command_list_handler_t handler; + // to remove after command_list_manager is complete std::vector waitList; + ur_command_list_manager listManager; std::vector deferredEvents; std::pair From 1de57ef5aa51c6dcb8baba5e918f5ac3c969bee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Mon, 20 Jan 2025 15:46:05 +0000 Subject: [PATCH 14/46] Use list manager instead of custom implementation in queue --- .../v2/queue_immediate_in_order.cpp | 96 ++++--------------- .../v2/queue_immediate_in_order.hpp | 2 +- 2 files changed, 18 insertions(+), 80 deletions(-) diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index f33dbe183a..b1aed78450 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -91,7 +91,7 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( eventPool(hContext->eventPoolCache.borrow( hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), handler(hContext, hDevice, pProps), - listManager( + commandListManager( hContext, hDevice, hContext->commandListCache.getImmediateCommandList( hDevice->ZeDevice, true, 
getZeOrdinal(hDevice), @@ -109,15 +109,16 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), handler(reinterpret_cast(hNativeHandle), ownZeQueue), - listManager(hContext, hDevice, - raii::command_list_unique_handle( - reinterpret_cast(hNativeHandle), - [ownZeQueue](ze_command_list_handle_t hZeCommandList) { - if (ownZeQueue) { - zeCommandListDestroy(hZeCommandList); - } - }), - eventFlagsFromQueueFlags(flags)) {} + commandListManager( + hContext, hDevice, + raii::command_list_unique_handle( + reinterpret_cast(hNativeHandle), + [ownZeQueue](ze_command_list_handle_t hZeCommandList) { + if (ownZeQueue) { + zeCommandListDestroy(hZeCommandList); + } + }), + eventFlagsFromQueueFlags(flags)) {} ur_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent(ur_event_handle_t *hUserEvent, @@ -222,52 +223,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueKernelLaunch"); - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); - - UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - - ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice); - - std::scoped_lock Lock(this->Mutex, - hKernel->Mutex); - - ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; - uint32_t WG[3]{}; - UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice, - zeThreadGroupDimensions, WG, workDim, - pGlobalWorkSize, pLocalWorkSize)); - - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); - - auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); - - bool memoryMigrated = false; - auto memoryMigrate = [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, 
- (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); - memoryMigrated = true; - }; - - UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset, - workDim, WG[0], WG[1], WG[2], - memoryMigrate)); - - if (memoryMigrated) { - // If memory was migrated, we don't need to pass the wait list to - // the copy command again. - waitList.first = nullptr; - waitList.second = 0; - } - - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::zeCommandListAppendLaunchKernel"); - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; - ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions, - zeSignalEvent, waitList.second, waitList.first)); + UR_CALL(commandListManager.appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent)); return UR_RESULT_SUCCESS; } @@ -398,26 +356,6 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( - uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand) { - - std::scoped_lock Lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, callerCommand); - - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); - - auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; - - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, - (handler.commandList.get(), numCommandLists, phCommandLists, - zeSignalEvent, numWaitEvents, pWaitEvents)); - - return UR_RESULT_SUCCESS; -} - ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferRead( ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, @@ -1143,9 +1081,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBuffer( ze_command_list_handle_t commandBufferCommandList, ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList) { - return enqueueGenericCommandListsExp(1, &commandBufferCommandList, phEvent, - numEventsInWaitList, phEventWaitList, - UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); + return commandListManager.enqueueGenericCommandListsExp( + 1, &commandBufferCommandList, phEvent, numEventsInWaitList, + phEventWaitList, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); } ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 93023590c9..199ea93a8c 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -53,7 +53,7 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { // to remove after command_list_manager is complete std::vector waitList; - ur_command_list_manager listManager; + ur_command_list_manager commandListManager; std::vector deferredEvents; std::pair From de2f273bf77a82102d2f2bb0816b191cef483558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Tue, 21 Jan 2025 09:28:00 +0000 Subject: [PATCH 15/46] Optimalize imports --- .../adapters/level_zero/v2/command_buffer.hpp | 11 ++-------- 
.../level_zero/v2/command_list_manager.cpp | 21 ------------------- .../level_zero/v2/command_list_manager.hpp | 3 --- 3 files changed, 2 insertions(+), 33 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index 2d044c8c4c..3c0d2492f2 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -9,19 +9,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include -#include -#include -#include -#include - +#include "command_list_manager.hpp" #include "common.hpp" - #include "context.hpp" #include "kernel.hpp" #include "queue_api.hpp" - -#include "command_list_manager.hpp" +#include struct command_buffer_profiling_t { ur_exp_command_buffer_sync_point_t numEvents; diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index 4554c9a2e3..a401f6ecd5 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -10,30 +10,9 @@ #include "command_list_manager.hpp" #include "../helpers/kernel_helpers.hpp" - #include "../ur_interface_loader.hpp" -#include "logger/ur_logger.hpp" - -#include "../common.hpp" -#include "../device.hpp" - -#include "context.hpp" -#include "event.hpp" -#include "event_pool_cache.hpp" -#include "queue_api.hpp" - -#include -#include - -#include -#include -#include - -#include "common.hpp" - #include "context.hpp" #include "kernel.hpp" -#include "queue_api.hpp" ur_command_list_manager::ur_command_list_manager( ur_context_handle_t context, ur_device_handle_t device, diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index 7a79b416d0..ceb5928f84 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ 
b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -11,12 +11,9 @@ #include "command_list_cache.hpp" #include "common.hpp" -#include "event.hpp" #include "event_pool_cache.hpp" #include "queue_api.hpp" -#include #include -#include struct ur_command_list_manager : public _ur_object { From d979f6a22cdf5eb1c03f71abf2249df15fd6a539 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Tue, 21 Jan 2025 11:16:34 +0000 Subject: [PATCH 16/46] Remove not needed destructor --- source/adapters/level_zero/v2/command_buffer.cpp | 1 - source/adapters/level_zero/v2/command_buffer.hpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 90d38cfffc..0b876c4dfd 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -52,7 +52,6 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( std::forward(commandList)), isUpdatable(desc ? 
desc->isUpdatable : false) {} -ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {} namespace ur::level_zero { ur_result_t diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index 3c0d2492f2..a728b85676 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -26,7 +26,7 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, const ur_exp_command_buffer_desc_t *desc); - ~ur_exp_command_buffer_handle_t_(); + ~ur_exp_command_buffer_handle_t_() = default; ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType); From b1b0c60c42466e1e76666b1c9475d56d952e3f59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Mon, 20 Jan 2025 15:32:58 +0000 Subject: [PATCH 17/46] Add barriers to the SignalCommandList that guarantee that resetting the WaitEvent is done at the right time. This fixes a potential race condition where, if the SignalCommandList executes before the ComputeCommandList, the WaitEvent could be reset before the ComputeCommandList can wait on it and, consequently, create a deadlock. --- source/adapters/level_zero/command_buffer.cpp | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 058f92f8ca..902da42d2c 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -1535,15 +1535,17 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer, * @param CommandList The command-list to append the QueryKernelTimestamps * command to. * @param SignalEvent The event that must be signaled after the profiling is - * finished. 
This event will contain the profiling information. + * finished. * @param WaitEvent The event that must be waited on before starting the * profiling. + * @param ProfilingEvent The event that will contain the profiling data. * @return UR_RESULT_SUCCESS or an error code on failure. */ ur_result_t appendProfilingQueries(ur_exp_command_buffer_handle_t CommandBuffer, ze_command_list_handle_t CommandList, ur_event_handle_t SignalEvent, - ur_event_handle_t WaitEvent) { + ur_event_handle_t WaitEvent, + ur_event_handle_t ProfilingEvent) { // Multiple submissions of a command buffer implies that we need to save // the event timestamps before resubmiting the command buffer. We // therefore copy these timestamps in a dedicated USM memory section @@ -1556,12 +1558,17 @@ ur_result_t appendProfilingQueries(ur_exp_command_buffer_handle_t CommandBuffer, Profiling->Timestamps = new ze_kernel_timestamp_result_t[Profiling->NumEvents]; + uint32_t NumWaitEvents = WaitEvent ? 1 : 0; + ze_event_handle_t *ZeWaitEventList = + WaitEvent ? &(WaitEvent->ZeEvent) : nullptr; + ze_event_handle_t ZeSignalEvent = + SignalEvent ? 
SignalEvent->ZeEvent : nullptr; ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps, (CommandList, CommandBuffer->ZeEventsList.size(), CommandBuffer->ZeEventsList.data(), (void *)Profiling->Timestamps, - 0, SignalEvent->ZeEvent, 1, &(WaitEvent->ZeEvent))); + 0, ZeSignalEvent, NumWaitEvents, ZeWaitEventList)); - SignalEvent->CommandData = static_cast(Profiling); + ProfilingEvent->CommandData = static_cast(Profiling); return UR_RESULT_SUCCESS; } @@ -1615,8 +1622,8 @@ ur_result_t enqueueImmediateAppendPath( if (DoProfiling) { UR_CALL(appendProfilingQueries(CommandBuffer, CommandListHelper->first, - *Event, - CommandBuffer->ComputeFinishedEvent)); + *Event, CommandBuffer->ComputeFinishedEvent, + *Event)); } // When the current execution is finished, signal ExecutionFinishedEvent to @@ -1694,10 +1701,15 @@ ur_result_t enqueueWaitEventPath(ur_exp_command_buffer_handle_t CommandBuffer, (ZeCopyCommandQueue, 1, &CommandBuffer->ZeCopyCommandList, nullptr)); } + ZE2UR_CALL(zeCommandListAppendBarrier, + (SignalCommandList->first, nullptr, 1, + &(CommandBuffer->ExecutionFinishedEvent->ZeEvent))); + // Reset the wait-event for the UR command-buffer that is signaled when its // submission dependencies have been satisfied. ZE2UR_CALL(zeCommandListAppendEventReset, (SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); + // Reset the all-reset-event for the UR command-buffer that is signaled when // all events of the main command-list have been reset. 
ZE2UR_CALL(zeCommandListAppendEventReset, @@ -1705,14 +1717,12 @@ ur_result_t enqueueWaitEventPath(ur_exp_command_buffer_handle_t CommandBuffer, if (DoProfiling) { UR_CALL(appendProfilingQueries(CommandBuffer, SignalCommandList->first, - *Event, - CommandBuffer->ExecutionFinishedEvent)); - } else { - ZE2UR_CALL(zeCommandListAppendBarrier, - (SignalCommandList->first, (*Event)->ZeEvent, 1, - &(CommandBuffer->ExecutionFinishedEvent->ZeEvent))); + nullptr, nullptr, *Event)); } + ZE2UR_CALL(zeCommandListAppendBarrier, + (SignalCommandList->first, (*Event)->ZeEvent, 0, nullptr)); + UR_CALL(Queue->executeCommandList(SignalCommandList, false /*IsBlocking*/, false /*OKToBatchCommand*/)); From 545e57732480487e7f36d87632b9c56307f39f49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Tue, 21 Jan 2025 11:42:20 +0000 Subject: [PATCH 18/46] Fix formatting --- source/loader/layers/sanitizer/asan/asan_report.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_report.hpp b/source/loader/layers/sanitizer/asan/asan_report.hpp index c92ef997af..c5356cb2cf 100644 --- a/source/loader/layers/sanitizer/asan/asan_report.hpp +++ b/source/loader/layers/sanitizer/asan/asan_report.hpp @@ -2,9 +2,9 @@ * * Copyright (C) 2024 Intel Corporation * - * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. - * See LICENSE.TXT - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM + * Exceptions. See LICENSE.TXT SPDX-License-Identifier: Apache-2.0 WITH + * LLVM-exception * * @file asan_report.hpp * From ea643b36243cb5e64137626858a26ee4e2caf0ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Tue, 21 Jan 2025 12:20:51 +0000 Subject: [PATCH 19/46] Revert "Fix formatting" This reverts commit 545e57732480487e7f36d87632b9c56307f39f49. 
--- source/loader/layers/sanitizer/asan/asan_report.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_report.hpp b/source/loader/layers/sanitizer/asan/asan_report.hpp index c5356cb2cf..c92ef997af 100644 --- a/source/loader/layers/sanitizer/asan/asan_report.hpp +++ b/source/loader/layers/sanitizer/asan/asan_report.hpp @@ -2,9 +2,9 @@ * * Copyright (C) 2024 Intel Corporation * - * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM - * Exceptions. See LICENSE.TXT SPDX-License-Identifier: Apache-2.0 WITH - * LLVM-exception + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file asan_report.hpp * From 8b7b269fb9cec8586ddd6fcbe6b7c2bd1fc4017b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Tue, 21 Jan 2025 12:33:43 +0000 Subject: [PATCH 20/46] Move command list close to the command buffer --- .../adapters/level_zero/v2/command_buffer.cpp | 24 +++++++++---------- .../adapters/level_zero/v2/command_buffer.hpp | 8 +++---- .../level_zero/v2/command_list_manager.cpp | 9 ------- .../level_zero/v2/command_list_manager.hpp | 1 - 4 files changed, 14 insertions(+), 28 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 0b876c4dfd..9c3765064e 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -31,18 +31,6 @@ void checkImmediateAppendSupport(ur_context_handle_t context) { } // namespace -std::pair -ur_exp_command_buffer_handle_t_::getWaitListView( - const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents) { - - waitList.resize(numWaitEvents); - for (uint32_t i = 0; i < numWaitEvents; i++) { - waitList[i] = phWaitEvents[i]->getZeEvent(); - } - - return {waitList.data(), 
static_cast(numWaitEvents)}; -} - ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, @@ -52,6 +40,16 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( std::forward(commandList)), isUpdatable(desc ? desc->isUpdatable : false) {} + +ur_result_t ur_exp_command_buffer_handle_t_::closeCommandList() { + // It is not allowed to append to command list from multiple threads. + std::scoped_lock guard(this->Mutex); + + // Close the command lists and have them ready for dispatch. + ZE2UR_CALL(zeCommandListClose, (this->commandListManager.getZeCommandList())); + return UR_RESULT_SUCCESS; +} + namespace ur::level_zero { ur_result_t @@ -108,7 +106,7 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { try { UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(!hCommandBuffer->isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); - hCommandBuffer->commandListManager.closeCommandList(); + hCommandBuffer->closeCommandList(); hCommandBuffer->isFinalized = true; } catch (...) { diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index a728b85676..a685be0279 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -30,14 +30,12 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType); - std::pair - getWaitListView(const ur_event_handle_t *phWaitEvents, - uint32_t numWaitEvents); - - // UR context associated with this command-buffer ur_command_list_manager commandListManager; + ur_result_t closeCommandList(); + std::vector waitList; + // Indicates if command-buffer commands can be updated after it is closed. bool isUpdatable = false; // Indicates if command buffer was finalized. 
diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index a401f6ecd5..5b3cf5ccaa 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -132,15 +132,6 @@ ur_result_t ur_command_list_manager::enqueueGenericCommandListsExp( return UR_RESULT_SUCCESS; } -ur_result_t ur_command_list_manager::closeCommandList() { - // It is not allowed to append to command list from multiple threads. - std::scoped_lock guard(this->Mutex); - - // Close the command lists and have them ready for dispatch. - ZE2UR_CALL(zeCommandListClose, (this->zeCommandList.get())); - return UR_RESULT_SUCCESS; -} - ze_command_list_handle_t ur_command_list_manager::getZeCommandList() { return zeCommandList.get(); } diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index ceb5928f84..c56b076d42 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -34,7 +34,6 @@ struct ur_command_list_manager : public _ur_object { ur_result_t appendCommandListImmediate( ze_command_list_handle_t commandList, ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList); - ur_result_t closeCommandList(); ur_result_t enqueueGenericCommandListsExp( uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, From 95f978c3656aece853b85755db5f152db47c971e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Tue, 21 Jan 2025 13:16:20 +0000 Subject: [PATCH 21/46] Moved try outside function block --- .../adapters/level_zero/v2/command_buffer.cpp | 157 ++++++++---------- .../adapters/level_zero/v2/command_buffer.hpp | 2 +- 2 files changed, 73 insertions(+), 86 deletions(-) diff --git 
a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 9c3765064e..46c8c6ae27 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -40,7 +40,6 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( std::forward(commandList)), isUpdatable(desc ? desc->isUpdatable : false) {} - ur_result_t ur_exp_command_buffer_handle_t_::closeCommandList() { // It is not allowed to append to command list from multiple threads. std::scoped_lock guard(this->Mutex); @@ -55,64 +54,57 @@ namespace ur::level_zero { ur_result_t urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, const ur_exp_command_buffer_desc_t *commandBufferDesc, - ur_exp_command_buffer_handle_t *commandBuffer) { - try { - checkImmediateAppendSupport(context); - - if (!context->getPlatform()->ZeMutableCmdListExt.Supported) { - throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - - using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; - uint32_t queueGroupOrdinal = - device->QueueGroup[queue_group_type::Compute].ZeOrdinal; - v2::raii::command_list_unique_handle zeCommandList = - context->commandListCache.getRegularCommandList( - device->ZeDevice, true, queueGroupOrdinal, true); - - *commandBuffer = new ur_exp_command_buffer_handle_t_( - context, device, std::move(zeCommandList), commandBufferDesc); - - } catch (const std::bad_alloc &) { - return exceptionToResult(std::current_exception()); + ur_exp_command_buffer_handle_t *commandBuffer) try { + checkImmediateAppendSupport(context); + + if (!context->getPlatform()->ZeMutableCmdListExt.Supported) { + throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + + using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; + uint32_t queueGroupOrdinal = + device->QueueGroup[queue_group_type::Compute].ZeOrdinal; + v2::raii::command_list_unique_handle zeCommandList = + 
context->commandListCache.getRegularCommandList(device->ZeDevice, true, + queueGroupOrdinal, true); + + *commandBuffer = new ur_exp_command_buffer_handle_t_( + context, device, std::move(zeCommandList), commandBufferDesc); return UR_RESULT_SUCCESS; + +} catch (const std::bad_alloc &) { + return exceptionToResult(std::current_exception()); } + ur_result_t -urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - try { - hCommandBuffer->RefCount.increment(); - } catch (const std::bad_alloc &) { - return exceptionToResult(std::current_exception()); - } +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { + hCommandBuffer->RefCount.increment(); return UR_RESULT_SUCCESS; +} catch (const std::bad_alloc &) { + return exceptionToResult(std::current_exception()); } ur_result_t -urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - try { - if (!hCommandBuffer->RefCount.decrementAndTest()) - return UR_RESULT_SUCCESS; - - delete hCommandBuffer; - } catch (...) { - return exceptionToResult(std::current_exception()); - } +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { + if (!hCommandBuffer->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + delete hCommandBuffer; return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); } ur_result_t -urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - try { - UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(!hCommandBuffer->isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); - hCommandBuffer->closeCommandList(); - - hCommandBuffer->isFinalized = true; - } catch (...) 
{ - return exceptionToResult(std::current_exception()); - } +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { + UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(!hCommandBuffer->isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); + hCommandBuffer->closeCommandList(); + + hCommandBuffer->isFinalized = true; return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); } ur_result_t urCommandBufferAppendKernelLaunchExp( @@ -124,7 +116,9 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( const ur_exp_command_buffer_sync_point_t *syncPointWaitList, uint32_t numEventsInWaitList, const ur_event_handle_t *eventWaitList, ur_exp_command_buffer_sync_point_t *retSyncPoint, ur_event_handle_t *event, - ur_exp_command_buffer_command_handle_t *command) { + ur_exp_command_buffer_command_handle_t *command) + + try { // Need to know semantics // - should they be checked before kernel execution or before kernel // appending to list if latter then it is easy fix, if former then TODO @@ -141,58 +135,51 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( std::ignore = numKernelAlternatives; std::ignore = kernelAlternatives; std::ignore = command; - try { - UR_CALL(commandBuffer->commandListManager.appendKernelLaunch( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, - nullptr, nullptr)); - } catch (...) { - return exceptionToResult(std::current_exception()); - } - + UR_CALL(commandBuffer->commandListManager.appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, + nullptr, nullptr)); return UR_RESULT_SUCCESS; +} catch (...) 
{ + return exceptionToResult(std::current_exception()); } ur_result_t urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - try { - return hQueue->enqueueCommandBuffer( - hCommandBuffer->commandListManager.getZeCommandList(), phEvent, - numEventsInWaitList, phEventWaitList); - } catch (...) { - return exceptionToResult(std::current_exception()); - } + ur_event_handle_t *phEvent) try { + return hQueue->enqueueCommandBuffer( + hCommandBuffer->commandListManager.getZeCommandList(), phEvent, + numEventsInWaitList, phEventWaitList); +} catch (...) { + return exceptionToResult(std::current_exception()); } ur_result_t urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, size_t propSize, void *pPropValue, - size_t *pPropSizeRet) { - try { - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - - switch (propName) { - case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: - return ReturnValue(uint32_t{hCommandBuffer->RefCount.load()}); - case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { - ur_exp_command_buffer_desc_t Descriptor{}; - Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; - Descriptor.pNext = nullptr; - Descriptor.isUpdatable = hCommandBuffer->isUpdatable; - Descriptor.isInOrder = true; - Descriptor.enableProfiling = hCommandBuffer->isProfilingEnabled; - - return ReturnValue(Descriptor); - } - default: - assert(!"Command-buffer info request not implemented"); - } - } catch (...) 
{ - return exceptionToResult(std::current_exception()); + size_t *pPropSizeRet) try { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{hCommandBuffer->RefCount.load()}); + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { + ur_exp_command_buffer_desc_t Descriptor{}; + Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + Descriptor.pNext = nullptr; + Descriptor.isUpdatable = hCommandBuffer->isUpdatable; + Descriptor.isInOrder = true; + Descriptor.enableProfiling = hCommandBuffer->isProfilingEnabled; + + return ReturnValue(Descriptor); + } + default: + assert(!"Command-buffer info request not implemented"); } return UR_RESULT_ERROR_INVALID_ENUMERATION; +} catch (...) { + return exceptionToResult(std::current_exception()); } } // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index a685be0279..50a3d729fd 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -35,7 +35,7 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_result_t closeCommandList(); std::vector waitList; - + // Indicates if command-buffer commands can be updated after it is closed. bool isUpdatable = false; // Indicates if command buffer was finalized. 
From 30f2f91e1fb6538e0ebb2542e9cec85b097e7133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Tue, 21 Jan 2025 13:21:27 +0000 Subject: [PATCH 22/46] Move enqueue generic command list back to queue --- .../level_zero/v2/command_list_manager.cpp | 19 ---------------- .../level_zero/v2/command_list_manager.hpp | 4 ---- .../v2/queue_immediate_in_order.cpp | 22 ++++++++++++++++++- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index 5b3cf5ccaa..fb50726053 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -112,25 +112,6 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( return UR_RESULT_SUCCESS; } -ur_result_t ur_command_list_manager::enqueueGenericCommandListsExp( - uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand) { - - std::scoped_lock Lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, callerCommand); - - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); - - auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; - - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, - (zeCommandList.get(), numCommandLists, phCommandLists, - zeSignalEvent, numWaitEvents, pWaitEvents)); - - return UR_RESULT_SUCCESS; -} ze_command_list_handle_t ur_command_list_manager::getZeCommandList() { return zeCommandList.get(); diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index c56b076d42..95cfa89250 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -34,10 +34,6 @@ struct ur_command_list_manager : public _ur_object { ur_result_t appendCommandListImmediate( ze_command_list_handle_t commandList, ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList); - ur_result_t enqueueGenericCommandListsExp( - uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand); ze_command_list_handle_t getZeCommandList(); private: diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 9289957b9b..b4a4513fe3 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -1093,11 +1093,31 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( return UR_RESULT_SUCCESS; } +ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand) { + + std::scoped_lock Lock(this->Mutex); + auto signalEvent = getSignalEvent(phEvent, callerCommand); + + auto [pWaitEvents, 
numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; + + ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + (commandListManager.getZeCommandList(), numCommandLists, phCommandLists, + zeSignalEvent, numWaitEvents, pWaitEvents)); + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBuffer( ze_command_list_handle_t commandBufferCommandList, ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList) { - return commandListManager.enqueueGenericCommandListsExp( + return enqueueGenericCommandListsExp( 1, &commandBufferCommandList, phEvent, numEventsInWaitList, phEventWaitList, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); } From c00d960e75081ac83e91c44ed97daa8ac2b253ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Tue, 21 Jan 2025 14:19:59 +0000 Subject: [PATCH 23/46] Share events and lists between queue and command list manager --- .../level_zero/v2/command_list_manager.hpp | 12 +- .../v2/queue_immediate_in_order.cpp | 180 ++++++++---------- .../v2/queue_immediate_in_order.hpp | 22 --- 3 files changed, 83 insertions(+), 131 deletions(-) diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index 95cfa89250..52b4cbbe55 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -36,6 +36,12 @@ struct ur_command_list_manager : public _ur_object { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList); ze_command_list_handle_t getZeCommandList(); + std::pair + getWaitListView(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents); + ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ur_command_t commandType); + private: // UR context associated with this command-buffer ur_context_handle_t 
context; @@ -45,10 +51,4 @@ struct ur_command_list_manager : public _ur_object { v2::raii::command_list_unique_handle zeCommandList; ur_queue_handle_t_ *queue; std::vector waitList; - - std::pair - getWaitListView(const ur_event_handle_t *phWaitEvents, - uint32_t numWaitEvents); - ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, - ur_command_t commandType); }; diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index b4a4513fe3..861a47fa94 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -24,13 +24,7 @@ namespace v2 { std::pair ur_queue_immediate_in_order_t::getWaitListView( const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents) { - - waitList.resize(numWaitEvents); - for (uint32_t i = 0; i < numWaitEvents; i++) { - waitList[i] = phWaitEvents[i]->getZeEvent(); - } - - return {waitList.data(), static_cast(numWaitEvents)}; + return commandListManager.getWaitListView(phWaitEvents, numWaitEvents); } static int32_t getZeOrdinal(ur_device_handle_t hDevice) { @@ -58,25 +52,6 @@ static ze_command_queue_priority_t getZePriority(ur_queue_flags_t flags) { return ZE_COMMAND_QUEUE_PRIORITY_NORMAL; } -ur_command_list_handler_t::ur_command_list_handler_t( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps) - : commandList(hContext->commandListCache.getImmediateCommandList( - hDevice->ZeDevice, true, getZeOrdinal(hDevice), - true /* always enable copy offload */, - ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - getZePriority(pProps ? 
pProps->flags : ur_queue_flags_t{}), - getZeIndex(pProps))) {} - -ur_command_list_handler_t::ur_command_list_handler_t( - ze_command_list_handle_t hZeCommandList, bool ownZeHandle) - : commandList(hZeCommandList, - [ownZeHandle](ze_command_list_handle_t hZeCommandList) { - if (ownZeHandle) { - ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList)); - } - }) {} - static event_flags_t eventFlagsFromQueueFlags(ur_queue_flags_t flags) { event_flags_t eventFlags = EVENT_FLAGS_COUNTER; if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) @@ -88,9 +63,6 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps) : hContext(hContext), hDevice(hDevice), flags(pProps ? pProps->flags : 0), - eventPool(hContext->eventPoolCache.borrow( - hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), - handler(hContext, hDevice, pProps), commandListManager( hContext, hDevice, hContext->commandListCache.getImmediateCommandList( @@ -105,10 +77,6 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_native_handle_t hNativeHandle, ur_queue_flags_t flags, bool ownZeQueue) : hContext(hContext), hDevice(hDevice), flags(flags), - eventPool(hContext->eventPoolCache.borrow( - hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), - handler(reinterpret_cast(hNativeHandle), - ownZeQueue), commandListManager( hContext, hDevice, raii::command_list_unique_handle( @@ -123,13 +91,7 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType) { - if (hUserEvent) { - *hUserEvent = eventPool->allocate(); - (*hUserEvent)->resetQueueAndCommand(this, commandType); - return *hUserEvent; - } else { - return nullptr; - } + return commandListManager.getSignalEvent(hUserEvent, commandType); } ur_result_t @@ -187,8 +149,8 @@ void 
ur_queue_immediate_in_order_t::deferEventFree(ur_event_handle_t hEvent) { ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle( ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) { std::ignore = pDesc; - *phNativeQueue = - reinterpret_cast(this->handler.commandList.get()); + *phNativeQueue = reinterpret_cast( + this->commandListManager.getZeCommandList()); return UR_RESULT_SUCCESS; } @@ -201,7 +163,7 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() { TRACK_SCOPE_LATENCY( "ur_queue_immediate_in_order_t::zeCommandListHostSynchronize"); ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); // Free deferred events for (auto &hEvent : deferredEvents) { @@ -261,13 +223,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( getWaitListView(phEventWaitList, numEventsInWaitList); if (numWaitEvents > 0) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), numWaitEvents, pWaitEvents)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } if (signalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + ZE2UR_CALL( + zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), signalEvent->getZeEvent())); } return UR_RESULT_SUCCESS; @@ -292,7 +256,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierImpl( getWaitListView(phEventWaitList, numEventsInWaitList); ZE2UR_CALL(zeCommandListAppendBarrier, - (handler.commandList.get(), signalEvent->getZeEvent(), + (commandListManager.getZeCommandList(), signalEvent->getZeEvent(), numWaitEvents, pWaitEvents)); return UR_RESULT_SUCCESS; @@ -336,8 +300,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, srcOffset, size, [&](void *src, 
void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -345,8 +309,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::write_only, dstOffset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -359,12 +323,12 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), pDst, pSrc, size, zeSignalEvent, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), pDst, pSrc, size, + zeSignalEvent, waitList.second, waitList.first)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -425,16 +389,16 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, 0, src->getSize(), [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); auto pDst = ur_cast(dst->getDevicePtr( hDevice, ur_mem_handle_t_::device_access_mode_t::write_only, 0, dst->getSize(), [&](void 
*src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -447,14 +411,14 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, - (handler.commandList.get(), pDst, &zeParams.dstRegion, + (commandListManager.getZeCommandList(), pDst, &zeParams.dstRegion, zeParams.dstPitch, zeParams.dstSlicePitch, pSrc, &zeParams.srcRegion, zeParams.srcPitch, zeParams.srcSlicePitch, zeSignalEvent, waitList.second, waitList.first)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -626,8 +590,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( auto pDst = ur_cast(hBuffer->mapHostPtr( mapFlags, offset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); *ppRetMap = pDst; @@ -635,16 +599,18 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( if (!memoryMigrated && waitList.second) { // If memory was not migrated, we need to wait on the events here. 
ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), waitList.second, + waitList.first)); if (signalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + ZE2UR_CALL( + zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), signalEvent->getZeEvent())); } } if (blockingMap) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -664,20 +630,22 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( // TODO: currently unmapHostPtr deallocates memory immediately, // since the memory might be used by the user, we need to make sure // all dependencies are completed. - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), waitList.second, waitList.first)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), waitList.second, waitList.first)); bool memoryMigrated = false; hMem->unmapHostPtr(pMappedPtr, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, size, + nullptr, waitList.second, waitList.first)); memoryMigrated = true; }); if (signalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + ZE2UR_CALL( + zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), signalEvent->getZeEvent())); } return UR_RESULT_SUCCESS; @@ -698,8 +666,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, offset, size, [&](void *src, void *dst, size_t size) { 
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -716,8 +684,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryFill, - (handler.commandList.get(), pDst, pPattern, patternSize, size, - zeSignalEvent, waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), pDst, pPattern, + patternSize, size, zeSignalEvent, waitList.second, + waitList.first)); return UR_RESULT_SUCCESS; } @@ -752,12 +721,12 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), pDst, pSrc, size, zeSignalEvent, - numWaitEvents, pWaitEvents)); + (commandListManager.getZeCommandList(), pDst, pSrc, size, + zeSignalEvent, numWaitEvents, pWaitEvents)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -779,16 +748,18 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( getWaitListView(phEventWaitList, numEventsInWaitList); if (pWaitEvents) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), numWaitEvents, pWaitEvents)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } // TODO: figure out how to translate "flags" ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (handler.commandList.get(), pMem, size)); + (commandListManager.getZeCommandList(), pMem, size)); if 
(signalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + ZE2UR_CALL( + zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), signalEvent->getZeEvent())); } return UR_RESULT_SUCCESS; @@ -811,18 +782,20 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); if (pWaitEvents) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), numWaitEvents, pWaitEvents)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } // TODO: figure out how to translate "flags" ZE2UR_CALL(zeCommandListAppendMemAdvise, - (handler.commandList.get(), this->hDevice->ZeDevice, pMem, size, - zeAdvice)); + (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, + pMem, size, zeAdvice)); if (signalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + ZE2UR_CALL( + zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), signalEvent->getZeEvent())); } return UR_RESULT_SUCCESS; @@ -1030,8 +1003,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( bool memoryMigrated = false; auto memoryMigrate = [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, size, + nullptr, waitList.second, waitList.first)); memoryMigrated = true; }; @@ -1050,8 +1023,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( "zeCommandListAppendLaunchCooperativeKernel"); auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, - (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions, - zeSignalEvent, waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), hZeKernel, + &zeThreadGroupDimensions, zeSignalEvent, waitList.second, + waitList.first)); recordSubmittedKernel(hKernel); @@ -1082,12 +1056,12 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( signalEvent->getEventEndTimestampAndHandle(); ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, - (handler.commandList.get(), timestampPtr, zeSignalEvent, - numWaitEvents, pWaitEvents)); + (commandListManager.getZeCommandList(), timestampPtr, + zeSignalEvent, numWaitEvents, pWaitEvents)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -1107,8 +1081,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, - (commandListManager.getZeCommandList(), numCommandLists, phCommandLists, - zeSignalEvent, numWaitEvents, pWaitEvents)); + (commandListManager.getZeCommandList(), numCommandLists, + phCommandLists, zeSignalEvent, numWaitEvents, pWaitEvents)); return UR_RESULT_SUCCESS; } @@ -1117,9 +1091,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBuffer( ze_command_list_handle_t commandBufferCommandList, ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList) { - return enqueueGenericCommandListsExp( - 1, &commandBufferCommandList, phEvent, numEventsInWaitList, - phEventWaitList, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); + return enqueueGenericCommandListsExp(1, &commandBufferCommandList, phEvent, + numEventsInWaitList, phEventWaitList, + UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); } ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 6cb2f75999..de3b95c748 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -25,34 +25,12 @@ namespace v2 { using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; -struct ur_command_list_handler_t { - ur_command_list_handler_t(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps); - - ur_command_list_handler_t(ze_command_list_handle_t hZeCommandList, - bool ownZeHandle); - - raii::command_list_unique_handle commandList; -}; - struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { private: - // to remove after command_list_manager is complete ur_context_handle_t hContext; - // to remove after command_list_manager is complete 
ur_device_handle_t hDevice; ur_queue_flags_t flags; - // to remove after command_list_manager is complete - raii::cache_borrowed_event_pool eventPool; - - // to remove after command_list_manager is complete - ur_command_list_handler_t handler; - - // to remove after command_list_manager is complete - std::vector waitList; - ur_command_list_manager commandListManager; std::vector deferredEvents; std::vector submittedKernels; From eb6487d5b27db993e6567ebe64de48c6d3f90c5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Tue, 14 Jan 2025 13:24:55 +0000 Subject: [PATCH 24/46] [CUDA][HIP] Fix kernel arguments being overridden when added out of order In the Cuda and Hip adapters, when kernel arguments are added out of order (e.g. argument at index 1 is added before argument at index 0), the existing arguments are currently being overwritten. This happens because some of the argument sizes might not be known when adding them out of order and the code relies on those sizes to choose where to store the argument. This commit avoids this issue by storing the arguments in the same order that they are added and accessing them using pointer offsets. --- source/adapters/cuda/kernel.hpp | 31 ++- source/adapters/hip/kernel.hpp | 65 +++--- .../update/local_memory_update.cpp | 220 ++++++++++++++++++ .../kernel/urKernelSetArgLocal.cpp | 99 ++++++++ 4 files changed, 377 insertions(+), 38 deletions(-) diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index d1b3b61244..a6194e9a57 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -68,6 +68,8 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; /// Byte offset into /p Storage allocation for each parameter. args_index_t Indices; + /// Position in the Storage array where the next argument should added. + size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has /// been added. 
Zero if the argument at the index isn't a local memory /// argument. @@ -101,6 +103,7 @@ struct ur_kernel_handle_t_ { /// Implicit offset argument is kept at the back of the indices collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { + // Expand storage to accommodate this Index if needed. if (Index + 2 > Indices.size()) { // Move implicit offset argument index with the end Indices.resize(Index + 2, Indices.back()); @@ -109,14 +112,21 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - AlignedLocalMemSize[Index] = LocalSize; + + // Copy new argument to storage if it hasn't been added before. + if (ParamSizes[Index] == 0) { + ParamSizes[Index] = Size; + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + AlignedLocalMemSize[Index] = LocalSize; + InsertPos += Size; + } + // Otherwise, update the existing argument. + else { + std::memcpy(Indices[Index], Arg, Size); + AlignedLocalMemSize[Index] = LocalSize; + assert(Size == ParamSizes[Index]); + } } /// Returns the padded size and offset of a local memory argument. 
@@ -177,10 +187,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - const size_t InsertPos = - std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + SuccIndex, size_t{0}); - std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index c6d30e81ad..61dd89cc99 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -63,6 +63,8 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; /// Byte offset into /p Storage allocation for each parameter. args_index_t Indices; + /// Position in the Storage array where the next argument should added. + size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has /// been added. Zero if the argument at the index isn't a local memory /// argument. @@ -95,22 +97,30 @@ struct ur_kernel_handle_t_ { /// Implicit offset argument is kept at the back of the indices collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { + // Expand storage to accommodate this Index if needed. 
if (Index + 2 > Indices.size()) { - // Move implicit offset argument Index with the end + // Move implicit offset argument index with the end Indices.resize(Index + 2, Indices.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - AlignedLocalMemSize[Index] = LocalSize; + + // Copy new argument to storage if it hasn't been added before. + if (ParamSizes[Index] == 0) { + ParamSizes[Index] = Size; + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + AlignedLocalMemSize[Index] = LocalSize; + InsertPos += Size; + } + // Otherwise, update the existing argument. + else { + std::memcpy(Indices[Index], Arg, Size); + AlignedLocalMemSize[Index] = LocalSize; + assert(Size == ParamSizes[Index]); + } } /// Returns the padded size and offset of a local memory argument. @@ -151,20 +161,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - void addLocalArg(size_t Index, size_t Size) { - // Get the aligned argument size and offset into local data - auto [AlignedLocalSize, AlignedLocalOffset] = - calcAlignedLocalArgument(Index, Size); - - // Store argument details - addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - AlignedLocalSize); - - // For every existing local argument which follows at later argument - // indices, update the offset and pointer into the kernel local memory. - // Required as padding will need to be recalculated. + // Iterate over all existing local argument which follows StartIndex + // index, update the offset and pointer into the kernel local memory. 
+ void updateLocalArgOffset(size_t StartIndex) { const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg - for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { // Skip if successor argument isn't a local memory arg @@ -179,14 +180,26 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - const size_t InsertPos = - std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + SuccIndex, size_t{0}); - std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + auto [AlignedLocalSize, AlignedLocalOffset] = + calcAlignedLocalArgument(Index, Size); + + // Store argument details + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + AlignedLocalSize); + + // For every existing local argument which follows at later argument + // indices, update the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. 
+ updateLocalArgOffset(Index + 1); + } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { assert(hMem && "Invalid mem handle"); // To avoid redundancy we are not storing mem obj with index i at index diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 559a61e3ad..6f309b6933 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -1094,3 +1094,223 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) { uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } + +struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase { + virtual void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpExecutionTest::SetUp()); + + if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { + GTEST_SKIP() + << "Local memory argument update not supported on Level Zero."; + } + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 
3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = + sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + 
+ // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + } +}; + +struct LocalMemoryUpdateTestOutOfOrder : LocalMemoryUpdateTestBaseOutOfOrder { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + LocalMemoryUpdateTestBaseOutOfOrder::SetUp()); + + // Append kernel command to command-buffer and close command-buffer + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + } + + void TearDown() override { + if (command_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + LocalMemoryUpdateTestBaseOutOfOrder::TearDown()); + } + + ur_exp_command_buffer_command_handle_t command_handle = nullptr; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder); + +// Test updating A,X,Y parameters to new values and local memory to larger +// values when the kernel arguments were added out of order. 
+TEST_P(LocalMemoryUpdateTestOutOfOrder, UpdateAllParameters) { + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + std::array + new_input_descs; + std::array + new_value_descs; + + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + + // New local_mem_a at index 0 + new_value_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New local_mem_b at index 1 + new_value_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }; + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + 
ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, local_size); +} \ No newline at end of file diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp index 688724ec09..f5fc0019ae 100644 --- a/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -237,3 +237,102 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { Validate(output, X, Y, A, global_size, new_local_size); } + +// Tests that adding arguments out of order (e.g. index 1 before index 0) works. 
+struct urKernelSetArgLocalOutOfOrder : urKernelSetArgLocalMultiTest { + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = + sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, 
nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelSetArgLocalOutOfOrder); +TEST_P(urKernelSetArgLocalOutOfOrder, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); +} From e3dcfc3c5726f39e94ae4a12f0ef2325e7970d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Tue, 21 Jan 2025 14:20:16 +0000 Subject: [PATCH 25/46] Rename Indices member variable to ArgPointers --- source/adapters/cuda/command_buffer.cpp | 6 ++-- source/adapters/cuda/enqueue.cpp | 6 ++-- source/adapters/cuda/kernel.hpp | 37 +++++++++++++------------ 
source/adapters/hip/command_buffer.cpp | 6 ++-- source/adapters/hip/enqueue.cpp | 4 +-- source/adapters/hip/kernel.hpp | 34 ++++++++++++----------- test/adapters/cuda/kernel_tests.cpp | 6 ++-- 7 files changed, 52 insertions(+), 47 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 05c20a6614..37018dde6c 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -523,7 +523,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ThreadsPerBlock, BlocksPerGrid)); // Set node param structure with the kernel related data - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); CUDA_KERNEL_NODE_PARAMS NodeParams = {}; NodeParams.func = CuFunc; NodeParams.gridDimX = BlocksPerGrid[0]; @@ -533,7 +533,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( NodeParams.blockDimY = ThreadsPerBlock[1]; NodeParams.blockDimZ = ThreadsPerBlock[2]; NodeParams.sharedMemBytes = LocalSize; - NodeParams.kernelParams = const_cast(ArgIndices.data()); + NodeParams.kernelParams = const_cast(ArgPointers.data()); // Create and add an new kernel node to the Cuda graph UR_CHECK_ERROR(cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph, @@ -1398,7 +1398,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDimZ = ThreadsPerBlock[2]; Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize(); Params.kernelParams = - const_cast(KernelCommandHandle->Kernel->getArgIndices().data()); + const_cast(KernelCommandHandle->Kernel->getArgPointers().data()); CUgraphNode Node = KernelCommandHandle->Node; CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec; diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 2a4a2cf54f..71c4340456 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -492,7 +492,7 
@@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgIndices = hKernel->getArgPointers(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, @@ -680,7 +680,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); CUlaunchConfig launch_config; launch_config.gridDimX = BlocksPerGrid[0]; @@ -696,7 +696,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.numAttrs = launch_attribute.size(); UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, - const_cast(ArgIndices.data()), + const_cast(ArgPointers.data()), nullptr)); if (phEvent) { diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index a6194e9a57..f299714b02 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -66,8 +66,8 @@ struct ur_kernel_handle_t_ { args_t Storage; /// Aligned size of each parameter, including padding. args_size_t ParamSizes; - /// Byte offset into /p Storage allocation for each parameter. - args_index_t Indices; + /// Byte offset into /p Storage allocation for each argument. + args_index_t ArgPointers; /// Position in the Storage array where the next argument should added. size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has @@ -92,21 +92,23 @@ struct ur_kernel_handle_t_ { std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; arguments() { - // Place the implicit offset index at the end of the indicies collection - Indices.emplace_back(&ImplicitOffsetArgs); + // Place the implicit offset index at the end of the ArgPointers + // collection. 
+ ArgPointers.emplace_back(&ImplicitOffsetArgs); } /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. + /// Implicit offset argument is kept at the back of the ArgPointers + /// collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { // Expand storage to accommodate this Index if needed. - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { // Move implicit offset argument index with the end - Indices.resize(Index + 2, Indices.back()); + ArgPointers.resize(Index + 2, ArgPointers.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); @@ -117,13 +119,13 @@ struct ur_kernel_handle_t_ { if (ParamSizes[Index] == 0) { ParamSizes[Index] = Size; std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; + ArgPointers[Index] = &Storage[InsertPos]; AlignedLocalMemSize[Index] = LocalSize; InsertPos += Size; } // Otherwise, update the existing argument. 
else { - std::memcpy(Indices[Index], Arg, Size); + std::memcpy(ArgPointers[Index], Arg, Size); AlignedLocalMemSize[Index] = LocalSize; assert(Size == ParamSizes[Index]); } @@ -138,7 +140,7 @@ struct ur_kernel_handle_t_ { std::pair calcAlignedLocalArgument(size_t Index, size_t Size) { // Store the unpadded size of the local argument - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } @@ -168,10 +170,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - // Iterate over all existing local argument which follows StartIndex + // Iterate over each existing local argument which follows StartIndex // index, update the offset and pointer into the kernel local memory. void updateLocalArgOffset(size_t StartIndex) { - const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + const size_t NumArgs = + ArgPointers.size() - 1; // Accounts for implicit arg for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { @@ -187,7 +190,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, + std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } @@ -235,7 +238,7 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - const args_index_t &getIndices() const noexcept { return Indices; } + const args_index_t &getArgPointers() const noexcept { return ArgPointers; } uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), @@ -306,7 +309,7 @@ struct ur_kernel_handle_t_ { /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API uint32_t getNumArgs() const noexcept { - 
return static_cast(Args.Indices.size() - 1); + return static_cast(Args.ArgPointers.size() - 1); } void setKernelArg(int Index, size_t Size, const void *Arg) { @@ -321,8 +324,8 @@ struct ur_kernel_handle_t_ { return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); + const arguments::args_index_t &getArgPointers() const { + return Args.getArgPointers(); } void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); } diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 09c59bb9f7..887eb75287 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -378,7 +378,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid)); // Set node param structure with the kernel related data - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); hipKernelNodeParams NodeParams; NodeParams.func = HIPFunc; NodeParams.gridDim.x = BlocksPerGrid[0]; @@ -388,7 +388,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( NodeParams.blockDim.y = ThreadsPerBlock[1]; NodeParams.blockDim.z = ThreadsPerBlock[2]; NodeParams.sharedMemBytes = LocalSize; - NodeParams.kernelParams = const_cast(ArgIndices.data()); + NodeParams.kernelParams = const_cast(ArgPointers.data()); NodeParams.extra = nullptr; // Create and add an new kernel node to the HIP graph @@ -1098,7 +1098,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDim.z = ThreadsPerBlock[2]; Params.sharedMemBytes = hCommand->Kernel->getLocalSize(); Params.kernelParams = - const_cast(hCommand->Kernel->getArgIndices().data()); + const_cast(hCommand->Kernel->getArgPointers().data()); hipGraphNode_t Node = hCommand->Node; hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec; 
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 8c7c1c617d..849369de4b 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } - auto ArgIndices = hKernel->getArgIndices(); + auto ArgPointers = hKernel->getArgPointers(); // If migration of mem across buffer is needed, an event must be associated // with this command, implicitly if phEvent is nullptr @@ -322,7 +322,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(hipModuleLaunchKernel( HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], - hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr)); + hKernel->getLocalSize(), HIPStream, ArgPointers.data(), nullptr)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index 61dd89cc99..5ec51e7fa4 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -61,8 +61,8 @@ struct ur_kernel_handle_t_ { args_t Storage; /// Aligned size of each parameter, including padding. args_size_t ParamSizes; - /// Byte offset into /p Storage allocation for each parameter. - args_index_t Indices; + /// Byte offset into /p Storage allocation for each argument. + args_index_t ArgPointers; /// Position in the Storage array where the next argument should added. size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has @@ -87,20 +87,21 @@ struct ur_kernel_handle_t_ { arguments() { // Place the implicit offset index at the end of the indicies collection - Indices.emplace_back(&ImplicitOffsetArgs); + ArgPointers.emplace_back(&ImplicitOffsetArgs); } /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. 
- /// Implicit offset argument is kept at the back of the indices collection. + /// Implicit offset argument is kept at the back of the ArgPointers + /// collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { // Expand storage to accommodate this Index if needed. - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { // Move implicit offset argument index with the end - Indices.resize(Index + 2, Indices.back()); + ArgPointers.resize(Index + 2, ArgPointers.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); @@ -111,13 +112,13 @@ struct ur_kernel_handle_t_ { if (ParamSizes[Index] == 0) { ParamSizes[Index] = Size; std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; + ArgPointers[Index] = &Storage[InsertPos]; AlignedLocalMemSize[Index] = LocalSize; InsertPos += Size; } // Otherwise, update the existing argument. else { - std::memcpy(Indices[Index], Arg, Size); + std::memcpy(ArgPointers[Index], Arg, Size); AlignedLocalMemSize[Index] = LocalSize; assert(Size == ParamSizes[Index]); } @@ -132,7 +133,7 @@ struct ur_kernel_handle_t_ { std::pair calcAlignedLocalArgument(size_t Index, size_t Size) { // Store the unpadded size of the local argument - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } @@ -161,10 +162,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - // Iterate over all existing local argument which follows StartIndex + // Iterate over each existing local argument which follows StartIndex // index, update the offset and pointer into the kernel local memory. 
void updateLocalArgOffset(size_t StartIndex) { - const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + const size_t NumArgs = + ArgPointers.size() - 1; // Accounts for implicit arg for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { @@ -180,7 +182,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, + std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } @@ -219,7 +221,7 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - const args_index_t &getIndices() const noexcept { return Indices; } + const args_index_t &getArgPointers() const noexcept { return ArgPointers; } uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), @@ -276,7 +278,7 @@ struct ur_kernel_handle_t_ { /// offset. 
Note this only returns the current known number of arguments, /// not the real one required by the kernel, since this cannot be queried /// from the HIP Driver API - uint32_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } + uint32_t getNumArgs() const noexcept { return Args.ArgPointers.size() - 1; } void setKernelArg(int Index, size_t Size, const void *Arg) { Args.addArg(Index, Size, Arg); @@ -290,8 +292,8 @@ struct ur_kernel_handle_t_ { return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); + const arguments::args_index_t &getArgPointers() const { + return Args.getArgPointers(); } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } diff --git a/test/adapters/cuda/kernel_tests.cpp b/test/adapters/cuda/kernel_tests.cpp index 0f7f3351fe..7b83459c5f 100644 --- a/test/adapters/cuda/kernel_tests.cpp +++ b/test/adapters/cuda/kernel_tests.cpp @@ -153,7 +153,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSimple) { int number = 10; ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number)); - const auto &kernelArgs = kernel->getArgIndices(); + const auto &kernelArgs = kernel->getArgPointers(); ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); int storedValue = *static_cast(kernelArgs[0]); @@ -175,7 +175,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) { int number = 10; ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number)); - const auto &kernelArgs = kernel->getArgIndices(); + const auto &kernelArgs = kernel->getArgPointers(); ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); int storedValue = *static_cast(kernelArgs[0]); ASSERT_EQ(storedValue, number); @@ -183,7 +183,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) { int otherNumber = 934; ASSERT_SUCCESS( urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &otherNumber)); - const auto kernelArgs2 = kernel->getArgIndices(); + const 
auto kernelArgs2 = kernel->getArgPointers(); ASSERT_EQ(kernelArgs2.size(), 1 + NumberOfImplicitArgsCUDA); storedValue = *static_cast(kernelArgs2[0]); ASSERT_EQ(storedValue, otherNumber); From 06e7807e20c8693733abc2bf7db90661abbc4aba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Tue, 21 Jan 2025 16:12:05 +0000 Subject: [PATCH 26/46] Use ze events instead of ur in getSignalEvent --- .../level_zero/v2/command_list_manager.cpp | 7 +- .../level_zero/v2/command_list_manager.hpp | 2 +- .../v2/queue_immediate_in_order.cpp | 67 +++++++------------ .../v2/queue_immediate_in_order.hpp | 2 +- 4 files changed, 30 insertions(+), 48 deletions(-) diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index fb50726053..b248fd2dd3 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -44,13 +44,13 @@ ur_command_list_manager::getWaitListView(const ur_event_handle_t *phWaitEvents, return {waitList.data(), static_cast(numWaitEvents)}; } -ur_event_handle_t +ze_event_handle_t ur_command_list_manager::getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType) { if (hUserEvent && queue) { *hUserEvent = eventPool->allocate(); (*hUserEvent)->resetQueueAndCommand(queue, commandType); - return *hUserEvent; + return (*hUserEvent)->getZeEvent(); } else { return nullptr; } @@ -80,7 +80,7 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( zeThreadGroupDimensions, WG, workDim, pGlobalWorkSize, pLocalWorkSize)); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -105,7 +105,6 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( TRACK_SCOPE_LATENCY( "ur_command_list_manager::zeCommandListAppendLaunchKernel"); - auto zeSignalEvent = 
signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendLaunchKernel, (zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions, zeSignalEvent, waitList.second, waitList.first)); diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index 52b4cbbe55..9e0049a130 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -39,7 +39,7 @@ struct ur_command_list_manager : public _ur_object { std::pair getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents); - ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType); private: diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 861a47fa94..2d58e36d9a 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -88,7 +88,7 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( }), eventFlagsFromQueueFlags(flags)) {} -ur_event_handle_t +ze_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType) { return commandListManager.getSignalEvent(hUserEvent, commandType); @@ -218,7 +218,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( return UR_RESULT_SUCCESS; } - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -228,12 +228,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } - if (signalEvent) { ZE2UR_CALL( 
zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), signalEvent->getZeEvent())); - } - + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -250,13 +247,13 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierImpl( return UR_RESULT_SUCCESS; } - auto signalEvent = + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); ZE2UR_CALL(zeCommandListAppendBarrier, - (commandListManager.getZeCommandList(), signalEvent->getZeEvent(), + (commandListManager.getZeCommandList(), zeSignalEvent, numWaitEvents, pWaitEvents)); return UR_RESULT_SUCCESS; @@ -291,7 +288,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t commandType) { - auto signalEvent = getSignalEvent(phEvent, commandType); + auto zeSignalEvent = getSignalEvent(phEvent, commandType); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -321,7 +318,6 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( waitList.second = 0; } - auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopy, (commandListManager.getZeCommandList(), pDst, pSrc, size, zeSignalEvent, waitList.second, waitList.first)); @@ -380,7 +376,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( auto zeParams = ur2zeRegionParams(srcOrigin, dstOrigin, region, srcRowPitch, dstRowPitch, srcSlicePitch, dstSlicePitch); - auto signalEvent = getSignalEvent(phEvent, commandType); + auto zeSignalEvent = getSignalEvent(phEvent, commandType); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -409,7 +405,6 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( waitList.second = 0; } - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, (commandListManager.getZeCommandList(), pDst, &zeParams.dstRegion, zeParams.dstPitch, zeParams.dstSlicePitch, pSrc, @@ -582,7 +577,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( std::scoped_lock lock(this->Mutex, hBuffer->getMutex()); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -601,11 +596,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( ZE2UR_CALL(zeCommandListAppendWaitOnEvents, (commandListManager.getZeCommandList(), waitList.second, waitList.first)); - if (signalEvent) { ZE2UR_CALL( zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), signalEvent->getZeEvent())); - } + (commandListManager.getZeCommandList(), zeSignalEvent)); + } if (blockingMap) { @@ -623,7 +617,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( std::scoped_lock lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP); auto waitList = 
getWaitListView(phEventWaitList, numEventsInWaitList); @@ -642,11 +636,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( memoryMigrated = true; }); - if (signalEvent) { ZE2UR_CALL( zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), signalEvent->getZeEvent())); - } + (commandListManager.getZeCommandList(), zeSignalEvent)); + return UR_RESULT_SUCCESS; } @@ -657,7 +650,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t commandType) { - auto signalEvent = getSignalEvent(phEvent, commandType); + auto zeSignalEvent = getSignalEvent(phEvent, commandType); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -682,7 +675,6 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( // PatternSize must be a power of two for zeCommandListAppendMemoryFill. // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryFill, (commandListManager.getZeCommandList(), pDst, pPattern, patternSize, size, zeSignalEvent, waitList.second, @@ -714,12 +706,11 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( std::scoped_lock lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); - auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopy, (commandListManager.getZeCommandList(), pDst, pSrc, size, zeSignalEvent, numWaitEvents, pWaitEvents)); @@ -742,7 +733,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( std::scoped_lock lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -756,11 +747,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (commandListManager.getZeCommandList(), pMem, size)); - if (signalEvent) { ZE2UR_CALL( zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), signalEvent->getZeEvent())); - } + (commandListManager.getZeCommandList(), zeSignalEvent)); + return UR_RESULT_SUCCESS; } @@ -777,7 +767,7 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, auto zeAdvice = ur_cast(advice); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); @@ -792,11 +782,9 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, pMem, size, zeAdvice)); - if (signalEvent) { ZE2UR_CALL( zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), signalEvent->getZeEvent())); - } + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -996,7 +984,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( zeThreadGroupDimensions, WG, workDim, pGlobalWorkSize, pLocalWorkSize)); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); + auto zeSignalEvent = getSignalEvent(phEvent, 
UR_COMMAND_KERNEL_LAUNCH); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -1021,7 +1009,6 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::" "zeCommandListAppendLaunchCooperativeKernel"); - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, (commandListManager.getZeCommandList(), hZeKernel, &zeThreadGroupDimensions, zeSignalEvent, waitList.second, @@ -1040,20 +1027,18 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( std::scoped_lock lock(this->Mutex); - auto signalEvent = - getSignalEvent(phEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP); - if (!signalEvent) { + if (!phEvent && !*phEvent) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); - signalEvent->recordStartTimestamp(); + (*phEvent)->recordStartTimestamp(); auto [timestampPtr, zeSignalEvent] = - signalEvent->getEventEndTimestampAndHandle(); + (*phEvent)->getEventEndTimestampAndHandle(); ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, (commandListManager.getZeCommandList(), timestampPtr, @@ -1073,13 +1058,11 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand) { std::scoped_lock Lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, callerCommand); + auto zeSignalEvent = getSignalEvent(phEvent, callerCommand); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); - auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, (commandListManager.getZeCommandList(), numCommandLists, phCommandLists, zeSignalEvent, numWaitEvents, pWaitEvents)); diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index de3b95c748..6cf8b0c51c 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -39,7 +39,7 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents); - ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType); void deferEventFree(ur_event_handle_t hEvent) override; From 9f53547f20b659217537b33806fdea5c43a41e99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Tue, 21 Jan 2025 16:47:15 +0000 Subject: [PATCH 27/46] Remove not needed structs and reformat code --- .../adapters/level_zero/v2/command_buffer.hpp | 10 +------ .../v2/queue_immediate_in_order.cpp | 29 +++++++------------ 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index 50a3d729fd..eca575bfa7 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -16,26 +16,17 @@ #include "queue_api.hpp" #include -struct command_buffer_profiling_t { - ur_exp_command_buffer_sync_point_t numEvents; - ze_kernel_timestamp_result_t *timestamps; -}; - struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_exp_command_buffer_handle_t_( ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, const ur_exp_command_buffer_desc_t *desc); 
~ur_exp_command_buffer_handle_t_() = default; - ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, - ur_command_t commandType); ur_command_list_manager commandListManager; ur_result_t closeCommandList(); - std::vector waitList; - // Indicates if command-buffer commands can be updated after it is closed. bool isUpdatable = false; // Indicates if command buffer was finalized. @@ -50,6 +41,7 @@ struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { ~ur_exp_command_buffer_command_handle_t_(); +private: // Command-buffer of this command. ur_exp_command_buffer_handle_t commandBuffer; // L0 command ID identifying this command diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 2d58e36d9a..acb0a6f1a3 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -228,9 +228,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } - ZE2UR_CALL( - zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -596,10 +595,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( ZE2UR_CALL(zeCommandListAppendWaitOnEvents, (commandListManager.getZeCommandList(), waitList.second, waitList.first)); - ZE2UR_CALL( - zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); } if (blockingMap) { @@ -636,10 +633,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( memoryMigrated = true; }); - ZE2UR_CALL( - zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); 
- + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -747,10 +742,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (commandListManager.getZeCommandList(), pMem, size)); - ZE2UR_CALL( - zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -782,9 +775,8 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, pMem, size, zeAdvice)); - ZE2UR_CALL( - zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -1027,7 +1019,6 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( std::scoped_lock lock(this->Mutex); - if (!phEvent && !*phEvent) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } From a4516461f41fadf0d098721d4a90e5ac869d1824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Wed, 22 Jan 2025 10:58:59 +0000 Subject: [PATCH 28/46] Fix PR comments --- source/adapters/level_zero/v2/command_buffer.cpp | 14 ++++++-------- source/adapters/level_zero/v2/command_buffer.hpp | 4 ++-- .../level_zero/v2/command_list_manager.cpp | 4 +--- .../level_zero/v2/queue_immediate_in_order.cpp | 4 ++-- 4 files changed, 11 insertions(+), 15 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 46c8c6ae27..c35d97d76b 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -40,12 +40,13 @@ 
ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( std::forward(commandList)), isUpdatable(desc ? desc->isUpdatable : false) {} -ur_result_t ur_exp_command_buffer_handle_t_::closeCommandList() { +ur_result_t ur_exp_command_buffer_handle_t_::finalizeCommandBuffer() { // It is not allowed to append to command list from multiple threads. std::scoped_lock guard(this->Mutex); - + UR_ASSERT(!isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); // Close the command lists and have them ready for dispatch. ZE2UR_CALL(zeCommandListClose, (this->commandListManager.getZeCommandList())); + isFinalized = true; return UR_RESULT_SUCCESS; } @@ -72,7 +73,7 @@ urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, context, device, std::move(zeCommandList), commandBufferDesc); return UR_RESULT_SUCCESS; -} catch (const std::bad_alloc &) { +} catch (...) { return exceptionToResult(std::current_exception()); } @@ -80,7 +81,7 @@ ur_result_t urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { hCommandBuffer->RefCount.increment(); return UR_RESULT_SUCCESS; -} catch (const std::bad_alloc &) { +} catch (...) { return exceptionToResult(std::current_exception()); } @@ -98,10 +99,7 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { ur_result_t urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(!hCommandBuffer->isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); - hCommandBuffer->closeCommandList(); - - hCommandBuffer->isFinalized = true; + UR_CALL(hCommandBuffer->finalizeCommandBuffer()); return UR_RESULT_SUCCESS; } catch (...) 
{ return exceptionToResult(std::current_exception()); diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index eca575bfa7..c263457d1a 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -25,7 +25,7 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_command_list_manager commandListManager; - ur_result_t closeCommandList(); + ur_result_t finalizeCommandBuffer(); // Indicates if command-buffer commands can be updated after it is closed. bool isUpdatable = false; @@ -39,9 +39,9 @@ struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { ur_exp_command_buffer_command_handle_t_(ur_exp_command_buffer_handle_t, uint64_t); +private: ~ur_exp_command_buffer_command_handle_t_(); -private: // Command-buffer of this command. ur_exp_command_buffer_handle_t commandBuffer; // L0 command ID identifying this command diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index b248fd2dd3..987cb462a3 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -20,9 +20,7 @@ ur_command_list_manager::ur_command_list_manager( ur_queue_handle_t_ *queue) : context(context), device(device), eventPool(context->eventPoolCache.borrow(device->Id.value(), flags)), - zeCommandList( - std::forward(commandList)), - queue(queue) { + zeCommandList(std::move(commandList)), queue(queue) { UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); } diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index acb0a6f1a3..eccf07b8b7 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -83,7 
+83,7 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( reinterpret_cast(hNativeHandle), [ownZeQueue](ze_command_list_handle_t hZeCommandList) { if (ownZeQueue) { - zeCommandListDestroy(hZeCommandList); + ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList)); } }), eventFlagsFromQueueFlags(flags)) {} @@ -1022,7 +1022,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( if (!phEvent && !*phEvent) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - + getSignalEvent(phEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); From f98229f8bc1ff4b11a960e06799f2d182bb9b89c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Wed, 22 Jan 2025 13:49:54 +0000 Subject: [PATCH 29/46] Fix formatting --- .../update/local_memory_update.cpp | 401 +++++++++--------- .../kernel/urKernelSetArgLocal.cpp | 180 ++++---- 2 files changed, 289 insertions(+), 292 deletions(-) diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 6f309b6933..d55094a52c 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -1096,119 +1096,117 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) { } struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase { - virtual void SetUp() override { - program_name = "saxpy_usm_local_mem"; - UUR_RETURN_ON_FATAL_FAILURE( - urUpdatableCommandBufferExpExecutionTest::SetUp()); - - if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { - GTEST_SKIP() - << "Local memory argument update not supported on Level Zero."; - } - - // HIP has extra args for local memory so we define an offset for arg - // indices here for updating - hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 
3 : 0; - ur_device_usm_access_capability_flags_t shared_usm_flags; - ASSERT_SUCCESS( - uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); - if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { - GTEST_SKIP() << "Shared USM is not supported."; - } - - const size_t allocation_size = - sizeof(uint32_t) * global_size * local_size; - for (auto &shared_ptr : shared_ptrs) { - ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, - allocation_size, &shared_ptr)); - ASSERT_NE(shared_ptr, nullptr); - - std::vector pattern(allocation_size); - uur::generateMemFillPattern(pattern); - std::memcpy(shared_ptr, pattern.data(), allocation_size); - } - - std::array index_order{}; - if (backend != UR_PLATFORM_BACKEND_HIP) { - index_order = {3, 2, 4, 5, 1, 0}; - } else { - index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; - } - size_t current_index = 0; - - // Index 3 is A - ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], - sizeof(A), nullptr, &A)); - // Index 2 is output - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[0])); - - // Index 4 is X - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[1])); - // Index 5 is Y - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[2])); - - // Index 1 is local_mem_b arg - ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], - local_mem_b_size, nullptr)); - if (backend == UR_PLATFORM_BACKEND_HIP) { - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - } - 
- // Index 0 is local_mem_a arg - ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], - local_mem_a_size, nullptr)); - - // Hip has extra args for local mem at index 1-3 - if (backend == UR_PLATFORM_BACKEND_HIP) { - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - } + virtual void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpExecutionTest::SetUp()); + + if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { + GTEST_SKIP() + << "Local memory argument update not supported on Level Zero."; + } + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 
3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); } + + // Index 
0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + } }; struct LocalMemoryUpdateTestOutOfOrder : LocalMemoryUpdateTestBaseOutOfOrder { - void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE( - LocalMemoryUpdateTestBaseOutOfOrder::SetUp()); - - // Append kernel command to command-buffer and close command-buffer - ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( - updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, - nullptr, nullptr, &command_handle)); - ASSERT_NE(command_handle, nullptr); - - ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); - } + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(LocalMemoryUpdateTestBaseOutOfOrder::SetUp()); + + // Append kernel command to command-buffer and close command-buffer + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, nullptr, + nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); - void TearDown() override { - if (command_handle) { - EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); - } + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + } - UUR_RETURN_ON_FATAL_FAILURE( - LocalMemoryUpdateTestBaseOutOfOrder::TearDown()); + void TearDown() override { + if 
(command_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); } - ur_exp_command_buffer_command_handle_t command_handle = nullptr; + UUR_RETURN_ON_FATAL_FAILURE( + LocalMemoryUpdateTestBaseOutOfOrder::TearDown()); + } + + ur_exp_command_buffer_command_handle_t command_handle = nullptr; }; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder); @@ -1216,101 +1214,100 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder); // Test updating A,X,Y parameters to new values and local memory to larger // values when the kernel arguments were added out of order. TEST_P(LocalMemoryUpdateTestOutOfOrder, UpdateAllParameters) { - // Run command-buffer prior to update and verify output - ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, - nullptr, nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - - uint32_t *output = (uint32_t *)shared_ptrs[0]; - uint32_t *X = (uint32_t *)shared_ptrs[1]; - uint32_t *Y = (uint32_t *)shared_ptrs[2]; - Validate(output, X, Y, A, global_size, local_size); - - // Update inputs - std::array - new_input_descs; - std::array - new_value_descs; - - size_t new_local_size = local_size * 4; - size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); - - // New local_mem_a at index 0 - new_value_descs[0] = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype - nullptr, // pNext - 0, // argIndex - new_local_mem_a_size, // argSize - nullptr, // pProperties - nullptr, // hArgValue - }; - - // New local_mem_b at index 1 - new_value_descs[1] = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype - nullptr, // pNext - 1 + hip_arg_offset, // argIndex - local_mem_b_size, // argSize - nullptr, // pProperties - nullptr, // hArgValue - }; - - // New A at index 3 - uint32_t new_A = 33; - new_value_descs[2] = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype - nullptr, // pNext - 3 + (2 * hip_arg_offset), // argIndex - 
sizeof(new_A), // argSize - nullptr, // pProperties - &new_A, // hArgValue - }; - - // New X at index 4 - new_input_descs[0] = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype - nullptr, // pNext - 4 + (2 * hip_arg_offset), // argIndex - nullptr, // pProperties - &shared_ptrs[3], // pArgValue - }; - - // New Y at index 5 - new_input_descs[1] = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype - nullptr, // pNext - 5 + (2 * hip_arg_offset), // argIndex - nullptr, // pProperties - &shared_ptrs[4], // pArgValue - }; - - // Update kernel inputs - ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype - nullptr, // pNext - kernel, // hNewKernel - 0, // numNewMemObjArgs - new_input_descs.size(), // numNewPointerArgs - new_value_descs.size(), // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - new_input_descs.data(), // pNewPointerArgList - new_value_descs.data(), // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize - }; - - // Update kernel and enqueue command-buffer again - ASSERT_SUCCESS( - urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); - ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, - nullptr, nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - - // Verify that update occurred correctly - uint32_t *new_output = (uint32_t *)shared_ptrs[0]; - uint32_t *new_X = (uint32_t *)shared_ptrs[3]; - uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; - Validate(new_output, new_X, new_Y, new_A, global_size, local_size); + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t 
*)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + std::array + new_input_descs; + std::array new_value_descs; + + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + + // New local_mem_a at index 0 + new_value_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New local_mem_b at index 1 + new_value_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }; + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // 
pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } \ No newline at end of file diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp index f5fc0019ae..f056d025bc 100644 --- a/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -240,99 +240,99 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { // Tests that adding arguments out of order (e.g. index 1 before index 0) works. struct urKernelSetArgLocalOutOfOrder : urKernelSetArgLocalMultiTest { - void SetUp() override { - program_name = "saxpy_usm_local_mem"; - UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); - - ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, - sizeof(backend), &backend, nullptr)); - - // HIP has extra args for local memory so we define an offset for arg indices here for updating - hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 
3 : 0; - ur_device_usm_access_capability_flags_t shared_usm_flags; - ASSERT_SUCCESS( - uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); - if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { - GTEST_SKIP() << "Shared USM is not supported."; - } - - const size_t allocation_size = - sizeof(uint32_t) * global_size * local_size; - for (auto &shared_ptr : shared_ptrs) { - ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, - allocation_size, &shared_ptr)); - ASSERT_NE(shared_ptr, nullptr); - - std::vector pattern(allocation_size); - uur::generateMemFillPattern(pattern); - std::memcpy(shared_ptr, pattern.data(), allocation_size); - } - - std::array index_order{}; - if (backend != UR_PLATFORM_BACKEND_HIP) { - index_order = {3, 2, 4, 5, 1, 0}; - } else { - index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; - } - size_t current_index = 0; - - // Index 3 is A - ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], - sizeof(A), nullptr, &A)); - // Index 2 is output - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[0])); - - // Index 4 is X - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[1])); - // Index 5 is Y - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[2])); - - // Index 1 is local_mem_b arg - ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], - local_mem_b_size, nullptr)); - if (backend == UR_PLATFORM_BACKEND_HIP) { - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - } - 
- // Index 0 is local_mem_a arg - ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], - local_mem_a_size, nullptr)); - - // Hip has extra args for local mem at index 1-3 - if (backend == UR_PLATFORM_BACKEND_HIP) { - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - } + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 
3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; } + + const size_t allocation_size = sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + + // Index 
0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + } }; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelSetArgLocalOutOfOrder); TEST_P(urKernelSetArgLocalOutOfOrder, Success) { - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - &global_offset, &global_size, - &local_size, 0, nullptr, nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - - uint32_t *output = (uint32_t *)shared_ptrs[0]; - uint32_t *X = (uint32_t *)shared_ptrs[1]; - uint32_t *Y = (uint32_t *)shared_ptrs[2]; - Validate(output, X, Y, A, global_size, local_size); + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); } From 67f716258334a448aeea230f0b88adb0e54a608a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Wed, 22 Jan 2025 14:20:08 +0000 Subject: [PATCH 30/46] Fix ze function calling --- .../v2/queue_immediate_in_order.cpp | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index eccf07b8b7..990527514a 100644 --- 
a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -228,8 +228,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + } return UR_RESULT_SUCCESS; } @@ -595,8 +597,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( ZE2UR_CALL(zeCommandListAppendWaitOnEvents, (commandListManager.getZeCommandList(), waitList.second, waitList.first)); - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + } } if (blockingMap) { @@ -632,10 +636,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( nullptr, waitList.second, waitList.first)); memoryMigrated = true; }); - - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + } return UR_RESULT_SUCCESS; } @@ -741,9 +745,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( // TODO: figure out how to translate "flags" ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (commandListManager.getZeCommandList(), pMem, size)); - - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + } return UR_RESULT_SUCCESS; } @@ -775,9 +780,10 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void 
*pMem, size_t size, (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, pMem, size, zeAdvice)); - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + } return UR_RESULT_SUCCESS; } From 9de10cd9547db008ef4347f86dc0bf9198a8fb97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Wed, 22 Jan 2025 14:43:42 +0000 Subject: [PATCH 31/46] Rename variable ArgIndices to ArgPointers --- source/adapters/cuda/enqueue.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 71c4340456..540ebb86fa 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -492,11 +492,11 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgPointers(); + auto &ArgPointers = hKernel->getArgPointers(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, - CuStream, const_cast(ArgIndices.data()), nullptr)); + CuStream, const_cast(ArgPointers.data()), nullptr)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); From eeff9f4a6e0ed51ba459fd923724fb4c3dd545d7 Mon Sep 17 00:00:00 2001 From: Przemek Malon Date: Wed, 8 Jan 2025 19:53:17 +0000 Subject: [PATCH 32/46] Enable creation of bindless images backed by host USM Small patch to enable bindless images backed by host USM in the CUDA adapter. Host and Device USM pointers are usable across the host and device for all versions of CUDA that we support. There is no need to provide the `CU_MEMHOSTALLOC_DEVICEMAP` flag during allocation, or calling `cuMemHostGetDevicePointer` to retrieve a device usable address. 
Passing a `CU_MEMHOSTALLOC_WRITECOMBINED` flag to the host USM allocation will enhance performance in certain scenarios, however, an extension allowing this is not yet available. --- source/adapters/cuda/image.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index c11a85b293..87570e3b45 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -533,8 +533,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( image_res_desc.resType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; image_res_desc.res.mipmap.hMipmappedArray = (CUmipmappedArray)hImageMem; } - } else if (mem_type == CU_MEMORYTYPE_DEVICE) { - // We have a USM pointer + } else if (mem_type == CU_MEMORYTYPE_DEVICE || + mem_type == CU_MEMORYTYPE_HOST) { + // We have a USM pointer. + // Images may be created from device or host USM. if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { image_res_desc.resType = CU_RESOURCE_TYPE_LINEAR; image_res_desc.res.linear.devPtr = (CUdeviceptr)hImageMem; From f28f7074c5a1cb9a24db9c508d8052da9c0ee1e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Thu, 23 Jan 2025 11:55:43 +0000 Subject: [PATCH 33/46] Fix access to some fields in command buffer v2 --- source/adapters/level_zero/v2/command_buffer.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index c263457d1a..5e60d6537f 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -21,18 +21,20 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, const ur_exp_command_buffer_desc_t *desc); + ~ur_exp_command_buffer_handle_t_() = default; ur_command_list_manager 
commandListManager; ur_result_t finalizeCommandBuffer(); - // Indicates if command-buffer commands can be updated after it is closed. - bool isUpdatable = false; + const bool isUpdatable = false; + // Command-buffer profiling is enabled. + const bool isProfilingEnabled = false; + +private: // Indicates if command buffer was finalized. bool isFinalized = false; - // Command-buffer profiling is enabled. - bool isProfilingEnabled = false; }; struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { From 48d1890d44b53edbf24181d022707cd7d9cf988c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Thu, 23 Jan 2025 12:30:47 +0000 Subject: [PATCH 34/46] Fix compilation --- source/adapters/level_zero/v2/queue_immediate_in_order.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 96687570b0..3d50c52a56 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -114,7 +114,7 @@ ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName, return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_QUEUE_INFO_EMPTY: { auto status = ZE_CALL_NOCHECK(zeCommandListHostSynchronize, - (handler.commandList.get(), 0)); + (commandListManager.getZeCommandList(), 0)); if (status == ZE_RESULT_SUCCESS) { return ReturnValue(true); } else if (status == ZE_RESULT_NOT_READY) { From f71ef62fa42683060fe26652797df0d12753959b Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Thu, 23 Jan 2025 16:34:03 +0000 Subject: [PATCH 35/46] Fix passing struct object by value Don't pass large objects by value, use a r-value reference instead. Also make function defined in header `inline`. 
--- test/adapters/level_zero/event_cache_tests.cpp | 2 +- test/adapters/level_zero/multi_device_event_cache_tests.cpp | 2 +- test/adapters/level_zero/ze_tracer_common.hpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/adapters/level_zero/event_cache_tests.cpp b/test/adapters/level_zero/event_cache_tests.cpp index 14466ab805..09fcff7373 100644 --- a/test/adapters/level_zero/event_cache_tests.cpp +++ b/test/adapters/level_zero/event_cache_tests.cpp @@ -30,7 +30,7 @@ static std::shared_ptr<_zel_tracer_handle_t> tracer = [] { zel_core_callbacks_t prologue_callbacks{}; prologue_callbacks.Event.pfnCreateCb = OnEnterEventCreate; prologue_callbacks.Event.pfnDestroyCb = OnEnterEventDestroy; - return enableTracing(prologue_callbacks, {}); + return enableTracing(std::move(prologue_callbacks), {}); }(); template auto combineFlags(std::tuple tuple) { diff --git a/test/adapters/level_zero/multi_device_event_cache_tests.cpp b/test/adapters/level_zero/multi_device_event_cache_tests.cpp index f0cc261bb4..bc88eb2a02 100644 --- a/test/adapters/level_zero/multi_device_event_cache_tests.cpp +++ b/test/adapters/level_zero/multi_device_event_cache_tests.cpp @@ -20,7 +20,7 @@ static std::shared_ptr<_zel_tracer_handle_t> tracer = [] { zel_core_callbacks_t prologue_callbacks{}; prologue_callbacks.CommandList.pfnAppendWaitOnEventsCb = OnAppendWaitOnEventsCb; - return enableTracing(prologue_callbacks, {}); + return enableTracing(std::move(prologue_callbacks), {}); }(); using urMultiQueueMultiDeviceEventCacheTest = uur::urAllDevicesTest; diff --git a/test/adapters/level_zero/ze_tracer_common.hpp b/test/adapters/level_zero/ze_tracer_common.hpp index bf93c71fbb..8aa93c7c13 100644 --- a/test/adapters/level_zero/ze_tracer_common.hpp +++ b/test/adapters/level_zero/ze_tracer_common.hpp @@ -11,9 +11,9 @@ #include -std::shared_ptr<_zel_tracer_handle_t> -enableTracing(zel_core_callbacks_t prologueCallbacks, - zel_core_callbacks_t epilogueCallbacks) { +inline 
std::shared_ptr<_zel_tracer_handle_t> +enableTracing(zel_core_callbacks_t &&prologueCallbacks, + zel_core_callbacks_t &&epilogueCallbacks) { EXPECT_EQ(zeInit(ZE_INIT_FLAG_GPU_ONLY), ZE_RESULT_SUCCESS); zel_tracer_desc_t tracer_desc = {ZEL_STRUCTURE_TYPE_TRACER_EXP_DESC, nullptr, From 9a64274e39c51abe9422d986386576ae02f7581c Mon Sep 17 00:00:00 2001 From: Agata Momot Date: Thu, 23 Jan 2025 18:44:47 +0100 Subject: [PATCH 36/46] remove benchmark output from markdown --- scripts/benchmarks/output_markdown.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/benchmarks/output_markdown.py b/scripts/benchmarks/output_markdown.py index 13df68d45e..fc3b65507b 100644 --- a/scripts/benchmarks/output_markdown.py +++ b/scripts/benchmarks/output_markdown.py @@ -27,7 +27,7 @@ def generate_markdown_details(results: list[Result]): markdown_sections.append(f"""
-Benchmark details - environment, command, output... +Benchmark details - environment, command... """) for res in results: @@ -42,9 +42,6 @@ def generate_markdown_details(results: list[Result]): #### Command: {' '.join(res.command)} -#### Output: -{res.stdout} -
""") markdown_sections.append(f""" From 07001aa737ef66f394cd36f2b56f28b97e98c1c2 Mon Sep 17 00:00:00 2001 From: Rafal Rudnicki Date: Fri, 24 Jan 2025 10:25:23 +0100 Subject: [PATCH 37/46] fix parseDisjointPoolConfig and add tests --- .../umf_pools/disjoint_pool_config_parser.cpp | 59 +++++++---------- test/usm/usmPoolManager.cpp | 66 +++++++++++++++++++ 2 files changed, 90 insertions(+), 35 deletions(-) diff --git a/source/common/umf_pools/disjoint_pool_config_parser.cpp b/source/common/umf_pools/disjoint_pool_config_parser.cpp index 8d5bc2066e..42c894b412 100644 --- a/source/common/umf_pools/disjoint_pool_config_parser.cpp +++ b/source/common/umf_pools/disjoint_pool_config_parser.cpp @@ -174,47 +174,36 @@ DisjointPoolAllConfigs parseDisjointPoolConfig(const std::string &config, MemParser(Params, M); }; - size_t MaxSize = (std::numeric_limits::max)(); - // Update pool settings if specified in environment. + size_t MaxSize = (std::numeric_limits::max)(); size_t EnableBuffers = 1; - if (config != "") { - std::string Params = config; - size_t Pos = Params.find(';'); - if (Pos != std::string::npos) { - if (Pos > 0) { - GetValue(Params, Pos, EnableBuffers); + + bool EnableBuffersSet = false; + bool MaxSizeSet = false; + size_t Start = 0; + size_t End = config.find(';'); + while (true) { + std::string Param = config.substr(Start, End - Start); + if (!EnableBuffersSet && (Param == "" || isdigit(Param[0]))) { + if (Param != "") { + GetValue(Param, Param.size(), EnableBuffers); } - Params.erase(0, Pos + 1); - size_t Pos = Params.find(';'); - if (Pos != std::string::npos) { - if (Pos > 0) { - GetValue(Params, Pos, MaxSize); - } - Params.erase(0, Pos + 1); - do { - size_t Pos = Params.find(';'); - if (Pos != std::string::npos) { - if (Pos > 0) { - std::string MemParams = Params.substr(0, Pos); - MemTypeParser(MemParams); - } - Params.erase(0, Pos + 1); - if (Params.size() == 0) { - break; - } - } else { - MemTypeParser(Params); - break; - } - } while (true); - } else { - // 
set MaxPoolSize for all configs - GetValue(Params, Params.size(), MaxSize); + EnableBuffersSet = true; + } else if (!MaxSizeSet && (Param == "" || isdigit(Param[0]))) { + if (Param != "") { + GetValue(Param, Param.size(), MaxSize); } + MaxSizeSet = true; } else { - GetValue(Params, Params.size(), EnableBuffers); + MemTypeParser(Param); } + + if (End == std::string::npos) { + break; + } + + Start = End + 1; + End = config.find(';', Start); } AllConfigs.EnableBuffers = EnableBuffers; diff --git a/test/usm/usmPoolManager.cpp b/test/usm/usmPoolManager.cpp index ec52d00c5e..4e82196eef 100644 --- a/test/usm/usmPoolManager.cpp +++ b/test/usm/usmPoolManager.cpp @@ -4,6 +4,7 @@ // // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "umf_pools/disjoint_pool_config_parser.hpp" #include "ur_pool_manager.hpp" #include @@ -18,6 +19,26 @@ auto createMockPoolHandle() { [](umf_memory_pool_t *) {}); } +bool compareConfig(const usm::umf_disjoint_pool_config_t &left, + usm::umf_disjoint_pool_config_t &right) { + return left.MaxPoolableSize == right.MaxPoolableSize && + left.Capacity == right.Capacity && + left.SlabMinSize == right.SlabMinSize; +} + +bool compareConfigs(const usm::DisjointPoolAllConfigs &left, + usm::DisjointPoolAllConfigs &right) { + return left.EnableBuffers == right.EnableBuffers && + compareConfig(left.Configs[usm::DisjointPoolMemType::Host], + right.Configs[usm::DisjointPoolMemType::Host]) && + compareConfig(left.Configs[usm::DisjointPoolMemType::Device], + right.Configs[usm::DisjointPoolMemType::Device]) && + compareConfig(left.Configs[usm::DisjointPoolMemType::Shared], + right.Configs[usm::DisjointPoolMemType::Shared]) && + compareConfig(left.Configs[usm::DisjointPoolMemType::SharedReadOnly], + right.Configs[usm::DisjointPoolMemType::SharedReadOnly]); +} + TEST_P(urUsmPoolDescriptorTest, poolIsPerContextTypeAndDevice) { auto &devices = uur::DevicesEnvironment::instance->devices; @@ -111,4 +132,49 @@ TEST_P(urUsmPoolManagerTest, 
poolManagerGetNonexistant) { } } +TEST_P(urUsmPoolManagerTest, config) { + // Check default config + usm::DisjointPoolAllConfigs def; + usm::DisjointPoolAllConfigs parsed1 = + usm::parseDisjointPoolConfig("1;host:2M,4,64K;device:4M,4,64K;" + "shared:0,0,2M;read_only_shared:4M,4,2M", + 0); + ASSERT_EQ(compareConfigs(def, parsed1), true); + + // Check partially set config + usm::DisjointPoolAllConfigs part1 = + usm::parseDisjointPoolConfig("1;device:4M;shared:0,0,2M", 0); + ASSERT_EQ(compareConfigs(def, part1), true); + + // Check partially set config #2 + usm::DisjointPoolAllConfigs part2 = + usm::parseDisjointPoolConfig(";device:4M;shared:0,0,2M", 0); + ASSERT_EQ(compareConfigs(def, part2), true); + + // Check partially set config #3 + usm::DisjointPoolAllConfigs part3 = + usm::parseDisjointPoolConfig(";shared:0,0,2M", 0); + ASSERT_EQ(compareConfigs(def, part3), true); + + // Check partially set config #4 + usm::DisjointPoolAllConfigs part4 = + usm::parseDisjointPoolConfig(";device:4M", 0); + ASSERT_EQ(compareConfigs(def, part4), true); + + // Check partially set config #5 + usm::DisjointPoolAllConfigs part5 = + usm::parseDisjointPoolConfig(";;device:4M,4,64K", 0); + ASSERT_EQ(compareConfigs(def, part5), true); + + // Check non-default config + usm::DisjointPoolAllConfigs test(def); + test.Configs[usm::DisjointPoolMemType::Shared].MaxPoolableSize = 128 * 1024; + test.Configs[usm::DisjointPoolMemType::Shared].Capacity = 4; + test.Configs[usm::DisjointPoolMemType::Shared].SlabMinSize = 64 * 1024; + + usm::DisjointPoolAllConfigs parsed3 = + usm::parseDisjointPoolConfig("1;shared:128K,4,64K", 0); + ASSERT_EQ(compareConfigs(test, parsed3), true); +} + UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urUsmPoolManagerTest); From 707bcded8da0b7fe5a0294a6f5e8b20a1b3f5dfe Mon Sep 17 00:00:00 2001 From: Martin Morrison-Grant Date: Mon, 13 Jan 2025 12:09:24 +0000 Subject: [PATCH 38/46] Move urMemImageGetInfo success test from a switch to individual test. 
Added implementation details to OpenCL adapter for processing image format structs. Added a couple missing enums to spec for ur_image_info_t and added CTS tests for these. --- include/ur_api.h | 10 +- include/ur_print.hpp | 48 ++++ scripts/core/memory.yml | 6 + source/adapters/opencl/memory.cpp | 99 +++++++ source/loader/layers/validation/ur_valddi.cpp | 4 +- source/loader/ur_libapi.cpp | 4 +- source/ur_api.cpp | 4 +- test/conformance/memory/urMemImageGetInfo.cpp | 254 ++++++++++++++---- test/conformance/queue/urQueueGetInfo.cpp | 28 +- .../testing/include/uur/fixtures.h | 38 --- 10 files changed, 381 insertions(+), 114 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index ad88f3ac08..684a3bb2b3 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -3324,6 +3324,12 @@ typedef enum ur_image_info_t { UR_IMAGE_INFO_HEIGHT = 5, /// [size_t] image depth UR_IMAGE_INFO_DEPTH = 6, + /// [size_t] array size + UR_IMAGE_INFO_ARRAY_SIZE = 7, + /// [uint32_t] number of MIP levels + UR_IMAGE_INFO_NUM_MIP_LEVELS = 8, + /// [uint32_t] number of samples + UR_IMAGE_INFO_NUM_SAMPLES = 9, /// @cond UR_IMAGE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -3837,7 +3843,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hMemory` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_DEPTH < propName` +/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -9499,7 +9505,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_DEPTH < propName` +/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. 
/// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 2e6f7d715d..d8873ae456 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -6466,6 +6466,15 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_image_info_t value) { case UR_IMAGE_INFO_DEPTH: os << "UR_IMAGE_INFO_DEPTH"; break; + case UR_IMAGE_INFO_ARRAY_SIZE: + os << "UR_IMAGE_INFO_ARRAY_SIZE"; + break; + case UR_IMAGE_INFO_NUM_MIP_LEVELS: + os << "UR_IMAGE_INFO_NUM_MIP_LEVELS"; + break; + case UR_IMAGE_INFO_NUM_SAMPLES: + os << "UR_IMAGE_INFO_NUM_SAMPLES"; + break; default: os << "unknown enumerator"; break; @@ -6574,6 +6583,45 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, os << ")"; } break; + case UR_IMAGE_INFO_ARRAY_SIZE: { + const size_t *tptr = (const size_t *)ptr; + if (sizeof(size_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(size_t) + << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_IMAGE_INFO_NUM_MIP_LEVELS: { + const uint32_t *tptr = (const uint32_t *)ptr; + if (sizeof(uint32_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(uint32_t) + << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_IMAGE_INFO_NUM_SAMPLES: { + const uint32_t *tptr = (const uint32_t *)ptr; + if (sizeof(uint32_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(uint32_t) + << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; diff --git a/scripts/core/memory.yml b/scripts/core/memory.yml index 75f68d8e9a..a20693f8a9 100644 --- a/scripts/core/memory.yml +++ b/scripts/core/memory.yml @@ -160,6 +160,12 @@ etors: desc: 
"[size_t] image height" - name: DEPTH desc: "[size_t] image depth" + - name: ARRAY_SIZE + desc: "[size_t] array size" + - name: NUM_MIP_LEVELS + desc: "[uint32_t] number of MIP levels" + - name: NUM_SAMPLES + desc: "[uint32_t] number of samples" --- #-------------------------------------------------------------------------- type: struct desc: "Image format including channel layout and data type" diff --git a/source/adapters/opencl/memory.cpp b/source/adapters/opencl/memory.cpp index 201df1f678..c420b920df 100644 --- a/source/adapters/opencl/memory.cpp +++ b/source/adapters/opencl/memory.cpp @@ -10,6 +10,92 @@ #include "common.hpp" +#include + +const std::unordered_map + ChannelOrderMap = { + {UR_IMAGE_CHANNEL_ORDER_A, CL_A}, + {UR_IMAGE_CHANNEL_ORDER_R, CL_R}, + {UR_IMAGE_CHANNEL_ORDER_RG, CL_RG}, + {UR_IMAGE_CHANNEL_ORDER_RA, CL_RA}, + {UR_IMAGE_CHANNEL_ORDER_RGB, CL_RGB}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, CL_RGBA}, + {UR_IMAGE_CHANNEL_ORDER_BGRA, CL_BGRA}, + {UR_IMAGE_CHANNEL_ORDER_ARGB, CL_ARGB}, + {UR_IMAGE_CHANNEL_ORDER_ABGR, CL_ABGR}, + {UR_IMAGE_CHANNEL_ORDER_INTENSITY, CL_INTENSITY}, + {UR_IMAGE_CHANNEL_ORDER_LUMINANCE, CL_LUMINANCE}, + {UR_IMAGE_CHANNEL_ORDER_RX, CL_Rx}, + {UR_IMAGE_CHANNEL_ORDER_RGX, CL_RGx}, + {UR_IMAGE_CHANNEL_ORDER_RGBX, CL_RGBx}, + {UR_IMAGE_CHANNEL_ORDER_SRGBA, CL_sRGBA}, +}; + +const std::unordered_map + ChannelTypeMap = { + {UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, CL_SNORM_INT8}, + {UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, CL_SNORM_INT16}, + {UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, CL_UNORM_INT8}, + {UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, CL_UNORM_INT16}, + {UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565, CL_UNORM_SHORT_565}, + {UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555, CL_UNORM_SHORT_555}, + {UR_IMAGE_CHANNEL_TYPE_INT_101010, CL_UNORM_INT_101010}, + {UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8, CL_SIGNED_INT8}, + {UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16, CL_SIGNED_INT16}, + {UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32, CL_SIGNED_INT32}, + {UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, 
CL_UNSIGNED_INT8}, + {UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, CL_UNSIGNED_INT16}, + {UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, CL_UNSIGNED_INT32}, + {UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT, CL_HALF_FLOAT}, + {UR_IMAGE_CHANNEL_TYPE_FLOAT, CL_FLOAT}, +}; + +cl_image_format mapURImageFormatToCL(const ur_image_format_t &PImageFormat) { + cl_image_format CLImageFormat = {UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32, + UR_IMAGE_CHANNEL_TYPE_FORCE_UINT32}; + + auto channelOrderIt = ChannelOrderMap.find(PImageFormat.channelOrder); + if (channelOrderIt != ChannelOrderMap.end()) { + CLImageFormat.image_channel_order = channelOrderIt->second; + } + + auto channelTypeIt = ChannelTypeMap.find(PImageFormat.channelType); + if (channelTypeIt != ChannelTypeMap.end()) { + CLImageFormat.image_channel_data_type = channelTypeIt->second; + } + + return CLImageFormat; +} + +ur_image_format_t mapCLImageFormatToUR(const cl_image_format *PImageFormat) { + ur_image_format_t URImageFormat = {UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32, + UR_IMAGE_CHANNEL_TYPE_FORCE_UINT32}; + + auto reverseChannelOrderIt = + std::find_if(ChannelOrderMap.begin(), ChannelOrderMap.end(), + [PImageFormat](const auto &pair) { + return pair.second == PImageFormat->image_channel_order; + }); + if (reverseChannelOrderIt != ChannelOrderMap.end()) { + URImageFormat.channelOrder = reverseChannelOrderIt->first; + } + + URImageFormat.channelOrder = (reverseChannelOrderIt != ChannelOrderMap.end()) + ? 
reverseChannelOrderIt->first + : UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32; + + auto reverseChannelTypeIt = std::find_if( + ChannelTypeMap.begin(), ChannelTypeMap.end(), + [PImageFormat](const auto &pair) { + return pair.second == PImageFormat->image_channel_data_type; + }); + if (reverseChannelTypeIt != ChannelTypeMap.end()) { + URImageFormat.channelType = reverseChannelTypeIt->first; + } + + return URImageFormat; +} + cl_image_format mapURImageFormatToCL(const ur_image_format_t *PImageFormat) { cl_image_format CLImageFormat; switch (PImageFormat->channelOrder) { @@ -174,6 +260,12 @@ cl_int mapURMemImageInfoToCL(ur_image_info_t URPropName) { return CL_IMAGE_HEIGHT; case UR_IMAGE_INFO_DEPTH: return CL_IMAGE_DEPTH; + case UR_IMAGE_INFO_ARRAY_SIZE: + return CL_IMAGE_ARRAY_SIZE; + case UR_IMAGE_INFO_NUM_MIP_LEVELS: + return CL_IMAGE_NUM_MIP_LEVELS; + case UR_IMAGE_INFO_NUM_SAMPLES: + return CL_IMAGE_NUM_SAMPLES; default: return -1; } @@ -397,7 +489,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, CL_RETURN_ON_FAILURE(ClResult); if (pPropSizeRet) { *pPropSizeRet = CheckPropSize; + } else { + if (propName == UR_IMAGE_INFO_FORMAT) { + ur_image_format_t format = mapCLImageFormatToUR( + reinterpret_cast(pPropValue)); + return ReturnValue(format); + } } + return UR_RESULT_SUCCESS; } diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 19ab908ee3..d86c3ae98d 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -1470,7 +1470,7 @@ __urdlllocal ur_result_t UR_APICALL urMemImageGetInfo( if (pPropValue == NULL && pPropSizeRet == NULL) return UR_RESULT_ERROR_INVALID_NULL_POINTER; - if (UR_IMAGE_INFO_DEPTH < propName) + if (UR_IMAGE_INFO_NUM_SAMPLES < propName) return UR_RESULT_ERROR_INVALID_ENUMERATION; if (propSize == 0 && pPropValue != NULL) @@ -7069,7 +7069,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( 
if (pPropValue == NULL && pPropSizeRet == NULL) return UR_RESULT_ERROR_INVALID_NULL_POINTER; - if (UR_IMAGE_INFO_DEPTH < propName) + if (UR_IMAGE_INFO_NUM_SAMPLES < propName) return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 5b55b71d4d..744aadf1a5 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -1943,7 +1943,7 @@ ur_result_t UR_APICALL urMemGetInfo( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hMemory` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_DEPTH < propName` +/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -7120,7 +7120,7 @@ ur_result_t UR_APICALL urBindlessImagesImageCopyExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_DEPTH < propName` +/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 15cb0442b1..82b58be9b5 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -1720,7 +1720,7 @@ ur_result_t UR_APICALL urMemGetInfo( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hMemory` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_DEPTH < propName` +/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. 
/// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -6249,7 +6249,7 @@ ur_result_t UR_APICALL urBindlessImagesImageCopyExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_DEPTH < propName` +/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/test/conformance/memory/urMemImageGetInfo.cpp b/test/conformance/memory/urMemImageGetInfo.cpp index 233a61d683..3712d91a5e 100644 --- a/test/conformance/memory/urMemImageGetInfo.cpp +++ b/test/conformance/memory/urMemImageGetInfo.cpp @@ -5,93 +5,235 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include -#include - -struct urMemImageGetInfoTest : uur::urMemImageTestWithParam { - void SetUp() override { - UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); - UUR_RETURN_ON_FATAL_FAILURE( - uur::urMemImageTestWithParam::SetUp()); - } -}; - -static std::unordered_map image_info_size_map = { - {UR_IMAGE_INFO_FORMAT, sizeof(ur_image_format_t)}, - {UR_IMAGE_INFO_ELEMENT_SIZE, sizeof(size_t)}, - {UR_IMAGE_INFO_ROW_PITCH, sizeof(size_t)}, - {UR_IMAGE_INFO_SLICE_PITCH, sizeof(size_t)}, - {UR_IMAGE_INFO_WIDTH, sizeof(size_t)}, - {UR_IMAGE_INFO_HEIGHT, sizeof(size_t)}, - {UR_IMAGE_INFO_DEPTH, sizeof(size_t)}, -}; - -UUR_DEVICE_TEST_SUITE_P( - urMemImageGetInfoTest, - ::testing::Values(UR_IMAGE_INFO_FORMAT, UR_IMAGE_INFO_ELEMENT_SIZE, - UR_IMAGE_INFO_ROW_PITCH, UR_IMAGE_INFO_SLICE_PITCH, - UR_IMAGE_INFO_WIDTH, UR_IMAGE_INFO_HEIGHT, - UR_IMAGE_INFO_DEPTH), - uur::deviceTestWithParamPrinter); - -TEST_P(urMemImageGetInfoTest, Success) { - UUR_KNOWN_FAILURE_ON(uur::HIP{}); - // This fail is specific to the "Multi device testing" ci job. 
+#include + +using urMemImageGetInfoTest = uur::urMemImageTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urMemImageGetInfoTest); + +bool operator==(ur_image_format_t lhs, ur_image_format_t rhs) { + return lhs.channelOrder == rhs.channelOrder && + lhs.channelType == rhs.channelType; +} + +TEST_P(urMemImageGetInfoTest, SuccessFormat) { UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - ur_image_info_t info = getParam(); - size_t size = 0; + size_t property_size = 0; + ur_image_info_t property_name = UR_IMAGE_INFO_FORMAT; + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, info, 0, nullptr, &size), info); - ASSERT_NE(size, 0); + urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(ur_image_format_t), property_size); - if (const auto expected_size = image_info_size_map.find(info); - expected_size != image_info_size_map.end()) { - ASSERT_EQ(expected_size->second, size); - } else { - FAIL() << "Missing info value in image info size map"; - } + ur_image_format_t property_value = {UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32, + UR_IMAGE_CHANNEL_TYPE_FORCE_UINT32}; - std::vector info_data(size); - ASSERT_SUCCESS( - urMemImageGetInfo(image, info, size, info_data.data(), nullptr)); + ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, + &property_value, nullptr)); + + ASSERT_TRUE(property_value == image_format); +} + +TEST_P(urMemImageGetInfoTest, SuccessElementSize) { + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + + size_t property_size = 0; + ur_image_info_t property_name = UR_IMAGE_INFO_ELEMENT_SIZE; + + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(size_t), property_size); + + size_t property_value = 999; + ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, + &property_value, nullptr)); + + ASSERT_NE(property_value, 999); +} + +TEST_P(urMemImageGetInfoTest, SuccessRowPitch) { + 
UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + + size_t property_size = 0; + ur_image_info_t property_name = UR_IMAGE_INFO_ROW_PITCH; + + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(size_t), property_size); + + size_t property_value = 999; + ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, + &property_value, nullptr)); + + ASSERT_TRUE(property_value == image_desc.rowPitch || + property_value == (4 * sizeof(uint8_t)) * image_desc.width); +} + +TEST_P(urMemImageGetInfoTest, SuccessSlicePitch) { + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + + size_t property_size = 0; + ur_image_info_t property_name = UR_IMAGE_INFO_SLICE_PITCH; + + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(size_t), property_size); + + size_t property_value = 999; + ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, + &property_value, nullptr)); + + ASSERT_EQ(property_value, image_desc.slicePitch); +} + +TEST_P(urMemImageGetInfoTest, SuccessWidth) { + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + + size_t property_size = 0; + ur_image_info_t property_name = UR_IMAGE_INFO_WIDTH; + + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(size_t), property_size); + + size_t property_value = 999; + ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, + &property_value, nullptr)); + + ASSERT_EQ(property_value, image_desc.width); +} + +TEST_P(urMemImageGetInfoTest, SuccessHeight) { + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + + size_t property_size = 0; + ur_image_info_t property_name = UR_IMAGE_INFO_HEIGHT; + + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(size_t), property_size); + + 
size_t property_value = 999; + ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, + &property_value, nullptr)); + + ASSERT_EQ(property_value, image_desc.height); +} + +TEST_P(urMemImageGetInfoTest, SuccessDepth) { + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + + size_t property_size = 0; + ur_image_info_t property_name = UR_IMAGE_INFO_DEPTH; + + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(size_t), property_size); + + size_t property_value = 999; + ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, + &property_value, nullptr)); + + ASSERT_TRUE(property_value == image_desc.depth || property_value == 0); +} + +TEST_P(urMemImageGetInfoTest, SuccessArraySize) { + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + + size_t property_size = 0; + ur_image_info_t property_name = UR_IMAGE_INFO_ARRAY_SIZE; + + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(size_t), property_size); + + size_t property_value = 999; + ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, + &property_value, nullptr)); + + ASSERT_TRUE(property_value == image_desc.depth || property_value == 0); +} + +TEST_P(urMemImageGetInfoTest, SuccessNumMipMaps) { + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + + size_t property_size = 0; + ur_image_info_t property_name = UR_IMAGE_INFO_NUM_MIP_LEVELS; + + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(uint32_t), property_size); + + uint32_t property_value = 999; + ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, + &property_value, nullptr)); + + ASSERT_EQ(property_value, image_desc.numMipLevel); +} + +TEST_P(urMemImageGetInfoTest, SuccessNumSamples) { + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + + size_t property_size = 0; 
+ ur_image_info_t property_name = UR_IMAGE_INFO_NUM_SAMPLES; + + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(uint32_t), property_size); + + uint32_t property_value = 999; + ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, + &property_value, nullptr)); + + ASSERT_EQ(property_value, image_desc.numSamples); } TEST_P(urMemImageGetInfoTest, InvalidNullHandleImage) { - size_t info_size = 0; + size_t property_size = 0; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urMemImageGetInfo(nullptr, UR_IMAGE_INFO_FORMAT, - sizeof(size_t), &info_size, nullptr)); + sizeof(size_t), &property_size, nullptr)); } TEST_P(urMemImageGetInfoTest, InvalidEnumerationImageInfoType) { - size_t info_size = 0; + size_t property_size = 0; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_ENUMERATION, urMemImageGetInfo(image, UR_IMAGE_INFO_FORCE_UINT32, - sizeof(size_t), &info_size, nullptr)); + sizeof(size_t), &property_size, nullptr)); } TEST_P(urMemImageGetInfoTest, InvalidSizeZero) { - size_t info_size = 0; - ASSERT_EQ_RESULT( - urMemImageGetInfo(image, UR_IMAGE_INFO_FORMAT, 0, &info_size, nullptr), - UR_RESULT_ERROR_INVALID_SIZE); + size_t property_size = 0; + ASSERT_EQ_RESULT(urMemImageGetInfo(image, UR_IMAGE_INFO_FORMAT, 0, + &property_size, nullptr), + UR_RESULT_ERROR_INVALID_SIZE); } TEST_P(urMemImageGetInfoTest, InvalidSizeSmall) { // This fail is specific to the "Multi device testing" ci job. 
UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - int info_size = 0; + int property_size = 0; ASSERT_EQ_RESULT(urMemImageGetInfo(image, UR_IMAGE_INFO_FORMAT, - sizeof(info_size) - 1, &info_size, + sizeof(property_size) - 1, &property_size, nullptr), UR_RESULT_ERROR_INVALID_SIZE); } TEST_P(urMemImageGetInfoTest, InvalidNullPointerParamValue) { - size_t info_size = 0; + size_t property_size = 0; ASSERT_EQ_RESULT(urMemImageGetInfo(image, UR_IMAGE_INFO_FORMAT, - sizeof(info_size), nullptr, nullptr), + sizeof(property_size), nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_POINTER); } diff --git a/test/conformance/queue/urQueueGetInfo.cpp b/test/conformance/queue/urQueueGetInfo.cpp index f9ce054c3c..4de352d805 100644 --- a/test/conformance/queue/urQueueGetInfo.cpp +++ b/test/conformance/queue/urQueueGetInfo.cpp @@ -170,30 +170,34 @@ struct urQueueGetInfoDeviceQueueTestWithInfoParam : public uur::urQueueTest { UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urQueueGetInfoDeviceQueueTestWithInfoParam); -TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, DeviceDefault) { - size_t size = 0; +TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, SuccessDeviceDefault) { + size_t property_size = 0; ur_queue_info_t property_name = UR_QUEUE_INFO_DEVICE_DEFAULT; + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urQueueGetInfo(queue, property_name, 0, nullptr, &size), property_name); - ASSERT_EQ(sizeof(ur_queue_handle_t), size); + urQueueGetInfo(queue, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(ur_queue_handle_t), property_size); ur_queue_handle_t returned_queue = nullptr; - ASSERT_SUCCESS( - urQueueGetInfo(queue, property_name, size, &returned_queue, nullptr)); + ASSERT_SUCCESS(urQueueGetInfo(queue, property_name, property_size, + &returned_queue, nullptr)); ASSERT_EQ(queue, returned_queue); } -TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, Size) { - size_t size = 0; +TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, SuccessSize) { + size_t property_size = 0; 
ur_queue_info_t property_name = UR_QUEUE_INFO_SIZE; + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urQueueGetInfo(queue, property_name, 0, nullptr, &size), property_name); - ASSERT_EQ(sizeof(uint32_t), size); + urQueueGetInfo(queue, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(sizeof(uint32_t), property_size); uint32_t returned_size = 0; - ASSERT_SUCCESS( - urQueueGetInfo(queue, property_name, size, &returned_size, nullptr)); + ASSERT_SUCCESS(urQueueGetInfo(queue, property_name, property_size, + &returned_size, nullptr)); ASSERT_GT(returned_size, 0); } diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index 3d884a44b5..4f6aa816ea 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -358,44 +358,6 @@ template struct urSamplerTestWithParam : urContextTestWithParam { ur_sampler_desc_t sampler_desc; }; -template struct urMemImageTestWithParam : urContextTestWithParam { - void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); - ur_bool_t imageSupported = false; - ASSERT_SUCCESS(urDeviceGetInfo(this->device, UR_DEVICE_INFO_IMAGE_SUPPORTED, - sizeof(ur_bool_t), &imageSupported, - nullptr)); - if (!imageSupported) { - GTEST_SKIP(); - } - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( - urMemImageCreate(this->context, UR_MEM_FLAG_READ_WRITE, &format, &desc, - nullptr, &image)); - ASSERT_NE(nullptr, image); - } - - void TearDown() override { - if (image) { - EXPECT_SUCCESS(urMemRelease(image)); - } - UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); - } - ur_mem_handle_t image = nullptr; - ur_image_format_t format = {UR_IMAGE_CHANNEL_ORDER_RGBA, - UR_IMAGE_CHANNEL_TYPE_FLOAT}; - ur_image_desc_t desc = {UR_STRUCTURE_TYPE_IMAGE_DESC, // stype - nullptr, // pNext - UR_MEM_TYPE_IMAGE1D, // mem object type - 1024, // image width - 1, // image height - 1, // image depth - 1, // array size - 0, // row 
pitch - 0, // slice pitch - 0, // mip levels - 0}; // num samples -}; - struct urQueueTest : urContextTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urContextTest::SetUp()); From d18935cdebce53258d5ed8533ec50c859b685248 Mon Sep 17 00:00:00 2001 From: Martin Grant Date: Fri, 24 Jan 2025 11:19:26 +0000 Subject: [PATCH 39/46] Revert "Move urMemImageGetInfo success test from a switch to individual test" --- include/ur_api.h | 10 +- include/ur_print.hpp | 48 ---- scripts/core/memory.yml | 6 - source/adapters/opencl/memory.cpp | 99 ------- source/loader/layers/validation/ur_valddi.cpp | 4 +- source/loader/ur_libapi.cpp | 4 +- source/ur_api.cpp | 4 +- test/conformance/memory/urMemImageGetInfo.cpp | 254 ++++-------------- test/conformance/queue/urQueueGetInfo.cpp | 28 +- .../testing/include/uur/fixtures.h | 38 +++ 10 files changed, 114 insertions(+), 381 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 684a3bb2b3..ad88f3ac08 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -3324,12 +3324,6 @@ typedef enum ur_image_info_t { UR_IMAGE_INFO_HEIGHT = 5, /// [size_t] image depth UR_IMAGE_INFO_DEPTH = 6, - /// [size_t] array size - UR_IMAGE_INFO_ARRAY_SIZE = 7, - /// [uint32_t] number of MIP levels - UR_IMAGE_INFO_NUM_MIP_LEVELS = 8, - /// [uint32_t] number of samples - UR_IMAGE_INFO_NUM_SAMPLES = 9, /// @cond UR_IMAGE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -3843,7 +3837,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hMemory` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` +/// + `::UR_IMAGE_INFO_DEPTH < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. 
/// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -9505,7 +9499,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` +/// + `::UR_IMAGE_INFO_DEPTH < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/include/ur_print.hpp b/include/ur_print.hpp index d8873ae456..2e6f7d715d 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -6466,15 +6466,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_image_info_t value) { case UR_IMAGE_INFO_DEPTH: os << "UR_IMAGE_INFO_DEPTH"; break; - case UR_IMAGE_INFO_ARRAY_SIZE: - os << "UR_IMAGE_INFO_ARRAY_SIZE"; - break; - case UR_IMAGE_INFO_NUM_MIP_LEVELS: - os << "UR_IMAGE_INFO_NUM_MIP_LEVELS"; - break; - case UR_IMAGE_INFO_NUM_SAMPLES: - os << "UR_IMAGE_INFO_NUM_SAMPLES"; - break; default: os << "unknown enumerator"; break; @@ -6583,45 +6574,6 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, os << ")"; } break; - case UR_IMAGE_INFO_ARRAY_SIZE: { - const size_t *tptr = (const size_t *)ptr; - if (sizeof(size_t) > size) { - os << "invalid size (is: " << size << ", expected: >=" << sizeof(size_t) - << ")"; - return UR_RESULT_ERROR_INVALID_SIZE; - } - os << (const void *)(tptr) << " ("; - - os << *tptr; - - os << ")"; - } break; - case UR_IMAGE_INFO_NUM_MIP_LEVELS: { - const uint32_t *tptr = (const uint32_t *)ptr; - if (sizeof(uint32_t) > size) { - os << "invalid size (is: " << size << ", expected: >=" << sizeof(uint32_t) - << ")"; - return UR_RESULT_ERROR_INVALID_SIZE; - } - os << (const void *)(tptr) << " ("; - - os << *tptr; - - os << ")"; - } break; - case UR_IMAGE_INFO_NUM_SAMPLES: { - const uint32_t *tptr = (const uint32_t *)ptr; - if (sizeof(uint32_t) > size) { - os << "invalid size (is: " << size << ", expected: >=" 
<< sizeof(uint32_t) - << ")"; - return UR_RESULT_ERROR_INVALID_SIZE; - } - os << (const void *)(tptr) << " ("; - - os << *tptr; - - os << ")"; - } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; diff --git a/scripts/core/memory.yml b/scripts/core/memory.yml index a20693f8a9..75f68d8e9a 100644 --- a/scripts/core/memory.yml +++ b/scripts/core/memory.yml @@ -160,12 +160,6 @@ etors: desc: "[size_t] image height" - name: DEPTH desc: "[size_t] image depth" - - name: ARRAY_SIZE - desc: "[size_t] array size" - - name: NUM_MIP_LEVELS - desc: "[uint32_t] number of MIP levels" - - name: NUM_SAMPLES - desc: "[uint32_t] number of samples" --- #-------------------------------------------------------------------------- type: struct desc: "Image format including channel layout and data type" diff --git a/source/adapters/opencl/memory.cpp b/source/adapters/opencl/memory.cpp index c420b920df..201df1f678 100644 --- a/source/adapters/opencl/memory.cpp +++ b/source/adapters/opencl/memory.cpp @@ -10,92 +10,6 @@ #include "common.hpp" -#include - -const std::unordered_map - ChannelOrderMap = { - {UR_IMAGE_CHANNEL_ORDER_A, CL_A}, - {UR_IMAGE_CHANNEL_ORDER_R, CL_R}, - {UR_IMAGE_CHANNEL_ORDER_RG, CL_RG}, - {UR_IMAGE_CHANNEL_ORDER_RA, CL_RA}, - {UR_IMAGE_CHANNEL_ORDER_RGB, CL_RGB}, - {UR_IMAGE_CHANNEL_ORDER_RGBA, CL_RGBA}, - {UR_IMAGE_CHANNEL_ORDER_BGRA, CL_BGRA}, - {UR_IMAGE_CHANNEL_ORDER_ARGB, CL_ARGB}, - {UR_IMAGE_CHANNEL_ORDER_ABGR, CL_ABGR}, - {UR_IMAGE_CHANNEL_ORDER_INTENSITY, CL_INTENSITY}, - {UR_IMAGE_CHANNEL_ORDER_LUMINANCE, CL_LUMINANCE}, - {UR_IMAGE_CHANNEL_ORDER_RX, CL_Rx}, - {UR_IMAGE_CHANNEL_ORDER_RGX, CL_RGx}, - {UR_IMAGE_CHANNEL_ORDER_RGBX, CL_RGBx}, - {UR_IMAGE_CHANNEL_ORDER_SRGBA, CL_sRGBA}, -}; - -const std::unordered_map - ChannelTypeMap = { - {UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, CL_SNORM_INT8}, - {UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, CL_SNORM_INT16}, - {UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, CL_UNORM_INT8}, - 
{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, CL_UNORM_INT16}, - {UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565, CL_UNORM_SHORT_565}, - {UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555, CL_UNORM_SHORT_555}, - {UR_IMAGE_CHANNEL_TYPE_INT_101010, CL_UNORM_INT_101010}, - {UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8, CL_SIGNED_INT8}, - {UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16, CL_SIGNED_INT16}, - {UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32, CL_SIGNED_INT32}, - {UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, CL_UNSIGNED_INT8}, - {UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, CL_UNSIGNED_INT16}, - {UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, CL_UNSIGNED_INT32}, - {UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT, CL_HALF_FLOAT}, - {UR_IMAGE_CHANNEL_TYPE_FLOAT, CL_FLOAT}, -}; - -cl_image_format mapURImageFormatToCL(const ur_image_format_t &PImageFormat) { - cl_image_format CLImageFormat = {UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32, - UR_IMAGE_CHANNEL_TYPE_FORCE_UINT32}; - - auto channelOrderIt = ChannelOrderMap.find(PImageFormat.channelOrder); - if (channelOrderIt != ChannelOrderMap.end()) { - CLImageFormat.image_channel_order = channelOrderIt->second; - } - - auto channelTypeIt = ChannelTypeMap.find(PImageFormat.channelType); - if (channelTypeIt != ChannelTypeMap.end()) { - CLImageFormat.image_channel_data_type = channelTypeIt->second; - } - - return CLImageFormat; -} - -ur_image_format_t mapCLImageFormatToUR(const cl_image_format *PImageFormat) { - ur_image_format_t URImageFormat = {UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32, - UR_IMAGE_CHANNEL_TYPE_FORCE_UINT32}; - - auto reverseChannelOrderIt = - std::find_if(ChannelOrderMap.begin(), ChannelOrderMap.end(), - [PImageFormat](const auto &pair) { - return pair.second == PImageFormat->image_channel_order; - }); - if (reverseChannelOrderIt != ChannelOrderMap.end()) { - URImageFormat.channelOrder = reverseChannelOrderIt->first; - } - - URImageFormat.channelOrder = (reverseChannelOrderIt != ChannelOrderMap.end()) - ? 
reverseChannelOrderIt->first - : UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32; - - auto reverseChannelTypeIt = std::find_if( - ChannelTypeMap.begin(), ChannelTypeMap.end(), - [PImageFormat](const auto &pair) { - return pair.second == PImageFormat->image_channel_data_type; - }); - if (reverseChannelTypeIt != ChannelTypeMap.end()) { - URImageFormat.channelType = reverseChannelTypeIt->first; - } - - return URImageFormat; -} - cl_image_format mapURImageFormatToCL(const ur_image_format_t *PImageFormat) { cl_image_format CLImageFormat; switch (PImageFormat->channelOrder) { @@ -260,12 +174,6 @@ cl_int mapURMemImageInfoToCL(ur_image_info_t URPropName) { return CL_IMAGE_HEIGHT; case UR_IMAGE_INFO_DEPTH: return CL_IMAGE_DEPTH; - case UR_IMAGE_INFO_ARRAY_SIZE: - return CL_IMAGE_ARRAY_SIZE; - case UR_IMAGE_INFO_NUM_MIP_LEVELS: - return CL_IMAGE_NUM_MIP_LEVELS; - case UR_IMAGE_INFO_NUM_SAMPLES: - return CL_IMAGE_NUM_SAMPLES; default: return -1; } @@ -489,14 +397,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, CL_RETURN_ON_FAILURE(ClResult); if (pPropSizeRet) { *pPropSizeRet = CheckPropSize; - } else { - if (propName == UR_IMAGE_INFO_FORMAT) { - ur_image_format_t format = mapCLImageFormatToUR( - reinterpret_cast(pPropValue)); - return ReturnValue(format); - } } - return UR_RESULT_SUCCESS; } diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index d86c3ae98d..19ab908ee3 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -1470,7 +1470,7 @@ __urdlllocal ur_result_t UR_APICALL urMemImageGetInfo( if (pPropValue == NULL && pPropSizeRet == NULL) return UR_RESULT_ERROR_INVALID_NULL_POINTER; - if (UR_IMAGE_INFO_NUM_SAMPLES < propName) + if (UR_IMAGE_INFO_DEPTH < propName) return UR_RESULT_ERROR_INVALID_ENUMERATION; if (propSize == 0 && pPropValue != NULL) @@ -7069,7 +7069,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( 
if (pPropValue == NULL && pPropSizeRet == NULL) return UR_RESULT_ERROR_INVALID_NULL_POINTER; - if (UR_IMAGE_INFO_NUM_SAMPLES < propName) + if (UR_IMAGE_INFO_DEPTH < propName) return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 744aadf1a5..5b55b71d4d 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -1943,7 +1943,7 @@ ur_result_t UR_APICALL urMemGetInfo( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hMemory` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` +/// + `::UR_IMAGE_INFO_DEPTH < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -7120,7 +7120,7 @@ ur_result_t UR_APICALL urBindlessImagesImageCopyExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` +/// + `::UR_IMAGE_INFO_DEPTH < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 82b58be9b5..15cb0442b1 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -1720,7 +1720,7 @@ ur_result_t UR_APICALL urMemGetInfo( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hMemory` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` +/// + `::UR_IMAGE_INFO_DEPTH < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. 
/// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -6249,7 +6249,7 @@ ur_result_t UR_APICALL urBindlessImagesImageCopyExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_IMAGE_INFO_NUM_SAMPLES < propName` +/// + `::UR_IMAGE_INFO_DEPTH < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/test/conformance/memory/urMemImageGetInfo.cpp b/test/conformance/memory/urMemImageGetInfo.cpp index 3712d91a5e..233a61d683 100644 --- a/test/conformance/memory/urMemImageGetInfo.cpp +++ b/test/conformance/memory/urMemImageGetInfo.cpp @@ -5,235 +5,93 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include -#include - -using urMemImageGetInfoTest = uur::urMemImageTest; -UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urMemImageGetInfoTest); - -bool operator==(ur_image_format_t lhs, ur_image_format_t rhs) { - return lhs.channelOrder == rhs.channelOrder && - lhs.channelType == rhs.channelType; -} - -TEST_P(urMemImageGetInfoTest, SuccessFormat) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - - size_t property_size = 0; - ur_image_info_t property_name = UR_IMAGE_INFO_FORMAT; - - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(ur_image_format_t), property_size); - - ur_image_format_t property_value = {UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32, - UR_IMAGE_CHANNEL_TYPE_FORCE_UINT32}; - - ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, - &property_value, nullptr)); - - ASSERT_TRUE(property_value == image_format); -} - -TEST_P(urMemImageGetInfoTest, SuccessElementSize) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - - size_t property_size = 0; - ur_image_info_t property_name = UR_IMAGE_INFO_ELEMENT_SIZE; - - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), 
- property_name); - ASSERT_EQ(sizeof(size_t), property_size); - - size_t property_value = 999; - ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, - &property_value, nullptr)); - - ASSERT_NE(property_value, 999); -} - -TEST_P(urMemImageGetInfoTest, SuccessRowPitch) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - - size_t property_size = 0; - ur_image_info_t property_name = UR_IMAGE_INFO_ROW_PITCH; - - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(size_t), property_size); - - size_t property_value = 999; - ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, - &property_value, nullptr)); - - ASSERT_TRUE(property_value == image_desc.rowPitch || - property_value == (4 * sizeof(uint8_t)) * image_desc.width); -} - -TEST_P(urMemImageGetInfoTest, SuccessSlicePitch) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - - size_t property_size = 0; - ur_image_info_t property_name = UR_IMAGE_INFO_SLICE_PITCH; - - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(size_t), property_size); - - size_t property_value = 999; - ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, - &property_value, nullptr)); - - ASSERT_EQ(property_value, image_desc.slicePitch); -} - -TEST_P(urMemImageGetInfoTest, SuccessWidth) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - - size_t property_size = 0; - ur_image_info_t property_name = UR_IMAGE_INFO_WIDTH; - - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(size_t), property_size); - - size_t property_value = 999; - ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, - &property_value, nullptr)); - - ASSERT_EQ(property_value, image_desc.width); -} - -TEST_P(urMemImageGetInfoTest, SuccessHeight) { - 
UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - - size_t property_size = 0; - ur_image_info_t property_name = UR_IMAGE_INFO_HEIGHT; - - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(size_t), property_size); - - size_t property_value = 999; - ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, - &property_value, nullptr)); - - ASSERT_EQ(property_value, image_desc.height); -} - -TEST_P(urMemImageGetInfoTest, SuccessDepth) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - - size_t property_size = 0; - ur_image_info_t property_name = UR_IMAGE_INFO_DEPTH; - - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(size_t), property_size); - - size_t property_value = 999; - ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, - &property_value, nullptr)); - - ASSERT_TRUE(property_value == image_desc.depth || property_value == 0); -} - -TEST_P(urMemImageGetInfoTest, SuccessArraySize) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - - size_t property_size = 0; - ur_image_info_t property_name = UR_IMAGE_INFO_ARRAY_SIZE; - - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(size_t), property_size); - - size_t property_value = 999; - ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, - &property_value, nullptr)); - - ASSERT_TRUE(property_value == image_desc.depth || property_value == 0); -} - -TEST_P(urMemImageGetInfoTest, SuccessNumMipMaps) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - - size_t property_size = 0; - ur_image_info_t property_name = UR_IMAGE_INFO_NUM_MIP_LEVELS; - - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(uint32_t), property_size); - - uint32_t 
property_value = 999; - ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, - &property_value, nullptr)); - - ASSERT_EQ(property_value, image_desc.numMipLevel); -} - -TEST_P(urMemImageGetInfoTest, SuccessNumSamples) { +#include + +struct urMemImageGetInfoTest : uur::urMemImageTestWithParam { + void SetUp() override { + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); + UUR_RETURN_ON_FATAL_FAILURE( + uur::urMemImageTestWithParam::SetUp()); + } +}; + +static std::unordered_map image_info_size_map = { + {UR_IMAGE_INFO_FORMAT, sizeof(ur_image_format_t)}, + {UR_IMAGE_INFO_ELEMENT_SIZE, sizeof(size_t)}, + {UR_IMAGE_INFO_ROW_PITCH, sizeof(size_t)}, + {UR_IMAGE_INFO_SLICE_PITCH, sizeof(size_t)}, + {UR_IMAGE_INFO_WIDTH, sizeof(size_t)}, + {UR_IMAGE_INFO_HEIGHT, sizeof(size_t)}, + {UR_IMAGE_INFO_DEPTH, sizeof(size_t)}, +}; + +UUR_DEVICE_TEST_SUITE_P( + urMemImageGetInfoTest, + ::testing::Values(UR_IMAGE_INFO_FORMAT, UR_IMAGE_INFO_ELEMENT_SIZE, + UR_IMAGE_INFO_ROW_PITCH, UR_IMAGE_INFO_SLICE_PITCH, + UR_IMAGE_INFO_WIDTH, UR_IMAGE_INFO_HEIGHT, + UR_IMAGE_INFO_DEPTH), + uur::deviceTestWithParamPrinter); + +TEST_P(urMemImageGetInfoTest, Success) { + UUR_KNOWN_FAILURE_ON(uur::HIP{}); + // This fail is specific to the "Multi device testing" ci job. 
UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - size_t property_size = 0; - ur_image_info_t property_name = UR_IMAGE_INFO_NUM_SAMPLES; - + ur_image_info_t info = getParam(); + size_t size = 0; ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urMemImageGetInfo(image, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(uint32_t), property_size); + urMemImageGetInfo(image, info, 0, nullptr, &size), info); + ASSERT_NE(size, 0); - uint32_t property_value = 999; - ASSERT_SUCCESS(urMemImageGetInfo(image, property_name, property_size, - &property_value, nullptr)); + if (const auto expected_size = image_info_size_map.find(info); + expected_size != image_info_size_map.end()) { + ASSERT_EQ(expected_size->second, size); + } else { + FAIL() << "Missing info value in image info size map"; + } - ASSERT_EQ(property_value, image_desc.numSamples); + std::vector info_data(size); + ASSERT_SUCCESS( + urMemImageGetInfo(image, info, size, info_data.data(), nullptr)); } TEST_P(urMemImageGetInfoTest, InvalidNullHandleImage) { - size_t property_size = 0; + size_t info_size = 0; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urMemImageGetInfo(nullptr, UR_IMAGE_INFO_FORMAT, - sizeof(size_t), &property_size, nullptr)); + sizeof(size_t), &info_size, nullptr)); } TEST_P(urMemImageGetInfoTest, InvalidEnumerationImageInfoType) { - size_t property_size = 0; + size_t info_size = 0; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_ENUMERATION, urMemImageGetInfo(image, UR_IMAGE_INFO_FORCE_UINT32, - sizeof(size_t), &property_size, nullptr)); + sizeof(size_t), &info_size, nullptr)); } TEST_P(urMemImageGetInfoTest, InvalidSizeZero) { - size_t property_size = 0; - ASSERT_EQ_RESULT(urMemImageGetInfo(image, UR_IMAGE_INFO_FORMAT, 0, - &property_size, nullptr), - UR_RESULT_ERROR_INVALID_SIZE); + size_t info_size = 0; + ASSERT_EQ_RESULT( + urMemImageGetInfo(image, UR_IMAGE_INFO_FORMAT, 0, &info_size, nullptr), + UR_RESULT_ERROR_INVALID_SIZE); } TEST_P(urMemImageGetInfoTest, InvalidSizeSmall) { // This 
fail is specific to the "Multi device testing" ci job. UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); - int property_size = 0; + int info_size = 0; ASSERT_EQ_RESULT(urMemImageGetInfo(image, UR_IMAGE_INFO_FORMAT, - sizeof(property_size) - 1, &property_size, + sizeof(info_size) - 1, &info_size, nullptr), UR_RESULT_ERROR_INVALID_SIZE); } TEST_P(urMemImageGetInfoTest, InvalidNullPointerParamValue) { - size_t property_size = 0; + size_t info_size = 0; ASSERT_EQ_RESULT(urMemImageGetInfo(image, UR_IMAGE_INFO_FORMAT, - sizeof(property_size), nullptr, nullptr), + sizeof(info_size), nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_POINTER); } diff --git a/test/conformance/queue/urQueueGetInfo.cpp b/test/conformance/queue/urQueueGetInfo.cpp index 4de352d805..f9ce054c3c 100644 --- a/test/conformance/queue/urQueueGetInfo.cpp +++ b/test/conformance/queue/urQueueGetInfo.cpp @@ -170,34 +170,30 @@ struct urQueueGetInfoDeviceQueueTestWithInfoParam : public uur::urQueueTest { UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urQueueGetInfoDeviceQueueTestWithInfoParam); -TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, SuccessDeviceDefault) { - size_t property_size = 0; +TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, DeviceDefault) { + size_t size = 0; ur_queue_info_t property_name = UR_QUEUE_INFO_DEVICE_DEFAULT; - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urQueueGetInfo(queue, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(ur_queue_handle_t), property_size); + urQueueGetInfo(queue, property_name, 0, nullptr, &size), property_name); + ASSERT_EQ(sizeof(ur_queue_handle_t), size); ur_queue_handle_t returned_queue = nullptr; - ASSERT_SUCCESS(urQueueGetInfo(queue, property_name, property_size, - &returned_queue, nullptr)); + ASSERT_SUCCESS( + urQueueGetInfo(queue, property_name, size, &returned_queue, nullptr)); ASSERT_EQ(queue, returned_queue); } -TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, SuccessSize) { - size_t property_size = 0; 
+TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, Size) { + size_t size = 0; ur_queue_info_t property_name = UR_QUEUE_INFO_SIZE; - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urQueueGetInfo(queue, property_name, 0, nullptr, &property_size), - property_name); - ASSERT_EQ(sizeof(uint32_t), property_size); + urQueueGetInfo(queue, property_name, 0, nullptr, &size), property_name); + ASSERT_EQ(sizeof(uint32_t), size); uint32_t returned_size = 0; - ASSERT_SUCCESS(urQueueGetInfo(queue, property_name, property_size, - &returned_size, nullptr)); + ASSERT_SUCCESS( + urQueueGetInfo(queue, property_name, size, &returned_size, nullptr)); ASSERT_GT(returned_size, 0); } diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index 4f6aa816ea..3d884a44b5 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -358,6 +358,44 @@ template struct urSamplerTestWithParam : urContextTestWithParam { ur_sampler_desc_t sampler_desc; }; +template struct urMemImageTestWithParam : urContextTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); + ur_bool_t imageSupported = false; + ASSERT_SUCCESS(urDeviceGetInfo(this->device, UR_DEVICE_INFO_IMAGE_SUPPORTED, + sizeof(ur_bool_t), &imageSupported, + nullptr)); + if (!imageSupported) { + GTEST_SKIP(); + } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urMemImageCreate(this->context, UR_MEM_FLAG_READ_WRITE, &format, &desc, + nullptr, &image)); + ASSERT_NE(nullptr, image); + } + + void TearDown() override { + if (image) { + EXPECT_SUCCESS(urMemRelease(image)); + } + UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); + } + ur_mem_handle_t image = nullptr; + ur_image_format_t format = {UR_IMAGE_CHANNEL_ORDER_RGBA, + UR_IMAGE_CHANNEL_TYPE_FLOAT}; + ur_image_desc_t desc = {UR_STRUCTURE_TYPE_IMAGE_DESC, // stype + nullptr, // pNext + UR_MEM_TYPE_IMAGE1D, // mem object type + 1024, // image 
width + 1, // image height + 1, // image depth + 1, // array size + 0, // row pitch + 0, // slice pitch + 0, // mip levels + 0}; // num samples +}; + struct urQueueTest : urContextTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urContextTest::SetUp()); From 241636f57e3c1f451bcbfec4d5520d325218f8d1 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Mon, 27 Jan 2025 17:35:09 +0000 Subject: [PATCH 40/46] Remove unnecessary OpenCL KNOWN_FAILURE from urKernelGetInfoTest. --- test/conformance/kernel/urKernelGetInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/conformance/kernel/urKernelGetInfo.cpp b/test/conformance/kernel/urKernelGetInfo.cpp index 4748f8f96a..65694b5bdd 100644 --- a/test/conformance/kernel/urKernelGetInfo.cpp +++ b/test/conformance/kernel/urKernelGetInfo.cpp @@ -121,7 +121,7 @@ TEST_P(urKernelGetInfoTest, SuccessAttributes) { } TEST_P(urKernelGetInfoTest, SuccessNumRegs) { - UUR_KNOWN_FAILURE_ON(uur::HIP{}, uur::OpenCL{}); + UUR_KNOWN_FAILURE_ON(uur::HIP{}); ur_kernel_info_t property_name = UR_KERNEL_INFO_NUM_REGS; size_t property_size = 0; From 43e7f2d0027fabfc17487a2687bd738bedd3875a Mon Sep 17 00:00:00 2001 From: "Mateusz P. 
Nowak" Date: Tue, 28 Jan 2025 09:56:05 +0000 Subject: [PATCH 41/46] enqueueMemBufferMap bugfix --- .../adapters/level_zero/v2/queue_immediate_in_order.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 3d50c52a56..e087c3ae2e 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -604,10 +604,11 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( ZE2UR_CALL(zeCommandListAppendWaitOnEvents, (commandListManager.getZeCommandList(), waitList.second, waitList.first)); - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - } + } + + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); } if (blockingMap) { From 4a916cc14e82bf1b18294e38db9a0e439df67404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Wed, 18 Dec 2024 13:06:33 +0000 Subject: [PATCH 42/46] # This is a combination of 4 commits. # This is the 1st commit message: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parent 982416300132e778138f9d02bbbb2cde4e9f6249 author Mikołaj Komar 1734527193 +0000 committer Mateusz P. 
Nowak 1738059437 +0000 Prepare ground for command_buffer in v2 Enforce in order list usage, and add initialization and destruction to buffer Add initial support of command buffers to adapter v2 Update UR calls handling Remove unnecessary comment Move not implemented command buffer commands to previous position Fix most issues with code Fix formatting and modify queue_api template Move command buffer cleanup to destructor Use cached command lists instead of created ones Remove not needed function and change phrasing Add initial implementation of command list manager Use list manager instead of custom implementation in queue Optimalize imports Remove not needed destructor Revert "Fix formatting" This reverts commit 545e57732480487e7f36d87632b9c56307f39f49. # This is the commit message #2: Move command list close to the command buffer # This is the commit message #3: Moved try outside function block # This is the commit message #4: Move enqueue generic command list back to queue --- Testing/Temporary/CTestCostData.txt | 1 + ft.sh | 21 ++ scripts/templates/queue_api.cpp.mako | 2 + scripts/templates/queue_api.hpp.mako | 7 + source/adapters/level_zero/CMakeLists.txt | 4 + source/adapters/level_zero/v2/api.cpp | 58 ------ .../adapters/level_zero/v2/command_buffer.cpp | 185 ++++++++++++++++++ .../adapters/level_zero/v2/command_buffer.hpp | 57 ++++++ .../level_zero/v2/command_list_manager.cpp | 118 +++++++++++ .../level_zero/v2/command_list_manager.hpp | 54 +++++ source/adapters/level_zero/v2/queue_api.cpp | 3 + source/adapters/level_zero/v2/queue_api.hpp | 8 + .../v2/queue_immediate_in_order.cpp | 100 +++++----- .../v2/queue_immediate_in_order.hpp | 17 ++ 14 files changed, 529 insertions(+), 106 deletions(-) create mode 100644 Testing/Temporary/CTestCostData.txt create mode 100644 ft.sh create mode 100644 source/adapters/level_zero/v2/command_buffer.cpp create mode 100644 source/adapters/level_zero/v2/command_buffer.hpp create mode 100644 
source/adapters/level_zero/v2/command_list_manager.cpp create mode 100644 source/adapters/level_zero/v2/command_list_manager.hpp diff --git a/Testing/Temporary/CTestCostData.txt b/Testing/Temporary/CTestCostData.txt new file mode 100644 index 0000000000..ed97d539c0 --- /dev/null +++ b/Testing/Temporary/CTestCostData.txt @@ -0,0 +1 @@ +--- diff --git a/ft.sh b/ft.sh new file mode 100644 index 0000000000..f7f73081f5 --- /dev/null +++ b/ft.sh @@ -0,0 +1,21 @@ +# add_test([=[enqueue-adapter_level_zero_v2]=] "/home/mateuszpn/pr2532/build/bin/test-enqueue" "--gtest_filter=*Level_Zero*") +# set_tests_properties([=[enqueue-adapter_level_zero_v2]=] PROPERTIES ENVIRONMENT +# "UR_ADAPTERS_FORCE_LOAD=\"/home/mateuszpn/pr2532/build/lib/libur_adapter_level_zero_v2.so.0.12.0\"" +# "LABELS "conformance;adapter_level_zero_v2" +# WORKING_DIRECTORY "/home/mateuszpn/pr2532/build/test/conformance/enqueue" _BACKTRACE_TRIPLES "/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;22;add_test;/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;32;do_add_test;/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;67;add_test_adapter;/home/mateuszpn/pr2532/test/conformance/CMakeLists.txt;78;add_conformance_test;/home/mateuszpn/pr2532/test/conformance/enqueue/CMakeLists.txt;6;add_conformance_test_with_kernels_environment;/home/mateuszpn/pr2532/test/conformance/enqueue/CMakeLists.txt;0;") + +# Set environment variable +export UR_ADAPTERS_FORCE_LOAD="/home/mateuszpn/pr2532/build/lib/libur_adapter_level_zero_v2.so.0.12.0" + +# Set working directory +#cd /home/mateuszpn/pr2532/build/test/conformance/$1 + +# Run the test with the specified filter +#/home/mateuszpn/pr2532/build/bin/test-$1 --gtest_filter=*Level_Zero* + +# Set working directory +cd /home/mateuszpn/pr2532/build/test/conformance/enqueue + +# Run the test with the specified filter +#/home/mateuszpn/pr2532/build/bin/test-enqueue --gtest_filter=urEnqueueMemBufferMapTestWithParam.MapSignalEvent*Level_Zero* 
+/home/mateuszpn/pr2532/build/bin/test-enqueue --gtest_filter=*Level_Zero* \ No newline at end of file diff --git a/scripts/templates/queue_api.cpp.mako b/scripts/templates/queue_api.cpp.mako index 89f857e007..14def952ac 100644 --- a/scripts/templates/queue_api.cpp.mako +++ b/scripts/templates/queue_api.cpp.mako @@ -20,6 +20,8 @@ from templates import helper as th * */ +// Do not edit. This file is auto generated from a template: scripts/templates/queue_api.cpp.mako + #include "queue_api.hpp" #include "ur_util.hpp" diff --git a/scripts/templates/queue_api.hpp.mako b/scripts/templates/queue_api.hpp.mako index 352abbeb43..46ed74ed33 100644 --- a/scripts/templates/queue_api.hpp.mako +++ b/scripts/templates/queue_api.hpp.mako @@ -20,9 +20,12 @@ from templates import helper as th * */ +// Do not edit. This file is auto generated from a template: scripts/templates/queue_api.hpp.mako + #pragma once #include +#include struct ur_queue_handle_t_ { virtual ~ur_queue_handle_t_(); @@ -32,4 +35,8 @@ struct ur_queue_handle_t_ { %for obj in th.get_queue_related_functions(specs, n, tags): virtual ${x}_result_t ${th.transform_queue_related_function_name(n, tags, obj, format=["type"])} = 0; %endfor + + virtual ur_result_t + enqueueCommandBuffer(ze_command_list_handle_t, ur_event_handle_t *, + uint32_t, const ur_event_handle_t *) = 0; }; diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 39031a700d..c75c870be7 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -145,7 +145,9 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp # v2-only sources + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_manager.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.hpp 
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.hpp @@ -159,7 +161,9 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/api.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_manager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.cpp diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 9ae9bddcb9..edd9687445 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -239,47 +239,6 @@ ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t -urCommandBufferCreateExp(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - const ur_exp_command_buffer_desc_t *pCommandBufferDesc, - ur_exp_command_buffer_handle_t *phCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendKernelLaunchExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, - uint32_t workDim, const size_t *pGlobalWorkOffset, - const 
size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferAppendUSMMemcpyExp( ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, size_t size, uint32_t numSyncPointsInWaitList, @@ -415,14 +374,6 @@ ur_result_t urCommandBufferAppendUSMAdviseExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urCommandBufferEnqueueExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferRetainCommandExp( ur_exp_command_buffer_command_handle_t hCommand) { logger::error("{} function not implemented!", __FUNCTION__); @@ -443,15 +394,6 @@ ur_result_t urCommandBufferUpdateKernelLaunchExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t -urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, - ur_exp_command_buffer_info_t propName, - size_t propSize, void *pPropValue, - size_t *pPropSizeRet) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferUpdateSignalEventExp( ur_exp_command_buffer_command_handle_t hCommand, ur_event_handle_t *phEvent) { diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp new 
file mode 100644 index 0000000000..46c8c6ae27 --- /dev/null +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -0,0 +1,185 @@ +//===--------- command_buffer.cpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "command_buffer.hpp" +#include "../helpers/kernel_helpers.hpp" +#include "../ur_interface_loader.hpp" +#include "logger/ur_logger.hpp" + +namespace { + +// Checks whether zeCommandListImmediateAppendCommandListsExp can be used for a +// given context. +void checkImmediateAppendSupport(ur_context_handle_t context) { + bool DriverSupportsImmediateAppend = + context->getPlatform()->ZeCommandListImmediateAppendExt.Supported; + + if (!DriverSupportsImmediateAppend) { + logger::error("Adapter v2 is used but " + "the current driver does not support the " + "zeCommandListImmediateAppendCommandListsExp entrypoint."); + std::abort(); + } +} + +} // namespace + +ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( + ur_context_handle_t context, ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, + const ur_exp_command_buffer_desc_t *desc) + : commandListManager( + context, device, + std::forward(commandList)), + isUpdatable(desc ? desc->isUpdatable : false) {} + +ur_result_t ur_exp_command_buffer_handle_t_::closeCommandList() { + // It is not allowed to append to command list from multiple threads. + std::scoped_lock guard(this->Mutex); + + // Close the command lists and have them ready for dispatch. 
+ ZE2UR_CALL(zeCommandListClose, (this->commandListManager.getZeCommandList())); + return UR_RESULT_SUCCESS; +} + +namespace ur::level_zero { + +ur_result_t +urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, + const ur_exp_command_buffer_desc_t *commandBufferDesc, + ur_exp_command_buffer_handle_t *commandBuffer) try { + checkImmediateAppendSupport(context); + + if (!context->getPlatform()->ZeMutableCmdListExt.Supported) { + throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; + uint32_t queueGroupOrdinal = + device->QueueGroup[queue_group_type::Compute].ZeOrdinal; + v2::raii::command_list_unique_handle zeCommandList = + context->commandListCache.getRegularCommandList(device->ZeDevice, true, + queueGroupOrdinal, true); + + *commandBuffer = new ur_exp_command_buffer_handle_t_( + context, device, std::move(zeCommandList), commandBufferDesc); + return UR_RESULT_SUCCESS; + +} catch (const std::bad_alloc &) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { + hCommandBuffer->RefCount.increment(); + return UR_RESULT_SUCCESS; +} catch (const std::bad_alloc &) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { + if (!hCommandBuffer->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + delete hCommandBuffer; + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { + UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(!hCommandBuffer->isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); + hCommandBuffer->closeCommandList(); + + hCommandBuffer->isFinalized = true; + return UR_RESULT_SUCCESS; +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendKernelLaunchExp( + ur_exp_command_buffer_handle_t commandBuffer, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *kernelAlternatives, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *syncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *eventWaitList, + ur_exp_command_buffer_sync_point_t *retSyncPoint, ur_event_handle_t *event, + ur_exp_command_buffer_command_handle_t *command) + + try { + // Need to know semantics + // - should they be checked before kernel execution or before kernel + // appending to list if latter then it is easy fix, if former then TODO + std::ignore = numEventsInWaitList; + std::ignore = eventWaitList; + std::ignore = event; + + // sync mechanic can be ignored, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = syncPointWaitList; + std::ignore = retSyncPoint; + + // TODO + std::ignore = numKernelAlternatives; + std::ignore = kernelAlternatives; + std::ignore = command; + UR_CALL(commandBuffer->commandListManager.appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, + nullptr, nullptr)); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferEnqueueExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) try { + return hQueue->enqueueCommandBuffer( + hCommandBuffer->commandListManager.getZeCommandList(), phEvent, + numEventsInWaitList, phEventWaitList); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + +ur_result_t +urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, + ur_exp_command_buffer_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) try { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{hCommandBuffer->RefCount.load()}); + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { + ur_exp_command_buffer_desc_t Descriptor{}; + Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + Descriptor.pNext = nullptr; + Descriptor.isUpdatable = hCommandBuffer->isUpdatable; + Descriptor.isInOrder = true; + Descriptor.enableProfiling = hCommandBuffer->isProfilingEnabled; + + return ReturnValue(Descriptor); + } + default: + assert(!"Command-buffer info request not implemented"); + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp new file mode 100644 index 0000000000..50a3d729fd --- /dev/null +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -0,0 +1,57 @@ +//===--------- command_buffer.hpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include "command_list_manager.hpp" +#include "common.hpp" +#include "context.hpp" +#include "kernel.hpp" +#include "queue_api.hpp" +#include + +struct command_buffer_profiling_t { + ur_exp_command_buffer_sync_point_t numEvents; + ze_kernel_timestamp_result_t *timestamps; +}; + +struct ur_exp_command_buffer_handle_t_ : public _ur_object { + ur_exp_command_buffer_handle_t_( + ur_context_handle_t context, ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, + const ur_exp_command_buffer_desc_t *desc); + ~ur_exp_command_buffer_handle_t_() = default; + ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ur_command_t commandType); + + ur_command_list_manager commandListManager; + + ur_result_t closeCommandList(); + + std::vector waitList; + + // Indicates if command-buffer commands can be updated after it is closed. + bool isUpdatable = false; + // Indicates if command buffer was finalized. + bool isFinalized = false; + // Command-buffer profiling is enabled. + bool isProfilingEnabled = false; +}; + +struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { + ur_exp_command_buffer_command_handle_t_(ur_exp_command_buffer_handle_t, + uint64_t); + + ~ur_exp_command_buffer_command_handle_t_(); + + // Command-buffer of this command. 
+ ur_exp_command_buffer_handle_t commandBuffer; + // L0 command ID identifying this command + uint64_t commandId; +}; diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp new file mode 100644 index 0000000000..fb50726053 --- /dev/null +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -0,0 +1,118 @@ +//===--------- command_list_manager.cpp - Level Zero Adapter -------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "command_list_manager.hpp" +#include "../helpers/kernel_helpers.hpp" +#include "../ur_interface_loader.hpp" +#include "context.hpp" +#include "kernel.hpp" + +ur_command_list_manager::ur_command_list_manager( + ur_context_handle_t context, ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, v2::event_flags_t flags, + ur_queue_handle_t_ *queue) + : context(context), device(device), + eventPool(context->eventPoolCache.borrow(device->Id.value(), flags)), + zeCommandList( + std::forward(commandList)), + queue(queue) { + UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); + UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); +} + +ur_command_list_manager::~ur_command_list_manager() { + ur::level_zero::urContextRelease(context); + ur::level_zero::urDeviceRelease(device); +} + +std::pair +ur_command_list_manager::getWaitListView(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) { + + waitList.resize(numWaitEvents); + for (uint32_t i = 0; i < numWaitEvents; i++) { + waitList[i] = phWaitEvents[i]->getZeEvent(); + } + + return {waitList.data(), static_cast(numWaitEvents)}; +} + +ur_event_handle_t
+ur_command_list_manager::getSignalEvent(ur_event_handle_t *hUserEvent, + ur_command_t commandType) { + if (hUserEvent && queue) { + *hUserEvent = eventPool->allocate(); + (*hUserEvent)->resetQueueAndCommand(queue, commandType); + return *hUserEvent; + } else { + return nullptr; + } +} + +ur_result_t ur_command_list_manager::appendKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendKernelLaunch"); + + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(device); + + std::scoped_lock Lock(this->Mutex, + hKernel->Mutex); + + ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]{}; + UR_CALL(calculateKernelWorkDimensions(hZeKernel, device, + zeThreadGroupDimensions, WG, workDim, + pGlobalWorkSize, pLocalWorkSize)); + + auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); + + auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); + + bool memoryMigrated = false; + auto memoryMigrate = [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + }; + + UR_CALL(hKernel->prepareForSubmission(context, device, pGlobalWorkOffset, + workDim, WG[0], WG[1], WG[2], + memoryMigrate)); + + if (memoryMigrated) { + // If memory was migrated, we don't need to pass the wait list to + // the copy command again. 
+ waitList.first = nullptr; + waitList.second = 0; + } + + TRACK_SCOPE_LATENCY( + "ur_command_list_manager::zeCommandListAppendLaunchKernel"); + auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions, + zeSignalEvent, waitList.second, waitList.first)); + + return UR_RESULT_SUCCESS; +} + +ze_command_list_handle_t ur_command_list_manager::getZeCommandList() { + return zeCommandList.get(); +} diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp new file mode 100644 index 0000000000..95cfa89250 --- /dev/null +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -0,0 +1,54 @@ +//===--------- command_list_manager.hpp - Level Zero Adapter -------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions.
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include "command_list_cache.hpp" +#include "common.hpp" +#include "event_pool_cache.hpp" +#include "queue_api.hpp" +#include + +struct ur_command_list_manager : public _ur_object { + + ur_command_list_manager(ur_context_handle_t context, + ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandList, + v2::event_flags_t flags = v2::EVENT_FLAGS_COUNTER, + ur_queue_handle_t_ *queue = nullptr); + ~ur_command_list_manager(); + + ur_result_t appendKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t appendCommandListImmediate( + ze_command_list_handle_t commandList, ur_event_handle_t *phEvent, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList); + ze_command_list_handle_t getZeCommandList(); + +private: + // UR context associated with this command-buffer + ur_context_handle_t context; + // Device associated with this command buffer + ur_device_handle_t device; + v2::raii::cache_borrowed_event_pool eventPool; + v2::raii::command_list_unique_handle zeCommandList; + ur_queue_handle_t_ *queue; + std::vector waitList; + + std::pair + getWaitListView(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents); + ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ur_command_t commandType); +}; diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp index f4e2f47c09..28ff527413 100644 --- a/source/adapters/level_zero/v2/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -11,6 +11,9 @@ * */ +// Do not edit. 
This file is auto generated from a template: +// scripts/templates/queue_api.cpp.mako + #include "queue_api.hpp" #include "ur_util.hpp" diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index e9e98874e8..88d812bbba 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -11,9 +11,13 @@ * */ +// Do not edit. This file is auto generated from a template: +// scripts/templates/queue_api.hpp.mako + #pragma once #include +#include struct ur_queue_handle_t_ { virtual ~ur_queue_handle_t_(); @@ -158,4 +162,8 @@ struct ur_queue_handle_t_ { const ur_exp_enqueue_native_command_properties_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; + + virtual ur_result_t enqueueCommandBuffer(ze_command_list_handle_t, + ur_event_handle_t *, uint32_t, + const ur_event_handle_t *) = 0; }; diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index af65df78a2..ace9032e2b 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -90,7 +90,16 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( : hContext(hContext), hDevice(hDevice), flags(pProps ? pProps->flags : 0), eventPool(hContext->eventPoolCache.borrow( hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), - handler(hContext, hDevice, pProps) {} + handler(hContext, hDevice, pProps), + commandListManager( + hContext, hDevice, + hContext->commandListCache.getImmediateCommandList( + hDevice->ZeDevice, true, getZeOrdinal(hDevice), + true /* always enable copy offload */, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + getZePriority(pProps ? 
pProps->flags : ur_queue_flags_t{}), + getZeIndex(pProps)), + eventFlagsFromQueueFlags(flags), this) {} ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, @@ -99,7 +108,17 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( eventPool(hContext->eventPoolCache.borrow( hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), handler(reinterpret_cast(hNativeHandle), - ownZeQueue) {} + ownZeQueue), + commandListManager( + hContext, hDevice, + raii::command_list_unique_handle( + reinterpret_cast(hNativeHandle), + [ownZeQueue](ze_command_list_handle_t hZeCommandList) { + if (ownZeQueue) { + zeCommandListDestroy(hZeCommandList); + } + }), + eventFlagsFromQueueFlags(flags)) {} ur_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent(ur_event_handle_t *hUserEvent, @@ -223,52 +242,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueKernelLaunch"); - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); - - UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - - ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice); - - std::scoped_lock Lock(this->Mutex, - hKernel->Mutex); - - ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; - uint32_t WG[3]{}; - UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice, - zeThreadGroupDimensions, WG, workDim, - pGlobalWorkSize, pLocalWorkSize)); - - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); - - auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); - - bool memoryMigrated = false; - auto memoryMigrate = [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - 
(handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); - memoryMigrated = true; - }; - - UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset, - workDim, WG[0], WG[1], WG[2], - memoryMigrate)); - - if (memoryMigrated) { - // If memory was migrated, we don't need to pass the wait list to - // the copy command again. - waitList.first = nullptr; - waitList.second = 0; - } - - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::zeCommandListAppendLaunchKernel"); - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; - ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions, - zeSignalEvent, waitList.second, waitList.first)); + UR_CALL(commandListManager.appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent)); recordSubmittedKernel(hKernel); @@ -1128,6 +1104,34 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( return UR_RESULT_SUCCESS; } +ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand) { + + std::scoped_lock Lock(this->Mutex); + auto signalEvent = getSignalEvent(phEvent, callerCommand); + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; + + ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + (commandListManager.getZeCommandList(), numCommandLists, phCommandLists, + zeSignalEvent, numWaitEvents, pWaitEvents)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBuffer( + ze_command_list_handle_t commandBufferCommandList, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) { + return enqueueGenericCommandListsExp( + 1, &commandBufferCommandList, phEvent, numEventsInWaitList, + phEventWaitList, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); +} ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index e0d7f747b3..6cb2f75999 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -19,6 +19,8 @@ #include "ur/ur.hpp" +#include "command_list_manager.hpp" + namespace v2 { using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; @@ -36,16 +38,22 @@ struct ur_command_list_handler_t { struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { private: + // to remove after command_list_manager is complete ur_context_handle_t hContext; + // to remove after command_list_manager is complete ur_device_handle_t hDevice; ur_queue_flags_t flags; + // to remove after command_list_manager is complete raii::cache_borrowed_event_pool eventPool; + // to remove after command_list_manager is complete ur_command_list_handler_t handler; + // to remove after command_list_manager is complete std::vector waitList; + ur_command_list_manager commandListManager; std::vector deferredEvents; std::vector submittedKernels; @@ -78,6 
+86,11 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t commandType); + ur_result_t enqueueGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand); + ur_result_t enqueueEventsWaitWithBarrierImpl(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, @@ -277,6 +290,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; ur_result_t + enqueueCommandBuffer(ze_command_list_handle_t commandBufferCommandList, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) override; + ur_result_t enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, From 78f6fbd67bfe647577a5d3d4167f22766d622d9f Mon Sep 17 00:00:00 2001 From: "Mateusz P. 
Nowak" Date: Tue, 28 Jan 2025 10:24:01 +0000 Subject: [PATCH 43/46] =?UTF-8?q?parent=20982416300132e778138f9d02bbbb2cde?= =?UTF-8?q?4e9f6249=20author=20Miko=C5=82aj=20Komar=20=201734527193=20+0000=20committer=20Mateusz=20P.=20Nowak?= =?UTF-8?q?=20=201738059437=20+0000?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare ground for command_buffer in v2 Enforce in order list usage, and add initialization and destruction to buffer Add initial support of command buffers to adapter v2 Update UR calls handling Remove unnecessary comment Move not implemented command buffer commands to previous position Fix most issues with code Fix formatting and modify queue_api template Move command buffer cleanup to destructor Use cached command lists instead of created ones Remove not needed function and change phrasing Add initial implementation of command list manager Use list manager instead of custom implementation in queue Optimalize imports Remove not needed destructor Revert "Fix formatting" This reverts commit 545e57732480487e7f36d87632b9c56307f39f49. 
Move command list close to the command buffer Moved try outside function block Move enqueue generic command list back to queue Share events and lists between queue and command list manager Use ze events instead of ur in getSignalEvent --- .../level_zero/v2/command_list_manager.cpp | 7 +- .../level_zero/v2/command_list_manager.hpp | 12 +- .../v2/queue_immediate_in_order.cpp | 224 +++++++----------- .../v2/queue_immediate_in_order.hpp | 24 +- 4 files changed, 100 insertions(+), 167 deletions(-) diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index fb50726053..b248fd2dd3 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -44,13 +44,13 @@ ur_command_list_manager::getWaitListView(const ur_event_handle_t *phWaitEvents, return {waitList.data(), static_cast(numWaitEvents)}; } -ur_event_handle_t +ze_event_handle_t ur_command_list_manager::getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType) { if (hUserEvent && queue) { *hUserEvent = eventPool->allocate(); (*hUserEvent)->resetQueueAndCommand(queue, commandType); - return *hUserEvent; + return (*hUserEvent)->getZeEvent(); } else { return nullptr; } @@ -80,7 +80,7 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( zeThreadGroupDimensions, WG, workDim, pGlobalWorkSize, pLocalWorkSize)); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -105,7 +105,6 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( TRACK_SCOPE_LATENCY( "ur_command_list_manager::zeCommandListAppendLaunchKernel"); - auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendLaunchKernel, (zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions, zeSignalEvent, waitList.second, waitList.first)); diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index 95cfa89250..9e0049a130 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -36,6 +36,12 @@ struct ur_command_list_manager : public _ur_object { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList); ze_command_list_handle_t getZeCommandList(); + std::pair + getWaitListView(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents); + ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ur_command_t commandType); + private: // UR context associated with this command-buffer ur_context_handle_t context; @@ -45,10 +51,4 @@ struct ur_command_list_manager : public _ur_object { v2::raii::command_list_unique_handle zeCommandList; ur_queue_handle_t_ *queue; std::vector waitList; - - std::pair - getWaitListView(const ur_event_handle_t *phWaitEvents, - uint32_t numWaitEvents); - ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, - ur_command_t commandType); }; diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index ace9032e2b..bf98344681 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -24,13 +24,7 @@ namespace v2 { std::pair ur_queue_immediate_in_order_t::getWaitListView( const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents) { - - waitList.resize(numWaitEvents); - for (uint32_t i = 0; i < numWaitEvents; i++) { - waitList[i] = phWaitEvents[i]->getZeEvent(); - } - - return {waitList.data(), static_cast(numWaitEvents)}; + return 
commandListManager.getWaitListView(phWaitEvents, numWaitEvents); } static int32_t getZeOrdinal(ur_device_handle_t hDevice) { @@ -58,25 +52,6 @@ static ze_command_queue_priority_t getZePriority(ur_queue_flags_t flags) { return ZE_COMMAND_QUEUE_PRIORITY_NORMAL; } -ur_command_list_handler_t::ur_command_list_handler_t( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps) - : commandList(hContext->commandListCache.getImmediateCommandList( - hDevice->ZeDevice, true, getZeOrdinal(hDevice), - true /* always enable copy offload */, - ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - getZePriority(pProps ? pProps->flags : ur_queue_flags_t{}), - getZeIndex(pProps))) {} - -ur_command_list_handler_t::ur_command_list_handler_t( - ze_command_list_handle_t hZeCommandList, bool ownZeHandle) - : commandList(hZeCommandList, - [ownZeHandle](ze_command_list_handle_t hZeCommandList) { - if (ownZeHandle) { - ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList)); - } - }) {} - static event_flags_t eventFlagsFromQueueFlags(ur_queue_flags_t flags) { event_flags_t eventFlags = EVENT_FLAGS_COUNTER; if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) @@ -88,9 +63,6 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps) : hContext(hContext), hDevice(hDevice), flags(pProps ? 
pProps->flags : 0), - eventPool(hContext->eventPoolCache.borrow( - hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), - handler(hContext, hDevice, pProps), commandListManager( hContext, hDevice, hContext->commandListCache.getImmediateCommandList( @@ -105,10 +77,6 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_native_handle_t hNativeHandle, ur_queue_flags_t flags, bool ownZeQueue) : hContext(hContext), hDevice(hDevice), flags(flags), - eventPool(hContext->eventPoolCache.borrow( - hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), - handler(reinterpret_cast(hNativeHandle), - ownZeQueue), commandListManager( hContext, hDevice, raii::command_list_unique_handle( @@ -120,16 +88,11 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( }), - eventFlagsFromQueueFlags(flags)) {} + // The manager needs its owning queue to allocate signal events. + eventFlagsFromQueueFlags(flags), this) {} -ur_event_handle_t +ze_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType) { - if (hUserEvent) { - *hUserEvent = eventPool->allocate(); - (*hUserEvent)->resetQueueAndCommand(this, commandType); - return *hUserEvent; - } else { - return nullptr; - } + return commandListManager.getSignalEvent(hUserEvent, commandType); } ur_result_t @@ -194,8 +156,8 @@ void ur_queue_immediate_in_order_t::deferEventFree(ur_event_handle_t hEvent) { ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle( ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) { std::ignore = pDesc; - *phNativeQueue = - reinterpret_cast(this->handler.commandList.get()); + *phNativeQueue = reinterpret_cast( + this->commandListManager.getZeCommandList()); return UR_RESULT_SUCCESS; } @@ -208,7 +170,7 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() { TRACK_SCOPE_LATENCY( "ur_queue_immediate_in_order_t::zeCommandListHostSynchronize"); ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); +
(commandListManager.getZeCommandList(), UINT64_MAX)); // Free deferred events for (auto &hEvent : deferredEvents) { @@ -263,20 +225,21 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( return UR_RESULT_SUCCESS; } - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); if (numWaitEvents > 0) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), numWaitEvents, pWaitEvents)); - } - - if (signalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } + + if (zeSignalEvent) { // null when no event was requested + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + } return UR_RESULT_SUCCESS; } @@ -293,13 +254,13 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierImpl( return UR_RESULT_SUCCESS; } - auto signalEvent = + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); ZE2UR_CALL(zeCommandListAppendBarrier, - (handler.commandList.get(), signalEvent->getZeEvent(), + (commandListManager.getZeCommandList(), zeSignalEvent, numWaitEvents, pWaitEvents)); return UR_RESULT_SUCCESS; @@ -334,7 +295,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t commandType) { - auto signalEvent = getSignalEvent(phEvent, commandType); + auto zeSignalEvent = getSignalEvent(phEvent, commandType); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -343,8 +304,8 @@ ur_result_t
ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, srcOffset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -352,8 +313,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::write_only, dstOffset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -364,14 +325,13 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( waitList.second = 0; } - auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), pDst, pSrc, size, zeSignalEvent, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), pDst, pSrc, size, + zeSignalEvent, waitList.second, waitList.first)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -423,7 +383,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( auto zeParams = ur2zeRegionParams(srcOrigin, dstOrigin, region, srcRowPitch, dstRowPitch, srcSlicePitch, dstSlicePitch); - auto signalEvent = getSignalEvent(phEvent, commandType); + auto zeSignalEvent = getSignalEvent(phEvent, commandType); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -432,16 +392,16 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, 0, src->getSize(), [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); auto pDst = ur_cast(dst->getDevicePtr( hDevice, ur_mem_handle_t_::device_access_mode_t::write_only, 0, dst->getSize(), [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -452,16 +412,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( waitList.second = 0; } - auto zeSignalEvent = signalEvent ? 
signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, - (handler.commandList.get(), pDst, &zeParams.dstRegion, + (commandListManager.getZeCommandList(), pDst, &zeParams.dstRegion, zeParams.dstPitch, zeParams.dstSlicePitch, pSrc, &zeParams.srcRegion, zeParams.srcPitch, zeParams.srcSlicePitch, zeSignalEvent, waitList.second, waitList.first)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -625,7 +584,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( std::scoped_lock lock(this->Mutex, hBuffer->getMutex()); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -633,8 +592,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( auto pDst = ur_cast(hBuffer->mapHostPtr( mapFlags, offset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); *ppRetMap = pDst; @@ -652,7 +611,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( if (blockingMap) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -665,28 +624,29 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( std::scoped_lock lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); 
// TODO: currently unmapHostPtr deallocates memory immediately, // since the memory might be used by the user, we need to make sure // all dependencies are completed. - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), waitList.second, waitList.first)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), waitList.second, waitList.first)); bool memoryMigrated = false; hMem->unmapHostPtr(pMappedPtr, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, size, + nullptr, waitList.second, waitList.first)); memoryMigrated = true; }); - if (signalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); - } + ZE2UR_CALL( + zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + return UR_RESULT_SUCCESS; } @@ -697,7 +657,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t commandType) { - auto signalEvent = getSignalEvent(phEvent, commandType); + auto zeSignalEvent = getSignalEvent(phEvent, commandType); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -706,8 +666,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, offset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, + size, nullptr, waitList.second, waitList.first)); memoryMigrated = true; })); @@ -722,10 +682,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( // 
PatternSize must be a power of two for zeCommandListAppendMemoryFill. // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryFill, - (handler.commandList.get(), pDst, pPattern, patternSize, size, - zeSignalEvent, waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), pDst, pPattern, + patternSize, size, zeSignalEvent, waitList.second, + waitList.first)); return UR_RESULT_SUCCESS; } @@ -753,19 +713,18 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( std::scoped_lock lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), pDst, pSrc, size, zeSignalEvent, - numWaitEvents, pWaitEvents)); + (commandListManager.getZeCommandList(), pDst, pSrc, size, + zeSignalEvent, numWaitEvents, pWaitEvents)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ -781,23 +740,24 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( std::scoped_lock lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); if (pWaitEvents) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), numWaitEvents, pWaitEvents)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } // 
TODO: figure out how to translate "flags" ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (handler.commandList.get(), pMem, size)); + (commandListManager.getZeCommandList(), pMem, size)); - if (signalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); - } + ZE2UR_CALL( + zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + return UR_RESULT_SUCCESS; } @@ -814,24 +774,24 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, auto zeAdvice = ur_cast(advice); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); if (pWaitEvents) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (handler.commandList.get(), numWaitEvents, pWaitEvents)); + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } // TODO: figure out how to translate "flags" ZE2UR_CALL(zeCommandListAppendMemAdvise, - (handler.commandList.get(), this->hDevice->ZeDevice, pMem, size, - zeAdvice)); + (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, + pMem, size, zeAdvice)); - if (signalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); - } + ZE2UR_CALL( + zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -1034,15 +994,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( zeThreadGroupDimensions, WG, workDim, pGlobalWorkSize, pLocalWorkSize)); - auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); bool memoryMigrated = false; auto 
memoryMigrate = [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (handler.commandList.get(), dst, src, size, nullptr, - waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), dst, src, size, + nullptr, waitList.second, waitList.first)); memoryMigrated = true; }; @@ -1059,10 +1019,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::" "zeCommandListAppendLaunchCooperativeKernel"); - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, - (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions, - zeSignalEvent, waitList.second, waitList.first)); + (commandListManager.getZeCommandList(), hZeKernel, + &zeThreadGroupDimensions, zeSignalEvent, waitList.second, + waitList.first)); recordSubmittedKernel(hKernel); @@ -1077,28 +1037,26 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( std::scoped_lock lock(this->Mutex); - auto signalEvent = - getSignalEvent(phEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP); - if (!signalEvent) { + if (!phEvent && !*phEvent) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); - signalEvent->recordStartTimestamp(); + (*phEvent)->recordStartTimestamp(); auto [timestampPtr, zeSignalEvent] = - signalEvent->getEventEndTimestampAndHandle(); + (*phEvent)->getEventEndTimestampAndHandle(); ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, - (handler.commandList.get(), timestampPtr, zeSignalEvent, - numWaitEvents, pWaitEvents)); + (commandListManager.getZeCommandList(), timestampPtr, + zeSignalEvent, numWaitEvents, pWaitEvents)); if (blocking) { ZE2UR_CALL(zeCommandListHostSynchronize, - (handler.commandList.get(), UINT64_MAX)); + (commandListManager.getZeCommandList(), UINT64_MAX)); } return UR_RESULT_SUCCESS; @@ 
-1110,16 +1068,14 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand) { std::scoped_lock Lock(this->Mutex); - auto signalEvent = getSignalEvent(phEvent, callerCommand); + auto zeSignalEvent = getSignalEvent(phEvent, callerCommand); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); - auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, - (commandListManager.getZeCommandList(), numCommandLists, phCommandLists, - zeSignalEvent, numWaitEvents, pWaitEvents)); + (commandListManager.getZeCommandList(), numCommandLists, + phCommandLists, zeSignalEvent, numWaitEvents, pWaitEvents)); return UR_RESULT_SUCCESS; } @@ -1128,9 +1084,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBuffer( ze_command_list_handle_t commandBufferCommandList, ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList) { - return enqueueGenericCommandListsExp( - 1, &commandBufferCommandList, phEvent, numEventsInWaitList, - phEventWaitList, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); + return enqueueGenericCommandListsExp(1, &commandBufferCommandList, phEvent, + numEventsInWaitList, phEventWaitList, + UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP); } ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 6cb2f75999..6cf8b0c51c 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -25,34 +25,12 @@ namespace v2 { using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; -struct ur_command_list_handler_t { - ur_command_list_handler_t(ur_context_handle_t 
hContext, - ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps); - - ur_command_list_handler_t(ze_command_list_handle_t hZeCommandList, - bool ownZeHandle); - - raii::command_list_unique_handle commandList; -}; - struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { private: - // to remove after command_list_manager is complete ur_context_handle_t hContext; - // to remove after command_list_manager is complete ur_device_handle_t hDevice; ur_queue_flags_t flags; - // to remove after command_list_manager is complete - raii::cache_borrowed_event_pool eventPool; - - // to remove after command_list_manager is complete - ur_command_list_handler_t handler; - - // to remove after command_list_manager is complete - std::vector waitList; - ur_command_list_manager commandListManager; std::vector deferredEvents; std::vector submittedKernels; @@ -61,7 +39,7 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents); - ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType); void deferEventFree(ur_event_handle_t hEvent) override; From 2aff645cb9811e5de50449832ca70e0a37a1a093 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 28 Jan 2025 10:28:36 +0000 Subject: [PATCH 44/46] # This is a combination of 3 commits. # This is the 1st commit message: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parent 982416300132e778138f9d02bbbb2cde4e9f6249 author Mikołaj Komar 1734527193 +0000 committer Mateusz P. 
Nowak 1738059437 +0000 Prepare ground for command_buffer in v2 Enforce in order list usage, and add initialization and destruction to buffer Add initial support of command buffers to adapter v2 Update UR calls handling Remove unnecessary comment Move not implemented command buffer commands to previous position Fix most issues with code Fix formatting and modify queue_api template Move command buffer cleanup to destructor Use cached command lists instead of created ones Remove not needed function and change phrasing Add initial implementation of command list manager Use list manager instead of custom implementation in queue Optimize imports Remove not needed destructor Revert "Fix formatting" This reverts commit 545e57732480487e7f36d87632b9c56307f39f49. Move command list close to the command buffer Moved try outside function block Move enqueue generic command list back to queue Share events and lists between queue and command list manager Use ze events instead of ur in getSignalEvent # This is the commit message #2: Remove not needed structs and reformat code # This is the commit message #3: Fix PR comments --- .../adapters/level_zero/v2/command_buffer.cpp | 14 ++++---- .../adapters/level_zero/v2/command_buffer.hpp | 12 ++----- .../level_zero/v2/command_list_manager.cpp | 4 +-- .../v2/queue_immediate_in_order.cpp | 34 +++++++++---------- 4 files changed, 26 insertions(+), 38 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 46c8c6ae27..c35d97d76b 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -40,12 +40,13 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( std::forward(commandList)), isUpdatable(desc ? 
desc->isUpdatable : false) {} -ur_result_t ur_exp_command_buffer_handle_t_::closeCommandList() { +ur_result_t ur_exp_command_buffer_handle_t_::finalizeCommandBuffer() { // It is not allowed to append to command list from multiple threads. std::scoped_lock guard(this->Mutex); - + UR_ASSERT(!isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); // Close the command lists and have them ready for dispatch. ZE2UR_CALL(zeCommandListClose, (this->commandListManager.getZeCommandList())); + isFinalized = true; return UR_RESULT_SUCCESS; } @@ -72,7 +73,7 @@ urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, context, device, std::move(zeCommandList), commandBufferDesc); return UR_RESULT_SUCCESS; -} catch (const std::bad_alloc &) { +} catch (...) { return exceptionToResult(std::current_exception()); } @@ -80,7 +81,7 @@ ur_result_t urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { hCommandBuffer->RefCount.increment(); return UR_RESULT_SUCCESS; -} catch (const std::bad_alloc &) { +} catch (...) { return exceptionToResult(std::current_exception()); } @@ -98,10 +99,7 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { ur_result_t urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { UR_ASSERT(hCommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(!hCommandBuffer->isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); - hCommandBuffer->closeCommandList(); - - hCommandBuffer->isFinalized = true; + UR_CALL(hCommandBuffer->finalizeCommandBuffer()); return UR_RESULT_SUCCESS; } catch (...) 
{ return exceptionToResult(std::current_exception()); diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index 50a3d729fd..c263457d1a 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -16,25 +16,16 @@ #include "queue_api.hpp" #include -struct command_buffer_profiling_t { - ur_exp_command_buffer_sync_point_t numEvents; - ze_kernel_timestamp_result_t *timestamps; -}; - struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_exp_command_buffer_handle_t_( ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, const ur_exp_command_buffer_desc_t *desc); ~ur_exp_command_buffer_handle_t_() = default; - ur_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, - ur_command_t commandType); ur_command_list_manager commandListManager; - ur_result_t closeCommandList(); - - std::vector waitList; + ur_result_t finalizeCommandBuffer(); // Indicates if command-buffer commands can be updated after it is closed. bool isUpdatable = false; @@ -48,6 +39,7 @@ struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { ur_exp_command_buffer_command_handle_t_(ur_exp_command_buffer_handle_t, uint64_t); +private: ~ur_exp_command_buffer_command_handle_t_(); // Command-buffer of this command. 
diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index b248fd2dd3..987cb462a3 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -20,9 +20,7 @@ ur_command_list_manager::ur_command_list_manager( ur_queue_handle_t_ *queue) : context(context), device(device), eventPool(context->eventPoolCache.borrow(device->Id.value(), flags)), - zeCommandList( - std::forward(commandList)), - queue(queue) { + zeCommandList(std::move(commandList)), queue(queue) { UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); } diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index bf98344681..c4aa527595 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -83,7 +83,7 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( reinterpret_cast(hNativeHandle), [ownZeQueue](ze_command_list_handle_t hZeCommandList) { if (ownZeQueue) { - zeCommandListDestroy(hZeCommandList); + ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList)); } }), eventFlagsFromQueueFlags(flags)) {} @@ -235,9 +235,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } - ZE2UR_CALL( - zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -601,12 +600,19 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( if (!memoryMigrated && waitList.second) { // If memory was not migrated, we need to wait on the events here. 
ZE2UR_CALL(zeCommandListAppendWaitOnEvents, +<<<<<<< HEAD (handler.commandList.get(), waitList.second, waitList.first)); } if (signalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, (handler.commandList.get(), signalEvent->getZeEvent())); +======= + (commandListManager.getZeCommandList(), waitList.second, + waitList.first)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); +>>>>>>> 9f53547f (Remove not needed structs and reformat code) } if (blockingMap) { @@ -643,10 +649,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( memoryMigrated = true; }); - ZE2UR_CALL( - zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -754,10 +758,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (commandListManager.getZeCommandList(), pMem, size)); - ZE2UR_CALL( - zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -789,9 +791,8 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, pMem, size, zeAdvice)); - ZE2UR_CALL( - zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); return UR_RESULT_SUCCESS; } @@ -1037,11 +1038,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( std::scoped_lock lock(this->Mutex); - if (!phEvent && !*phEvent) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - + getSignalEvent(phEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP); auto 
[pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); From af062037715296b949fb9a6355a42ccc6c5d4a35 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 28 Jan 2025 10:32:15 +0000 Subject: [PATCH 45/46] # This is a combination of 6 commits. # This is the 1st commit message: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parent 982416300132e778138f9d02bbbb2cde4e9f6249 author Mikołaj Komar 1734527193 +0000 committer Mateusz P. Nowak 1738059437 +0000 Prepare ground for command_buffer in v2 Enforce in order list usage, and add initialization and destruction to buffer Add initial support of command buffers to adapter v2 Update UR calls handling Remove unnecessary comment Move not implemented command buffer commands to previous position Fix most issues with code Fix formatting and modify queue_api template Move command buffer cleanup to destructor Use cached command lists instead of created ones Remove not needed function and change phrasing Add initial implementation of command list manager Use list manager instead of custom implementation in queue Optimize imports Remove not needed destructor Revert "Fix formatting" This reverts commit 545e57732480487e7f36d87632b9c56307f39f49. 
Move command list close to the command buffer Moved try outside function block Move enqueue generic command list back to queue Share events and lists between queue and command list manager Use ze events instead of ur in getSignalEvent # This is the commit message #2: Remove not needed structs and reformat code # This is the commit message #3: Fix PR comments # This is the commit message #4: Fix ze function calling # This is the commit message #5: Fix access to some fields in command buffer v2 # This is the commit message #6: Fix compilation --- .../adapters/level_zero/v2/command_buffer.hpp | 10 +++-- .../v2/queue_immediate_in_order.cpp | 41 +++++++++---------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index c263457d1a..5e60d6537f 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -21,18 +21,20 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, const ur_exp_command_buffer_desc_t *desc); + ~ur_exp_command_buffer_handle_t_() = default; ur_command_list_manager commandListManager; ur_result_t finalizeCommandBuffer(); - // Indicates if command-buffer commands can be updated after it is closed. - bool isUpdatable = false; + const bool isUpdatable = false; + // Command-buffer profiling is enabled. + const bool isProfilingEnabled = false; + +private: // Indicates if command buffer was finalized. bool isFinalized = false; - // Command-buffer profiling is enabled. 
- bool isProfilingEnabled = false; }; struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index c4aa527595..aceea600cc 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -114,7 +114,7 @@ ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName, return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_QUEUE_INFO_EMPTY: { auto status = ZE_CALL_NOCHECK(zeCommandListHostSynchronize, - (handler.commandList.get(), 0)); + (commandListManager.getZeCommandList(), 0)); if (status == ZE_RESULT_SUCCESS) { return ReturnValue(true); } else if (status == ZE_RESULT_NOT_READY) { @@ -235,8 +235,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); } - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + } return UR_RESULT_SUCCESS; } @@ -600,19 +602,12 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( if (!memoryMigrated && waitList.second) { // If memory was not migrated, we need to wait on the events here. 
ZE2UR_CALL(zeCommandListAppendWaitOnEvents, -<<<<<<< HEAD - (handler.commandList.get(), waitList.second, waitList.first)); - } - - if (signalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (handler.commandList.get(), signalEvent->getZeEvent())); -======= (commandListManager.getZeCommandList(), waitList.second, waitList.first)); + } + if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, (commandListManager.getZeCommandList(), zeSignalEvent)); ->>>>>>> 9f53547f (Remove not needed structs and reformat code) } if (blockingMap) { @@ -648,10 +643,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( nullptr, waitList.second, waitList.first)); memoryMigrated = true; }); - - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + } return UR_RESULT_SUCCESS; } @@ -757,9 +752,10 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( // TODO: figure out how to translate "flags" ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (commandListManager.getZeCommandList(), pMem, size)); - - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + } return UR_RESULT_SUCCESS; } @@ -791,9 +787,10 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, pMem, size, zeAdvice)); - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListManager.getZeCommandList(), zeSignalEvent)); + } return UR_RESULT_SUCCESS; } From 55055ae6073cc5a5e935716062cbea6e86f332ae Mon Sep 17 00:00:00 2001 From: 
"Mateusz P. Nowak" Date: Tue, 28 Jan 2025 10:33:21 +0000 Subject: [PATCH 46/46] rebase --- source/adapters/level_zero/v2/queue_immediate_in_order.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index aceea600cc..1c738edf50 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -605,6 +605,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( (commandListManager.getZeCommandList(), waitList.second, waitList.first)); } + if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, (commandListManager.getZeCommandList(), zeSignalEvent));