From 9ece879ab18115d31247adf1e92de137810ad7cf Mon Sep 17 00:00:00 2001 From: Wenju He Date: Tue, 22 Apr 2025 22:05:34 -0700 Subject: [PATCH 1/2] [UR][OpenCL][EnqueueCommandBuffer] Use internal in-order queue if command buffer is in-order A SYCL queue is out-of-order by default. We can't use an out-of-order queue for EnqueueCommandBuffer when the command buffer was created with an internal in-order queue, which is the case when the graph is in-order. --- unified-runtime/source/adapters/opencl/command_buffer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/opencl/command_buffer.cpp b/unified-runtime/source/adapters/opencl/command_buffer.cpp index d3ef027457b5b..bce1b0d86f02a 100644 --- a/unified-runtime/source/adapters/opencl/command_buffer.cpp +++ b/unified-runtime/source/adapters/opencl/command_buffer.cpp @@ -475,7 +475,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( for (uint32_t i = 0; i < numEventsInWaitList; i++) { CLWaitEvents[i] = phEventWaitList[i]->CLEvent; } - cl_command_queue CLQueue = hQueue->CLQueue; + cl_command_queue CLQueue = hCommandBuffer->IsInOrder + ? 
hCommandBuffer->hInternalQueue->CLQueue + : hQueue->CLQueue; CL_RETURN_ON_FAILURE(clEnqueueCommandBufferKHR( NumberOfQueues, &CLQueue, hCommandBuffer->CLCommandBuffer, numEventsInWaitList, CLWaitEvents.data(), ifUrEvent(phEvent, Event))); From f0b410a3900cc7465224048d62c8a3b167fccebd Mon Sep 17 00:00:00 2001 From: Wenju He Date: Tue, 22 Apr 2025 22:53:41 -0700 Subject: [PATCH 2/2] revert 9ece879, always use pSyncPoint --- .../source/adapters/opencl/command_buffer.cpp | 44 +++++++------------ 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/unified-runtime/source/adapters/opencl/command_buffer.cpp b/unified-runtime/source/adapters/opencl/command_buffer.cpp index bce1b0d86f02a..9482b3c9fae3f 100644 --- a/unified-runtime/source/adapters/opencl/command_buffer.cpp +++ b/unified-runtime/source/adapters/opencl/command_buffer.cpp @@ -171,11 +171,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( cl_command_properties_khr *Properties = hCommandBuffer->IsUpdatable ? UpdateProperties : nullptr; - const bool IsInOrder = hCommandBuffer->IsInOrder; - cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; - const cl_sync_point_khr *SyncPointWaitList = - IsInOrder ? nullptr : pSyncPointWaitList; - uint32_t WaitListSize = IsInOrder ? 
0 : numSyncPointsInWaitList; + cl_sync_point_khr *RetSyncPoint = pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = pSyncPointWaitList; + uint32_t WaitListSize = numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandNDRangeKernelKHR( hCommandBuffer->CLCommandBuffer, nullptr, Properties, hKernel->CLKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, WaitListSize, @@ -246,11 +244,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( CLContext, ur::cl::getAdapter()->fnCache.clCommandCopyBufferKHRCache, cl_ext::CommandCopyBufferName, &clCommandCopyBufferKHR)); - const bool IsInOrder = hCommandBuffer->IsInOrder; - cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; - const cl_sync_point_khr *SyncPointWaitList = - IsInOrder ? nullptr : pSyncPointWaitList; - uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; + cl_sync_point_khr *RetSyncPoint = pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = pSyncPointWaitList; + uint32_t WaitListSize = numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandCopyBufferKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, hSrcMem->CLMemory, hDstMem->CLMemory, srcOffset, dstOffset, size, WaitListSize, @@ -289,11 +285,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( ur::cl::getAdapter()->fnCache.clCommandCopyBufferRectKHRCache, cl_ext::CommandCopyBufferRectName, &clCommandCopyBufferRectKHR)); - const bool IsInOrder = hCommandBuffer->IsInOrder; - cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; - const cl_sync_point_khr *SyncPointWaitList = - IsInOrder ? nullptr : pSyncPointWaitList; - uint32_t WaitListSize = IsInOrder ? 
0 : numSyncPointsInWaitList; + cl_sync_point_khr *RetSyncPoint = pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = pSyncPointWaitList; + uint32_t WaitListSize = numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandCopyBufferRectKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, hSrcMem->CLMemory, hDstMem->CLMemory, OpenCLOriginRect, OpenCLDstRect, OpenCLRegion, @@ -397,11 +391,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( CLContext, ur::cl::getAdapter()->fnCache.clCommandFillBufferKHRCache, cl_ext::CommandFillBufferName, &clCommandFillBufferKHR)); - const bool IsInOrder = hCommandBuffer->IsInOrder; - cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; - const cl_sync_point_khr *SyncPointWaitList = - IsInOrder ? nullptr : pSyncPointWaitList; - uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; + cl_sync_point_khr *RetSyncPoint = pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = pSyncPointWaitList; + uint32_t WaitListSize = numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandFillBufferKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, hBuffer->CLMemory, pPattern, patternSize, offset, size, WaitListSize, SyncPointWaitList, @@ -475,9 +467,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( for (uint32_t i = 0; i < numEventsInWaitList; i++) { CLWaitEvents[i] = phEventWaitList[i]->CLEvent; } - cl_command_queue CLQueue = hCommandBuffer->IsInOrder - ? 
hCommandBuffer->hInternalQueue->CLQueue - : hQueue->CLQueue; + cl_command_queue CLQueue = hQueue->CLQueue; CL_RETURN_ON_FAILURE(clEnqueueCommandBufferKHR( NumberOfQueues, &CLQueue, hCommandBuffer->CLCommandBuffer, numEventsInWaitList, CLWaitEvents.data(), ifUrEvent(phEvent, Event))); @@ -768,11 +758,9 @@ ur_result_t UR_APICALL urCommandBufferAppendNativeCommandExp( cl_ext::CommandBarrierWithWaitListName, &clCommandBarrierWithWaitListKHR)); - const bool IsInOrder = hCommandBuffer->IsInOrder; - cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; - const cl_sync_point_khr *SyncPointWaitList = - IsInOrder ? nullptr : pSyncPointWaitList; - uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; + cl_sync_point_khr *RetSyncPoint = pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = pSyncPointWaitList; + uint32_t WaitListSize = numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandBarrierWithWaitListKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, WaitListSize, SyncPointWaitList, nullptr, nullptr));