diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h index a34b75d1a6..db6c161c42 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h @@ -781,6 +781,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const return (__bf16)a + (__bf16)b; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH + * \brief Adds two bfloat16 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd_rn(const __hip_bfloat16 a, + const __hip_bfloat16 b) { +#pragma clang fp contract(off) + return (__bf16)a + (__bf16)b; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Subtracts two bfloat16 values @@ -789,6 +799,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const return (__bf16)a - (__bf16)b; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH + * \brief Subtracts two bfloat16 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub_rn(const __hip_bfloat16 a, + const __hip_bfloat16 b) { +#pragma clang fp contract(off) + return (__bf16)a - (__bf16)b; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Divides two bfloat16 values @@ -815,6 +835,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const return (__bf16)a * (__bf16)b; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH + * \brief Multiplies two bfloat16 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul_rn(const __hip_bfloat16 a, + const __hip_bfloat16 b) { +#pragma clang fp contract(off) + return (__bf16)a * (__bf16)b; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Negate a bfloat16 value @@ -861,6 +891,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a, return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)}; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH + * \brief Adds two bfloat162 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2_rn(const __hip_bfloat162 a, + const __hip_bfloat162 b) { +#pragma clang fp contract(off) + return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)}; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Performs FMA of given bfloat162 values @@ -879,6 +919,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a, return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)}; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH + * \brief Multiplies two bfloat162 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2_rn(const __hip_bfloat162 a, + const __hip_bfloat162 b) { +#pragma clang fp contract(off) + return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)}; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Converts a bfloat162 into negative @@ -896,6 +946,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a, return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)}; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH + * \brief Subtracts two bfloat162 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2_rn(const __hip_bfloat162 a, + const __hip_bfloat162 b) { +#pragma clang fp contract(off) + return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)}; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to multiply two __hip_bfloat16 numbers diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h index 814575ea1e..f53c22cb83 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h @@ -1367,6 +1367,15 @@ THE SOFTWARE. return __half_raw{ static_cast<__half_raw>(x).data + static_cast<__half_raw>(y).data}; + } + inline + __HOST_DEVICE__ + __half __hadd_rn(__half x, __half y) + { +#pragma clang fp contract(off) + return __half_raw{ + static_cast<__half_raw>(x).data + + static_cast<__half_raw>(y).data}; } inline __HOST_DEVICE__ @@ -1390,6 +1399,15 @@ THE SOFTWARE. } inline __HOST_DEVICE__ + __half __hsub_rn(__half x, __half y) + { +#pragma clang fp contract(off) + return __half_raw{ + static_cast<__half_raw>(x).data - + static_cast<__half_raw>(y).data}; + } + inline + __HOST_DEVICE__ __half __hmul(__half x, __half y) { return __half_raw{ @@ -1398,6 +1416,15 @@ THE SOFTWARE. } inline __HOST_DEVICE__ + __half __hmul_rn(__half x, __half y) + { +#pragma clang fp contract(off) + return __half_raw{ + static_cast<__half_raw>(x).data * + static_cast<__half_raw>(y).data}; + } + inline + __HOST_DEVICE__ __half __hadd_sat(__half x, __half y) { return __clamp_01(__hadd(x, y)); @@ -1446,6 +1473,16 @@ THE SOFTWARE. static_cast<__half2_raw>(x).data + static_cast<__half2_raw>(y).data}; } + inline + __HOST_DEVICE__ + __half2 __hadd2_rn(__half2 x, __half2 y) + { +#pragma clang fp contract(off) + return __half2{ + static_cast<__half2_raw>(x).data + + static_cast<__half2_raw>(y).data}; + } + inline __HOST_DEVICE__ __half2 __habs2(__half2 x) @@ -1462,6 +1499,15 @@ THE SOFTWARE. } inline __HOST_DEVICE__ + __half2 __hsub2_rn(__half2 x, __half2 y) + { +#pragma clang fp contract(off) + return __half2{ + static_cast<__half2_raw>(x).data - + static_cast<__half2_raw>(y).data}; + } + inline + __HOST_DEVICE__ __half2 __hmul2(__half2 x, __half2 y) { return __half2{ @@ -1470,6 +1516,15 @@ THE SOFTWARE. } inline __HOST_DEVICE__ + __half2 __hmul2_rn(__half2 x, __half2 y) + { +#pragma clang fp contract(off) + return __half2{ + static_cast<__half2_raw>(x).data * + static_cast<__half2_raw>(y).data}; + } + inline + __HOST_DEVICE__ __half2 __hadd2_sat(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__hadd2(x, y)); diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h index 01849c0b16..702c3f85a6 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h @@ -51,8 +51,8 @@ THE SOFTWARE. #include #endif // defined(__HIPCC_RTC__) -template constexpr __hip_internal::size_t __hip_vec_align_v() { - return (N == 4 && alignof(T) == 8) ? 16 : N * alignof(T); +template constexpr __hip_internal::size_t __hip_vec_align_v() { + return (_hip_N == 4 && alignof(T) == 8) ? 16 : _hip_N * alignof(T); } template struct HIP_vector_base; diff --git a/projects/clr/hipamd/include/hip/amd_detail/host_defines.h b/projects/clr/hipamd/include/hip/amd_detail/host_defines.h index 5aa38661f2..59e1673dcb 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/host_defines.h +++ b/projects/clr/hipamd/include/hip/amd_detail/host_defines.h @@ -199,16 +199,16 @@ struct integer_sequence { template using index_sequence = integer_sequence; -template -struct make_index_sequence_impl : make_index_sequence_impl {}; +template +struct make_index_sequence_impl : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {}; template struct make_index_sequence_impl<0, Ints...> { using type = index_sequence; }; -template -using make_index_sequence = typename make_index_sequence_impl::type; +template +using make_index_sequence = typename make_index_sequence_impl<_hip_N>::type; template constexpr index_sequence make_index_sequence_value(index_sequence) { diff --git a/projects/clr/hipamd/src/hip_code_object.cpp b/projects/clr/hipamd/src/hip_code_object.cpp index 4336fdf0ea..846ca299a7 100644 --- a/projects/clr/hipamd/src/hip_code_object.cpp +++ b/projects/clr/hipamd/src/hip_code_object.cpp @@ -284,9 +284,7 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) { for (auto& hostVar : hostVarsIter->second) { auto varIter = vars_.find(hostVar); if (varIter == vars_.end()) { - LogPrintfError( - "removeFatBinary: Unable to find module 0x%x hostVar 0x%x", - module, hostVar); + LogPrintfError("removeFatBinary: Unable to find module 0x%x hostVar 0x%x", module, hostVar); } else { delete varIter->second; vars_.erase(varIter); @@ -325,8 +323,8 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) { for (auto& hostFunc : hostFuncsIter->second) { auto funcIter = functions_.find(hostFunc); if (funcIter == functions_.end()) { - LogPrintfError("removeFatBinary: Unable to find module 0x%x hostFunc 0x%x", - module, hostFunc); + LogPrintfError("removeFatBinary: Unable to find module 0x%x hostFunc 0x%x", module, + hostFunc); } else { delete funcIter->second; functions_.erase(funcIter); @@ -343,8 +341,8 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) { delete moduleIter->second; modules_.erase(moduleIter); } else { - LogPrintfError("removeFatBinary: Unable to find module 0x%x via hostModule 0x%x", - module, hostModule); + LogPrintfError("removeFatBinary: Unable to find module 0x%x via hostModule 0x%x", module, + hostModule); } module_to_hostModule_.erase(hostModuleIter); } @@ -383,7 +381,7 @@ hipError_t StatCO::getStatFunc(hipFunction_t* hfunc, const void* hostFunction, i } // Lazy load - FatBinaryInfo **module = it->second->moduleInfo(); + FatBinaryInfo** module = it->second->moduleInfo(); if (*(module) == nullptr) { amd::ScopedLock lock(sclock_); if (*(module) == nullptr) { @@ -405,7 +403,7 @@ hipError_t StatCO::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hos } // Lazy load - FatBinaryInfo **module = it->second->moduleInfo(); + FatBinaryInfo** module = it->second->moduleInfo(); if (*(module) == nullptr) { hipError_t err = digestFatBinary(module_to_hostModule_[module], *module); assert(err == hipSuccess); @@ -437,7 +435,7 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice } // Lazy load - FatBinaryInfo **module = it->second->moduleInfo(); + FatBinaryInfo** module = it->second->moduleInfo(); if (*(module) == nullptr) { hipError_t err = digestFatBinary(module_to_hostModule_[module], *module); assert(err == hipSuccess); @@ -464,7 +462,7 @@ hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) { for (auto& vecIter : managedVars_) { for (auto& var : vecIter.second) { // Lazy load - FatBinaryInfo **module = var->moduleInfo(); + FatBinaryInfo** module = var->moduleInfo(); if (*(module) == nullptr) { err = digestFatBinary(module_to_hostModule_[module], *module); assert(err == hipSuccess); diff --git a/projects/clr/hipamd/src/hip_event.hpp b/projects/clr/hipamd/src/hip_event.hpp index 456bf3d0a6..33d80dfbf2 100644 --- a/projects/clr/hipamd/src/hip_event.hpp +++ b/projects/clr/hipamd/src/hip_event.hpp @@ -200,6 +200,9 @@ class IPCEvent : public Event { if (!amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(hip::ihipIpcEventShmem_t))) { // print hipErrorInvalidHandle; } + if (owners == 0) { + amd::Os::shm_unlink(ipc_evt_.ipc_name_); + } } } IPCEvent() : Event(hipEventInterprocess) {} diff --git a/projects/clr/opencl/amdocl/cl_event.cpp b/projects/clr/opencl/amdocl/cl_event.cpp index d4f2b04e26..4dfc9c0b94 100644 --- a/projects/clr/opencl/amdocl/cl_event.cpp +++ b/projects/clr/opencl/amdocl/cl_event.cpp @@ -251,15 +251,16 @@ RUNTIME_ENTRY_RET(cl_event, clCreateUserEvent, (cl_context context, cl_int* errc return (cl_event)0; } - amd::Event* event = new amd::UserEvent(*as_amd(context)); - if (event == NULL) { + auto event = new amd::UserEvent(*as_amd(context)); + if (event == nullptr || !event->Create()) { + delete event; *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY; return (cl_event)0; } event->retain(); *not_null(errcode_ret) = CL_SUCCESS; - return as_cl(event); + return as_cl(reinterpret_cast(event)); } RUNTIME_EXIT @@ -288,8 +289,8 @@ RUNTIME_ENTRY(cl_int, clSetUserEventStatus, (cl_event event, cl_int execution_st if (execution_status > CL_COMPLETE) { return CL_INVALID_VALUE; } - - if (!as_amd(event)->setStatus(execution_status)) { + auto user_event = reinterpret_cast(as_amd(event)); + if (!user_event->SetExecutionStatus(execution_status)) { return CL_INVALID_OPERATION; } return CL_SUCCESS; diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index e3b3f736df..4ecbd04e56 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -95,6 +95,7 @@ class StreamOperationCommand; class BatchMemoryOperationCommand; class VirtualMapCommand; class ExternalSemaphoreCmd; +class UserEvent; class Isa; class Device; struct KernelParameterDescriptor; @@ -1329,6 +1330,7 @@ class VirtualDevice : public amd::HeapObject { ShouldNotReachHere(); } virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); } + virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); } virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; } virtual void ReleaseAllHwQueues() {} @@ -2037,6 +2039,13 @@ class Device : public RuntimeObject { return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false; } + /// @brief Creates HW user event for OpenCL implementation + /// @return The pointer to a HW event structure, known to the HW backend + virtual bool CreateUserEvent(amd::UserEvent* event) const { return false; } + + /// @brief Sets HW user event to the complete status + virtual void SetUserEvent(amd::UserEvent* event) const {} + //! Returns TRUE if the device is available for computations bool isOnline() const { return online_; } diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 40ff3454ae..3ab0b06549 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -54,8 +54,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire, amd::CopyMetadata copyMetadata) const { // Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || - (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { + if (dev().settings().blocking_blit_ && (setup_.disableReadBuffer_ || + (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached()))) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata); @@ -138,8 +138,9 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire, amd::CopyMetadata copyMetadata) const { // Use host copy if memory has direct access - if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || - gpuMem(dstMemory).IsPersistentDirectMap()) { + if (dev().settings().blocking_blit_ && + (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || + gpuMem(dstMemory).IsPersistentDirectMap())) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata); @@ -685,6 +686,7 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size, buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize)); } +// ================================================================================================ void DmaBlitManager::releaseBuffer(BufferState &buffer) const { if (buffer.pinnedMem_) { gpu().addPinnedMem(buffer.pinnedMem_); @@ -696,7 +698,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata, bool enablePin) const { // Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow - gpu().releaseGpuMemoryFence(hostToDev); + gpu().releaseGpuMemoryFence(hostToDev || !dev().settings().blocking_blit_); // If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer // For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning bool status = true; @@ -740,9 +742,9 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs copyMetadata.isAsync_); const_address src = static_cast(hostSrc) + copyOffset; status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata); - if (status ) { - // Wait for current signal of previous rocr copy if its not pinned mem + if (status) { if (outBuffer.pinnedMem_ == nullptr) { + // Wait for current signal of previous rocr copy if its not pinned mem gpu().Barriers().WaitCurrent(); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu", hostDst + copyOffset, stagingBuffer, copysize); @@ -752,6 +754,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs break; } } + // Release Pinned Memory back to pool if any releaseBuffer(outBuffer); // Update Offset and Transfer Size @@ -760,12 +763,18 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs firstTx = false; } + // @note: HIP requires a blocking wait on D2H with the pageable system memory + if (amd::IS_HIP && !hostToDev) { + gpu().Barriers().WaitCurrent(); + } + if(!status) { return false; } return true; } + // ================================================================================================ KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup) : DmaBlitManager(gpu, setup), @@ -1718,8 +1727,9 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, bool result = false; // Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && - !srcMemory.isCpuUncached())) { + if (dev().settings().blocking_blit_ && + (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && + !srcMemory.isCpuUncached()))) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata); @@ -1854,8 +1864,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo bool result = false; // Use host copy if memory has direct access - if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || - gpuMem(dstMemory).IsPersistentDirectMap()) { + if (dev().settings().blocking_blit_ && + (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || + gpuMem(dstMemory).IsPersistentDirectMap())) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata); @@ -2718,10 +2729,9 @@ void KernelBlitManager::releaseArguments(address args) const { } // ================================================================================================ -bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, +bool KernelBlitManager::runScheduler(uint64_t vqVM, hsa_queue_t* schedulerQueue, - hsa_signal_t& schedulerSignal, - uint threads) { + uint threads, uint64_t aql_wrap) { size_t globalWorkOffset[1] = {0}; size_t globalWorkSize[1] = {threads}; size_t localWorkSize[1] = {1}; @@ -2731,21 +2741,16 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, device::Kernel* devKernel = const_cast(kernels_[Scheduler]->getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); - SchedulerParam* sp = reinterpret_cast(schedulerParam->getHostMem()); + auto* sp = reinterpret_cast( + gpu().allocKernArg(sizeof(SchedulerParam), kCBAlignment)); memset(sp, 0, sizeof(SchedulerParam)); - Memory* schedulerMem = dev().getRocMemory(schedulerParam); - sp->kernarg_address = reinterpret_cast(schedulerMem->getDeviceMemory()); + sp->kernarg_address = reinterpret_cast(sp); sp->thread_counter = 0; sp->child_queue = reinterpret_cast(schedulerQueue); - sp->complete_signal = schedulerSignal; - - hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne); - - + sp->complete_signal = gpu().Barriers().ActiveSignal(kInitSignalValueOne, nullptr); sp->vqueue_header = vqVM; - - sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); + sp->parentAQL = reinterpret_cast(aql_wrap); if (dev().info().maxEngineClockFrequency_ > 0) { sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; @@ -2754,8 +2759,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, // Use a device side global atomics to workaround the reliance of PCIe 3 atomics sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue); - cl_mem mem = as_cl(schedulerParam); - setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem); + constexpr bool kDirectVa = true; + setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), sp, 0, nullptr, kDirectVa); address parameters = captureArguments(kernels_[Scheduler]); @@ -2764,12 +2769,17 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, return false; } releaseArguments(parameters); - - if (!WaitForSignal(schedulerSignal)) { + // Wait for the scheduler to finish all operations + gpu().WaitCompleteSignal(sp->complete_signal); + // @note: A wait shouldn't be really necessary, but the queue write_index may not get a proper + // value without the wait for all previous commands (see the PCIE3 atomics workaround above). + // The scheduler can enqueue extra commands, but the real queue write index didn't have any + // progress. That leads to hangs and requires blocking. Then the wait causes problems in DD mode + // with device enqueue and user events, because device enqueue is blocking below + if (!WaitForSignal(sp->complete_signal)) { LogWarning("Failed schedulerSignal wait"); return false; } - return true; } diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp index 8a7f03f454..9c4b16d580 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.hpp +++ b/projects/clr/rocclr/device/rocm/rocblit.hpp @@ -492,10 +492,8 @@ class KernelBlitManager : public DmaBlitManager { ) const; bool runScheduler(uint64_t vqVM, - amd::Memory* schedulerParam, hsa_queue_t* schedulerQueue, - hsa_signal_t& schedulerSignal, - uint threads); + uint threads, uint64_t aql_wrap); //! Runs a blit kernel for GWS init bool RunGwsInit(uint32_t value //!< Initial value for GWS resource diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 44f3822d33..3274cb22f3 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -3464,6 +3464,25 @@ void Device::ReleaseGlobalSignal(void* signal) const { } } +// ================================================================================================ +bool Device::CreateUserEvent(amd::UserEvent* event) const { + std::unique_ptr signal(new ProfilingSignal()); + if ((signal == nullptr) || + (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal->signal_))) { + return false; + } + hsa_signal_silent_store_relaxed(signal->signal_, kInitSignalValueOne); + event->SetHwEvent(signal.release()); + return true; +} + +// ================================================================================================ +void Device::SetUserEvent(amd::UserEvent* event) const { + auto signal = reinterpret_cast(event->HwEvent()); + assert(signal != nullptr && "Can't have user event without hw event!"); + hsa_signal_silent_store_relaxed(signal->signal_, 0); +} + // ================================================================================================ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) { // Query ptr type to see if it's a HMM allocation diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 32dc66726c..0c7ce24805 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -450,6 +450,8 @@ class Device : public NullDevice { uint32_t hip_event_flags = 0) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; virtual void ReleaseGlobalSignal(void* signal) const; + virtual bool CreateUserEvent(amd::UserEvent* event) const; + virtual void SetUserEvent(amd::UserEvent* event) const; //! Allocate host memory in terms of numa policy set by user void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const; diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index d53231fdd9..9d89124cdb 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -97,6 +97,8 @@ Settings::Settings() { limit_blit_wg_ = 16; dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false; + // note: OCL user events don't allow CPU blocking calls in DD mode + blocking_blit_ = amd::IS_HIP || !AMD_DIRECT_DISPATCH; } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index dd323b86ab..b6f741db8e 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -51,7 +51,8 @@ class Settings : public device::Settings { uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint barrier_value_packet_ : 1; //!< Barrier value packet functionality uint dynamic_queues_ : 1; //!< Dynamic queues management - uint reserved_ : 22; + uint blocking_blit_ : 1; //!< Blit ops can be blocking on CPU + uint reserved_ : 21; }; uint value_; }; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index d5247ad82c..91b8e5146e 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1209,6 +1209,12 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, return false; } +// ================================================================================================ +void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) { + barrier_packet_.dep_signal[0] = signal; + dispatchBarrierPacket(kBarrierPacketHeader, false); +} + // ================================================================================================ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, hsa_signal_t signal) { @@ -1413,9 +1419,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, deviceQueueSize_(0), maskGroups_(0), schedulerThreads_(0), - schedulerParam_(nullptr), schedulerQueue_(nullptr), - schedulerSignal_({0}), barriers_(*this), managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_), managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_), @@ -1490,18 +1494,10 @@ VirtualGPU::~VirtualGPU() { delete printfdbg_; - if (0 != schedulerSignal_.handle) { - hsa_signal_destroy(schedulerSignal_); - } - if (nullptr != schedulerQueue_) { hsa_queue_destroy(schedulerQueue_); } - if (nullptr != schedulerParam_) { - schedulerParam_->release(); - } - if (nullptr != virtualQueue_) { virtualQueue_->release(); } @@ -2604,7 +2600,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) { // If we have host memory, use it if ((devMemory->owner()->getHostMem() != nullptr) && (devMemory->owner()->getSvmPtr() == nullptr)) { - if (!devMemory->isHostMemDirectAccess()) { + if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) { // Make sure GPU finished operation before synchronization with the backing store releaseGpuMemoryFence(); } @@ -3081,18 +3077,11 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { // ================================================================================================ bool VirtualGPU::createSchedulerParam() { - if (nullptr != schedulerParam_) { + if (nullptr != schedulerQueue_) { return true; } - while(true) { - schedulerParam_ = new (dev().context()) amd::Buffer(dev().context(), - CL_MEM_ALLOC_HOST_PTR, sizeof(SchedulerParam) + sizeof(AmdAqlWrap)); - - if ((nullptr != schedulerParam_) && !schedulerParam_->create(nullptr)) { - break; - } - + while (true) { // The queue is written by multiple threads of the scheduler kernel if (HSA_STATUS_SUCCESS != hsa_queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI, callbackQueue, &roc_device_, std::numeric_limits::max(), @@ -3100,39 +3089,14 @@ bool VirtualGPU::createSchedulerParam() break; } - hsa_signal_t signal0 = {0}; - - if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal0)) { - break; - } - - schedulerSignal_ = signal0; - - Memory* schedulerMem = dev().getRocMemory(schedulerParam_); - - if (nullptr == schedulerMem) { - break; - } - - schedulerParam_->setVirtualDevice(this); return true; } - if (0 != schedulerSignal_.handle) { - hsa_signal_destroy(schedulerSignal_); - schedulerSignal_.handle = 0; - } - if (nullptr != schedulerQueue_) { hsa_queue_destroy(schedulerQueue_); schedulerQueue_ = nullptr; } - if (nullptr != schedulerParam_) { - schedulerParam_->release(); - schedulerParam_ = nullptr; - } - return false; } @@ -3355,7 +3319,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, global[i] = static_cast(sizes.global()[i]); local[i] = static_cast(local_size[i]); } - + uint64_t spVA = 0; // Check if runtime has to setup hidden arguments for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) { const auto& it = signature.at(i); @@ -3415,14 +3379,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, break; } case amd::KernelParameterDescriptor::HiddenCompletionAction: { - uint64_t spVA = 0; - if (nullptr != schedulerParam_ && devKernel->dynamicParallelism()) { - Memory* schedulerMem = dev().getRocMemory(schedulerParam_); - AmdAqlWrap* wrap = reinterpret_cast( - reinterpret_cast(schedulerParam_->getHostMem()) + sizeof(SchedulerParam)); + if (devKernel->dynamicParallelism()) { + auto params = allocKernArg(sizeof(AmdAqlWrap), 64); + AmdAqlWrap* wrap = reinterpret_cast(params); memset(wrap, 0, sizeof(AmdAqlWrap)); wrap->state = AQL_WRAP_DONE; - spVA = reinterpret_cast(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam); + spVA = reinterpret_cast(wrap); } WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_); break; @@ -3651,8 +3613,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchBarrierPacket(kBarrierPacketHeader, true); if (virtualQueue_ != nullptr) { static_cast(blitMgr()).runScheduler( - getVQVirtualAddress(), schedulerParam_, schedulerQueue_, - schedulerSignal_, schedulerThreads_); + getVQVirtualAddress(), schedulerQueue_, + schedulerThreads_, spVA); } } @@ -3833,10 +3795,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) { // ================================================================================================ void VirtualGPU::addPinnedMem(amd::Memory* mem) { - //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait - //! unconditionally, before it can release pinned memory - releaseGpuMemoryFence(); if (!AMD_DIRECT_DISPATCH) { + //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait + //! unconditionally, before it can release pinned memory + releaseGpuMemoryFence(); if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { if (pinnedMems_.size() > 7) { pinnedMems_.front()->release(); @@ -3847,7 +3809,14 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) { pinnedMems_.push_back(mem); } } else { - mem->release(); + if (command_ != nullptr) { + command_->AddPinnedMemory(mem); + } else { + //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait + //! unconditionally, before it can release pinned memory + releaseGpuMemoryFence(); + mem->release(); + } } } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 1d9c8b756b..1c54f653c3 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -450,8 +450,9 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } void setFenceDirty(bool state) { fence_dirty_ = state; } - void HiddenHeapInit(); + void WaitCompleteSignal(hsa_signal_t signal); + void HiddenHeapInit(); void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; } uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); } uint64_t getQueueID(); @@ -592,9 +593,7 @@ class VirtualGPU : public device::VirtualDevice { //!< one thread uint schedulerThreads_; //!< The number of scheduler threads - amd::Memory* schedulerParam_; hsa_queue_t* schedulerQueue_; - hsa_signal_t schedulerSignal_; HwQueueTracker barriers_; //!< Tracks active barriers in ROCr diff --git a/projects/clr/rocclr/os/os.hpp b/projects/clr/rocclr/os/os.hpp index e8b7c0d2ad..d830361229 100644 --- a/projects/clr/rocclr/os/os.hpp +++ b/projects/clr/rocclr/os/os.hpp @@ -285,6 +285,9 @@ class Os : AllStatic { //! Deletes file static int unlink(const std::string& path); + //! Removes the shared memory object name + static int shm_unlink(const std::string& path); + // Library routines: // typedef bool (*SymbolCallback)(std::string, const void*, void*); diff --git a/projects/clr/rocclr/os/os_posix.cpp b/projects/clr/rocclr/os/os_posix.cpp index 6ee367842c..8837c575b5 100644 --- a/projects/clr/rocclr/os/os_posix.cpp +++ b/projects/clr/rocclr/os/os_posix.cpp @@ -650,6 +650,7 @@ std::string Os::getTempFileName() { } int Os::unlink(const std::string& path) { return ::unlink(path.c_str()); } +int Os::shm_unlink(const std::string& path) { return ::shm_unlink(path.c_str()); } #if defined(ATI_ARCH_X86) void Os::cpuid(int regs[4], int info) { diff --git a/projects/clr/rocclr/os/os_win32.cpp b/projects/clr/rocclr/os/os_win32.cpp index 68a87f15f7..b933dfd452 100644 --- a/projects/clr/rocclr/os/os_win32.cpp +++ b/projects/clr/rocclr/os/os_win32.cpp @@ -505,6 +505,9 @@ std::string Os::getTempFileName() { int Os::unlink(const std::string& path) { return ::_unlink(path.c_str()); } +// shm_unlink is a nop on windows +int Os::shm_unlink(const std::string& path) { return 0; } + void Os::cpuid(int regs[4], int info) { return __cpuid(regs, info); } uint64_t Os::xgetbv(uint32_t ecx) { return (uint64_t)_xgetbv(ecx); } diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index 6a20fe9e26..d9361703e7 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -269,7 +269,9 @@ bool Event::notifyCmdQueue(bool cpu_wait) { ScopedLock l(notify_lock_); if ((status() > CL_COMPLETE) && (nullptr != queue) && // If HW event was assigned, then notification can be ignored, since a barrier was issued - (HwEvent() == nullptr) && + // @note: Force the marker always in OCL for now, since OCL events require precise + // sequence of the status update + ((HwEvent() == nullptr) || !amd::IS_HIP) && !notified_.test_and_set()) { // Make sure the queue is draining the enqueued commands. amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait); @@ -364,8 +366,17 @@ void Command::enqueue() { // Notify all commands about the waiter. Barrier will be sent in order to obtain // HSA signal for a wait on the current queue - for (const auto& event : eventWaitList()) { - event->notifyCmdQueue(!kCpuWait); + for (const auto &event: eventWaitList()) { + if (!amd::IS_HIP && event->command().type() == CL_COMMAND_USER) { + if (event->status() >= CL_COMPLETE) { + reinterpret_cast(event)->AddDependent(this); + } else { + setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + return; + } + } else { + event->notifyCmdQueue(!kCpuWait); + } } // The batch update must be lock protected to avoid a race condition diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index 3dbc814c54..977014f69d 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -397,6 +397,9 @@ class Command : public Event { //! Release the resources associated with this event. virtual void releaseResources(); + //! Empty function for adding pinned memory + virtual void AddPinnedMemory(Memory* pinned) {} + //! Set the next GPU command void setNext(Command* next) { next_ = next; } @@ -424,16 +427,46 @@ class Command : public Event { }; class UserEvent : public Command { - const Context& context_; + const Context& context_; //!< OCL context associated with the event + std::vector dependents_; //!< Commands, which depends on this user event public: UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) { setStatus(CL_SUBMITTED); } + //! Creates a user event in the backend layer + bool Create() { + if (AMD_DIRECT_DISPATCH) { + return context_.devices()[0]->CreateUserEvent(this); + } else { + return true; + } + } + + //! Sets the execution status of the user event + bool SetExecutionStatus(cl_int status) { + if (AMD_DIRECT_DISPATCH) { + // If it's invalid status, then mark dependent commands as invalid + if (status < CL_COMPLETE) { + for (auto it : dependents_) { + it->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + } + } + dependents_.clear(); + context_.devices()[0]->SetUserEvent(this); + } + return setStatus(status); + } + + //! Adds dependent commands for the user event + void AddDependent(Command* command) { + dependents_.push_back(command); + } + virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); } - virtual const Context& context() const { return context_; } + const Context& context() const { return context_; } }; class ClGlEvent : public Command { @@ -450,7 +483,7 @@ class ClGlEvent : public Command { bool awaitCompletion() { return waitForFence(); } - virtual const Context& context() const { return context_; } + const Context& context() const { return context_; } }; inline Command& Event::command() { return *static_cast(this); } @@ -465,6 +498,7 @@ class NDRangeContainer; class OneMemoryArgCommand : public Command { protected: Memory* memory_; + std::vector pinned_memory_; //!< Pinned memory object public: OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList, @@ -477,8 +511,13 @@ class OneMemoryArgCommand : public Command { memory_->release(); DEBUG_ONLY(memory_ = NULL); Command::releaseResources(); + for (auto it : pinned_memory_) { + it->release(); + } } + //! Adds pinned memory, used in this command for later release + virtual void AddPinnedMemory(Memory* pinned) override { pinned_memory_.push_back(pinned); } bool validateMemory(); bool validatePeerMemory(); }; diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp index 665e5313f8..6e8d5a96ad 100644 --- a/projects/clr/rocclr/platform/commandqueue.cpp +++ b/projects/clr/rocclr/platform/commandqueue.cpp @@ -173,6 +173,9 @@ void HostQueue::finish(bool cpu_wait) { (vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) { cpu_wait = true; } + } else { + // Force CPU wait for OpenCL, since the tests may check OCL command status after finish + cpu_wait = true; } size_t batchSize = GetSubmissionBatchSize();