diff --git a/projects/clr/rocclr/device/pal/paldevice.hpp b/projects/clr/rocclr/device/pal/paldevice.hpp index 2c7d0e539d..8292218126 100644 --- a/projects/clr/rocclr/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/device/pal/paldevice.hpp @@ -253,15 +253,16 @@ class Device : public NullDevice { uint32_t index_; //!< HW queue index for scratch buffer access amd::Monitor queue_lock_; //!< Queue lock for access AqlPacketMgmt aql_packet_mgmt_; //!< AQL packets management class for debugger support - QueueRecycleInfo() + QueueRecycleInfo(const Device& dev) : counter_(1), engineType_(Pal::EngineTypeCompute), index_(0), - queue_lock_(true) /* Queue lock for sharing */ {} + queue_lock_(true) /* Queue lock for sharing */, + aql_packet_mgmt_(dev) {} - //! Returns the aql packet list - uintptr_t AqlPacketList() const { - return reinterpret_cast(&aql_packet_mgmt_.aql_packets_); + //! Returns the MQD's read_dispatch_id's address. + uintptr_t DebuggerData() const { + return reinterpret_cast(&aql_packet_mgmt_.amd_queue_.read_dispatch_id); } }; diff --git a/projects/clr/rocclr/device/pal/palkernel.cpp b/projects/clr/rocclr/device/pal/palkernel.cpp index 4d51bd2729..b597328c23 100644 --- a/projects/clr/rocclr/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/device/pal/palkernel.cpp @@ -172,12 +172,10 @@ const pal::Program& Kernel::prog() const { return reinterpret_cast(prog_); } -hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel, - const amd::NDRangeContainer& sizes, - const_address params, size_t ldsAddress, - uint64_t vmDefQueue, - uint64_t* vmParentWrap, - uint32_t* aql_index) const { +std::pair +HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel, + const amd::NDRangeContainer& sizes, const_address params, + size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const { // Provide private and local heap addresses static constexpr uint AddressShift = LP64_SWITCH(0, 32); const_address parameters = params; @@ -364,7 +362,7 @@ hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd:: std::min(static_cast(argsBufferSize()), signature.paramsSize())); } - hsa_kernel_dispatch_packet_t* hsaDisp = gpu.GetAqlPacketSlot(aql_index); + auto&& [hsaDisp, aql_packet_id] = gpu.GetAqlPacketSlot(); constexpr uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | @@ -401,7 +399,7 @@ hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd:: gpu.addVmMemory(gpu.hsaQueueMem()); } - return hsaDisp; + return {hsaDisp, aql_packet_id}; } bool Kernel::setKernelDescriptor(amd::hsa::loader::Symbol* sym, diff --git a/projects/clr/rocclr/device/pal/palkernel.hpp b/projects/clr/rocclr/device/pal/palkernel.hpp index ccbd9a2093..afd2e6b468 100644 --- a/projects/clr/rocclr/device/pal/palkernel.hpp +++ b/projects/clr/rocclr/device/pal/palkernel.hpp @@ -104,15 +104,14 @@ class Kernel : public device::Kernel { //! Returns AQL packet in CPU memory //! if the kernel arguments were successfully loaded, otherwise NULL - hsa_kernel_dispatch_packet_t* loadArguments( - VirtualGPU& gpu, //!< Running GPU context - const amd::Kernel& kernel, //!< AMD kernel object - const amd::NDRangeContainer& sizes, //!< NDrange container - const_address params, //!< Application arguments for the kernel - size_t ldsAddress, //!< LDS address that includes all arguments. - uint64_t vmDefQueue, //!< GPU VM default queue pointer - uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object - uint32_t* aql_index //!< AQL packet index in the packets array for debugger + std::pair + loadArguments(VirtualGPU& gpu, //!< Running GPU context + const amd::Kernel& kernel, //!< AMD kernel object + const amd::NDRangeContainer& sizes, //!< NDrange container + const_address params, //!< Application arguments for the kernel + size_t ldsAddress, //!< LDS address that includes all arguments. + uint64_t vmDefQueue, //!< GPU VM default queue pointer + uint64_t* vmParentWrap //!< GPU VM parent aql wrap object ) const; //! Returns the kernel index in the program diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 33445a1ceb..587914e8fc 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -52,6 +52,37 @@ namespace amd::pal { +AqlPacketMgmt::AqlPacketMgmt(const Device& dev) { + memset(aql_vgpus_, 0, sizeof(aql_vgpus_)); + + static_assert(sizeof(decltype(amd_queue_)::read_dispatch_id) == sizeof(uint64_t)); + static_assert(sizeof(decltype(amd_queue_)::write_dispatch_id) == sizeof(uint64_t)); + + // Initialize the amd_queue_ + amd_queue_.hsa_queue.type = HSA_QUEUE_TYPE_MULTI; + amd_queue_.hsa_queue.features = HSA_QUEUE_FEATURE_KERNEL_DISPATCH; + amd_queue_.hsa_queue.base_address = &aql_packets_[0]; + amd_queue_.hsa_queue.size = sizeof(aql_packets_) / sizeof(aql_packets_[0]); + amd_queue_.hsa_queue.id = []() { + static std::atomic queue_counter; + return queue_counter++; + }(); + amd_queue_.read_dispatch_id_field_base_byte_offset = + offsetof(decltype(amd_queue_), read_dispatch_id) - offsetof(decltype(amd_queue_), hsa_queue); + + amd_queue_.max_cu_id = dev.properties().gfxipProperties.shaderCore.numAvailableCus - 1; + amd_queue_.max_wave_id = dev.properties().gfxipProperties.shaderCore.numSimdsPerCu * + dev.properties().gfxipProperties.shaderCore.numWavefrontsPerSimd - + 1; + + amd_queue_.private_segment_aperture_base_hi = static_cast( + dev.properties().gpuMemoryProperties.privateApertureBase >> LP64_SWITCH(0, 32)); + amd_queue_.group_segment_aperture_base_hi = static_cast( + dev.properties().gpuMemoryProperties.sharedApertureBase >> LP64_SWITCH(0, 32)); + + AMD_HSA_BITS_SET(amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_IS_PTR64, LP64_SWITCH(0, 1)); +} + uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType type) { uint32_t allocedQueues = 0; for (const auto& queue : gpu.dev().QueuePool()) { @@ -151,13 +182,13 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que uint32_t index = AllocedQueues(gpu, qCreateInfo.engineType); // Create PAL queue object if (index < GPU_MAX_HW_QUEUES) { - Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo(); + Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo(gpu.dev()); if (info == nullptr) { LogError("Could not create QueueRecycleInfo!"); return nullptr; } addrQ = reinterpret_cast
(&info[1]); - qCreateInfo.aqlPacketList = info->AqlPacketList(); + qCreateInfo.aqlPacketList = info->DebuggerData(); result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_); if (result == Pal::Result::Success) { const_cast(gpu.dev()).QueuePool().insert({queue->iQueue_, info}); @@ -193,7 +224,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que queue->lock_ = &info->queue_lock_; addrQ = reinterpret_cast
(&queue[1]); } else { - Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo(); + Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo(gpu.dev()); if (info == nullptr) { LogError("Could not create QueueRecycleInfo!"); return nullptr; @@ -202,7 +233,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que queue->aql_mgmt_ = &info->aql_packet_mgmt_; // Exclusive compute path addrQ = reinterpret_cast
(&queue[1]); - qCreateInfo.aqlPacketList = info->AqlPacketList(); + qCreateInfo.aqlPacketList = info->DebuggerData(); result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_); } if (result != Pal::Result::Success) { @@ -1072,7 +1103,7 @@ VirtualGPU::~VirtualGPU() { if (queues_[MainEngine] != nullptr) { // Clear all timestamps, associated with this virtual GPU auto& mgmt = *queues_[MainEngine]->aql_mgmt_; - for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) { + for (uint32_t i = 0; i < mgmt.amd_queue_.hsa_queue.size; ++i) { if (mgmt.aql_vgpus_[i] == this) { mgmt.aql_vgpus_[i] = nullptr; mgmt.aql_events_[i].invalidate(); @@ -2688,13 +2719,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId()); uint32_t id = gpuEvent.id_; uint64_t vmParentWrap = 0; - uint32_t aql_index = 0; // Program the kernel arguments for the GPU execution - hsa_kernel_dispatch_packet_t* aqlPkt = + auto&& [aqlPkt, aql_packet_id] = hsaKernel.loadArguments(*this, kernel, sizes, parameters, ldsSize + sharedMemBytes, - vmDefQueue, &vmParentWrap, &aql_index); + vmDefQueue, &vmParentWrap); assert((nullptr != aqlPkt) && "Couldn't load kernel arguments"); + auto& amd_queue = queues_[MainEngine]->aql_mgmt_->amd_queue_; + uint32_t aql_index = aql_packet_id % amd_queue.hsa_queue.size; + // Dynamic call stack size is considered to calculate private segment size and scratch regs // in pal::Kernel::postLoad(). As it is not called during hipModuleLaunchKernel unlike // hipLaunchKernel/hipLaunchKernelGGL, Updated value is passed to dispatch packet. @@ -2729,9 +2762,46 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false; dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize(); dispatchParam.aqlPacketIndex = aql_index; + + // Update the mqd's information about scratch memory. + amd_queue.scratch_backing_memory_location = static_cast(dispatchParam.scratchAddr); + amd_queue.scratch_backing_memory_byte_size = static_cast(dispatchParam.scratchSize); + + // FIXME: Conservatively, the read_dispatch_id cannot be smaller than the current aql_packet_id - + // hsa_queue.size for the debugger to work correctly. The read_dispatch_id really should be + // updated when the CmdBuf is marked as complete. + uint64_t new_read_dispatch_id = (aql_packet_id >= amd_queue.hsa_queue.size) + ? (aql_packet_id - amd_queue.hsa_queue.size + 1) + : 0; + + // Do an atomic max of &amd_queue.read_dispatch_id and new_read_dispatch_id + uint64_t old_read_dispatch_id = amd_queue.read_dispatch_id; + while (new_read_dispatch_id > old_read_dispatch_id) { +#if defined(__GNUC__) + if (__atomic_compare_exchange_n(&amd_queue.read_dispatch_id, &old_read_dispatch_id, + new_read_dispatch_id, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) + break; +#elif defined(_MSC_VER) + uint64_t initial_value = InterlockedCompareExchange64( + reinterpret_cast(&amd_queue.read_dispatch_id), new_read_dispatch_id, + old_read_dispatch_id); + if (initial_value == old_read_dispatch_id) break; + old_read_dispatch_id = initial_value; +#else // !defined (_MSV_VER) && !defined(__GNUC__) +#error Not implemented +#endif // !defined (_MSV_VER) && !defined(__GNUC__) + } + // Run AQL dispatch in HW eventBegin(MainEngine); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 954 iCmd()->CmdDispatchAql(dispatchParam); +#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 954 + Pal::DispatchAqlFeedback feedback{}; + iCmd()->CmdDispatchAql(dispatchParam, &feedback); + amd_queue.compute_tmpring_size = feedback.tmpRingSize; +#endif // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 954 if (id != gpuEvent.id_) { LogError("Something is wrong. ID mismatch!\n"); diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index 30d216f7a6..6cc134e270 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -36,6 +36,11 @@ #include "palQueue.h" #include "palFence.h" #include "palLinearAllocator.h" +#include "amd_hsa_queue.h" + +#ifdef _WIN32 +#include +#endif // _WIN32 /*! \addtogroup PAL PAL Resource Implementation * @{ @@ -55,12 +60,13 @@ class Kernel; struct AqlPacketMgmt : public amd::EmbeddedObject { static constexpr uint32_t kAqlPacketsListSize = 4 * Ki; - AqlPacketMgmt() : packet_index_(0) { memset(aql_vgpus_, 0, sizeof(aql_vgpus_)); } + AqlPacketMgmt(const Device& dev); - hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize]; //!< The list of AQL packets + amd_queue_t amd_queue_{}; + alignas(sizeof(hsa_kernel_dispatch_packet_t)) + hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize]; //!< The list of AQL packets GpuEvent aql_events_[kAqlPacketsListSize]; //!< The list of gpu for each AQL packet VirtualGPU* aql_vgpus_[kAqlPacketsListSize]; //!< The list of vgpus which had submissions - std::atomic packet_index_; //!< The active packet slot index }; enum class BarrierType : uint8_t { @@ -596,15 +602,26 @@ class VirtualGPU : public device::VirtualDevice { } //! Returns the current active slot for AQL packet - hsa_kernel_dispatch_packet_t* GetAqlPacketSlot(uint32_t* index) { + std::pair + GetAqlPacketSlot() const { auto& mgmt = *queues_[MainEngine]->aql_mgmt_; // Atomic increment global AQL index and wrap around max AQL list size - *index = ++mgmt.packet_index_ % AqlPacketMgmt::kAqlPacketsListSize; - if (mgmt.aql_events_[*index].isValid()) { + uint64_t packet_id = +#if defined(__GNUC__) + __atomic_fetch_add(&mgmt.amd_queue_.write_dispatch_id, 1, __ATOMIC_RELAXED); +#elif defined(_MSC_VER) + InterlockedExchangeAdd64( + reinterpret_cast(&mgmt.amd_queue_.write_dispatch_id), 1); +#else // !defined (_MSV_VER) && !defined(__GNUC__) +#error Not implemented +#endif // !defined (_MSV_VER) && !defined(__GNUC__) + + uint32_t index = packet_id % mgmt.amd_queue_.hsa_queue.size; + if (mgmt.aql_events_[index].isValid()) { // Make sure GPU doesn't process this slot - mgmt.aql_vgpus_[*index]->waitForEvent(&mgmt.aql_events_[*index]); + mgmt.aql_vgpus_[index]->waitForEvent(&mgmt.aql_events_[index]); } - return &mgmt.aql_packets_[*index]; + return {&mgmt.aql_packets_[index], packet_id}; } protected: