From 950dccb283cc038bd78795ffd547b83f2fa9dd2d Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 1 Aug 2019 16:24:33 -0400 Subject: [PATCH] P4 to Git Change 1977224 by gandryey@gera-hip-lnx on 2019/08/01 16:17:51 SWDEV-193423 - HIP/VDI - Support for lazy hsa queue creation - Add queue pool support for HSA HW queues. GPU_MAX_HW_QUEUES controls the pool size. The current default value is 4 (the number of active pipes on GPU). Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#132 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#38 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#81 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#314 edit --- rocclr/runtime/device/rocm/rocdevice.cpp | 4 - rocclr/runtime/device/rocm/rocdevice.hpp | 3 + rocclr/runtime/device/rocm/rocvirtual.cpp | 103 ++++++++++++++++------ rocclr/runtime/device/rocm/rocvirtual.hpp | 6 +- rocclr/runtime/utils/flags.hpp | 4 +- 5 files changed, 83 insertions(+), 37 deletions(-) diff --git a/rocclr/runtime/device/rocm/rocdevice.cpp b/rocclr/runtime/device/rocm/rocdevice.cpp index 35f935bfa8..8c4f2e2bf4 100644 --- a/rocclr/runtime/device/rocm/rocdevice.cpp +++ b/rocclr/runtime/device/rocm/rocdevice.cpp @@ -1411,10 +1411,6 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) { return nullptr; } - if (profiling) { - hsa_amd_profiling_set_profiler_enabled(virtualDevice->gpu_queue(), 1); - } - return virtualDevice; } diff --git a/rocclr/runtime/device/rocm/rocdevice.hpp b/rocclr/runtime/device/rocm/rocdevice.hpp index 7b7b8fb58c..2ec5b65273 100644 --- a/rocclr/runtime/device/rocm/rocdevice.hpp +++ b/rocclr/runtime/device/rocm/rocdevice.hpp @@ -406,6 +406,8 @@ class Device : public NullDevice { VirtualGPU* xferQueue() const; + std::map& QueuePool() { return queue_pool_; } + private: static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table; @@ -436,6 +438,7 @@ class Device : public NullDevice { std::atomic freeMem_; //!< Total of free memory available mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode + std::map queue_pool_; //!< Pool of HSA queues for recycling public: amd::Atomic numOfVgpus_; //!< Virtual gpu unique index diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp index f6af73f13f..950bbce25d 100644 --- a/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -48,6 +48,9 @@ namespace roc { // (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) invalidates L1, L2 and flushes // L2 +static const uint16_t kInvalidAql = + (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE); + static const uint16_t kDispatchPacketHeaderNoSync = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | @@ -387,13 +390,18 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para return true; } +static inline void packet_store_release(uint32_t* packet, uint16_t header, uint16_t rest) { + __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE); +} + template -bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, bool blocking, size_t size) { +bool VirtualGPU::dispatchGenericAqlPacket( + AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size) { const uint32_t queueSize = gpu_queue_->size; const uint32_t queueMask = queueSize - 1; // Check for queue full and wait if needed. - uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_); + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); hsa_signal_t signal; @@ -412,7 +420,11 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, bool blocking, size timestamp_->setAgent(gpu_device_); } - if (blocking || (index - read) == queueMask) { + // Make sure the slot is free for usage + while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); + + // Add blocking command if the original value of read index was behind of the queue size + if (blocking || (index - read) >= queueMask) { if (packet->completion_signal.handle == 0) { packet->completion_signal = barrier_signal_; } @@ -426,10 +438,14 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, bool blocking, size // NOTE: need multiple packets to dispatch the performance counter // packet blob of the legacy devices (gfx8) for (uint i = 0; i < size; i++, index++, packet++) { - ((AqlPacket*)(gpu_queue_->base_address))[index & queueMask] = *packet; + AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask]; + *aql_loc = *packet; + if (header != 0) { + packet_store_release(reinterpret_cast(aql_loc), header, rest); + } } - hsa_queue_store_write_index_release(gpu_queue_, index); - hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index-1); + //hsa_queue_store_write_index_release(gpu_queue_, index); + hsa_signal_store_release(gpu_queue_->doorbell_signal, index - 1); // Wait on signal ? if (blocking) { @@ -446,12 +462,14 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, bool blocking, size return true; } -bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, bool blocking) { - return dispatchGenericAqlPacket(packet, blocking); +bool VirtualGPU::dispatchAqlPacket( + hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) { + return dispatchGenericAqlPacket(packet, header, rest, blocking); } -bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, bool blocking) { - return dispatchGenericAqlPacket(packet, blocking); +bool VirtualGPU::dispatchAqlPacket( + hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) { + return dispatchGenericAqlPacket(packet, header, rest, blocking); } bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, @@ -467,13 +485,13 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, { // Create legacy devices PM4 data hsa_ext_amd_aql_pm4_packet_t pm4Packet[SLOT_PM4_SIZE_AQLP]; extApi->hsa_ven_amd_aqlprofile_legacy_get_pm4(packet, static_cast(&pm4Packet[0])); - return dispatchGenericAqlPacket(&pm4Packet[0], blocking, SLOT_PM4_SIZE_AQLP); + return dispatchGenericAqlPacket(&pm4Packet[0], 0, 0, blocking, SLOT_PM4_SIZE_AQLP); } break; case PerfCounter::ROC_GFX9: { packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; - return dispatchGenericAqlPacket(packet, blocking); + return dispatchGenericAqlPacket(packet, 0, 0, blocking); } break; } @@ -485,13 +503,16 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) { assert(packet->completion_signal.handle != 0); const uint32_t queueSize = gpu_queue_->size; const uint32_t queueMask = queueSize - 1; + uint32_t header = kBarrierPacketHeader; - uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_); - ((hsa_barrier_and_packet_t*)(gpu_queue_->base_address))[index & queueMask] = *packet; + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); + while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); + hsa_barrier_and_packet_t* aql_loc = + &(reinterpret_cast(gpu_queue_->base_address))[index & queueMask]; + *aql_loc = *packet; + __atomic_store_n(reinterpret_cast(aql_loc), kBarrierPacketHeader, __ATOMIC_RELEASE); - hsa_queue_store_write_index_relaxed(gpu_queue_, index + 1); - - hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index); + hsa_signal_store_release(gpu_queue_->doorbell_signal, index); } /** @@ -569,8 +590,6 @@ VirtualGPU::~VirtualGPU() { // Release the resources of signal releaseGpuMemoryFence(); - hsa_status_t err = hsa_queue_destroy(gpu_queue_); - if (barrier_signal_.handle != 0) { hsa_signal_destroy(barrier_signal_); } @@ -613,6 +632,13 @@ VirtualGPU::~VirtualGPU() { for (uint idx = index(); idx < roc_device_.vgpus().size(); ++idx) { roc_device_.vgpus()[idx]->index_--; } + // Decrement the counter + roc_device_.QueuePool()[gpu_queue_]--; + // Release the queue if the counter is 0 + if (roc_device_.QueuePool()[gpu_queue_] == 0) { + hsa_status_t err = hsa_queue_destroy(gpu_queue_); + roc_device_.QueuePool().erase(gpu_queue_); + } } bool VirtualGPU::create(bool profilingEna) { @@ -630,13 +656,28 @@ bool VirtualGPU::create(bool profilingEna) { // Pick a reasonable queue size uint32_t queue_size = 1024; queue_size = (queue_max_packets < queue_size) ? queue_max_packets : queue_size; - while (hsa_queue_create(gpu_device_, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr, + if (roc_device_.QueuePool().size() < GPU_MAX_HW_QUEUES) { + while (hsa_queue_create(gpu_device_, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr, std::numeric_limits::max(), std::numeric_limits::max(), &gpu_queue_) != HSA_STATUS_SUCCESS) { - queue_size >>= 1; - if (queue_size < 64) { - return false; + queue_size >>= 1; + if (queue_size < 64) { + return false; + } } + hsa_amd_profiling_set_profiler_enabled(gpu_queue(), 1); + roc_device_.QueuePool().insert({gpu_queue_, 1}); + } else { + int usage = std::numeric_limits::max(); + // Loop through all allocated queues and find the lowest usage + for (const auto it : roc_device_.QueuePool()) { + if (it.second < usage) { + gpu_queue_ = it.first; + usage = it.second; + } + } + // Increment the usage of the current queue + roc_device_.QueuePool()[gpu_queue_]++; } if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) { @@ -660,7 +701,7 @@ bool VirtualGPU::create(bool profilingEna) { // Initialize barrier packet. memset(&barrier_packet_, 0, sizeof(barrier_packet_)); - barrier_packet_.header = kBarrierPacketHeader; + barrier_packet_.header = kInvalidAql; barrier_packet_.completion_signal = barrier_signal_; // Create a object of PrintfDbg @@ -2100,10 +2141,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const hsa_kernel_dispatch_packet_t dispatchPacket; memset(&dispatchPacket, 0, sizeof(dispatchPacket)); + dispatchPacket.header = kInvalidAql; dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle(); - dispatchPacket.header = aqlHeader_; - dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + // dispatchPacket.header = aqlHeader_; + // dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; dispatchPacket.grid_size_x = sizes.dimensions() > 0 ? newGlobalSize[0] : 1; dispatchPacket.grid_size_y = sizes.dimensions() > 1 ? newGlobalSize[1] : 1; dispatchPacket.grid_size_z = sizes.dimensions() > 2 ? newGlobalSize[2] : 1; @@ -2119,7 +2161,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_; // Dispatch the packet - if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)) { + if (!dispatchAqlPacket( + &dispatchPacket, aqlHeader_, + (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS), + GPU_FLUSH_ON_EXECUTION)) { return false; } } @@ -2219,7 +2264,7 @@ void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) { profilingBegin(vcmd); auto fence = kBarrierAcquirePacket; - dispatchAqlPacket(&fence, false); + dispatchAqlPacket(&fence, 0, 0, false); profilingEnd(vcmd); } @@ -2228,7 +2273,7 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) { amd::ScopedLock lock(execution()); profilingBegin(vcmd); auto fence = kBarrierReleasePacket; - dispatchAqlPacket(&fence, false); + dispatchAqlPacket(&fence, 0, 0, false); profilingEnd(vcmd); } diff --git a/rocclr/runtime/device/rocm/rocvirtual.hpp b/rocclr/runtime/device/rocm/rocvirtual.hpp index ea84bf1764..97153fbaeb 100644 --- a/rocclr/runtime/device/rocm/rocvirtual.hpp +++ b/rocclr/runtime/device/rocm/rocvirtual.hpp @@ -249,9 +249,9 @@ class VirtualGPU : public device::VirtualDevice { // } roc OpenCL integration private: - bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, bool blocking = true); - bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, bool blocking = true); - template bool dispatchGenericAqlPacket(AqlPacket* packet, bool blocking, size_t size = 1); + bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true); + bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true); + template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size = 1); void dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet); bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi); void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet, amd::NDRangeContainer& sizes); diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp index 85e815ca95..549ee5a54b 100644 --- a/rocclr/runtime/utils/flags.hpp +++ b/rocclr/runtime/utils/flags.hpp @@ -115,7 +115,7 @@ release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200), \ "Force GPU opencl verison") \ release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \ "Enable HSA device local memory usage") \ -release(uint, HSA_KERNARG_POOL_SIZE, 2 * 1024 * 1024, \ +release(uint, HSA_KERNARG_POOL_SIZE, 512 * 1024, \ "Kernarg pool size") \ release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true, \ "Enable device memory for coarse grain SVM allocations") \ @@ -177,6 +177,8 @@ release(bool, GPU_ENABLE_COOP_GROUPS, false, \ "Enables cooperative group launch") \ release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \ "The maximum number of command buffers allocated per queue") \ +release(uint, GPU_MAX_HW_QUEUES, 4, \ + "The maximum number of HW queues allocated per device") \ release(bool, GPU_IMAGE_BUFFER_WAR, true, \ "Enables image buffer workaround") \ release(cstring, HIP_VISIBLE_DEVICES, "", \