From fb7da41d27abe316580862e3752dea80ed29dbde Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 16 Oct 2019 11:24:09 -0400 Subject: [PATCH] P4 to Git Change 2014404 by gandryey@gera-win10 on 2019/10/16 11:13:37 SWDEV-184710 - Support hipLaunchCooperativeKernelMultiDevice() - Add support for multi grid launch in hip - Detect the new hidden argument and pass the required information for the kernel launch - Memory for synchronization is allocated as a single object and then the offset for each GPU is found Affected files ... ... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#44 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#343 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#25 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#17 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#82 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#136 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#42 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#90 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#30 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#99 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#97 edit [ROCm/clr commit: 6e7e97987fc9df4ebc03cca491a16c1bd12870f5] --- projects/clr/rocclr/runtime/device/device.hpp | 23 +++++++- .../clr/rocclr/runtime/device/devkernel.cpp | 3 + .../clr/rocclr/runtime/device/devkernel.hpp | 17 +++--- .../rocclr/runtime/device/pal/palkernel.cpp | 2 + .../rocclr/runtime/device/rocm/rocdevice.cpp | 37 +++++++----- .../rocclr/runtime/device/rocm/rocdevice.hpp | 14 ++++- .../rocclr/runtime/device/rocm/rocvirtual.cpp | 56 ++++++++++++++----- .../rocclr/runtime/device/rocm/rocvirtual.hpp | 2 +- .../clr/rocclr/runtime/platform/command.cpp | 19 +++++-- .../clr/rocclr/runtime/platform/command.hpp | 30 ++++++++-- 10 files changed, 151 insertions(+), 52 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp index 932d543074..f273ecf37f 100644 --- a/projects/clr/rocclr/runtime/device/device.hpp +++ b/projects/clr/rocclr/runtime/device/device.hpp @@ -1153,7 +1153,24 @@ class Device : public RuntimeObject { typedef aclCompiler Compiler; public: + // The structures below for MGPU launch match the device library format + struct MGSyncData { + uint32_t w0; + uint32_t w1; + }; + + struct MGSyncInfo { + struct MGSyncData* mgs; + uint32_t grid_id; + uint32_t num_grids; + uint64_t prev_sum; + uint64_t all_sum; + }; + static constexpr size_t kP2PStagingSize = 4 * Mi; + static constexpr size_t kMGSyncDataSize = sizeof(MGSyncData); + static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo); + typedef std::list CommandQueues; struct BlitProgram : public amd::HeapObject { @@ -1409,9 +1426,9 @@ class Device : public RuntimeObject { std::unique_ptr cacheCompilation_; #endif - static amd::Context* glb_ctx_; //!< Global context with all devices - static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources - static Memory* p2p_stage_; //!< Staging resources + static amd::Context* 
glb_ctx_; //!< Global context with all devices + static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources + static Memory* p2p_stage_; //!< Staging resources private: bool IsTypeMatching(cl_device_type type, bool offlineDevices); diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp index 6fcab423aa..9fb0f69144 100644 --- a/projects/clr/rocclr/runtime/device/devkernel.cpp +++ b/projects/clr/rocclr/runtime/device/devkernel.cpp @@ -752,6 +752,9 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH case ValueKind::HiddenCompletionAction: *isHidden = true; return amd::KernelParameterDescriptor::HiddenCompletionAction; + case ValueKind::HiddenMultiGridSyncArg: + *isHidden = true; + return amd::KernelParameterDescriptor::HiddenMultiGridSync; case ValueKind::HiddenNone: default: *isHidden = true; diff --git a/projects/clr/rocclr/runtime/device/devkernel.hpp b/projects/clr/rocclr/runtime/device/devkernel.hpp index 561dfc087d..f944584252 100644 --- a/projects/clr/rocclr/runtime/device/devkernel.hpp +++ b/projects/clr/rocclr/runtime/device/devkernel.hpp @@ -110,7 +110,8 @@ static const std::map ArgValueKind = {"HiddenNone", ValueKind::HiddenNone}, {"HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer}, {"HiddenDefaultQueue", ValueKind::HiddenDefaultQueue}, - {"HiddenCompletionAction", ValueKind::HiddenCompletionAction} + {"HiddenCompletionAction", ValueKind::HiddenCompletionAction}, + {"HiddenMultigridSyncArg", ValueKind::HiddenMultiGridSyncArg} }; static const std::map ArgValueType = @@ -223,7 +224,8 @@ static const std::map ArgValueKindV3 = {"hidden_none", ValueKind::HiddenNone}, {"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer}, {"hidden_default_queue", ValueKind::HiddenDefaultQueue}, - {"hidden_completion_action", ValueKind::HiddenCompletionAction} + {"hidden_completion_action", ValueKind::HiddenCompletionAction}, + {"hidden_multigrid_sync_arg", 
ValueKind::HiddenMultiGridSyncArg} }; static const std::map ArgValueTypeV3 = @@ -317,19 +319,20 @@ struct KernelParameterDescriptor { ValueObject = 10, ImageObject = 11, SamplerObject = 12, - QueueObject = 13 + QueueObject = 13, + HiddenMultiGridSync = 14 }; clk_value_type_t type_; //!< The parameter's type size_t offset_; //!< Its offset in the parameter's stack size_t size_; //!< Its size in bytes union InfoData { struct { - uint32_t oclObject_ : 4; //!< OCL object type + uint32_t oclObject_ : 4; //!< OCL object type uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only - uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA - uint32_t defined_ : 1; //!< The argument was defined by the app + uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA + uint32_t defined_ : 1; //!< The argument was defined by the app uint32_t reserved_ : 1; //!< reserved - uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment + uint32_t arrayIndex_ : 24;//!< Index in the objects array or LDS alignment }; uint32_t allValues_; InfoData() : allValues_(0) {} diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp index 985a907095..86b73f3c70 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp @@ -336,6 +336,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const WriteAqlArgAt(const_cast
(parameters), vmParentWrap, it.size_, it.offset_); } break; + case amd::KernelParameterDescriptor::HiddenMultiGridSync: + break; } } diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp index ced55b2c01..af4d6eec95 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp @@ -51,7 +51,7 @@ hsa_agent_t roc::Device::cpu_agent_ = {0}; std::vector roc::Device::gpu_agents_; const bool roc::Device::offlineDevice_ = false; const bool roc::NullDevice::offlineDevice_ = true; - +address Device::mg_sync_ = nullptr; static HsaDeviceId getHsaDeviceId(hsa_agent_t device, uint32_t& pci_id) { if (HSA_STATUS_SUCCESS != @@ -175,6 +175,10 @@ Device::~Device() { p2p_stage_->release(); p2p_stage_ = nullptr; } + if (nullptr != mg_sync_) { + amd::SvmBuffer::free(GlbCtx(), mg_sync_); + mg_sync_ = nullptr; + } if (glb_ctx_ != nullptr) { glb_ctx_->release(); glb_ctx_ = nullptr; @@ -715,28 +719,25 @@ bool Device::create(bool sramEccEnabled) { // Use just 1 entry by default for the map cache mapCache_->push_back(nullptr); - if ((p2p_agents_.size() == 0) && - (glb_ctx_ == nullptr) && (gpu_agents_.size() > 1) && + if ((glb_ctx_ == nullptr) && (gpu_agents_.size() >= 1) && // Allow creation for the last device in the list. 
(gpu_agents_[gpu_agents_.size() - 1].handle == _bkendDevice.handle)) { - std::vector devices; uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, false); // Add all PAL devices for (uint32_t i = 0; i < numDevices; ++i) { - devices.push_back(amd::Device::devices()[i]); + devices.push_back(amd::Device::devices()[i]); } // Add current devices.push_back(this); + // Create a dummy context + glb_ctx_ = new amd::Context(devices, info); + if (glb_ctx_ == nullptr) { + return false; + } - if (devices.size() > 1) { - // Create a dummy context - glb_ctx_ = new amd::Context(devices, info); - if (glb_ctx_ == nullptr) { - return false; - } - amd::Buffer* buf = - new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize); + if ((p2p_agents_.size() == 0) && (devices.size() > 1)) { + amd::Buffer* buf = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize); if ((buf != nullptr) && buf->create()) { p2p_stage_ = buf; } @@ -745,6 +746,15 @@ bool Device::create(bool sramEccEnabled) { return false; } } + // Check if sync buffer wasn't allocated yet + if (amd::IS_HIP && mg_sync_ == nullptr) { + mg_sync_ = reinterpret_cast
(amd::SvmBuffer::malloc( + GlbCtx(), (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS), + kMGInfoSizePerDevice * GlbCtx().devices().size(), kMGInfoSizePerDevice)); + if (mg_sync_ == nullptr) { + return false; + } + } } if (settings().stagedXferSize_ != 0) { @@ -1817,6 +1827,7 @@ VirtualGPU* Device::xferQueue() const { xferQueue_->enableSyncBlit(); return xferQueue_; } + bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { bool result = true; return result; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp index 378952501e..09ffb8e825 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp @@ -410,6 +410,16 @@ class Device : public NullDevice { hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; } + //! Acquire HSA queue. This method can create a new HSA queue or + //! share previously created + hsa_queue_t* acquireQueue(uint32_t queue_size_hint); + + //! Release HSA queue + void releaseQueue(hsa_queue_t*); + + //! 
Return multi GPU grid launch sync buffer + address MGSync() const { return mg_sync_; } + private: static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table; @@ -440,6 +450,7 @@ class Device : public NullDevice { std::atomic freeMem_; //!< Total of free memory available mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode + static address mg_sync_; //!< MGPU grid launch sync memory (SVM location) struct QueueInfo { int refCount; @@ -448,9 +459,6 @@ class Device : public NullDevice { public: amd::Atomic numOfVgpus_; //!< Virtual gpu unique index - - hsa_queue_t *acquireQueue(uint32_t queue_size_hint); - void releaseQueue(hsa_queue_t*); }; // class roc::Device } // namespace roc diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp index 408a973fc3..fe18049bbe 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -1961,13 +1961,14 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) } bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, - const_address parameters, void* eventHandle, uint32_t sharedMemBytes, bool cooperativeGroups) { + const_address parameters, void* eventHandle, uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) { device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); // Check memory dependency and SVM objects - if (!processMemObjects(kernel, parameters, ldsUsage, cooperativeGroups)) { + bool coopGroups = (vcmd != nullptr) ? 
vcmd->cooperativeGroups() : false; + if (!processMemObjects(kernel, parameters, ldsUsage, coopGroups)) { LogError("Wrong memory objects!"); return false; } @@ -2099,6 +2100,27 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const WriteAqlArgAt(const_cast
(parameters), &spVA, it.size_, it.offset_); break; } + case amd::KernelParameterDescriptor::HiddenMultiGridSync: { + uint64_t gridSync = coopGroups ? 1 : 0; + bool multiGrid = (vcmd != nullptr) ? vcmd->cooperativeMultiDeviceGroups() : false; + if (multiGrid) { + // Find CPU pointer to the right sync info structure. It should be after MGSyncData + Device::MGSyncInfo* syncInfo = reinterpret_cast( + dev().MGSync() + Device::kMGInfoSizePerDevice * dev().index() + Device::kMGSyncDataSize); + // Update sync data address. Use the offset adjustment to the right location + syncInfo->mgs = reinterpret_cast(dev().MGSync() + + Device::kMGInfoSizePerDevice * vcmd->firstDevice()); + // Fill all sync info fields + syncInfo->grid_id = vcmd->gridId(); + syncInfo->num_grids = vcmd->numGrids(); + syncInfo->prev_sum = vcmd->prevGridSum(); + syncInfo->all_sum = vcmd->allGridSum(); + // Update GPU address for grid sync info. Use the offset adjustment for the right location + gridSync = reinterpret_cast(syncInfo); + } + WriteAqlArgAt(const_cast
(parameters), &gridSync, it.size_, it.offset_); + break; + } } } @@ -2177,32 +2199,36 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const * the list of kernel parameters. */ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { - if (vcmd.cooperativeGroups()) { - uint32_t workgroups = 0; - for (uint i = 0; i < vcmd.sizes().dimensions(); i++) { - if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) { - workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]); - } - } - + if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) { // Get device queue for exclusive GPU access VirtualGPU* queue = dev().xferQueue(); + // Lock the queue, using the blit manager lock + amd::ScopedLock lock(queue->blitMgr().lockXfer()); + // Wait for the execution on the current queue, since the coop groups will use the device queue releaseGpuMemoryFence(); - // Lock the queue, using the blit manager lock - amd::ScopedLock lock(queue->blitMgr().lockXfer()); queue->profilingBegin(vcmd); - static_cast(queue->blitMgr()).RunGwsInit(workgroups); + if (vcmd.cooperativeGroups()) { + // Initialize GWS if it's cooperative groups launch + uint32_t workgroups = 0; + for (uint i = 0; i < vcmd.sizes().dimensions(); i++) { + if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) { + workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]); + } + } + + static_cast(queue->blitMgr()).RunGwsInit(workgroups - 1); + } // Sync AQL packets queue->setAqlHeader(dispatchPacketHeader_); // Submit kernel to HW if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), - static_cast(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) { + static_cast(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), &vcmd)) { LogError("AQL dispatch failed!"); vcmd.setStatus(CL_INVALID_OPERATION); } @@ -2218,7 +2244,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { // 
Submit kernel to HW if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), - static_cast(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) { + static_cast(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) { LogError("AQL dispatch failed!"); vcmd.setStatus(CL_INVALID_OPERATION); } diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp index f45060322d..2e31dc4b4f 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp @@ -171,7 +171,7 @@ class VirtualGPU : public device::VirtualDevice { const_address parameters, //!< Parameters for the kernel void* event_handle, //!< Handle to OCL event for debugging uint32_t sharedMemBytes = 0, //!< Shared memory size - bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required + amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command ); void submitNativeFn(amd::NativeFnCommand& cmd); void submitMarker(amd::Marker& cmd); diff --git a/projects/clr/rocclr/runtime/platform/command.cpp b/projects/clr/rocclr/runtime/platform/command.cpp index e67c9067dc..fb87d99588 100644 --- a/projects/clr/rocclr/runtime/platform/command.cpp +++ b/projects/clr/rocclr/runtime/platform/command.cpp @@ -232,12 +232,19 @@ const Context& Command::context() const { return queue_->context(); } NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel, const NDRangeContainer& sizes, - uint32_t sharedMemBytes, uint32_t extraParam) - : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL) - , kernel_(kernel) - , sizes_(sizes) - , sharedMemBytes_(sharedMemBytes) - , extraParam_(extraParam) { + uint32_t sharedMemBytes, uint32_t extraParam, + uint32_t gridId, uint32_t numGrids, + uint64_t prevGridSum, uint64_t allGridSum, uint32_t firstDevice) : + Command(queue, 
CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL), + kernel_(kernel), + sizes_(sizes), + sharedMemBytes_(sharedMemBytes), + extraParam_(extraParam), + gridId_(gridId), + numGrids_(numGrids), + prevGridSum_(prevGridSum), + allGridSum_(allGridSum), + firstDevice_(firstDevice) { auto& device = queue.device(); auto devKernel = const_cast(kernel.getDeviceKernel(device)); profilingInfo_.setCallback(devKernel->getProfilingCallback( diff --git a/projects/clr/rocclr/runtime/platform/command.hpp b/projects/clr/rocclr/runtime/platform/command.hpp index e55a33fe2b..6610381869 100644 --- a/projects/clr/rocclr/runtime/platform/command.hpp +++ b/projects/clr/rocclr/runtime/platform/command.hpp @@ -764,9 +764,15 @@ class NDRangeKernelCommand : public Command { private: Kernel& kernel_; NDRangeContainer sizes_; - address parameters_; - uint32_t sharedMemBytes_; - uint32_t extraParam_; + address parameters_; //!< Pointer to the kernel arguments + // The below fields are specific to the HIP functionality + uint32_t sharedMemBytes_; //!< Size of reserved shared memory + uint32_t extraParam_; //!< Extra flags for the kernel launch + uint32_t gridId_; //!< Grid ID in the multi GPU kernel launch + uint32_t numGrids_; //!< Total number of grids in multi GPU launch + uint64_t prevGridSum_; //!< A sum of previous grids to the current launch + uint64_t allGridSum_; //!< A sum of all grids in multi GPU launch + uint32_t firstDevice_; //!< Device index of the first device in the grid public: enum { @@ -777,7 +783,8 @@ class NDRangeKernelCommand : public Command { //! 
Construct an ExecuteKernel command NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel, const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0, - uint32_t extraParam = 0); + uint32_t extraParam = 0, uint32_t gridId = 0, uint32_t numGrids = 0, + uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0); virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); } @@ -804,6 +811,21 @@ class NDRangeKernelCommand : public Command { return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false; } + //! Return the current grid ID for multidevice launch + uint32_t gridId() const { return gridId_; } + + //! Return the number of launched grids + uint32_t numGrids() const { return numGrids_; } + + //! Return the total workload size for the grids preceding the current one + uint64_t prevGridSum() const { return prevGridSum_; } + + //! Return the total workload size for all GPUs + uint64_t allGridSum() const { return allGridSum_; } + + //! Return the index of the first device in multi GPU launch + uint64_t firstDevice() const { return firstDevice_; } + //! Set the local work size. void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }