From fb7da41d27abe316580862e3752dea80ed29dbde Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 16 Oct 2019 11:24:09 -0400
Subject: [PATCH] P4 to Git Change 2014404 by gandryey@gera-win10 on 2019/10/16
11:13:37
SWDEV-184710 - Support hipLaunchCooperativeKernelMultiDevice()
- Add support for multi grid launch in hip
- Detect the new hidden argument and pass the required information for the kernel launch
- Memory for synchronization is allocated as a single object and then the offset for each GPU is found
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#44 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#343 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#25 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#82 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#136 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#90 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#99 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#97 edit
[ROCm/clr commit: 6e7e97987fc9df4ebc03cca491a16c1bd12870f5]
---
projects/clr/rocclr/runtime/device/device.hpp | 23 +++++++-
.../clr/rocclr/runtime/device/devkernel.cpp | 3 +
.../clr/rocclr/runtime/device/devkernel.hpp | 17 +++---
.../rocclr/runtime/device/pal/palkernel.cpp | 2 +
.../rocclr/runtime/device/rocm/rocdevice.cpp | 37 +++++++-----
.../rocclr/runtime/device/rocm/rocdevice.hpp | 14 ++++-
.../rocclr/runtime/device/rocm/rocvirtual.cpp | 56 ++++++++++++++-----
.../rocclr/runtime/device/rocm/rocvirtual.hpp | 2 +-
.../clr/rocclr/runtime/platform/command.cpp | 19 +++++--
.../clr/rocclr/runtime/platform/command.hpp | 30 ++++++++--
10 files changed, 151 insertions(+), 52 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp
index 932d543074..f273ecf37f 100644
--- a/projects/clr/rocclr/runtime/device/device.hpp
+++ b/projects/clr/rocclr/runtime/device/device.hpp
@@ -1153,7 +1153,24 @@ class Device : public RuntimeObject {
typedef aclCompiler Compiler;
public:
+ // The structures below for MGPU launch match the device library format
+ struct MGSyncData { //!< Sync data block that starts each per-device slot of the MGPU sync buffer
+ uint32_t w0; //!< First 32-bit word (meaning is set by the device library; not visible here)
+ uint32_t w1; //!< Second 32-bit word (meaning is set by the device library; not visible here)
+ };
+
+ struct MGSyncInfo { //!< Per-grid info the runtime fills in for a multi GPU cooperative launch
+ struct MGSyncData* mgs; //!< Sync data address; the launch path points it at the first device's slot
+ uint32_t grid_id; //!< Grid ID in the multi GPU kernel launch
+ uint32_t num_grids; //!< Total number of grids in the multi GPU launch
+ uint64_t prev_sum; //!< Sum of the grids that precede the current launch
+ uint64_t all_sum; //!< Sum of all grids in the multi GPU launch
+ };
+
static constexpr size_t kP2PStagingSize = 4 * Mi;
+ static constexpr size_t kMGSyncDataSize = sizeof(MGSyncData); //!< Bytes of MGSyncData at the start of each per-device slot
+ static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo); //!< Per-device slot: MGSyncData followed by MGSyncInfo. NOTE(review): also passed as the last SvmBuffer::malloc argument in roc::Device::create; confirm that is intended if it is an alignment
+
typedef std::list CommandQueues;
struct BlitProgram : public amd::HeapObject {
@@ -1409,9 +1426,9 @@ class Device : public RuntimeObject {
std::unique_ptr cacheCompilation_;
#endif
- static amd::Context* glb_ctx_; //!< Global context with all devices
- static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
- static Memory* p2p_stage_; //!< Staging resources
+ static amd::Context* glb_ctx_; //!< Global context with all devices
+ static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
+ static Memory* p2p_stage_; //!< Staging resources
private:
bool IsTypeMatching(cl_device_type type, bool offlineDevices);
diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp
index 6fcab423aa..9fb0f69144 100644
--- a/projects/clr/rocclr/runtime/device/devkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/devkernel.cpp
@@ -752,6 +752,9 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH
case ValueKind::HiddenCompletionAction:
*isHidden = true;
return amd::KernelParameterDescriptor::HiddenCompletionAction;
+ case ValueKind::HiddenMultiGridSyncArg:
+ *isHidden = true;
+ return amd::KernelParameterDescriptor::HiddenMultiGridSync;
case ValueKind::HiddenNone:
default:
*isHidden = true;
diff --git a/projects/clr/rocclr/runtime/device/devkernel.hpp b/projects/clr/rocclr/runtime/device/devkernel.hpp
index 561dfc087d..f944584252 100644
--- a/projects/clr/rocclr/runtime/device/devkernel.hpp
+++ b/projects/clr/rocclr/runtime/device/devkernel.hpp
@@ -110,7 +110,8 @@ static const std::map ArgValueKind =
{"HiddenNone", ValueKind::HiddenNone},
{"HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer},
{"HiddenDefaultQueue", ValueKind::HiddenDefaultQueue},
- {"HiddenCompletionAction", ValueKind::HiddenCompletionAction}
+ {"HiddenCompletionAction", ValueKind::HiddenCompletionAction},
+ {"HiddenMultigridSyncArg", ValueKind::HiddenMultiGridSyncArg}
};
static const std::map ArgValueType =
@@ -223,7 +224,8 @@ static const std::map ArgValueKindV3 =
{"hidden_none", ValueKind::HiddenNone},
{"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer},
{"hidden_default_queue", ValueKind::HiddenDefaultQueue},
- {"hidden_completion_action", ValueKind::HiddenCompletionAction}
+ {"hidden_completion_action", ValueKind::HiddenCompletionAction},
+ {"hidden_multigrid_sync_arg", ValueKind::HiddenMultiGridSyncArg}
};
static const std::map ArgValueTypeV3 =
@@ -317,19 +319,20 @@ struct KernelParameterDescriptor {
ValueObject = 10,
ImageObject = 11,
SamplerObject = 12,
- QueueObject = 13
+ QueueObject = 13,
+ HiddenMultiGridSync = 14
};
clk_value_type_t type_; //!< The parameter's type
size_t offset_; //!< Its offset in the parameter's stack
size_t size_; //!< Its size in bytes
union InfoData {
struct {
- uint32_t oclObject_ : 4; //!< OCL object type
+ uint32_t oclObject_ : 4; //!< OCL object type
uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only
- uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
- uint32_t defined_ : 1; //!< The argument was defined by the app
+ uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
+ uint32_t defined_ : 1; //!< The argument was defined by the app
uint32_t reserved_ : 1; //!< reserved
- uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment
+ uint32_t arrayIndex_ : 24;//!< Index in the objects array or LDS alignment
};
uint32_t allValues_;
InfoData() : allValues_(0) {}
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
index 985a907095..86b73f3c70 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
@@ -336,6 +336,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
WriteAqlArgAt(const_cast(parameters), vmParentWrap, it.size_, it.offset_);
}
break;
+ case amd::KernelParameterDescriptor::HiddenMultiGridSync:
+ break;
}
}
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
index ced55b2c01..af4d6eec95 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -51,7 +51,7 @@ hsa_agent_t roc::Device::cpu_agent_ = {0};
std::vector roc::Device::gpu_agents_;
const bool roc::Device::offlineDevice_ = false;
const bool roc::NullDevice::offlineDevice_ = true;
-
+address Device::mg_sync_ = nullptr;
static HsaDeviceId getHsaDeviceId(hsa_agent_t device, uint32_t& pci_id) {
if (HSA_STATUS_SUCCESS !=
@@ -175,6 +175,10 @@ Device::~Device() {
p2p_stage_->release();
p2p_stage_ = nullptr;
}
+ if (nullptr != mg_sync_) {
+ amd::SvmBuffer::free(GlbCtx(), mg_sync_);
+ mg_sync_ = nullptr;
+ }
if (glb_ctx_ != nullptr) {
glb_ctx_->release();
glb_ctx_ = nullptr;
@@ -715,28 +719,25 @@ bool Device::create(bool sramEccEnabled) {
// Use just 1 entry by default for the map cache
mapCache_->push_back(nullptr);
- if ((p2p_agents_.size() == 0) &&
- (glb_ctx_ == nullptr) && (gpu_agents_.size() > 1) &&
+ if ((glb_ctx_ == nullptr) && (gpu_agents_.size() >= 1) &&
// Allow creation for the last device in the list.
(gpu_agents_[gpu_agents_.size() - 1].handle == _bkendDevice.handle)) {
-
std::vector devices;
uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, false);
// Add all PAL devices
for (uint32_t i = 0; i < numDevices; ++i) {
- devices.push_back(amd::Device::devices()[i]);
+ devices.push_back(amd::Device::devices()[i]);
}
// Add current
devices.push_back(this);
+ // Create a dummy context
+ glb_ctx_ = new amd::Context(devices, info);
+ if (glb_ctx_ == nullptr) {
+ return false;
+ }
- if (devices.size() > 1) {
- // Create a dummy context
- glb_ctx_ = new amd::Context(devices, info);
- if (glb_ctx_ == nullptr) {
- return false;
- }
- amd::Buffer* buf =
- new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
+ if ((p2p_agents_.size() == 0) && (devices.size() > 1)) {
+ amd::Buffer* buf = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
if ((buf != nullptr) && buf->create()) {
p2p_stage_ = buf;
}
@@ -745,6 +746,15 @@ bool Device::create(bool sramEccEnabled) {
return false;
}
}
+ // Check if sync buffer wasn't allocated yet
+ if (amd::IS_HIP && mg_sync_ == nullptr) {
+ mg_sync_ = reinterpret_cast(amd::SvmBuffer::malloc(
+ GlbCtx(), (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS),
+ kMGInfoSizePerDevice * GlbCtx().devices().size(), kMGInfoSizePerDevice));
+ if (mg_sync_ == nullptr) {
+ return false;
+ }
+ }
}
if (settings().stagedXferSize_ != 0) {
@@ -1817,6 +1827,7 @@ VirtualGPU* Device::xferQueue() const {
xferQueue_->enableSyncBlit();
return xferQueue_;
}
+
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
bool result = true;
return result;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
index 378952501e..09ffb8e825 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
@@ -410,6 +410,16 @@ class Device : public NullDevice {
hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; }
+ //! Acquire HSA queue. This method can create a new HSA queue or
+ //! share previously created
+ hsa_queue_t* acquireQueue(uint32_t queue_size_hint);
+
+ //! Release HSA queue
+ void releaseQueue(hsa_queue_t*);
+
+ //! Return multi GPU grid launch sync buffer
+ address MGSync() const { return mg_sync_; }
+
private:
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
@@ -440,6 +450,7 @@ class Device : public NullDevice {
std::atomic freeMem_; //!< Total of free memory available
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode
+ static address mg_sync_; //!< MGPU grid launch sync memory (SVM location)
struct QueueInfo {
int refCount;
@@ -448,9 +459,6 @@ class Device : public NullDevice {
public:
amd::Atomic numOfVgpus_; //!< Virtual gpu unique index
-
- hsa_queue_t *acquireQueue(uint32_t queue_size_hint);
- void releaseQueue(hsa_queue_t*);
}; // class roc::Device
} // namespace roc
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index 408a973fc3..fe18049bbe 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1961,13 +1961,14 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
}
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
- const_address parameters, void* eventHandle, uint32_t sharedMemBytes, bool cooperativeGroups) {
+ const_address parameters, void* eventHandle, uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast(*devKernel);
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
// Check memory dependency and SVM objects
- if (!processMemObjects(kernel, parameters, ldsUsage, cooperativeGroups)) {
+ bool coopGroups = (vcmd != nullptr) ? vcmd->cooperativeGroups() : false;
+ if (!processMemObjects(kernel, parameters, ldsUsage, coopGroups)) {
LogError("Wrong memory objects!");
return false;
}
@@ -2099,6 +2100,27 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
WriteAqlArgAt(const_cast(parameters), &spVA, it.size_, it.offset_);
break;
}
+ case amd::KernelParameterDescriptor::HiddenMultiGridSync: {
+ uint64_t gridSync = coopGroups ? 1 : 0;
+ bool multiGrid = (vcmd != nullptr) ? vcmd->cooperativeMultiDeviceGroups() : false;
+ if (multiGrid) {
+ // Find CPU pointer to the right sync info structure. It should be after MGSyncData
+ Device::MGSyncInfo* syncInfo = reinterpret_cast(
+ dev().MGSync() + Device::kMGInfoSizePerDevice * dev().index() + Device::kMGSyncDataSize);
+ // Update sync data address. Use the offset adjustment to the right location
+ syncInfo->mgs = reinterpret_cast(dev().MGSync() +
+ Device::kMGInfoSizePerDevice * vcmd->firstDevice());
+ // Fill all sync info fields
+ syncInfo->grid_id = vcmd->gridId();
+ syncInfo->num_grids = vcmd->numGrids();
+ syncInfo->prev_sum = vcmd->prevGridSum();
+ syncInfo->all_sum = vcmd->allGridSum();
+ // Update GPU address for grid sync info. Use the offset adjustment for the right location
+ gridSync = reinterpret_cast(syncInfo);
+ }
+ WriteAqlArgAt(const_cast(parameters), &gridSync, it.size_, it.offset_);
+ break;
+ }
}
}
@@ -2177,32 +2199,36 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
* the list of kernel parameters.
*/
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
- if (vcmd.cooperativeGroups()) {
- uint32_t workgroups = 0;
- for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
- if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
- workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
- }
- }
-
+ if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) {
// Get device queue for exclusive GPU access
VirtualGPU* queue = dev().xferQueue();
+ // Lock the queue, using the blit manager lock
+ amd::ScopedLock lock(queue->blitMgr().lockXfer());
+
// Wait for the execution on the current queue, since the coop groups will use the device queue
releaseGpuMemoryFence();
- // Lock the queue, using the blit manager lock
- amd::ScopedLock lock(queue->blitMgr().lockXfer());
queue->profilingBegin(vcmd);
- static_cast(queue->blitMgr()).RunGwsInit(workgroups);
+ if (vcmd.cooperativeGroups()) {
+ // Initialize GWS if it's cooperative groups launch
+ uint32_t workgroups = 0;
+ for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
+ if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
+ workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
+ }
+ }
+
+ static_cast(queue->blitMgr()).RunGwsInit(workgroups - 1);
+ }
// Sync AQL packets
queue->setAqlHeader(dispatchPacketHeader_);
// Submit kernel to HW
if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
- static_cast(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
+ static_cast(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), &vcmd)) {
LogError("AQL dispatch failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
}
@@ -2218,7 +2244,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
// Submit kernel to HW
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
- static_cast(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
+ static_cast(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) {
LogError("AQL dispatch failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
}
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
index f45060322d..2e31dc4b4f 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -171,7 +171,7 @@ class VirtualGPU : public device::VirtualDevice {
const_address parameters, //!< Parameters for the kernel
void* event_handle, //!< Handle to OCL event for debugging
uint32_t sharedMemBytes = 0, //!< Shared memory size
- bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required
+ amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command
);
void submitNativeFn(amd::NativeFnCommand& cmd);
void submitMarker(amd::Marker& cmd);
diff --git a/projects/clr/rocclr/runtime/platform/command.cpp b/projects/clr/rocclr/runtime/platform/command.cpp
index e67c9067dc..fb87d99588 100644
--- a/projects/clr/rocclr/runtime/platform/command.cpp
+++ b/projects/clr/rocclr/runtime/platform/command.cpp
@@ -232,12 +232,19 @@ const Context& Command::context() const { return queue_->context(); }
NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
Kernel& kernel, const NDRangeContainer& sizes,
- uint32_t sharedMemBytes, uint32_t extraParam)
- : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL)
- , kernel_(kernel)
- , sizes_(sizes)
- , sharedMemBytes_(sharedMemBytes)
- , extraParam_(extraParam) {
+ uint32_t sharedMemBytes, uint32_t extraParam,
+ uint32_t gridId, uint32_t numGrids,
+ uint64_t prevGridSum, uint64_t allGridSum, uint32_t firstDevice) :
+ Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL),
+ kernel_(kernel),
+ sizes_(sizes),
+ sharedMemBytes_(sharedMemBytes),
+ extraParam_(extraParam),
+ gridId_(gridId),
+ numGrids_(numGrids),
+ prevGridSum_(prevGridSum),
+ allGridSum_(allGridSum),
+ firstDevice_(firstDevice) {
auto& device = queue.device();
auto devKernel = const_cast(kernel.getDeviceKernel(device));
profilingInfo_.setCallback(devKernel->getProfilingCallback(
diff --git a/projects/clr/rocclr/runtime/platform/command.hpp b/projects/clr/rocclr/runtime/platform/command.hpp
index e55a33fe2b..6610381869 100644
--- a/projects/clr/rocclr/runtime/platform/command.hpp
+++ b/projects/clr/rocclr/runtime/platform/command.hpp
@@ -764,9 +764,15 @@ class NDRangeKernelCommand : public Command {
private:
Kernel& kernel_;
NDRangeContainer sizes_;
- address parameters_;
- uint32_t sharedMemBytes_;
- uint32_t extraParam_;
+ address parameters_; //!< Pointer to the kernel arguments
+ // The below fields are specific to the HIP functionality
+ uint32_t sharedMemBytes_; //!< Size of reserved shared memory
+ uint32_t extraParam_; //!< Extra flags for the kernel launch
+ uint32_t gridId_; //!< Grid ID in the multi GPU kernel launch
+ uint32_t numGrids_; //!< Total number of grids in multi GPU launch
+ uint64_t prevGridSum_; //!< A sum of previous grids to the current launch
+ uint64_t allGridSum_; //!< A sum of all grids in multi GPU launch
+ uint32_t firstDevice_; //!< Device index of the first device in the grid
public:
enum {
@@ -777,7 +783,8 @@ class NDRangeKernelCommand : public Command {
//! Construct an ExecuteKernel command
NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0,
- uint32_t extraParam = 0);
+ uint32_t extraParam = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
+ uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); }
@@ -804,6 +811,21 @@ class NDRangeKernelCommand : public Command {
return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false;
}
+ //! Return the current grid ID for multidevice launch
+ uint32_t gridId() const { return gridId_; }
+
+ //! Return the number of launched grids
+ uint32_t numGrids() const { return numGrids_; }
+
+ //! Return the sum of the grids that precede the current one in the multi GPU launch
+ uint64_t prevGridSum() const { return prevGridSum_; }
+
+ //! Return the total workload size for all GPUs
+ uint64_t allGridSum() const { return allGridSum_; }
+
+ //! Return the index of the first device in multi GPU launch (32-bit, same width as firstDevice_)
+ uint32_t firstDevice() const { return firstDevice_; }
+
//! Set the local work size.
void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }