P4 to Git Change 2014404 by gandryey@gera-win10 on 2019/10/16 11:13:37
SWDEV-184710 - Support hipLaunchCooperativeKernelMultiDevice()
- Add support for multi-grid launch in HIP
- Detect the new hidden kernel argument and pass the information required for the kernel launch
- Memory for synchronization is allocated as a single object, and each GPU's region is then located by its offset within that object
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#44 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#343 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#25 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#82 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#136 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#90 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#99 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#97 edit
[ROCm/clr commit: 6e7e97987f]
Этот коммит содержится в:
@@ -1153,7 +1153,24 @@ class Device : public RuntimeObject {
|
||||
typedef aclCompiler Compiler;
|
||||
|
||||
public:
|
||||
// The structures below for MGPU launch match the device library format
|
||||
// Synchronization data for a multi-GPU grid launch: two 32-bit words.
// The layout has to stay identical to the device library's definition;
// what w0 and w1 hold is decided there and is not visible in this file.
struct MGSyncData {
|
||||
uint32_t w0;
|
||||
uint32_t w1;
|
||||
};
|
||||
|
||||
// Per-device descriptor for a multi-GPU grid launch. The layout has to stay
// identical to the device library's definition. The ROCm submit path fills
// every field from the launch command and passes the structure's address
// through the hidden multi-grid sync kernel argument.
struct MGSyncInfo {
|
||||
struct MGSyncData* mgs;  //!< MGSyncData of the first device in the launch
|
||||
uint32_t grid_id;  //!< Grid ID in the multi GPU kernel launch
|
||||
uint32_t num_grids;  //!< Total number of grids in the multi GPU launch
|
||||
uint64_t prev_sum;  //!< Sum of the grids preceding the current launch
|
||||
uint64_t all_sum;  //!< Sum of all grids in the multi GPU launch
|
||||
};
|
||||
|
||||
static constexpr size_t kP2PStagingSize = 4 * Mi;  //!< Byte size requested for the P2P staging buffer
|
||||
static constexpr size_t kMGSyncDataSize = sizeof(MGSyncData);  //!< Bytes taken by MGSyncData at the start of a device's slot
|
||||
// Size of one device's slot in the multi-GPU sync buffer: MGSyncData followed
// immediately by MGSyncInfo. Slots are addressed as base + slot size * device index.
// NOTE(review): the ROCm Device::create() also passes this value as the last
// argument of SvmBuffer::malloc (presumably the alignment); with 8-byte pointers
// it is 8 + 32 = 40, which is not a power of two - confirm that is accepted.
static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo);
|
||||
|
||||
typedef std::list<CommandQueue*> CommandQueues;
|
||||
|
||||
struct BlitProgram : public amd::HeapObject {
|
||||
@@ -1409,9 +1426,9 @@ class Device : public RuntimeObject {
|
||||
std::unique_ptr<amd::CacheCompilation> cacheCompilation_;
|
||||
#endif
|
||||
|
||||
static amd::Context* glb_ctx_; //!< Global context with all devices
|
||||
static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
|
||||
static Memory* p2p_stage_; //!< Staging resources
|
||||
static amd::Context* glb_ctx_; //!< Global context with all devices
|
||||
static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
|
||||
static Memory* p2p_stage_; //!< Staging resources
|
||||
|
||||
private:
|
||||
bool IsTypeMatching(cl_device_type type, bool offlineDevices);
|
||||
|
||||
@@ -752,6 +752,9 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH
|
||||
case ValueKind::HiddenCompletionAction:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenCompletionAction;
|
||||
case ValueKind::HiddenMultiGridSyncArg:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenMultiGridSync;
|
||||
case ValueKind::HiddenNone:
|
||||
default:
|
||||
*isHidden = true;
|
||||
|
||||
@@ -110,7 +110,8 @@ static const std::map<std::string,ValueKind> ArgValueKind =
|
||||
{"HiddenNone", ValueKind::HiddenNone},
|
||||
{"HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer},
|
||||
{"HiddenDefaultQueue", ValueKind::HiddenDefaultQueue},
|
||||
{"HiddenCompletionAction", ValueKind::HiddenCompletionAction}
|
||||
{"HiddenCompletionAction", ValueKind::HiddenCompletionAction},
|
||||
// NOTE(review): the key is spelled "Multigrid" (lowercase g) while the enumerator is
// MultiGrid; lookups in a std::map keyed by std::string are case-sensitive, so confirm
// this matches the exact string that appears in the kernel metadata.
{"HiddenMultigridSyncArg", ValueKind::HiddenMultiGridSyncArg}
|
||||
};
|
||||
|
||||
static const std::map<std::string,ValueType> ArgValueType =
|
||||
@@ -223,7 +224,8 @@ static const std::map<std::string,ValueKind> ArgValueKindV3 =
|
||||
{"hidden_none", ValueKind::HiddenNone},
|
||||
{"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer},
|
||||
{"hidden_default_queue", ValueKind::HiddenDefaultQueue},
|
||||
{"hidden_completion_action", ValueKind::HiddenCompletionAction}
|
||||
{"hidden_completion_action", ValueKind::HiddenCompletionAction},
|
||||
{"hidden_multigrid_sync_arg", ValueKind::HiddenMultiGridSyncArg}
|
||||
};
|
||||
|
||||
static const std::map<std::string,ValueType> ArgValueTypeV3 =
|
||||
@@ -317,19 +319,20 @@ struct KernelParameterDescriptor {
|
||||
ValueObject = 10,
|
||||
ImageObject = 11,
|
||||
SamplerObject = 12,
|
||||
QueueObject = 13
|
||||
QueueObject = 13,
|
||||
HiddenMultiGridSync = 14
|
||||
};
|
||||
clk_value_type_t type_; //!< The parameter's type
|
||||
size_t offset_; //!< Its offset in the parameter's stack
|
||||
size_t size_; //!< Its size in bytes
|
||||
union InfoData {
|
||||
struct {
|
||||
uint32_t oclObject_ : 4; //!< OCL object type
|
||||
uint32_t oclObject_ : 4; //!< OCL object type
|
||||
uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only
|
||||
uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
|
||||
uint32_t defined_ : 1; //!< The argument was defined by the app
|
||||
uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
|
||||
uint32_t defined_ : 1; //!< The argument was defined by the app
|
||||
uint32_t reserved_ : 1; //!< reserved
|
||||
uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment
|
||||
uint32_t arrayIndex_ : 24;//!< Index in the objects array or LDS alignment
|
||||
};
|
||||
uint32_t allValues_;
|
||||
InfoData() : allValues_(0) {}
|
||||
|
||||
@@ -336,6 +336,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
|
||||
WriteAqlArgAt(const_cast<address>(parameters), vmParentWrap, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenMultiGridSync:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ hsa_agent_t roc::Device::cpu_agent_ = {0};
|
||||
std::vector<hsa_agent_t> roc::Device::gpu_agents_;
|
||||
const bool roc::Device::offlineDevice_ = false;
|
||||
const bool roc::NullDevice::offlineDevice_ = true;
|
||||
|
||||
address Device::mg_sync_ = nullptr;
|
||||
|
||||
static HsaDeviceId getHsaDeviceId(hsa_agent_t device, uint32_t& pci_id) {
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
@@ -175,6 +175,10 @@ Device::~Device() {
|
||||
p2p_stage_->release();
|
||||
p2p_stage_ = nullptr;
|
||||
}
|
||||
if (nullptr != mg_sync_) {
|
||||
amd::SvmBuffer::free(GlbCtx(), mg_sync_);
|
||||
mg_sync_ = nullptr;
|
||||
}
|
||||
if (glb_ctx_ != nullptr) {
|
||||
glb_ctx_->release();
|
||||
glb_ctx_ = nullptr;
|
||||
@@ -715,28 +719,25 @@ bool Device::create(bool sramEccEnabled) {
|
||||
// Use just 1 entry by default for the map cache
|
||||
mapCache_->push_back(nullptr);
|
||||
|
||||
if ((p2p_agents_.size() == 0) &&
|
||||
(glb_ctx_ == nullptr) && (gpu_agents_.size() > 1) &&
|
||||
if ((glb_ctx_ == nullptr) && (gpu_agents_.size() >= 1) &&
|
||||
// Allow creation for the last device in the list.
|
||||
(gpu_agents_[gpu_agents_.size() - 1].handle == _bkendDevice.handle)) {
|
||||
|
||||
std::vector<amd::Device*> devices;
|
||||
uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, false);
|
||||
// Add all PAL devices
|
||||
for (uint32_t i = 0; i < numDevices; ++i) {
|
||||
devices.push_back(amd::Device::devices()[i]);
|
||||
devices.push_back(amd::Device::devices()[i]);
|
||||
}
|
||||
// Add current
|
||||
devices.push_back(this);
|
||||
// Create a dummy context
|
||||
glb_ctx_ = new amd::Context(devices, info);
|
||||
if (glb_ctx_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (devices.size() > 1) {
|
||||
// Create a dummy context
|
||||
glb_ctx_ = new amd::Context(devices, info);
|
||||
if (glb_ctx_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
amd::Buffer* buf =
|
||||
new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
|
||||
if ((p2p_agents_.size() == 0) && (devices.size() > 1)) {
|
||||
amd::Buffer* buf = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
|
||||
if ((buf != nullptr) && buf->create()) {
|
||||
p2p_stage_ = buf;
|
||||
}
|
||||
@@ -745,6 +746,15 @@ bool Device::create(bool sramEccEnabled) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Check if sync buffer wasn't allocated yet
|
||||
if (amd::IS_HIP && mg_sync_ == nullptr) {
|
||||
mg_sync_ = reinterpret_cast<address>(amd::SvmBuffer::malloc(
|
||||
GlbCtx(), (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS),
|
||||
kMGInfoSizePerDevice * GlbCtx().devices().size(), kMGInfoSizePerDevice));
|
||||
if (mg_sync_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (settings().stagedXferSize_ != 0) {
|
||||
@@ -1817,6 +1827,7 @@ VirtualGPU* Device::xferQueue() const {
|
||||
xferQueue_->enableSyncBlit();
|
||||
return xferQueue_;
|
||||
}
|
||||
|
||||
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
|
||||
bool result = true;
|
||||
return result;
|
||||
|
||||
@@ -410,6 +410,16 @@ class Device : public NullDevice {
|
||||
|
||||
hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; }
|
||||
|
||||
//! Acquire HSA queue. This method can create a new HSA queue or
|
||||
//! share previously created
|
||||
hsa_queue_t* acquireQueue(uint32_t queue_size_hint);
|
||||
|
||||
//! Release HSA queue
|
||||
void releaseQueue(hsa_queue_t*);
|
||||
|
||||
//! Return multi GPU grid launch sync buffer
|
||||
address MGSync() const { return mg_sync_; }
|
||||
|
||||
private:
|
||||
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
|
||||
|
||||
@@ -440,6 +450,7 @@ class Device : public NullDevice {
|
||||
std::atomic<size_t> freeMem_; //!< Total of free memory available
|
||||
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
|
||||
bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode
|
||||
static address mg_sync_; //!< MGPU grid launch sync memory (SVM location)
|
||||
|
||||
struct QueueInfo {
|
||||
int refCount;
|
||||
@@ -448,9 +459,6 @@ class Device : public NullDevice {
|
||||
|
||||
public:
|
||||
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
|
||||
|
||||
hsa_queue_t *acquireQueue(uint32_t queue_size_hint);
|
||||
void releaseQueue(hsa_queue_t*);
|
||||
}; // class roc::Device
|
||||
} // namespace roc
|
||||
|
||||
|
||||
@@ -1961,13 +1961,14 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
}
|
||||
|
||||
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
|
||||
const_address parameters, void* eventHandle, uint32_t sharedMemBytes, bool cooperativeGroups) {
|
||||
const_address parameters, void* eventHandle, uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
|
||||
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
|
||||
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
|
||||
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
|
||||
|
||||
// Check memory dependency and SVM objects
|
||||
if (!processMemObjects(kernel, parameters, ldsUsage, cooperativeGroups)) {
|
||||
bool coopGroups = (vcmd != nullptr) ? vcmd->cooperativeGroups() : false;
|
||||
if (!processMemObjects(kernel, parameters, ldsUsage, coopGroups)) {
|
||||
LogError("Wrong memory objects!");
|
||||
return false;
|
||||
}
|
||||
@@ -2099,6 +2100,27 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &spVA, it.size_, it.offset_);
|
||||
break;
|
||||
}
|
||||
case amd::KernelParameterDescriptor::HiddenMultiGridSync: {
|
||||
uint64_t gridSync = coopGroups ? 1 : 0;
|
||||
bool multiGrid = (vcmd != nullptr) ? vcmd->cooperativeMultiDeviceGroups() : false;
|
||||
if (multiGrid) {
|
||||
// Find CPU pointer to the right sync info structure. It should be after MGSyncData
|
||||
Device::MGSyncInfo* syncInfo = reinterpret_cast<Device::MGSyncInfo*>(
|
||||
dev().MGSync() + Device::kMGInfoSizePerDevice * dev().index() + Device::kMGSyncDataSize);
|
||||
// Update sync data address. Use the offset adjustment to the right location
|
||||
syncInfo->mgs = reinterpret_cast<Device::MGSyncData*>(dev().MGSync() +
|
||||
Device::kMGInfoSizePerDevice * vcmd->firstDevice());
|
||||
// Fill all sync info fields
|
||||
syncInfo->grid_id = vcmd->gridId();
|
||||
syncInfo->num_grids = vcmd->numGrids();
|
||||
syncInfo->prev_sum = vcmd->prevGridSum();
|
||||
syncInfo->all_sum = vcmd->allGridSum();
|
||||
// Update GPU address for grid sync info. Use the offset adjustment for the right location
|
||||
gridSync = reinterpret_cast<uint64_t>(syncInfo);
|
||||
}
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &gridSync, it.size_, it.offset_);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2177,32 +2199,36 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
* the list of kernel parameters.
|
||||
*/
|
||||
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
if (vcmd.cooperativeGroups()) {
|
||||
uint32_t workgroups = 0;
|
||||
for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
|
||||
if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
|
||||
workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) {
|
||||
// Get device queue for exclusive GPU access
|
||||
VirtualGPU* queue = dev().xferQueue();
|
||||
|
||||
// Lock the queue, using the blit manager lock
|
||||
amd::ScopedLock lock(queue->blitMgr().lockXfer());
|
||||
|
||||
// Wait for the execution on the current queue, since the coop groups will use the device queue
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
// Lock the queue, using the blit manager lock
|
||||
amd::ScopedLock lock(queue->blitMgr().lockXfer());
|
||||
queue->profilingBegin(vcmd);
|
||||
|
||||
static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups);
|
||||
if (vcmd.cooperativeGroups()) {
|
||||
// Initialize GWS if it's cooperative groups launch
|
||||
uint32_t workgroups = 0;
|
||||
for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
|
||||
if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
|
||||
workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups - 1);
|
||||
}
|
||||
|
||||
// Sync AQL packets
|
||||
queue->setAqlHeader(dispatchPacketHeader_);
|
||||
|
||||
// Submit kernel to HW
|
||||
if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
|
||||
static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
|
||||
static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), &vcmd)) {
|
||||
LogError("AQL dispatch failed!");
|
||||
vcmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
@@ -2218,7 +2244,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
|
||||
// Submit kernel to HW
|
||||
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
|
||||
static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
|
||||
static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) {
|
||||
LogError("AQL dispatch failed!");
|
||||
vcmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
|
||||
@@ -171,7 +171,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
const_address parameters, //!< Parameters for the kernel
|
||||
void* event_handle, //!< Handle to OCL event for debugging
|
||||
uint32_t sharedMemBytes = 0, //!< Shared memory size
|
||||
bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required
|
||||
amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command
|
||||
);
|
||||
void submitNativeFn(amd::NativeFnCommand& cmd);
|
||||
void submitMarker(amd::Marker& cmd);
|
||||
|
||||
@@ -232,12 +232,19 @@ const Context& Command::context() const { return queue_->context(); }
|
||||
|
||||
NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
|
||||
Kernel& kernel, const NDRangeContainer& sizes,
|
||||
uint32_t sharedMemBytes, uint32_t extraParam)
|
||||
: Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL)
|
||||
, kernel_(kernel)
|
||||
, sizes_(sizes)
|
||||
, sharedMemBytes_(sharedMemBytes)
|
||||
, extraParam_(extraParam) {
|
||||
uint32_t sharedMemBytes, uint32_t extraParam,
|
||||
uint32_t gridId, uint32_t numGrids,
|
||||
uint64_t prevGridSum, uint64_t allGridSum, uint32_t firstDevice) :
|
||||
Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL),
|
||||
kernel_(kernel),
|
||||
sizes_(sizes),
|
||||
sharedMemBytes_(sharedMemBytes),
|
||||
extraParam_(extraParam),
|
||||
gridId_(gridId),
|
||||
numGrids_(numGrids),
|
||||
prevGridSum_(prevGridSum),
|
||||
allGridSum_(allGridSum),
|
||||
firstDevice_(firstDevice) {
|
||||
auto& device = queue.device();
|
||||
auto devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(device));
|
||||
profilingInfo_.setCallback(devKernel->getProfilingCallback(
|
||||
|
||||
@@ -764,9 +764,15 @@ class NDRangeKernelCommand : public Command {
|
||||
private:
|
||||
Kernel& kernel_;
|
||||
NDRangeContainer sizes_;
|
||||
address parameters_;
|
||||
uint32_t sharedMemBytes_;
|
||||
uint32_t extraParam_;
|
||||
address parameters_; //!< Pointer to the kernel arguments
|
||||
// The below fields are specific to the HIP functionality
|
||||
uint32_t sharedMemBytes_; //!< Size of reserved shared memory
|
||||
uint32_t extraParam_; //!< Extra flags for the kernel launch
|
||||
uint32_t gridId_; //!< Grid ID in the multi GPU kernel launch
|
||||
uint32_t numGrids_; //!< Total number of grids in multi GPU launch
|
||||
uint64_t prevGridSum_; //!< A sum of previous grids to the current launch
|
||||
uint64_t allGridSum_; //!< A sum of all grids in multi GPU launch
|
||||
uint32_t firstDevice_; //!< Device index of the first device in the grid
|
||||
|
||||
public:
|
||||
enum {
|
||||
@@ -777,7 +783,8 @@ class NDRangeKernelCommand : public Command {
|
||||
//! Construct an ExecuteKernel command
|
||||
NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
|
||||
const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0,
|
||||
uint32_t extraParam = 0);
|
||||
uint32_t extraParam = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
|
||||
uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
|
||||
|
||||
virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); }
|
||||
|
||||
@@ -804,6 +811,21 @@ class NDRangeKernelCommand : public Command {
|
||||
return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false;
|
||||
}
|
||||
|
||||
//! Return the current grid ID for multidevice launch
|
||||
uint32_t gridId() const { return gridId_; }
|
||||
|
||||
//! Return the number of launched grids
|
||||
uint32_t numGrids() const { return numGrids_; }
|
||||
|
||||
//! Return the total workload size of all grids preceding the current one
|
||||
uint64_t prevGridSum() const { return prevGridSum_; }
|
||||
|
||||
//! Return the total workload size for all GPUs
|
||||
uint64_t allGridSum() const { return allGridSum_; }
|
||||
|
||||
//! Return the index of the first device in multi GPU launch
|
||||
uint64_t firstDevice() const { return firstDevice_; }
|
||||
|
||||
//! Set the local work size.
|
||||
void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user