P4 to Git Change 2014404 by gandryey@gera-win10 on 2019/10/16 11:13:37

SWDEV-184710 - Support hipLaunchCooperativeKernelMultiDevice()
	- Add support for multi-grid launch in HIP
	- Detect the new hidden argument and pass the required information for the kernel launch
	- Memory for synchronization is allocated as a single object, and each GPU's region is then located by its offset within that object

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#44 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#343 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#25 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#82 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#136 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#90 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#99 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#97 edit


[ROCm/clr commit: 6e7e97987f]
Этот коммит содержится в:
foreman
2019-10-16 11:24:09 -04:00
родитель f49169e681
Коммит fb7da41d27
10 изменённых файлов: 151 добавлений и 52 удалений
+20 -3
Просмотреть файл
@@ -1153,7 +1153,24 @@ class Device : public RuntimeObject {
typedef aclCompiler Compiler;
public:
// The structures below for MGPU launch match the device library format
//! Synchronization data used by a multi GPU grid launch.
//! The layout must match the device library format; the meaning of the
//! individual words is defined by the device library and is not visible here.
struct MGSyncData {
uint32_t w0;  //!< First sync word (semantics owned by the device library)
uint32_t w1;  //!< Second sync word (semantics owned by the device library)
};
//! Per-device launch information for a multi GPU grid launch.
//! The layout must match the device library format. The fields are filled in
//! from the launch command right before the kernel dispatch.
struct MGSyncInfo {
struct MGSyncData* mgs;  //!< Sync data location; set to the slot of the first device in the launch
uint32_t grid_id;        //!< Grid ID in the multi GPU kernel launch
uint32_t num_grids;      //!< Total number of grids in the multi GPU launch
uint64_t prev_sum;       //!< Sum of the grids previous to the current launch
uint64_t all_sum;        //!< Sum of all grids in the multi GPU launch
};
static constexpr size_t kP2PStagingSize = 4 * Mi;
static constexpr size_t kMGSyncDataSize = sizeof(MGSyncData);
static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo);
typedef std::list<CommandQueue*> CommandQueues;
struct BlitProgram : public amd::HeapObject {
@@ -1409,9 +1426,9 @@ class Device : public RuntimeObject {
std::unique_ptr<amd::CacheCompilation> cacheCompilation_;
#endif
static amd::Context* glb_ctx_; //!< Global context with all devices
static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
static Memory* p2p_stage_; //!< Staging resources
static amd::Context* glb_ctx_; //!< Global context with all devices
static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
static Memory* p2p_stage_; //!< Staging resources
private:
bool IsTypeMatching(cl_device_type type, bool offlineDevices);
+3
Просмотреть файл
@@ -752,6 +752,9 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH
case ValueKind::HiddenCompletionAction:
*isHidden = true;
return amd::KernelParameterDescriptor::HiddenCompletionAction;
case ValueKind::HiddenMultiGridSyncArg:
*isHidden = true;
return amd::KernelParameterDescriptor::HiddenMultiGridSync;
case ValueKind::HiddenNone:
default:
*isHidden = true;
+10 -7
Просмотреть файл
@@ -110,7 +110,8 @@ static const std::map<std::string,ValueKind> ArgValueKind =
{"HiddenNone", ValueKind::HiddenNone},
{"HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer},
{"HiddenDefaultQueue", ValueKind::HiddenDefaultQueue},
{"HiddenCompletionAction", ValueKind::HiddenCompletionAction}
{"HiddenCompletionAction", ValueKind::HiddenCompletionAction},
{"HiddenMultigridSyncArg", ValueKind::HiddenMultiGridSyncArg}
};
static const std::map<std::string,ValueType> ArgValueType =
@@ -223,7 +224,8 @@ static const std::map<std::string,ValueKind> ArgValueKindV3 =
{"hidden_none", ValueKind::HiddenNone},
{"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer},
{"hidden_default_queue", ValueKind::HiddenDefaultQueue},
{"hidden_completion_action", ValueKind::HiddenCompletionAction}
{"hidden_completion_action", ValueKind::HiddenCompletionAction},
{"hidden_multigrid_sync_arg", ValueKind::HiddenMultiGridSyncArg}
};
static const std::map<std::string,ValueType> ArgValueTypeV3 =
@@ -317,19 +319,20 @@ struct KernelParameterDescriptor {
ValueObject = 10,
ImageObject = 11,
SamplerObject = 12,
QueueObject = 13
QueueObject = 13,
HiddenMultiGridSync = 14
};
clk_value_type_t type_; //!< The parameter's type
size_t offset_; //!< Its offset in the parameter's stack
size_t size_; //!< Its size in bytes
union InfoData {
struct {
uint32_t oclObject_ : 4; //!< OCL object type
uint32_t oclObject_ : 4; //!< OCL object type
uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only
uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
uint32_t defined_ : 1; //!< The argument was defined by the app
uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
uint32_t defined_ : 1; //!< The argument was defined by the app
uint32_t reserved_ : 1; //!< reserved
uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment
uint32_t arrayIndex_ : 24;//!< Index in the objects array or LDS alignment
};
uint32_t allValues_;
InfoData() : allValues_(0) {}
+2
Просмотреть файл
@@ -336,6 +336,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
WriteAqlArgAt(const_cast<address>(parameters), vmParentWrap, it.size_, it.offset_);
}
break;
case amd::KernelParameterDescriptor::HiddenMultiGridSync:
break;
}
}
+24 -13
Просмотреть файл
@@ -51,7 +51,7 @@ hsa_agent_t roc::Device::cpu_agent_ = {0};
std::vector<hsa_agent_t> roc::Device::gpu_agents_;
const bool roc::Device::offlineDevice_ = false;
const bool roc::NullDevice::offlineDevice_ = true;
address Device::mg_sync_ = nullptr;
static HsaDeviceId getHsaDeviceId(hsa_agent_t device, uint32_t& pci_id) {
if (HSA_STATUS_SUCCESS !=
@@ -175,6 +175,10 @@ Device::~Device() {
p2p_stage_->release();
p2p_stage_ = nullptr;
}
if (nullptr != mg_sync_) {
amd::SvmBuffer::free(GlbCtx(), mg_sync_);
mg_sync_ = nullptr;
}
if (glb_ctx_ != nullptr) {
glb_ctx_->release();
glb_ctx_ = nullptr;
@@ -715,28 +719,25 @@ bool Device::create(bool sramEccEnabled) {
// Use just 1 entry by default for the map cache
mapCache_->push_back(nullptr);
if ((p2p_agents_.size() == 0) &&
(glb_ctx_ == nullptr) && (gpu_agents_.size() > 1) &&
if ((glb_ctx_ == nullptr) && (gpu_agents_.size() >= 1) &&
// Allow creation for the last device in the list.
(gpu_agents_[gpu_agents_.size() - 1].handle == _bkendDevice.handle)) {
std::vector<amd::Device*> devices;
uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, false);
// Add all PAL devices
for (uint32_t i = 0; i < numDevices; ++i) {
devices.push_back(amd::Device::devices()[i]);
devices.push_back(amd::Device::devices()[i]);
}
// Add current
devices.push_back(this);
// Create a dummy context
glb_ctx_ = new amd::Context(devices, info);
if (glb_ctx_ == nullptr) {
return false;
}
if (devices.size() > 1) {
// Create a dummy context
glb_ctx_ = new amd::Context(devices, info);
if (glb_ctx_ == nullptr) {
return false;
}
amd::Buffer* buf =
new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
if ((p2p_agents_.size() == 0) && (devices.size() > 1)) {
amd::Buffer* buf = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
if ((buf != nullptr) && buf->create()) {
p2p_stage_ = buf;
}
@@ -745,6 +746,15 @@ bool Device::create(bool sramEccEnabled) {
return false;
}
}
// Check if sync buffer wasn't allocated yet
if (amd::IS_HIP && mg_sync_ == nullptr) {
mg_sync_ = reinterpret_cast<address>(amd::SvmBuffer::malloc(
GlbCtx(), (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS),
kMGInfoSizePerDevice * GlbCtx().devices().size(), kMGInfoSizePerDevice));
if (mg_sync_ == nullptr) {
return false;
}
}
}
if (settings().stagedXferSize_ != 0) {
@@ -1817,6 +1827,7 @@ VirtualGPU* Device::xferQueue() const {
xferQueue_->enableSyncBlit();
return xferQueue_;
}
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
bool result = true;
return result;
+11 -3
Просмотреть файл
@@ -410,6 +410,16 @@ class Device : public NullDevice {
hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; }
//! Acquire HSA queue. This method can create a new HSA queue or
//! share previously created
hsa_queue_t* acquireQueue(uint32_t queue_size_hint);
//! Release HSA queue
void releaseQueue(hsa_queue_t*);
//! Return the base address of the multi GPU grid launch sync buffer.
//! The buffer pointer is a static member, so every roc::Device returns the same address;
//! it is nullptr until the buffer has been allocated.
address MGSync() const { return mg_sync_; }
private:
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
@@ -440,6 +450,7 @@ class Device : public NullDevice {
std::atomic<size_t> freeMem_; //!< Total of free memory available
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode
static address mg_sync_; //!< MGPU grid launch sync memory (SVM location)
struct QueueInfo {
int refCount;
@@ -448,9 +459,6 @@ class Device : public NullDevice {
public:
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
hsa_queue_t *acquireQueue(uint32_t queue_size_hint);
void releaseQueue(hsa_queue_t*);
}; // class roc::Device
} // namespace roc
+41 -15
Просмотреть файл
@@ -1961,13 +1961,14 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
}
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
const_address parameters, void* eventHandle, uint32_t sharedMemBytes, bool cooperativeGroups) {
const_address parameters, void* eventHandle, uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
// Check memory dependency and SVM objects
if (!processMemObjects(kernel, parameters, ldsUsage, cooperativeGroups)) {
bool coopGroups = (vcmd != nullptr) ? vcmd->cooperativeGroups() : false;
if (!processMemObjects(kernel, parameters, ldsUsage, coopGroups)) {
LogError("Wrong memory objects!");
return false;
}
@@ -2099,6 +2100,27 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
WriteAqlArgAt(const_cast<address>(parameters), &spVA, it.size_, it.offset_);
break;
}
case amd::KernelParameterDescriptor::HiddenMultiGridSync: {
uint64_t gridSync = coopGroups ? 1 : 0;
bool multiGrid = (vcmd != nullptr) ? vcmd->cooperativeMultiDeviceGroups() : false;
if (multiGrid) {
// Find CPU pointer to the right sync info structure. It should be after MGSyncData
Device::MGSyncInfo* syncInfo = reinterpret_cast<Device::MGSyncInfo*>(
dev().MGSync() + Device::kMGInfoSizePerDevice * dev().index() + Device::kMGSyncDataSize);
// Update sync data address. Use the offset adjustment to the right location
syncInfo->mgs = reinterpret_cast<Device::MGSyncData*>(dev().MGSync() +
Device::kMGInfoSizePerDevice * vcmd->firstDevice());
// Fill all sync info fields
syncInfo->grid_id = vcmd->gridId();
syncInfo->num_grids = vcmd->numGrids();
syncInfo->prev_sum = vcmd->prevGridSum();
syncInfo->all_sum = vcmd->allGridSum();
// Update GPU address for grid sync info. Use the offset adjustment for the right location
gridSync = reinterpret_cast<uint64_t>(syncInfo);
}
WriteAqlArgAt(const_cast<address>(parameters), &gridSync, it.size_, it.offset_);
break;
}
}
}
@@ -2177,32 +2199,36 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
* the list of kernel parameters.
*/
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
if (vcmd.cooperativeGroups()) {
uint32_t workgroups = 0;
for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
}
}
if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) {
// Get device queue for exclusive GPU access
VirtualGPU* queue = dev().xferQueue();
// Lock the queue, using the blit manager lock
amd::ScopedLock lock(queue->blitMgr().lockXfer());
// Wait for the execution on the current queue, since the coop groups will use the device queue
releaseGpuMemoryFence();
// Lock the queue, using the blit manager lock
amd::ScopedLock lock(queue->blitMgr().lockXfer());
queue->profilingBegin(vcmd);
static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups);
if (vcmd.cooperativeGroups()) {
// Initialize GWS if it's cooperative groups launch
uint32_t workgroups = 0;
for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
}
}
static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups - 1);
}
// Sync AQL packets
queue->setAqlHeader(dispatchPacketHeader_);
// Submit kernel to HW
if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), &vcmd)) {
LogError("AQL dispatch failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
}
@@ -2218,7 +2244,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
// Submit kernel to HW
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) {
LogError("AQL dispatch failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
}
+1 -1
Просмотреть файл
@@ -171,7 +171,7 @@ class VirtualGPU : public device::VirtualDevice {
const_address parameters, //!< Parameters for the kernel
void* event_handle, //!< Handle to OCL event for debugging
uint32_t sharedMemBytes = 0, //!< Shared memory size
bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required
amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command
);
void submitNativeFn(amd::NativeFnCommand& cmd);
void submitMarker(amd::Marker& cmd);
+13 -6
Просмотреть файл
@@ -232,12 +232,19 @@ const Context& Command::context() const { return queue_->context(); }
NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
Kernel& kernel, const NDRangeContainer& sizes,
uint32_t sharedMemBytes, uint32_t extraParam)
: Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL)
, kernel_(kernel)
, sizes_(sizes)
, sharedMemBytes_(sharedMemBytes)
, extraParam_(extraParam) {
uint32_t sharedMemBytes, uint32_t extraParam,
uint32_t gridId, uint32_t numGrids,
uint64_t prevGridSum, uint64_t allGridSum, uint32_t firstDevice) :
Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL),
kernel_(kernel),
sizes_(sizes),
sharedMemBytes_(sharedMemBytes),
extraParam_(extraParam),
gridId_(gridId),
numGrids_(numGrids),
prevGridSum_(prevGridSum),
allGridSum_(allGridSum),
firstDevice_(firstDevice) {
auto& device = queue.device();
auto devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(device));
profilingInfo_.setCallback(devKernel->getProfilingCallback(
+26 -4
Просмотреть файл
@@ -764,9 +764,15 @@ class NDRangeKernelCommand : public Command {
private:
Kernel& kernel_;
NDRangeContainer sizes_;
address parameters_;
uint32_t sharedMemBytes_;
uint32_t extraParam_;
address parameters_; //!< Pointer to the kernel arguments
// The below fields are specific to the HIP functionality
uint32_t sharedMemBytes_; //!< Size of reserved shared memory
uint32_t extraParam_; //!< Extra flags for the kernel launch
uint32_t gridId_; //!< Grid ID in the multi GPU kernel launch
uint32_t numGrids_; //!< Total number of grids in multi GPU launch
uint64_t prevGridSum_; //!< A sum of previous grids to the current launch
uint64_t allGridSum_; //!< A sum of all grids in multi GPU launch
uint32_t firstDevice_; //!< Device index of the first device in the grid
public:
enum {
@@ -777,7 +783,8 @@ class NDRangeKernelCommand : public Command {
//! Construct an ExecuteKernel command
NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0,
uint32_t extraParam = 0);
uint32_t extraParam = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); }
@@ -804,6 +811,21 @@ class NDRangeKernelCommand : public Command {
return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false;
}
//! Return the ID of the current grid in a multi device launch (as passed to the constructor)
uint32_t gridId() const { return gridId_; }
//! Return the total number of grids in a multi device launch (as passed to the constructor)
uint32_t numGrids() const { return numGrids_; }
//! Return the total workload size of the grids previous to the current one in a multi GPU launch
uint64_t prevGridSum() const { return prevGridSum_; }
//! Return the total workload size of all grids (all GPUs) in a multi GPU launch
uint64_t allGridSum() const { return allGridSum_; }
//! Return the device index of the first device in a multi GPU launch.
//! NOTE(review): the return type is uint64_t while firstDevice_ is stored as uint32_t —
//! confirm whether the widening is intentional.
uint64_t firstDevice() const { return firstDevice_; }
//! Set the local work size.
void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }