From 17943639e4e2f686c7776d631fa5d672ceb224fb Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 12 Jun 2019 10:00:38 -0400
Subject: [PATCH] P4 to Git Change 1809277 by gandryey@gera-win10 on 2019/06/11
17:34:13
SWDEV-180872 - Runtime support changes for Cooperative Group Features
- Initial implementation of the core functionality. Disabled by default. Use GPU_ENABLE_COOP_GROUPS=1 to enable the feature.
- Runtime uses device queue for cooperative executions with a synchronization on the launched queue.
- The current implementation is pure runtime change and it can work if only one app uses this feature. No ROCr/KFD support was added or tested
- Only inline assembler was tested
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/hip/hip_device.cpp#20 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_device_runtime.cpp#15 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_hcc.def.in#15 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_hcc.map.in#17 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#28 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_platform.cpp#32 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#338 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#606 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#171 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#31 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#142 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palschedcl.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#135 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#61 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#32 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#127 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#37 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocschedcl.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#75 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#94 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#92 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#311 edit
---
api/hip/hip_device.cpp | 2 +
api/hip/hip_device_runtime.cpp | 8 +++-
api/hip/hip_hcc.def.in | 4 ++
api/hip/hip_hcc.map.in | 4 ++
api/hip/hip_module.cpp | 69 ++++++++++++++++++++++++++----
api/hip/hip_platform.cpp | 77 ++++++++++++++++++++++++++++++++++
6 files changed, 155 insertions(+), 9 deletions(-)
diff --git a/api/hip/hip_device.cpp b/api/hip/hip_device.cpp
index a3019cc7d7..622ad150a1 100644
--- a/api/hip/hip_device.cpp
+++ b/api/hip/hip_device.cpp
@@ -192,6 +192,8 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
//deviceProps.isMultiGpuBoard = info.;
deviceProps.canMapHostMemory = 1;
deviceProps.gcnArch = info.gfxipVersion_;
+ deviceProps.cooperativeLaunch = info.cooperativeGroups_;
+ deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_;
*props = deviceProps;
HIP_RETURN(hipSuccess);
diff --git a/api/hip/hip_device_runtime.cpp b/api/hip/hip_device_runtime.cpp
index 6bb4261964..fea9fb4dd9 100644
--- a/api/hip/hip_device_runtime.cpp
+++ b/api/hip/hip_device_runtime.cpp
@@ -235,6 +235,12 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
case hipDeviceAttributeIsMultiGpuBoard:
*pi = prop.isMultiGpuBoard;
break;
+ case hipDeviceAttributeCooperativeLaunch:
+ *pi = prop.cooperativeLaunch;
+ break;
+ case hipDeviceAttributeCooperativeMultiDeviceLaunch:
+ *pi = prop.cooperativeMultiDeviceLaunch;
+ break;
default:
HIP_RETURN(hipErrorInvalidValue);
}
@@ -401,7 +407,7 @@ hipError_t hipGetDevice ( int* deviceId ) {
if (deviceId != nullptr) {
int dev = ihipGetDevice();
- assert(dev != -1);
+ assert(dev != -1);
*deviceId = dev;
HIP_RETURN(hipSuccess);
} else {
diff --git a/api/hip/hip_hcc.def.in b/api/hip/hip_hcc.def.in
index 44dd6edea7..662bfd04c3 100644
--- a/api/hip/hip_hcc.def.in
+++ b/api/hip/hip_hcc.def.in
@@ -110,10 +110,14 @@ hipModuleGetFunction
hipModuleGetGlobal
hipModuleLaunchKernel
hipModuleLaunchKernelExt
+hipLaunchCooperativeKernel
+hipLaunchCooperativeMultiDeviceKernel
hipHccModuleLaunchKernel
hipModuleLoad
hipModuleLoadData
hipModuleUnload
+hipOccupancyMaxActiveBlocksPerMultiprocessor
+hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
hipFuncGetAttributes
hipPeekAtLastError
hipPointerGetAttributes
diff --git a/api/hip/hip_hcc.map.in b/api/hip/hip_hcc.map.in
index d0b862eaa3..f6ab5a533b 100644
--- a/api/hip/hip_hcc.map.in
+++ b/api/hip/hip_hcc.map.in
@@ -111,9 +111,13 @@ global:
hipModuleGetGlobal;
hipModuleLaunchKernel;
hipModuleLaunchKernelExt;
+ hipLaunchCooperativeKernel;
+ hipLaunchCooperativeMultiDeviceKernel;
hipModuleLoad;
hipModuleLoadData;
hipModuleUnload;
+ hipOccupancyMaxActiveBlocksPerMultiprocessor;
+ hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
hipFuncGetAttributes;
hipPeekAtLastError;
hipPointerGetAttributes;
diff --git a/api/hip/hip_module.cpp b/api/hip/hip_module.cpp
index 2163ef3da4..edc3ba4384 100644
--- a/api/hip/hip_module.cpp
+++ b/api/hip/hip_module.cpp
@@ -170,7 +170,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
uint32_t sharedMemBytes, hipStream_t hStream,
void **kernelParams, void **extra,
- hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0)
+ hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
+ uint32_t params = 0)
{
HIP_INIT_API(f, gridDimX, gridDimY, gridDimZ,
blockDimX, blockDimY, blockDimZ,
@@ -193,6 +194,14 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
hip::getNullStream()->finish();
queue = as_amd(reinterpret_cast(hStream))->asHostQueue();
}
+ if ((params & amd::NDRangeKernelCommand::CooperativeGroups) &&
+ !device->info().cooperativeGroups_) {
+ return hipErrorLaunchFailure;
+ }
+ if ((params & amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups) &&
+ !device->info().cooperativeMultiDeviceGroups_) {
+ return hipErrorLaunchFailure;
+ }
if (!queue) {
return hipErrorOutOfMemory;
}
@@ -203,25 +212,29 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
amd::Command::EventWaitList waitList;
+ address kernargs = nullptr;
+
// 'extra' is a struct that contains the following info: {
// HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs,
// HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernargs_size,
// HIP_LAUNCH_PARAM_END }
- if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER ||
- extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || extra[4] != HIP_LAUNCH_PARAM_END) {
- return hipErrorNotInitialized;
+ if (extra != nullptr) {
+ if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER ||
+ extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || extra[4] != HIP_LAUNCH_PARAM_END) {
+ return hipErrorNotInitialized;
+ }
+ kernargs = reinterpret_cast(extra[1]);
}
- address kernargs = reinterpret_cast(extra[1]);
const amd::KernelSignature& signature = kernel->signature();
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
if (kernelParams == nullptr) {
- assert(extra);
+ assert(kernargs != nullptr);
kernel->parameters().set(i, desc.size_, kernargs + desc.offset_,
desc.type_ == T_POINTER/*svmBound*/);
} else {
- assert(!extra);
+ assert(extra == nullptr);
kernel->parameters().set(i, desc.size_, kernelParams[i], desc.type_ == T_POINTER/*svmBound*/);
}
}
@@ -232,7 +245,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
eStart->addMarker(queue, startCommand);
}
- amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(*queue, waitList, *kernel, ndrange, sharedMemBytes);
+ amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(
+ *queue, waitList, *kernel, ndrange, sharedMemBytes, params);
if (!command) {
return hipErrorOutOfMemory;
}
@@ -303,4 +317,43 @@ hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX,
sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
}
+hipError_t hipLaunchCooperativeKernel(const void* f,
+ dim3 gridDim, dim3 blockDim,
+ void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream)
+{
+ int deviceId = ihipGetDevice();
+ hipFunction_t func = PlatformState::instance().getFunc(f, deviceId);
+ if (func == nullptr) {
+ HIP_RETURN(hipErrorUnknown);
+ }
+ HIP_RETURN(ihipModuleLaunchKernel(func, gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z,
+ blockDim.x, blockDim.y, blockDim.z,
+ sharedMemBytes, hStream, kernelParams, nullptr, nullptr, nullptr, 0,
+ amd::NDRangeKernelCommand::CooperativeGroups));
+}
+
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+ int numDevices, unsigned int flags)
+{
+ int deviceId = ihipGetDevice();
+
+ hipError_t result = hipErrorUnknown;
+ for (int i = 0; i < numDevices; ++i) {
+ const hipLaunchParams& launch = launchParamsList[i];
+ amd::HostQueue* queue = as_amd(reinterpret_cast(launch.stream))->asHostQueue();
+ hipFunction_t func = PlatformState::instance().getFunc(launch.func, deviceId);
+ if (func == nullptr) {
+ HIP_RETURN(result);
+ }
+ result = ihipModuleLaunchKernel(func,
+ launch.gridDim.x * launch.blockDim.x,
+ launch.gridDim.y * launch.blockDim.y,
+ launch.gridDim.z * launch.blockDim.z,
+ launch.blockDim.x, launch.blockDim.y, launch.blockDim.z,
+ launch.sharedMem, launch.stream,
+ launch.args, nullptr, nullptr, nullptr, flags,
+ (amd::NDRangeKernelCommand::CooperativeGroups | amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups));
+ }
+ return result;
+}
diff --git a/api/hip/hip_platform.cpp b/api/hip/hip_platform.cpp
index f09bc1dea9..07a68faccf 100644
--- a/api/hip/hip_platform.cpp
+++ b/api/hip/hip_platform.cpp
@@ -448,6 +448,69 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor
}
+namespace hip_impl {
+
+hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+ const void* f,
+ int blockSize,
+ size_t dynamicSMemSize)
+{
+ HIP_INIT_API(f, blockSize, dynamicSMemSize);
+ int deviceId = ihipGetDevice();
+ hipFunction_t func = PlatformState::instance().getFunc(f, deviceId);
+ if (func == nullptr) {
+ HIP_RETURN(hipErrorUnknown);
+ }
+
+ hip::Function* function = hip::Function::asFunction(func);
+ amd::Kernel* kernel = function->function_;
+ if (!kernel) {
+ HIP_RETURN(hipErrorOutOfMemory);
+ }
+ if (blockSize == 0) {
+ HIP_RETURN(hipErrorInvalidValue);
+ }
+ amd::Device* device = hip::getCurrentContext()->devices()[0];
+ const device::Kernel::WorkGroupInfo* wrkGrpInfo = kernel->getDeviceKernel(*device)->workGroupInfo();
+
+ // Find threads accupancy per CU => simd_per_cu * GPR usage
+ constexpr size_t MaxWavesPerSimd = 8; // Limited by SPI 32 per CU, hence 8 per SIMD
+ size_t alu_accupancy = device->info().simdPerCU_ *
+ std::min(MaxWavesPerSimd, (wrkGrpInfo->availableVGPRs_ / amd::alignUp(wrkGrpInfo->usedVGPRs_, 4)));
+
+ alu_accupancy *= wrkGrpInfo->wavefrontSize_;
+ // Calculate blocks occupancy per CU
+ *numBlocks = alu_accupancy / amd::alignUp(blockSize, wrkGrpInfo->wavefrontSize_);
+
+ size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize;
+ if (total_used_lds != 0) {
+ // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
+ int lds_occupancy = static_cast(device->info().localMemSize_ / total_used_lds);
+ *numBlocks = std::min(*numBlocks, lds_occupancy);
+ }
+
+ HIP_RETURN(hipSuccess);
+}
+}
+
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+ const void* f,
+ int blockSize,
+ size_t dynamicSMemSize)
+{
+ HIP_RETURN(hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize));
+}
+
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
+ const void* f,
+ int blockSize,
+ size_t dynamicSMemSize,
+ unsigned int flags)
+{
+ HIP_RETURN(hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize));
+}
+
+
#if defined(ATI_OS_LINUX)
namespace hip_impl {
@@ -668,6 +731,20 @@ void hipLaunchKernelGGLImpl(
sharedMemBytes, stream, nullptr, kernarg);
}
+void hipLaunchCooperativeKernelGGLImpl(
+ uintptr_t function_address,
+ const dim3& numBlocks,
+ const dim3& dimBlocks,
+ uint32_t sharedMemBytes,
+ hipStream_t stream,
+ void** kernarg)
+{
+ HIP_INIT();
+
+ hipLaunchCooperativeKernel(reinterpret_cast(function_address),
+ numBlocks, dimBlocks, kernarg, sharedMemBytes, stream);
+}
+
}
// conversion routines between float and half precision