P4 to Git Change 1809277 by gandryey@gera-win10 on 2019/06/11 17:34:13

SWDEV-180872 - Runtime support changes for Cooperative Group Features
	- Initial implementation of the core functionality. Disabled by default. Use GPU_ENABLE_COOP_GROUPS=1 to enable the feature.
	- Runtime uses device queue for cooperative executions with a synchronization on the launched queue.
	- The current implementation is pure runtime change and it can work if only one app uses this feature. No ROCr/KFD support was added or tested
	- Only inline assembler was tested

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/hip/hip_device.cpp#20 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_device_runtime.cpp#15 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_hcc.def.in#15 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_hcc.map.in#17 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#28 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_platform.cpp#32 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#338 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#606 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#171 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#31 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#142 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palschedcl.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#135 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#61 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#32 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#127 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#37 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocschedcl.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#75 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#94 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#92 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#311 edit
This commit is contained in:
foreman
2019-06-12 10:00:38 -04:00
والد 21b1c1a183
کامیت 17943639e4
6فایلهای تغییر یافته به همراه155 افزوده شده و 9 حذف شده
+2
مشاهده پرونده
@@ -192,6 +192,8 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
//deviceProps.isMultiGpuBoard = info.;
deviceProps.canMapHostMemory = 1;
deviceProps.gcnArch = info.gfxipVersion_;
deviceProps.cooperativeLaunch = info.cooperativeGroups_;
deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_;
*props = deviceProps;
HIP_RETURN(hipSuccess);
@@ -235,6 +235,12 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
case hipDeviceAttributeIsMultiGpuBoard:
*pi = prop.isMultiGpuBoard;
break;
case hipDeviceAttributeCooperativeLaunch:
*pi = prop.cooperativeLaunch;
break;
case hipDeviceAttributeCooperativeMultiDeviceLaunch:
*pi = prop.cooperativeMultiDeviceLaunch;
break;
default:
HIP_RETURN(hipErrorInvalidValue);
}
@@ -401,7 +407,7 @@ hipError_t hipGetDevice ( int* deviceId ) {
if (deviceId != nullptr) {
int dev = ihipGetDevice();
assert(dev != -1);
assert(dev != -1);
*deviceId = dev;
HIP_RETURN(hipSuccess);
} else {
+4
مشاهده پرونده
@@ -110,10 +110,14 @@ hipModuleGetFunction
hipModuleGetGlobal
hipModuleLaunchKernel
hipModuleLaunchKernelExt
hipLaunchCooperativeKernel
hipLaunchCooperativeMultiDeviceKernel
hipHccModuleLaunchKernel
hipModuleLoad
hipModuleLoadData
hipModuleUnload
hipOccupancyMaxActiveBlocksPerMultiprocessor
hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
hipFuncGetAttributes
hipPeekAtLastError
hipPointerGetAttributes
+4
مشاهده پرونده
@@ -111,9 +111,13 @@ global:
hipModuleGetGlobal;
hipModuleLaunchKernel;
hipModuleLaunchKernelExt;
hipLaunchCooperativeKernel;
hipLaunchCooperativeMultiDeviceKernel;
hipModuleLoad;
hipModuleLoadData;
hipModuleUnload;
hipOccupancyMaxActiveBlocksPerMultiprocessor;
hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
hipFuncGetAttributes;
hipPeekAtLastError;
hipPointerGetAttributes;
+61 -8
مشاهده پرونده
@@ -170,7 +170,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
uint32_t sharedMemBytes, hipStream_t hStream,
void **kernelParams, void **extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0)
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
uint32_t params = 0)
{
HIP_INIT_API(f, gridDimX, gridDimY, gridDimZ,
blockDimX, blockDimY, blockDimZ,
@@ -193,6 +194,14 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
hip::getNullStream()->finish();
queue = as_amd(reinterpret_cast<cl_command_queue>(hStream))->asHostQueue();
}
if ((params & amd::NDRangeKernelCommand::CooperativeGroups) &&
!device->info().cooperativeGroups_) {
return hipErrorLaunchFailure;
}
if ((params & amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups) &&
!device->info().cooperativeMultiDeviceGroups_) {
return hipErrorLaunchFailure;
}
if (!queue) {
return hipErrorOutOfMemory;
}
@@ -203,25 +212,29 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
amd::Command::EventWaitList waitList;
address kernargs = nullptr;
// 'extra' is a struct that contains the following info: {
// HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs,
// HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernargs_size,
// HIP_LAUNCH_PARAM_END }
if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER ||
extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || extra[4] != HIP_LAUNCH_PARAM_END) {
return hipErrorNotInitialized;
if (extra != nullptr) {
if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER ||
extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || extra[4] != HIP_LAUNCH_PARAM_END) {
return hipErrorNotInitialized;
}
kernargs = reinterpret_cast<address>(extra[1]);
}
address kernargs = reinterpret_cast<address>(extra[1]);
const amd::KernelSignature& signature = kernel->signature();
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
if (kernelParams == nullptr) {
assert(extra);
assert(kernargs != nullptr);
kernel->parameters().set(i, desc.size_, kernargs + desc.offset_,
desc.type_ == T_POINTER/*svmBound*/);
} else {
assert(!extra);
assert(extra == nullptr);
kernel->parameters().set(i, desc.size_, kernelParams[i], desc.type_ == T_POINTER/*svmBound*/);
}
}
@@ -232,7 +245,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
eStart->addMarker(queue, startCommand);
}
amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(*queue, waitList, *kernel, ndrange, sharedMemBytes);
amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(
*queue, waitList, *kernel, ndrange, sharedMemBytes, params);
if (!command) {
return hipErrorOutOfMemory;
}
@@ -303,4 +317,43 @@ hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX,
sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
}
hipError_t hipLaunchCooperativeKernel(const void* f,
dim3 gridDim, dim3 blockDim,
void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream)
{
int deviceId = ihipGetDevice();
hipFunction_t func = PlatformState::instance().getFunc(f, deviceId);
if (func == nullptr) {
HIP_RETURN(hipErrorUnknown);
}
HIP_RETURN(ihipModuleLaunchKernel(func, gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z,
blockDim.x, blockDim.y, blockDim.z,
sharedMemBytes, hStream, kernelParams, nullptr, nullptr, nullptr, 0,
amd::NDRangeKernelCommand::CooperativeGroups));
}
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags)
{
int deviceId = ihipGetDevice();
hipError_t result = hipErrorUnknown;
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& launch = launchParamsList[i];
amd::HostQueue* queue = as_amd(reinterpret_cast<cl_command_queue>(launch.stream))->asHostQueue();
hipFunction_t func = PlatformState::instance().getFunc(launch.func, deviceId);
if (func == nullptr) {
HIP_RETURN(result);
}
result = ihipModuleLaunchKernel(func,
launch.gridDim.x * launch.blockDim.x,
launch.gridDim.y * launch.blockDim.y,
launch.gridDim.z * launch.blockDim.z,
launch.blockDim.x, launch.blockDim.y, launch.blockDim.z,
launch.sharedMem, launch.stream,
launch.args, nullptr, nullptr, nullptr, flags,
(amd::NDRangeKernelCommand::CooperativeGroups | amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups));
}
return result;
}
+77
مشاهده پرونده
@@ -448,6 +448,69 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor
}
namespace hip_impl {
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
const void* f,
int blockSize,
size_t dynamicSMemSize)
{
HIP_INIT_API(f, blockSize, dynamicSMemSize);
int deviceId = ihipGetDevice();
hipFunction_t func = PlatformState::instance().getFunc(f, deviceId);
if (func == nullptr) {
HIP_RETURN(hipErrorUnknown);
}
hip::Function* function = hip::Function::asFunction(func);
amd::Kernel* kernel = function->function_;
if (!kernel) {
HIP_RETURN(hipErrorOutOfMemory);
}
if (blockSize == 0) {
HIP_RETURN(hipErrorInvalidValue);
}
amd::Device* device = hip::getCurrentContext()->devices()[0];
const device::Kernel::WorkGroupInfo* wrkGrpInfo = kernel->getDeviceKernel(*device)->workGroupInfo();
// Find threads accupancy per CU => simd_per_cu * GPR usage
constexpr size_t MaxWavesPerSimd = 8; // Limited by SPI 32 per CU, hence 8 per SIMD
size_t alu_accupancy = device->info().simdPerCU_ *
std::min(MaxWavesPerSimd, (wrkGrpInfo->availableVGPRs_ / amd::alignUp(wrkGrpInfo->usedVGPRs_, 4)));
alu_accupancy *= wrkGrpInfo->wavefrontSize_;
// Calculate blocks occupancy per CU
*numBlocks = alu_accupancy / amd::alignUp(blockSize, wrkGrpInfo->wavefrontSize_);
size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize;
if (total_used_lds != 0) {
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
int lds_occupancy = static_cast<int>(device->info().localMemSize_ / total_used_lds);
*numBlocks = std::min(*numBlocks, lds_occupancy);
}
HIP_RETURN(hipSuccess);
}
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
const void* f,
int blockSize,
size_t dynamicSMemSize)
{
HIP_RETURN(hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize));
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
const void* f,
int blockSize,
size_t dynamicSMemSize,
unsigned int flags)
{
HIP_RETURN(hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize));
}
#if defined(ATI_OS_LINUX)
namespace hip_impl {
@@ -668,6 +731,20 @@ void hipLaunchKernelGGLImpl(
sharedMemBytes, stream, nullptr, kernarg);
}
void hipLaunchCooperativeKernelGGLImpl(
uintptr_t function_address,
const dim3& numBlocks,
const dim3& dimBlocks,
uint32_t sharedMemBytes,
hipStream_t stream,
void** kernarg)
{
HIP_INIT();
hipLaunchCooperativeKernel(reinterpret_cast<void*>(function_address),
numBlocks, dimBlocks, kernarg, sharedMemBytes, stream);
}
}
// conversion routines between float and half precision