P4 to Git Change 1809277 by gandryey@gera-win10 on 2019/06/11 17:34:13
SWDEV-180872 - Runtime support changes for Cooperative Group Features - Initial implementation of the core functionality. Disabled by default. Use GPU_ENABLE_COOP_GROUPS=1 to enable the feature. - Runtime uses device queue for cooperative executions with a synchronization on the launched queue. - The current implementation is pure runtime change and it can work if only one app uses this feature. No ROCr/KFD support was added or tested - Only inline assembler was tested Affected files ... ... //depot/stg/opencl/drivers/opencl/api/hip/hip_device.cpp#20 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_device_runtime.cpp#15 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_hcc.def.in#15 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_hcc.map.in#17 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#28 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_platform.cpp#32 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#338 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#606 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#171 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#31 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#142 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#39 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palschedcl.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#135 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#61 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#32 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#12 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#127 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#37 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocschedcl.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#75 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#94 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#92 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#311 edit
This commit is contained in:
@@ -192,6 +192,8 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
|
||||
//deviceProps.isMultiGpuBoard = info.;
|
||||
deviceProps.canMapHostMemory = 1;
|
||||
deviceProps.gcnArch = info.gfxipVersion_;
|
||||
deviceProps.cooperativeLaunch = info.cooperativeGroups_;
|
||||
deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_;
|
||||
|
||||
*props = deviceProps;
|
||||
HIP_RETURN(hipSuccess);
|
||||
|
||||
@@ -235,6 +235,12 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
|
||||
case hipDeviceAttributeIsMultiGpuBoard:
|
||||
*pi = prop.isMultiGpuBoard;
|
||||
break;
|
||||
case hipDeviceAttributeCooperativeLaunch:
|
||||
*pi = prop.cooperativeLaunch;
|
||||
break;
|
||||
case hipDeviceAttributeCooperativeMultiDeviceLaunch:
|
||||
*pi = prop.cooperativeMultiDeviceLaunch;
|
||||
break;
|
||||
default:
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
@@ -401,7 +407,7 @@ hipError_t hipGetDevice ( int* deviceId ) {
|
||||
|
||||
if (deviceId != nullptr) {
|
||||
int dev = ihipGetDevice();
|
||||
assert(dev != -1);
|
||||
assert(dev != -1);
|
||||
*deviceId = dev;
|
||||
HIP_RETURN(hipSuccess);
|
||||
} else {
|
||||
|
||||
@@ -110,10 +110,14 @@ hipModuleGetFunction
|
||||
hipModuleGetGlobal
|
||||
hipModuleLaunchKernel
|
||||
hipModuleLaunchKernelExt
|
||||
hipLaunchCooperativeKernel
|
||||
hipLaunchCooperativeMultiDeviceKernel
|
||||
hipHccModuleLaunchKernel
|
||||
hipModuleLoad
|
||||
hipModuleLoadData
|
||||
hipModuleUnload
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
||||
hipFuncGetAttributes
|
||||
hipPeekAtLastError
|
||||
hipPointerGetAttributes
|
||||
|
||||
@@ -111,9 +111,13 @@ global:
|
||||
hipModuleGetGlobal;
|
||||
hipModuleLaunchKernel;
|
||||
hipModuleLaunchKernelExt;
|
||||
hipLaunchCooperativeKernel;
|
||||
hipLaunchCooperativeMultiDeviceKernel;
|
||||
hipModuleLoad;
|
||||
hipModuleLoadData;
|
||||
hipModuleUnload;
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor;
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
|
||||
hipFuncGetAttributes;
|
||||
hipPeekAtLastError;
|
||||
hipPointerGetAttributes;
|
||||
|
||||
@@ -170,7 +170,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
|
||||
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
|
||||
uint32_t sharedMemBytes, hipStream_t hStream,
|
||||
void **kernelParams, void **extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0)
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
|
||||
uint32_t params = 0)
|
||||
{
|
||||
HIP_INIT_API(f, gridDimX, gridDimY, gridDimZ,
|
||||
blockDimX, blockDimY, blockDimZ,
|
||||
@@ -193,6 +194,14 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
|
||||
hip::getNullStream()->finish();
|
||||
queue = as_amd(reinterpret_cast<cl_command_queue>(hStream))->asHostQueue();
|
||||
}
|
||||
if ((params & amd::NDRangeKernelCommand::CooperativeGroups) &&
|
||||
!device->info().cooperativeGroups_) {
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
if ((params & amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups) &&
|
||||
!device->info().cooperativeMultiDeviceGroups_) {
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
if (!queue) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
@@ -203,25 +212,29 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
|
||||
amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
|
||||
amd::Command::EventWaitList waitList;
|
||||
|
||||
address kernargs = nullptr;
|
||||
|
||||
// 'extra' is a struct that contains the following info: {
|
||||
// HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs,
|
||||
// HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernargs_size,
|
||||
// HIP_LAUNCH_PARAM_END }
|
||||
if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER ||
|
||||
extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || extra[4] != HIP_LAUNCH_PARAM_END) {
|
||||
return hipErrorNotInitialized;
|
||||
if (extra != nullptr) {
|
||||
if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER ||
|
||||
extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || extra[4] != HIP_LAUNCH_PARAM_END) {
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
kernargs = reinterpret_cast<address>(extra[1]);
|
||||
}
|
||||
address kernargs = reinterpret_cast<address>(extra[1]);
|
||||
|
||||
const amd::KernelSignature& signature = kernel->signature();
|
||||
for (size_t i = 0; i < signature.numParameters(); ++i) {
|
||||
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
||||
if (kernelParams == nullptr) {
|
||||
assert(extra);
|
||||
assert(kernargs != nullptr);
|
||||
kernel->parameters().set(i, desc.size_, kernargs + desc.offset_,
|
||||
desc.type_ == T_POINTER/*svmBound*/);
|
||||
} else {
|
||||
assert(!extra);
|
||||
assert(extra == nullptr);
|
||||
kernel->parameters().set(i, desc.size_, kernelParams[i], desc.type_ == T_POINTER/*svmBound*/);
|
||||
}
|
||||
}
|
||||
@@ -232,7 +245,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
|
||||
eStart->addMarker(queue, startCommand);
|
||||
}
|
||||
|
||||
amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(*queue, waitList, *kernel, ndrange, sharedMemBytes);
|
||||
amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(
|
||||
*queue, waitList, *kernel, ndrange, sharedMemBytes, params);
|
||||
if (!command) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
@@ -303,4 +317,43 @@ hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX,
|
||||
sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
|
||||
}
|
||||
|
||||
hipError_t hipLaunchCooperativeKernel(const void* f,
|
||||
dim3 gridDim, dim3 blockDim,
|
||||
void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream)
|
||||
{
|
||||
int deviceId = ihipGetDevice();
|
||||
hipFunction_t func = PlatformState::instance().getFunc(f, deviceId);
|
||||
if (func == nullptr) {
|
||||
HIP_RETURN(hipErrorUnknown);
|
||||
}
|
||||
|
||||
HIP_RETURN(ihipModuleLaunchKernel(func, gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z,
|
||||
blockDim.x, blockDim.y, blockDim.z,
|
||||
sharedMemBytes, hStream, kernelParams, nullptr, nullptr, nullptr, 0,
|
||||
amd::NDRangeKernelCommand::CooperativeGroups));
|
||||
}
|
||||
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags)
|
||||
{
|
||||
int deviceId = ihipGetDevice();
|
||||
|
||||
hipError_t result = hipErrorUnknown;
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& launch = launchParamsList[i];
|
||||
amd::HostQueue* queue = as_amd(reinterpret_cast<cl_command_queue>(launch.stream))->asHostQueue();
|
||||
hipFunction_t func = PlatformState::instance().getFunc(launch.func, deviceId);
|
||||
if (func == nullptr) {
|
||||
HIP_RETURN(result);
|
||||
}
|
||||
result = ihipModuleLaunchKernel(func,
|
||||
launch.gridDim.x * launch.blockDim.x,
|
||||
launch.gridDim.y * launch.blockDim.y,
|
||||
launch.gridDim.z * launch.blockDim.z,
|
||||
launch.blockDim.x, launch.blockDim.y, launch.blockDim.z,
|
||||
launch.sharedMem, launch.stream,
|
||||
launch.args, nullptr, nullptr, nullptr, flags,
|
||||
(amd::NDRangeKernelCommand::CooperativeGroups | amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -448,6 +448,69 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor
|
||||
}
|
||||
|
||||
|
||||
namespace hip_impl {
|
||||
|
||||
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
|
||||
const void* f,
|
||||
int blockSize,
|
||||
size_t dynamicSMemSize)
|
||||
{
|
||||
HIP_INIT_API(f, blockSize, dynamicSMemSize);
|
||||
int deviceId = ihipGetDevice();
|
||||
hipFunction_t func = PlatformState::instance().getFunc(f, deviceId);
|
||||
if (func == nullptr) {
|
||||
HIP_RETURN(hipErrorUnknown);
|
||||
}
|
||||
|
||||
hip::Function* function = hip::Function::asFunction(func);
|
||||
amd::Kernel* kernel = function->function_;
|
||||
if (!kernel) {
|
||||
HIP_RETURN(hipErrorOutOfMemory);
|
||||
}
|
||||
if (blockSize == 0) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
amd::Device* device = hip::getCurrentContext()->devices()[0];
|
||||
const device::Kernel::WorkGroupInfo* wrkGrpInfo = kernel->getDeviceKernel(*device)->workGroupInfo();
|
||||
|
||||
// Find threads accupancy per CU => simd_per_cu * GPR usage
|
||||
constexpr size_t MaxWavesPerSimd = 8; // Limited by SPI 32 per CU, hence 8 per SIMD
|
||||
size_t alu_accupancy = device->info().simdPerCU_ *
|
||||
std::min(MaxWavesPerSimd, (wrkGrpInfo->availableVGPRs_ / amd::alignUp(wrkGrpInfo->usedVGPRs_, 4)));
|
||||
|
||||
alu_accupancy *= wrkGrpInfo->wavefrontSize_;
|
||||
// Calculate blocks occupancy per CU
|
||||
*numBlocks = alu_accupancy / amd::alignUp(blockSize, wrkGrpInfo->wavefrontSize_);
|
||||
|
||||
size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize;
|
||||
if (total_used_lds != 0) {
|
||||
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
|
||||
int lds_occupancy = static_cast<int>(device->info().localMemSize_ / total_used_lds);
|
||||
*numBlocks = std::min(*numBlocks, lds_occupancy);
|
||||
}
|
||||
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
|
||||
const void* f,
|
||||
int blockSize,
|
||||
size_t dynamicSMemSize)
|
||||
{
|
||||
HIP_RETURN(hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize));
|
||||
}
|
||||
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
|
||||
const void* f,
|
||||
int blockSize,
|
||||
size_t dynamicSMemSize,
|
||||
unsigned int flags)
|
||||
{
|
||||
HIP_RETURN(hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize));
|
||||
}
|
||||
|
||||
|
||||
#if defined(ATI_OS_LINUX)
|
||||
|
||||
namespace hip_impl {
|
||||
@@ -668,6 +731,20 @@ void hipLaunchKernelGGLImpl(
|
||||
sharedMemBytes, stream, nullptr, kernarg);
|
||||
}
|
||||
|
||||
void hipLaunchCooperativeKernelGGLImpl(
|
||||
uintptr_t function_address,
|
||||
const dim3& numBlocks,
|
||||
const dim3& dimBlocks,
|
||||
uint32_t sharedMemBytes,
|
||||
hipStream_t stream,
|
||||
void** kernarg)
|
||||
{
|
||||
HIP_INIT();
|
||||
|
||||
hipLaunchCooperativeKernel(reinterpret_cast<void*>(function_address),
|
||||
numBlocks, dimBlocks, kernarg, sharedMemBytes, stream);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// conversion routines between float and half precision
|
||||
|
||||
مرجع در شماره جدید
Block a user