From 17943639e4e2f686c7776d631fa5d672ceb224fb Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 12 Jun 2019 10:00:38 -0400 Subject: [PATCH] P4 to Git Change 1809277 by gandryey@gera-win10 on 2019/06/11 17:34:13 SWDEV-180872 - Runtime support changes for Cooperative Group Features - Initial implementation of the core functionality. Disabled by default. Use GPU_ENABLE_COOP_GROUPS=1 to enable the feature. - Runtime uses device queue for cooperative executions with a synchronization on the launched queue. - The current implementation is pure runtime change and it can work if only one app uses this feature. No ROCr/KFD support was added or tested - Only inline assembler was tested Affected files ... ... //depot/stg/opencl/drivers/opencl/api/hip/hip_device.cpp#20 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_device_runtime.cpp#15 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_hcc.def.in#15 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_hcc.map.in#17 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#28 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_platform.cpp#32 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#338 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#606 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#171 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#31 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#142 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#39 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palschedcl.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#135 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#61 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#32 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#12 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#127 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#37 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocschedcl.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#75 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#94 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#92 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#311 edit --- api/hip/hip_device.cpp | 2 + api/hip/hip_device_runtime.cpp | 8 +++- api/hip/hip_hcc.def.in | 4 ++ api/hip/hip_hcc.map.in | 4 ++ api/hip/hip_module.cpp | 69 ++++++++++++++++++++++++++---- api/hip/hip_platform.cpp | 77 ++++++++++++++++++++++++++++++++++ 6 files changed, 155 insertions(+), 9 deletions(-) diff --git a/api/hip/hip_device.cpp b/api/hip/hip_device.cpp index a3019cc7d7..622ad150a1 100644 --- a/api/hip/hip_device.cpp +++ b/api/hip/hip_device.cpp @@ -192,6 +192,8 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) //deviceProps.isMultiGpuBoard = info.; deviceProps.canMapHostMemory = 1; deviceProps.gcnArch = info.gfxipVersion_; + deviceProps.cooperativeLaunch = info.cooperativeGroups_; + deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_; *props = deviceProps; HIP_RETURN(hipSuccess); diff --git a/api/hip/hip_device_runtime.cpp b/api/hip/hip_device_runtime.cpp index 6bb4261964..fea9fb4dd9 100644 --- a/api/hip/hip_device_runtime.cpp +++ b/api/hip/hip_device_runtime.cpp @@ -235,6 +235,12 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) case hipDeviceAttributeIsMultiGpuBoard: *pi = prop.isMultiGpuBoard; break; + case hipDeviceAttributeCooperativeLaunch: + *pi = prop.cooperativeLaunch; + break; + case hipDeviceAttributeCooperativeMultiDeviceLaunch: + *pi = prop.cooperativeMultiDeviceLaunch; + break; default: HIP_RETURN(hipErrorInvalidValue); } @@ -401,7 +407,7 @@ hipError_t hipGetDevice ( int* deviceId ) { if (deviceId != nullptr) { int dev = ihipGetDevice(); - assert(dev != -1); + assert(dev != -1); *deviceId = dev; HIP_RETURN(hipSuccess); } else { diff --git a/api/hip/hip_hcc.def.in b/api/hip/hip_hcc.def.in index 44dd6edea7..662bfd04c3 100644 --- a/api/hip/hip_hcc.def.in +++ b/api/hip/hip_hcc.def.in @@ -110,10 +110,14 @@ hipModuleGetFunction hipModuleGetGlobal hipModuleLaunchKernel hipModuleLaunchKernelExt +hipLaunchCooperativeKernel +hipLaunchCooperativeMultiDeviceKernel hipHccModuleLaunchKernel hipModuleLoad hipModuleLoadData hipModuleUnload +hipOccupancyMaxActiveBlocksPerMultiprocessor +hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags hipFuncGetAttributes hipPeekAtLastError hipPointerGetAttributes diff --git a/api/hip/hip_hcc.map.in b/api/hip/hip_hcc.map.in index d0b862eaa3..f6ab5a533b 100644 --- a/api/hip/hip_hcc.map.in +++ b/api/hip/hip_hcc.map.in @@ -111,9 +111,13 @@ global: hipModuleGetGlobal; hipModuleLaunchKernel; hipModuleLaunchKernelExt; + hipLaunchCooperativeKernel; + hipLaunchCooperativeMultiDeviceKernel; hipModuleLoad; hipModuleLoadData; hipModuleUnload; + hipOccupancyMaxActiveBlocksPerMultiprocessor; + hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags; hipFuncGetAttributes; hipPeekAtLastError; hipPointerGetAttributes; diff --git a/api/hip/hip_module.cpp b/api/hip/hip_module.cpp index 2163ef3da4..edc3ba4384 100644 --- a/api/hip/hip_module.cpp +++ b/api/hip/hip_module.cpp @@ -170,7 +170,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra, - hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0) + hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0, + uint32_t params = 0) { HIP_INIT_API(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, @@ -193,6 +194,14 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, hip::getNullStream()->finish(); queue = as_amd(reinterpret_cast(hStream))->asHostQueue(); } + if ((params & amd::NDRangeKernelCommand::CooperativeGroups) && + !device->info().cooperativeGroups_) { + return hipErrorLaunchFailure; + } + if ((params & amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups) && + !device->info().cooperativeMultiDeviceGroups_) { + return hipErrorLaunchFailure; + } if (!queue) { return hipErrorOutOfMemory; } @@ -203,25 +212,29 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize); amd::Command::EventWaitList waitList; + address kernargs = nullptr; + // 'extra' is a struct that contains the following info: { // HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs, // HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernargs_size, // HIP_LAUNCH_PARAM_END } - if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER || - extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || extra[4] != HIP_LAUNCH_PARAM_END) { - return hipErrorNotInitialized; + if (extra != nullptr) { + if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER || + extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || extra[4] != HIP_LAUNCH_PARAM_END) { + return hipErrorNotInitialized; + } + kernargs = reinterpret_cast
(extra[1]); } - address kernargs = reinterpret_cast
(extra[1]); const amd::KernelSignature& signature = kernel->signature(); for (size_t i = 0; i < signature.numParameters(); ++i) { const amd::KernelParameterDescriptor& desc = signature.at(i); if (kernelParams == nullptr) { - assert(extra); + assert(kernargs != nullptr); kernel->parameters().set(i, desc.size_, kernargs + desc.offset_, desc.type_ == T_POINTER/*svmBound*/); } else { - assert(!extra); + assert(extra == nullptr); kernel->parameters().set(i, desc.size_, kernelParams[i], desc.type_ == T_POINTER/*svmBound*/); } } @@ -232,7 +245,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, eStart->addMarker(queue, startCommand); } - amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(*queue, waitList, *kernel, ndrange, sharedMemBytes); + amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand( + *queue, waitList, *kernel, ndrange, sharedMemBytes, params); if (!command) { return hipErrorOutOfMemory; } @@ -303,4 +317,43 @@ hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent)); } +hipError_t hipLaunchCooperativeKernel(const void* f, + dim3 gridDim, dim3 blockDim, + void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream) +{ + int deviceId = ihipGetDevice(); + hipFunction_t func = PlatformState::instance().getFunc(f, deviceId); + if (func == nullptr) { + HIP_RETURN(hipErrorUnknown); + } + HIP_RETURN(ihipModuleLaunchKernel(func, gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z, + blockDim.x, blockDim.y, blockDim.z, + sharedMemBytes, hStream, kernelParams, nullptr, nullptr, nullptr, 0, + amd::NDRangeKernelCommand::CooperativeGroups)); +} + +hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, + int numDevices, unsigned int flags) +{ + int deviceId = ihipGetDevice(); + + hipError_t result = hipErrorUnknown; + for (int i = 0; i < numDevices; ++i) { + const hipLaunchParams& launch = launchParamsList[i]; + amd::HostQueue* queue = as_amd(reinterpret_cast(launch.stream))->asHostQueue(); + hipFunction_t func = PlatformState::instance().getFunc(launch.func, deviceId); + if (func == nullptr) { + HIP_RETURN(result); + } + result = ihipModuleLaunchKernel(func, + launch.gridDim.x * launch.blockDim.x, + launch.gridDim.y * launch.blockDim.y, + launch.gridDim.z * launch.blockDim.z, + launch.blockDim.x, launch.blockDim.y, launch.blockDim.z, + launch.sharedMem, launch.stream, + launch.args, nullptr, nullptr, nullptr, flags, + (amd::NDRangeKernelCommand::CooperativeGroups | amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups)); + } + return result; +} diff --git a/api/hip/hip_platform.cpp b/api/hip/hip_platform.cpp index f09bc1dea9..07a68faccf 100644 --- a/api/hip/hip_platform.cpp +++ b/api/hip/hip_platform.cpp @@ -448,6 +448,69 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor } +namespace hip_impl { + +hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, + const void* f, + int blockSize, + size_t dynamicSMemSize) +{ + HIP_INIT_API(f, blockSize, dynamicSMemSize); + int deviceId = ihipGetDevice(); + hipFunction_t func = PlatformState::instance().getFunc(f, deviceId); + if (func == nullptr) { + HIP_RETURN(hipErrorUnknown); + } + + hip::Function* function = hip::Function::asFunction(func); + amd::Kernel* kernel = function->function_; + if (!kernel) { + HIP_RETURN(hipErrorOutOfMemory); + } + if (blockSize == 0) { + HIP_RETURN(hipErrorInvalidValue); + } + amd::Device* device = hip::getCurrentContext()->devices()[0]; + const device::Kernel::WorkGroupInfo* wrkGrpInfo = kernel->getDeviceKernel(*device)->workGroupInfo(); + + // Find threads accupancy per CU => simd_per_cu * GPR usage + constexpr size_t MaxWavesPerSimd = 8; // Limited by SPI 32 per CU, hence 8 per SIMD + size_t alu_accupancy = device->info().simdPerCU_ * + std::min(MaxWavesPerSimd, (wrkGrpInfo->availableVGPRs_ / amd::alignUp(wrkGrpInfo->usedVGPRs_, 4))); + + alu_accupancy *= wrkGrpInfo->wavefrontSize_; + // Calculate blocks occupancy per CU + *numBlocks = alu_accupancy / amd::alignUp(blockSize, wrkGrpInfo->wavefrontSize_); + + size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize; + if (total_used_lds != 0) { + // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) + int lds_occupancy = static_cast(device->info().localMemSize_ / total_used_lds); + *numBlocks = std::min(*numBlocks, lds_occupancy); + } + + HIP_RETURN(hipSuccess); +} +} + +hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, + const void* f, + int blockSize, + size_t dynamicSMemSize) +{ + HIP_RETURN(hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize)); +} + +hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, + const void* f, + int blockSize, + size_t dynamicSMemSize, + unsigned int flags) +{ + HIP_RETURN(hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize)); +} + + #if defined(ATI_OS_LINUX) namespace hip_impl { @@ -668,6 +731,20 @@ void hipLaunchKernelGGLImpl( sharedMemBytes, stream, nullptr, kernarg); } +void hipLaunchCooperativeKernelGGLImpl( + uintptr_t function_address, + const dim3& numBlocks, + const dim3& dimBlocks, + uint32_t sharedMemBytes, + hipStream_t stream, + void** kernarg) +{ + HIP_INIT(); + + hipLaunchCooperativeKernel(reinterpret_cast(function_address), + numBlocks, dimBlocks, kernarg, sharedMemBytes, stream); +} + } // conversion routines between float and half precision