P4 to Git Change 1809277 by gandryey@gera-win10 on 2019/06/11 17:34:13

SWDEV-180872 - Runtime support changes for Cooperative Group Features - Initial implementation of the core functionality. Disabled by default. Use GPU_ENABLE_COOP_GROUPS=1 to enable the feature. - Runtime uses device queue for cooperative executions with a synchronization on the launched queue. - The current implementation is pure runtime change and it can work if only one app uses this feature. No ROCr/KFD support was added or tested - Only inline assembler was tested Affected files ... ... //depot/stg/opencl/drivers/opencl/api/hip/hip_device.cpp#20 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_device_runtime.cpp#15 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_hcc.def.in#15 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_hcc.map.in#17 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#28 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_platform.cpp#32 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#338 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#606 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#171 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#31 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#142 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#39 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palschedcl.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#135 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#61 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#32 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#12 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#127 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#37 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocschedcl.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#75 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#94 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#92 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#311 edit
2019-06-12 10:00:38 -04:00
@@ -192,6 +192,8 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
  //deviceProps.isMultiGpuBoard = info.;
  deviceProps.canMapHostMemory = 1;
  deviceProps.gcnArch = info.gfxipVersion_;
+  deviceProps.cooperativeLaunch = info.cooperativeGroups_;
+  deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_;

  *props = deviceProps;
  HIP_RETURN(hipSuccess);
@@ -235,6 +235,12 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
  case hipDeviceAttributeIsMultiGpuBoard:
    *pi = prop.isMultiGpuBoard;
    break;
+  case hipDeviceAttributeCooperativeLaunch:
+    *pi = prop.cooperativeLaunch;
+    break;
+  case hipDeviceAttributeCooperativeMultiDeviceLaunch:
+    *pi = prop.cooperativeMultiDeviceLaunch;
+    break;
  default:
    HIP_RETURN(hipErrorInvalidValue);
  }
@@ -401,7 +407,7 @@ hipError_t hipGetDevice ( int* deviceId ) {

  if (deviceId != nullptr) {
    int dev = ihipGetDevice();
-    assert(dev != -1); 
+    assert(dev != -1);
    *deviceId = dev;
    HIP_RETURN(hipSuccess);
  } else {
@@ -110,10 +110,14 @@ hipModuleGetFunction
 hipModuleGetGlobal
 hipModuleLaunchKernel
 hipModuleLaunchKernelExt
+hipLaunchCooperativeKernel
+hipLaunchCooperativeMultiDeviceKernel
 hipHccModuleLaunchKernel
 hipModuleLoad
 hipModuleLoadData
 hipModuleUnload
+hipOccupancyMaxActiveBlocksPerMultiprocessor
+hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
 hipFuncGetAttributes
 hipPeekAtLastError
 hipPointerGetAttributes
@@ -111,9 +111,13 @@ global:
    hipModuleGetGlobal;
    hipModuleLaunchKernel;
    hipModuleLaunchKernelExt;
+    hipLaunchCooperativeKernel;
+    hipLaunchCooperativeMultiDeviceKernel;
    hipModuleLoad;
    hipModuleLoadData;
    hipModuleUnload;
+    hipOccupancyMaxActiveBlocksPerMultiprocessor;
+    hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
    hipFuncGetAttributes;
    hipPeekAtLastError;
    hipPointerGetAttributes;
@@ -170,7 +170,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
                                 uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
                                 uint32_t sharedMemBytes, hipStream_t hStream,
                                 void **kernelParams, void **extra,
-                                 hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0)
+                                 hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
+                                 uint32_t params = 0)
 {
  HIP_INIT_API(f, gridDimX, gridDimY, gridDimZ,
               blockDimX, blockDimY, blockDimZ,
@@ -193,6 +194,14 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
    hip::getNullStream()->finish();
    queue = as_amd(reinterpret_cast<cl_command_queue>(hStream))->asHostQueue();
  }
+  if ((params & amd::NDRangeKernelCommand::CooperativeGroups) &&
+      !device->info().cooperativeGroups_) {
+    return hipErrorLaunchFailure;
+  }
+  if ((params & amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups) &&
+      !device->info().cooperativeMultiDeviceGroups_) {
+    return hipErrorLaunchFailure;
+  }
  if (!queue) {
    return hipErrorOutOfMemory;
  }
@@ -203,25 +212,29 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
  amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
  amd::Command::EventWaitList waitList;

+  address kernargs = nullptr;
+
  // 'extra' is a struct that contains the following info: {
  //   HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs,
  //   HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernargs_size,
  //   HIP_LAUNCH_PARAM_END }
-  if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER ||
-      extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || extra[4] != HIP_LAUNCH_PARAM_END) {
-    return hipErrorNotInitialized;
+  if (extra != nullptr) {
+    if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER ||
+        extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || extra[4] != HIP_LAUNCH_PARAM_END) {
+      return hipErrorNotInitialized;
+    }
+    kernargs = reinterpret_cast<address>(extra[1]);
  }
-  address kernargs = reinterpret_cast<address>(extra[1]);

  const amd::KernelSignature& signature = kernel->signature();
  for (size_t i = 0; i < signature.numParameters(); ++i) {
    const amd::KernelParameterDescriptor& desc = signature.at(i);
    if (kernelParams == nullptr) {
-      assert(extra);
+      assert(kernargs != nullptr);
      kernel->parameters().set(i, desc.size_, kernargs + desc.offset_,
                               desc.type_ == T_POINTER/*svmBound*/);
    } else {
-      assert(!extra);
+      assert(extra == nullptr);
      kernel->parameters().set(i, desc.size_, kernelParams[i], desc.type_ == T_POINTER/*svmBound*/);
    }
  }
@@ -232,7 +245,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
    eStart->addMarker(queue, startCommand);
  }

-  amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(*queue, waitList, *kernel, ndrange, sharedMemBytes);
+  amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(
+    *queue, waitList, *kernel, ndrange, sharedMemBytes, params);
  if (!command) {
    return hipErrorOutOfMemory;
  }
@@ -303,4 +317,43 @@ hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX,
                                sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
 }

+hipError_t hipLaunchCooperativeKernel(const void* f,
+                                      dim3 gridDim, dim3 blockDim,
+                                      void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream)
+{
+  int deviceId = ihipGetDevice();
+  hipFunction_t func = PlatformState::instance().getFunc(f, deviceId);
+  if (func == nullptr) {
+    HIP_RETURN(hipErrorUnknown);
+  }

+  HIP_RETURN(ihipModuleLaunchKernel(func, gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z,
+                                blockDim.x, blockDim.y, blockDim.z,
+                                sharedMemBytes, hStream, kernelParams, nullptr, nullptr, nullptr, 0,
+                                amd::NDRangeKernelCommand::CooperativeGroups));
+}
+
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int  numDevices, unsigned int  flags)
+{
+  int deviceId = ihipGetDevice();
+
+  hipError_t result = hipErrorUnknown;
+  for (int i = 0; i < numDevices; ++i) {
+    const hipLaunchParams& launch = launchParamsList[i];
+    amd::HostQueue* queue = as_amd(reinterpret_cast<cl_command_queue>(launch.stream))->asHostQueue();
+    hipFunction_t func = PlatformState::instance().getFunc(launch.func, deviceId);
+    if (func == nullptr) {
+      HIP_RETURN(result);
+    }
+    result = ihipModuleLaunchKernel(func,
+      launch.gridDim.x * launch.blockDim.x,
+      launch.gridDim.y * launch.blockDim.y,
+      launch.gridDim.z * launch.blockDim.z,
+      launch.blockDim.x, launch.blockDim.y, launch.blockDim.z,
+      launch.sharedMem, launch.stream,
+      launch.args, nullptr, nullptr, nullptr, flags,
+      (amd::NDRangeKernelCommand::CooperativeGroups | amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups));
+  }
+  return result;
+}
@@ -448,6 +448,69 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor
 }


+namespace hip_impl {
+
+hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+                                                         const void* f,
+                                                         int  blockSize,
+                                                         size_t dynamicSMemSize)
+{
+  HIP_INIT_API(f, blockSize, dynamicSMemSize);
+  int deviceId = ihipGetDevice();
+  hipFunction_t func = PlatformState::instance().getFunc(f, deviceId);
+  if (func == nullptr) {
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  hip::Function* function = hip::Function::asFunction(func);
+  amd::Kernel* kernel = function->function_;
+  if (!kernel) {
+    HIP_RETURN(hipErrorOutOfMemory);
+  }
+  if (blockSize == 0) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  amd::Device* device = hip::getCurrentContext()->devices()[0];
+  const device::Kernel::WorkGroupInfo* wrkGrpInfo = kernel->getDeviceKernel(*device)->workGroupInfo();
+
+  // Find threads accupancy per CU => simd_per_cu * GPR usage
+  constexpr size_t MaxWavesPerSimd = 8;  // Limited by SPI 32 per CU, hence 8 per SIMD
+  size_t alu_accupancy = device->info().simdPerCU_ *
+    std::min(MaxWavesPerSimd, (wrkGrpInfo->availableVGPRs_ / amd::alignUp(wrkGrpInfo->usedVGPRs_, 4)));
+
+  alu_accupancy *= wrkGrpInfo->wavefrontSize_;
+  // Calculate blocks occupancy per CU
+  *numBlocks = alu_accupancy / amd::alignUp(blockSize, wrkGrpInfo->wavefrontSize_);
+
+  size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize;
+  if (total_used_lds != 0) {
+    // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
+    int lds_occupancy = static_cast<int>(device->info().localMemSize_ / total_used_lds);
+    *numBlocks = std::min(*numBlocks, lds_occupancy);
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+}
+
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+                                                        const void* f,
+                                                        int  blockSize,
+                                                        size_t dynamicSMemSize)
+{
+  HIP_RETURN(hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize));
+}
+
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
+                                                                 const void* f,
+                                                                 int  blockSize,
+                                                                 size_t dynamicSMemSize,
+                                                                 unsigned int flags)
+{
+  HIP_RETURN(hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize));
+}
+
+
 #if defined(ATI_OS_LINUX)

 namespace hip_impl {
@@ -668,6 +731,20 @@ void hipLaunchKernelGGLImpl(
    sharedMemBytes, stream, nullptr, kernarg);
 }

+void hipLaunchCooperativeKernelGGLImpl(
+  uintptr_t function_address,
+  const dim3& numBlocks,
+  const dim3& dimBlocks,
+  uint32_t sharedMemBytes,
+  hipStream_t stream,
+  void** kernarg)
+{
+  HIP_INIT();
+
+  hipLaunchCooperativeKernel(reinterpret_cast<void*>(function_address),
+    numBlocks, dimBlocks, kernarg, sharedMemBytes, stream);
+}
+
 }

 // conversion routines between float and half precision