P4 to Git Change 2014404 by gandryey@gera-win10 on 2019/10/16 11:13:37

SWDEV-184710 - Support hipLaunchCooperativeKernelMultiDevice() - Add support for multi grid launch in hip - Detect the new hidden argument and pass the required information for the kernel launch - Memory for synchronization is allocated as a single object and then the offset for each GPU is found Affected files ... ... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#44 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#343 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#25 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#17 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#82 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#136 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#42 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#90 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#30 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#99 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#97 edit [ROCm/hip commit: 70a52b9cd7]
2019-10-16 11:24:09 -04:00
@@ -256,27 +256,27 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
                                 uint32_t sharedMemBytes, hipStream_t hStream,
                                 void **kernelParams, void **extra,
                                 hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
-                                 uint32_t params = 0)
-{
+                                 uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
+                                 uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0) {
  HIP_INIT_API(NONE, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
    sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags, params);

  hip::Function* function = hip::Function::asFunction(f);
  amd::Kernel* kernel = function->function_;
-  amd::Device* device = hip::getCurrentContext()->devices()[0];

  amd::ScopedLock lock(function->lock_);

  hip::Event* eStart = reinterpret_cast<hip::Event*>(startEvent);
  hip::Event* eStop = reinterpret_cast<hip::Event*>(stopEvent);
  amd::HostQueue* queue = hip::getQueue(hStream);
+  const amd::Device& device = queue->vdev()->device();

  if ((params & amd::NDRangeKernelCommand::CooperativeGroups) &&
-      !device->info().cooperativeGroups_) {
+      !device.info().cooperativeGroups_) {
    return hipErrorLaunchFailure;
  }
  if ((params & amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups) &&
-      !device->info().cooperativeMultiDeviceGroups_) {
+      !device.info().cooperativeMultiDeviceGroups_) {
    return hipErrorLaunchFailure;
  }
  if (!queue) {
@@ -323,7 +323,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
  }

  amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(
-    *queue, waitList, *kernel, ndrange, sharedMemBytes, params);
+    *queue, waitList, *kernel, ndrange, sharedMemBytes,
+    params, gridId, numGrids, prevGridSum, allGridSum, firstDevice);
  if (!command) {
    return hipErrorOutOfMemory;
  }
@@ -436,33 +437,54 @@ hipError_t hipLaunchCooperativeKernel(const void* f,
 hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
                                                  int numDevices, unsigned int flags, uint32_t extFlags)
 {
-  int currentDevice = ihipGetDevice();
  int numActiveGPUs = 0;
  ihipDeviceGetCount(&numActiveGPUs);

  if ((numDevices > numActiveGPUs) || (launchParamsList == nullptr)) {
    return hipErrorInvalidValue;
  }
- 
+
  hipError_t result = hipErrorUnknown;
+  uint64_t allGridSize = 0;
+  for (int i = 0; i < numDevices; ++i) {
+    const hipLaunchParams& launch = launchParamsList[i];
+    allGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z;
+  }
+  uint64_t prevGridSize = 0;
+  uint32_t firstDevice = 0;
  for (int i = 0; i < numDevices; ++i) {
-    hipSetDevice(i);
    const hipLaunchParams& launch = launchParamsList[i];
    amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(launch.stream)->asHostQueue();
-    hipFunction_t func = PlatformState::instance().getFunc(launch.func, i);
+    hipFunction_t func = nullptr;
+    // The order of devices in the launch may not match the order in the global array
+    for (size_t dev = 0; dev < g_devices.size(); ++dev) {
+      // Find the matching device and request the kernel function
+      if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) {
+        func = PlatformState::instance().getFunc(launch.func, dev);
+        // Save VDI index of the first device in the launch
+        if (i == 0) {
+          firstDevice = queue->vdev()->device().index();
+        }
+        break;
+      }
+    }
    if (func == nullptr) {
      HIP_RETURN(result);
    }
+
    result = ihipModuleLaunchKernel(func,
      launch.gridDim.x * launch.blockDim.x,
      launch.gridDim.y * launch.blockDim.y,
      launch.gridDim.z * launch.blockDim.z,
      launch.blockDim.x, launch.blockDim.y, launch.blockDim.z,
-      launch.sharedMem, launch.stream,
-      launch.args, nullptr, nullptr, nullptr, flags, extFlags);
+      launch.sharedMem, launch.stream, launch.args, nullptr, nullptr, nullptr,
+      flags, extFlags, i, numDevices, prevGridSize, allGridSize, firstDevice);
+    if (result != hipSuccess) {
+      break;
+    }
+    prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z;
  }

-  hipSetDevice(currentDevice);
  return result;
 }