From ac8109b1ac32eb96530fd3cf5ef24eb2f78271bb Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 16 Oct 2019 11:24:09 -0400
Subject: [PATCH] P4 to Git Change 2014404 by gandryey@gera-win10 on 2019/10/16
11:13:37
SWDEV-184710 - Support hipLaunchCooperativeKernelMultiDevice()
- Add support for multi grid launch in hip
- Detect the new hidden argument and pass the required information for the kernel launch
- Memory for synchronization is allocated as a single object and then the offset for each GPU is found
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#44 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#343 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#25 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#82 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#136 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#90 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#99 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#97 edit
[ROCm/hip commit: 70a52b9cd713c0bf26771d25c702b28f35b4ef9f]
---
projects/hip/api/hip/hip_module.cpp | 48 +++++++++++++++++++++--------
1 file changed, 35 insertions(+), 13 deletions(-)
diff --git a/projects/hip/api/hip/hip_module.cpp b/projects/hip/api/hip/hip_module.cpp
index ad637d3967..20a5fbdef1 100644
--- a/projects/hip/api/hip/hip_module.cpp
+++ b/projects/hip/api/hip/hip_module.cpp
@@ -256,27 +256,27 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
uint32_t sharedMemBytes, hipStream_t hStream,
void **kernelParams, void **extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
- uint32_t params = 0)
-{
+ uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
+ uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0) {
HIP_INIT_API(NONE, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags, params);
hip::Function* function = hip::Function::asFunction(f);
amd::Kernel* kernel = function->function_;
- amd::Device* device = hip::getCurrentContext()->devices()[0];
amd::ScopedLock lock(function->lock_);
hip::Event* eStart = reinterpret_cast(startEvent);
hip::Event* eStop = reinterpret_cast(stopEvent);
amd::HostQueue* queue = hip::getQueue(hStream);
+ const amd::Device& device = queue->vdev()->device();
if ((params & amd::NDRangeKernelCommand::CooperativeGroups) &&
- !device->info().cooperativeGroups_) {
+ !device.info().cooperativeGroups_) {
return hipErrorLaunchFailure;
}
if ((params & amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups) &&
- !device->info().cooperativeMultiDeviceGroups_) {
+ !device.info().cooperativeMultiDeviceGroups_) {
return hipErrorLaunchFailure;
}
if (!queue) {
@@ -323,7 +323,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
}
amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(
- *queue, waitList, *kernel, ndrange, sharedMemBytes, params);
+ *queue, waitList, *kernel, ndrange, sharedMemBytes,
+ params, gridId, numGrids, prevGridSum, allGridSum, firstDevice);
if (!command) {
return hipErrorOutOfMemory;
}
@@ -436,33 +437,54 @@ hipError_t hipLaunchCooperativeKernel(const void* f,
hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags, uint32_t extFlags)
{
- int currentDevice = ihipGetDevice();
int numActiveGPUs = 0;
ihipDeviceGetCount(&numActiveGPUs);
if ((numDevices > numActiveGPUs) || (launchParamsList == nullptr)) {
return hipErrorInvalidValue;
}
-
+
hipError_t result = hipErrorUnknown;
+ uint64_t allGridSize = 0;
+ for (int i = 0; i < numDevices; ++i) {
+ const hipLaunchParams& launch = launchParamsList[i];
+ allGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z;
+ }
+ uint64_t prevGridSize = 0;
+ uint32_t firstDevice = 0;
for (int i = 0; i < numDevices; ++i) {
- hipSetDevice(i);
const hipLaunchParams& launch = launchParamsList[i];
amd::HostQueue* queue = reinterpret_cast(launch.stream)->asHostQueue();
- hipFunction_t func = PlatformState::instance().getFunc(launch.func, i);
+ hipFunction_t func = nullptr;
+ // The order of devices in the launch may not match the order in the global array
+ for (size_t dev = 0; dev < g_devices.size(); ++dev) {
+ // Find the matching device and request the kernel function
+ if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) {
+ func = PlatformState::instance().getFunc(launch.func, dev);
+ // Save VDI index of the first device in the launch
+ if (i == 0) {
+ firstDevice = queue->vdev()->device().index();
+ }
+ break;
+ }
+ }
if (func == nullptr) {
HIP_RETURN(result);
}
+
result = ihipModuleLaunchKernel(func,
launch.gridDim.x * launch.blockDim.x,
launch.gridDim.y * launch.blockDim.y,
launch.gridDim.z * launch.blockDim.z,
launch.blockDim.x, launch.blockDim.y, launch.blockDim.z,
- launch.sharedMem, launch.stream,
- launch.args, nullptr, nullptr, nullptr, flags, extFlags);
+ launch.sharedMem, launch.stream, launch.args, nullptr, nullptr, nullptr,
+ flags, extFlags, i, numDevices, prevGridSize, allGridSize, firstDevice);
+ if (result != hipSuccess) {
+ break;
+ }
+ prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z;
}
- hipSetDevice(currentDevice);
return result;
}