From ac8109b1ac32eb96530fd3cf5ef24eb2f78271bb Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 16 Oct 2019 11:24:09 -0400 Subject: [PATCH] P4 to Git Change 2014404 by gandryey@gera-win10 on 2019/10/16 11:13:37 SWDEV-184710 - Support hipLaunchCooperativeKernelMultiDevice() - Add support for multi grid launch in hip - Detect the new hidden argument and pass the required information for the kernel launch - Memory for synchronization is allocated as a single object and then the offset for each GPU is found Affected files ... ... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#44 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#343 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#25 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#17 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#82 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#136 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#42 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#90 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#30 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#99 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#97 edit [ROCm/hip commit: 70a52b9cd713c0bf26771d25c702b28f35b4ef9f] --- projects/hip/api/hip/hip_module.cpp | 48 +++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/projects/hip/api/hip/hip_module.cpp b/projects/hip/api/hip/hip_module.cpp index ad637d3967..20a5fbdef1 100644 --- a/projects/hip/api/hip/hip_module.cpp +++ b/projects/hip/api/hip/hip_module.cpp @@ -256,27 +256,27 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0, - uint32_t params = 0) -{ + uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, + uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0) { HIP_INIT_API(NONE, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags, params); hip::Function* function = hip::Function::asFunction(f); amd::Kernel* kernel = function->function_; - amd::Device* device = hip::getCurrentContext()->devices()[0]; amd::ScopedLock lock(function->lock_); hip::Event* eStart = reinterpret_cast(startEvent); hip::Event* eStop = reinterpret_cast(stopEvent); amd::HostQueue* queue = hip::getQueue(hStream); + const amd::Device& device = queue->vdev()->device(); if ((params & amd::NDRangeKernelCommand::CooperativeGroups) && - !device->info().cooperativeGroups_) { + !device.info().cooperativeGroups_) { return hipErrorLaunchFailure; } if ((params & amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups) && - !device->info().cooperativeMultiDeviceGroups_) { + !device.info().cooperativeMultiDeviceGroups_) { return hipErrorLaunchFailure; } if (!queue) { @@ -323,7 +323,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, } amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand( - *queue, waitList, *kernel, ndrange, sharedMemBytes, params); + *queue, waitList, *kernel, ndrange, sharedMemBytes, + params, gridId, numGrids, prevGridSum, allGridSum, firstDevice); if (!command) { return hipErrorOutOfMemory; } @@ -436,33 +437,54 @@ hipError_t hipLaunchCooperativeKernel(const void* f, hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags, uint32_t extFlags) { - int currentDevice = ihipGetDevice(); int numActiveGPUs = 0; ihipDeviceGetCount(&numActiveGPUs); if ((numDevices > numActiveGPUs) || (launchParamsList == nullptr)) { return hipErrorInvalidValue; } - + hipError_t result = hipErrorUnknown; + uint64_t allGridSize = 0; + for (int i = 0; i < numDevices; ++i) { + const hipLaunchParams& launch = launchParamsList[i]; + allGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z; + } + uint64_t prevGridSize = 0; + uint32_t firstDevice = 0; for (int i = 0; i < numDevices; ++i) { - hipSetDevice(i); const hipLaunchParams& launch = launchParamsList[i]; amd::HostQueue* queue = reinterpret_cast(launch.stream)->asHostQueue(); - hipFunction_t func = PlatformState::instance().getFunc(launch.func, i); + hipFunction_t func = nullptr; + // The order of devices in the launch may not match the order in the global array + for (size_t dev = 0; dev < g_devices.size(); ++dev) { + // Find the matching device and request the kernel function + if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) { + func = PlatformState::instance().getFunc(launch.func, dev); + // Save VDI index of the first device in the launch + if (i == 0) { + firstDevice = queue->vdev()->device().index(); + } + break; + } + } if (func == nullptr) { HIP_RETURN(result); } + result = ihipModuleLaunchKernel(func, launch.gridDim.x * launch.blockDim.x, launch.gridDim.y * launch.blockDim.y, launch.gridDim.z * launch.blockDim.z, launch.blockDim.x, launch.blockDim.y, launch.blockDim.z, - launch.sharedMem, launch.stream, - launch.args, nullptr, nullptr, nullptr, flags, extFlags); + launch.sharedMem, launch.stream, launch.args, nullptr, nullptr, nullptr, + flags, extFlags, i, numDevices, prevGridSize, allGridSize, firstDevice); + if (result != hipSuccess) { + break; + } + prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z; } - hipSetDevice(currentDevice); return result; }