diff --git a/projects/hip/api/hip/hip_module.cpp b/projects/hip/api/hip/hip_module.cpp index ad637d3967..20a5fbdef1 100644 --- a/projects/hip/api/hip/hip_module.cpp +++ b/projects/hip/api/hip/hip_module.cpp @@ -256,27 +256,27 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0, - uint32_t params = 0) -{ + uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, + uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0) { HIP_INIT_API(NONE, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags, params); hip::Function* function = hip::Function::asFunction(f); amd::Kernel* kernel = function->function_; - amd::Device* device = hip::getCurrentContext()->devices()[0]; amd::ScopedLock lock(function->lock_); hip::Event* eStart = reinterpret_cast(startEvent); hip::Event* eStop = reinterpret_cast(stopEvent); amd::HostQueue* queue = hip::getQueue(hStream); + const amd::Device& device = queue->vdev()->device(); if ((params & amd::NDRangeKernelCommand::CooperativeGroups) && - !device->info().cooperativeGroups_) { + !device.info().cooperativeGroups_) { return hipErrorLaunchFailure; } if ((params & amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups) && - !device->info().cooperativeMultiDeviceGroups_) { + !device.info().cooperativeMultiDeviceGroups_) { return hipErrorLaunchFailure; } if (!queue) { @@ -323,7 +323,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, } amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand( - *queue, waitList, *kernel, ndrange, sharedMemBytes, params); + *queue, waitList, *kernel, ndrange, sharedMemBytes, + params, gridId, numGrids, prevGridSum, allGridSum, firstDevice); if (!command) { return hipErrorOutOfMemory; } @@ -436,33 +437,54 @@ hipError_t hipLaunchCooperativeKernel(const void* f, hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags, uint32_t extFlags) { - int currentDevice = ihipGetDevice(); int numActiveGPUs = 0; ihipDeviceGetCount(&numActiveGPUs); if ((numDevices > numActiveGPUs) || (launchParamsList == nullptr)) { return hipErrorInvalidValue; } - + hipError_t result = hipErrorUnknown; + uint64_t allGridSize = 0; + for (int i = 0; i < numDevices; ++i) { + const hipLaunchParams& launch = launchParamsList[i]; + allGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z; + } + uint64_t prevGridSize = 0; + uint32_t firstDevice = 0; for (int i = 0; i < numDevices; ++i) { - hipSetDevice(i); const hipLaunchParams& launch = launchParamsList[i]; amd::HostQueue* queue = reinterpret_cast(launch.stream)->asHostQueue(); - hipFunction_t func = PlatformState::instance().getFunc(launch.func, i); + hipFunction_t func = nullptr; + // The order of devices in the launch may not match the order in the global array + for (size_t dev = 0; dev < g_devices.size(); ++dev) { + // Find the matching device and request the kernel function + if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) { + func = PlatformState::instance().getFunc(launch.func, dev); + // Save VDI index of the first device in the launch + if (i == 0) { + firstDevice = queue->vdev()->device().index(); + } + break; + } + } if (func == nullptr) { HIP_RETURN(result); } + result = ihipModuleLaunchKernel(func, launch.gridDim.x * launch.blockDim.x, launch.gridDim.y * launch.blockDim.y, launch.gridDim.z * launch.blockDim.z, launch.blockDim.x, launch.blockDim.y, launch.blockDim.z, - launch.sharedMem, launch.stream, - launch.args, nullptr, nullptr, nullptr, flags, extFlags); + launch.sharedMem, launch.stream, launch.args, nullptr, nullptr, nullptr, + flags, extFlags, i, numDevices, prevGridSize, allGridSize, firstDevice); + if (result != hipSuccess) { + break; + } + prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z; } - hipSetDevice(currentDevice); return result; }