diff --git a/include/hip/hcc_detail/functional_grid_launch.hpp b/include/hip/hcc_detail/functional_grid_launch.hpp index e7d8cc623f..76a04fa355 100644 --- a/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/include/hip/hcc_detail/functional_grid_launch.hpp @@ -37,6 +37,15 @@ THE SOFTWARE. hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags, hip_impl::program_state& ps); +hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams, + unsigned int sharedMemBytes, hipStream_t stream, hip_impl::program_state& ps); + +hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, + unsigned int flags, hip_impl::program_state& ps); + + + + #pragma GCC visibility push(hidden) namespace hip_impl { @@ -190,4 +199,25 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, } +template +inline +__attribute__((visibility("hidden"))) +hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDimX, void** kernelParams, + unsigned int sharedMemBytes, hipStream_t stream) { + + hip_impl::hip_init(); + auto& ps = hip_impl::get_program_state(); + return ihipLaunchCooperativeKernel(reinterpret_cast(f), gridDim, blockDimX, kernelParams, sharedMemBytes, stream, ps); +} + +inline +__attribute__((visibility("hidden"))) +hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, + unsigned int flags) { + + hip_impl::hip_init(); + auto& ps = hip_impl::get_program_state(); + return ihipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps); +} + #pragma GCC visibility pop diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index c619cfcc44..a23c1c9bd7 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -2889,6 +2889,7 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne void** kernelParams, void** extra); +#if __HIP_VDI__ && !defined(__HCC__) /** * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed * to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute @@ -2921,6 +2922,8 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags); +#endif + /** * @brief determine the grid and block sizes to achieves maximum occupancy for a kernel * @@ -3382,7 +3385,7 @@ hipError_t hipBindTextureToMipmappedArray(const texture& tex, return hipSuccess; } - +#if __HIP_VDI__ && !defined(__HCC__) template inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) { @@ -3396,7 +3399,7 @@ inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchP return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags); } -#if defined(__clang__) && defined(__HIP__) + template inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, unsigned int numDevices, unsigned int flags = 0) { diff --git a/src/hip_module.cpp b/src/hip_module.cpp index d9fc79ce63..02bc2df349 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -382,46 +382,45 @@ __global__ void init_gws(uint nwm1) { } } -hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, +__attribute__((visibility("default"))) +hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes, - hipStream_t stream) { + hipStream_t stream, hip_impl::program_state& ps) { - HIP_INIT_API(hipLaunchCooperativeKernel, f, gridDim, blockDimX, kernelParams, sharedMemBytes, stream); hipError_t result; if ((f == nullptr) || (stream == nullptr) || (kernelParams == nullptr)) { - return ihipLogStatus(hipErrorNotInitialized); + return hipErrorNotInitialized; } if (!stream->getDevice()->_props.cooperativeLaunch) { - return ihipLogStatus(hipErrorInvalidConfiguration); + return hipErrorInvalidConfiguration; } // Prepare the kernel descriptor for initializing the GWS - hipFunction_t gwsKD = hip_impl::get_program_state().kernel_descriptor( + hipFunction_t gwsKD = ps.kernel_descriptor( reinterpret_cast(&init_gws), hip_impl::target_agent(stream)); if (gwsKD == nullptr) { - return ihipLogStatus(hipErrorInvalidValue); + return hipErrorInvalidValue; } - hip_impl::kernargs_size_align gwsKargs = - hip_impl::get_program_state().get_kernargs_size_align( + hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align( reinterpret_cast(&init_gws)); gwsKD->_kernarg_layout = *reinterpret_cast>*>(gwsKargs.getHandle()); // Prepare the kernel descriptor for the main kernel - hipFunction_t kd = hip_impl::get_program_state().kernel_descriptor( + hipFunction_t kd = ps.kernel_descriptor( reinterpret_cast(f), hip_impl::target_agent(stream)); if (kd == nullptr) { - return ihipLogStatus(hipErrorInvalidValue); + return hipErrorInvalidValue; } hip_impl::kernargs_size_align kargs = - hip_impl::get_program_state().get_kernargs_size_align( + ps.get_kernargs_size_align( reinterpret_cast(f)); kd->_kernarg_layout = *reinterpret_cast_av.acquire_locked_hsa_queue(); #endif + GET_TLS(); // launch the init_gws kernel to initialize the GWS result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1, 0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true); @@ -448,7 +448,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, stream->criticalData()._av.release_locked_hsa_queue(); #endif - return ihipLogStatus(hipErrorLaunchFailure); + return hipErrorLaunchFailure; } size_t impCoopArg = 1; @@ -469,29 +469,30 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, stream->criticalData()._av.release_locked_hsa_queue(); #endif - return ihipLogStatus(result); + return result; } -hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, - int numDevices, unsigned int flags) { +__attribute__((visibility("default"))) +hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, + int numDevices, unsigned int flags, hip_impl::program_state& ps) { - HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags); hipError_t result; if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) { - return ihipLogStatus(hipErrorInvalidValue); + return hipErrorInvalidValue; } for (int i = 0; i < numDevices; ++i) { if (!launchParamsList[i].stream->getDevice()->_props.cooperativeMultiDeviceLaunch) { - return ihipLogStatus(hipErrorInvalidConfiguration); + return hipErrorInvalidConfiguration; } } hipFunction_t* gwsKds = reinterpret_cast(malloc(sizeof(hipFunction_t) * numDevices)); hipFunction_t* kds = reinterpret_cast(malloc(sizeof(hipFunction_t) * numDevices)); + if (kds == nullptr || gwsKds == nullptr) { - return ihipLogStatus(hipErrorNotInitialized); + return hipErrorNotInitialized; } // prepare all kernel descriptors for initializing the GWS and the main kernels per device @@ -500,30 +501,30 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi if (lp.stream == nullptr) { free(gwsKds); free(kds); - return ihipLogStatus(hipErrorNotInitialized); + return hipErrorNotInitialized; } - gwsKds[i] = hip_impl::get_program_state().kernel_descriptor(reinterpret_cast(&init_gws), + gwsKds[i] = ps.kernel_descriptor(reinterpret_cast(&init_gws), hip_impl::target_agent(lp.stream)); if (gwsKds[i] == nullptr) { free(gwsKds); free(kds); - return ihipLogStatus(hipErrorInvalidValue); + return hipErrorInvalidValue; } - hip_impl::kernargs_size_align gwsKargs = hip_impl::get_program_state().get_kernargs_size_align( + hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align( reinterpret_cast(&init_gws)); gwsKds[i]->_kernarg_layout = *reinterpret_cast>*>( gwsKargs.getHandle()); - kds[i] = hip_impl::get_program_state().kernel_descriptor(reinterpret_cast(lp.func), + kds[i] = ps.kernel_descriptor(reinterpret_cast(lp.func), hip_impl::target_agent(lp.stream)); if (kds[i] == nullptr) { free(gwsKds); free(kds); - return ihipLogStatus(hipErrorInvalidValue); + return hipErrorInvalidValue; } - hip_impl::kernargs_size_align kargs = hip_impl::get_program_state().get_kernargs_size_align( + hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align( reinterpret_cast(lp.func)); kds[i]->_kernarg_layout = *reinterpret_cast>*>( kargs.getHandle()); @@ -532,9 +533,10 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi mg_sync *mg_sync_ptr = 0; mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0}; + GET_TLS(); result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault); if (result != hipSuccess) { - return ihipLogStatus(hipErrorInvalidValue); + return hipErrorInvalidValue; } mg_sync_ptr->w0 = 0; mg_sync_ptr->w1 = 0; @@ -547,7 +549,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi for (int j = 0; j < i; ++j) { hip_internal::ihipHostFree(tls, mg_info_ptr[j]); } - return ihipLogStatus(hipErrorInvalidValue); + return hipErrorInvalidValue; } // calculate the sum of sizes of all grids const hipLaunchParams& lp = launchParamsList[i]; @@ -586,7 +588,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi hip_internal::ihipHostFree(tls, mg_info_ptr[j]); } - return ihipLogStatus(hipErrorLaunchFailure); + return hipErrorLaunchFailure; } } @@ -629,7 +631,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi hip_internal::ihipHostFree(tls, mg_info_ptr[j]); } - return ihipLogStatus(hipErrorLaunchFailure); + return hipErrorLaunchFailure; } } @@ -650,7 +652,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi hip_internal::ihipHostFree(tls, mg_info_ptr[j]); } - return ihipLogStatus(result); + return result; } namespace hip_impl {