[hip] refactoring cooperative kernel launch APIs (#1737)
This PR is a follow-up on PR# #1698 and it makes two more APIs (hipLaunchCooperativeKernel/hipLaunchCooperativeKernelMultiDevice) inline so that they can work correctly with lazy binding.
Tento commit je obsažen v:
odevzdal
Maneesh Gupta
rodič
651c7a8e27
revize
857052be1e
@@ -37,6 +37,15 @@ THE SOFTWARE.
|
||||
hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
|
||||
unsigned int flags, hip_impl::program_state& ps);
|
||||
|
||||
hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams,
|
||||
unsigned int sharedMemBytes, hipStream_t stream, hip_impl::program_state& ps);
|
||||
|
||||
hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
|
||||
unsigned int flags, hip_impl::program_state& ps);
|
||||
|
||||
|
||||
|
||||
|
||||
#pragma GCC visibility push(hidden)
|
||||
|
||||
namespace hip_impl {
|
||||
@@ -190,4 +199,25 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDimX, void** kernelParams,
|
||||
unsigned int sharedMemBytes, hipStream_t stream) {
|
||||
|
||||
hip_impl::hip_init();
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return ihipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim, blockDimX, kernelParams, sharedMemBytes, stream, ps);
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
|
||||
unsigned int flags) {
|
||||
|
||||
hip_impl::hip_init();
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return ihipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
|
||||
}
|
||||
|
||||
#pragma GCC visibility pop
|
||||
|
||||
@@ -2889,6 +2889,7 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne
|
||||
void** kernelParams, void** extra);
|
||||
|
||||
|
||||
#if __HIP_VDI__ && !defined(__HCC__)
|
||||
/**
|
||||
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed
|
||||
* to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute
|
||||
@@ -2921,6 +2922,8 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags);
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief determine the grid and block sizes to achieves maximum occupancy for a kernel
|
||||
*
|
||||
@@ -3382,7 +3385,7 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
|
||||
#if __HIP_VDI__ && !defined(__HCC__)
|
||||
template <class T>
|
||||
inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
|
||||
void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
|
||||
@@ -3396,7 +3399,7 @@ inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchP
|
||||
return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
|
||||
}
|
||||
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
unsigned int numDevices, unsigned int flags = 0) {
|
||||
|
||||
@@ -382,46 +382,45 @@ __global__ void init_gws(uint nwm1) {
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
__attribute__((visibility("default")))
|
||||
hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes,
|
||||
hipStream_t stream) {
|
||||
hipStream_t stream, hip_impl::program_state& ps) {
|
||||
|
||||
HIP_INIT_API(hipLaunchCooperativeKernel, f, gridDim, blockDimX, kernelParams, sharedMemBytes, stream);
|
||||
hipError_t result;
|
||||
|
||||
|
||||
if ((f == nullptr) || (stream == nullptr) || (kernelParams == nullptr)) {
|
||||
return ihipLogStatus(hipErrorNotInitialized);
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
|
||||
if (!stream->getDevice()->_props.cooperativeLaunch) {
|
||||
return ihipLogStatus(hipErrorInvalidConfiguration);
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
// Prepare the kernel descriptor for initializing the GWS
|
||||
hipFunction_t gwsKD = hip_impl::get_program_state().kernel_descriptor(
|
||||
hipFunction_t gwsKD = ps.kernel_descriptor(
|
||||
reinterpret_cast<std::uintptr_t>(&init_gws),
|
||||
hip_impl::target_agent(stream));
|
||||
|
||||
if (gwsKD == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hip_impl::kernargs_size_align gwsKargs =
|
||||
hip_impl::get_program_state().get_kernargs_size_align(
|
||||
hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align(
|
||||
reinterpret_cast<std::uintptr_t>(&init_gws));
|
||||
|
||||
gwsKD->_kernarg_layout = *reinterpret_cast<const std::vector<
|
||||
std::pair<std::size_t, std::size_t>>*>(gwsKargs.getHandle());
|
||||
|
||||
// Prepare the kernel descriptor for the main kernel
|
||||
hipFunction_t kd = hip_impl::get_program_state().kernel_descriptor(
|
||||
hipFunction_t kd = ps.kernel_descriptor(
|
||||
reinterpret_cast<std::uintptr_t>(f),
|
||||
hip_impl::target_agent(stream));
|
||||
if (kd == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hip_impl::kernargs_size_align kargs =
|
||||
hip_impl::get_program_state().get_kernargs_size_align(
|
||||
ps.get_kernargs_size_align(
|
||||
reinterpret_cast<std::uintptr_t>(f));
|
||||
|
||||
kd->_kernarg_layout = *reinterpret_cast<const std::vector<
|
||||
@@ -438,6 +437,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
streamCrit->_av.acquire_locked_hsa_queue();
|
||||
#endif
|
||||
|
||||
GET_TLS();
|
||||
// launch the init_gws kernel to initialize the GWS
|
||||
result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1,
|
||||
0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
|
||||
@@ -448,7 +448,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
|
||||
return ihipLogStatus(hipErrorLaunchFailure);
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
|
||||
size_t impCoopArg = 1;
|
||||
@@ -469,29 +469,30 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
|
||||
return ihipLogStatus(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags) {
|
||||
__attribute__((visibility("default")))
|
||||
hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags, hip_impl::program_state& ps) {
|
||||
|
||||
HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags);
|
||||
hipError_t result;
|
||||
|
||||
if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
if (!launchParamsList[i].stream->getDevice()->_props.cooperativeMultiDeviceLaunch) {
|
||||
return ihipLogStatus(hipErrorInvalidConfiguration);
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
}
|
||||
|
||||
hipFunction_t* gwsKds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
|
||||
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
|
||||
|
||||
if (kds == nullptr || gwsKds == nullptr) {
|
||||
return ihipLogStatus(hipErrorNotInitialized);
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
|
||||
// prepare all kernel descriptors for initializing the GWS and the main kernels per device
|
||||
@@ -500,30 +501,30 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
if (lp.stream == nullptr) {
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
return ihipLogStatus(hipErrorNotInitialized);
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
|
||||
gwsKds[i] = hip_impl::get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
|
||||
gwsKds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
|
||||
hip_impl::target_agent(lp.stream));
|
||||
if (gwsKds[i] == nullptr) {
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hip_impl::kernargs_size_align gwsKargs = hip_impl::get_program_state().get_kernargs_size_align(
|
||||
hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align(
|
||||
reinterpret_cast<std::uintptr_t>(&init_gws));
|
||||
gwsKds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
|
||||
gwsKargs.getHandle());
|
||||
|
||||
|
||||
kds[i] = hip_impl::get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
|
||||
kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
|
||||
hip_impl::target_agent(lp.stream));
|
||||
if (kds[i] == nullptr) {
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hip_impl::kernargs_size_align kargs = hip_impl::get_program_state().get_kernargs_size_align(
|
||||
hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
|
||||
reinterpret_cast<std::uintptr_t>(lp.func));
|
||||
kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
|
||||
kargs.getHandle());
|
||||
@@ -532,9 +533,10 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
mg_sync *mg_sync_ptr = 0;
|
||||
mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
|
||||
|
||||
GET_TLS();
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
|
||||
if (result != hipSuccess) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
mg_sync_ptr->w0 = 0;
|
||||
mg_sync_ptr->w1 = 0;
|
||||
@@ -547,7 +549,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
for (int j = 0; j < i; ++j) {
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
// calculate the sum of sizes of all grids
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
@@ -586,7 +588,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
|
||||
return ihipLogStatus(hipErrorLaunchFailure);
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -629,7 +631,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
|
||||
return ihipLogStatus(hipErrorLaunchFailure);
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -650,7 +652,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
|
||||
return ihipLogStatus(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
namespace hip_impl {
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele