[hip] refactoring hipExtLaunchMultiKernelMultiDevice API (#1698)

[Background] it was found that if lazy linking used for a library that calls hipExtLaunchMultiKernelMultiDevice API then this API can get the wrong program_state object for looking up device kernels leading to a "No device code available" error in this API.

To fix this issue, the API was refactored to be inline and get and pass the correct program_state to an internal hip API to request a multi-device kernel launch.
This commit is contained in:
Aryan Salmanpour
2019-12-04 01:20:51 -05:00
gecommit door Maneesh Gupta
bovenliggende 38e971b645
commit 8eaea4d114
3 gewijzigde bestanden met toevoegingen van 29 en 10 verwijderingen
@@ -34,6 +34,9 @@ THE SOFTWARE.
#include <type_traits>
#include <utility>
hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
unsigned int flags, hip_impl::program_state& ps);
#pragma GCC visibility push(hidden)
namespace hip_impl {
@@ -176,4 +179,15 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
numBlocks, dimBlocks, sharedMemBytes,
stream, &config[0]);
}
inline
__attribute__((visibility("hidden")))
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags) {
hip_impl::hip_init();
auto& ps = hip_impl::get_program_state();
return ihipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags, ps);
}
#pragma GCC visibility pop
@@ -2960,6 +2960,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
#if defined(__clang__) && defined(__HIP__)
/**
* @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
* on respective streams before enqueuing any other work on the specified streams from any other threads
@@ -2974,6 +2975,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags);
#endif
// doxygen end Version Management
/**
@@ -3395,12 +3397,14 @@ inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchP
return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
}
#if defined(__clang__) && defined(__HIP__)
template <class T>
inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
unsigned int numDevices, unsigned int flags = 0) {
return hipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags);
}
#endif
/*
* @brief Unbinds the textuer bound to @p tex
+11 -10
Bestand weergeven
@@ -311,18 +311,18 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0));
}
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags) {
HIP_INIT_API(hipExtLaunchMultiKernelMultiDevice, launchParamsList, numDevices, flags);
__attribute__((visibility("default")))
hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags, hip_impl::program_state& ps) {
hipError_t result;
if ((numDevices > g_deviceCnt) || (launchParamsList == nullptr)) {
return ihipLogStatus(hipErrorInvalidValue);
return hipErrorInvalidValue;
}
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
if (kds == nullptr) {
return ihipLogStatus(hipErrorNotInitialized);
return hipErrorNotInitialized;
}
// prepare all kernel descriptors for each device as all streams will be locked in the next loop
@@ -330,15 +330,15 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
const hipLaunchParams& lp = launchParamsList[i];
if (lp.stream == nullptr) {
free(kds);
return ihipLogStatus(hipErrorNotInitialized);
return hipErrorNotInitialized;
}
kds[i] = hip_impl::get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
hip_impl::target_agent(lp.stream));
if (kds[i] == nullptr) {
free(kds);
return ihipLogStatus(hipErrorInvalidValue);
return hipErrorInvalidValue;
}
hip_impl::kernargs_size_align kargs = hip_impl::get_program_state().get_kernargs_size_align(
hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
reinterpret_cast<std::uintptr_t>(lp.func));
kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
kargs.getHandle());
@@ -352,6 +352,7 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
#endif
}
GET_TLS();
// launch kernels for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
@@ -377,7 +378,7 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
free(kds);
return ihipLogStatus(result);
return result;
}
namespace {