[hip] refactoring hipExtLaunchMultiKernelMultiDevice API (#1698)
[Background] it was found that if lazy linking used for a library that calls hipExtLaunchMultiKernelMultiDevice API then this API can get the wrong program_state object for looking up device kernels leading to a "No device code available" error in this API. To fix this issue, the API was refactored to be inline and get and pass the correct program_state to an internal hip API to request a multi-device kernel launch.
This commit is contained in:
gecommit door
Maneesh Gupta
bovenliggende
38e971b645
commit
8eaea4d114
@@ -34,6 +34,9 @@ THE SOFTWARE.
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
|
||||
unsigned int flags, hip_impl::program_state& ps);
|
||||
|
||||
#pragma GCC visibility push(hidden)
|
||||
|
||||
namespace hip_impl {
|
||||
@@ -176,4 +179,15 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
numBlocks, dimBlocks, sharedMemBytes,
|
||||
stream, &config[0]);
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags) {
|
||||
hip_impl::hip_init();
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return ihipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags, ps);
|
||||
|
||||
}
|
||||
|
||||
#pragma GCC visibility pop
|
||||
|
||||
@@ -2960,6 +2960,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
|
||||
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
/**
|
||||
* @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
|
||||
* on respective streams before enqueuing any other work on the specified streams from any other threads
|
||||
@@ -2974,6 +2975,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags);
|
||||
|
||||
#endif
|
||||
|
||||
// doxygen end Version Management
|
||||
/**
|
||||
@@ -3395,12 +3397,14 @@ inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchP
|
||||
return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
|
||||
}
|
||||
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
template <class T>
|
||||
inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
unsigned int numDevices, unsigned int flags = 0) {
|
||||
return hipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* @brief Unbinds the textuer bound to @p tex
|
||||
|
||||
@@ -311,18 +311,18 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0));
|
||||
}
|
||||
|
||||
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags) {
|
||||
HIP_INIT_API(hipExtLaunchMultiKernelMultiDevice, launchParamsList, numDevices, flags);
|
||||
__attribute__((visibility("default")))
|
||||
hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags, hip_impl::program_state& ps) {
|
||||
hipError_t result;
|
||||
|
||||
if ((numDevices > g_deviceCnt) || (launchParamsList == nullptr)) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
|
||||
if (kds == nullptr) {
|
||||
return ihipLogStatus(hipErrorNotInitialized);
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
|
||||
// prepare all kernel descriptors for each device as all streams will be locked in the next loop
|
||||
@@ -330,15 +330,15 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
if (lp.stream == nullptr) {
|
||||
free(kds);
|
||||
return ihipLogStatus(hipErrorNotInitialized);
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
kds[i] = hip_impl::get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
|
||||
kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
|
||||
hip_impl::target_agent(lp.stream));
|
||||
if (kds[i] == nullptr) {
|
||||
free(kds);
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hip_impl::kernargs_size_align kargs = hip_impl::get_program_state().get_kernargs_size_align(
|
||||
hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
|
||||
reinterpret_cast<std::uintptr_t>(lp.func));
|
||||
kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
|
||||
kargs.getHandle());
|
||||
@@ -352,6 +352,7 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
#endif
|
||||
}
|
||||
|
||||
GET_TLS();
|
||||
// launch kernels for each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
@@ -377,7 +378,7 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
|
||||
free(kds);
|
||||
|
||||
return ihipLogStatus(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
Verwijs in nieuw issue
Block a user