[hip] implement the hipExtLaunchMultiKernelMultiDevice API (#1165)
* [hip] implement the hipExtLaunchMultiKernelMultiDevice API * add a guard to check the HCC version for acquire_locked_hsa_queue() API which was introdued in HCC for ROCm 2.5 * modified code based on the requested changes * changes to lock all streams before launching kernels for each device and unlock them after the dispatches * check each stream to be valid before starting to lock all the streams
This commit is contained in:
committad av
Maneesh Gupta
förälder
d492f1fd6b
incheckning
96dc74897d
@@ -270,14 +270,14 @@ static inline __device__ void printf(const char* format, All... all) {}
|
||||
|
||||
#if defined __HCC_CPP__
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block,
|
||||
grid_launch_parm* lp, const char* kernelNameStr);
|
||||
grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block,
|
||||
grid_launch_parm* lp, const char* kernelNameStr);
|
||||
grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block,
|
||||
grid_launch_parm* lp, const char* kernelNameStr);
|
||||
grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block,
|
||||
grid_launch_parm* lp, const char* kernelNameStr);
|
||||
extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp);
|
||||
grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
|
||||
extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed = 0);
|
||||
|
||||
#if GENERIC_GRID_LAUNCH == 0
|
||||
//#warning "Original hipLaunchKernel defined"
|
||||
|
||||
@@ -77,6 +77,7 @@ class kernargs_size_align {
|
||||
public:
|
||||
std::size_t size(std::size_t n) const;
|
||||
std::size_t alignment(std::size_t n) const;
|
||||
const void* getHandle() const {return handle;};
|
||||
private:
|
||||
const void* handle;
|
||||
friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t);
|
||||
|
||||
+31
-18
@@ -404,7 +404,7 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() {
|
||||
//---
|
||||
// Must be called after kernel finishes, this releases the lock on the stream so other commands can
|
||||
// submit.
|
||||
void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av) {
|
||||
void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockPostponed) {
|
||||
bool blockThisKernel = false;
|
||||
|
||||
if (!g_hipLaunchBlockingKernels.empty()) {
|
||||
@@ -426,7 +426,10 @@ void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accel
|
||||
kernelName);
|
||||
}
|
||||
|
||||
_criticalData.unlock(); // paired with lock from lockopen_preKernelCommand.
|
||||
// if unlockPostponed is true then this stream will be unlocked later (e.g., see hipExtLaunchMultiKernelMultiDevice for a sample call)
|
||||
if (!unlockPostponed) {
|
||||
_criticalData.unlock(); // paired with lock from lockopen_preKernelCommand.
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1493,7 +1496,7 @@ void ihipStreamCallbackHandler(ihipStreamCallback_t* cb) {
|
||||
//
|
||||
// If stream==NULL synchronize appropriately with other streams and return the default av for the
|
||||
// device. If stream is valid, return the AV to use.
|
||||
hipStream_t ihipSyncAndResolveStream(hipStream_t stream) {
|
||||
hipStream_t ihipSyncAndResolveStream(hipStream_t stream, bool lockAcquired) {
|
||||
if (stream == hipStreamNull) {
|
||||
// Submitting to NULL stream, call locked_syncDefaultStream to wait for all other streams:
|
||||
ihipCtx_t* ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -1535,9 +1538,14 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) {
|
||||
if (needGatherMarker) {
|
||||
// ensure any commands sent to this stream wait on the NULL stream before
|
||||
// continuing
|
||||
LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData());
|
||||
// TODO - could be "noret" version of create_blocking_marker
|
||||
thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope);
|
||||
if (!lockAcquired) {
|
||||
LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData());
|
||||
// TODO - could be "noret" version of create_blocking_marker
|
||||
thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope);
|
||||
} else {
|
||||
// this stream is already locked (e.g., call from hipExtLaunchMultiKernelMultiDevice)
|
||||
stream->criticalData()._av.create_blocking_marker(dcf, hc::accelerator_scope);
|
||||
}
|
||||
tprintf(
|
||||
DB_SYNC,
|
||||
" %s adding marker to wait for freshly recorded default-stream marker \n",
|
||||
@@ -1578,8 +1586,8 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
|
||||
// Called just before a kernel is launched from hipLaunchKernel.
|
||||
// Allows runtime to track some information about the stream.
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp,
|
||||
const char* kernelNameStr) {
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
const char* kernelNameStr, bool lockAcquired) {
|
||||
stream = ihipSyncAndResolveStream(stream, lockAcquired);
|
||||
lp->grid_dim.x = grid.x;
|
||||
lp->grid_dim.y = grid.y;
|
||||
lp->grid_dim.z = grid.z;
|
||||
@@ -1589,8 +1597,13 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
|
||||
lp->barrier_bit = barrier_bit_queue_default;
|
||||
lp->launch_fence = -1;
|
||||
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
lp->av = &(crit->_av);
|
||||
if (!lockAcquired) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
lp->av = &(crit->_av);
|
||||
} else {
|
||||
// this stream is already locked (e.g., call from hipExtLaunchMultiKernelMultiDevice)
|
||||
lp->av = &(stream->criticalData()._av);
|
||||
}
|
||||
lp->cf = nullptr;
|
||||
ihipPrintKernelLaunch(kernelNameStr, lp, stream);
|
||||
|
||||
@@ -1599,30 +1612,30 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
|
||||
|
||||
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm* lp,
|
||||
const char* kernelNameStr) {
|
||||
return ihipPreLaunchKernel(stream, dim3(grid), block, lp, kernelNameStr);
|
||||
const char* kernelNameStr, bool lockAcquired) {
|
||||
return ihipPreLaunchKernel(stream, dim3(grid), block, lp, kernelNameStr, lockAcquired);
|
||||
}
|
||||
|
||||
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm* lp,
|
||||
const char* kernelNameStr) {
|
||||
return ihipPreLaunchKernel(stream, grid, dim3(block), lp, kernelNameStr);
|
||||
const char* kernelNameStr, bool lockAcquired) {
|
||||
return ihipPreLaunchKernel(stream, grid, dim3(block), lp, kernelNameStr, lockAcquired);
|
||||
}
|
||||
|
||||
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm* lp,
|
||||
const char* kernelNameStr) {
|
||||
return ihipPreLaunchKernel(stream, dim3(grid), dim3(block), lp, kernelNameStr);
|
||||
const char* kernelNameStr, bool lockAcquired) {
|
||||
return ihipPreLaunchKernel(stream, dim3(grid), dim3(block), lp, kernelNameStr, lockAcquired);
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
// Called after kernel finishes execution.
|
||||
// This releases the lock on the stream.
|
||||
void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp) {
|
||||
void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed) {
|
||||
tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n");
|
||||
|
||||
stream->lockclose_postKernelCommand(kernelName, lp.av);
|
||||
stream->lockclose_postKernelCommand(kernelName, lp.av, unlockPostponed);
|
||||
if (HIP_PROFILE_API) {
|
||||
MARKER_END();
|
||||
}
|
||||
|
||||
@@ -550,7 +550,7 @@ class ihipStream_t {
|
||||
// Member functions that begin with locked_ are thread-safe accessors - these acquire / release
|
||||
// the critical mutex.
|
||||
LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
|
||||
void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av);
|
||||
void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockNotNeeded = 0);
|
||||
|
||||
|
||||
void locked_wait();
|
||||
@@ -952,7 +952,7 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
|
||||
const char* name, hsa_agent_t *agent);
|
||||
|
||||
|
||||
hipStream_t ihipSyncAndResolveStream(hipStream_t);
|
||||
hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
|
||||
hipError_t ihipStreamSynchronize(hipStream_t stream);
|
||||
void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
|
||||
|
||||
|
||||
+74
-4
@@ -150,7 +150,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) {
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) {
|
||||
using namespace hip_impl;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -206,8 +206,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
sharedMemBytes; // TODO - this should be part of preLaunchKernel.
|
||||
hStream = ihipPreLaunchKernel(
|
||||
hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ),
|
||||
dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str());
|
||||
|
||||
dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked);
|
||||
|
||||
hsa_kernel_dispatch_packet_t aql;
|
||||
|
||||
@@ -272,7 +271,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
stopEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStopCommand);
|
||||
}
|
||||
|
||||
ihipPostLaunchKernel(f->_name.c_str(), hStream, lp);
|
||||
ihipPostLaunchKernel(f->_name.c_str(), hStream, lp, isStreamLocked);
|
||||
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
@@ -315,6 +316,75 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0));
|
||||
}
|
||||
|
||||
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags) {
|
||||
|
||||
hipError_t result;
|
||||
|
||||
if ((numDevices > g_deviceCnt) || (launchParamsList == nullptr)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
|
||||
if (kds == nullptr) {
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
|
||||
// prepare all kernel descriptors for each device as all streams will be locked in the next loop
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
if (lp.stream == nullptr) {
|
||||
free(kds);
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
kds[i] = hip_impl::get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
|
||||
hip_impl::target_agent(lp.stream));
|
||||
if (kds[i] == nullptr) {
|
||||
free(kds);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hip_impl::kernargs_size_align kargs = hip_impl::get_program_state().get_kernargs_size_align(
|
||||
reinterpret_cast<std::uintptr_t>(lp.func));
|
||||
kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
|
||||
kargs.getHandle());
|
||||
}
|
||||
|
||||
// lock all streams before launching kernels to each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
streamCrit->_av.acquire_locked_hsa_queue();
|
||||
#endif
|
||||
}
|
||||
|
||||
// launch kernels for each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
|
||||
result = ihipModuleLaunchKernel(kds[i],
|
||||
lp.gridDim.x * lp.blockDim.x,
|
||||
lp.gridDim.y * lp.blockDim.y,
|
||||
lp.gridDim.z * lp.blockDim.z,
|
||||
lp.blockDim.x, lp.blockDim.y,
|
||||
lp.blockDim.z, lp.sharedMem,
|
||||
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
|
||||
true /* stream is already locked above and will be unlocked
|
||||
in the below code after launching kernels on all devices*/);
|
||||
}
|
||||
|
||||
// unlock all streams
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
launchParamsList[i].stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
}
|
||||
|
||||
free(kds);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
namespace hip_impl {
|
||||
hsa_executable_t executable_for(hipModule_t hmod) {
|
||||
return hmod->executable;
|
||||
|
||||
Referens i nytt ärende
Block a user