[hip] implement the hipExtLaunchMultiKernelMultiDevice API (#1165)

* [hip] implement the hipExtLaunchMultiKernelMultiDevice API

* add a guard to check the HCC version for acquire_locked_hsa_queue() API which was introdued in HCC for ROCm 2.5

* modified code based on the requested changes

* changes to lock all streams before launching kernels for each device and unlock them after the dispatches

* check each stream to be valid before starting to lock all the streams
This commit is contained in:
Aryan Salmanpour
2019-06-19 20:29:05 -04:00
committad av Maneesh Gupta
förälder d492f1fd6b
incheckning 96dc74897d
5 ändrade filer med 113 tillägg och 29 borttagningar
+5 -5
Visa fil
@@ -270,14 +270,14 @@ static inline __device__ void printf(const char* format, All... all) {}
#if defined __HCC_CPP__
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block,
grid_launch_parm* lp, const char* kernelNameStr);
grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block,
grid_launch_parm* lp, const char* kernelNameStr);
grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block,
grid_launch_parm* lp, const char* kernelNameStr);
grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block,
grid_launch_parm* lp, const char* kernelNameStr);
extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp);
grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed = 0);
#if GENERIC_GRID_LAUNCH == 0
//#warning "Original hipLaunchKernel defined"
+1
Visa fil
@@ -77,6 +77,7 @@ class kernargs_size_align {
public:
std::size_t size(std::size_t n) const;
std::size_t alignment(std::size_t n) const;
const void* getHandle() const {return handle;};
private:
const void* handle;
friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t);
+31 -18
Visa fil
@@ -404,7 +404,7 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() {
//---
// Must be called after kernel finishes, this releases the lock on the stream so other commands can
// submit.
void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av) {
void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockPostponed) {
bool blockThisKernel = false;
if (!g_hipLaunchBlockingKernels.empty()) {
@@ -426,7 +426,10 @@ void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accel
kernelName);
}
_criticalData.unlock(); // paired with lock from lockopen_preKernelCommand.
// if unlockPostponed is true then this stream will be unlocked later (e.g., see hipExtLaunchMultiKernelMultiDevice for a sample call)
if (!unlockPostponed) {
_criticalData.unlock(); // paired with lock from lockopen_preKernelCommand.
}
};
@@ -1493,7 +1496,7 @@ void ihipStreamCallbackHandler(ihipStreamCallback_t* cb) {
//
// If stream==NULL synchronize appropriately with other streams and return the default av for the
// device. If stream is valid, return the AV to use.
hipStream_t ihipSyncAndResolveStream(hipStream_t stream) {
hipStream_t ihipSyncAndResolveStream(hipStream_t stream, bool lockAcquired) {
if (stream == hipStreamNull) {
// Submitting to NULL stream, call locked_syncDefaultStream to wait for all other streams:
ihipCtx_t* ctx = ihipGetTlsDefaultCtx();
@@ -1535,9 +1538,14 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) {
if (needGatherMarker) {
// ensure any commands sent to this stream wait on the NULL stream before
// continuing
LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData());
// TODO - could be "noret" version of create_blocking_marker
thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope);
if (!lockAcquired) {
LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData());
// TODO - could be "noret" version of create_blocking_marker
thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope);
} else {
// this stream is already locked (e.g., call from hipExtLaunchMultiKernelMultiDevice)
stream->criticalData()._av.create_blocking_marker(dcf, hc::accelerator_scope);
}
tprintf(
DB_SYNC,
" %s adding marker to wait for freshly recorded default-stream marker \n",
@@ -1578,8 +1586,8 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
// Called just before a kernel is launched from hipLaunchKernel.
// Allows runtime to track some information about the stream.
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp,
const char* kernelNameStr) {
stream = ihipSyncAndResolveStream(stream);
const char* kernelNameStr, bool lockAcquired) {
stream = ihipSyncAndResolveStream(stream, lockAcquired);
lp->grid_dim.x = grid.x;
lp->grid_dim.y = grid.y;
lp->grid_dim.z = grid.z;
@@ -1589,8 +1597,13 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
lp->barrier_bit = barrier_bit_queue_default;
lp->launch_fence = -1;
auto crit = stream->lockopen_preKernelCommand();
lp->av = &(crit->_av);
if (!lockAcquired) {
auto crit = stream->lockopen_preKernelCommand();
lp->av = &(crit->_av);
} else {
// this stream is already locked (e.g., call from hipExtLaunchMultiKernelMultiDevice)
lp->av = &(stream->criticalData()._av);
}
lp->cf = nullptr;
ihipPrintKernelLaunch(kernelNameStr, lp, stream);
@@ -1599,30 +1612,30 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm* lp,
const char* kernelNameStr) {
return ihipPreLaunchKernel(stream, dim3(grid), block, lp, kernelNameStr);
const char* kernelNameStr, bool lockAcquired) {
return ihipPreLaunchKernel(stream, dim3(grid), block, lp, kernelNameStr, lockAcquired);
}
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm* lp,
const char* kernelNameStr) {
return ihipPreLaunchKernel(stream, grid, dim3(block), lp, kernelNameStr);
const char* kernelNameStr, bool lockAcquired) {
return ihipPreLaunchKernel(stream, grid, dim3(block), lp, kernelNameStr, lockAcquired);
}
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm* lp,
const char* kernelNameStr) {
return ihipPreLaunchKernel(stream, dim3(grid), dim3(block), lp, kernelNameStr);
const char* kernelNameStr, bool lockAcquired) {
return ihipPreLaunchKernel(stream, dim3(grid), dim3(block), lp, kernelNameStr, lockAcquired);
}
//---
// Called after kernel finishes execution.
// This releases the lock on the stream.
void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp) {
void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed) {
tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n");
stream->lockclose_postKernelCommand(kernelName, lp.av);
stream->lockclose_postKernelCommand(kernelName, lp.av, unlockPostponed);
if (HIP_PROFILE_API) {
MARKER_END();
}
+2 -2
Visa fil
@@ -550,7 +550,7 @@ class ihipStream_t {
// Member functions that begin with locked_ are thread-safe accessors - these acquire / release
// the critical mutex.
LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av);
void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockNotNeeded = 0);
void locked_wait();
@@ -952,7 +952,7 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
const char* name, hsa_agent_t *agent);
hipStream_t ihipSyncAndResolveStream(hipStream_t);
hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
hipError_t ihipStreamSynchronize(hipStream_t stream);
void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
+74 -4
Visa fil
@@ -150,7 +150,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
uint32_t localWorkSizeZ, size_t sharedMemBytes,
hipStream_t hStream, void** kernelParams, void** extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) {
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) {
using namespace hip_impl;
auto ctx = ihipGetTlsDefaultCtx();
@@ -206,8 +206,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
sharedMemBytes; // TODO - this should be part of preLaunchKernel.
hStream = ihipPreLaunchKernel(
hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ),
dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str());
dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked);
hsa_kernel_dispatch_packet_t aql;
@@ -272,7 +271,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
stopEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStopCommand);
}
ihipPostLaunchKernel(f->_name.c_str(), hStream, lp);
ihipPostLaunchKernel(f->_name.c_str(), hStream, lp, isStreamLocked);
}
return ret;
@@ -315,6 +316,75 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0));
}
hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags) {
hipError_t result;
if ((numDevices > g_deviceCnt) || (launchParamsList == nullptr)) {
return hipErrorInvalidValue;
}
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
if (kds == nullptr) {
return hipErrorNotInitialized;
}
// prepare all kernel descriptors for each device as all streams will be locked in the next loop
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
if (lp.stream == nullptr) {
free(kds);
return hipErrorNotInitialized;
}
kds[i] = hip_impl::get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
hip_impl::target_agent(lp.stream));
if (kds[i] == nullptr) {
free(kds);
return hipErrorInvalidValue;
}
hip_impl::kernargs_size_align kargs = hip_impl::get_program_state().get_kernargs_size_align(
reinterpret_cast<std::uintptr_t>(lp.func));
kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
kargs.getHandle());
}
// lock all streams before launching kernels to each device
for (int i = 0; i < numDevices; ++i) {
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
#if (__hcc_workweek__ >= 19213)
streamCrit->_av.acquire_locked_hsa_queue();
#endif
}
// launch kernels for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
result = ihipModuleLaunchKernel(kds[i],
lp.gridDim.x * lp.blockDim.x,
lp.gridDim.y * lp.blockDim.y,
lp.gridDim.z * lp.blockDim.z,
lp.blockDim.x, lp.blockDim.y,
lp.blockDim.z, lp.sharedMem,
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
true /* stream is already locked above and will be unlocked
in the below code after launching kernels on all devices*/);
}
// unlock all streams
for (int i = 0; i < numDevices; ++i) {
launchParamsList[i].stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue();
#endif
}
free(kds);
return result;
}
namespace hip_impl {
hsa_executable_t executable_for(hipModule_t hmod) {
return hmod->executable;