[hip] add support for implicit kernel argument for multi-grid sync (#1456)

* [hip] add support for implicit kernel argument for multi-grid sync

* modified code for calculating the prev_sum

* change the impCoopArg type to size_t

* add memory clean up

* launch init_gws and main kernels into two separate loops
Этот коммит содержится в:
Aryan Salmanpour
2019-10-24 08:13:30 -04:00
коммит произвёл Maneesh Gupta
родитель fe5f7d4245
Коммит 359dc79101
3 изменённых файлов: 211 добавлений и 99 удалений
+89 -6
Просмотреть файл
@@ -137,7 +137,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
uint32_t localWorkSizeZ, size_t sharedMemBytes,
hipStream_t hStream, void** kernelParams, void** extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) {
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
void** impCoopParams = 0) {
using namespace hip_impl;
auto ctx = ihipGetTlsDefaultCtx();
@@ -181,10 +182,17 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
return hipErrorInvalidValue;
}
// Insert 48-bytes at the end for implicit kernel arguments and fill with value zero.
// Insert 56-bytes at the end for implicit kernel arguments and fill with value zero.
size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1);
kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0);
if (impCoopParams) {
const auto p{static_cast<const char*>(*impCoopParams)};
// The sixth index is for multi-grid synchronization
kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
}
/*
Kernel argument preparation.
*/
@@ -449,6 +457,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
return ihipLogStatus(hipErrorLaunchFailure);
}
size_t impCoopArg = 1;
void* impCoopParams[1];
impCoopParams[0] = &impCoopArg;
// launch the main kernel
result = ihipModuleLaunchKernel(tls, kd,
gridDim.x * blockDimX.x,
@@ -456,7 +468,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
gridDim.z * blockDimX.z,
blockDimX.x, blockDimX.y, blockDimX.z,
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
nullptr, 0, true);
nullptr, 0, true, impCoopParams);
stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
@@ -472,7 +484,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags);
hipError_t result;
if (numDevices > g_deviceCnt || launchParamsList == nullptr) {
if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
return ihipLogStatus(hipErrorInvalidValue);
}
@@ -523,6 +535,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
kargs.getHandle());
}
mg_sync *mg_sync_ptr = 0;
mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
if (result != hipSuccess) {
return ihipLogStatus(hipErrorInvalidValue);
}
mg_sync_ptr->w0 = 0;
mg_sync_ptr->w1 = 0;
uint all_sum = 0;
for (int i = 0; i < numDevices; ++i) {
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
if (result != hipSuccess) {
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < i; ++j) {
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
}
return ihipLogStatus(hipErrorInvalidValue);
}
// calculate the sum of sizes of all grids
const hipLaunchParams& lp = launchParamsList[i];
all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
lp.gridDim.x * lp.gridDim.y * lp.gridDim.z;
}
// lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
for (int i = 0; i < numDevices; ++i) {
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
@@ -531,7 +569,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
#endif
}
// launch the init_gws kernel to initialize the GWS followed by launching the main kernels for each device
// launch the init_gws kernel to initialize the GWS for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
@@ -549,8 +587,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
#endif
}
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
}
return ihipLogStatus(hipErrorLaunchFailure);
}
}
void* impCoopParams[1];
ulong prev_sum = 0;
// launch the main kernels for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
//initialize and setup the implicit kernel argument for multi-grid sync
mg_info_ptr[i]->mgs = mg_sync_ptr;
mg_info_ptr[i]->grid_id = i;
mg_info_ptr[i]->num_grids = numDevices;
mg_info_ptr[i]->all_sum = all_sum;
mg_info_ptr[i]->prev_sum = prev_sum;
prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
lp.gridDim.x * lp.gridDim.y * lp.gridDim.z;
impCoopParams[0] = &mg_info_ptr[i];
result = ihipModuleLaunchKernel(tls, kds[i],
lp.gridDim.x * lp.blockDim.x,
@@ -559,7 +621,23 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
lp.blockDim.x, lp.blockDim.y,
lp.blockDim.z, lp.sharedMem,
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
true);
true, impCoopParams);
if (result != hipSuccess) {
for (int j = 0; j < numDevices; ++j) {
launchParamsList[j].stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
#endif
}
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
}
return ihipLogStatus(hipErrorLaunchFailure);
}
}
// unlock all streams
@@ -573,6 +651,11 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
free(gwsKds);
free(kds);
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
}
return ihipLogStatus(result);
}