[hip] add support for implicit kernel argument for multi-grid sync (#1456)
* [hip] add support for implicit kernel argument for multi-grid sync * modified code for calculating the prev_sum * change the impCoopArg type to size_t * add memory clean up * launch init_gws and main kernels into two separate loops
Этот коммит содержится в:
коммит произвёл
Maneesh Gupta
родитель
fe5f7d4245
Коммит
359dc79101
@@ -137,7 +137,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) {
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
|
||||
void** impCoopParams = 0) {
|
||||
using namespace hip_impl;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -181,10 +182,17 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
// Insert 48-bytes at the end for implicit kernel arguments and fill with value zero.
|
||||
// Insert 56-bytes at the end for implicit kernel arguments and fill with value zero.
|
||||
size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1);
|
||||
kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0);
|
||||
|
||||
if (impCoopParams) {
|
||||
const auto p{static_cast<const char*>(*impCoopParams)};
|
||||
// The sixth index is for multi-grid synchronization
|
||||
kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
|
||||
p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
|
||||
}
|
||||
|
||||
/*
|
||||
Kernel argument preparation.
|
||||
*/
|
||||
@@ -449,6 +457,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
return ihipLogStatus(hipErrorLaunchFailure);
|
||||
}
|
||||
|
||||
size_t impCoopArg = 1;
|
||||
void* impCoopParams[1];
|
||||
impCoopParams[0] = &impCoopArg;
|
||||
|
||||
// launch the main kernel
|
||||
result = ihipModuleLaunchKernel(tls, kd,
|
||||
gridDim.x * blockDimX.x,
|
||||
@@ -456,7 +468,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
gridDim.z * blockDimX.z,
|
||||
blockDimX.x, blockDimX.y, blockDimX.z,
|
||||
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
|
||||
nullptr, 0, true);
|
||||
nullptr, 0, true, impCoopParams);
|
||||
|
||||
stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
@@ -472,7 +484,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags);
|
||||
hipError_t result;
|
||||
|
||||
if (numDevices > g_deviceCnt || launchParamsList == nullptr) {
|
||||
if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -523,6 +535,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
kargs.getHandle());
|
||||
}
|
||||
|
||||
mg_sync *mg_sync_ptr = 0;
|
||||
mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
|
||||
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
|
||||
if (result != hipSuccess) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
mg_sync_ptr->w0 = 0;
|
||||
mg_sync_ptr->w1 = 0;
|
||||
|
||||
uint all_sum = 0;
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
|
||||
if (result != hipSuccess) {
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < i; ++j) {
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
// calculate the sum of sizes of all grids
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
|
||||
lp.gridDim.x * lp.gridDim.y * lp.gridDim.z;
|
||||
}
|
||||
|
||||
// lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
|
||||
@@ -531,7 +569,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
#endif
|
||||
}
|
||||
|
||||
// launch the init_gws kernel to initialize the GWS followed by launching the main kernels for each device
|
||||
// launch the init_gws kernel to initialize the GWS for each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
|
||||
@@ -549,8 +587,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
}
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
|
||||
return ihipLogStatus(hipErrorLaunchFailure);
|
||||
}
|
||||
}
|
||||
|
||||
void* impCoopParams[1];
|
||||
ulong prev_sum = 0;
|
||||
// launch the main kernels for each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
|
||||
//initialize and setup the implicit kernel argument for multi-grid sync
|
||||
mg_info_ptr[i]->mgs = mg_sync_ptr;
|
||||
mg_info_ptr[i]->grid_id = i;
|
||||
mg_info_ptr[i]->num_grids = numDevices;
|
||||
mg_info_ptr[i]->all_sum = all_sum;
|
||||
mg_info_ptr[i]->prev_sum = prev_sum;
|
||||
prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
|
||||
lp.gridDim.x * lp.gridDim.y * lp.gridDim.z;
|
||||
|
||||
|
||||
impCoopParams[0] = &mg_info_ptr[i];
|
||||
|
||||
result = ihipModuleLaunchKernel(tls, kds[i],
|
||||
lp.gridDim.x * lp.blockDim.x,
|
||||
@@ -559,7 +621,23 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
lp.blockDim.x, lp.blockDim.y,
|
||||
lp.blockDim.z, lp.sharedMem,
|
||||
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
|
||||
true);
|
||||
true, impCoopParams);
|
||||
|
||||
if (result != hipSuccess) {
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
launchParamsList[j].stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
}
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
|
||||
return ihipLogStatus(hipErrorLaunchFailure);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// unlock all streams
|
||||
@@ -573,6 +651,11 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
|
||||
return ihipLogStatus(result);
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user