diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 3ee14577b0..658cfbf576 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -1022,6 +1022,27 @@ inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) { namespace hip_internal { hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream); + +hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags); + +hipError_t ihipHostFree(TlsData *tls, void* ptr); + +}; + +#define MAX_COOPERATIVE_GPUs 255 + +// do not change these two structs without changing the device library +struct mg_sync { + uint w0; + uint w1; +}; + +struct mg_info { + struct mg_sync *mgs; + uint grid_id; + uint num_grids; + ulong prev_sum; + ulong all_sum; }; //--- diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 48b83287f3..c8369685ec 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -141,6 +141,103 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s return ptr; } +hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) { + hipError_t hip_status = hipSuccess; + + if (HIP_SYNC_HOST_ALLOC) { + hipDeviceSynchronize(); + } + + auto ctx = ihipGetTlsDefaultCtx(); + if ((ctx == nullptr) || (ptr == nullptr)) { + hip_status = hipErrorInvalidValue; + } + else if (sizeBytes == 0) { + hip_status = hipSuccess; + // TODO - should size of 0 return err or be siliently ignored? + } else { + unsigned trueFlags = flags; + if (flags == hipHostMallocDefault) { + // HCC/ROCM provide a modern system with unified memory and should set both of these + // flags by default: + trueFlags = hipHostMallocMapped | hipHostMallocPortable; + } + + + const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | + hipHostMallocWriteCombined | hipHostMallocCoherent | + hipHostMallocNonCoherent; + + + const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; + + if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) { + *ptr = nullptr; + // can't specify unsupported flags, can't specify both Coherent + NonCoherent + hip_status = hipErrorInvalidValue; + } else { + auto device = ctx->getWriteableDevice(); +#if (__hcc_workweek__ >= 19115) + //Avoid mapping host pinned memory to all devices by HCC + unsigned amFlags = amHostUnmapped; +#else + unsigned amFlags = 0; +#endif + if (flags & hipHostMallocCoherent) { + amFlags |= amHostCoherent; + } else if (flags & hipHostMallocNonCoherent) { + amFlags |= amHostNonCoherent; + } else { + // depends on env variables: + amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent; + } + + + *ptr = hip_internal::allocAndSharePtr( + (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx, + true /*shareWithAll*/, amFlags, flags, 0); + + if (sizeBytes && (*ptr == NULL)) { + hip_status = hipErrorMemoryAllocation; + } + } + } + + if (HIP_SYNC_HOST_ALLOC) { + hipDeviceSynchronize(); + } + return hip_status; +} + +hipError_t ihipHostFree(TlsData *tls, void* ptr) { + + // Synchronize to ensure all work has finished. + ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits + // for all activity to finish. + + hipError_t hipStatus = hipErrorInvalidValue; + if (ptr) { + hc::accelerator acc; +#if (__hcc_workweek__ >= 17332) + hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); +#else + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); +#endif + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); + if (status == AM_SUCCESS) { + if (amPointerInfo._hostPointer == ptr) { + hc::am_free(ptr); + hipStatus = hipSuccess; + } + } + } else { + // free NULL pointer succeeds and is common technique to initialize runtime + hipStatus = hipSuccess; + } + + return hipStatus; +} + } // end namespace hip_internal @@ -301,79 +398,12 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag return ihipLogStatus(hip_status); } -hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) { - hipError_t hip_status = hipSuccess; - - if (HIP_SYNC_HOST_ALLOC) { - hipDeviceSynchronize(); - } - - auto ctx = ihipGetTlsDefaultCtx(); - if ((ctx == nullptr) || (ptr == nullptr)) { - hip_status = hipErrorInvalidValue; - } - else if (sizeBytes == 0) { - hip_status = hipSuccess; - // TODO - should size of 0 return err or be siliently ignored? - } else { - unsigned trueFlags = flags; - if (flags == hipHostMallocDefault) { - // HCC/ROCM provide a modern system with unified memory and should set both of these - // flags by default: - trueFlags = hipHostMallocMapped | hipHostMallocPortable; - } - - - const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | - hipHostMallocWriteCombined | hipHostMallocCoherent | - hipHostMallocNonCoherent; - - - const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; - - if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) { - *ptr = nullptr; - // can't specify unsupported flags, can't specify both Coherent + NonCoherent - hip_status = hipErrorInvalidValue; - } else { - auto device = ctx->getWriteableDevice(); -#if (__hcc_workweek__ >= 19115) - //Avoid mapping host pinned memory to all devices by HCC - unsigned amFlags = amHostUnmapped; -#else - unsigned amFlags = 0; -#endif - if (flags & hipHostMallocCoherent) { - amFlags |= amHostCoherent; - } else if (flags & hipHostMallocNonCoherent) { - amFlags |= amHostNonCoherent; - } else { - // depends on env variables: - amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent; - } - - - *ptr = hip_internal::allocAndSharePtr( - (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx, - true /*shareWithAll*/, amFlags, flags, 0); - - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorMemoryAllocation; - } - } - } - - if (HIP_SYNC_HOST_ALLOC) { - hipDeviceSynchronize(); - } - return hip_status; -} hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags); HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; - hip_status = ihipHostMalloc(tls, ptr, sizeBytes, flags); + hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags); return ihipLogStatus(hip_status); } @@ -384,7 +414,7 @@ hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) { if(flags != hipMemAttachGlobal) hip_status = hipErrorInvalidValue; else - hip_status = ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault); + hip_status = hip_internal::ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault); return ihipLogStatus(hip_status); } @@ -2146,30 +2176,8 @@ hipError_t hipFree(void* ptr) { hipError_t hipHostFree(void* ptr) { HIP_INIT_SPECIAL_API(hipHostFree, (TRACE_MEM), ptr); - // Synchronize to ensure all work has finished. - ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits - // for all activity to finish. - - - hipError_t hipStatus = hipErrorInvalidValue; - if (ptr) { - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); - if (status == AM_SUCCESS) { - if (amPointerInfo._hostPointer == ptr) { - hc::am_free(ptr); - hipStatus = hipSuccess; - } - } - } else { - // free NULL pointer succeeds and is common technique to initialize runtime - hipStatus = hipSuccess; - } + hipError_t hipStatus = hipSuccess; + hipStatus = hip_internal::ihipHostFree(tls, ptr); return ihipLogStatus(hipStatus); }; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index b3afdd4ffe..ac239105b8 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -137,7 +137,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, - hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) { + hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0, + void** impCoopParams = 0) { using namespace hip_impl; auto ctx = ihipGetTlsDefaultCtx(); @@ -181,10 +182,17 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global return hipErrorInvalidValue; } - // Insert 48-bytes at the end for implicit kernel arguments and fill with value zero. + // Insert 56-bytes at the end for implicit kernel arguments and fill with value zero. size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1); kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0); + if (impCoopParams) { + const auto p{static_cast(*impCoopParams)}; + // The sixth index is for multi-grid synchronization + kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT, + p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT); + } + /* Kernel argument preparation. */ @@ -449,6 +457,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, return ihipLogStatus(hipErrorLaunchFailure); } + size_t impCoopArg = 1; + void* impCoopParams[1]; + impCoopParams[0] = &impCoopArg; + // launch the main kernel result = ihipModuleLaunchKernel(tls, kd, gridDim.x * blockDimX.x, @@ -456,7 +468,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, gridDim.z * blockDimX.z, blockDimX.x, blockDimX.y, blockDimX.z, sharedMemBytes, stream, kernelParams, nullptr, nullptr, - nullptr, 0, true); + nullptr, 0, true, impCoopParams); stream->criticalData().unlock(); #if (__hcc_workweek__ >= 19213) @@ -472,7 +484,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags); hipError_t result; - if (numDevices > g_deviceCnt || launchParamsList == nullptr) { + if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) { return ihipLogStatus(hipErrorInvalidValue); } @@ -523,6 +535,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi kargs.getHandle()); } + mg_sync *mg_sync_ptr = 0; + mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0}; + + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault); + if (result != hipSuccess) { + return ihipLogStatus(hipErrorInvalidValue); + } + mg_sync_ptr->w0 = 0; + mg_sync_ptr->w1 = 0; + + uint all_sum = 0; + for (int i = 0; i < numDevices; ++i) { + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault); + if (result != hipSuccess) { + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < i; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(hipErrorInvalidValue); + } + // calculate the sum of sizes of all grids + const hipLaunchParams& lp = launchParamsList[i]; + all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * + lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; + } + // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device for (int i = 0; i < numDevices; ++i) { LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false); @@ -531,7 +569,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi #endif } - // launch the init_gws kernel to initialize the GWS followed by launching the main kernels for each device + // launch the init_gws kernel to initialize the GWS for each device for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; @@ -549,8 +587,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); #endif } + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(hipErrorLaunchFailure); } + } + + void* impCoopParams[1]; + ulong prev_sum = 0; + // launch the main kernels for each device + for (int i = 0; i < numDevices; ++i) { + const hipLaunchParams& lp = launchParamsList[i]; + + //initialize and setup the implicit kernel argument for multi-grid sync + mg_info_ptr[i]->mgs = mg_sync_ptr; + mg_info_ptr[i]->grid_id = i; + mg_info_ptr[i]->num_grids = numDevices; + mg_info_ptr[i]->all_sum = all_sum; + mg_info_ptr[i]->prev_sum = prev_sum; + prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * + lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; + + + impCoopParams[0] = &mg_info_ptr[i]; result = ihipModuleLaunchKernel(tls, kds[i], lp.gridDim.x * lp.blockDim.x, @@ -559,7 +621,23 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi lp.blockDim.x, lp.blockDim.y, lp.blockDim.z, lp.sharedMem, lp.stream, lp.args, nullptr, nullptr, nullptr, 0, - true); + true, impCoopParams); + + if (result != hipSuccess) { + for (int j = 0; j < numDevices; ++j) { + launchParamsList[j].stream->criticalData().unlock(); +#if (__hcc_workweek__ >= 19213) + launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); +#endif + } + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + + return ihipLogStatus(hipErrorLaunchFailure); + } + } // unlock all streams @@ -573,6 +651,11 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi free(gwsKds); free(kds); + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(result); }