[hip] add support for implicit kernel argument for multi-grid sync (#1456)

* [hip] add support for implicit kernel argument for multi-grid sync

* modified code for calculating the prev_sum

* change the impCoopArg type to size_t

* add memory clean up

* launch init_gws and main kernels into two separate loops
Šī revīzija ir iekļauta:
Aryan Salmanpour
2019-10-24 08:13:30 -04:00
revīziju iesūtīja Maneesh Gupta
vecāks fe5f7d4245
revīzija 359dc79101
3 mainīti faili ar 211 papildinājumiem un 99 dzēšanām
+101 -93
Parādīt failu
@@ -141,6 +141,103 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s
return ptr;
}
/**
 * Allocate host memory for the public host-allocation entry points.
 *
 * @param tls        Thread-local HIP state; not read by this function.
 * @param ptr        [out] Receives the allocation. Written as nullptr when sizeBytes is 0,
 *                   when the flags are rejected, or when the allocator returns null.
 * @param sizeBytes  Requested size in bytes; 0 is accepted and allocates nothing.
 * @param flags      Combination of hipHostMalloc* flag bits.
 * @return hipSuccess on success (including sizeBytes == 0);
 *         hipErrorInvalidValue if there is no default context, ptr is null, an unsupported
 *         flag bit is set, or both Coherent and NonCoherent are requested;
 *         hipErrorMemoryAllocation if the allocator returns null.
 */
hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
    hipError_t hip_status = hipSuccess;

    // When HIP_SYNC_HOST_ALLOC is set, synchronize the device before and after the allocation.
    if (HIP_SYNC_HOST_ALLOC) {
        hipDeviceSynchronize();
    }

    auto ctx = ihipGetTlsDefaultCtx();

    if ((ctx == nullptr) || (ptr == nullptr)) {
        hip_status = hipErrorInvalidValue;
    } else if (sizeBytes == 0) {
        // A zero-byte request succeeds without allocating. Hand back a well-defined null
        // pointer so the caller never reads an indeterminate *ptr after hipSuccess.
        *ptr = nullptr;
    } else {
        // NOTE(review): hipHostMallocDefault is not expanded to Mapped|Portable here; the
        // caller's raw flags are validated and forwarded unchanged, exactly as before.
        const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped |
                                        hipHostMallocWriteCombined | hipHostMallocCoherent |
                                        hipHostMallocNonCoherent;
        const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;

        if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) {
            // can't specify unsupported flags, can't specify both Coherent + NonCoherent
            *ptr = nullptr;
            hip_status = hipErrorInvalidValue;
        } else {
#if (__hcc_workweek__ >= 19115)
            // Avoid mapping host pinned memory to all devices by HCC
            unsigned amFlags = amHostUnmapped;
#else
            unsigned amFlags = 0;
#endif
            if (flags & hipHostMallocCoherent) {
                amFlags |= amHostCoherent;
            } else if (flags & hipHostMallocNonCoherent) {
                amFlags |= amHostNonCoherent;
            } else {
                // Neither coherency flag given: depends on env variables.
                amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent;
            }

            *ptr = hip_internal::allocAndSharePtr(
                (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx,
                true /*shareWithAll*/, amFlags, flags, 0);

            // sizeBytes is known to be non-zero in this branch, so null means failure.
            if (*ptr == nullptr) {
                hip_status = hipErrorMemoryAllocation;
            }
        }
    }

    if (HIP_SYNC_HOST_ALLOC) {
        hipDeviceSynchronize();
    }
    return hip_status;
}
/**
 * Free host memory on behalf of the public hipHostFree entry point.
 *
 * @param tls  Thread-local HIP state; not read by this function.
 * @param ptr  Pointer to release; nullptr is accepted.
 * @return hipSuccess if ptr is null or was freed;
 *         hipErrorInvalidValue if the memory tracker has no record of ptr, or ptr is not the
 *         host pointer of its tracked allocation.
 */
hipError_t ihipHostFree(TlsData *tls, void* ptr) {
    // Synchronize to ensure all work has finished. This ignores non-blocking streams and
    // waits for all other activity to finish. The default context is checked for null first,
    // as ihipHostMalloc does, instead of being dereferenced unconditionally.
    auto ctx = ihipGetTlsDefaultCtx();
    if (ctx != nullptr) {
        ctx->locked_waitAllStreams();
    }

    if (ptr == nullptr) {
        // free NULL pointer succeeds and is common technique to initialize runtime
        return hipSuccess;
    }

    hc::accelerator acc;
#if (__hcc_workweek__ >= 17332)
    hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0);
#else
    hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
#endif

    hipError_t hipStatus = hipErrorInvalidValue;
    am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
    // Only free when the tracker knows ptr and it is the host pointer of that allocation.
    if ((status == AM_SUCCESS) && (amPointerInfo._hostPointer == ptr)) {
        hc::am_free(ptr);
        hipStatus = hipSuccess;
    }
    return hipStatus;
}
} // end namespace hip_internal
@@ -301,79 +398,12 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
return ihipLogStatus(hip_status);
}
/**
 * File-scope entry point kept for callers that use the unqualified name.
 *
 * This used to be a line-for-line copy of hip_internal::ihipHostMalloc; it now forwards to
 * that function so the allocation logic exists in one place only.
 *
 * @param tls        Thread-local HIP state, passed through.
 * @param ptr        [out] Receives the allocation, passed through.
 * @param sizeBytes  Requested size in bytes, passed through.
 * @param flags      hipHostMalloc* flag bits, passed through.
 * @return Whatever hip_internal::ihipHostMalloc returns.
 */
hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
    return hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags);
}
/**
 * Public API: allocate host memory.
 *
 * @param ptr        [out] Receives the allocation.
 * @param sizeBytes  Requested size in bytes.
 * @param flags      hipHostMalloc* flag bits.
 * @return The status from hip_internal::ihipHostMalloc, passed through ihipLogStatus.
 */
hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
    // `tls` is not declared in this function; presumably HIP_INIT_SPECIAL_API introduces it.
    HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags);
    HIP_SET_DEVICE();

    // Allocate exactly once. A second back-to-back allocation call here would overwrite
    // *ptr and leak the first allocation.
    hipError_t hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags);
    return ihipLogStatus(hip_status);
}
@@ -384,7 +414,7 @@ hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) {
if(flags != hipMemAttachGlobal)
hip_status = hipErrorInvalidValue;
else
hip_status = ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault);
hip_status = hip_internal::ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault);
return ihipLogStatus(hip_status);
}
@@ -2146,30 +2176,8 @@ hipError_t hipFree(void* ptr) {
/**
 * Public API: free host memory.
 *
 * @param ptr  Pointer to release; nullptr is accepted by the helper and reported as success.
 * @return The status from hip_internal::ihipHostFree, passed through ihipLogStatus.
 */
hipError_t hipHostFree(void* ptr) {
    // `tls` is not declared in this function; presumably HIP_INIT_SPECIAL_API introduces it.
    HIP_INIT_SPECIAL_API(hipHostFree, (TRACE_MEM), ptr);

    // The stream wait, tracker lookup and free all live in the helper. Doing them inline here
    // as well would free the same pointer twice and declare hipStatus twice in one scope.
    hipError_t hipStatus = hip_internal::ihipHostFree(tls, ptr);
    return ihipLogStatus(hipStatus);
}