From cf8589b8c8a40ddcc55fa3a51e23390a49824130 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Mon, 13 Apr 2020 04:32:52 -0400 Subject: [PATCH] [HIP] add support for NoPreSync/NoPostSync flags for Cooperative MultiDevice launch API (#1990) --- src/hip_hcc.cpp | 8 +++++- src/hip_hcc_internal.h | 32 +++++++++++++----------- src/hip_memory.cpp | 6 ++--- src/hip_module.cpp | 57 +++++++++++++++++++++++++++++------------- 4 files changed, 66 insertions(+), 37 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 807dcc7391..2fd40903d7 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -263,7 +263,13 @@ ihipStream_t::ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int //--- -ihipStream_t::~ihipStream_t() {} +ihipStream_t::~ihipStream_t() { + GET_TLS(); + for (auto mem : coopMemsTracker) { + hip_internal::ihipHostFree(tls, mem->mgs); + hip_internal::ihipHostFree(tls, mem); + } +} hc::hcWaitMode ihipStream_t::waitMode() const { diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 93551c8316..2c3fb25b3a 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -551,6 +551,20 @@ public: typedef ihipStreamCriticalBase_t ihipStreamCritical_t; typedef LockedAccessor LockedAccessor_StreamCrit_t; +// do not change these two structs without changing the device library +struct mg_sync { + uint w0; + uint w1; +}; + +struct mg_info { + struct mg_sync *mgs; + uint grid_id; + uint num_grids; + ulong prev_sum; + ulong all_sum; +}; + //--- // Internal stream structure. class ihipStream_t { @@ -619,6 +633,8 @@ class ihipStream_t { // Before calling this function, stream must be resolved from "0" to the actual stream: bool isDefaultStream() const { return _id == 0; }; + std::vector coopMemsTracker; + public: //--- // Public member vars - these are set at initialization and never change: @@ -1018,7 +1034,7 @@ namespace hip_internal { hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream); -hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags); +hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags, bool noSync = 0); hipError_t ihipHostFree(TlsData *tls, void* ptr); @@ -1026,20 +1042,6 @@ hipError_t ihipHostFree(TlsData *tls, void* ptr); #define MAX_COOPERATIVE_GPUs 255 -// do not change these two structs without changing the device library -struct mg_sync { - uint w0; - uint w1; -}; - -struct mg_info { - struct mg_sync *mgs; - uint grid_id; - uint num_grids; - ulong prev_sum; - ulong all_sum; -}; - //--- // TODO - review the context creation strategy here. Really should be: // - first "non-device" runtime call creates the context for this thread. Allowed to call diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 832dcc5531..de6bc63b20 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -496,14 +496,14 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s return ptr; } -hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) { +hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags, bool noSync) { hipError_t hip_status = hipSuccess; if (sizeBytes == 0) { return hipSuccess; } - if (HIP_SYNC_HOST_ALLOC) { + if (HIP_SYNC_HOST_ALLOC && !noSync) { hipDeviceSynchronize(); } @@ -558,7 +558,7 @@ hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned i } } - if (HIP_SYNC_HOST_ALLOC) { + if (HIP_SYNC_HOST_ALLOC && !noSync) { hipDeviceSynchronize(); } return hip_status; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index d0ec0df9de..e98afa3294 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -511,8 +511,7 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDim.x; size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDim.y; size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDim.z; - if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) - { + if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) { return hipErrorInvalidConfiguration; } @@ -738,7 +737,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL vector mg_info_ptr; - result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault); + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault, true); if (result != hipSuccess) { return hipErrorInvalidValue; } @@ -748,7 +747,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL uint all_sum = 0; for (int i = 0; i < numDevices; ++i) { mg_info *mg_info_temp = nullptr; - result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault); + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault, true); if (result != hipSuccess) { hip_internal::ihipHostFree(tls, mg_sync_ptr); for (int j = 0; j < i; ++j) { @@ -770,11 +769,22 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL hc::completion_future streamCF; if (!streamCrit->_av.get_is_empty()) { streamCF = streamCrit->_av.create_marker(hc::accelerator_scope); - coopAVs[i].create_blocking_marker(streamCF, hc::accelerator_scope); + if (flags & hipCooperativeLaunchMultiDeviceNoPreSync) { + coopAVs[i].create_blocking_marker(streamCF, hc::accelerator_scope); + streamCrit->_av.acquire_locked_hsa_queue(); + coopAVs[i].acquire_locked_hsa_queue(); + } else { + for (int j = 0; j < numDevices; ++j) { + coopAVs[j].create_blocking_marker(streamCF, hc::accelerator_scope); + } + } + } + } + if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) { + for (int i = 0; i < numDevices; ++i) { + launchParamsList[i].stream->criticalData()._av.acquire_locked_hsa_queue(); + coopAVs[i].acquire_locked_hsa_queue(); } - - streamCrit->_av.acquire_locked_hsa_queue(); - coopAVs[i].acquire_locked_hsa_queue(); } // launch the init_gws kernel to initialize the GWS for each device @@ -820,14 +830,14 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; + lp.stream->coopMemsTracker.push_back(mg_info_ptr[i]); impCoopParams[0] = &mg_info_ptr[i]; globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x; globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y; globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z; - if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) - { + if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) { return hipErrorInvalidConfiguration; } @@ -849,6 +859,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL hip_internal::ihipHostFree(tls, mg_sync_ptr); for (int j = 0; j < numDevices; ++j) { hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + launchParamsList[j].stream->coopMemsTracker.pop_back(); } return hipErrorLaunchFailure; @@ -856,24 +867,34 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL } - // unlock all streams + // unlock streams and create blocking markers on them based on the workload + // on cooperative queues on each device for (int i = 0; i < numDevices; ++i) { coopAVs[i].release_locked_hsa_queue(); launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue(); + } + for (int i = 0; i < numDevices; ++i) { hc::completion_future cooperativeCF; if (!coopAVs[i].get_is_empty()) { cooperativeCF = coopAVs[i].create_marker(hc::accelerator_scope); - launchParamsList[i].stream->criticalData()._av.create_blocking_marker( - cooperativeCF, hc::accelerator_scope); + if (flags & hipCooperativeLaunchMultiDeviceNoPostSync) { + launchParamsList[i].stream->criticalData()._av.create_blocking_marker( + cooperativeCF, hc::accelerator_scope); + launchParamsList[i].stream->criticalData().unlock(); + } else { + for (int j = 0; j < numDevices; ++j) { + launchParamsList[j].stream->criticalData()._av.create_blocking_marker( + cooperativeCF, hc::accelerator_scope); + } + } } - - launchParamsList[i].stream->criticalData().unlock(); } - hip_internal::ihipHostFree(tls, mg_sync_ptr); - for (int j = 0; j < numDevices; ++j) { - hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) { + for (int i = 0; i < numDevices; ++i) { + launchParamsList[i].stream->criticalData().unlock(); + } } return result;