Remove hipEventDisableSystemFence (#1122)

There is no indication that disabling the system fence provides any latency improvement.
Removing it per the recommendation from HIP.
This commit is contained in:
Wenkai Du
2024-03-25 08:01:57 -07:00
zatwierdzone przez GitHub
rodzic c2fc1d6809
commit 5976f757dd
3 zmienionych plików z 2 dodań i 28 usunięć
-6
Wyświetl plik
@@ -142,9 +142,6 @@ else()
message(WARNING "Failed to extract ROCm version.")
endif()
### Check for hipEventDisableSystemFence support
check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE)
### Check for hipDeviceMallocUncached support
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
@@ -576,9 +573,6 @@ endif()
if(TRACE)
target_compile_definitions(rccl PRIVATE ENABLE_TRACE)
endif()
if(${HIP_EVENT_DISABLE_FENCE})
target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE)
endif()
if(${HIP_CONTIGUOUS_MEMORY})
target_compile_definitions(rccl PRIVATE HIP_CONTIGUOUS_MEMORY)
endif()
-5
Wyświetl plik
@@ -521,12 +521,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
// Try to create a CUDA object right away. If there is something wrong with
// the device we're on (failure cause #1) , better know it early.
hipEvent_t doneEvent;
#ifdef HIP_EVENT_DISABLE_FENCE
CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming|hipEventDisableSystemFence));
#else
CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));
#endif
comm->doneEvent = doneEvent;
comm->lastStream = nullptr;
+2 -17
Wyświetl plik
@@ -16,10 +16,6 @@
#include "msccl/msccl_setup.h"
#include "msccl/msccl_status.h"
#ifndef HIP_EVENT_DISABLE_FENCE
RCCL_PARAM(MscclEnableDoneEvent, "MSCCL_ENABLE_DONE_EVENT", 1);
#endif
RCCL_PARAM(MscclWorkFifoDepth, "MSCCL_WORK_FIFO_DEPTH", 256<<10);
static inline size_t computeSizeNeeded(size_t nBytes, int nScratchChunks, int nChunksPerLoop) {
@@ -417,14 +413,7 @@ ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count
mscclStatus& status = mscclGetStatus();
mscclThreadLocalStatus& threadLocalStatus = mscclGetThreadLocalStatus();
bool enableDoneEvent =
#ifndef HIP_EVENT_DISABLE_FENCE
(rcclParamMscclEnableDoneEvent() == 1);
#else
true;
#endif
if (enableDoneEvent && (status.lastStream != stream && status.lastStream != nullptr)) {
if (status.lastStream != stream && status.lastStream != nullptr) {
CUDACHECK(hipStreamWaitEvent(stream, comm->doneEvent, 0));
}
@@ -526,11 +515,7 @@ ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count
void *args[3] = {&comm->devComm, &devAlgo, &workPtr};
void *func = mscclKernelEntries[fnIndex];
if (enableDoneEvent) {
CUDACHECK(hipExtLaunchKernel(func, grid, block, args, 0, stream, NULL, comm->doneEvent, 0));
} else {
CUDACHECK(hipExtLaunchKernel(func, grid, block, args, 0, stream, NULL, NULL, 0));
}
CUDACHECK(hipExtLaunchKernel(func, grid, block, args, 0, stream, NULL, comm->doneEvent, 0));
status.workIndex++;
status.lastStream = stream;
return ncclSuccess;