Remove hipEventDisableSystemFence (#1122)
There is no indication that disabling the system fence provides any latency improvement, so this commit removes it, as recommended by HIP.
This commit is contained in:
@@ -142,9 +142,6 @@ else()
|
||||
message(WARNING "Failed to extract ROCm version.")
|
||||
endif()
|
||||
|
||||
### Check for hipEventDisableSystemFence support
|
||||
check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE)
|
||||
|
||||
### Check for hipDeviceMallocUncached support
|
||||
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
|
||||
|
||||
@@ -576,9 +573,6 @@ endif()
|
||||
if(TRACE)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_TRACE)
|
||||
endif()
|
||||
if(${HIP_EVENT_DISABLE_FENCE})
|
||||
target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE)
|
||||
endif()
|
||||
if(${HIP_CONTIGUOUS_MEMORY})
|
||||
target_compile_definitions(rccl PRIVATE HIP_CONTIGUOUS_MEMORY)
|
||||
endif()
|
||||
|
||||
@@ -521,12 +521,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
// Try to create a CUDA object right away. If there is something wrong with
|
||||
// the device we're on (failure cause #1) , better know it early.
|
||||
hipEvent_t doneEvent;
|
||||
#ifdef HIP_EVENT_DISABLE_FENCE
|
||||
CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming|hipEventDisableSystemFence));
|
||||
#else
|
||||
CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));
|
||||
#endif
|
||||
|
||||
|
||||
comm->doneEvent = doneEvent;
|
||||
comm->lastStream = nullptr;
|
||||
|
||||
@@ -16,10 +16,6 @@
|
||||
#include "msccl/msccl_setup.h"
|
||||
#include "msccl/msccl_status.h"
|
||||
|
||||
#ifndef HIP_EVENT_DISABLE_FENCE
|
||||
RCCL_PARAM(MscclEnableDoneEvent, "MSCCL_ENABLE_DONE_EVENT", 1);
|
||||
#endif
|
||||
|
||||
RCCL_PARAM(MscclWorkFifoDepth, "MSCCL_WORK_FIFO_DEPTH", 256<<10);
|
||||
|
||||
static inline size_t computeSizeNeeded(size_t nBytes, int nScratchChunks, int nChunksPerLoop) {
|
||||
@@ -417,14 +413,7 @@ ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count
|
||||
mscclStatus& status = mscclGetStatus();
|
||||
mscclThreadLocalStatus& threadLocalStatus = mscclGetThreadLocalStatus();
|
||||
|
||||
bool enableDoneEvent =
|
||||
#ifndef HIP_EVENT_DISABLE_FENCE
|
||||
(rcclParamMscclEnableDoneEvent() == 1);
|
||||
#else
|
||||
true;
|
||||
#endif
|
||||
|
||||
if (enableDoneEvent && (status.lastStream != stream && status.lastStream != nullptr)) {
|
||||
if (status.lastStream != stream && status.lastStream != nullptr) {
|
||||
CUDACHECK(hipStreamWaitEvent(stream, comm->doneEvent, 0));
|
||||
}
|
||||
|
||||
@@ -526,11 +515,7 @@ ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count
|
||||
|
||||
void *args[3] = {&comm->devComm, &devAlgo, &workPtr};
|
||||
void *func = mscclKernelEntries[fnIndex];
|
||||
if (enableDoneEvent) {
|
||||
CUDACHECK(hipExtLaunchKernel(func, grid, block, args, 0, stream, NULL, comm->doneEvent, 0));
|
||||
} else {
|
||||
CUDACHECK(hipExtLaunchKernel(func, grid, block, args, 0, stream, NULL, NULL, 0));
|
||||
}
|
||||
CUDACHECK(hipExtLaunchKernel(func, grid, block, args, 0, stream, NULL, comm->doneEvent, 0));
|
||||
status.workIndex++;
|
||||
status.lastStream = stream;
|
||||
return ncclSuccess;
|
||||
|
||||
Reference in New Issue
Block a user