diff --git a/CMakeLists.txt b/CMakeLists.txt index dcce63ccde..4142cd09cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -542,7 +542,10 @@ if(${HIP_EVENT_DISABLE_FENCE}) target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE) endif() if(${HIP_UNCACHED_MEMORY}) - target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY) + if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.7.31920") + target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY) + message(STATUS "HIP_UNCACHED_MEMORY enabled") + endif() endif() if (BUILD_BFD) if (HAVE_BFD) diff --git a/src/include/alloc.h b/src/include/alloc.h index 9579044bf3..27a82fea82 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -167,9 +167,7 @@ ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (isFineGrain) { #if defined(HIP_UNCACHED_MEMORY) - hipDeviceProp_t prop; - CUDACHECK(hipGetDeviceProperties(&prop, 0)); - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish); #else CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish); #endif @@ -195,9 +193,7 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); if (isFineGrain) { #if defined(HIP_UNCACHED_MEMORY) - hipDeviceProp_t prop; - CUDACHECK(hipGetDeviceProperties(&prop, 0)); - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish); #else CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish); #endif @@ -229,9 +225,7 @@ ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, s CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (isFineGrain) { #if defined(HIP_UNCACHED_MEMORY) - hipDeviceProp_t prop; - CUDACHECK(hipGetDeviceProperties(&prop, 0)); - CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish); #else CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish); #endif diff --git a/src/init.cc b/src/init.cc index ac3a38f43c..c9bef33db2 100644 --- a/src/init.cc +++ b/src/init.cc @@ -124,9 +124,11 @@ static ncclResult_t ncclInit() { if (strstr(strValue, "iommu=pt") == NULL) WARN("Missing \"iommu=pt\" from kernel command line which can lead to system instablity or hang!"); } +#ifndef HIP_UNCACHED_MEMORY char *env = getenv("HSA_FORCE_FINE_GRAIN_PCIE"); if (env == NULL || strcmp(env, "1") != 0) WARN("Missing \"HSA_FORCE_FINE_GRAIN_PCIE=1\" from environment which can lead to low RCCL performance, system instablity or hang!"); +#endif } #ifndef NVTX_NO_IMPL initNvtxRegisteredEnums(); @@ -715,9 +717,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u // detect if fine grained memory is available on this GPU int *ptr; #if defined(HIP_UNCACHED_MEMORY) - hipDeviceProp_t prop; - CUDACHECK(hipGetDeviceProperties(&prop, 0)); - if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained) == hipSuccess) { + if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocUncached) == hipSuccess) { #else if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) { #endif