From 32dc7ef47c69e876358f94b9f1ca4192dfcd1fc3 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Mon, 17 Feb 2025 13:28:19 -0800
Subject: [PATCH] Enable GDRCopy only on gfx94x (#1550)

* Enable GDRCopy only on gfx94x

* Use cudaFree instead of hipFree

* Add warning if failed to get device property

* Remove extra return
---
 src/include/gdrwrap.h | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/include/gdrwrap.h b/src/include/gdrwrap.h
index cb4887615e..d305690856 100644
--- a/src/include/gdrwrap.h
+++ b/src/include/gdrwrap.h
@@ -13,6 +13,7 @@
 #include <stdint.h> // for standard [u]intX_t types
 #include <stdio.h>
 #include <stdlib.h>
+#include "archinfo.h"
 
 // These can be used if the GDR library isn't thread safe
 #include <pthread.h>
@@ -160,8 +161,21 @@ typedef struct gdr_mem_desc {
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 
 static gdr_t ncclGdrInit() {
-  INFO(NCCL_INIT, "Enabled GDRCopy equivalent memory allocation");
-  return (gdr_t)0x12345678L;
+  cudaDeviceProp devProp;
+  char gcnArchNameSubstr[128];
+  cudaError_t err = cudaGetDeviceProperties(&devProp, 0);
+  if (err != cudaSuccess) {
+    WARN("Failed to GetDeviceProperties for device");
+    return NULL;
+  }
+  GcnArchNameFormat(devProp.gcnArchName, gcnArchNameSubstr);
+  if (IsArchMatch(gcnArchNameSubstr, "gfx94")) {
+    INFO(NCCL_INIT, "Enabled GDRCopy equivalent memory allocation on %s", gcnArchNameSubstr);
+    return (gdr_t)0x12345678L;
+  } else {
+    INFO(NCCL_INIT, "Disabled GDRCopy equivalent memory allocation on %s due to GPU architecture", gcnArchNameSubstr);
+    return NULL;
+  }
 }
 
 template <typename T>
@@ -209,7 +223,7 @@ static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nele
 
 static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
   gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
-  CUDACHECK(hipFree(md->gdrDevMem));
+  CUDACHECK(cudaFree(md->gdrDevMem));
   free(md);
 
   return ncclSuccess;