diff --git a/CMakeLists.txt b/CMakeLists.txt index f71929e5c6..67a20696b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,12 +34,12 @@ set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "") #Set the AMDGPU_TARGETS with backward compatiblity if(COMMAND rocm_check_target_ids) rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030" + TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030;gfx1100;gfx1101;gfx1102" ) else() # Use target ID syntax if supported for AMDGPU_TARGETS if(TARGET_ID_SUPPORT) - set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030") + set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030;gfx1100;gfx1101;gfx1102") else() set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908") endif() diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index bb96806460..4cf1988e98 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -330,13 +330,13 @@ class ncclFunction { }; #ifdef ENABLE_COLLTRACE -#define traceColl(launch_type) { \ +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) + #define traceColl(launch_type) { \ uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ - collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ + collTrace->timeStamp = wall_clock64(); \ collTrace->bid = blockIdx.x; \ collTrace->funcIndex = ncclShmem.work.header.funcIndex; \ - asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \ if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \ struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \ collTrace->p2p[0].connIndex = 0; \ @@ -362,25 +362,84 @@ class ncclFunction { } \ } -#define traceKernelLaunch(firstLaunch) { \ + #define traceKernelLaunch(firstLaunch) { \ traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \ } -#define traceKernelEnd() { \ + #define traceKernelEnd() { \ + uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ + collTrace->timeStamp = wall_clock64(); \ + collTrace->bid = blockIdx.x; \ + collTrace->type = ncclCollTraceKernelEndType; \ + } + #define traceAbort() { \ + uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ + collTrace->timeStamp = wall_clock64(); \ + collTrace->bid = blockIdx.x; \ + collTrace->type = ncclCollTraceAbortType; \ + } + #define traceData(data2, data4, data8_0, data8_1) { \ + uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ + collTrace->bid = blockIdx.x; \ + collTrace->timeStamp = wall_clock64(); \ + collTrace->funcIndex = data2; \ + collTrace->data_0 = data4; \ + collTrace->opCount = data8_0; \ + collTrace->data_1 = data8_1; \ + collTrace->type = ncclCollTraceDataType; \ + } +#else + #define traceColl(launch_type) { \ + uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ + collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ + collTrace->bid = blockIdx.x; \ + collTrace->funcIndex = ncclShmem.work.header.funcIndex; \ + asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \ + if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \ + struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \ + collTrace->p2p[0].connIndex = 0; \ + collTrace->p2pOpCount[0] = p2pElems[0].opCount; \ + collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \ + collTrace->p2p[0].nWarps = p2pElems[0].nWarps; \ + collTrace->p2p[0].warpStart = p2pElems[0].warpStart; \ + collTrace->p2p[0].peer = p2pElems[0].p2pType == ncclWorkP2pTypeRecv ? (uint16_t)(p2pElems[0].peer) : -1; \ + collTrace->p2p[1].connIndex = 0; \ + collTrace->p2pOpCount[1] = p2pElems[1].opCount; \ + collTrace->p2p[1].ngroups = p2pElems[1].ngroups; \ + collTrace->p2p[1].nWarps = p2pElems[1].nWarps; \ + collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \ + collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \ + collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \ + } else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \ + struct ncclWorkElem *elems = ncclShmem.work.elems; \ + collTrace->opCount = elems[0].opCount; \ + collTrace->coll.nWarps = elems[0].nWarps; \ + collTrace->coll.bid = elems[0].bid; \ + collTrace->coll.nChannels = elems[0].nChannels; \ + collTrace->type = (launch_type) | ncclCollTraceCollElemType; \ + } \ + } + #define traceKernelLaunch(firstLaunch) { \ + traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \ + } + #define traceKernelEnd() { \ uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ collTrace->bid = blockIdx.x; \ collTrace->type = ncclCollTraceKernelEndType; \ } -#define traceAbort() { \ + #define traceAbort() { \ uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ collTrace->bid = blockIdx.x; \ collTrace->type = ncclCollTraceAbortType; \ } -// traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1) -#define traceData(data2, data4, data8_0, data8_1) { \ + #define traceData(data2, data4, data8_0, data8_1) { \ uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ collTrace->bid = blockIdx.x; \ @@ -391,6 +450,7 @@ class ncclFunction { collTrace->data_1 = data8_1; \ collTrace->type = ncclCollTraceDataType; \ } +#endif #else #define traceData(data2, data4, data8_0, data8_1) #endif diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index b7ef95dd0f..2242a081a3 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -297,7 +297,7 @@ private: } } -#if !defined(__gfx1030__) +#if !defined(__gfx1030__) && !defined(__gfx1100__) && !defined(__gfx1101__) && !defined(__gfx1102__) if (tid == 0) __asm__ __volatile__("buffer_wbinvl1_vol"); #endif /************************ Send **************************/