Added new gpu targets (#631)
This commit is contained in:
gecommit door
GitHub
bovenliggende
a523b37ac7
commit
ef71550738
@@ -34,12 +34,12 @@ set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "")
|
||||
#Set the AMDGPU_TARGETS with backward compatiblity
|
||||
if(COMMAND rocm_check_target_ids)
|
||||
rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
|
||||
TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030"
|
||||
TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030;gfx1100;gfx1101;gfx1102"
|
||||
)
|
||||
else()
|
||||
# Use target ID syntax if supported for AMDGPU_TARGETS
|
||||
if(TARGET_ID_SUPPORT)
|
||||
set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030")
|
||||
set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030;gfx1100;gfx1101;gfx1102")
|
||||
else()
|
||||
set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908")
|
||||
endif()
|
||||
|
||||
@@ -330,13 +330,13 @@ class ncclFunction {
|
||||
};
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
#define traceColl(launch_type) { \
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
|
||||
#define traceColl(launch_type) { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->funcIndex = ncclShmem.work.header.funcIndex; \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \
|
||||
if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
|
||||
struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
|
||||
collTrace->p2p[0].connIndex = 0; \
|
||||
@@ -362,25 +362,84 @@ class ncclFunction {
|
||||
} \
|
||||
}
|
||||
|
||||
#define traceKernelLaunch(firstLaunch) { \
|
||||
#define traceKernelLaunch(firstLaunch) { \
|
||||
traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
|
||||
}
|
||||
#define traceKernelEnd() { \
|
||||
#define traceKernelEnd() { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceKernelEndType; \
|
||||
}
|
||||
#define traceAbort() { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceAbortType; \
|
||||
}
|
||||
#define traceData(data2, data4, data8_0, data8_1) { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->funcIndex = data2; \
|
||||
collTrace->data_0 = data4; \
|
||||
collTrace->opCount = data8_0; \
|
||||
collTrace->data_1 = data8_1; \
|
||||
collTrace->type = ncclCollTraceDataType; \
|
||||
}
|
||||
#else
|
||||
#define traceColl(launch_type) { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->funcIndex = ncclShmem.work.header.funcIndex; \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \
|
||||
if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
|
||||
struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
|
||||
collTrace->p2p[0].connIndex = 0; \
|
||||
collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
|
||||
collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \
|
||||
collTrace->p2p[0].nWarps = p2pElems[0].nWarps; \
|
||||
collTrace->p2p[0].warpStart = p2pElems[0].warpStart; \
|
||||
collTrace->p2p[0].peer = p2pElems[0].p2pType == ncclWorkP2pTypeRecv ? (uint16_t)(p2pElems[0].peer) : -1; \
|
||||
collTrace->p2p[1].connIndex = 0; \
|
||||
collTrace->p2pOpCount[1] = p2pElems[1].opCount; \
|
||||
collTrace->p2p[1].ngroups = p2pElems[1].ngroups; \
|
||||
collTrace->p2p[1].nWarps = p2pElems[1].nWarps; \
|
||||
collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \
|
||||
collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
|
||||
} else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \
|
||||
struct ncclWorkElem *elems = ncclShmem.work.elems; \
|
||||
collTrace->opCount = elems[0].opCount; \
|
||||
collTrace->coll.nWarps = elems[0].nWarps; \
|
||||
collTrace->coll.bid = elems[0].bid; \
|
||||
collTrace->coll.nChannels = elems[0].nChannels; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
|
||||
} \
|
||||
}
|
||||
#define traceKernelLaunch(firstLaunch) { \
|
||||
traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
|
||||
}
|
||||
#define traceKernelEnd() { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceKernelEndType; \
|
||||
}
|
||||
#define traceAbort() { \
|
||||
#define traceAbort() { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceAbortType; \
|
||||
}
|
||||
// traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
|
||||
#define traceData(data2, data4, data8_0, data8_1) { \
|
||||
#define traceData(data2, data4, data8_0, data8_1) { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
@@ -391,6 +450,7 @@ class ncclFunction {
|
||||
collTrace->data_1 = data8_1; \
|
||||
collTrace->type = ncclCollTraceDataType; \
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#define traceData(data2, data4, data8_0, data8_1)
|
||||
#endif
|
||||
|
||||
@@ -297,7 +297,7 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__gfx1030__)
|
||||
#if !defined(__gfx1030__) && !defined(__gfx1100__) && !defined(__gfx1101__) && !defined(__gfx1102__)
|
||||
if (tid == 0) __asm__ __volatile__("buffer_wbinvl1_vol");
|
||||
#endif
|
||||
/************************ Send **************************/
|
||||
|
||||
Verwijs in nieuw issue
Block a user