rearranged how the min and max functions are part of msccl (#1025)
* rearranged how the min and max functions are part of msccl
* added more coverage on in-place graph tests
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
7e1cbb440d
commit
f4858e14b2
@@ -338,21 +338,21 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_SEND_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
prims.template send<1>(srcOffset, thisNelem); // LL.send is the only situation where there is no barrier at the end.
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_SEND_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
else if (t->type == MSCCL_RECV) {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
prims.template recv<1>(dstOffset, thisNelem);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT)
|
||||
if (tid == 0) {
|
||||
@@ -515,8 +515,8 @@ __global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct
|
||||
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC() \
|
||||
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Sum, false) \
|
||||
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Prod, false) \
|
||||
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Min, false) \
|
||||
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Max, false) \
|
||||
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Min, false) \
|
||||
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(PreMulSum, false) \
|
||||
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(SumPostDiv, false)
|
||||
|
||||
|
||||
@@ -39,8 +39,8 @@ __global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto, fullOps)(struct n
|
||||
#define MSCCL_DECL_KERNEL_ENTRY_FUNC() \
|
||||
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Sum, false) \
|
||||
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Prod, false) \
|
||||
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Min, false) \
|
||||
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Max, false) \
|
||||
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Min, false) \
|
||||
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(PreMulSum, false) \
|
||||
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(SumPostDiv, false)
|
||||
|
||||
|
||||
@@ -185,7 +185,7 @@ static void HIPRT_CB mscclSetupProxyCallback(void *args) {
|
||||
INFO(NCCL_NET,"mscclSetupProxyCallback: proxy args size: %ld\n", params->size());
|
||||
for (auto &p : *params) {
|
||||
mscclSetupProxyImpl(p.hostAlgo, p.comm);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm, hipStream_t stream) {
|
||||
@@ -317,8 +317,8 @@ static ncclResult_t hostToDevRedOp(
|
||||
#define MSCCL_KERNEL_ENTRY() \
|
||||
MSCCL_KERNEL_ENTRY_DEVREDOP(Sum, false), \
|
||||
MSCCL_KERNEL_ENTRY_DEVREDOP(Prod, false), \
|
||||
MSCCL_KERNEL_ENTRY_DEVREDOP(Min, false), \
|
||||
MSCCL_KERNEL_ENTRY_DEVREDOP(Max, false), \
|
||||
MSCCL_KERNEL_ENTRY_DEVREDOP(Min, false), \
|
||||
MSCCL_KERNEL_ENTRY_DEVREDOP(PreMulSum, false), \
|
||||
MSCCL_KERNEL_ENTRY_DEVREDOP_NOFLOAT(SumPostDiv, false)
|
||||
|
||||
|
||||
@@ -73,7 +73,7 @@ namespace RcclUnitTesting
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclInt32};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclMax};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {393216};
|
||||
std::vector<int> const numElements = {393216, 12888, 384};
|
||||
std::vector<bool> const inPlaceList = {true};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
std::vector<bool> const useHipGraphList = {true};
|
||||
|
||||
Reference in New Issue
Block a user