Reordered the Min and Max reduction ops in the MSCCL kernel entry macros (#1025)

* Reordered the Min and Max reduction ops in the MSCCL kernel entry macros (Max now precedes Min)

* Added more element counts to the in-place HIP graph tests for broader coverage
This commit is contained in:
akolliasAMD
2023-12-21 08:58:33 -07:00
zatwierdzone przez GitHub
rodzic 7e1cbb440d
commit f4858e14b2
4 zmienionych plików z 8 dodań i 8 usunięć
@@ -338,21 +338,21 @@ __device__ __forceinline__ void mscclRunInterpreter(
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_SEND_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
#endif
prims.template send<1>(srcOffset, thisNelem); // LL.send is the only situation where there is no barrier at the end.
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_SEND_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
#endif
}
else if (t->type == MSCCL_RECV) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
#endif
prims.template recv<1>(dstOffset, thisNelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT)
if (tid == 0) {
@@ -515,8 +515,8 @@ __global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct
// Expands to the MSCCL kernel entry function implementations, one group per
// device reduction op. Each op must appear exactly once: a repeated entry
// would define the same kernels twice.
// Order is Sum, Prod, Max, Min, PreMulSum, SumPostDiv.
// NOTE(review): the order presumably has to match the device reduction-op
// enum used to index the kernel table -- confirm against that enum.
// SumPostDiv uses the _NOFLOAT variant of the per-op macro.
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC() \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Sum, false) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Prod, false) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Max, false) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Min, false) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(PreMulSum, false) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(SumPostDiv, false)
+1 -1
Wyświetl plik
@@ -39,8 +39,8 @@ __global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto, fullOps)(struct n
// Expands to the MSCCL kernel entry function declarations, one group per
// device reduction op, each op listed exactly once.
// Order is Sum, Prod, Max, Min, PreMulSum, SumPostDiv.
// NOTE(review): keep this order in step with the matching implementation and
// kernel-table macros -- confirm against the device reduction-op enum.
// SumPostDiv uses the _NOFLOAT variant of the per-op macro.
#define MSCCL_DECL_KERNEL_ENTRY_FUNC() \
  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Sum, false) \
  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Prod, false) \
  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Max, false) \
  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Min, false) \
  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(PreMulSum, false) \
  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(SumPostDiv, false)
+2 -2
Wyświetl plik
@@ -185,7 +185,7 @@ static void HIPRT_CB mscclSetupProxyCallback(void *args) {
INFO(NCCL_NET,"mscclSetupProxyCallback: proxy args size: %ld\n", params->size());
for (auto &p : *params) {
mscclSetupProxyImpl(p.hostAlgo, p.comm);
}
}
}
ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm, hipStream_t stream) {
@@ -317,8 +317,8 @@ static ncclResult_t hostToDevRedOp(
// Expands to a comma-separated list of MSCCL kernel entries, one group per
// device reduction op. Each op must appear exactly once: a repeated entry
// adds an extra group and shifts the position of every entry after it.
// Order is Sum, Prod, Max, Min, PreMulSum, SumPostDiv.
// NOTE(review): this list is presumably indexed by the device reduction-op
// value, so the order must match that enum -- confirm.
// SumPostDiv uses the _NOFLOAT variant of the per-op macro.
#define MSCCL_KERNEL_ENTRY() \
  MSCCL_KERNEL_ENTRY_DEVREDOP(Sum, false), \
  MSCCL_KERNEL_ENTRY_DEVREDOP(Prod, false), \
  MSCCL_KERNEL_ENTRY_DEVREDOP(Max, false), \
  MSCCL_KERNEL_ENTRY_DEVREDOP(Min, false), \
  MSCCL_KERNEL_ENTRY_DEVREDOP(PreMulSum, false), \
  MSCCL_KERNEL_ENTRY_DEVREDOP_NOFLOAT(SumPostDiv, false)
+1 -1
Wyświetl plik
@@ -73,7 +73,7 @@ namespace RcclUnitTesting
std::vector<ncclDataType_t> const dataTypes = {ncclInt32};
std::vector<ncclRedOp_t> const redOps = {ncclMax};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {393216};
std::vector<int> const numElements = {393216, 12888, 384};
std::vector<bool> const inPlaceList = {true};
std::vector<bool> const managedMemList = {false};
std::vector<bool> const useHipGraphList = {true};