diff --git a/src/all_gather.cu b/src/all_gather.cu index 0831207433..6db67e6d00 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -7,10 +7,8 @@ #include "cuda_runtime.h" #include "common.h" -#define ALIGN 4 - -void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - size_t base = (count/(ALIGN*nranks))*ALIGN; +void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { + size_t base = (count/nranks) & -(16/eltSize); *sendcount = base; *recvcount = base*nranks; *sendInplaceOffset = base; @@ -60,7 +58,7 @@ struct testColl allGatherTest = { void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { size_t paramcount, sendInplaceOffset, recvInplaceOffset; - AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); + AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); } testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { diff --git a/src/all_reduce.cu b/src/all_reduce.cu index a38eabe057..4aa1feead7 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -7,7 +7,7 @@ #include "cuda_runtime.h" #include "common.h" -void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *sendcount = count; *recvcount = count; *sendInplaceOffset = 0; @@ -55,7 +55,7 @@ struct testColl
allReduceTest = { void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { size_t paramcount, sendInplaceOffset, recvInplaceOffset; - AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); + AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); } testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { diff --git a/src/alltoall.cu b/src/alltoall.cu index 41c7c4ae33..dd085e54a9 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -7,12 +7,12 @@ #include "cuda_runtime.h" #include "common.h" -void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = (count/nranks)*nranks; - *recvcount = (count/nranks)*nranks; +void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { + *paramcount = (count/nranks) & -(16/eltSize); + *sendcount = nranks*(*paramcount); + *recvcount = *sendcount; *sendInplaceOffset = 0; *recvInplaceOffset = 0; - *paramcount = count/nranks; } testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { @@ -74,7 +74,7 @@ struct testColl alltoAllTest = { void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { size_t paramcount, sendInplaceOffset, recvInplaceOffset; - AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); + AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); } testResult_t AlltoAllRunTest(struct threadArgs*
args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { diff --git a/src/broadcast.cu b/src/broadcast.cu index 903066a2b8..67e9af2f36 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -7,7 +7,7 @@ #include "cuda_runtime.h" #include "common.h" -void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { +void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *sendcount = count; *recvcount = count; *sendInplaceOffset = 0; @@ -64,7 +64,7 @@ struct testColl broadcastTest = { void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { size_t paramcount, sendInplaceOffset, recvInplaceOffset; - BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); + BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); } testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { diff --git a/src/common.cu b/src/common.cu index e1f8a85f16..6d103d797d 100644 --- a/src/common.cu +++ b/src/common.cu @@ -571,7 +571,7 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; count = size / wordSize(type); - args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, wordSize(type), (size_t)nranks); args->nbytes = paramCount * wordSize(type); args->sendBytes =
sendCount * wordSize(type); diff --git a/src/common.h b/src/common.h index e6762e1c97..478d7fb1c0 100644 --- a/src/common.h +++ b/src/common.h @@ -87,7 +87,7 @@ struct testColl { void (*getCollByteCount)( size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, - size_t count, int nranks); + size_t count, size_t eltSize, int nranks); testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); diff --git a/src/gather.cu b/src/gather.cu index 03ef4d9e3f..a4a7a30bcd 100644 --- a/src/gather.cu +++ b/src/gather.cu @@ -7,12 +7,12 @@ #include "cuda_runtime.h" #include "common.h" -void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = count/nranks; - *recvcount = (count/nranks)*nranks; - *sendInplaceOffset = count/nranks; +void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { + *sendcount = (count/nranks) & -(16/eltSize); + *recvcount = (*sendcount)*nranks; + *sendInplaceOffset = *sendcount; *recvInplaceOffset = 0; - *paramcount = count/nranks; + *paramcount = *sendcount; } testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { @@ -73,7 +73,7 @@ struct testColl gatherTest = { void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { size_t paramcount, sendInplaceOffset, recvInplaceOffset; - GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); + GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count,
/*eltSize=*/1, nranks); } testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { diff --git a/src/hypercube.cu b/src/hypercube.cu index 5c1456f8c7..b3459c91f4 100644 --- a/src/hypercube.cu +++ b/src/hypercube.cu @@ -9,8 +9,8 @@ #define ALIGN 4 -void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - size_t base = (count/(ALIGN*nranks))*ALIGN; +void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { + size_t base = (count/nranks) & -(16/eltSize); *sendcount = base; *recvcount = base*nranks; *sendInplaceOffset = base; @@ -78,7 +78,7 @@ struct testColl hyperCubeTest = { void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { size_t paramcount, sendInplaceOffset, recvInplaceOffset; - HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); + HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); } testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { diff --git a/src/reduce.cu b/src/reduce.cu index f2fa80dd95..731abfa141 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -7,7 +7,7 @@ #include "cuda_runtime.h" #include "common.h" -void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { +void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *sendcount =
count; *recvcount = count; *sendInplaceOffset = 0; @@ -54,7 +54,7 @@ struct testColl reduceTest = { void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { size_t paramcount, sendInplaceOffset, recvInplaceOffset; - ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); + ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); } testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index ed372e3b9a..35cfdd4929 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -7,10 +7,8 @@ #include "cuda_runtime.h" #include "common.h" -#define ALIGN 4 - -void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - size_t base = (count/(ALIGN*nranks))*ALIGN; +void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { + size_t base = (count/nranks) & -(16/eltSize); *sendcount = base*nranks; *recvcount = base; *sendInplaceOffset = 0; @@ -59,7 +57,7 @@ struct testColl reduceScatterTest = { void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { size_t paramcount, sendInplaceOffset, recvInplaceOffset; - ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); + ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); } testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char*
opName) { diff --git a/src/scatter.cu b/src/scatter.cu index 49d20e1601..d1eec71282 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -7,12 +7,12 @@ #include "cuda_runtime.h" #include "common.h" -void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = (count/nranks)*nranks; - *recvcount = count/nranks; +void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { + *recvcount = (count/nranks) & -(16/eltSize); + *sendcount = (*recvcount)*nranks; *sendInplaceOffset = 0; - *recvInplaceOffset = count/nranks; - *paramcount = count/nranks; + *recvInplaceOffset = *recvcount; + *paramcount = *recvcount; } testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { @@ -69,7 +69,7 @@ struct testColl scatterTest = { void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { size_t paramcount, sendInplaceOffset, recvInplaceOffset; - ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); + ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); } testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { diff --git a/src/sendrecv.cu b/src/sendrecv.cu index c9eb5bb427..67a4898b27 100644 --- a/src/sendrecv.cu +++ b/src/sendrecv.cu @@ -7,7 +7,7 @@ #include "cuda_runtime.h" #include "common.h" -void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { +void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount,
size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *sendcount = count; *recvcount = count; *sendInplaceOffset = 0; @@ -68,7 +68,7 @@ struct testColl sendRecvTest = { void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { size_t paramcount, sendInplaceOffset, recvInplaceOffset; - SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); + SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); } testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {