Merge pull request #134 from changpeng/master

Tune inlining and unrolling to reduce scratch usage

[ROCm/rccl commit: 662281e599]
此提交包含在:
Wenkai Du
2019-10-09 10:58:38 -07:00
提交者 GitHub
當前提交 c4ed3d2e08
+3 -4
查看文件
@@ -102,8 +102,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
#endif
}
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x;
@@ -122,7 +121,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
do {
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
ncclPrimitives<1, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
@@ -139,7 +138,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
do {
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
ncclPrimitives<1, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;