diff --git a/projects/rccl/tools/p2p-latency-test/ll_latency_test.cpp b/projects/rccl/tools/p2p-latency-test/ll_latency_test.cpp index a26980c0c2..8a20e55615 100644 --- a/projects/rccl/tools/p2p-latency-test/ll_latency_test.cpp +++ b/projects/rccl/tools/p2p-latency-test/ll_latency_test.cpp @@ -78,14 +78,14 @@ __device__ uint64_t readLL(union LLFifoLine* src, uint32_t flag, uint32_t* abort __global__ void PingKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint64_t* time_delta, uint32_t* abortFlag) { int tid = threadIdx.x; #pragma unroll - for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) { + for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) { storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i); while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i); } uint64_t start_time, end_time; if (tid == 0) start_time = wall_clock64(); #pragma unroll - for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { + for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i); while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i); } @@ -97,14 +97,14 @@ __global__ void PingKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint __global__ void PongKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint64_t* time_delta, uint32_t* abortFlag) { int tid = threadIdx.x; #pragma unroll - for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) { + for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) { while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i); storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i); } uint64_t start_time, end_time; if (tid == 0) start_time = wall_clock64(); #pragma unroll - for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { + for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i); storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i); } diff --git a/projects/rccl/tools/p2p-latency-test/ll_latency_test.cu b/projects/rccl/tools/p2p-latency-test/ll_latency_test.cu index 80875e7d89..ebbc3ba346 100644 --- a/projects/rccl/tools/p2p-latency-test/ll_latency_test.cu +++ b/projects/rccl/tools/p2p-latency-test/ll_latency_test.cu @@ -69,14 +69,14 @@ __device__ uint64_t readLL(union LLFifoLine* src, uint32_t flag, uint32_t* abort __global__ void PingKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint64_t* time_delta, uint32_t* abortFlag) { int tid = threadIdx.x; #pragma unroll - for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) { + for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) { storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i); while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i); } uint64_t start_time, end_time; if (tid == 0) start_time = clock64(); #pragma unroll - for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { + for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i); while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i); } @@ -88,14 +88,14 @@ __global__ void PingKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint __global__ void PongKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint64_t* time_delta, uint32_t* abortFlag) { int tid = threadIdx.x; #pragma unroll - for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) { + for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) { while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i); storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i); } uint64_t start_time, end_time; if (tid == 0) start_time = clock64(); #pragma unroll - for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { + for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i); storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i); } diff --git a/projects/rccl/tools/p2p-latency-test/p2p_latency_test.cpp b/projects/rccl/tools/p2p-latency-test/p2p_latency_test.cpp index d92f995ed1..0ed3b5e982 100644 --- a/projects/rccl/tools/p2p-latency-test/p2p_latency_test.cpp +++ b/projects/rccl/tools/p2p-latency-test/p2p_latency_test.cpp @@ -23,13 +23,13 @@ __global__ void PingKernel(uint64_t* local_flag, uint64_t* remote_flag, uint64_t* time_delta) { #pragma unroll - for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) { + for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) { __atomic_store_n(remote_flag, i, __ATOMIC_RELAXED); while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i); } uint64_t start_time = wall_clock64(); #pragma unroll - for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { + for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { __atomic_store_n(remote_flag, i, __ATOMIC_RELAXED); while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i); } @@ -39,13 +39,13 @@ __global__ void PingKernel(uint64_t* local_flag, uint64_t* remote_flag, uint64_t __global__ void PongKernel(uint64_t* local_flag, uint64_t* remote_flag, uint64_t* time_delta) { #pragma unroll - for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) { + for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) { while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i); __atomic_store_n(remote_flag, i, __ATOMIC_RELAXED); } uint64_t start_time = wall_clock64(); #pragma unroll - for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { + for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) { while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i); __atomic_store_n(remote_flag, i, __ATOMIC_RELAXED); }