Fix number of loops in p2p-latency-test (#1286)

This commit is contained in:
Ziyue Yang
2024-08-06 04:35:56 +08:00
committed by GitHub
parent cb2e0615d7
commit 145a13235a
3 changed files with 12 additions and 12 deletions
@@ -78,14 +78,14 @@ __device__ uint64_t readLL(union LLFifoLine* src, uint32_t flag, uint32_t* abort
__global__ void PingKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint64_t* time_delta, uint32_t* abortFlag) {
int tid = threadIdx.x;
#pragma unroll
for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) {
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
}
uint64_t start_time, end_time;
if (tid == 0) start_time = wall_clock64();
#pragma unroll
for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
}
@@ -97,14 +97,14 @@ __global__ void PingKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint
__global__ void PongKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint64_t* time_delta, uint32_t* abortFlag) {
int tid = threadIdx.x;
#pragma unroll
for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) {
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
}
uint64_t start_time, end_time;
if (tid == 0) start_time = wall_clock64();
#pragma unroll
for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
}
@@ -69,14 +69,14 @@ __device__ uint64_t readLL(union LLFifoLine* src, uint32_t flag, uint32_t* abort
__global__ void PingKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint64_t* time_delta, uint32_t* abortFlag) {
int tid = threadIdx.x;
#pragma unroll
for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) {
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
}
uint64_t start_time, end_time;
if (tid == 0) start_time = clock64();
#pragma unroll
for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
}
@@ -88,14 +88,14 @@ __global__ void PingKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint
__global__ void PongKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint64_t* time_delta, uint32_t* abortFlag) {
int tid = threadIdx.x;
#pragma unroll
for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) {
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
}
uint64_t start_time, end_time;
if (tid == 0) start_time = clock64();
#pragma unroll
for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
}
@@ -23,13 +23,13 @@
__global__ void PingKernel(uint64_t* local_flag, uint64_t* remote_flag, uint64_t* time_delta) {
#pragma unroll
for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) {
__atomic_store_n(remote_flag, i, __ATOMIC_RELAXED);
while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i);
}
uint64_t start_time = wall_clock64();
#pragma unroll
for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
__atomic_store_n(remote_flag, i, __ATOMIC_RELAXED);
while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i);
}
@@ -39,13 +39,13 @@ __global__ void PingKernel(uint64_t* local_flag, uint64_t* remote_flag, uint64_t
__global__ void PongKernel(uint64_t* local_flag, uint64_t* remote_flag, uint64_t* time_delta) {
#pragma unroll
for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
for (uint32_t i = 1; i <= NUM_LOOPS_WARMUP; i++) {
while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i);
__atomic_store_n(remote_flag, i, __ATOMIC_RELAXED);
}
uint64_t start_time = wall_clock64();
#pragma unroll
for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
for (uint32_t i = NUM_LOOPS_WARMUP + 1; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i);
__atomic_store_n(remote_flag, i, __ATOMIC_RELAXED);
}