Update ll_latency_test and add CUDA version (#873)

Этот коммит содержится в:
Wenkai Du
2023-08-30 16:29:42 -07:00
коммит произвёл GitHub
родитель a433fcc726
Коммит c6dd6f6237
2 изменённых файлов: 197 добавлений и 18 удалений
+16 -18
Просмотреть файл
@@ -14,15 +14,11 @@
#include <iostream> //cerr
#include <cstring>
#define HIP_IPC_MEM_MIN_SIZE 2097152UL
#define NUM_LOOPS_WARMUP 2000
#define NUM_LOOPS_RUN 10000
#define PING_MODE 0
#define PONG_MODE 1
#define LL_MAX_THREAD 256
#define LL_MAX_THREADS 256
#define LL_MAX_LINES 1000
union LLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
@@ -39,6 +35,8 @@ union LLFifoLine {
int4 i4;
};
#define HIP_IPC_MEM_MIN_SIZE (LL_MAX_THREADS*LL_MAX_LINES*sizeof(LLFifoLine))
__device__ void storeLL(union LLFifoLine* dst, uint64_t val, uint32_t flag) {
union LLFifoLine i4;
i4.data1 = val & 0xffffffff;
@@ -81,15 +79,15 @@ __global__ void PingKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint
int tid = threadIdx.x;
#pragma unroll
for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
storeLL(remote_flag+tid, i, i);
while (readLL(local_flag+tid, i, abortFlag) != i);
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
}
uint64_t start_time, end_time;
if (tid == 0) start_time = wall_clock64();
#pragma unroll
for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
storeLL(remote_flag+tid, i, i);
while (readLL(local_flag+tid, i, abortFlag) != i);
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
}
__syncthreads();
if (tid == 0) end_time = wall_clock64();
@@ -100,15 +98,15 @@ __global__ void PongKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint
int tid = threadIdx.x;
#pragma unroll
for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
while (readLL(local_flag+tid, i, abortFlag) != i);
storeLL(remote_flag+tid, i, i);
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
}
uint64_t start_time, end_time;
if (tid == 0) start_time = wall_clock64();
#pragma unroll
for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
while (readLL(local_flag+tid, i, abortFlag) != i);
storeLL(remote_flag+tid, i, i);
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
}
__syncthreads();
if (tid == 0) end_time = wall_clock64();
@@ -150,7 +148,7 @@ int main(int argc, char** argv) {
HIPCHECK(hipDeviceEnablePeerAccess(device_id[1], 0));
HIPCHECK(hipGetDeviceProperties(&prop[0], device_id[0]));
HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, prop[0].gcnArch / 10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipMalloc((void**)&time_delta[0], HIP_IPC_MEM_MIN_SIZE));
HIPCHECK(hipHostMalloc ((void**)&time_delta[0], sizeof(uint64_t), hipHostMallocDefault));
HIPCHECK(hipMalloc((void**)&abortFlag[0], sizeof(uint32_t)));
HIPCHECK(hipMemsetAsync(flag[0], 0, HIP_IPC_MEM_MIN_SIZE, stream[0]));
HIPCHECK(hipMemsetAsync(abortFlag[0], 0, sizeof(uint32_t), stream[0]));
@@ -161,17 +159,17 @@ int main(int argc, char** argv) {
HIPCHECK(hipDeviceEnablePeerAccess(device_id[0], 0));
HIPCHECK(hipGetDeviceProperties(&prop[1], device_id[1]));
HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, prop[1].gcnArch / 10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipMalloc((void**)&time_delta[1], HIP_IPC_MEM_MIN_SIZE));
HIPCHECK(hipHostMalloc((void**)&time_delta[1], sizeof(uint64_t), hipHostMallocDefault));
HIPCHECK(hipMalloc((void**)&abortFlag[1], sizeof(uint32_t)));
HIPCHECK(hipMemsetAsync(flag[1], 0, HIP_IPC_MEM_MIN_SIZE, stream[1]));
HIPCHECK(hipMemsetAsync(abortFlag[1], 0, sizeof(uint32_t), stream[0]));
HIPCHECK(hipStreamSynchronize(stream[1]));
HIPCHECK(hipSetDevice(device_id[0]));
PingKernel<<<1, LL_MAX_THREAD, 0, stream[0]>>>(flag[0], flag[1], time_delta[0], abortFlag[0]);
PingKernel<<<1, LL_MAX_THREADS, 0, stream[0]>>>(flag[0], flag[1], time_delta[0], abortFlag[0]);
HIPCHECK(hipSetDevice(device_id[1]));
PongKernel<<<1, LL_MAX_THREAD, 0, stream[1]>>>(flag[1], flag[0], time_delta[1], abortFlag[1]);
PongKernel<<<1, LL_MAX_THREADS, 0, stream[1]>>>(flag[1], flag[0], time_delta[1], abortFlag[1]);
double vega_gpu_rtc_freq;
+181
Просмотреть файл
@@ -0,0 +1,181 @@
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Modifications Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
* Licensed under the MIT License.
************************************************************************/
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <fstream>
#include <string>
#include <unistd.h>
#include <cuda_runtime.h>
#include <iostream> //cerr
#include <cstring>
#define NUM_LOOPS_WARMUP 2000
#define NUM_LOOPS_RUN 10000
#define LL_MAX_THREADS 256
#define LL_MAX_LINES 1000
union LLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
#define HIP_IPC_MEM_MIN_SIZE (LL_MAX_THREADS*LL_MAX_LINES*sizeof(LLFifoLine))
__device__ void storeLL(union LLFifoLine* dst, uint64_t val, uint32_t flag) {
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
}
#define LL_SPINS_BEFORE_CHECK_ABORT 1000000
inline __device__ int checkAbort(int &spins, uint32_t* abortFlag) {
uint32_t abort = 0;
spins++;
if (spins == LL_SPINS_BEFORE_CHECK_ABORT) {
abort = *((volatile uint32_t*)abortFlag);
spins = 0;
}
return abort;
}
__device__ uint64_t readLL(union LLFifoLine* src, uint32_t flag, uint32_t* abortFlag) {
int spins = 0;
union LLFifoLine i4;
do {
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(i4.data1), "=r"(i4.flag1), "=r"(i4.data2), "=r"(i4.flag2) : "l"(&src->i4));
if (checkAbort(spins, abortFlag)) break;
} while ((i4.flag1 != flag) || (i4.flag2 != flag));
uint64_t val64 = (uint64_t)(i4.data1) + (((uint64_t)i4.data2) << 32);
return val64;
}
__global__ void PingKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint64_t* time_delta, uint32_t* abortFlag) {
int tid = threadIdx.x;
#pragma unroll
for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
}
uint64_t start_time, end_time;
if (tid == 0) start_time = clock64();
#pragma unroll
for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
}
__syncthreads();
if (tid == 0) end_time = clock64();
if (tid == 0) *time_delta = end_time - start_time;
}
__global__ void PongKernel(LLFifoLine* local_flag, LLFifoLine* remote_flag, uint64_t* time_delta, uint32_t* abortFlag) {
int tid = threadIdx.x;
#pragma unroll
for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
}
uint64_t start_time, end_time;
if (tid == 0) start_time = clock64();
#pragma unroll
for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
while (readLL(local_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, abortFlag) != i);
storeLL(remote_flag+tid+(i%LL_MAX_LINES)*LL_MAX_THREADS, i, i);
}
__syncthreads();
if (tid == 0) end_time = clock64();
if (tid == 0) *time_delta = end_time - start_time;
}
#define HIPCHECK(cmd) \
do { \
cudaError_t error = (cmd); \
if (error != cudaSuccess) \
{ \
std::cerr << "Encountered HIP error (" << error << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
int main(int argc, char** argv) {
cudaStream_t stream[2];
int device_id[2];
cudaDeviceProp prop[2];
if (argc != 3) {
fprintf(stderr, "Usage: ./ll_latency_test ping_dev_id pong_dev_id\n");
return -1;
}
device_id[0] = atoi(argv[1]);
device_id[1] = atoi(argv[2]);
fprintf(stdout, "Using devices %d %d\n", device_id[0], device_id[1]);
LLFifoLine *flag[2];
uint64_t *time_delta[2];
uint32_t *abortFlag[2];
HIPCHECK(cudaSetDevice(device_id[0]));
HIPCHECK(cudaStreamCreateWithFlags(&stream[0], cudaStreamNonBlocking));
HIPCHECK(cudaDeviceEnablePeerAccess(device_id[1], 0));
HIPCHECK(cudaGetDeviceProperties(&prop[0], device_id[0]));
HIPCHECK(cudaMalloc((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE));
HIPCHECK(cudaHostAlloc ((void**)&time_delta[0], sizeof(uint64_t), cudaHostAllocDefault));
HIPCHECK(cudaMalloc((void**)&abortFlag[0], sizeof(uint32_t)));
HIPCHECK(cudaMemsetAsync(flag[0], 0, HIP_IPC_MEM_MIN_SIZE, stream[0]));
HIPCHECK(cudaMemsetAsync(abortFlag[0], 0, sizeof(uint32_t), stream[0]));
HIPCHECK(cudaStreamSynchronize(stream[0]));
HIPCHECK(cudaSetDevice(device_id[1]));
HIPCHECK(cudaStreamCreateWithFlags(&stream[1], cudaStreamNonBlocking));
HIPCHECK(cudaDeviceEnablePeerAccess(device_id[0], 0));
HIPCHECK(cudaGetDeviceProperties(&prop[1], device_id[1]));
HIPCHECK(cudaMalloc((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE));
HIPCHECK(cudaHostAlloc((void**)&time_delta[1], sizeof(uint64_t), cudaHostAllocDefault));
HIPCHECK(cudaMalloc((void**)&abortFlag[1], sizeof(uint32_t)));
HIPCHECK(cudaMemsetAsync(flag[1], 0, HIP_IPC_MEM_MIN_SIZE, stream[1]));
HIPCHECK(cudaMemsetAsync(abortFlag[1], 0, sizeof(uint32_t), stream[0]));
HIPCHECK(cudaStreamSynchronize(stream[1]));
HIPCHECK(cudaSetDevice(device_id[0]));
PingKernel<<<1, LL_MAX_THREADS, 0, stream[0]>>>(flag[0], flag[1], time_delta[0], abortFlag[0]);
HIPCHECK(cudaSetDevice(device_id[1]));
PongKernel<<<1, LL_MAX_THREADS, 0, stream[1]>>>(flag[1], flag[0], time_delta[1], abortFlag[1]);
double gpu_rtc_freq;
HIPCHECK(cudaStreamSynchronize(stream[0]));
gpu_rtc_freq = prop[0].clockRate*1.0E3;
fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[0]) * 1e6 / NUM_LOOPS_RUN / gpu_rtc_freq / 2);
HIPCHECK(cudaStreamSynchronize(stream[1]));
gpu_rtc_freq = prop[1].clockRate*1.0E3;
fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[1]) * 1e6 / NUM_LOOPS_RUN / gpu_rtc_freq / 2);
HIPCHECK(cudaFree(flag[0]));
HIPCHECK(cudaFreeHost(time_delta[0]));
HIPCHECK(cudaFree(abortFlag[0]));
HIPCHECK(cudaFree(flag[1]));
HIPCHECK(cudaFreeHost(time_delta[1]));
HIPCHECK(cudaFree(abortFlag[1]));
return 0;
}