[TOOLS] Update p2p-latency-test for gfx950 (#1730)
This commit is contained in:
@@ -1,18 +1,23 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Modifications Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# Licensed under the MIT License.
|
||||
HIP_PATH ?= $(wildcard /opt/rocm)
|
||||
ifeq (,$(HIP_PATH))
|
||||
HIP_PATH = ../../..
|
||||
endif
|
||||
HIPCC = $(HIP_PATH)/bin/hipcc
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
|
||||
ROCM_PATH ?= $(wildcard /opt/rocm)
|
||||
HIPCC = $(ROCM_PATH)/bin/hipcc
|
||||
|
||||
all: p2p_latency_test ll_latency_test
|
||||
|
||||
CXXFLAGS = -g -O3
|
||||
|
||||
p2p_latency_test: p2p_latency_test.cpp
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
$(HIPCC) $(CXXFLAGS) $^ -o $@
|
||||
|
||||
ll_latency_test: ll_latency_test.cpp
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
$(HIPCC) $(CXXFLAGS) $^ -o $@
|
||||
|
||||
clean:
|
||||
rm -f *.o $(EXE)
|
||||
rm -f *.o p2p_latency_test ll_latency_test
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
# P2P Latency Tests
|
||||
Microbenchmarks to test Peer-2-Peer and Low Latency protocols on AMD GPUs.
|
||||
|
||||
## How-to-use?
|
||||
|
||||
To build and run these tests, follow these steps:
|
||||
1. Navigate to the `p2p-latency-test` directory.
|
||||
2. Run `bash build_and_run.sh`
|
||||
|
||||
```bash
|
||||
cd rccl/tools/p2p-latency-test
|
||||
bash build_and_run.sh
|
||||
```
|
||||
|
||||
@@ -1,25 +1,31 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Modifications Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
make
|
||||
make clean
|
||||
make -j
|
||||
|
||||
# Example run: test one-way latency between GPU 0 and GPU 1 in both directions.
|
||||
export HSA_FORCE_FINE_GRAIN_PCIE=1
|
||||
|
||||
echo Running p2p_latency_test using GPU pair 0 1
|
||||
./p2p_latency_test 0 1
|
||||
echo ""
|
||||
|
||||
sleep 1
|
||||
|
||||
echo Running p2p_latency_test using GPU pair 1 0
|
||||
./p2p_latency_test 1 0
|
||||
echo ""
|
||||
|
||||
sleep 1
|
||||
|
||||
echo Running ll_latency_test using GPU pair 0 1
|
||||
./ll_latency_test 0 1
|
||||
echo ""
|
||||
|
||||
sleep 1
|
||||
|
||||
echo Running ll_latency_test using GPU pair 1 0
|
||||
./ll_latency_test 1 0
|
||||
echo ""
|
||||
|
||||
@@ -147,7 +147,7 @@ int main(int argc, char** argv) {
|
||||
HIPCHECK(hipStreamCreateWithFlags(&stream[0], hipStreamNonBlocking));
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(device_id[1], 0));
|
||||
HIPCHECK(hipGetDeviceProperties(&prop[0], device_id[0]));
|
||||
HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
|
||||
HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
|
||||
HIPCHECK(hipHostMalloc ((void**)&time_delta[0], sizeof(uint64_t), hipHostMallocDefault));
|
||||
HIPCHECK(hipMalloc((void**)&abortFlag[0], sizeof(uint32_t)));
|
||||
HIPCHECK(hipMemsetAsync(flag[0], 0, HIP_IPC_MEM_MIN_SIZE, stream[0]));
|
||||
@@ -158,7 +158,7 @@ int main(int argc, char** argv) {
|
||||
HIPCHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(device_id[0], 0));
|
||||
HIPCHECK(hipGetDeviceProperties(&prop[1], device_id[1]));
|
||||
HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
|
||||
HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
|
||||
HIPCHECK(hipHostMalloc((void**)&time_delta[1], sizeof(uint64_t), hipHostMallocDefault));
|
||||
HIPCHECK(hipMalloc((void**)&abortFlag[1], sizeof(uint32_t)));
|
||||
HIPCHECK(hipMemsetAsync(flag[1], 0, HIP_IPC_MEM_MIN_SIZE, stream[1]));
|
||||
@@ -174,11 +174,11 @@ int main(int argc, char** argv) {
|
||||
double vega_gpu_rtc_freq;
|
||||
|
||||
HIPCHECK(hipStreamSynchronize(stream[0]));
|
||||
vega_gpu_rtc_freq = strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7;
|
||||
vega_gpu_rtc_freq = (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7;
|
||||
fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[0]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2);
|
||||
|
||||
HIPCHECK(hipStreamSynchronize(stream[1]));
|
||||
vega_gpu_rtc_freq = strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7;
|
||||
vega_gpu_rtc_freq = (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7;
|
||||
fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[1]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2);
|
||||
|
||||
HIPCHECK(hipFree(flag[0]));
|
||||
@@ -188,4 +188,4 @@ int main(int argc, char** argv) {
|
||||
HIPCHECK(hipFree(time_delta[1]));
|
||||
HIPCHECK(hipFree(abortFlag[1]));
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) Microsoft Corporation.
|
||||
* Modifications Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Licensed under the MIT License.
|
||||
************************************************************************/
|
||||
|
||||
@@ -86,7 +87,7 @@ int main(int argc, char** argv) {
|
||||
HIPCHECK(hipStreamCreateWithFlags(&stream[0], hipStreamNonBlocking));
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(device_id[1], 0));
|
||||
HIPCHECK(hipGetDeviceProperties(&prop[0], device_id[0]));
|
||||
HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
|
||||
HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
|
||||
HIPCHECK(hipMalloc((void**)&time_delta[0], HIP_IPC_MEM_MIN_SIZE));
|
||||
HIPCHECK(hipMemsetAsync(flag[0], 0, HIP_IPC_MEM_MIN_SIZE, stream[0]));
|
||||
HIPCHECK(hipStreamSynchronize(stream[0]));
|
||||
@@ -95,7 +96,7 @@ int main(int argc, char** argv) {
|
||||
HIPCHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(device_id[0], 0));
|
||||
HIPCHECK(hipGetDeviceProperties(&prop[1], device_id[1]));
|
||||
HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
|
||||
HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
|
||||
HIPCHECK(hipMalloc((void**)&time_delta[1], HIP_IPC_MEM_MIN_SIZE));
|
||||
HIPCHECK(hipMemsetAsync(flag[1], 0, HIP_IPC_MEM_MIN_SIZE, stream[1]));
|
||||
HIPCHECK(hipStreamSynchronize(stream[1]));
|
||||
@@ -109,11 +110,11 @@ int main(int argc, char** argv) {
|
||||
double vega_gpu_rtc_freq;
|
||||
|
||||
HIPCHECK(hipStreamSynchronize(stream[0]));
|
||||
vega_gpu_rtc_freq = strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7;
|
||||
vega_gpu_rtc_freq = (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7;
|
||||
fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[0]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2);
|
||||
|
||||
HIPCHECK(hipStreamSynchronize(stream[1]));
|
||||
vega_gpu_rtc_freq = strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7;
|
||||
vega_gpu_rtc_freq = (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7;
|
||||
fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[1]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2);
|
||||
|
||||
HIPCHECK(hipFree(flag[0]));
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user