[TOOLS] Update p2p-latency-test for gfx950 (#1730)

This commit is contained in:
Nilesh M Negi
2025-07-10 12:13:29 -05:00
committato da GitHub
parent 2c099fe29a
commit f839e4edef
5 ha cambiato i file con 42 aggiunte e 16 eliminazioni
+11 -6
Vedi File
@@ -1,18 +1,23 @@
# Copyright (c) Microsoft Corporation.
# Modifications Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# Licensed under the MIT License.
HIP_PATH ?= $(wildcard /opt/rocm)
ifeq (,$(HIP_PATH))
HIP_PATH = ../../..
endif
HIPCC = $(HIP_PATH)/bin/hipcc
#
# See LICENSE.txt for license information
ROCM_PATH ?= $(wildcard /opt/rocm)
HIPCC = $(ROCM_PATH)/bin/hipcc
all: p2p_latency_test ll_latency_test
CXXFLAGS = -g -O3
p2p_latency_test: p2p_latency_test.cpp
@printf "Compiling %-35s > %s\n" $< $@
$(HIPCC) $(CXXFLAGS) $^ -o $@
ll_latency_test: ll_latency_test.cpp
@printf "Compiling %-35s > %s\n" $< $@
$(HIPCC) $(CXXFLAGS) $^ -o $@
clean:
rm -f *.o $(EXE)
rm -f *.o p2p_latency_test ll_latency_test
+14
Vedi File
@@ -0,0 +1,14 @@
# P2P Latency Tests
Microbenchmarks to test Peer-2-Peer and Low Latency protocols on AMD GPUs.
## How-to-use?
To build and run these tests, follow these steps:
1. Navigate to the `p2p-latency-test` directory.
2. Run `bash build_and_run.sh`
```bash
cd rccl/tools/p2p-latency-test
bash build_and_run.sh
```
+7 -1
Vedi File
@@ -1,25 +1,31 @@
# Copyright (c) Microsoft Corporation.
# Modifications Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# Licensed under the MIT License.
make
make clean
make -j
# Example run: test one-way latency between GPU 0 and GPU 1 in both directions.
export HSA_FORCE_FINE_GRAIN_PCIE=1
echo Running p2p_latency_test using GPU pair 0 1
./p2p_latency_test 0 1
echo ""
sleep 1
echo Running p2p_latency_test using GPU pair 1 0
./p2p_latency_test 1 0
echo ""
sleep 1
echo Running ll_latency_test using GPU pair 0 1
./ll_latency_test 0 1
echo ""
sleep 1
echo Running ll_latency_test using GPU pair 1 0
./ll_latency_test 1 0
echo ""
@@ -147,7 +147,7 @@ int main(int argc, char** argv) {
HIPCHECK(hipStreamCreateWithFlags(&stream[0], hipStreamNonBlocking));
HIPCHECK(hipDeviceEnablePeerAccess(device_id[1], 0));
HIPCHECK(hipGetDeviceProperties(&prop[0], device_id[0]));
HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipHostMalloc ((void**)&time_delta[0], sizeof(uint64_t), hipHostMallocDefault));
HIPCHECK(hipMalloc((void**)&abortFlag[0], sizeof(uint32_t)));
HIPCHECK(hipMemsetAsync(flag[0], 0, HIP_IPC_MEM_MIN_SIZE, stream[0]));
@@ -158,7 +158,7 @@ int main(int argc, char** argv) {
HIPCHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
HIPCHECK(hipDeviceEnablePeerAccess(device_id[0], 0));
HIPCHECK(hipGetDeviceProperties(&prop[1], device_id[1]));
HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipHostMalloc((void**)&time_delta[1], sizeof(uint64_t), hipHostMallocDefault));
HIPCHECK(hipMalloc((void**)&abortFlag[1], sizeof(uint32_t)));
HIPCHECK(hipMemsetAsync(flag[1], 0, HIP_IPC_MEM_MIN_SIZE, stream[1]));
@@ -174,11 +174,11 @@ int main(int argc, char** argv) {
double vega_gpu_rtc_freq;
HIPCHECK(hipStreamSynchronize(stream[0]));
vega_gpu_rtc_freq = strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7;
vega_gpu_rtc_freq = (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7;
fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[0]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2);
HIPCHECK(hipStreamSynchronize(stream[1]));
vega_gpu_rtc_freq = strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7;
vega_gpu_rtc_freq = (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7;
fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[1]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2);
HIPCHECK(hipFree(flag[0]));
@@ -188,4 +188,4 @@ int main(int argc, char** argv) {
HIPCHECK(hipFree(time_delta[1]));
HIPCHECK(hipFree(abortFlag[1]));
return 0;
}
}
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Modifications Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
* Licensed under the MIT License.
************************************************************************/
@@ -86,7 +87,7 @@ int main(int argc, char** argv) {
HIPCHECK(hipStreamCreateWithFlags(&stream[0], hipStreamNonBlocking));
HIPCHECK(hipDeviceEnablePeerAccess(device_id[1], 0));
HIPCHECK(hipGetDeviceProperties(&prop[0], device_id[0]));
HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipMalloc((void**)&time_delta[0], HIP_IPC_MEM_MIN_SIZE));
HIPCHECK(hipMemsetAsync(flag[0], 0, HIP_IPC_MEM_MIN_SIZE, stream[0]));
HIPCHECK(hipStreamSynchronize(stream[0]));
@@ -95,7 +96,7 @@ int main(int argc, char** argv) {
HIPCHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
HIPCHECK(hipDeviceEnablePeerAccess(device_id[0], 0));
HIPCHECK(hipGetDeviceProperties(&prop[1], device_id[1]));
HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipMalloc((void**)&time_delta[1], HIP_IPC_MEM_MIN_SIZE));
HIPCHECK(hipMemsetAsync(flag[1], 0, HIP_IPC_MEM_MIN_SIZE, stream[1]));
HIPCHECK(hipStreamSynchronize(stream[1]));
@@ -109,11 +110,11 @@ int main(int argc, char** argv) {
double vega_gpu_rtc_freq;
HIPCHECK(hipStreamSynchronize(stream[0]));
vega_gpu_rtc_freq = strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7;
vega_gpu_rtc_freq = (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7;
fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[0]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2);
HIPCHECK(hipStreamSynchronize(stream[1]));
vega_gpu_rtc_freq = strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7;
vega_gpu_rtc_freq = (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7;
fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[1]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2);
HIPCHECK(hipFree(flag[0]));