diff --git a/tools/p2p-latency-test/Makefile b/tools/p2p-latency-test/Makefile index ca02daf4de..5697f24d54 100644 --- a/tools/p2p-latency-test/Makefile +++ b/tools/p2p-latency-test/Makefile @@ -1,18 +1,23 @@ # Copyright (c) Microsoft Corporation. +# Modifications Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # Licensed under the MIT License. -HIP_PATH ?= $(wildcard /opt/rocm) -ifeq (,$(HIP_PATH)) -HIP_PATH = ../../.. -endif -HIPCC = $(HIP_PATH)/bin/hipcc +# +# See LICENSE.txt for license information + +ROCM_PATH ?= $(wildcard /opt/rocm) +HIPCC = $(ROCM_PATH)/bin/hipcc all: p2p_latency_test ll_latency_test CXXFLAGS = -g -O3 + p2p_latency_test: p2p_latency_test.cpp + @printf "Compiling %-35s > %s\n" $< $@ $(HIPCC) $(CXXFLAGS) $^ -o $@ + ll_latency_test: ll_latency_test.cpp + @printf "Compiling %-35s > %s\n" $< $@ $(HIPCC) $(CXXFLAGS) $^ -o $@ clean: - rm -f *.o $(EXE) + rm -f *.o p2p_latency_test ll_latency_test diff --git a/tools/p2p-latency-test/README.md b/tools/p2p-latency-test/README.md new file mode 100644 index 0000000000..46f98798ae --- /dev/null +++ b/tools/p2p-latency-test/README.md @@ -0,0 +1,14 @@ +# P2P Latency Tests +Microbenchmarks to test Peer-2-Peer and Low Latency protocols on AMD GPUs. + +## How-to-use? + +To build and run these tests, follow these steps: +1. Navigate to the `p2p-latency-test` directory. +2. Run `bash build_and_run.sh` + +```bash +cd rccl/tools/p2p-latency-test +bash build_and_run.sh +``` + diff --git a/tools/p2p-latency-test/build_and_run.sh b/tools/p2p-latency-test/build_and_run.sh index c3e87d202b..73810c34dd 100644 --- a/tools/p2p-latency-test/build_and_run.sh +++ b/tools/p2p-latency-test/build_and_run.sh @@ -1,25 +1,31 @@ # Copyright (c) Microsoft Corporation. +# Modifications Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # Licensed under the MIT License. -make +make clean +make -j # Example run: test one-way latency between GPU 0 and GPU 1 in both directions. export HSA_FORCE_FINE_GRAIN_PCIE=1 echo Running p2p_latency_test using GPU pair 0 1 ./p2p_latency_test 0 1 +echo "" sleep 1 echo Running p2p_latency_test using GPU pair 1 0 ./p2p_latency_test 1 0 +echo "" sleep 1 echo Running ll_latency_test using GPU pair 0 1 ./ll_latency_test 0 1 +echo "" sleep 1 echo Running ll_latency_test using GPU pair 1 0 ./ll_latency_test 1 0 +echo "" diff --git a/tools/p2p-latency-test/ll_latency_test.cpp b/tools/p2p-latency-test/ll_latency_test.cpp index f049c9db73..6fee02af5a 100644 --- a/tools/p2p-latency-test/ll_latency_test.cpp +++ b/tools/p2p-latency-test/ll_latency_test.cpp @@ -147,7 +147,7 @@ int main(int argc, char** argv) { HIPCHECK(hipStreamCreateWithFlags(&stream[0], hipStreamNonBlocking)); HIPCHECK(hipDeviceEnablePeerAccess(device_id[1], 0)); HIPCHECK(hipGetDeviceProperties(&prop[0], device_id[0])); - HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained)); + HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained)); HIPCHECK(hipHostMalloc ((void**)&time_delta[0], sizeof(uint64_t), hipHostMallocDefault)); HIPCHECK(hipMalloc((void**)&abortFlag[0], sizeof(uint32_t))); HIPCHECK(hipMemsetAsync(flag[0], 0, HIP_IPC_MEM_MIN_SIZE, stream[0])); @@ -158,7 +158,7 @@ int main(int argc, char** argv) { HIPCHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking)); HIPCHECK(hipDeviceEnablePeerAccess(device_id[0], 0)); HIPCHECK(hipGetDeviceProperties(&prop[1], device_id[1])); - HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained)); + HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained)); HIPCHECK(hipHostMalloc((void**)&time_delta[1], sizeof(uint64_t), hipHostMallocDefault)); HIPCHECK(hipMalloc((void**)&abortFlag[1], sizeof(uint32_t))); HIPCHECK(hipMemsetAsync(flag[1], 0, HIP_IPC_MEM_MIN_SIZE, stream[1])); @@ -174,11 +174,11 @@ int main(int argc, char** argv) { double vega_gpu_rtc_freq; HIPCHECK(hipStreamSynchronize(stream[0])); - vega_gpu_rtc_freq = strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7; + vega_gpu_rtc_freq = (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7; fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[0]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2); HIPCHECK(hipStreamSynchronize(stream[1])); - vega_gpu_rtc_freq = strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7; + vega_gpu_rtc_freq = (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7; fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[1]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2); HIPCHECK(hipFree(flag[0])); @@ -188,4 +188,4 @@ int main(int argc, char** argv) { HIPCHECK(hipFree(time_delta[1])); HIPCHECK(hipFree(abortFlag[1])); return 0; -} \ No newline at end of file +} diff --git a/tools/p2p-latency-test/p2p_latency_test.cpp b/tools/p2p-latency-test/p2p_latency_test.cpp index da98c8b2bc..a8e74cec5b 100644 --- a/tools/p2p-latency-test/p2p_latency_test.cpp +++ b/tools/p2p-latency-test/p2p_latency_test.cpp @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) Microsoft Corporation. + * Modifications Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. * Licensed under the MIT License. ************************************************************************/ @@ -86,7 +87,7 @@ int main(int argc, char** argv) { HIPCHECK(hipStreamCreateWithFlags(&stream[0], hipStreamNonBlocking)); HIPCHECK(hipDeviceEnablePeerAccess(device_id[1], 0)); HIPCHECK(hipGetDeviceProperties(&prop[0], device_id[0])); - HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained)); + HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained)); HIPCHECK(hipMalloc((void**)&time_delta[0], HIP_IPC_MEM_MIN_SIZE)); HIPCHECK(hipMemsetAsync(flag[0], 0, HIP_IPC_MEM_MIN_SIZE, stream[0])); HIPCHECK(hipStreamSynchronize(stream[0])); @@ -95,7 +96,7 @@ int main(int argc, char** argv) { HIPCHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking)); HIPCHECK(hipDeviceEnablePeerAccess(device_id[0], 0)); HIPCHECK(hipGetDeviceProperties(&prop[1], device_id[1])); - HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained)); + HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? hipDeviceMallocUncached : hipDeviceMallocFinegrained)); HIPCHECK(hipMalloc((void**)&time_delta[1], HIP_IPC_MEM_MIN_SIZE)); HIPCHECK(hipMemsetAsync(flag[1], 0, HIP_IPC_MEM_MIN_SIZE, stream[1])); HIPCHECK(hipStreamSynchronize(stream[1])); @@ -109,11 +110,11 @@ int main(int argc, char** argv) { double vega_gpu_rtc_freq; HIPCHECK(hipStreamSynchronize(stream[0])); - vega_gpu_rtc_freq = strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7; + vega_gpu_rtc_freq = (strncmp(prop[0].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[0].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7; fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[0]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2); HIPCHECK(hipStreamSynchronize(stream[1])); - vega_gpu_rtc_freq = strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 ? 1.0E8 : 2.5E7; + vega_gpu_rtc_freq = (strncmp(prop[1].gcnArchName, "gfx942", 6) == 0 || strncmp(prop[1].gcnArchName, "gfx950", 6) == 0) ? 1.0E8 : 2.5E7; fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[1]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2); HIPCHECK(hipFree(flag[0]));