/****************************************************************************** * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
*****************************************************************************/ #include "tester.hpp" #include #include #include #include #include #include #include "alltoall_tester.hpp" #include "amo_bitwise_tester.hpp" #include "amo_extended_tester.hpp" #include "amo_standard_tester.hpp" #include "barrier_all_tester.hpp" #include "broadcast_tester.hpp" #include "empty_tester.hpp" #include "extended_primitives.hpp" #include "fcollect_tester.hpp" #include "ping_all_tester.hpp" #include "ping_pong_tester.hpp" #include "primitive_mr_tester.hpp" #include "primitive_tester.hpp" #include "random_access_tester.hpp" #include "reduction_tester.hpp" #include "shmem_ptr_tester.hpp" #include "swarm_tester.hpp" #include "sync_tester.hpp" #include "team_broadcast_tester.hpp" #include "team_ctx_infra_tester.hpp" #include "team_ctx_primitive_tester.hpp" #include "team_reduction_tester.hpp" /** * Stores the test type and shmem context taken from args, then creates the * HIP stream, the start/stop events and the timer array (one uint64_t per * work-group, args.num_wgs entries, allocated with hipMalloc). */ Tester::Tester(TesterArguments args) : args(args) { _type = (TestType)args.algorithm; _shmem_context = args.shmem_context; CHECK_HIP(hipStreamCreate(&stream)); CHECK_HIP(hipEventCreate(&start_event)); CHECK_HIP(hipEventCreate(&stop_event)); CHECK_HIP(hipMalloc((void**)&timer, sizeof(uint64_t) * args.num_wgs)); } /** * Frees the timer array and destroys the events and the stream, in the * reverse order of their creation in the constructor. */ Tester::~Tester() { CHECK_HIP(hipFree(timer)); CHECK_HIP(hipEventDestroy(stop_event)); CHECK_HIP(hipEventDestroy(start_event)); CHECK_HIP(hipStreamDestroy(stream)); } /** * Factory: builds the tester object(s) for args.algorithm and returns them * in a vector. Rank 0 (args.myid == 0) prints a banner naming the selected * test. An unrecognised algorithm value prints "Unknown" and falls back to a * PrimitiveTester. * * Every tester is allocated with new and returned by raw pointer; nothing in * this function frees them. NOTE(review): ownership presumably passes to the * caller - confirm at the call site. * * Every case, including default, returns from inside the switch, so the * return statement after the switch is never reached. */ std::vector Tester::create(TesterArguments args) { int rank = args.myid; std::vector testers; if (rank == 0) std::cout << "*** Creating Test: "; TestType type = (TestType)args.algorithm; switch (type) { case InitTestType: if (rank == 0) std::cout << "Init ***" << std::endl; testers.push_back(new EmptyTester(args)); return testers; case GetTestType: if (rank == 0) std::cout << "Blocking Gets***" << std::endl; testers.push_back(new PrimitiveTester(args)); return testers; case GetNBITestType: if (rank == 0) std::cout << "Non-Blocking Gets***" << std::endl; testers.push_back(new PrimitiveTester(args)); return testers; case 
PutTestType: if (rank == 0) std::cout << "Blocking Puts***" << std::endl; testers.push_back(new PrimitiveTester(args)); return testers; case PutNBITestType: if (rank == 0) std::cout << "Non-Blocking Puts***" << std::endl; testers.push_back(new PrimitiveTester(args)); return testers; case TeamCtxInfraTestType: if (rank == 0) std::cout << "Team Ctx Infra test***" << std::endl; testers.push_back(new TeamCtxInfraTester(args)); return testers; case TeamCtxGetTestType: if (rank == 0) std::cout << "Blocking Team Ctx Gets***" << std::endl; testers.push_back(new TeamCtxPrimitiveTester(args)); return testers; case TeamCtxGetNBITestType: if (rank == 0) std::cout << "Non-Blocking Team Ctx Gets***" << std::endl; testers.push_back(new TeamCtxPrimitiveTester(args)); return testers; case TeamCtxPutTestType: if (rank == 0) std::cout << "Blocking Team Ctx Puts***" << std::endl; testers.push_back(new TeamCtxPrimitiveTester(args)); return testers; case TeamCtxPutNBITestType: if (rank == 0) std::cout << "Non-Blocking Team Ctx Puts***" << std::endl; testers.push_back(new TeamCtxPrimitiveTester(args)); return testers; case PTestType: if (rank == 0) std::cout << "P Test***" << std::endl; testers.push_back(new PrimitiveTester(args)); return testers; case GTestType: if (rank == 0) std::cout << "G Test***" << std::endl; testers.push_back(new PrimitiveTester(args)); return testers; case GetSwarmTestType: if (rank == 0) std::cout << "Get Swarm***" << std::endl; testers.push_back(new GetSwarmTester(args)); return testers; case TeamReductionTestType: if (rank == 0) std::cout << "All-to-All Team-based Reduction***" << std::endl; testers.push_back(new TeamReductionTester( args, [](float& f1, float& f2) { f1 = 1; f2 = 1; }, [](float v, float n_pes) { return (v == n_pes) ? 
std::make_pair(true, "") : std::make_pair(false, "Got " + std::to_string(v) + ", Expect " + std::to_string(n_pes)); })); return testers; case ReductionTestType: if (rank == 0) std::cout << "All-to-All Reduction***" << std::endl; testers.push_back(new ReductionTester( args, [](float& f1, float& f2) { f1 = 1; f2 = 1; }, [](float v, float n_pes) { return (v == n_pes) ? std::make_pair(true, "") : std::make_pair(false, "Got " + std::to_string(v) + ", Expect " + std::to_string(n_pes)); })); #if 0 testers.push_back( new ReductionTester( args, [](double& f1, double& f2) { f1=1; f2=1; }, [](double v, double n_pes) { return (v == n_pes) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) + ", Expect " + std::to_string(n_pes)); } ) ); testers.push_back( new ReductionTester(args, [](long double& f1,long double& f2){f1=1; f2=1;}, [](long double v){ return (v==2.0) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 2.0. [r3]."); })); testers.push_back( new ReductionTester(args, [](short& f1, short& f2){f1=1; f2=2;}, [](short v){ return (v==3) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 3. [r4]."); })); testers.push_back( new ReductionTester(args, [](int& f1, int& f2){f1=1; f2=2;}, [](int v){ return (v==3) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 3. [r5]."); })); testers.push_back( new ReductionTester(args, [](long long& f1, long long& f2){f1=1; f2=2;}, [](long long v){ return (v==3) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 3. [r6]."); })); testers.push_back( new ReductionTester(args, [](long long& f1, long long& f2){f1=1; f2=2;}, [](long long v){ return (v==3) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 3. 
[r7]."); })); // seems like deadlock or something, this test hangs forever testers.push_back( new ReductionTester(args, [](short& f1, short& f2){f1=1; f2=2;}, [](short v){ return (v==1) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 1. [r8]."); })); testers.push_back( new ReductionTester(args, [](int& f1, int& f2){f1=1; f2=2;}, [](int v){ return (v==1) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 1. [r9]."); })); testers.push_back( new ReductionTester(args, [](long long& f1, long long& f2){f1=1; f2=2;}, [](long long v){ return (v==1) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 1. [r10]."); })); testers.push_back( new ReductionTester(args, [](long long& f1, long long& f2){f1=1; f2=2;}, [](long long v){ return (v==1) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 1. [r11]."); })); testers.push_back( new ReductionTester(args, [](int& f1, int& f2){f1=1; f2=2;}, [](int v){ return (v==2) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 2. [r12]."); })); testers.push_back( new ReductionTester(args, [](long long& f1, long long& f2){f1=1; f2=2;}, [](long long v){ return (v==2) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 2. [r13]."); })); testers.push_back( new ReductionTester(args, [](long long& f1, long long& f2){f1=1; f2=2;}, [](long long v){ return (v==2) ? std::make_pair(true, "") : std::make_pair(false, "Got "+ std::to_string(v) +", Expect 2. 
[r14]."); })); #endif return testers; case BroadcastTestType: if (rank == 0) { std::cout << "Broadcast Test***" << std::endl; } testers.push_back(new BroadcastTester( args, [](long& f1, long& f2) { f1 = 1; f2 = 2; }, [rank](long v) { long expected_val; /** * The verification routine here requires that the * PE_root value is 0 which denotes that the * sending processing element is rank 0. * * The difference in expected values arises from * the specification for broadcast where the * PE_root processing element does not copy the * contents from its own source to dest during * the broadcast. */ if (rank == 0) { expected_val = 2; } else { expected_val = 1; } return (v == expected_val) ? std::make_pair(true, "") : std::make_pair( false, "Rank " + std::to_string(rank) + ", Got " + std::to_string(v) + ", Expect " + std::to_string(expected_val)); })); return testers; case TeamBroadcastTestType: if (rank == 0) { std::cout << "Team Broadcast Test***" << std::endl; } testers.push_back(new TeamBroadcastTester( args, [](long& f1, long& f2) { f1 = 1; f2 = 2; }, [rank](long v) { long expected_val; /** * The verification routine here requires that the * PE_root value is 0 which denotes that the * sending processing element is rank 0. * * The difference in expected values arises from * the specification for broadcast where the * PE_root processing element does not copy the * contents from its own source to dest during * the broadcast. */ if (rank == 0) { expected_val = 2; } else { expected_val = 1; } return (v == expected_val) ? 
std::make_pair(true, "") : std::make_pair( false, "Rank " + std::to_string(rank) + ", Got " + std::to_string(v) + ", Expect " + std::to_string(expected_val)); })); return testers; case AllToAllTestType: if (rank == 0) { std::cout << "Alltoall Test***" << std::endl; } testers.push_back(new AlltoallTester( args, [rank](int64_t& f1, int64_t& f2, int64_t dest_pe) { const long SRC_SHIFT = 16; // Make value for each src, dst pair unique // by shifting src by SRC_SHIFT bits f1 = (rank << SRC_SHIFT) + dest_pe; f2 = -1; }, [rank](int64_t v, int64_t src_pe) { const long SRC_SHIFT = 16; // See if we obtained unique value long expected_val = (src_pe << SRC_SHIFT) + rank; return (v == expected_val) ? std::make_pair(true, "") : std::make_pair( false, "Rank " + std::to_string(rank) + ", Got " + std::to_string(v) + ", Expect " + std::to_string(expected_val)); })); return testers; case FCollectTestType: if (rank == 0) { std::cout << "Fcollect Test***" << std::endl; } testers.push_back(new FcollectTester( args, [rank](int64_t& f1, int64_t& f2) { f1 = rank; f2 = -1; }, [rank](int64_t v, int64_t src_pe) { int64_t expected_val = src_pe; return (v == expected_val) ? 
std::make_pair(true, "") : std::make_pair( false, "Rank " + std::to_string(rank) + ", Got " + std::to_string(v) + ", Expect " + std::to_string(expected_val)); })); return testers; case AMO_FAddTestType: if (rank == 0) std::cout << "AMO Fetch_Add***" << std::endl; testers.push_back(new AMOStandardTester(args)); testers.push_back(new AMOStandardTester(args)); testers.push_back(new AMOStandardTester(args)); return testers; case AMO_FIncTestType: if (rank == 0) std::cout << "AMO Fetch_Inc***" << std::endl; testers.push_back(new AMOStandardTester(args)); testers.push_back(new AMOStandardTester(args)); testers.push_back(new AMOStandardTester(args)); return testers; case AMO_FetchTestType: if (rank == 0) std::cout << "AMO Fetch***" << std::endl; testers.push_back(new AMOExtendedTester(args)); testers.push_back(new AMOExtendedTester(args)); testers.push_back(new AMOExtendedTester(args)); return testers; case AMO_FCswapTestType: if (rank == 0) std::cout << "AMO Fetch_CSWAP***" << std::endl; testers.push_back(new AMOStandardTester(args)); testers.push_back(new AMOStandardTester(args)); testers.push_back(new AMOStandardTester(args)); return testers; case AMO_AddTestType: if (rank == 0) std::cout << "AMO Add***" << std::endl; testers.push_back(new AMOStandardTester(args)); testers.push_back(new AMOStandardTester(args)); testers.push_back(new AMOStandardTester(args)); return testers; case AMO_SetTestType: if (rank == 0) std::cout << "AMO Set***" << std::endl; testers.push_back(new AMOExtendedTester(args)); testers.push_back(new AMOExtendedTester(args)); testers.push_back(new AMOExtendedTester(args)); return testers; case AMO_SwapTestType: if (rank == 0) std::cout << "AMO Swap***" << std::endl; testers.push_back(new AMOExtendedTester(args)); testers.push_back(new AMOExtendedTester(args)); testers.push_back(new AMOExtendedTester(args)); return testers; case AMO_FetchAndTestType: if (rank == 0) std::cout << "AMO Fetch And***" << std::endl; testers.push_back(new 
AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); return testers; case AMO_AndTestType: if (rank == 0) std::cout << "AMO And***" << std::endl; testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); return testers; case AMO_FetchOrTestType: if (rank == 0) std::cout << "AMO Fetch Or***" << std::endl; testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); return testers; case AMO_OrTestType: if (rank == 0) std::cout << "AMO Or***" << std::endl; testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); return testers; case AMO_FetchXorTestType: if (rank == 0) std::cout << "AMO Fetch Xor***" << std::endl; testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); return testers; case AMO_XorTestType: if (rank == 0) std::cout << "AMO Xor***" << std::endl; testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); testers.push_back(new AMOBitwiseTester(args)); return testers; case AMO_IncTestType: if (rank == 0) std::cout << "AMO Inc***" << std::endl; testers.push_back(new AMOStandardTester(args)); testers.push_back(new AMOStandardTester(args)); testers.push_back(new AMOStandardTester(args)); return testers; case PingPongTestType: if (rank == 0) std::cout << "PingPong***" << std::endl; testers.push_back(new PingPongTester(args)); return testers; case PingAllTestType: if (rank == 0) std::cout << "PingAll***" << std::endl; testers.push_back(new PingAllTester(args)); return testers; case BarrierAllTestType: if (rank == 0) std::cout << "Barrier_All***" << std::endl; testers.push_back(new BarrierAllTester(args)); return testers; case 
SyncAllTestType: if (rank == 0) std::cout << "SyncAll***" << std::endl; testers.push_back(new SyncTester(args)); return testers; case SyncTestType: if (rank == 0) std::cout << "Sync***" << std::endl; testers.push_back(new SyncTester(args)); return testers; case RandomAccessTestType: if (rank == 0) std::cout << "Random_Access***" << std::endl; testers.push_back(new RandomAccessTester(args)); return testers; case ShmemPtrTestType: if (rank == 0) std::cout << "Shmem_Ptr***" << std::endl; testers.push_back(new ShmemPtrTester(args)); return testers; case WGGetTestType: if (rank == 0) std::cout << "Blocking WG level Gets***" << std::endl; testers.push_back(new ExtendedPrimitiveTester(args)); return testers; case WGGetNBITestType: if (rank == 0) std::cout << "Non-Blocking WG level Gets***" << std::endl; testers.push_back(new ExtendedPrimitiveTester(args)); return testers; case WGPutTestType: if (rank == 0) std::cout << "Blocking WG level Puts***" << std::endl; testers.push_back(new ExtendedPrimitiveTester(args)); return testers; case WGPutNBITestType: if (rank == 0) std::cout << "Non-Blocking WG level Puts***" << std::endl; testers.push_back(new ExtendedPrimitiveTester(args)); return testers; case PutNBIMRTestType: if (rank == 0) std::cout << "Non-Blocking Put message rate***" << std::endl; testers.push_back(new PrimitiveMRTester(args)); return testers; default: if (rank == 0) std::cout << "Unknown***" << std::endl; testers.push_back(new PrimitiveTester(args)); return testers; } return testers; } /** * Runs the test once per message size, doubling the size from * args.min_msg_size up to and including args.max_msg_size. Each iteration * resets the buffers, synchronises, optionally launches the kernel between * the start and stop events, synchronises again, verifies the results and * prints them (printing is skipped for TeamCtxInfraTestType). Returns * immediately for InitTestType. 
* * NOTE(review): the loop step is size <<= 1, so a min_msg_size of 0 would * never advance - confirm callers always pass a non-zero minimum. */ void Tester::execute() { if (_type == InitTestType) return; int num_loops = args.loop; /** * Some tests loop through data sizes in powers of 2 and report the * results for those ranges. */ for (uint64_t size = args.min_msg_size; size <= args.max_msg_size; size <<= 1) { resetBuffers(size); /** * Restricts the number of iterations of really large messages. 
*/ if (size > args.large_message_size) num_loops = args.loop_large; barrier(); preLaunchKernel(); /** * This conditional launches the HIP kernel. * * Some tests may only launch a single kernel. These kernels will * be kicked off by the initiator (denoted by the args.myid check). * * Other tests will initiate on both sides and launch from both * rocshmem pes. */ if (peLaunchesKernel()) { /** * TODO: * Verify that this timer type is actually uint64_t on the * device side. * * NOTE(review): timer is allocated with hipMalloc in the * constructor but is cleared here with host memset; presumably the * allocation is host-accessible on supported systems - confirm. */ memset(timer, 0, sizeof(uint64_t) * args.num_wgs); const dim3 blockSize(args.wg_size, 1, 1); const dim3 gridSize(args.num_wgs, 1, 1); CHECK_HIP(hipEventRecord(start_event, stream)); launchKernel(gridSize, blockSize, num_loops, size); CHECK_HIP(hipEventRecord(stop_event, stream)); /* A synchronize failure is only printed; execution continues. */ hipError_t err = hipStreamSynchronize(stream); if (err != hipSuccess) { printf("error = %d \n", err); } // roc_shmem_dump_stats(); roc_shmem_reset_stats(); } barrier(); postLaunchKernel(); // data validation verifyResults(size); barrier(); if (_type != TeamCtxInfraTestType) { print(size); } } } /** * Returns true when this PE should launch the kernel: always for PE 0 * (args.myid == 0), and for every PE when the test type is one of the types * listed in the expression below. */ bool Tester::peLaunchesKernel() { bool is_launcher; /** * The PE assigned 0 is always active in these tests. */ is_launcher = args.myid == 0; /** * Some test types are active on both sides. 
*/ is_launcher = is_launcher || (_type == ReductionTestType) || (_type == TeamReductionTestType) || (_type == BroadcastTestType) || (_type == TeamBroadcastTestType) || (_type == AllToAllTestType) || (_type == FCollectTestType) || (_type == PingPongTestType) || (_type == BarrierAllTestType) || (_type == SyncTestType) || (_type == SyncAllTestType) || (_type == RandomAccessTestType) || (_type == PingAllTestType); return is_launcher; } /** * Prints the results for one message size; does nothing on PEs other than * PE 0. Average latency and message rate are derived from the per-work-group * timer average and num_timed_msgs; bandwidth is derived from num_msgs, size, * bw_factor and the elapsed time between start_event and stop_event. * * NOTE(review): a zero timer average, zero num_timed_msgs or zero elapsed * time makes the divisions below degenerate - confirm these cannot occur. * NOTE(review): "%lu" is used for a uint64_t; PRIu64 would be the portable * format specifier. */ void Tester::print(uint64_t size) { if (args.myid != 0) { return; } uint64_t timer_avg = timerAvgInMicroseconds(); double latency_avg = static_cast(timer_avg) / num_timed_msgs; double avg_msg_rate = num_timed_msgs / (timer_avg / 1e6); float total_kern_time_ms; CHECK_HIP(hipEventElapsedTime(&total_kern_time_ms, start_event, stop_event)); float total_kern_time_s = total_kern_time_ms / 1000; double bandwidth_avg_gbs = num_msgs * size * bw_factor / total_kern_time_s / pow(2, 30); int field_width = 20; int float_precision = 2; printf("\n##### Message Size %lu #####\n", size); printf("%*s%*s%*s\n", field_width + 1, "Latency AVG (us)", field_width + 1, "Bandwidth (GB/s)", field_width + 1, "Avg Message rate (Messages/s)"); printf("%*.*f %*.*f %*.*f\n", field_width, float_precision, latency_avg, field_width, float_precision, bandwidth_avg_gbs, field_width, float_precision, avg_msg_rate); fflush(stdout); } /** * Queries the hipDeviceAttributeHdpMemFlushCntl attribute of the current HIP * device, treats the returned value as a pointer, and atomically stores 1 * through it with sequentially consistent ordering. 
*/ void flush_hdp() { int hip_dev_id{}; unsigned int* hdp_flush_ptr_{nullptr}; CHECK_HIP(hipGetDevice(&hip_dev_id)); CHECK_HIP(hipDeviceGetAttribute(reinterpret_cast(&hdp_flush_ptr_), hipDeviceAttributeHdpMemFlushCntl, hip_dev_id)); __atomic_store_n(hdp_flush_ptr_, 0x1, __ATOMIC_SEQ_CST); } /** * Calls MPI_Barrier on MPI_COMM_WORLD and then flush_hdp(). */ void Tester::barrier() { MPI_Barrier(MPI_COMM_WORLD); flush_hdp(); } /** * Converts a timer cycle count to microseconds using integer division by a * fixed 27 MHz frequency; the result is truncated. */ uint64_t Tester::gpuCyclesToMicroseconds(uint64_t cycles) { /** * The dGPU asm core timer runs at 27MHz. This is different from the * core clock returned by HIP. For an APU, this is different and might * need adjusting. 
*/ uint64_t gpu_frequency_MHz = 27; /** * hipDeviceGetAttribute(&gpu_frequency_khz, * hipDeviceAttributeClockRate, * 0); */ return cycles / gpu_frequency_MHz; } /** * Returns the average, over the args.num_wgs timer slots, of each slot's * cycle count converted to microseconds. Each slot is truncated to whole * microseconds before being summed, and the final average is an integer * division. * * NOTE(review): divides by args.num_wgs - confirm it can never be 0. * NOTE(review): timer[i] is read directly on the host although timer is * allocated with hipMalloc - presumably host-accessible; confirm. */ uint64_t Tester::timerAvgInMicroseconds() { uint64_t sum = 0; for (int i = 0; i < args.num_wgs; i++) { sum += gpuCyclesToMicroseconds(timer[i]); } return sum / args.num_wgs; }