From faed69f9fcadf371653566bc106fa5e96e351a80 Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Thu, 1 Dec 2022 10:28:42 -0700 Subject: [PATCH] Graph unit tests (#656) * Adding hipGraph unit tests --- test/AllGather_InPlace.cpp | 18 +++--- test/AllGather_ManagedMem.cpp | 18 +++--- test/AllGather_OutOfPlace.cpp | 18 +++--- test/AllReduce_Clique.cpp | 18 +++--- test/AllReduce_InPlace.cpp | 18 +++--- test/AllReduce_ManagedMem.cpp | 18 +++--- test/AllReduce_OutOfPlace.cpp | 18 +++--- test/AllToAll_ManagedMem.cpp | 18 +++--- test/AllToAll_OutOfPlace.cpp | 18 +++--- test/AllToAllv_OutOfPlace.cpp | 2 +- test/Broadcast_InPlace.cpp | 18 +++--- test/Broadcast_ManagedMem.cpp | 18 +++--- test/Broadcast_OutOfPlace.cpp | 18 +++--- test/Gather_InPlace.cpp | 18 +++--- test/Gather_ManagedMem.cpp | 18 +++--- test/Gather_OutOfPlace.cpp | 18 +++--- test/ReduceScatter_InPlace.cpp | 18 +++--- test/ReduceScatter_ManagedMem.cpp | 18 +++--- test/ReduceScatter_OutOfPlace.cpp | 18 +++--- test/Reduce_InPlace.cpp | 18 +++--- test/Reduce_ManagedMem.cpp | 18 +++--- test/Reduce_OutOfPlace.cpp | 18 +++--- test/Scatter_InPlace.cpp | 18 +++--- test/Scatter_ManagedMem.cpp | 18 +++--- test/Scatter_OutOfPlace.cpp | 18 +++--- test/common/CollectiveArgs.hpp | 7 ++- test/common/ErrCode.hpp | 4 +- test/common/TestBed.cpp | 71 +++++++++++----------- test/common/TestBed.hpp | 15 +++-- test/common/TestBedChild.cpp | 99 ++++++++++++++++++++++++++----- tools/GraphBench/GraphBench.cpp | 20 +++++++ 31 files changed, 399 insertions(+), 251 deletions(-) diff --git a/test/AllGather_InPlace.cpp b/test/AllGather_InPlace.cpp index 773554883f..c504ed4040 100644 --- a/test/AllGather_InPlace.cpp +++ b/test/AllGather_InPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollAllGather}; - std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; - std::vector const redOps = {ncclSum}; - 
std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {true}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollAllGather}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/AllGather_ManagedMem.cpp b/test/AllGather_ManagedMem.cpp index 8ff753d619..eb265211b3 100644 --- a/test/AllGather_ManagedMem.cpp +++ b/test/AllGather_ManagedMem.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollAllGather}; - std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {true}; + std::vector const funcTypes = {ncclCollAllGather}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, 
+ inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/AllGather_OutOfPlace.cpp b/test/AllGather_OutOfPlace.cpp index 2dcc178683..f446c62d61 100644 --- a/test/AllGather_OutOfPlace.cpp +++ b/test/AllGather_OutOfPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollAllGather}; - std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollAllGather}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/AllReduce_Clique.cpp b/test/AllReduce_Clique.cpp index a4b818d9df..e1267dd524 100644 --- a/test/AllReduce_Clique.cpp +++ b/test/AllReduce_Clique.cpp @@ -15,15 +15,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollAllReduce}; - std::vector const dataTypes = testBed.GetAllSupportedDataTypes(); - std::vector const redOps = testBed.GetAllSupportedRedOps(); - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false, true}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = 
{ncclCollAllReduce}; + std::vector const dataTypes = testBed.GetAllSupportedDataTypes(); + std::vector const redOps = testBed.GetAllSupportedRedOps(); + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false, true}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); unsetenv("RCCL_ENABLE_CLIQUE"); diff --git a/test/AllReduce_InPlace.cpp b/test/AllReduce_InPlace.cpp index 4a3e61f4bf..e413acf870 100644 --- a/test/AllReduce_InPlace.cpp +++ b/test/AllReduce_InPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollAllReduce}; - std::vector const dataTypes = {ncclInt8, ncclInt32, ncclFloat32}; - std::vector const redOps = {ncclSum, ncclProd}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {true}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollAllReduce}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclFloat32}; + std::vector const redOps = {ncclSum, ncclProd}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/AllReduce_ManagedMem.cpp 
b/test/AllReduce_ManagedMem.cpp index f5019df88a..1e2b1946a3 100644 --- a/test/AllReduce_ManagedMem.cpp +++ b/test/AllReduce_ManagedMem.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollAllReduce}; - std::vector const dataTypes = {ncclFloat32, ncclUint8, ncclUint64}; - std::vector const redOps = {ncclSum, ncclMax}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {true}; + std::vector const funcTypes = {ncclCollAllReduce}; + std::vector const dataTypes = {ncclFloat32, ncclUint8, ncclUint64}; + std::vector const redOps = {ncclSum, ncclMax}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/AllReduce_OutOfPlace.cpp b/test/AllReduce_OutOfPlace.cpp index 6fc8aef9f4..8bfaf6d39f 100644 --- a/test/AllReduce_OutOfPlace.cpp +++ b/test/AllReduce_OutOfPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollAllReduce}; - std::vector const dataTypes = {ncclFloat16, ncclFloat32, ncclFloat64, ncclBfloat16}; - std::vector const redOps = {ncclSum, ncclMin}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollAllReduce}; + std::vector const dataTypes = {ncclFloat16, ncclFloat32, ncclFloat64, ncclBfloat16}; 
+ std::vector const redOps = {ncclSum, ncclMin}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/AllToAll_ManagedMem.cpp b/test/AllToAll_ManagedMem.cpp index 041be9cf8a..eecfcc28f7 100644 --- a/test/AllToAll_ManagedMem.cpp +++ b/test/AllToAll_ManagedMem.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollAllToAll}; - std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {true}; + std::vector const funcTypes = {ncclCollAllToAll}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/AllToAll_OutOfPlace.cpp b/test/AllToAll_OutOfPlace.cpp index 3b3153afae..4fdd55d6de 100644 --- a/test/AllToAll_OutOfPlace.cpp +++ b/test/AllToAll_OutOfPlace.cpp @@ -12,15 +12,17 @@ namespace 
RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollAllToAll}; - std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollAllToAll}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/AllToAllv_OutOfPlace.cpp b/test/AllToAllv_OutOfPlace.cpp index e15cafdb3a..ba4939aacb 100644 --- a/test/AllToAllv_OutOfPlace.cpp +++ b/test/AllToAllv_OutOfPlace.cpp @@ -71,7 +71,7 @@ namespace RcclUnitTesting std::string name = testBed.GetTestCaseName(totalRanks, isMultiProcess, ncclCollAllToAllv, dataTypes[dataIdx], ncclSum, -1, - inPlace, useManagedMem); + inPlace, useManagedMem, false); INFO("%s\n", name.c_str()); } diff --git a/test/Broadcast_InPlace.cpp b/test/Broadcast_InPlace.cpp index e10dd961e4..694c3c84d4 100644 --- a/test/Broadcast_InPlace.cpp +++ b/test/Broadcast_InPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollBroadcast}; - std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - 
std::vector const inPlaceList = {true}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollBroadcast}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/Broadcast_ManagedMem.cpp b/test/Broadcast_ManagedMem.cpp index 1f6695c102..66f5122e49 100644 --- a/test/Broadcast_ManagedMem.cpp +++ b/test/Broadcast_ManagedMem.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollBroadcast}; - std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {true}; + std::vector const funcTypes = {ncclCollBroadcast}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git 
a/test/Broadcast_OutOfPlace.cpp b/test/Broadcast_OutOfPlace.cpp index ffb01fff55..e52f429370 100644 --- a/test/Broadcast_OutOfPlace.cpp +++ b/test/Broadcast_OutOfPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollBroadcast}; - std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {1}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollBroadcast}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {1}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/Gather_InPlace.cpp b/test/Gather_InPlace.cpp index 6fe6f0e5a8..a2accce3e1 100644 --- a/test/Gather_InPlace.cpp +++ b/test/Gather_InPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollGather}; - std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64, ncclFloat16}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {true}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollGather}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64, ncclFloat16}; + std::vector const redOps = 
{ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/Gather_ManagedMem.cpp b/test/Gather_ManagedMem.cpp index efb5134107..837c863544 100644 --- a/test/Gather_ManagedMem.cpp +++ b/test/Gather_ManagedMem.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollGather}; - std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {1}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {true}; + std::vector const funcTypes = {ncclCollGather}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {1}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/Gather_OutOfPlace.cpp b/test/Gather_OutOfPlace.cpp index 49a21d4b81..6a018ec160 100644 --- a/test/Gather_OutOfPlace.cpp +++ b/test/Gather_OutOfPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const 
funcTypes = {ncclCollGather}; - std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {1}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollGather}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {1}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/ReduceScatter_InPlace.cpp b/test/ReduceScatter_InPlace.cpp index 53f200a2ec..2db1d46662 100644 --- a/test/ReduceScatter_InPlace.cpp +++ b/test/ReduceScatter_InPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollReduceScatter}; - std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64, ncclFloat16}; - std::vector const redOps = {ncclSum, ncclProd}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 5461, 1024}; - std::vector const inPlaceList = {true}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollReduceScatter}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64, ncclFloat16}; + std::vector const redOps = {ncclSum, ncclProd}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 5461, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + std::vector 
const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/ReduceScatter_ManagedMem.cpp b/test/ReduceScatter_ManagedMem.cpp index 1fc94d8b96..0b9ce80c8b 100644 --- a/test/ReduceScatter_ManagedMem.cpp +++ b/test/ReduceScatter_ManagedMem.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollReduceScatter}; - std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {true}; + std::vector const funcTypes = {ncclCollReduceScatter}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/ReduceScatter_OutOfPlace.cpp b/test/ReduceScatter_OutOfPlace.cpp index c4f7ee6b99..b674dc68d8 100644 --- a/test/ReduceScatter_OutOfPlace.cpp +++ b/test/ReduceScatter_OutOfPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollReduceScatter}; - std::vector const dataTypes = {ncclFloat16, ncclFloat32, ncclFloat64, ncclBfloat16}; - 
std::vector const redOps = {ncclMin, ncclMax, ncclAvg}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 5461, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollReduceScatter}; + std::vector const dataTypes = {ncclFloat16, ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclMin, ncclMax, ncclAvg}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 5461, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/Reduce_InPlace.cpp b/test/Reduce_InPlace.cpp index 8d26f43874..b2f417c5bb 100644 --- a/test/Reduce_InPlace.cpp +++ b/test/Reduce_InPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollReduce}; - std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; - std::vector const redOps = {ncclSum, ncclProd}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {true}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollReduce}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; + std::vector const redOps = {ncclSum, ncclProd}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, 
inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/Reduce_ManagedMem.cpp b/test/Reduce_ManagedMem.cpp index 95b3547b47..003b2ca18a 100644 --- a/test/Reduce_ManagedMem.cpp +++ b/test/Reduce_ManagedMem.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollReduce}; - std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {true}; + std::vector const funcTypes = {ncclCollReduce}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/Reduce_OutOfPlace.cpp b/test/Reduce_OutOfPlace.cpp index 3c8eb474ea..0900911f0b 100644 --- a/test/Reduce_OutOfPlace.cpp +++ b/test/Reduce_OutOfPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollReduce}; - std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; - std::vector const redOps = {ncclMin, ncclMax, ncclAvg}; - std::vector const roots = {1}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const 
managedMemList = {false}; + std::vector const funcTypes = {ncclCollReduce}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclMin, ncclMax, ncclAvg}; + std::vector const roots = {1}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/Scatter_InPlace.cpp b/test/Scatter_InPlace.cpp index 99f3f983a2..2b1eda72b0 100644 --- a/test/Scatter_InPlace.cpp +++ b/test/Scatter_InPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollScatter}; - std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {true}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollScatter}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/Scatter_ManagedMem.cpp b/test/Scatter_ManagedMem.cpp index 
4a959fc308..1383a0c96d 100644 --- a/test/Scatter_ManagedMem.cpp +++ b/test/Scatter_ManagedMem.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollScatter}; - std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {true}; + std::vector const funcTypes = {ncclCollScatter}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/Scatter_OutOfPlace.cpp b/test/Scatter_OutOfPlace.cpp index 02bb878930..f7facf3959 100644 --- a/test/Scatter_OutOfPlace.cpp +++ b/test/Scatter_OutOfPlace.cpp @@ -12,15 +12,17 @@ namespace RcclUnitTesting TestBed testBed; // Configuration - std::vector const funcTypes = {ncclCollScatter}; - std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {1}; - std::vector const numElements = {1048576, 53327, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {false}; + std::vector const funcTypes = {ncclCollScatter}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {1}; + std::vector const numElements = 
{1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } } diff --git a/test/common/CollectiveArgs.hpp b/test/common/CollectiveArgs.hpp index 5f961588ce..1d26155c29 100644 --- a/test/common/CollectiveArgs.hpp +++ b/test/common/CollectiveArgs.hpp @@ -76,9 +76,10 @@ namespace RcclUnitTesting struct OptionalColArgs { ncclRedOp_t redOp = ncclSum; - int root = 0; // Used as "peer" for Send/Recv - ScalarTransport scalarTransport; // Used for custom reduction operators - int scalarMode = -1; // -1 if scalar not used + int root = 0; // Used as "peer" for Send/Recv + ScalarTransport scalarTransport; // Used for custom reduction operators + int scalarMode = -1; // -1 if scalar not used + // allToAllv args size_t sendcounts[MAX_RANKS*MAX_RANKS]; size_t sdispls[MAX_RANKS*MAX_RANKS]; diff --git a/test/common/ErrCode.hpp b/test/common/ErrCode.hpp index 426b2e976a..89f5e4f17e 100644 --- a/test/common/ErrCode.hpp +++ b/test/common/ErrCode.hpp @@ -4,6 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ #pragma once +#include <cstring> namespace RcclUnitTesting { @@ -31,7 +32,8 @@ namespace RcclUnitTesting hipError_t error = (func); \ if (error != hipSuccess) \ { \ - fprintf(stderr, "\033[0;33" "[ ERROR ] HIP error: %s\n" "\033[m", hipGetErrorString(error)); \ + fprintf(stderr, "\033[0;31m" "[ ERROR ] HIP error: %s File:%s Line:%d\n" "\033[m", \ + hipGetErrorString(error), strrchr("/" __FILE__, '/') + 1, __LINE__); \ return TEST_FAIL; \ } \ } diff --git a/test/common/TestBed.cpp b/test/common/TestBed.cpp index e07035356a..f65a1ee59b 100644 --- a/test/common/TestBed.cpp
+++ b/test/common/TestBed.cpp @@ -15,7 +15,7 @@ { \ if (ev.verbose) INFO("Calling PIPE_READ to Child %d\n", childId); \ ssize_t retval = read(childList[childId]->parentReadFd, &val, sizeof(val)); \ - if (ev.verbose) INFO("Got PIPE_READ %ld\n", retval); \ + if (ev.verbose) INFO("Got PIPE_READ %ld from Child %d\n", retval, childId); \ if (retval == -1) \ { \ ERROR("Unable to read from child %d: Error %s\n", childId, strerror(errno)); \ @@ -104,7 +104,7 @@ namespace RcclUnitTesting } } - //Determine number of unique GPUs being used. + // Determine number of unique GPUs being used. std::set unique_devices; for (auto a: this->rankToDeviceMap) unique_devices.insert(a); @@ -240,7 +240,7 @@ namespace RcclUnitTesting } } - void TestBed::ExecuteCollectives(std::vector const &currentRanks) + void TestBed::ExecuteCollectives(std::vector const &currentRanks, bool const useHipGraph) { int const cmd = TestBedChild::CHILD_EXECUTE_COLL; ++TestBed::NumTestsRun(); @@ -257,6 +257,7 @@ namespace RcclUnitTesting if ((currentRanks.size() == 0) || (ranksPerChild[childId].size() > 0)) { PIPE_WRITE(childId, cmd); + PIPE_WRITE(childId, useHipGraph); int tempCurrentRanks = currentRanks.size(); PIPE_WRITE(childId, tempCurrentRanks); for (int rank = 0; rank < currentRanks.size(); ++rank){ @@ -372,16 +373,16 @@ namespace RcclUnitTesting } std::vector> TestBed::GetDeviceIdsList(int const numProcesses, - int const numGpus, - int const ranksPerGpu) + int const numGpus, + int const ranksPerGpu) { std::vector> result(numProcesses); int ntasks = numProcesses == 1 ? numGpus : 1; int k=0; for (int i = 0; i < numProcesses; i++) for (int j = 0; j < ntasks * ranksPerGpu; j++) { - result[i].push_back(k%numGpus); - k++; + result[i].push_back(k%numGpus); + k++; } return result; } @@ -394,7 +395,8 @@ namespace RcclUnitTesting int const root, bool const inPlace, bool const managedMem, - int const ranksPerProc) + bool const useHipGraph, + int const ranksPerProc) { std::stringstream ss; ss << (isMultiProcess ?
"MP" : "SP") << " "; @@ -405,7 +407,9 @@ namespace RcclUnitTesting ss << " "; ss << "ranks "; ss << ncclFuncNames[funcType] << " "; - ss << "(" << (inPlace ? "IP" : "OP") << "," << (managedMem ? "MM" : "GM") << ") "; + ss << "(" << (inPlace ? "IP" : "OP") << "," + << (managedMem ? "MM" : "GM") << "," + << (useHipGraph ? "GL" : "NL") <<") "; ss << ncclDataTypeNames[dataType] << " "; if (CollectiveArgs::UsesReduce(funcType)) ss << ncclRedOpNames[redOp] << " "; if (CollectiveArgs::UsesRoot(funcType)) ss << "Root " << root << " "; @@ -418,7 +422,8 @@ namespace RcclUnitTesting std::vector const& roots, std::vector const& numElements, std::vector const& inPlaceList, - std::vector const& managedMemList) + std::vector const& managedMemList, + std::vector const& useHipGraphList) { // Sort numElements in descending order to cut down on # of allocations std::vector sortedN = numElements; @@ -475,16 +480,6 @@ namespace RcclUnitTesting for (int ipIdx = 0; ipIdx < inPlaceList.size() && isCorrect; ++ipIdx) for (int mmIdx = 0; mmIdx < managedMemList.size() && isCorrect; ++mmIdx) { - if (ev.showNames) - { - std::string name = this->GetTestCaseName(numGpus, isMultiProcess, - funcTypes[ftIdx], dataTypes[dtIdx], - redOps[rdIdx], roots[rtIdx], - inPlaceList[ipIdx], managedMemList[mmIdx], - ranksPerGpu); - INFO("%s\n", name.c_str()); - } - for (int neIdx = 0; neIdx < numElements.size() && isCorrect; ++neIdx) { int numInputElements, numOutputElements; @@ -504,24 +499,34 @@ namespace RcclUnitTesting // Only allocate once for largest size if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]); - // There are some cases when data does not need to be re-prepared - // e.g. 
AllReduce subarray expected results are still valid - bool canSkip = (neIdx != 0 && !inPlaceList[ipIdx] && - (funcTypes[ftIdx] == ncclCollBroadcast || - funcTypes[ftIdx] == ncclCollReduce || - funcTypes[ftIdx] == ncclCollAllReduce)); - if (!canSkip) this->PrepareData(); - - this->ExecuteCollectives(); - this->ValidateResults(isCorrect); - if (!isCorrect) + for (int hgIdx = 0; hgIdx < useHipGraphList.size() && isCorrect; ++hgIdx) { + // There are some cases when data does not need to be re-prepared + // e.g. AllReduce subarray expected results are still valid + bool canSkip = (neIdx != 0 && !inPlaceList[ipIdx] && + (funcTypes[ftIdx] == ncclCollBroadcast || + funcTypes[ftIdx] == ncclCollReduce || + funcTypes[ftIdx] == ncclCollAllReduce)); + if (!canSkip) this->PrepareData(); + std::string name = this->GetTestCaseName(numGpus, isMultiProcess, funcTypes[ftIdx], dataTypes[dtIdx], redOps[rdIdx], roots[rtIdx], inPlaceList[ipIdx], managedMemList[mmIdx], - ranksPerGpu); - ERROR("Incorrect output for %s\n", name.c_str()); + useHipGraphList[hgIdx], ranksPerGpu); + + if (ev.showNames) + { + INFO("%s [%d elements]\n", name.c_str(), numInputElements); + } + + std::vector currentRanksEmpty = {}; + this->ExecuteCollectives(currentRanksEmpty, useHipGraphList[hgIdx]); + this->ValidateResults(isCorrect); + if (!isCorrect) + { + ERROR("Incorrect output for %s\n", name.c_str()); + } } } this->DeallocateMem(); diff --git a/test/common/TestBed.hpp b/test/common/TestBed.hpp index e28dcf9208..e72daebae3 100644 --- a/test/common/TestBed.hpp +++ b/test/common/TestBed.hpp @@ -32,9 +32,12 @@ namespace RcclUnitTesting TestBed(); // Prepare TestBed for use with GPUs across multiple child processes - void InitComms(std::vector> const& deviceIdsPerChild, int const numCollectivesInGroup = 1); + void InitComms(std::vector> const& deviceIdsPerChild, + int const numCollectivesInGroup = 1); + // Prepare TestBed for use with GPUs on a single child process - void InitComms(int const numGpus, int const 
numCollectivesInGroup = 1); + void InitComms(int const numGpus, + int const numCollectivesInGroup = 1); // Set collectives arguments for specified collective / rank // Setting scalarsPerRank to non-null will create custom reduction operator @@ -69,7 +72,7 @@ namespace RcclUnitTesting // Execute all collectives on all test children // Blocks until collective is completed - void ExecuteCollectives(std::vector const &currentRanks = {}); + void ExecuteCollectives(std::vector const &currentRanks = {}, bool const useHipGraph = false); // Perform results validation - compare output to expected void ValidateResults(bool& isCorrect, int collId = -1, int const rank = -1); @@ -96,7 +99,7 @@ namespace RcclUnitTesting // Helper function that splits up GPUs to the given number of processes static std::vector> GetDeviceIdsList(int const numProcesses, int const numGpus, - int const ranksPerGpu); + int const ranksPerGpu); static std::vector> GetDeviceIdsList(int const numProcesses, int const numGpus); @@ -109,6 +112,7 @@ namespace RcclUnitTesting int const root, bool const inPlace, bool const managedMem, + bool const useHipGraph, int const ranksPerProc=1); // Run a simple sweep @@ -118,7 +122,8 @@ namespace RcclUnitTesting std::vector const& roots, std::vector const& numElements, std::vector const& inPlaceList, - std::vector const& managedMemList); + std::vector const& managedMemList, + std::vector const& useHipGraphList); // Used to track total number of calls to ExecuteCollectives() static int& NumTestsRun(); diff --git a/test/common/TestBedChild.cpp b/test/common/TestBedChild.cpp index 233d99ff14..d6437effc5 100644 --- a/test/common/TestBedChild.cpp +++ b/test/common/TestBedChild.cpp @@ -170,16 +170,16 @@ namespace RcclUnitTesting if (useMultiRankPerGpu) { - if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess) + if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess) { -
ERROR("Rank %d on child %d unable to call ncclCommInitRankMulti\n", globalRank, this->childId); - status = TEST_FAIL; - break; - } + ERROR("Rank %d on child %d unable to call ncclCommInitRankMulti\n", globalRank, this->childId); + status = TEST_FAIL; + break; + } } else { - if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess) + if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess) { ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId); status = TEST_FAIL; @@ -337,6 +337,9 @@ namespace RcclUnitTesting ErrCode TestBedChild::ExecuteCollectives() { + bool useHipGraph = false; + PIPE_READ(useHipGraph); + int numRanksToExecute, tempRank; std::vector ranksToExecute = {}; PIPE_READ(numRanksToExecute); @@ -345,7 +348,31 @@ namespace RcclUnitTesting PIPE_READ(tempRank); ranksToExecute.push_back(tempRank - this->rankOffset); } - if (this->verbose) INFO("Child %d begins ExecuteCollectives()\n", this->childId); + if (this->verbose) INFO("Child %d begins ExecuteCollectives() %s\n", this->childId, useHipGraph ? 
"(using hipGraphs)" : ""); + + // Determine which local ranks to execute on + std::vector localRanksToExecute; + for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank) + { + // If ranksToExeute is empty, execute all local ranks belonging to this child + if (!ranksToExecute.empty() && + (std::count(ranksToExecute.begin(), ranksToExecute.end(), localRank) == 0)) continue; + localRanksToExecute.push_back(localRank); + } + + numRanksToExecute = (int)localRanksToExecute.size(); + hipGraph_t graphs[numRanksToExecute]; + hipGraphExec_t graphExec[numRanksToExecute]; + + // Start HIP graph stream capture if requested + if (useHipGraph) + { + for (int localRank : localRanksToExecute) + { + if (this->verbose) INFO("Capturing stream for rank %d\n", localRank); + CHECK_HIP(hipStreamBeginCapture(this->streams[localRank], hipStreamCaptureModeGlobal)); + } + } // Start group call CHILD_NCCL_CALL(ncclGroupStart(), "ncclGroupStart"); @@ -354,16 +381,13 @@ namespace RcclUnitTesting for (int collId = 0; collId < this->numCollectivesInGroup; ++collId) { // Loop over all local ranks - for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank) + for (int localRank : localRanksToExecute) { - // If ranks to execute is empty, execute all ranks belonging to child - if (!ranksToExecute.empty() && (std::count(ranksToExecute.begin(), ranksToExecute.end(), localRank) == 0)) continue; - CHECK_HIP(hipSetDevice(this->deviceIds[localRank])); CollectiveArgs const& collArg = this->collArgs[localRank][collId]; - if (this->printValues) + if (this->printValues && !useHipGraph) { int const numInputElementsToPrint = (this->printValues < 0 ? collArg.numInputElements : this->printValues); PtrUnion inputCpu; @@ -502,17 +526,60 @@ namespace RcclUnitTesting // End group call CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupEnd"); - // Synchronize - if (this->verbose) INFO("Child %d submits group call. 
Waiting for completion\n", this->childId); - for (int localRank = 0; localRank < this->streams.size(); ++localRank) + // Instantiate and launch HIP graph if requested + if (useHipGraph) { + for (int localRank : localRanksToExecute) + { + if (this->verbose) INFO("Ending stream capture for rank %d\n", localRank); + + CHECK_HIP(hipStreamEndCapture(this->streams[localRank], &graphs[localRank])); + if (this->verbose) + { + size_t numNodes; + hipGraphNode_t* nodes; + CHECK_HIP(hipGraphGetNodes(graphs[localRank], nodes, &numNodes)); + INFO("Graph for rank %d has %lu nodes\n", localRank, numNodes); + } + + if (this->verbose) INFO("Instantiating executable graph for rank %d\n", localRank); + CHECK_HIP(hipGraphInstantiate(&graphExec[localRank], graphs[localRank], NULL, NULL, 0)); + } + + for (int localRank : localRanksToExecute) + { + if (this->verbose) INFO("Launch graph for rank %d\n", localRank); + CHECK_HIP(hipGraphLaunch(graphExec[localRank], this->streams[localRank])); + } + } + else + { + if (this->verbose) + INFO("Child %d submits group call. 
Waiting for completion\n", this->childId); + } + + // Synchronize + for (int localRank : localRanksToExecute) + { + if (this->verbose) INFO("Starting synchronization for rank %d\n", localRank); CHECK_HIP(hipStreamSynchronize(this->streams[localRank])); } + // Destroy graphs + if (useHipGraph) + { + for (int localRank : localRanksToExecute) + { + if (this->verbose) INFO("Destroying graphs for rank %d\n", localRank); + CHECK_HIP(hipGraphDestroy(graphs[localRank])); + CHECK_HIP(hipGraphExecDestroy(graphExec[localRank])); + } + } + if (this->printValues) { for (int collId = 0; collId < this->numCollectivesInGroup; ++collId) - for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank) + for (int localRank : localRanksToExecute) { CollectiveArgs const& collArg = this->collArgs[localRank][collId]; diff --git a/tools/GraphBench/GraphBench.cpp b/tools/GraphBench/GraphBench.cpp index 2f29285538..92bda69c9f 100644 --- a/tools/GraphBench/GraphBench.cpp +++ b/tools/GraphBench/GraphBench.cpp @@ -121,7 +121,10 @@ int main(int argc, char **argv) if (usingGraphs) { for (int r = 0; r < nranks; ++r) + { + HIP_CALL(hipSetDevice(r)); HIP_CALL(hipStreamBeginCapture(stream[r], hipStreamCaptureModeThreadLocal)); + } NCCL_CALL(ncclGroupStart()); for (int r = 0; r < nranks; ++r) @@ -132,11 +135,17 @@ int main(int argc, char **argv) NCCL_CALL(ncclGroupEnd()); for (int r = 0; r < nranks; ++r) + { + //HIP_CALL(hipSetDevice(r)); HIP_CALL(hipStreamEndCapture(stream[r], &graphs[r])); + } // Instantiating graphs for (int r = 0; r < nranks; ++r) + { + HIP_CALL(hipSetDevice(r)); HIP_CALL(hipGraphInstantiate(&graphExec[r], graphs[r], NULL, NULL, 0)); + } } auto setupDelta = std::chrono::high_resolution_clock::now() - setupStart; double setupTime = std::chrono::duration_cast>(setupDelta).count(); @@ -150,7 +159,10 @@ int main(int argc, char **argv) if (usingGraphs) { for (int r = 0; r < nranks; r++) + { + HIP_CALL(hipSetDevice(r)); HIP_CALL(hipGraphLaunch(graphExec[r], stream[r])); + } } 
else { @@ -162,6 +174,7 @@ int main(int argc, char **argv) } NCCL_CALL(ncclGroupEnd()); } + for (int r = 0; r < nranks; r++) HIP_CALL(hipStreamSynchronize(stream[r])); @@ -194,6 +207,13 @@ int main(int argc, char **argv) } average[usingGraphs] /= numIterations; printf("%12.3f", average[usingGraphs]); + + for (int r = 0; r < nranks; r++) + { + HIP_CALL(hipSetDevice(r)); + HIP_CALL(hipGraphDestroy(graphs[r])); + HIP_CALL(hipGraphExecDestroy(graphExec[r])); + } } printf("%12.3f\n", average[0] / average[1]); fflush(stdout);