rocm-systems/projects/rccl/test/GroupCallTests.cpp

/*************************************************************************
 * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
#include "TestBed.hpp"

namespace RcclUnitTesting
{
  // Test identical collectives within the same group call.
  //
  // Places three AllReduce collectives (same data type and reduction op, but
  // different element counts) into a single group, then allocates, executes and
  // validates them for every rank count / process mode reported by the test
  // environment.
  TEST(GroupCall, Identical)
  {
    TestBed testBed;

    // Configuration
    std::vector<ncclFunc_t>     const funcTypes       = {ncclCollAllReduce, ncclCollAllReduce, ncclCollAllReduce};
    std::vector<ncclRedOp_t>    const testRedOps      = {ncclSum, ncclSum, ncclSum};
    std::vector<ncclDataType_t> const testDataTypes   = {ncclFloat, ncclFloat, ncclFloat};
    std::vector<int>            const numElements     = {1048576, 384 * 1024, 384};

    int                         const numCollPerGroup = numElements.size();
    bool                        const inPlace         = false;
    bool                        const useManagedMem   = false;

    std::vector<ncclDataType_t> dataTypes;
    testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
    if (dataTypes.empty()) {
      GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
    }

    std::vector<ncclRedOp_t> redOps;
    testBed.GetSupportedRedOps(redOps, testRedOps);
    if (redOps.empty()) {
      GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
    }

    // NOTE(review): the helpers above presumably filter the requested lists
    // (see the skip messages), so the results may hold fewer entries than
    // numCollPerGroup - confirm against TestBed. Both lists are non-empty here
    // (GTEST_SKIP returns), so wrapping the index keeps every lookup in range
    // and is a no-op when nothing was filtered out.
    int const numDataTypes = static_cast<int>(dataTypes.size());
    int const numRedOps    = static_cast<int>(redOps.size());

    bool isCorrect = true;
    for (int totalRanks : testBed.ev.GetNumGpusList())
    for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
    {
      // Test either single process all GPUs, or 1 process per GPU
      int const numProcesses = isMultiProcess ? totalRanks : 1;
      const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup);

      if (testBed.ev.showNames)
        INFO("%s %d-ranks GroupCall Identical\n", isMultiProcess ? "MP" : "SP", totalRanks);

      // Set up the different collectives within the group
      for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
      {
        OptionalColArgs options;
        options.redOp = redOps[collIdx % numRedOps];
        // Input and output element counts are the same for each collective here
        testBed.SetCollectiveArgs(funcTypes[collIdx],
                                  dataTypes[collIdx % numDataTypes],
                                  numElements[collIdx],
                                  numElements[collIdx],
                                  options,
                                  collIdx);
      }

      testBed.AllocateMem(inPlace, useManagedMem);
      testBed.PrepareData();
      testBed.ExecuteCollectives();
      testBed.ValidateResults(isCorrect);
      testBed.DeallocateMem();
      testBed.DestroyComms();
    }
    testBed.Finalize();
  }

  // Exercise one group call that mixes several different collective types
  TEST(GroupCall, Different)
  {
    TestBed testBed;

    // Settings shared by every collective in the group
    int  const numElements   = 1048576;
    bool const inPlace       = false;
    bool const useManagedMem = false;

    // One entry per collective placed in the group, in issue order
    std::vector<ncclFunc_t> const funcTypes = {ncclCollBroadcast,
                                               ncclCollAllGather,
                                               ncclCollReduceScatter,
                                               ncclCollAllReduce,
                                               ncclCollGather,
                                               ncclCollScatter,
                                               ncclCollAlltoAll};
    int const numCollPerGroup = funcTypes.size();

    // The same optional arguments are handed to every collective
    OptionalColArgs options;
    options.redOp = ncclSum;
    options.root  = 0;

    bool isCorrect = true;
    for (int totalRanks : testBed.ev.GetNumGpusList())
    {
      for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
      {
        // Multi-process mode uses one process per rank; otherwise a single process drives all ranks
        int const numProcesses = isMultiProcess ? totalRanks : 1;
        auto const& priorityOrder = testBed.ev.GetGpuPriorityOrder();
        testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, priorityOrder),
                          numCollPerGroup);

        if (testBed.ev.showNames)
          INFO("%s %d-ranks GroupCall Different\n", isMultiProcess ? "MP" : "SP", totalRanks);

        // Register each collective, letting CollectiveArgs work out the
        // input/output element counts for the given function type and rank count
        int slot = 0;
        for (ncclFunc_t const funcType : funcTypes)
        {
          int inputCount;
          int outputCount;
          CollectiveArgs::GetNumElementsForFuncType(funcType, numElements, totalRanks,
                                                    &inputCount, &outputCount);

          testBed.SetCollectiveArgs(funcType, ncclFloat, inputCount, outputCount, options, slot);
          ++slot;
        }

        // Allocate, run, check and tear down for this configuration
        testBed.AllocateMem(inPlace, useManagedMem);
        testBed.PrepareData();
        testBed.ExecuteCollectives();
        testBed.ValidateResults(isCorrect);
        testBed.DeallocateMem();
        testBed.DestroyComms();
      }
    }
    testBed.Finalize();
  }

  // Test identical collectives with different data type.
  //
  // Places three ncclSum AllReduce collectives into a single group, each using a
  // different floating-point data type and element count, then allocates,
  // executes and validates them for every rank count / process mode reported by
  // the test environment.
  TEST(GroupCall, MixedDataType)
  {
    TestBed testBed;

    // Configuration
    std::vector<ncclFunc_t>     const funcTypes       = {ncclCollAllReduce, ncclCollAllReduce, ncclCollAllReduce};
    std::vector<ncclRedOp_t>    const testRedOps      = {ncclSum, ncclSum, ncclSum};
    std::vector<ncclDataType_t> const testDataTypes   = {ncclFloat16, ncclFloat32, ncclFloat64};
    std::vector<int>            const numElements     = {1048576, 384 * 1024, 384};

    int                         const numCollPerGroup = numElements.size();
    bool                        const inPlace         = false;
    bool                        const useManagedMem   = false;

    std::vector<ncclDataType_t> dataTypes;
    testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
    if (dataTypes.empty()) {
      GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
    }

    std::vector<ncclRedOp_t> redOps;
    testBed.GetSupportedRedOps(redOps, testRedOps);
    if (redOps.empty()) {
      GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
    }

    // NOTE(review): the helpers above presumably filter the requested lists
    // (see the skip messages); if only some of the three data types are
    // excluded, dataTypes is non-empty but shorter than numCollPerGroup -
    // confirm against TestBed. Both lists are non-empty here (GTEST_SKIP
    // returns), so wrapping the index cycles through whatever remains instead
    // of reading past the end, and is a no-op when nothing was filtered out.
    int const numDataTypes = static_cast<int>(dataTypes.size());
    int const numRedOps    = static_cast<int>(redOps.size());

    bool isCorrect = true;
    for (int totalRanks : testBed.ev.GetNumGpusList())
    for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
    {
      // Test either single process all GPUs, or 1 process per GPU
      int const numProcesses = isMultiProcess ? totalRanks : 1;
      const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup);

      if (testBed.ev.showNames)
        INFO("%s %d-ranks GroupCall MixedDataType\n", isMultiProcess ? "MP" : "SP", totalRanks);

      // Set up the different collectives within the group
      for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
      {
        OptionalColArgs options;
        options.redOp = redOps[collIdx % numRedOps];
        // Input and output element counts are the same for each collective here
        testBed.SetCollectiveArgs(funcTypes[collIdx],
                                  dataTypes[collIdx % numDataTypes],
                                  numElements[collIdx],
                                  numElements[collIdx],
                                  options,
                                  collIdx);
      }

      testBed.AllocateMem(inPlace, useManagedMem);
      testBed.PrepareData();
      testBed.ExecuteCollectives();
      testBed.ValidateResults(isCorrect);
      testBed.DeallocateMem();
      testBed.DestroyComms();
    }
    testBed.Finalize();
  }

  // Run several AllReduce collectives in one group call, spread across multiple streams
  TEST(GroupCall, Multistream)
  {
    TestBed testBed;

    // Fixed settings for every configuration exercised below
    bool const useManagedMem = false;
    bool const inPlace       = false;
    int  const numElements   = 1048576;

    OptionalColArgs options;

    bool isCorrect = true;
    for (int totalRanks : testBed.ev.GetNumGpusList())
    {
      for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
      {
        // Multi-process mode uses one process per rank; otherwise a single process drives all ranks
        int const numProcesses = isMultiProcess ? totalRanks : 1;

        // Group sizes of 2, 4 and 6 collectives
        for (int const numCollPerGroup : {2, 4, 6})
        {
          // Start with one stream per collective and step down by three while at least two streams remain
          int numStreamsPerGroup = numCollPerGroup;
          while (numStreamsPerGroup >= 2)
          {
            if (testBed.ev.showNames)
              INFO("%s %d-ranks Multistream %d-Group Calls across %d streams\n",
                   isMultiProcess ? "MP" : "SP", totalRanks, numCollPerGroup, numStreamsPerGroup);

            auto const& priorityOrder = testBed.ev.GetGpuPriorityOrder();
            testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, priorityOrder),
                              numCollPerGroup, numStreamsPerGroup);

            // Assign collectives to streams round-robin (index modulo numStreamsPerGroup)
            options.redOp = ncclSum;
            for (int slot = 0; slot < numCollPerGroup; ++slot)
            {
              int const streamIdx = slot % numStreamsPerGroup;
              testBed.SetCollectiveArgs(ncclCollAllReduce, ncclFloat, numElements, numElements,
                                        options, slot, 0, -1, streamIdx);
            }

            // Allocate, run, check and tear down for this configuration
            testBed.AllocateMem(inPlace, useManagedMem);
            testBed.PrepareData();
            testBed.ExecuteCollectives();
            testBed.ValidateResults(isCorrect);
            testBed.DeallocateMem();
            testBed.DestroyComms();

            numStreamsPerGroup -= 3;
          }
        }
      }
    }
    testBed.Finalize();
  }

  // Test several independent group calls sharing one set of communicators.
  //
  // Three group calls of two collectives each are configured up front; group
  // calls flagged in useHipGraphList are executed once with the graph flag set
  // (stream capture) and later replayed via LaunchGraphs, while the others are
  // executed directly. The group calls are then run round-robin for
  // numIterations iterations, validating results after each run.
  TEST(GroupCall, MultiGroupCall)
  {
    TestBed testBed;

    // Configuration
    std::vector<std::vector<ncclFunc_t>> const groupCalls         = {{ncclCollAllReduce, ncclCollAllGather},
                                                                     {ncclCollAlltoAll, ncclCollGather},
                                                                     {ncclCollBroadcast, ncclCollReduceScatter}};
    std::vector<std::vector<int>>        const numElements        = {{1250, 1048576}, {384, 384 * 1024}, {1048576, 127}};
    std::vector<ncclDataType_t>          const testDataTypes      = {ncclFloat16, ncclFloat32, ncclBfloat16};
    std::vector<ncclRedOp_t>             const testRedOps         = {ncclSum, ncclProd, ncclMax};
    std::vector<int>                     const numCollsPerGroup   = {2, 2, 2};
    std::vector<int>                     const numStreamsPerGroup = {1, 1, 1};
    std::vector<bool>                    const useHipGraphList    = {true, false, true};
    bool                                 const inPlace            = false;
    bool                                 const useManagedMem      = false;
    bool                                 const useBlocking        = true;
    int                                  const numGroupCalls      = groupCalls.size();
    int                                  const numIterations      = 10;

    std::vector<ncclDataType_t> dataTypes;
    testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
    if (dataTypes.empty()) {
      GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
    }

    std::vector<ncclRedOp_t> redOps;
    testBed.GetSupportedRedOps(redOps, testRedOps);
    if (redOps.empty()) {
      GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
    }

    // NOTE(review): the helpers above presumably filter the requested lists
    // (see the skip messages), so the results may hold fewer entries than
    // numGroupCalls - confirm against TestBed. Both lists are non-empty here
    // (GTEST_SKIP returns), so wrapping the index cycles through whatever
    // remains instead of reading past the end, and is a no-op when nothing was
    // filtered out.
    int const numDataTypes = static_cast<int>(dataTypes.size());
    int const numRedOps    = static_cast<int>(redOps.size());

    bool isCorrect = true;
    for (int totalRanks : testBed.ev.GetNumGpusList())
    for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
    {
      // Test either single process all GPUs, or 1 process per GPU
      int const numProcesses     = isMultiProcess ? totalRanks : 1;

      // Initialize comms by specifying the # of group calls
      const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking);

      if (testBed.ev.showNames)
        INFO("%s %d-ranks GroupCall MultiGroupCall\n", isMultiProcess ? "MP" : "SP", totalRanks);

      // Configure and allocate every group call before any iteration runs
      for (int groupCallIdx = 0; groupCallIdx < numGroupCalls; ++groupCallIdx)
      {
        // Bind by reference: the list is only read, so no per-group copy is needed
        std::vector<ncclFunc_t> const& funcTypes = groupCalls[groupCallIdx];
        ncclDataType_t          const  dataType  = dataTypes[groupCallIdx % numDataTypes];

        OptionalColArgs options;
        options.redOp = redOps[groupCallIdx % numRedOps];
        options.root  = 0;

        for (int collIdx = 0; collIdx < numCollsPerGroup[groupCallIdx]; ++collIdx)
        {
          int numInputElements;
          int numOutputElements;
          CollectiveArgs::GetNumElementsForFuncType(funcTypes[collIdx],
                                                    numElements[groupCallIdx][collIdx],
                                                    totalRanks,
                                                    &numInputElements,
                                                    &numOutputElements);

          testBed.SetCollectiveArgs(funcTypes[collIdx],
                                    dataType,
                                    numInputElements,
                                    numOutputElements,
                                    options,
                                    collIdx,
                                    groupCallIdx);
        }

        testBed.AllocateMem(inPlace, useManagedMem, groupCallIdx);
        testBed.PrepareData(groupCallIdx);

        // Stream capture in advance for HIP graph enabled collective groups
        if (useHipGraphList[groupCallIdx])
        {
          testBed.ExecuteCollectives({}, groupCallIdx, useHipGraphList[groupCallIdx]);
        }
      }

      // Execute collectives based on groupIdx
      for (int i = 0; i < numIterations; ++i)
      {
        // Cycle through the group calls in round-robin order
        int const groupCallIdx = i % numGroupCalls;

        // Use graphs if enabled otherwise execute the collective
        if (useHipGraphList[groupCallIdx]) testBed.LaunchGraphs(groupCallIdx);
        else testBed.ExecuteCollectives({}, groupCallIdx);
        testBed.ValidateResults(isCorrect, groupCallIdx);
      }

      testBed.DeallocateMem();
      testBed.DestroyGraphs();
      testBed.DestroyComms();
    }
    testBed.Finalize();
  }
}