diff --git a/CMakeLists.txt b/CMakeLists.txt index a94b634..fee3bb2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF) option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF) option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF) option(MSCCLPP_NPKIT_FLAGS "Enable NPKIT" OFF) +option(MSCCLPP_ENABLE_EXECUTOR "Enable JSON Executor" OFF) if(MSCCLPP_BYPASS_GPU_CHECK) if(MSCCLPP_USE_CUDA) @@ -100,15 +101,21 @@ find_package(NUMA REQUIRED) find_package(Threads REQUIRED) include(FetchContent) -FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz) -FetchContent_MakeAvailable(json) +if(MSCCLPP_ENABLE_EXECUTOR) + FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz) + FetchContent_MakeAvailable(json) +endif() add_library(mscclpp_obj OBJECT) target_include_directories(mscclpp_obj SYSTEM PRIVATE ${GPU_INCLUDE_DIRS} ${NUMA_INCLUDE_DIRS}) -target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads dl) +if(MSCCLPP_ENABLE_EXECUTOR) + target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads dl) +else() + target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads dl) +endif() if(IBVERBS_FOUND) target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS}) target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES}) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index f91d15e..d11f4e5 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -6,7 +6,9 @@ #include #include #include +#ifdef MSCCLPP_ENABLE_EXECUTOR #include +#endif #include #include #include @@ -54,10 +56,12 @@ struct planKey { bool isInPlace; }; +#ifdef MSCCLPP_ENABLE_EXECUTOR struct executionPlanInstance { planKey key; std::shared_ptr plan; }; +#endif namespace std { template <> @@ -77,8 +81,10 @@ struct ncclComm { std::shared_ptr comm; std::vector> connections; std::vector> smSemaphores; +#ifdef MSCCLPP_ENABLE_EXECUTOR std::shared_ptr executor; std::unordered_map> executionPlans; +#endif std::unordered_map channelInInfos; std::unordered_map channelOutInfos; @@ -164,12 +170,14 @@ static std::vector setupSmChannels(ncclComm_t comm, return channels; } +#ifdef MSCCLPP_ENABLE_EXECUTOR static std::pair loadExecutionPlan(const std::string& filename) { std::shared_ptr plan = std::make_shared(filename); std::string collective = plan->collective(); planKey key{plan->minMessageSize(), plan->maxMessageSize(), plan->isInPlace()}; return std::make_pair(collective, executionPlanInstance{key, plan}); } +#endif static std::shared_ptr> setupSmChannelDeviceHandles( const std::vector& smChannels) { @@ -409,12 +417,15 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI ncclComm* commPtr = new ncclComm(); commPtr->comm = mscclppComm; +#ifdef MSCCLPP_ENABLE_EXECUTOR commPtr->executor = std::make_shared(mscclppComm); +#endif // FallBack for single node if (mscclppComm->bootstrap()->getNranks() == mscclppComm->bootstrap()->getNranksPerNode()) ncclCommInitRankFallbackSingleNode(commPtr, mscclppComm, rank); +#ifdef MSCCLPP_ENABLE_EXECUTOR const std::string& collectiveDir = mscclpp::env()->executionPlanDir; if (collectiveDir != "") { if (!std::filesystem::is_directory(collectiveDir)) { @@ -428,6 +439,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI } } } +#endif *comm = commPtr; #if defined(ENABLE_NPKIT) @@ -623,6 +635,7 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t return ncclInvalidArgument; } +#ifdef MSCCLPP_ENABLE_EXECUTOR int rank = comm->comm->bootstrap()->getRank(); std::vector& plans = comm->executionPlans["broadcast"]; @@ -663,6 +676,9 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t } return ncclSuccess; +#endif + + return ncclBroadcastFallback(sendbuff, recvbuff, count, datatype, root, comm, stream); } NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, @@ -675,6 +691,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t return ncclInvalidArgument; } +#ifdef MSCCLPP_ENABLE_EXECUTOR // Declarating variables size_t bytes = count * ncclTypeSize(datatype); int rank = comm->comm->bootstrap()->getRank(); @@ -716,6 +733,9 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t } return ncclSuccess; +#endif + + return ncclAllReduceFallback(sendbuff, recvbuff, count, datatype, reductionOperation, comm, stream); } NCCL_API ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm_t, @@ -735,6 +755,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t return ncclInvalidArgument; } +#ifdef MSCCLPP_ENABLE_EXECUTOR int rank = comm->comm->bootstrap()->getRank(); int nRank = comm->comm->bootstrap()->getNranks(); @@ -775,6 +796,9 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t } return ncclSuccess; +#endif + + return ncclAllGatherFallback(sendbuff, recvbuff, sendcount, datatype, comm, stream); } NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index 6708628..3460102 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -27,7 +27,9 @@ class Env { const std::string socketFamily; const std::string socketIfname; const std::string commId; +#ifdef MSCCLPP_ENABLE_EXECUTOR const std::string executionPlanDir; +#endif const std::string npkitDumpDir; const bool cudaIpcUseDefaultStream; diff --git a/include/mscclpp/errors.hpp b/include/mscclpp/errors.hpp index 8d3fde4..a797460 100644 --- a/include/mscclpp/errors.hpp +++ b/include/mscclpp/errors.hpp @@ -16,7 +16,9 @@ enum class ErrorCode { InvalidUsage, // The function was used incorrectly. Timeout, // The operation timed out. Aborted, // The operation was aborted. +#ifdef MSCCLPP_ENABLE_EXECUTOR ExecutorError, // An error occurred in the MSCCL++ executor. +#endif }; /// Convert an error code to a string. diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index 6848688..84d4d90 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#ifdef MSCCLPP_ENABLE_EXECUTOR + #ifndef MSCCLPP_EXECUTOR_HPP_ #define MSCCLPP_EXECUTOR_HPP_ @@ -59,3 +61,5 @@ class Executor { } // namespace mscclpp #endif // MSCCLPP_EXECUTOR_HPP_ + +#endif diff --git a/src/env.cpp b/src/env.cpp index 625de0a..5d6bae0 100644 --- a/src/env.cpp +++ b/src/env.cpp @@ -59,7 +59,9 @@ Env::Env() socketFamily(readEnv("MSCCLPP_SOCKET_FAMILY", "")), socketIfname(readEnv("MSCCLPP_SOCKET_IFNAME", "")), commId(readEnv("MSCCLPP_COMM_ID", "")), +#ifdef MSCCLPP_ENABLE_EXECUTOR executionPlanDir(readEnv("MSCCLPP_EXECUTION_PLAN_DIR", "")), +#endif npkitDumpDir(readEnv("MSCCLPP_NPKIT_DUMP_DIR", "")), cudaIpcUseDefaultStream(readEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)) {} @@ -77,7 +79,9 @@ std::shared_ptr env() { logEnv("MSCCLPP_SOCKET_FAMILY", globalEnv->socketFamily); logEnv("MSCCLPP_SOCKET_IFNAME", globalEnv->socketIfname); logEnv("MSCCLPP_COMM_ID", globalEnv->commId); +#ifdef MSCCLPP_ENABLE_EXECUTOR logEnv("MSCCLPP_EXECUTION_PLAN_DIR", globalEnv->executionPlanDir); +#endif logEnv("MSCCLPP_NPKIT_DUMP_DIR", globalEnv->npkitDumpDir); logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream); } diff --git a/src/errors.cc b/src/errors.cc index fbc7a57..afdb181 100644 --- a/src/errors.cc +++ b/src/errors.cc @@ -21,8 +21,10 @@ std::string errorToString(enum ErrorCode error) { return "Timeout"; case ErrorCode::Aborted: return "Aborted"; +#ifdef MSCCLPP_ENABLE_EXECUTOR case ErrorCode::ExecutorError: return "ExecutorError"; +#endif default: return "UnknownError"; } diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index a60317c..0ae77b8 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#ifdef MSCCLPP_ENABLE_EXECUTOR + #include "execution_kernel.hpp" #if defined(MSCCLPP_DEVICE_CUDA) @@ -74,3 +76,5 @@ template void ExecutionKernel::launchKernel(int rank, int nthreadbloc cudaStream_t stream, uint32_t flag); } // namespace mscclpp #endif + +#endif diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 56c881b..4c9bc16 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#ifdef MSCCLPP_ENABLE_EXECUTOR + #include "execution_plan.hpp" #include @@ -610,3 +612,5 @@ size_t ExecutionPlan::maxMessageSize() const { return this->impl_->maxMessageSiz bool ExecutionPlan::isInPlace() const { return this->impl_->isInPlace; } } // namespace mscclpp + +#endif diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 944ddb2..191be60 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#ifdef MSCCLPP_ENABLE_EXECUTOR + #include #include #include @@ -451,3 +453,5 @@ void Executor::execute(int rank, void* sendbuff, void* recvbuff, size_t sendBuff Executor::~Executor() = default; } // namespace mscclpp + +#endif diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index f6ed215..0bfb613 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#ifdef MSCCLPP_ENABLE_EXECUTOR + #ifndef MSCCLPP_EXECUTION_COMMON_HPP_ #define MSCCLPP_EXECUTION_COMMON_HPP_ @@ -107,3 +109,5 @@ struct __attribute__((aligned(16))) DeviceExecutionPlan { } // namespace mscclpp #endif // MSCCLPP_EXECUTION_COMMON_HPP_ + +#endif diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 98bed37..b724dba 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#ifdef MSCCLPP_ENABLE_EXECUTOR + #ifndef MSCCLPP_EXECUTION_KERNEL_HPP_ #define MSCCLPP_EXECUTION_KERNEL_HPP_ @@ -687,3 +689,5 @@ class ExecutionKernel { } // namespace mscclpp #endif // MSCCLPP_EXECUTION_KERNEL_HPP_ + +#endif diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 080a768..ee36ad9 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#ifdef MSCCLPP_ENABLE_EXECUTOR + #ifndef MSCCLPP_EXECUTOR_PLAN_HPP_ #define MSCCLPP_EXECUTOR_PLAN_HPP_ @@ -129,3 +131,5 @@ struct ExecutionPlan::Impl { } // namespace mscclpp #endif // MSCCLPP_EXECUTOR_PLAN_HPP_ + +#endif