diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt index ea901485c1..0eb07dd3e6 100644 --- a/projects/rccl/CMakeLists.txt +++ b/projects/rccl/CMakeLists.txt @@ -28,7 +28,8 @@ option(COLLTRACE "Collective Trace Option" option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF) option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON) option(ENABLE_MSCCLPP "Enable MSCCL++" ON) -option(ENABLE_MSCCLPP_CLIP "Enable MSCCL++" OFF) +option(ENABLE_MSCCLPP_CLIP "Enable MSCCL++ CLIP" OFF) +option(ENABLE_MSCCLPP_EXECUTOR "Enable MSCCL++ Executor" OFF) option(ENABLE_IFC "Enable indirect function call" OFF) option(INSTALL_DEPENDENCIES "Force install dependencies" OFF) option(ROCTX "Enable ROCTX" ON) diff --git a/projects/rccl/cmake/MSCCLPP.cmake b/projects/rccl/cmake/MSCCLPP.cmake index 4efe57e8e3..51d07261a0 100644 --- a/projects/rccl/cmake/MSCCLPP.cmake +++ b/projects/rccl/cmake/MSCCLPP.cmake @@ -53,47 +53,47 @@ if(ENABLE_MSCCLPP) ) endif() - execute_process( - COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/reg-fix.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/reg-fix.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/no-cache.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/no-cache.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( + execute_process( COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/device-flag.patch WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) @@ -103,6 +103,11 @@ if(ENABLE_MSCCLPP) WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) + execute_process( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/disable-executor.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) + set(CMAKE_INHERITED_ARGS "") set(CMAKE_ARGS_LIST "CMAKE_PREFIX_PATH;CMAKE_INSTALL_RPATH_USE_LINK_PATH;HIP_COMPILER") foreach(arg IN LISTS CMAKE_ARGS_LIST) @@ -135,7 +140,7 @@ if(ENABLE_MSCCLPP) #GIT_TAG 4ee15b7ad085daaf74349d4c49c9b8480d28f0dc INSTALL_DIR ${MSCCLPP_ROOT} LIST_SEPARATOR % - CMAKE_ARGS "-DGPU_TARGETS=${MSCCLPP_GPU_TARGETS}" -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DMSCCLPP_CLIP_ENABLED=${ENABLE_MSCCLPP_CLIP} -DCMAKE_INSTALL_PREFIX= -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INHERITED_ARGS}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE} LOG_DOWNLOAD FALSE + CMAKE_ARGS "-DGPU_TARGETS=${MSCCLPP_GPU_TARGETS}" -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DMSCCLPP_CLIP_ENABLED=${ENABLE_MSCCLPP_CLIP} -DMSCCLPP_ENABLE_EXECUTOR=${ENABLE_MSCCLPP_EXECUTOR} -DCMAKE_INSTALL_PREFIX= -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INHERITED_ARGS}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE} LOG_DOWNLOAD FALSE LOG_CONFIGURE FALSE LOG_BUILD FALSE LOG_INSTALL FALSE @@ -145,56 +150,60 @@ if(ENABLE_MSCCLPP) find_package(mscclpp_nccl REQUIRED) + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/disable-executor.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( + execute_process( COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/remove-clip.patch WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) - execute_process( + execute_process( COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/device-flag.patch WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + ) - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/no-cache.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) - - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/reg-fix.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/no-cache.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/reg-fix.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) + + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) #endif() diff --git a/projects/rccl/ext-src/disable-executor.patch b/projects/rccl/ext-src/disable-executor.patch new file mode 100644 index 0000000000..8607ad1498 --- /dev/null +++ b/projects/rccl/ext-src/disable-executor.patch @@ -0,0 +1,368 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index a94b634..fee3bb2 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -22,6 +22,7 @@ option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF) + option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF) + option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF) + option(MSCCLPP_NPKIT_FLAGS "Enable NPKIT" OFF) ++option(MSCCLPP_ENABLE_EXECUTOR "Enable JSON Executor" OFF) + + if(MSCCLPP_BYPASS_GPU_CHECK) + if(MSCCLPP_USE_CUDA) +@@ -100,15 +101,21 @@ find_package(NUMA REQUIRED) + find_package(Threads REQUIRED) + + include(FetchContent) +-FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz) +-FetchContent_MakeAvailable(json) ++if(MSCCLPP_ENABLE_EXECUTOR) ++ FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz) ++ FetchContent_MakeAvailable(json) ++endif() + + add_library(mscclpp_obj OBJECT) + target_include_directories(mscclpp_obj + SYSTEM PRIVATE + ${GPU_INCLUDE_DIRS} + ${NUMA_INCLUDE_DIRS}) +-target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads dl) ++if(MSCCLPP_ENABLE_EXECUTOR) ++ target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads dl) ++else() ++ target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads dl) ++endif() + if(IBVERBS_FOUND) + target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS}) + target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES}) +diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu +index f91d15e..d11f4e5 100644 +--- a/apps/nccl/src/nccl.cu ++++ b/apps/nccl/src/nccl.cu +@@ -6,7 +6,9 @@ + #include + #include + #include ++#ifdef MSCCLPP_ENABLE_EXECUTOR + #include ++#endif + #include + #include + #include +@@ -54,10 +56,12 @@ struct planKey { + bool isInPlace; + }; + ++#ifdef MSCCLPP_ENABLE_EXECUTOR + struct executionPlanInstance { + planKey key; + std::shared_ptr plan; + }; ++#endif + + namespace std { + template <> +@@ -77,8 +81,10 @@ struct ncclComm { + std::shared_ptr comm; + std::vector> connections; + std::vector> smSemaphores; ++#ifdef MSCCLPP_ENABLE_EXECUTOR + std::shared_ptr executor; + std::unordered_map> executionPlans; ++#endif + + std::unordered_map channelInInfos; + std::unordered_map channelOutInfos; +@@ -164,12 +170,14 @@ static std::vector setupSmChannels(ncclComm_t comm, + return channels; + } + ++#ifdef MSCCLPP_ENABLE_EXECUTOR + static std::pair loadExecutionPlan(const std::string& filename) { + std::shared_ptr plan = std::make_shared(filename); + std::string collective = plan->collective(); + planKey key{plan->minMessageSize(), plan->maxMessageSize(), plan->isInPlace()}; + return std::make_pair(collective, executionPlanInstance{key, plan}); + } ++#endif + + static std::shared_ptr> setupSmChannelDeviceHandles( + const std::vector& smChannels) { +@@ -409,12 +417,15 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI + ncclComm* commPtr = new ncclComm(); + + commPtr->comm = mscclppComm; ++#ifdef MSCCLPP_ENABLE_EXECUTOR + commPtr->executor = std::make_shared(mscclppComm); ++#endif + + // FallBack for single node + if (mscclppComm->bootstrap()->getNranks() == mscclppComm->bootstrap()->getNranksPerNode()) + ncclCommInitRankFallbackSingleNode(commPtr, mscclppComm, rank); + ++#ifdef MSCCLPP_ENABLE_EXECUTOR + const std::string& collectiveDir = mscclpp::env()->executionPlanDir; + if (collectiveDir != "") { + if (!std::filesystem::is_directory(collectiveDir)) { +@@ -428,6 +439,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI + } + } + } ++#endif + + *comm = commPtr; + #if defined(ENABLE_NPKIT) +@@ -623,6 +635,7 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t + return ncclInvalidArgument; + } + ++#ifdef MSCCLPP_ENABLE_EXECUTOR + int rank = comm->comm->bootstrap()->getRank(); + + std::vector& plans = comm->executionPlans["broadcast"]; +@@ -663,6 +676,9 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t + } + + return ncclSuccess; ++#endif ++ ++ return ncclBroadcastFallback(sendbuff, recvbuff, count, datatype, root, comm, stream); + } + + NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, +@@ -675,6 +691,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t + return ncclInvalidArgument; + } + ++#ifdef MSCCLPP_ENABLE_EXECUTOR + // Declarating variables + size_t bytes = count * ncclTypeSize(datatype); + int rank = comm->comm->bootstrap()->getRank(); +@@ -716,6 +733,9 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t + } + + return ncclSuccess; ++#endif ++ ++ return ncclAllReduceFallback(sendbuff, recvbuff, count, datatype, reductionOperation, comm, stream); + } + + NCCL_API ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm_t, +@@ -735,6 +755,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t + return ncclInvalidArgument; + } + ++#ifdef MSCCLPP_ENABLE_EXECUTOR + int rank = comm->comm->bootstrap()->getRank(); + int nRank = comm->comm->bootstrap()->getNranks(); + +@@ -775,6 +796,9 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t + } + + return ncclSuccess; ++#endif ++ ++ return ncclAllGatherFallback(sendbuff, recvbuff, sendcount, datatype, comm, stream); + } + + NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { +diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp +index 6708628..3460102 100644 +--- a/include/mscclpp/env.hpp ++++ b/include/mscclpp/env.hpp +@@ -27,7 +27,9 @@ class Env { + const std::string socketFamily; + const std::string socketIfname; + const std::string commId; ++#ifdef MSCCLPP_ENABLE_EXECUTOR + const std::string executionPlanDir; ++#endif + const std::string npkitDumpDir; + const bool cudaIpcUseDefaultStream; + +diff --git a/include/mscclpp/errors.hpp b/include/mscclpp/errors.hpp +index 8d3fde4..a797460 100644 +--- a/include/mscclpp/errors.hpp ++++ b/include/mscclpp/errors.hpp +@@ -16,7 +16,9 @@ enum class ErrorCode { + InvalidUsage, // The function was used incorrectly. + Timeout, // The operation timed out. + Aborted, // The operation was aborted. ++#ifdef MSCCLPP_ENABLE_EXECUTOR + ExecutorError, // An error occurred in the MSCCL++ executor. ++#endif + }; + + /// Convert an error code to a string. +diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp +index 6848688..84d4d90 100644 +--- a/include/mscclpp/executor.hpp ++++ b/include/mscclpp/executor.hpp +@@ -1,6 +1,8 @@ + // Copyright (c) Microsoft Corporation. + // Licensed under the MIT license. + ++#ifdef MSCCLPP_ENABLE_EXECUTOR ++ + #ifndef MSCCLPP_EXECUTOR_HPP_ + #define MSCCLPP_EXECUTOR_HPP_ + +@@ -59,3 +61,5 @@ class Executor { + } // namespace mscclpp + + #endif // MSCCLPP_EXECUTOR_HPP_ ++ ++#endif +diff --git a/src/env.cpp b/src/env.cpp +index 625de0a..5d6bae0 100644 +--- a/src/env.cpp ++++ b/src/env.cpp +@@ -59,7 +59,9 @@ Env::Env() + socketFamily(readEnv("MSCCLPP_SOCKET_FAMILY", "")), + socketIfname(readEnv("MSCCLPP_SOCKET_IFNAME", "")), + commId(readEnv("MSCCLPP_COMM_ID", "")), ++#ifdef MSCCLPP_ENABLE_EXECUTOR + executionPlanDir(readEnv("MSCCLPP_EXECUTION_PLAN_DIR", "")), ++#endif + npkitDumpDir(readEnv("MSCCLPP_NPKIT_DUMP_DIR", "")), + cudaIpcUseDefaultStream(readEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)) {} + +@@ -77,7 +79,9 @@ std::shared_ptr env() { + logEnv("MSCCLPP_SOCKET_FAMILY", globalEnv->socketFamily); + logEnv("MSCCLPP_SOCKET_IFNAME", globalEnv->socketIfname); + logEnv("MSCCLPP_COMM_ID", globalEnv->commId); ++#ifdef MSCCLPP_ENABLE_EXECUTOR + logEnv("MSCCLPP_EXECUTION_PLAN_DIR", globalEnv->executionPlanDir); ++#endif + logEnv("MSCCLPP_NPKIT_DUMP_DIR", globalEnv->npkitDumpDir); + logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream); + } +diff --git a/src/errors.cc b/src/errors.cc +index fbc7a57..afdb181 100644 +--- a/src/errors.cc ++++ b/src/errors.cc +@@ -21,8 +21,10 @@ std::string errorToString(enum ErrorCode error) { + return "Timeout"; + case ErrorCode::Aborted: + return "Aborted"; ++#ifdef MSCCLPP_ENABLE_EXECUTOR + case ErrorCode::ExecutorError: + return "ExecutorError"; ++#endif + default: + return "UnknownError"; + } +diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu +index a60317c..0ae77b8 100644 +--- a/src/executor/execution_kernel.cu ++++ b/src/executor/execution_kernel.cu +@@ -1,6 +1,8 @@ + // Copyright (c) Microsoft Corporation. + // Licensed under the MIT license. + ++#ifdef MSCCLPP_ENABLE_EXECUTOR ++ + #include "execution_kernel.hpp" + + #if defined(MSCCLPP_DEVICE_CUDA) +@@ -74,3 +76,5 @@ template void ExecutionKernel::launchKernel(int rank, int nthreadbloc + cudaStream_t stream, uint32_t flag); + } // namespace mscclpp + #endif ++ ++#endif +diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc +index 56c881b..4c9bc16 100644 +--- a/src/executor/execution_plan.cc ++++ b/src/executor/execution_plan.cc +@@ -1,6 +1,8 @@ + // Copyright (c) Microsoft Corporation. + // Licensed under the MIT license. + ++#ifdef MSCCLPP_ENABLE_EXECUTOR ++ + #include "execution_plan.hpp" + + #include +@@ -610,3 +612,5 @@ size_t ExecutionPlan::maxMessageSize() const { return this->impl_->maxMessageSiz + bool ExecutionPlan::isInPlace() const { return this->impl_->isInPlace; } + + } // namespace mscclpp ++ ++#endif +diff --git a/src/executor/executor.cc b/src/executor/executor.cc +index 944ddb2..191be60 100644 +--- a/src/executor/executor.cc ++++ b/src/executor/executor.cc +@@ -1,6 +1,8 @@ + // Copyright (c) Microsoft Corporation. + // Licensed under the MIT license. + ++#ifdef MSCCLPP_ENABLE_EXECUTOR ++ + #include + #include + #include +@@ -451,3 +453,5 @@ void Executor::execute(int rank, void* sendbuff, void* recvbuff, size_t sendBuff + Executor::~Executor() = default; + + } // namespace mscclpp ++ ++#endif +diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp +index f6ed215..0bfb613 100644 +--- a/src/include/execution_common.hpp ++++ b/src/include/execution_common.hpp +@@ -1,6 +1,8 @@ + // Copyright (c) Microsoft Corporation. + // Licensed under the MIT license. + ++#ifdef MSCCLPP_ENABLE_EXECUTOR ++ + #ifndef MSCCLPP_EXECUTION_COMMON_HPP_ + #define MSCCLPP_EXECUTION_COMMON_HPP_ + +@@ -107,3 +109,5 @@ struct __attribute__((aligned(16))) DeviceExecutionPlan { + } // namespace mscclpp + + #endif // MSCCLPP_EXECUTION_COMMON_HPP_ ++ ++#endif +diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp +index 98bed37..b724dba 100644 +--- a/src/include/execution_kernel.hpp ++++ b/src/include/execution_kernel.hpp +@@ -1,6 +1,8 @@ + // Copyright (c) Microsoft Corporation. + // Licensed under the MIT license. + ++#ifdef MSCCLPP_ENABLE_EXECUTOR ++ + #ifndef MSCCLPP_EXECUTION_KERNEL_HPP_ + #define MSCCLPP_EXECUTION_KERNEL_HPP_ + +@@ -687,3 +689,5 @@ class ExecutionKernel { + } // namespace mscclpp + + #endif // MSCCLPP_EXECUTION_KERNEL_HPP_ ++ ++#endif +diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp +index 080a768..ee36ad9 100644 +--- a/src/include/execution_plan.hpp ++++ b/src/include/execution_plan.hpp +@@ -1,6 +1,8 @@ + // Copyright (c) Microsoft Corporation. + // Licensed under the MIT license. + ++#ifdef MSCCLPP_ENABLE_EXECUTOR ++ + #ifndef MSCCLPP_EXECUTOR_PLAN_HPP_ + #define MSCCLPP_EXECUTOR_PLAN_HPP_ + +@@ -129,3 +131,5 @@ struct ExecutionPlan::Impl { + } // namespace mscclpp + + #endif // MSCCLPP_EXECUTOR_PLAN_HPP_ ++ ++#endif