From 7d6e7bcd7d3a2b92da97fee52701c9bfef4271cc Mon Sep 17 00:00:00 2001 From: Ziyue Yang Date: Wed, 24 May 2023 22:41:05 +0800 Subject: [PATCH] revert npkit (#748) --- src/collectives/device/all_gather.h | 15 ++++ src/collectives/device/all_reduce.h | 45 ++++++++++++ src/collectives/device/broadcast.h | 15 ++++ src/collectives/device/sendrecv.h | 30 ++++++++ src/include/devcomm.h | 1 + src/include/npkit/npkit.h | 11 ++- src/include/npkit/npkit_event.h | 57 ++++++++------- src/init.cc | 1 + src/misc/npkit.cc | 106 ++++++---------------------- src/transport/net.cc | 8 +-- 10 files changed, 167 insertions(+), 122 deletions(-) diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 4bc9bc2868..dbfca9b082 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -33,6 +33,21 @@ namespace { int npKitCtxIdx = bid; #endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) + if (tid == 0) { + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives, 0, Proto, 0> prims diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 02634ec772..57444ab024 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -35,6 +35,21 @@ namespace { int npKitCtxIdx = bid; #endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) + if (tid == 0) { + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), @@ -231,6 +246,21 @@ namespace { int npKitCtxIdx = bid; #endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) + if (tid == 0) { + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), @@ -390,6 +420,21 @@ namespace { } #endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) + if (isNpKitThread) { + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) + if (isNpKitThread) { + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index 0576eaabf5..db3aed51e8 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -32,6 +32,21 @@ namespace { int npKitCtxIdx = bid; #endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) + if (tid == 0) { + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives, 0, Proto, 0> diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index ca50d1de83..abb71b7a6d 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -24,6 +24,21 @@ struct RunWork { int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P; #endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) + if (isNpKitThread) { + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) + if (isNpKitThread) { + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + if (args->peer == ncclShmem.comm.rank) { struct ncclWorkElemP2p* recvArgs = args-1; void* recvBuff = reinterpret_cast(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); @@ -105,6 +120,21 @@ struct RunWork { int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + 1; #endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) + if (isNpKitThread) { + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) + if (isNpKitThread) { + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + if (args->peer != ncclShmem.comm.rank) { void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); diff --git a/src/include/devcomm.h b/src/include/devcomm.h index 4fc208e284..c569a39ede 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -393,6 +393,7 @@ struct ncclDevComm { #if defined(ENABLE_NPKIT) NpKitEventCollectContext* npKitEventCollectContexts; + uint64_t* cpuTimestamp; #endif #ifdef ENABLE_COLLTRACE diff --git a/src/include/npkit/npkit.h b/src/include/npkit/npkit.h index ec3f518f9c..924dc71f85 100644 --- a/src/include/npkit/npkit.h +++ b/src/include/npkit/npkit.h @@ -49,7 +49,7 @@ class NpKit { static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id); - static uint64_t GetCpuTimestamp(); + static uint64_t *GetCpuTimestamp(); private: static void CpuTimestampUpdateThread(); @@ -65,13 +65,12 @@ class NpKit { static NpKitEventCollectContext* gpu_collect_contexts_; static NpKitEventCollectContext* cpu_collect_contexts_; - - static uint64_t base_cpu_timestamp_global_; - static uint64_t base_cpu_timestamp_local_; - static uint64_t base_gpu_timestamp_cpu_; - static uint64_t base_gpu_timestamp_gpu_; + static uint64_t* cpu_timestamp_; static uint64_t rank_; + + static std::thread* cpu_timestamp_update_thread_; + static volatile bool cpu_timestamp_update_thread_should_stop_; }; #endif diff --git a/src/include/npkit/npkit_event.h b/src/include/npkit/npkit_event.h index 22922e0c5a..fd1f940a88 100644 --- a/src/include/npkit/npkit_event.h +++ b/src/include/npkit/npkit_event.h @@ -65,36 +65,39 @@ #define NPKIT_EVENT_NET_RECV_ENTRY 0x31 #define NPKIT_EVENT_NET_RECV_EXIT 0x32 -#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY 0x33 -#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT 0x34 -#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY 0x35 -#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT 0x36 -#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x37 -#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x38 -#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x39 -#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT 0x3A -#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY 0x3B -#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT 0x3C +#define NPKIT_EVENT_TIME_SYNC_GPU 0x33 +#define NPKIT_EVENT_TIME_SYNC_CPU 0x34 -#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY 0x3D -#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT 0x3E -#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY 0x3F -#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT 0x40 +#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY 0x35 +#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT 0x36 +#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY 0x37 +#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT 0x38 +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x39 +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x3A +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x3B +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT 0x3C +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY 0x3D +#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT 0x3E -#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY 0x41 -#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT 0x42 -#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY 0x43 -#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT 0x44 -#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY 0x45 -#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT 0x46 +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY 0x3F +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT 0x40 +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY 0x41 +#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT 0x42 -#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY 0x47 -#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT 0x48 -#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY 0x49 -#define NPKIT_EVENT_SEND_RECV_SEND_EXIT 0x4A -#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY 0x4B -#define NPKIT_EVENT_SEND_RECV_RECV_EXIT 0x4C +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY 0x43 +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT 0x44 +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY 0x45 +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT 0x46 +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY 0x47 +#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT 0x48 -#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4D +#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY 0x49 +#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT 0x4A +#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY 0x4B +#define NPKIT_EVENT_SEND_RECV_SEND_EXIT 0x4C +#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY 0x4D +#define NPKIT_EVENT_SEND_RECV_RECV_EXIT 0x4E + +#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4F #endif diff --git a/src/init.cc b/src/init.cc index 0a1facbf9c..9b035f3faa 100644 --- a/src/init.cc +++ b/src/init.cc @@ -622,6 +622,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { // Init NPKit NCCLCHECK(NpKit::Init(comm->rank)); tmpCommAndChans.comm.npKitEventCollectContexts = NpKit::GetGpuEventCollectContexts(); + tmpCommAndChans.comm.cpuTimestamp = NpKit::GetCpuTimestamp(); #endif #ifdef ENABLE_PROFILING diff --git a/src/misc/npkit.cc b/src/misc/npkit.cc index 26a29b89d1..b6c86e81a6 100644 --- a/src/misc/npkit.cc +++ b/src/misc/npkit.cc @@ -18,29 +18,19 @@ NpKitEvent** NpKit::cpu_event_buffers_ = nullptr; NpKitEventCollectContext* NpKit::gpu_collect_contexts_ = nullptr; NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr; -uint64_t NpKit::base_cpu_timestamp_global_ = 0; -uint64_t NpKit::base_cpu_timestamp_local_ = 0; -uint64_t NpKit::base_gpu_timestamp_cpu_ = 0; -uint64_t NpKit::base_gpu_timestamp_gpu_ = 0; +uint64_t* NpKit::cpu_timestamp_ = nullptr; -__global__ void TimeCalibrationKernel(uint64_t *gpu_timestamp, uint64_t *cpu_timestamp_in, uint64_t *cpu_timestamp_out) { - uint64_t gpu_timestamp_1 = NPKIT_GET_GPU_TIMESTAMP(); - uint64_t cpu_timestamp = *cpu_timestamp_in; - uint64_t gpu_timestamp_2 = NPKIT_GET_GPU_TIMESTAMP(); - *gpu_timestamp = gpu_timestamp_1 + (gpu_timestamp_2 - gpu_timestamp_1) / 2; - *cpu_timestamp_out = cpu_timestamp; -} - -static volatile bool cpu_timestamp_update_thread_should_stop = false; -static volatile uint64_t* volatile_cpu_timestamp = nullptr; +std::thread* NpKit::cpu_timestamp_update_thread_ = nullptr; +volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false; void NpKit::CpuTimestampUpdateThread() { uint64_t init_system_clock = std::chrono::system_clock::now().time_since_epoch().count(); uint64_t init_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count(); uint64_t curr_steady_clock = 0; - while (!cpu_timestamp_update_thread_should_stop) { + volatile uint64_t* volatile_cpu_timestamp_ = cpu_timestamp_; + while (!cpu_timestamp_update_thread_should_stop_) { curr_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count(); - *volatile_cpu_timestamp = init_system_clock + (curr_steady_clock - init_steady_clock); + *volatile_cpu_timestamp_ = init_system_clock + (curr_steady_clock - init_steady_clock); } } @@ -67,40 +57,12 @@ ncclResult_t NpKit::Init(int rank) { cpu_collect_contexts_[i] = ctx; } - // Calibrate CPU timestamp - uint64_t cpu_timestamp_1 = std::chrono::steady_clock::now().time_since_epoch().count(); - base_cpu_timestamp_global_ = std::chrono::system_clock::now().time_since_epoch().count(); - uint64_t cpu_timestamp_2 = std::chrono::steady_clock::now().time_since_epoch().count(); - base_cpu_timestamp_local_ = cpu_timestamp_1 + (cpu_timestamp_2 - cpu_timestamp_1) / 2; - - // Calibrate GPU timestamp - uint64_t *cpu_timestamp_in = nullptr; - uint64_t *cpu_timestamp_out = nullptr; - uint64_t *gpu_timestamp = nullptr; - NCCLCHECK(ncclCudaHostCalloc(&cpu_timestamp_in, 1)); - NCCLCHECK(ncclCudaCalloc(&cpu_timestamp_out, 1)); - NCCLCHECK(ncclCudaCalloc(&gpu_timestamp, 1)); - - cpu_timestamp_update_thread_should_stop = false; - volatile_cpu_timestamp = cpu_timestamp_in; - *volatile_cpu_timestamp = 0; - - hipStream_t timingStream; - CUDACHECK(hipStreamCreateWithFlags(&timingStream, hipStreamNonBlocking)); - - std::thread cpu_timestamp_update_thread(CpuTimestampUpdateThread); - while (!*volatile_cpu_timestamp); - TimeCalibrationKernel<<<1, 1, 0, timingStream>>>(gpu_timestamp, cpu_timestamp_in, cpu_timestamp_out); - CUDACHECK(hipStreamSynchronize(timingStream)); - cpu_timestamp_update_thread_should_stop = true; - cpu_timestamp_update_thread.join(); - - NCCLCHECK(ncclCudaMemcpy(&base_gpu_timestamp_cpu_, cpu_timestamp_out, 1)); - NCCLCHECK(ncclCudaMemcpy(&base_gpu_timestamp_gpu_, gpu_timestamp, 1)); - - NCCLCHECK(ncclCudaHostFree(cpu_timestamp_in)); - CUDACHECK(hipFree(cpu_timestamp_out)); - CUDACHECK(hipFree(gpu_timestamp)); + // Init timestamp + NCCLCHECK(ncclCudaHostCalloc(&cpu_timestamp_, 1)); + volatile uint64_t* volatile_cpu_timestamp = cpu_timestamp_; + *volatile_cpu_timestamp = std::chrono::system_clock::now().time_since_epoch().count(); + cpu_timestamp_update_thread_should_stop_ = false; + cpu_timestamp_update_thread_ = new std::thread(CpuTimestampUpdateThread); return ncclSuccess; } @@ -164,45 +126,16 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) { gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length()); gpu_clock_rate_file.close(); - // Dump clock calibration info - dump_file_path = dump_dir; - dump_file_path += "/clock_calibration_cpu_global_rank_"; - dump_file_path += std::to_string(rank_); - std::string base_cpu_timestamp_global_str = std::to_string(base_cpu_timestamp_global_); - auto base_cpu_timestamp_global_file = std::fstream(dump_file_path, std::ios::out); - base_cpu_timestamp_global_file.write(base_cpu_timestamp_global_str.c_str(), base_cpu_timestamp_global_str.length()); - base_cpu_timestamp_global_file.close(); - - dump_file_path = dump_dir; - dump_file_path += "/clock_calibration_cpu_local_rank_"; - dump_file_path += std::to_string(rank_); - std::string base_cpu_timestamp_local_str = std::to_string(base_cpu_timestamp_local_); - auto base_cpu_timestamp_local_file = std::fstream(dump_file_path, std::ios::out); - base_cpu_timestamp_local_file.write(base_cpu_timestamp_local_str.c_str(), base_cpu_timestamp_local_str.length()); - base_cpu_timestamp_local_file.close(); - - dump_file_path = dump_dir; - dump_file_path += "/clock_calibration_gpu_cpu_rank_"; - dump_file_path += std::to_string(rank_); - std::string base_gpu_timestamp_cpu_str = std::to_string(base_gpu_timestamp_cpu_); - auto base_gpu_timestamp_cpu_file = std::fstream(dump_file_path, std::ios::out); - base_gpu_timestamp_cpu_file.write(base_gpu_timestamp_cpu_str.c_str(), base_gpu_timestamp_cpu_str.length()); - base_gpu_timestamp_cpu_file.close(); - - dump_file_path = dump_dir; - dump_file_path += "/clock_calibration_gpu_gpu_rank_"; - dump_file_path += std::to_string(rank_); - std::string base_gpu_timestamp_gpu_str = std::to_string(base_gpu_timestamp_gpu_); - auto base_gpu_timestamp_gpu_file = std::fstream(dump_file_path, std::ios::out); - base_gpu_timestamp_gpu_file.write(base_gpu_timestamp_gpu_str.c_str(), base_gpu_timestamp_gpu_str.length()); - base_gpu_timestamp_gpu_file.close(); - return ncclSuccess; } ncclResult_t NpKit::Shutdown() { uint64_t i = 0; + // Stop CPU timestamp updating thread + cpu_timestamp_update_thread_should_stop_ = true; + cpu_timestamp_update_thread_->join(); + // Free CPU event data structures for (i = 0; i < kNumCpuEventBuffers; i++) { free(cpu_event_buffers_[i]); @@ -217,6 +150,9 @@ ncclResult_t NpKit::Shutdown() { free(gpu_event_buffers_); CUDACHECK(hipFree(gpu_collect_contexts_)); + // Free timestamp + NCCLCHECK(ncclCudaHostFree(cpu_timestamp_)); + return ncclSuccess; } @@ -236,6 +172,6 @@ void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t } } -uint64_t NpKit::GetCpuTimestamp() { - return std::chrono::steady_clock::now().time_since_epoch().count(); +uint64_t* NpKit::GetCpuTimestamp() { + return cpu_timestamp_; } diff --git a/src/transport/net.cc b/src/transport/net.cc index ec7a2ffaba..a8fafcc10f 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -1022,7 +1022,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg size, #endif uint64_t(sub->requests+buffSlot)/sizeof(void*), - NpKit::GetCpuTimestamp(), sub->channelId); + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); #if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) g_npkit_net_poll_cnt = 0; #endif @@ -1056,7 +1056,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg sub->npKitSizesFifo[buffSlot], #endif uint64_t(sub->requests+buffSlot)/sizeof(void*), - NpKit::GetCpuTimestamp(), sub->channelId); + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); #if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) g_npkit_net_poll_cnt = 0; #endif @@ -1183,7 +1183,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg sizes[i], #endif uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*), - NpKit::GetCpuTimestamp(), sub->channelId); + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); #if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) g_npkit_net_poll_cnt = 0; #endif @@ -1224,7 +1224,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg sizes[i], #endif uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*), - NpKit::GetCpuTimestamp(), sub->channelId); + *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId); #if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) g_npkit_net_poll_cnt = 0; #endif