revert npkit (#748)
This commit is contained in:
@@ -33,6 +33,21 @@ namespace {
|
||||
int npKitCtxIdx = bid;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (tid == 0) {
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
T *inputBuf = (T*)args->sendbuff;
|
||||
T *outputBuf = (T*)args->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
|
||||
@@ -35,6 +35,21 @@ namespace {
|
||||
int npKitCtxIdx = bid;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (tid == 0) {
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
@@ -231,6 +246,21 @@ namespace {
|
||||
int npKitCtxIdx = bid;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (tid == 0) {
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
@@ -390,6 +420,21 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (isNpKitThread) {
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
|
||||
@@ -32,6 +32,21 @@ namespace {
|
||||
int npKitCtxIdx = bid;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (tid == 0) {
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
T *inputBuf = (T*)args->sendbuff;
|
||||
T *outputBuf = (T*)args->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
|
||||
@@ -24,6 +24,21 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (isNpKitThread) {
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (args->peer == ncclShmem.comm.rank) {
|
||||
struct ncclWorkElemP2p* recvArgs = args-1;
|
||||
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
|
||||
@@ -105,6 +120,21 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + 1;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (isNpKitThread) {
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (args->peer != ncclShmem.comm.rank) {
|
||||
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
|
||||
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
|
||||
|
||||
@@ -393,6 +393,7 @@ struct ncclDevComm {
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
NpKitEventCollectContext* npKitEventCollectContexts;
|
||||
uint64_t* cpuTimestamp;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
|
||||
@@ -49,7 +49,7 @@ class NpKit {
|
||||
|
||||
static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id);
|
||||
|
||||
static uint64_t GetCpuTimestamp();
|
||||
static uint64_t *GetCpuTimestamp();
|
||||
|
||||
private:
|
||||
static void CpuTimestampUpdateThread();
|
||||
@@ -65,13 +65,12 @@ class NpKit {
|
||||
|
||||
static NpKitEventCollectContext* gpu_collect_contexts_;
|
||||
static NpKitEventCollectContext* cpu_collect_contexts_;
|
||||
|
||||
static uint64_t base_cpu_timestamp_global_;
|
||||
static uint64_t base_cpu_timestamp_local_;
|
||||
static uint64_t base_gpu_timestamp_cpu_;
|
||||
static uint64_t base_gpu_timestamp_gpu_;
|
||||
static uint64_t* cpu_timestamp_;
|
||||
|
||||
static uint64_t rank_;
|
||||
|
||||
static std::thread* cpu_timestamp_update_thread_;
|
||||
static volatile bool cpu_timestamp_update_thread_should_stop_;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -65,36 +65,39 @@
|
||||
#define NPKIT_EVENT_NET_RECV_ENTRY 0x31
|
||||
#define NPKIT_EVENT_NET_RECV_EXIT 0x32
|
||||
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY 0x33
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT 0x34
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY 0x35
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT 0x36
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x37
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x38
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x39
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT 0x3A
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY 0x3B
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT 0x3C
|
||||
#define NPKIT_EVENT_TIME_SYNC_GPU 0x33
|
||||
#define NPKIT_EVENT_TIME_SYNC_CPU 0x34
|
||||
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY 0x3D
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT 0x3E
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY 0x3F
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT 0x40
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY 0x35
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT 0x36
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY 0x37
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT 0x38
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x39
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x3A
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x3B
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT 0x3C
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY 0x3D
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT 0x3E
|
||||
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY 0x41
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT 0x42
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY 0x43
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT 0x44
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY 0x45
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT 0x46
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY 0x3F
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT 0x40
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY 0x41
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT 0x42
|
||||
|
||||
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY 0x47
|
||||
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT 0x48
|
||||
#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY 0x49
|
||||
#define NPKIT_EVENT_SEND_RECV_SEND_EXIT 0x4A
|
||||
#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY 0x4B
|
||||
#define NPKIT_EVENT_SEND_RECV_RECV_EXIT 0x4C
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY 0x43
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT 0x44
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY 0x45
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT 0x46
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY 0x47
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT 0x48
|
||||
|
||||
#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4D
|
||||
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY 0x49
|
||||
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT 0x4A
|
||||
#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY 0x4B
|
||||
#define NPKIT_EVENT_SEND_RECV_SEND_EXIT 0x4C
|
||||
#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY 0x4D
|
||||
#define NPKIT_EVENT_SEND_RECV_RECV_EXIT 0x4E
|
||||
|
||||
#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4F
|
||||
|
||||
#endif
|
||||
|
||||
@@ -622,6 +622,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
// Init NPKit
|
||||
NCCLCHECK(NpKit::Init(comm->rank));
|
||||
tmpCommAndChans.comm.npKitEventCollectContexts = NpKit::GetGpuEventCollectContexts();
|
||||
tmpCommAndChans.comm.cpuTimestamp = NpKit::GetCpuTimestamp();
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
|
||||
+21
-85
@@ -18,29 +18,19 @@ NpKitEvent** NpKit::cpu_event_buffers_ = nullptr;
|
||||
NpKitEventCollectContext* NpKit::gpu_collect_contexts_ = nullptr;
|
||||
NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr;
|
||||
|
||||
uint64_t NpKit::base_cpu_timestamp_global_ = 0;
|
||||
uint64_t NpKit::base_cpu_timestamp_local_ = 0;
|
||||
uint64_t NpKit::base_gpu_timestamp_cpu_ = 0;
|
||||
uint64_t NpKit::base_gpu_timestamp_gpu_ = 0;
|
||||
uint64_t* NpKit::cpu_timestamp_ = nullptr;
|
||||
|
||||
__global__ void TimeCalibrationKernel(uint64_t *gpu_timestamp, uint64_t *cpu_timestamp_in, uint64_t *cpu_timestamp_out) {
|
||||
uint64_t gpu_timestamp_1 = NPKIT_GET_GPU_TIMESTAMP();
|
||||
uint64_t cpu_timestamp = *cpu_timestamp_in;
|
||||
uint64_t gpu_timestamp_2 = NPKIT_GET_GPU_TIMESTAMP();
|
||||
*gpu_timestamp = gpu_timestamp_1 + (gpu_timestamp_2 - gpu_timestamp_1) / 2;
|
||||
*cpu_timestamp_out = cpu_timestamp;
|
||||
}
|
||||
|
||||
static volatile bool cpu_timestamp_update_thread_should_stop = false;
|
||||
static volatile uint64_t* volatile_cpu_timestamp = nullptr;
|
||||
std::thread* NpKit::cpu_timestamp_update_thread_ = nullptr;
|
||||
volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false;
|
||||
|
||||
void NpKit::CpuTimestampUpdateThread() {
|
||||
uint64_t init_system_clock = std::chrono::system_clock::now().time_since_epoch().count();
|
||||
uint64_t init_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
uint64_t curr_steady_clock = 0;
|
||||
while (!cpu_timestamp_update_thread_should_stop) {
|
||||
volatile uint64_t* volatile_cpu_timestamp_ = cpu_timestamp_;
|
||||
while (!cpu_timestamp_update_thread_should_stop_) {
|
||||
curr_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
*volatile_cpu_timestamp = init_system_clock + (curr_steady_clock - init_steady_clock);
|
||||
*volatile_cpu_timestamp_ = init_system_clock + (curr_steady_clock - init_steady_clock);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -67,40 +57,12 @@ ncclResult_t NpKit::Init(int rank) {
|
||||
cpu_collect_contexts_[i] = ctx;
|
||||
}
|
||||
|
||||
// Calibrate CPU timestamp
|
||||
uint64_t cpu_timestamp_1 = std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
base_cpu_timestamp_global_ = std::chrono::system_clock::now().time_since_epoch().count();
|
||||
uint64_t cpu_timestamp_2 = std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
base_cpu_timestamp_local_ = cpu_timestamp_1 + (cpu_timestamp_2 - cpu_timestamp_1) / 2;
|
||||
|
||||
// Calibrate GPU timestamp
|
||||
uint64_t *cpu_timestamp_in = nullptr;
|
||||
uint64_t *cpu_timestamp_out = nullptr;
|
||||
uint64_t *gpu_timestamp = nullptr;
|
||||
NCCLCHECK(ncclCudaHostCalloc(&cpu_timestamp_in, 1));
|
||||
NCCLCHECK(ncclCudaCalloc(&cpu_timestamp_out, 1));
|
||||
NCCLCHECK(ncclCudaCalloc(&gpu_timestamp, 1));
|
||||
|
||||
cpu_timestamp_update_thread_should_stop = false;
|
||||
volatile_cpu_timestamp = cpu_timestamp_in;
|
||||
*volatile_cpu_timestamp = 0;
|
||||
|
||||
hipStream_t timingStream;
|
||||
CUDACHECK(hipStreamCreateWithFlags(&timingStream, hipStreamNonBlocking));
|
||||
|
||||
std::thread cpu_timestamp_update_thread(CpuTimestampUpdateThread);
|
||||
while (!*volatile_cpu_timestamp);
|
||||
TimeCalibrationKernel<<<1, 1, 0, timingStream>>>(gpu_timestamp, cpu_timestamp_in, cpu_timestamp_out);
|
||||
CUDACHECK(hipStreamSynchronize(timingStream));
|
||||
cpu_timestamp_update_thread_should_stop = true;
|
||||
cpu_timestamp_update_thread.join();
|
||||
|
||||
NCCLCHECK(ncclCudaMemcpy(&base_gpu_timestamp_cpu_, cpu_timestamp_out, 1));
|
||||
NCCLCHECK(ncclCudaMemcpy(&base_gpu_timestamp_gpu_, gpu_timestamp, 1));
|
||||
|
||||
NCCLCHECK(ncclCudaHostFree(cpu_timestamp_in));
|
||||
CUDACHECK(hipFree(cpu_timestamp_out));
|
||||
CUDACHECK(hipFree(gpu_timestamp));
|
||||
// Init timestamp
|
||||
NCCLCHECK(ncclCudaHostCalloc(&cpu_timestamp_, 1));
|
||||
volatile uint64_t* volatile_cpu_timestamp = cpu_timestamp_;
|
||||
*volatile_cpu_timestamp = std::chrono::system_clock::now().time_since_epoch().count();
|
||||
cpu_timestamp_update_thread_should_stop_ = false;
|
||||
cpu_timestamp_update_thread_ = new std::thread(CpuTimestampUpdateThread);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -164,45 +126,16 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) {
|
||||
gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
|
||||
gpu_clock_rate_file.close();
|
||||
|
||||
// Dump clock calibration info
|
||||
dump_file_path = dump_dir;
|
||||
dump_file_path += "/clock_calibration_cpu_global_rank_";
|
||||
dump_file_path += std::to_string(rank_);
|
||||
std::string base_cpu_timestamp_global_str = std::to_string(base_cpu_timestamp_global_);
|
||||
auto base_cpu_timestamp_global_file = std::fstream(dump_file_path, std::ios::out);
|
||||
base_cpu_timestamp_global_file.write(base_cpu_timestamp_global_str.c_str(), base_cpu_timestamp_global_str.length());
|
||||
base_cpu_timestamp_global_file.close();
|
||||
|
||||
dump_file_path = dump_dir;
|
||||
dump_file_path += "/clock_calibration_cpu_local_rank_";
|
||||
dump_file_path += std::to_string(rank_);
|
||||
std::string base_cpu_timestamp_local_str = std::to_string(base_cpu_timestamp_local_);
|
||||
auto base_cpu_timestamp_local_file = std::fstream(dump_file_path, std::ios::out);
|
||||
base_cpu_timestamp_local_file.write(base_cpu_timestamp_local_str.c_str(), base_cpu_timestamp_local_str.length());
|
||||
base_cpu_timestamp_local_file.close();
|
||||
|
||||
dump_file_path = dump_dir;
|
||||
dump_file_path += "/clock_calibration_gpu_cpu_rank_";
|
||||
dump_file_path += std::to_string(rank_);
|
||||
std::string base_gpu_timestamp_cpu_str = std::to_string(base_gpu_timestamp_cpu_);
|
||||
auto base_gpu_timestamp_cpu_file = std::fstream(dump_file_path, std::ios::out);
|
||||
base_gpu_timestamp_cpu_file.write(base_gpu_timestamp_cpu_str.c_str(), base_gpu_timestamp_cpu_str.length());
|
||||
base_gpu_timestamp_cpu_file.close();
|
||||
|
||||
dump_file_path = dump_dir;
|
||||
dump_file_path += "/clock_calibration_gpu_gpu_rank_";
|
||||
dump_file_path += std::to_string(rank_);
|
||||
std::string base_gpu_timestamp_gpu_str = std::to_string(base_gpu_timestamp_gpu_);
|
||||
auto base_gpu_timestamp_gpu_file = std::fstream(dump_file_path, std::ios::out);
|
||||
base_gpu_timestamp_gpu_file.write(base_gpu_timestamp_gpu_str.c_str(), base_gpu_timestamp_gpu_str.length());
|
||||
base_gpu_timestamp_gpu_file.close();
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t NpKit::Shutdown() {
|
||||
uint64_t i = 0;
|
||||
|
||||
// Stop CPU timestamp updating thread
|
||||
cpu_timestamp_update_thread_should_stop_ = true;
|
||||
cpu_timestamp_update_thread_->join();
|
||||
|
||||
// Free CPU event data structures
|
||||
for (i = 0; i < kNumCpuEventBuffers; i++) {
|
||||
free(cpu_event_buffers_[i]);
|
||||
@@ -217,6 +150,9 @@ ncclResult_t NpKit::Shutdown() {
|
||||
free(gpu_event_buffers_);
|
||||
CUDACHECK(hipFree(gpu_collect_contexts_));
|
||||
|
||||
// Free timestamp
|
||||
NCCLCHECK(ncclCudaHostFree(cpu_timestamp_));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -236,6 +172,6 @@ void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t NpKit::GetCpuTimestamp() {
|
||||
return std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
uint64_t* NpKit::GetCpuTimestamp() {
|
||||
return cpu_timestamp_;
|
||||
}
|
||||
|
||||
@@ -1022,7 +1022,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
size,
|
||||
#endif
|
||||
uint64_t(sub->requests+buffSlot)/sizeof(void*),
|
||||
NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
@@ -1056,7 +1056,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
sub->npKitSizesFifo[buffSlot],
|
||||
#endif
|
||||
uint64_t(sub->requests+buffSlot)/sizeof(void*),
|
||||
NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
@@ -1183,7 +1183,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
sizes[i],
|
||||
#endif
|
||||
uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
|
||||
NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
@@ -1224,7 +1224,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
sizes[i],
|
||||
#endif
|
||||
uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
|
||||
NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user