revert npkit (#748)

[ROCm/rccl commit: 7d6e7bcd7d]
Bu işleme şunda yer alıyor:
Ziyue Yang
2023-05-24 22:41:05 +08:00
işlemeyi yapan: GitHub
ebeveyn 4430e4448f
işleme a7557cf7b0
10 değiştirilmiş dosya ile 167 ekleme ve 122 silme
+15
Dosyayı Görüntüle
@@ -33,6 +33,21 @@ namespace {
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
+45
Dosyayı Görüntüle
@@ -35,6 +35,21 @@ namespace {
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
@@ -231,6 +246,21 @@ namespace {
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
@@ -390,6 +420,21 @@ namespace {
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
+15
Dosyayı Görüntüle
@@ -32,6 +32,21 @@ namespace {
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+30
Dosyayı Görüntüle
@@ -24,6 +24,21 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (args->peer == ncclShmem.comm.rank) {
struct ncclWorkElemP2p* recvArgs = args-1;
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
@@ -105,6 +120,21 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + 1;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (args->peer != ncclShmem.comm.rank) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
+1
Dosyayı Görüntüle
@@ -393,6 +393,7 @@ struct ncclDevComm {
#if defined(ENABLE_NPKIT)
NpKitEventCollectContext* npKitEventCollectContexts;
uint64_t* cpuTimestamp;
#endif
#ifdef ENABLE_COLLTRACE
+5 -6
Dosyayı Görüntüle
@@ -49,7 +49,7 @@ class NpKit {
static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id);
static uint64_t GetCpuTimestamp();
static uint64_t *GetCpuTimestamp();
private:
static void CpuTimestampUpdateThread();
@@ -65,13 +65,12 @@ class NpKit {
static NpKitEventCollectContext* gpu_collect_contexts_;
static NpKitEventCollectContext* cpu_collect_contexts_;
static uint64_t base_cpu_timestamp_global_;
static uint64_t base_cpu_timestamp_local_;
static uint64_t base_gpu_timestamp_cpu_;
static uint64_t base_gpu_timestamp_gpu_;
static uint64_t* cpu_timestamp_;
static uint64_t rank_;
static std::thread* cpu_timestamp_update_thread_;
static volatile bool cpu_timestamp_update_thread_should_stop_;
};
#endif
+30 -27
Dosyayı Görüntüle
@@ -65,36 +65,39 @@
#define NPKIT_EVENT_NET_RECV_ENTRY 0x31
#define NPKIT_EVENT_NET_RECV_EXIT 0x32
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY 0x33
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT 0x34
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY 0x35
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT 0x36
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x37
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x38
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x39
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT 0x3A
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY 0x3B
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT 0x3C
#define NPKIT_EVENT_TIME_SYNC_GPU 0x33
#define NPKIT_EVENT_TIME_SYNC_CPU 0x34
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY 0x3D
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT 0x3E
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY 0x3F
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT 0x40
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY 0x35
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT 0x36
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY 0x37
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT 0x38
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x39
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x3A
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x3B
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT 0x3C
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY 0x3D
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT 0x3E
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY 0x41
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT 0x42
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY 0x43
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT 0x44
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY 0x45
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT 0x46
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY 0x3F
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT 0x40
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY 0x41
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT 0x42
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY 0x47
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT 0x48
#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY 0x49
#define NPKIT_EVENT_SEND_RECV_SEND_EXIT 0x4A
#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY 0x4B
#define NPKIT_EVENT_SEND_RECV_RECV_EXIT 0x4C
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY 0x43
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT 0x44
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY 0x45
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT 0x46
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY 0x47
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT 0x48
#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4D
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY 0x49
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT 0x4A
#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY 0x4B
#define NPKIT_EVENT_SEND_RECV_SEND_EXIT 0x4C
#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY 0x4D
#define NPKIT_EVENT_SEND_RECV_RECV_EXIT 0x4E
#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4F
#endif
+1
Dosyayı Görüntüle
@@ -622,6 +622,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
// Init NPKit
NCCLCHECK(NpKit::Init(comm->rank));
tmpCommAndChans.comm.npKitEventCollectContexts = NpKit::GetGpuEventCollectContexts();
tmpCommAndChans.comm.cpuTimestamp = NpKit::GetCpuTimestamp();
#endif
#ifdef ENABLE_PROFILING
+21 -85
Dosyayı Görüntüle
@@ -18,29 +18,19 @@ NpKitEvent** NpKit::cpu_event_buffers_ = nullptr;
NpKitEventCollectContext* NpKit::gpu_collect_contexts_ = nullptr;
NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr;
uint64_t NpKit::base_cpu_timestamp_global_ = 0;
uint64_t NpKit::base_cpu_timestamp_local_ = 0;
uint64_t NpKit::base_gpu_timestamp_cpu_ = 0;
uint64_t NpKit::base_gpu_timestamp_gpu_ = 0;
uint64_t* NpKit::cpu_timestamp_ = nullptr;
__global__ void TimeCalibrationKernel(uint64_t *gpu_timestamp, uint64_t *cpu_timestamp_in, uint64_t *cpu_timestamp_out) {
uint64_t gpu_timestamp_1 = NPKIT_GET_GPU_TIMESTAMP();
uint64_t cpu_timestamp = *cpu_timestamp_in;
uint64_t gpu_timestamp_2 = NPKIT_GET_GPU_TIMESTAMP();
*gpu_timestamp = gpu_timestamp_1 + (gpu_timestamp_2 - gpu_timestamp_1) / 2;
*cpu_timestamp_out = cpu_timestamp;
}
static volatile bool cpu_timestamp_update_thread_should_stop = false;
static volatile uint64_t* volatile_cpu_timestamp = nullptr;
std::thread* NpKit::cpu_timestamp_update_thread_ = nullptr;
volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false;
void NpKit::CpuTimestampUpdateThread() {
uint64_t init_system_clock = std::chrono::system_clock::now().time_since_epoch().count();
uint64_t init_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
uint64_t curr_steady_clock = 0;
while (!cpu_timestamp_update_thread_should_stop) {
volatile uint64_t* volatile_cpu_timestamp_ = cpu_timestamp_;
while (!cpu_timestamp_update_thread_should_stop_) {
curr_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
*volatile_cpu_timestamp = init_system_clock + (curr_steady_clock - init_steady_clock);
*volatile_cpu_timestamp_ = init_system_clock + (curr_steady_clock - init_steady_clock);
}
}
@@ -67,40 +57,12 @@ ncclResult_t NpKit::Init(int rank) {
cpu_collect_contexts_[i] = ctx;
}
// Calibrate CPU timestamp
uint64_t cpu_timestamp_1 = std::chrono::steady_clock::now().time_since_epoch().count();
base_cpu_timestamp_global_ = std::chrono::system_clock::now().time_since_epoch().count();
uint64_t cpu_timestamp_2 = std::chrono::steady_clock::now().time_since_epoch().count();
base_cpu_timestamp_local_ = cpu_timestamp_1 + (cpu_timestamp_2 - cpu_timestamp_1) / 2;
// Calibrate GPU timestamp
uint64_t *cpu_timestamp_in = nullptr;
uint64_t *cpu_timestamp_out = nullptr;
uint64_t *gpu_timestamp = nullptr;
NCCLCHECK(ncclCudaHostCalloc(&cpu_timestamp_in, 1));
NCCLCHECK(ncclCudaCalloc(&cpu_timestamp_out, 1));
NCCLCHECK(ncclCudaCalloc(&gpu_timestamp, 1));
cpu_timestamp_update_thread_should_stop = false;
volatile_cpu_timestamp = cpu_timestamp_in;
*volatile_cpu_timestamp = 0;
hipStream_t timingStream;
CUDACHECK(hipStreamCreateWithFlags(&timingStream, hipStreamNonBlocking));
std::thread cpu_timestamp_update_thread(CpuTimestampUpdateThread);
while (!*volatile_cpu_timestamp);
TimeCalibrationKernel<<<1, 1, 0, timingStream>>>(gpu_timestamp, cpu_timestamp_in, cpu_timestamp_out);
CUDACHECK(hipStreamSynchronize(timingStream));
cpu_timestamp_update_thread_should_stop = true;
cpu_timestamp_update_thread.join();
NCCLCHECK(ncclCudaMemcpy(&base_gpu_timestamp_cpu_, cpu_timestamp_out, 1));
NCCLCHECK(ncclCudaMemcpy(&base_gpu_timestamp_gpu_, gpu_timestamp, 1));
NCCLCHECK(ncclCudaHostFree(cpu_timestamp_in));
CUDACHECK(hipFree(cpu_timestamp_out));
CUDACHECK(hipFree(gpu_timestamp));
// Init timestamp
NCCLCHECK(ncclCudaHostCalloc(&cpu_timestamp_, 1));
volatile uint64_t* volatile_cpu_timestamp = cpu_timestamp_;
*volatile_cpu_timestamp = std::chrono::system_clock::now().time_since_epoch().count();
cpu_timestamp_update_thread_should_stop_ = false;
cpu_timestamp_update_thread_ = new std::thread(CpuTimestampUpdateThread);
return ncclSuccess;
}
@@ -164,45 +126,16 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) {
gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
gpu_clock_rate_file.close();
// Dump clock calibration info
dump_file_path = dump_dir;
dump_file_path += "/clock_calibration_cpu_global_rank_";
dump_file_path += std::to_string(rank_);
std::string base_cpu_timestamp_global_str = std::to_string(base_cpu_timestamp_global_);
auto base_cpu_timestamp_global_file = std::fstream(dump_file_path, std::ios::out);
base_cpu_timestamp_global_file.write(base_cpu_timestamp_global_str.c_str(), base_cpu_timestamp_global_str.length());
base_cpu_timestamp_global_file.close();
dump_file_path = dump_dir;
dump_file_path += "/clock_calibration_cpu_local_rank_";
dump_file_path += std::to_string(rank_);
std::string base_cpu_timestamp_local_str = std::to_string(base_cpu_timestamp_local_);
auto base_cpu_timestamp_local_file = std::fstream(dump_file_path, std::ios::out);
base_cpu_timestamp_local_file.write(base_cpu_timestamp_local_str.c_str(), base_cpu_timestamp_local_str.length());
base_cpu_timestamp_local_file.close();
dump_file_path = dump_dir;
dump_file_path += "/clock_calibration_gpu_cpu_rank_";
dump_file_path += std::to_string(rank_);
std::string base_gpu_timestamp_cpu_str = std::to_string(base_gpu_timestamp_cpu_);
auto base_gpu_timestamp_cpu_file = std::fstream(dump_file_path, std::ios::out);
base_gpu_timestamp_cpu_file.write(base_gpu_timestamp_cpu_str.c_str(), base_gpu_timestamp_cpu_str.length());
base_gpu_timestamp_cpu_file.close();
dump_file_path = dump_dir;
dump_file_path += "/clock_calibration_gpu_gpu_rank_";
dump_file_path += std::to_string(rank_);
std::string base_gpu_timestamp_gpu_str = std::to_string(base_gpu_timestamp_gpu_);
auto base_gpu_timestamp_gpu_file = std::fstream(dump_file_path, std::ios::out);
base_gpu_timestamp_gpu_file.write(base_gpu_timestamp_gpu_str.c_str(), base_gpu_timestamp_gpu_str.length());
base_gpu_timestamp_gpu_file.close();
return ncclSuccess;
}
ncclResult_t NpKit::Shutdown() {
uint64_t i = 0;
// Stop CPU timestamp updating thread
cpu_timestamp_update_thread_should_stop_ = true;
cpu_timestamp_update_thread_->join();
// Free CPU event data structures
for (i = 0; i < kNumCpuEventBuffers; i++) {
free(cpu_event_buffers_[i]);
@@ -217,6 +150,9 @@ ncclResult_t NpKit::Shutdown() {
free(gpu_event_buffers_);
CUDACHECK(hipFree(gpu_collect_contexts_));
// Free timestamp
NCCLCHECK(ncclCudaHostFree(cpu_timestamp_));
return ncclSuccess;
}
@@ -236,6 +172,6 @@ void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t
}
}
uint64_t NpKit::GetCpuTimestamp() {
return std::chrono::steady_clock::now().time_since_epoch().count();
uint64_t* NpKit::GetCpuTimestamp() {
return cpu_timestamp_;
}
+4 -4
Dosyayı Görüntüle
@@ -1022,7 +1022,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
size,
#endif
uint64_t(sub->requests+buffSlot)/sizeof(void*),
NpKit::GetCpuTimestamp(), sub->channelId);
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt = 0;
#endif
@@ -1056,7 +1056,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
sub->npKitSizesFifo[buffSlot],
#endif
uint64_t(sub->requests+buffSlot)/sizeof(void*),
NpKit::GetCpuTimestamp(), sub->channelId);
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt = 0;
#endif
@@ -1183,7 +1183,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
sizes[i],
#endif
uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
NpKit::GetCpuTimestamp(), sub->channelId);
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt = 0;
#endif
@@ -1224,7 +1224,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
sizes[i],
#endif
uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
NpKit::GetCpuTimestamp(), sub->channelId);
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt = 0;
#endif