diff --git a/install.sh b/install.sh index eaad65bf14..532680125e 100755 --- a/install.sh +++ b/install.sh @@ -273,6 +273,8 @@ if ($npkit_enabled); then -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT \ -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY \ -DENABLE_NPKIT_EVENT_NET_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_NET_TEST_ENTRY \ + -DENABLE_NPKIT_EVENT_NET_TEST_EXIT \ -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY \ -DENABLE_NPKIT_EVENT_NET_RECV_EXIT \ -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY \ diff --git a/src/include/npkit/npkit_event.h b/src/include/npkit/npkit_event.h index c3892684c4..80ad637c3b 100644 --- a/src/include/npkit/npkit_event.h +++ b/src/include/npkit/npkit_event.h @@ -109,5 +109,8 @@ #define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY 0x56 #define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT 0x57 +#define NPKIT_EVENT_NET_TEST_ENTRY 0x58 +#define NPKIT_EVENT_NET_TEST_EXIT 0x59 + #endif diff --git a/src/include/proxy.h b/src/include/proxy.h index 17db4bcef0..1a3272df73 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -74,6 +74,7 @@ struct ncclProxySubArgs { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) int npKitSizesFifo[NCCL_STEPS]; + uint64_t timestamp[NCCL_STEPS]; #endif }; diff --git a/src/transport/net.cc b/src/transport/net.cc index 748be8ca42..273858cd02 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -1067,6 +1067,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct #if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) g_npkit_net_poll_cnt = 0; #endif + sub->timestamp[buffSlot] = 0; #endif TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); @@ -1085,9 +1086,12 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct if (sub->done < sub->transmitted) { int done; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) + if (sub->timestamp[buffSlot] == 0) + sub->timestamp[buffSlot] = *(volatile uint64_t*)NpKit::GetCpuTimestamp(); +#endif NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, NULL)); if (done) { - #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT) NpKit::CollectCpuEvent( NPKIT_EVENT_NET_SEND_EXIT, @@ -1095,6 +1099,34 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct g_npkit_net_poll_cnt, #else sub->npKitSizesFifo[buffSlot], +#endif + uint64_t(sub->requests+buffSlot)/sizeof(void*), + sub->timestamp[buffSlot], sub->channelId); +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) + g_npkit_net_poll_cnt = 0; +#endif +#endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_TEST_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_TEST_EXIT) + NpKit::CollectCpuEvent( + NPKIT_EVENT_NET_TEST_ENTRY, +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) + g_npkit_net_poll_cnt, +#else + sub->npKitSizesFifo[buffSlot], +#endif + uint64_t(sub->requests+buffSlot)/sizeof(void*), + sub->timestamp[buffSlot], sub->channelId); +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) + g_npkit_net_poll_cnt = 0; +#endif +#endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_TEST_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_TEST_EXIT) + NpKit::CollectCpuEvent( + NPKIT_EVENT_NET_TEST_EXIT, +#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT) + g_npkit_net_poll_cnt, +#else + sub->npKitSizesFifo[buffSlot], #endif uint64_t(sub->requests+buffSlot)/sizeof(void*), *(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);