Implement HDP flush when transfer data over PCIe P2P (#75)

* Implement HDP flush when transfer data over PCIe P2P
* Add some descriptions to HDP flushing
* Fix for review comments


[ROCm/rccl commit: b7a6307371]
This commit is contained in:
Wenkai Du
2019-06-03 16:29:55 -07:00
committad av GitHub
förälder 9bd033992f
incheckning a8fbf5555c
8 ändrade filer med 74 tillägg och 12 borttagningar
@@ -31,7 +31,7 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS, ring->hdp_reg);
typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
@@ -31,7 +31,7 @@ __device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS, ring->hdp_reg);
typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
@@ -30,7 +30,7 @@ __device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS, ring->hdp_reg);
typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
@@ -44,11 +44,14 @@ class PostFlag {
const int shift;
volatile int * const fifo;
const int fifo_size;
uint32_t * hdp_reg;
public:
__device__
PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size, uint32_t* hdp_reg = NULL)
: flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size), hdp_reg(hdp_reg) { }
// remote writes can be reordered if we don't do s_waitcnt 0 + store to HDP between the data and flag
__device__
void post(uint64_t val) { STORE(flag, (val - shift)); }
void post(uint64_t val) { if (hdp_reg != NULL) STORE(hdp_reg, 0x1); STORE(flag, (val - shift)); }
__device__
void postSize(uint64_t step, int size) { if (fifo != NULL) STORE(fifo + step%fifo_size, size); };
};
+1 -1
Visa fil
@@ -27,7 +27,7 @@ __device__ void ncclReduceKernel(struct CollectiveArgs* args) {
WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS, ring->hdp_reg);
typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
@@ -28,7 +28,7 @@ __device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS, ring->hdp_reg);
typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
+5
Visa fil
@@ -153,6 +153,11 @@ struct ncclRing {
int* userRanks;
int* devUserRanks;
// Next GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* hdp_reg;
// Operation list for aggregation
struct ncclColl* collectives;
struct ncclColl* devCollectives;
+59 -5
Visa fil
@@ -118,11 +118,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
link_type_name[link_type], hops);
link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev));
}
if (link_type != HSA_AMD_LINK_INFO_TYPE_XGMI) {
// disable PCIe P2P until HDP flush is implemented.
p2p = 0;
return ncclSuccess;
}
int nvlinkp2p = 0;
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1)
nvlinkp2p = CONNECT_NVLINK;
@@ -487,10 +482,69 @@ end:
} while (0)
/* Send: Create and return connect structures for this peer to connect to me */
static ncclResult_t getGpuHdpReg(int cudaDev, uint32_t** hdp) {
auto convert_bdf = [](const char *busId) {
char bdf[9];
strncpy(bdf, busId, 4);
strncpy(bdf+4, busId+5, 2);
strncpy(bdf+6, busId+8, 2);
bdf[8] = '\0';
uint16_t id = (uint16_t)strtol(bdf, NULL, 16);
return id;
};
union find_agent_args {
hsa_agent_t agent;
uint16_t id;
} args;
const auto& find_agent = [](hsa_agent_t agent, void* arg) {
uint16_t id = ((union find_agent_args *)arg)->id;
hsa_device_type_t type;
hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, (void*)&type);
if(type == HSA_DEVICE_TYPE_GPU) {
uint16_t bdf_id = 1;
hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id);
if(bdf_id == id) {
((union find_agent_args *)arg)->agent=agent;
return HSA_STATUS_INFO_BREAK;
}
}
return HSA_STATUS_SUCCESS;
};
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
*hdp = NULL;
CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
args.id = convert_bdf(busId);
hsa_status_t err = hsa_iterate_agents(find_agent, (void*)&args);
if (err != HSA_STATUS_INFO_BREAK) {
WARN("failed to get locate HSA agent for GPU %d", cudaDev);
return ncclSystemError;
}
hsa_amd_hdp_flush_t hdpinfo;
err = hsa_agent_get_info(args.agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdpinfo);
if ((err != HSA_STATUS_SUCCESS) && (err != HSA_STATUS_INFO_BREAK)) {
WARN("failed to get HSA_AMD_AGENT_INFO_HDP_FLUSH for GPU %d", cudaDev);
return ncclSystemError;
}
*hdp = hdpinfo.HDP_MEM_FLUSH_CNTL;
return ncclSuccess;
}
ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
struct p2pConnectInfo info;
uint32_t linktype, hops;
if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops) != hipSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", ring->id, myInfo->rank, peerInfo->rank);
return ncclInternalError;
}
if (linktype != HSA_AMD_LINK_INFO_TYPE_XGMI) {
NCCLCHECK(getGpuHdpReg(peerInfo->cudaDev, &ring->hdp_reg));
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", ring->id, myInfo->rank, peerInfo->rank, ring->hdp_reg);
}
if (myInfo->pidHash == peerInfo->pidHash) {
info.direct = 1;
info.directPtr = ring->devMemSend;