Implement HDP flush when transfer data over PCIe P2P (#75)
* Implement HDP flush when transfer data over PCIe P2P
* Add some descriptions to HDP flushing
* Fix for review comments
[ROCm/rccl commit: b7a6307371]
This commit is contained in:
@@ -31,7 +31,7 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
|
||||
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
|
||||
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
|
||||
PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS, ring->hdp_reg);
|
||||
|
||||
typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ __device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
|
||||
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
|
||||
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
|
||||
PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS, ring->hdp_reg);
|
||||
|
||||
typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ __device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
|
||||
WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
|
||||
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
|
||||
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS, ring->hdp_reg);
|
||||
|
||||
typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
|
||||
|
||||
|
||||
@@ -44,11 +44,14 @@ class PostFlag {
|
||||
const int shift;
|
||||
volatile int * const fifo;
|
||||
const int fifo_size;
|
||||
uint32_t * hdp_reg;
|
||||
public:
|
||||
__device__
|
||||
PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
|
||||
PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size, uint32_t* hdp_reg = NULL)
|
||||
: flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size), hdp_reg(hdp_reg) { }
|
||||
// remote writes can be reordered if we don't do s_waitcnt 0 + store to HDP between the data and flag
|
||||
__device__
|
||||
void post(uint64_t val) { STORE(flag, (val - shift)); }
|
||||
void post(uint64_t val) { if (hdp_reg != NULL) STORE(hdp_reg, 0x1); STORE(flag, (val - shift)); }
|
||||
__device__
|
||||
void postSize(uint64_t step, int size) { if (fifo != NULL) STORE(fifo + step%fifo_size, size); };
|
||||
};
|
||||
|
||||
@@ -27,7 +27,7 @@ __device__ void ncclReduceKernel(struct CollectiveArgs* args) {
|
||||
WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
|
||||
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
|
||||
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS, ring->hdp_reg);
|
||||
|
||||
typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ __device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
|
||||
WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
|
||||
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
|
||||
PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS, ring->hdp_reg);
|
||||
|
||||
typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
|
||||
|
||||
|
||||
@@ -153,6 +153,11 @@ struct ncclRing {
|
||||
int* userRanks;
|
||||
int* devUserRanks;
|
||||
|
||||
// Next GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
|
||||
// allows software to explicitly initiate a flush read to HDP memory. See more
|
||||
// descriptions in primitives.h.
|
||||
uint32_t* hdp_reg;
|
||||
|
||||
// Operation list for aggregation
|
||||
struct ncclColl* collectives;
|
||||
struct ncclColl* devCollectives;
|
||||
|
||||
@@ -118,11 +118,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
|
||||
link_type_name[link_type], hops);
|
||||
link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev));
|
||||
}
|
||||
if (link_type != HSA_AMD_LINK_INFO_TYPE_XGMI) {
|
||||
// disable PCIe P2P until HDP flush is implemented.
|
||||
p2p = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
int nvlinkp2p = 0;
|
||||
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1)
|
||||
nvlinkp2p = CONNECT_NVLINK;
|
||||
@@ -487,10 +482,69 @@ end:
|
||||
} while (0)
|
||||
|
||||
/* Send: Create and return connect structures for this peer to connect to me */
|
||||
static ncclResult_t getGpuHdpReg(int cudaDev, uint32_t** hdp) {
|
||||
auto convert_bdf = [](const char *busId) {
|
||||
char bdf[9];
|
||||
strncpy(bdf, busId, 4);
|
||||
strncpy(bdf+4, busId+5, 2);
|
||||
strncpy(bdf+6, busId+8, 2);
|
||||
bdf[8] = '\0';
|
||||
uint16_t id = (uint16_t)strtol(bdf, NULL, 16);
|
||||
return id;
|
||||
};
|
||||
|
||||
union find_agent_args {
|
||||
hsa_agent_t agent;
|
||||
uint16_t id;
|
||||
} args;
|
||||
|
||||
const auto& find_agent = [](hsa_agent_t agent, void* arg) {
|
||||
uint16_t id = ((union find_agent_args *)arg)->id;
|
||||
hsa_device_type_t type;
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, (void*)&type);
|
||||
if(type == HSA_DEVICE_TYPE_GPU) {
|
||||
uint16_t bdf_id = 1;
|
||||
hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id);
|
||||
if(bdf_id == id) {
|
||||
((union find_agent_args *)arg)->agent=agent;
|
||||
return HSA_STATUS_INFO_BREAK;
|
||||
}
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
*hdp = NULL;
|
||||
CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
|
||||
args.id = convert_bdf(busId);
|
||||
hsa_status_t err = hsa_iterate_agents(find_agent, (void*)&args);
|
||||
if (err != HSA_STATUS_INFO_BREAK) {
|
||||
WARN("failed to get locate HSA agent for GPU %d", cudaDev);
|
||||
return ncclSystemError;
|
||||
}
|
||||
hsa_amd_hdp_flush_t hdpinfo;
|
||||
err = hsa_agent_get_info(args.agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdpinfo);
|
||||
if ((err != HSA_STATUS_SUCCESS) && (err != HSA_STATUS_INFO_BREAK)) {
|
||||
WARN("failed to get HSA_AMD_AGENT_INFO_HDP_FLUSH for GPU %d", cudaDev);
|
||||
return ncclSystemError;
|
||||
}
|
||||
*hdp = hdpinfo.HDP_MEM_FLUSH_CNTL;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
|
||||
struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
|
||||
struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
|
||||
struct p2pConnectInfo info;
|
||||
uint32_t linktype, hops;
|
||||
if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops) != hipSuccess) {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", ring->id, myInfo->rank, peerInfo->rank);
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (linktype != HSA_AMD_LINK_INFO_TYPE_XGMI) {
|
||||
NCCLCHECK(getGpuHdpReg(peerInfo->cudaDev, &ring->hdp_reg));
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", ring->id, myInfo->rank, peerInfo->rank, ring->hdp_reg);
|
||||
}
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
info.direct = 1;
|
||||
info.directPtr = ring->devMemSend;
|
||||
|
||||
Referens i nytt ärende
Block a user