Use posix_memalign for network buffer allocation on host memory (#221)

* Use posix_memalign for network buffer allocation on host memory

* ib-test: add ability to specify run iterations

* ib-test: define iterations as multiple of default cycles

* Add checking to posix_memalign return value
Bu işleme şunda yer alıyor:
Wenkai Du
2020-06-22 13:06:25 -07:00
işlemeyi yapan: GitHub
ebeveyn dc739c4e70
işleme 0eb19a563a
6 değiştirilmiş dosya ile 82 ekleme ve 12 silme
+1 -1
Dosyayı Görüntüle
@@ -169,7 +169,7 @@ endforeach()
if("${HIP_COMPILER}" MATCHES "clang")
foreach(target ${AMDGPU_TARGETS})
target_compile_options(rccl PRIVATE --amdgpu-target=${target} PRIVATE -fgpu-rdc)
target_compile_options(rccl PRIVATE --amdgpu-target=${target} PRIVATE -fgpu-rdc -DUSE_MEMALIGN)
endforeach()
target_link_libraries(rccl PRIVATE -fgpu-rdc)
target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
+6
Dosyayı Görüntüle
@@ -20,6 +20,7 @@
#endif
#include "xml.h"
#include "cpuset.h"
#include <numa.h>
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
@@ -733,6 +734,11 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
int ret = numa_run_on_node(cpu->id);
if (ret != 0)
INFO(NCCL_INIT, "Failed to run on numa node %ld", cpu->id);
else
INFO(NCCL_INIT, "Thread is set to run on numa node %ld", cpu->id);
}
return ncclSuccess;
}
+3
Dosyayı Görüntüle
@@ -92,6 +92,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
struct ncclConnInfo {
// Regular comm mechanism
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
#ifdef USE_MEMALIGN
char *hostBuffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
#endif
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
uint64_t *opCountLoc; // opCount of local rank
+36
Dosyayı Görüntüle
@@ -26,6 +26,9 @@ struct netSendResources {
int netDev;
int useGdr;
char* buffers[LOC_COUNT];
#ifdef USE_MEMALIGN
char* hostBuffer;
#endif
int buffSizes[LOC_COUNT];
void* mhandles[LOC_COUNT];
void** mhandlesProto[NCCL_NUM_PROTOCOLS];
@@ -93,10 +96,24 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
}
char line[16];
if (resources->buffSizes[LOC_HOSTMEM]) {
#ifdef USE_MEMALIGN
int page_size = getpagesize();
if (posix_memalign((void **)&resources->hostBuffer, page_size, resources->buffSizes[LOC_HOSTMEM])) {
WARN("Failed to posix_memalign %d bytes", resources->buffSizes[LOC_HOSTMEM]);
return ncclSystemError;
}
CUDACHECK(hipHostRegister(resources->hostBuffer, resources->buffSizes[LOC_HOSTMEM], hipHostRegisterMapped));
CUDACHECK(hipHostGetDevicePointer((void **)resources->buffers+LOC_HOSTMEM, resources->hostBuffer, 0));
#else
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
#endif
int status[1] = {-1};
line[0]= 0;
#ifdef USE_MEMALIGN
if (!move_pages(0, 1, (void **)&resources->hostBuffer, NULL, status, 0))
#else
if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0))
#endif
sprintf(line, "/MEM%d", status[0]);
}
@@ -105,6 +122,12 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
#ifdef USE_MEMALIGN
if (protoLoc[p] == LOC_HOSTMEM)
send->conn.hostBuffs[p] = resources->hostBuffer + offsets[protoLoc[p]];
else
send->conn.hostBuffs[p] = send->conn.buffs[p];
#endif
offsets[protoLoc[p]] += buffSizes[p];
}
@@ -182,7 +205,11 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
}
if (resources->buffSizes[LOC_HOSTMEM]) {
#ifdef USE_MEMALIGN
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->hostBuffer, resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
#else
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
#endif
}
return ncclSuccess;
}
@@ -213,7 +240,12 @@ ncclResult_t netSendFree(void* transportResources) {
if (resources->buffers[l])
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
}
#ifdef USE_MEMALIGN
CUDACHECK(hipHostUnregister(resources->hostBuffer));
free(resources->hostBuffer);
#else
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
#endif
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
free(resources);
@@ -251,7 +283,11 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
#ifdef USE_MEMALIGN
char* localBuff = args->connector->conn.hostBuffs[p];
#else
char* localBuff = args->connector->conn.buffs[p];
#endif
void* mhandle = *(resources->mhandlesProto[p]);
args->idle = 1;
if (args->head < args->end) {
+1 -1
Dosyayı Görüntüle
@@ -291,7 +291,7 @@ struct ncclIbListenComm {
int fd;
};
struct ncclIbSendFifo {
struct alignas(64) ncclIbSendFifo {
uint64_t addr;
int size;
uint32_t seq;
+35 -10
Dosyayı Görüntüle
@@ -46,7 +46,7 @@ bool cmdOptionExists(char** begin, char** end, const std::string& option) {
#define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */
#define SLICE_STEPS 4
#define ITERATIONS 2000
#define DEFAULT_CYCLES 4000
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
#define ENABLE_VALIDATION
#define USE_MEMALIGN
@@ -91,6 +91,8 @@ bool runSend = false, runRecv = false;
uint64_t send_byte;
uint64_t recv_byte;
uint64_t iterations = 1;
__device__
inline __attribute((always_inline))
long long int __rtc64() {
@@ -316,6 +318,12 @@ int main(int argc,char* argv[])
CUDACHECK(hipSetDevice(atol(gpu)));
}
char *iters = getCmdOption(argv, argv + argc, "-i");
if (iters) {
iterations = atol(iters);
printf("Running %ld iterations\n", iterations);
}
char *gdr_read = getCmdOption(argv, argv + argc, "-r");
if (gdr_read) {
use_gdr_read = atol(gdr_read);
@@ -479,29 +487,30 @@ int main(int argc,char* argv[])
*sendHead = 0; *sendTail = 0; *sourceCycle = 0; *sourceBytes = 0;
send_sizes = 0; send_bw_cumulative = 0; send_bw_count =0; send_byte = 0;
hipLaunchKernelGGL(DataSourceKernel, dim3(1, 1, 1), dim3(256, 1, 1), 0, 0,
NCCL_STEPS*ITERATIONS, (Pack128 *)(use_gdr_read ? sendDevBuffer : d_sendHostBuffer), sendHead, sendTail, sourceCycle, sourceBytes);
NCCL_STEPS*iterations*DEFAULT_CYCLES, (Pack128 *)(use_gdr_read ? sendDevBuffer : d_sendHostBuffer), sendHead, sendTail, sourceCycle, sourceBytes);
runSend = true;
} else {
*recvHead = 0; *recvTail = 0; *recvErrorCount = 0; *sinkCycle = 0, *sinkBytes = 0;
recv_sizes = 0; recv_bw_cumulative = 0; recv_bw_count =0; recv_byte = 0;
hipLaunchKernelGGL(DataSinkKernel, dim3(1, 1, 1), dim3(256, 1, 1), 0, 0,
NCCL_STEPS*ITERATIONS, (Pack128 *)(use_gdr_write ? recvDevBuffer : d_recvHostBuffer), recvHead, recvTail, recvErrorCount, sinkCycle, sinkBytes);
NCCL_STEPS*iterations*DEFAULT_CYCLES, (Pack128 *)(use_gdr_write ? recvDevBuffer : d_recvHostBuffer), recvHead, recvTail, recvErrorCount, sinkCycle, sinkBytes);
runRecv = true;
}
struct timeval tv_start, tv_end;
struct timeval tv_start, tv_end, tv_prev;
gettimeofday(&tv_start, NULL);
gettimeofday(&tv_prev, NULL);
memset(&sendArgs, 0, sizeof(struct ncclProxyArgs));
sendArgs.head = 0;
sendArgs.tail = 0;
sendArgs.end = NCCL_STEPS*ITERATIONS;
sendArgs.end = NCCL_STEPS*iterations*DEFAULT_CYCLES;
sendArgs.sliceSteps = sliceSteps;
memset(&recvArgs, 0, sizeof(struct ncclProxyArgs));
recvArgs.head = 0;
recvArgs.tail = 0;
recvArgs.end = NCCL_STEPS*ITERATIONS;
recvArgs.end = NCCL_STEPS*iterations*DEFAULT_CYCLES;
recvArgs.sliceSteps = sliceSteps;
do {
@@ -509,6 +518,22 @@ int main(int argc,char* argv[])
NCCLCHECK(netRecvProxy(&recvArgs));
if (runSend)
NCCLCHECK(netSendProxy(&sendArgs));
gettimeofday(&tv_end, NULL);
uint64_t timelap = ((uint64_t)(tv_end.tv_sec - tv_prev.tv_sec)*1000*1000 + tv_end.tv_usec - tv_prev.tv_usec);
if (timelap > 100000UL) {
uint64_t total_time = ((uint64_t)(tv_end.tv_sec - tv_start.tv_sec)*1000*1000 + tv_end.tv_usec - tv_start.tv_usec);
if (send_byte) printf("# Send %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\r",
sendArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)send_byte/total_time/1000.0 : 0,
send_byte, total_time, send_bw_count ? (float)send_bw_cumulative/send_bw_count : 0, send_bw_count,
*sourceCycle ? (double)(*sourceBytes)*sizeof(Pack128)/((double)(*sourceCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sourceBytes*sizeof(Pack128));
if (recv_byte) printf("# Recv %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Errors %ld\r",
recvArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)recv_byte/total_time/1000.0 : 0,
recv_byte, total_time, recv_bw_count ? (float)recv_bw_cumulative/recv_bw_count : 0, recv_bw_count,
*sinkCycle ? (double)(*sinkBytes)*sizeof(Pack128)/((double)(*sinkCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sinkBytes*sizeof(Pack128),
*recvErrorCount);
gettimeofday(&tv_prev, NULL);
}
} while (runSend || runRecv);
CUDACHECK(hipDeviceSynchronize());
@@ -516,12 +541,12 @@ int main(int argc,char* argv[])
gettimeofday(&tv_end, NULL);
uint64_t total_time = ((uint64_t)(tv_end.tv_sec - tv_start.tv_sec)*1000*1000 + tv_end.tv_usec - tv_start.tv_usec);
if (send_byte) printf("# Send %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\n",
(total_time) ? (double)send_byte/total_time/1000.0 : 0,
if (send_byte) printf("# Send %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\n",
sendArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)send_byte/total_time/1000.0 : 0,
send_byte, total_time, send_bw_count ? (float)send_bw_cumulative/send_bw_count : 0, send_bw_count,
*sourceCycle ? (double)(*sourceBytes)*sizeof(Pack128)/((double)(*sourceCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sourceBytes*sizeof(Pack128));
if (recv_byte) printf("# Recv %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Data Error Counts %ld\n",
(total_time) ? (double)recv_byte/total_time/1000.0 : 0,
if (recv_byte) printf("# Recv %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Errors %ld\n",
recvArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)recv_byte/total_time/1000.0 : 0,
recv_byte, total_time, recv_bw_count ? (float)recv_bw_cumulative/recv_bw_count : 0, recv_bw_count,
*sinkCycle ? (double)(*sinkBytes)*sizeof(Pack128)/((double)(*sinkCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sinkBytes*sizeof(Pack128),
*recvErrorCount);