Use posix_memalign for network buffer allocation on host memory (#221)
* Use posix_memalign for network buffer allocation on host memory * ib-test: add ability to specify run iterations * ib-test: define iterations as multiple of default cycles * Add checking to posix_memalign return value
Bu işleme şunda yer alıyor:
@@ -169,7 +169,7 @@ endforeach()
|
||||
|
||||
if("${HIP_COMPILER}" MATCHES "clang")
|
||||
foreach(target ${AMDGPU_TARGETS})
|
||||
target_compile_options(rccl PRIVATE --amdgpu-target=${target} PRIVATE -fgpu-rdc)
|
||||
target_compile_options(rccl PRIVATE --amdgpu-target=${target} PRIVATE -fgpu-rdc -DUSE_MEMALIGN)
|
||||
endforeach()
|
||||
target_link_libraries(rccl PRIVATE -fgpu-rdc)
|
||||
target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#endif
|
||||
#include "xml.h"
|
||||
#include "cpuset.h"
|
||||
#include <numa.h>
|
||||
|
||||
#define BUSID_SIZE (sizeof("0000:00:00.0"))
|
||||
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
|
||||
@@ -733,6 +734,11 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
|
||||
NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
|
||||
INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
|
||||
SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
|
||||
int ret = numa_run_on_node(cpu->id);
|
||||
if (ret != 0)
|
||||
INFO(NCCL_INIT, "Failed to run on numa node %ld", cpu->id);
|
||||
else
|
||||
INFO(NCCL_INIT, "Thread is set to run on numa node %ld", cpu->id);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -92,6 +92,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
|
||||
struct ncclConnInfo {
|
||||
// Regular comm mechanism
|
||||
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
|
||||
#ifdef USE_MEMALIGN
|
||||
char *hostBuffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
|
||||
#endif
|
||||
uint64_t *tail; // Local for recv, remote for send
|
||||
uint64_t *head; // Local for send, remote for recv
|
||||
uint64_t *opCountLoc; // opCount of local rank
|
||||
|
||||
@@ -26,6 +26,9 @@ struct netSendResources {
|
||||
int netDev;
|
||||
int useGdr;
|
||||
char* buffers[LOC_COUNT];
|
||||
#ifdef USE_MEMALIGN
|
||||
char* hostBuffer;
|
||||
#endif
|
||||
int buffSizes[LOC_COUNT];
|
||||
void* mhandles[LOC_COUNT];
|
||||
void** mhandlesProto[NCCL_NUM_PROTOCOLS];
|
||||
@@ -93,10 +96,24 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
}
|
||||
char line[16];
|
||||
if (resources->buffSizes[LOC_HOSTMEM]) {
|
||||
#ifdef USE_MEMALIGN
|
||||
int page_size = getpagesize();
|
||||
if (posix_memalign((void **)&resources->hostBuffer, page_size, resources->buffSizes[LOC_HOSTMEM])) {
|
||||
WARN("Failed to posix_memalign %d bytes", resources->buffSizes[LOC_HOSTMEM]);
|
||||
return ncclSystemError;
|
||||
}
|
||||
CUDACHECK(hipHostRegister(resources->hostBuffer, resources->buffSizes[LOC_HOSTMEM], hipHostRegisterMapped));
|
||||
CUDACHECK(hipHostGetDevicePointer((void **)resources->buffers+LOC_HOSTMEM, resources->hostBuffer, 0));
|
||||
#else
|
||||
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
|
||||
#endif
|
||||
int status[1] = {-1};
|
||||
line[0]= 0;
|
||||
#ifdef USE_MEMALIGN
|
||||
if (!move_pages(0, 1, (void **)&resources->hostBuffer, NULL, status, 0))
|
||||
#else
|
||||
if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0))
|
||||
#endif
|
||||
sprintf(line, "/MEM%d", status[0]);
|
||||
}
|
||||
|
||||
@@ -105,6 +122,12 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
|
||||
send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
|
||||
#ifdef USE_MEMALIGN
|
||||
if (protoLoc[p] == LOC_HOSTMEM)
|
||||
send->conn.hostBuffs[p] = resources->hostBuffer + offsets[protoLoc[p]];
|
||||
else
|
||||
send->conn.hostBuffs[p] = send->conn.buffs[p];
|
||||
#endif
|
||||
offsets[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
@@ -182,7 +205,11 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
|
||||
}
|
||||
if (resources->buffSizes[LOC_HOSTMEM]) {
|
||||
#ifdef USE_MEMALIGN
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->hostBuffer, resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
|
||||
#else
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
|
||||
#endif
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -213,7 +240,12 @@ ncclResult_t netSendFree(void* transportResources) {
|
||||
if (resources->buffers[l])
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
|
||||
}
|
||||
#ifdef USE_MEMALIGN
|
||||
CUDACHECK(hipHostUnregister(resources->hostBuffer));
|
||||
free(resources->hostBuffer);
|
||||
#else
|
||||
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
|
||||
#endif
|
||||
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
|
||||
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
|
||||
free(resources);
|
||||
@@ -251,7 +283,11 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = args->protocol;
|
||||
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
|
||||
#ifdef USE_MEMALIGN
|
||||
char* localBuff = args->connector->conn.hostBuffs[p];
|
||||
#else
|
||||
char* localBuff = args->connector->conn.buffs[p];
|
||||
#endif
|
||||
void* mhandle = *(resources->mhandlesProto[p]);
|
||||
args->idle = 1;
|
||||
if (args->head < args->end) {
|
||||
|
||||
@@ -291,7 +291,7 @@ struct ncclIbListenComm {
|
||||
int fd;
|
||||
};
|
||||
|
||||
struct ncclIbSendFifo {
|
||||
struct alignas(64) ncclIbSendFifo {
|
||||
uint64_t addr;
|
||||
int size;
|
||||
uint32_t seq;
|
||||
|
||||
@@ -46,7 +46,7 @@ bool cmdOptionExists(char** begin, char** end, const std::string& option) {
|
||||
|
||||
#define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */
|
||||
#define SLICE_STEPS 4
|
||||
#define ITERATIONS 2000
|
||||
#define DEFAULT_CYCLES 4000
|
||||
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
|
||||
#define ENABLE_VALIDATION
|
||||
#define USE_MEMALIGN
|
||||
@@ -91,6 +91,8 @@ bool runSend = false, runRecv = false;
|
||||
uint64_t send_byte;
|
||||
uint64_t recv_byte;
|
||||
|
||||
uint64_t iterations = 1;
|
||||
|
||||
__device__
|
||||
inline __attribute((always_inline))
|
||||
long long int __rtc64() {
|
||||
@@ -316,6 +318,12 @@ int main(int argc,char* argv[])
|
||||
CUDACHECK(hipSetDevice(atol(gpu)));
|
||||
}
|
||||
|
||||
char *iters = getCmdOption(argv, argv + argc, "-i");
|
||||
if (iters) {
|
||||
iterations = atol(iters);
|
||||
printf("Running %ld iterations\n", iterations);
|
||||
}
|
||||
|
||||
char *gdr_read = getCmdOption(argv, argv + argc, "-r");
|
||||
if (gdr_read) {
|
||||
use_gdr_read = atol(gdr_read);
|
||||
@@ -479,29 +487,30 @@ int main(int argc,char* argv[])
|
||||
*sendHead = 0; *sendTail = 0; *sourceCycle = 0; *sourceBytes = 0;
|
||||
send_sizes = 0; send_bw_cumulative = 0; send_bw_count =0; send_byte = 0;
|
||||
hipLaunchKernelGGL(DataSourceKernel, dim3(1, 1, 1), dim3(256, 1, 1), 0, 0,
|
||||
NCCL_STEPS*ITERATIONS, (Pack128 *)(use_gdr_read ? sendDevBuffer : d_sendHostBuffer), sendHead, sendTail, sourceCycle, sourceBytes);
|
||||
NCCL_STEPS*iterations*DEFAULT_CYCLES, (Pack128 *)(use_gdr_read ? sendDevBuffer : d_sendHostBuffer), sendHead, sendTail, sourceCycle, sourceBytes);
|
||||
runSend = true;
|
||||
} else {
|
||||
*recvHead = 0; *recvTail = 0; *recvErrorCount = 0; *sinkCycle = 0, *sinkBytes = 0;
|
||||
recv_sizes = 0; recv_bw_cumulative = 0; recv_bw_count =0; recv_byte = 0;
|
||||
hipLaunchKernelGGL(DataSinkKernel, dim3(1, 1, 1), dim3(256, 1, 1), 0, 0,
|
||||
NCCL_STEPS*ITERATIONS, (Pack128 *)(use_gdr_write ? recvDevBuffer : d_recvHostBuffer), recvHead, recvTail, recvErrorCount, sinkCycle, sinkBytes);
|
||||
NCCL_STEPS*iterations*DEFAULT_CYCLES, (Pack128 *)(use_gdr_write ? recvDevBuffer : d_recvHostBuffer), recvHead, recvTail, recvErrorCount, sinkCycle, sinkBytes);
|
||||
runRecv = true;
|
||||
}
|
||||
|
||||
struct timeval tv_start, tv_end;
|
||||
struct timeval tv_start, tv_end, tv_prev;
|
||||
gettimeofday(&tv_start, NULL);
|
||||
gettimeofday(&tv_prev, NULL);
|
||||
|
||||
memset(&sendArgs, 0, sizeof(struct ncclProxyArgs));
|
||||
sendArgs.head = 0;
|
||||
sendArgs.tail = 0;
|
||||
sendArgs.end = NCCL_STEPS*ITERATIONS;
|
||||
sendArgs.end = NCCL_STEPS*iterations*DEFAULT_CYCLES;
|
||||
sendArgs.sliceSteps = sliceSteps;
|
||||
|
||||
memset(&recvArgs, 0, sizeof(struct ncclProxyArgs));
|
||||
recvArgs.head = 0;
|
||||
recvArgs.tail = 0;
|
||||
recvArgs.end = NCCL_STEPS*ITERATIONS;
|
||||
recvArgs.end = NCCL_STEPS*iterations*DEFAULT_CYCLES;
|
||||
recvArgs.sliceSteps = sliceSteps;
|
||||
|
||||
do {
|
||||
@@ -509,6 +518,22 @@ int main(int argc,char* argv[])
|
||||
NCCLCHECK(netRecvProxy(&recvArgs));
|
||||
if (runSend)
|
||||
NCCLCHECK(netSendProxy(&sendArgs));
|
||||
|
||||
gettimeofday(&tv_end, NULL);
|
||||
uint64_t timelap = ((uint64_t)(tv_end.tv_sec - tv_prev.tv_sec)*1000*1000 + tv_end.tv_usec - tv_prev.tv_usec);
|
||||
if (timelap > 100000UL) {
|
||||
uint64_t total_time = ((uint64_t)(tv_end.tv_sec - tv_start.tv_sec)*1000*1000 + tv_end.tv_usec - tv_start.tv_usec);
|
||||
if (send_byte) printf("# Send %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\r",
|
||||
sendArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)send_byte/total_time/1000.0 : 0,
|
||||
send_byte, total_time, send_bw_count ? (float)send_bw_cumulative/send_bw_count : 0, send_bw_count,
|
||||
*sourceCycle ? (double)(*sourceBytes)*sizeof(Pack128)/((double)(*sourceCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sourceBytes*sizeof(Pack128));
|
||||
if (recv_byte) printf("# Recv %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Errors %ld\r",
|
||||
recvArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)recv_byte/total_time/1000.0 : 0,
|
||||
recv_byte, total_time, recv_bw_count ? (float)recv_bw_cumulative/recv_bw_count : 0, recv_bw_count,
|
||||
*sinkCycle ? (double)(*sinkBytes)*sizeof(Pack128)/((double)(*sinkCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sinkBytes*sizeof(Pack128),
|
||||
*recvErrorCount);
|
||||
gettimeofday(&tv_prev, NULL);
|
||||
}
|
||||
} while (runSend || runRecv);
|
||||
|
||||
CUDACHECK(hipDeviceSynchronize());
|
||||
@@ -516,12 +541,12 @@ int main(int argc,char* argv[])
|
||||
gettimeofday(&tv_end, NULL);
|
||||
uint64_t total_time = ((uint64_t)(tv_end.tv_sec - tv_start.tv_sec)*1000*1000 + tv_end.tv_usec - tv_start.tv_usec);
|
||||
|
||||
if (send_byte) printf("# Send %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\n",
|
||||
(total_time) ? (double)send_byte/total_time/1000.0 : 0,
|
||||
if (send_byte) printf("# Send %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\n",
|
||||
sendArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)send_byte/total_time/1000.0 : 0,
|
||||
send_byte, total_time, send_bw_count ? (float)send_bw_cumulative/send_bw_count : 0, send_bw_count,
|
||||
*sourceCycle ? (double)(*sourceBytes)*sizeof(Pack128)/((double)(*sourceCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sourceBytes*sizeof(Pack128));
|
||||
if (recv_byte) printf("# Recv %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Data Error Counts %ld\n",
|
||||
(total_time) ? (double)recv_byte/total_time/1000.0 : 0,
|
||||
if (recv_byte) printf("# Recv %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Errors %ld\n",
|
||||
recvArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)recv_byte/total_time/1000.0 : 0,
|
||||
recv_byte, total_time, recv_bw_count ? (float)recv_bw_cumulative/recv_bw_count : 0, recv_bw_count,
|
||||
*sinkCycle ? (double)(*sinkBytes)*sizeof(Pack128)/((double)(*sinkCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sinkBytes*sizeof(Pack128),
|
||||
*recvErrorCount);
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle