diff --git a/projects/rccl/tools/TransferBench/TransferBench.cpp b/projects/rccl/tools/TransferBench/TransferBench.cpp index dca4f1376c..2ec45376e5 100644 --- a/projects/rccl/tools/TransferBench/TransferBench.cpp +++ b/projects/rccl/tools/TransferBench/TransferBench.cpp @@ -46,7 +46,12 @@ int main(int argc, char **argv) printf(" would define 2 links each using 3 threadblocks from GPU0 -> GPU1, and GPU1->GPU0\n"); printf("- N: (Optional) Number of bytes to transfer per link.\n"); printf(" If not specified, defaults to 2^28=256MB. Must be a multiple of 128 bytes\n"); - printf("Set env var USE_MEMCPY_ASYNC to use hipMemcpyAsync instead of copy kernel\n"); + printf("\n"); + printf("Environment variables:\n"); + printf("======================\n"); + printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n"); + printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n"); + printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n"); exit(0); } @@ -58,17 +63,31 @@ int main(int argc, char **argv) printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 128\n", numBytesPerLink); exit(1); } + printf("Operating on %zu bytes per link (%zu floats)\n", numBytesPerLink, N); + + bool useHipCall = getenv("USE_HIP_CALL"); + bool useMemset = getenv("USE_MEMSET"); + bool useCoarseMem = getenv("USE_COARSE_MEM"); + printf("Running %s%s tests (control using USE_HIP_CALL/USE_MEMSET)\n", + useHipCall ? "hipMem" : "mem", + useMemset ? "set" : "cpy"); + printf("Destination memory: %s-grained (control using USE_COARSE_MEM)\n", + useCoarseMem ? "coarse" : "fine"); + if (useHipCall && !useMemset) + { + if (getenv("HSA_ENABLE_SDMA") && !strcmp(getenv("HSA_ENABLE_SDMA"), "0")) + printf("Using blit kernels for hipMemcpy. (HSA_ENABLE_SDMA=0)\n"); + else + printf("Using DMA copy engines (disable by setting HSA_ENABLE_SDMA=0)\n"); + } // Currently an environment variable is required in order to enable fine-grained VRAM allocations - if (!getenv("HSA_FORCE_FINE_GRAIN_PCIE")) + if (!useCoarseMem && !getenv("HSA_FORCE_FINE_GRAIN_PCIE")) { printf("[ERROR] Currently you must set HSA_FORCE_FINE_GRAIN_PCIE=1 prior to execution\n"); exit(1); } - bool useMemcpy = getenv("USE_MEMCPY_ASYNC"); - printf("Using %s\n", useMemcpy ? "hipMemcpyAsync (USE_MEMCPY_ASYNC found) [# of blocks to use will be ignored]" : "copy kernel (USE_MEMCPY_ASYNC not found)"); - // Collect the number of available GPUs on this machine int numDevices; HIP_CALL(hipGetDeviceCount(&numDevices)); @@ -160,11 +179,14 @@ int main(int argc, char **argv) HIP_CALL(hipEventCreate(&stopEvents[i])); HIP_CALL(hipMalloc((void **)&linkSrcMem[i], numBytesPerLink)); HIP_CALL(hipMalloc((void**)&gpuBlockParams[i], sizeof(BlockParam) * numLinks)); - CheckOrFill(N, linkSrcMem[i], false); + CheckOrFill(N, linkSrcMem[i], false, useMemset, useHipCall); - // Allocate fine-grained GPU memory on destination GPU + // Allocate GPU memory on destination GPU HIP_CALL(hipSetDevice(links[i].dstGpu)); - HIP_CALL(hipExtMallocWithFlags((void**)&linkDstMem[i], numBytesPerLink, hipDeviceMallocFinegrained)); + if (useCoarseMem) + HIP_CALL(hipMalloc((void**)&linkDstMem[i], numBytesPerLink)); + else + HIP_CALL(hipExtMallocWithFlags((void**)&linkDstMem[i], numBytesPerLink, hipDeviceMallocFinegrained)); // Each block needs to know src/dst pointers and how many elements to transfer // Figure out the sub-array each block does for this link @@ -203,20 +225,39 @@ int main(int argc, char **argv) { HIP_CALL(hipSetDevice(links[i].srcGpu)); HIP_CALL(hipEventRecord(startEvents[i], streams[i])); - if (useMemcpy) + if (useHipCall) { + if (useMemset) + { + HIP_CALL(hipMemsetAsync(linkDstMem[i], 42, numBytesPerLink, streams[i])); + } + else + { HIP_CALL(hipMemcpyAsync(linkDstMem[i], linkSrcMem[i], numBytesPerLink, hipMemcpyDeviceToDevice, streams[i])); + } } else { + if (useMemset) + { + hipLaunchKernelGGL(MemsetKernel, + dim3(links[i].numBlocksToUse, 1, 1), + dim3(BLOCKSIZE, 1, 1), + 0, + streams[i], + gpuBlockParams[i]); + } + else + { hipLaunchKernelGGL(CopyKernel, dim3(links[i].numBlocksToUse, 1, 1), dim3(BLOCKSIZE, 1, 1), 0, streams[i], gpuBlockParams[i]); + } } HIP_CALL(hipEventRecord(stopEvents[i], streams[i])); } @@ -255,7 +296,7 @@ int main(int argc, char **argv) // Validate that each link has transferred correctly for (int i = 0; i < numLinks; i++) - CheckOrFill(N, linkDstMem[i], true); + CheckOrFill(N, linkDstMem[i], true, useMemset, useHipCall); // Report timings printf("%-*s", MAX_NAME_LEN, name); diff --git a/projects/rccl/tools/TransferBench/TransferBench.hpp b/projects/rccl/tools/TransferBench/TransferBench.hpp index 25f61e9253..5fde8f58fe 100644 --- a/projects/rccl/tools/TransferBench/TransferBench.hpp +++ b/projects/rccl/tools/TransferBench/TransferBench.hpp @@ -35,6 +35,7 @@ THE SOFTWARE. #define MAX_NAME_LEN 64 #define BLOCKSIZE 256 #define COPY_UNROLL 4 +#define MEMSET_UNROLL 4 // Each link is defined between a source GPU and destination GPU struct Link @@ -64,6 +65,22 @@ CopyKernel(BlockParam* blockParams) Copy(dst, src, N); } +// GPU set kernel +__global__ void __launch_bounds__(BLOCKSIZE) +MemsetKernel(BlockParam* blockParams) +{ + // Collect the arguments for this block + int N = blockParams[blockIdx.x].N; + float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst; + + // Use non-zero value + #pragma unroll MEMSET_UNROLL + for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE) + { + dst[tid] = 1234.0; + } +} + // Helper function to parse a link of link definitions void ParseLinks(char const* line, std::vector& links) { @@ -83,12 +100,27 @@ void ParseLinks(char const* line, std::vector& links) } // Helper function to either fill a device pointer with pseudo-random data, or to check to see if it matches -void CheckOrFill(int N, float* devPtr, bool doCheck) +void CheckOrFill(int N, float* devPtr, bool doCheck, bool isMemset, bool isHipCall) { float* refBuffer = (float*)malloc(N * sizeof(float)); - for (int i = 0; i < N; i++) - refBuffer[i] = i % 383 + 31; + if (isMemset) + { + if (isHipCall) + { + memset(refBuffer, 42, N * sizeof(float)); + } + else + { + for (int i = 0; i < N; i++) + refBuffer[i] = 1234.0f; + } + } + else + { + for (int i = 0; i < N; i++) + refBuffer[i] = (i % 383 + 31); + } if (doCheck) {