Adding ability to switch between fine/coarse grain destination GPU memory
Adding ability to switch between memset/memcpy
[ROCm/rccl commit: 648c1ee7cc]
Este cometimento está contido em:
cometido por
gilbertlee-amd
ascendente
b4ab922f94
cometimento
a99accb2cb
@@ -46,7 +46,12 @@ int main(int argc, char **argv)
|
||||
printf(" would define 2 links each using 3 threadblocks from GPU0 -> GPU1, and GPU1->GPU0\n");
|
||||
printf("- N: (Optional) Number of bytes to transfer per link.\n");
|
||||
printf(" If not specified, defaults to 2^28=256MB. Must be a multiple of 128 bytes\n");
|
||||
printf("Set env var USE_MEMCPY_ASYNC to use hipMemcpyAsync instead of copy kernel\n");
|
||||
printf("\n");
|
||||
printf("Environment variables:\n");
|
||||
printf("======================\n");
|
||||
printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
|
||||
printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n");
|
||||
printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
@@ -58,17 +63,31 @@ int main(int argc, char **argv)
|
||||
printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 128\n", numBytesPerLink);
|
||||
exit(1);
|
||||
}
|
||||
printf("Operating on %zu bytes per link (%zu floats)\n", numBytesPerLink, N);
|
||||
|
||||
bool useHipCall = getenv("USE_HIP_CALL");
|
||||
bool useMemset = getenv("USE_MEMSET");
|
||||
bool useCoarseMem = getenv("USE_COARSE_MEM");
|
||||
printf("Running %s%s tests (control using USE_HIP_CALL/USE_MEMSET)\n",
|
||||
useHipCall ? "hipMem" : "mem",
|
||||
useMemset ? "set" : "cpy");
|
||||
printf("Destination memory: %s-grained (control using USE_COARSE_MEM)\n",
|
||||
useCoarseMem ? "coarse" : "fine");
|
||||
if (useHipCall && !useMemset)
|
||||
{
|
||||
if (getenv("HSA_ENABLE_SDMA") && !strcmp(getenv("HSA_ENABLE_SDMA"), "0"))
|
||||
printf("Using blit kernels for hipMemcpy. (HSA_ENABLE_SDMA=0)\n");
|
||||
else
|
||||
printf("Using DMA copy engines (disable by setting HSA_ENABLE_SDMA=0)\n");
|
||||
}
|
||||
|
||||
// Currently an environment variable is required in order to enable fine-grained VRAM allocations
|
||||
if (!getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
|
||||
if (!useCoarseMem && !getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
|
||||
{
|
||||
printf("[ERROR] Currently you must set HSA_FORCE_FINE_GRAIN_PCIE=1 prior to execution\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
bool useMemcpy = getenv("USE_MEMCPY_ASYNC");
|
||||
printf("Using %s\n", useMemcpy ? "hipMemcpyAsync (USE_MEMCPY_ASYNC found) [# of blocks to use will be ignored]" : "copy kernel (USE_MEMCPY_ASYNC not found)");
|
||||
|
||||
// Collect the number of available GPUs on this machine
|
||||
int numDevices;
|
||||
HIP_CALL(hipGetDeviceCount(&numDevices));
|
||||
@@ -160,11 +179,14 @@ int main(int argc, char **argv)
|
||||
HIP_CALL(hipEventCreate(&stopEvents[i]));
|
||||
HIP_CALL(hipMalloc((void **)&linkSrcMem[i], numBytesPerLink));
|
||||
HIP_CALL(hipMalloc((void**)&gpuBlockParams[i], sizeof(BlockParam) * numLinks));
|
||||
CheckOrFill(N, linkSrcMem[i], false);
|
||||
CheckOrFill(N, linkSrcMem[i], false, useMemset, useHipCall);
|
||||
|
||||
// Allocate fine-grained GPU memory on destination GPU
|
||||
// Allocate GPU memory on destination GPU
|
||||
HIP_CALL(hipSetDevice(links[i].dstGpu));
|
||||
HIP_CALL(hipExtMallocWithFlags((void**)&linkDstMem[i], numBytesPerLink, hipDeviceMallocFinegrained));
|
||||
if (useCoarseMem)
|
||||
HIP_CALL(hipMalloc((void**)&linkDstMem[i], numBytesPerLink));
|
||||
else
|
||||
HIP_CALL(hipExtMallocWithFlags((void**)&linkDstMem[i], numBytesPerLink, hipDeviceMallocFinegrained));
|
||||
|
||||
// Each block needs to know src/dst pointers and how many elements to transfer
|
||||
// Figure out the sub-array each block does for this link
|
||||
@@ -203,20 +225,39 @@ int main(int argc, char **argv)
|
||||
{
|
||||
HIP_CALL(hipSetDevice(links[i].srcGpu));
|
||||
HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
|
||||
if (useMemcpy)
|
||||
if (useHipCall)
|
||||
{
|
||||
if (useMemset)
|
||||
{
|
||||
HIP_CALL(hipMemsetAsync(linkDstMem[i], 42, numBytesPerLink, streams[i]));
|
||||
}
|
||||
else
|
||||
{
|
||||
HIP_CALL(hipMemcpyAsync(linkDstMem[i], linkSrcMem[i],
|
||||
numBytesPerLink, hipMemcpyDeviceToDevice,
|
||||
streams[i]));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (useMemset)
|
||||
{
|
||||
hipLaunchKernelGGL(MemsetKernel,
|
||||
dim3(links[i].numBlocksToUse, 1, 1),
|
||||
dim3(BLOCKSIZE, 1, 1),
|
||||
0,
|
||||
streams[i],
|
||||
gpuBlockParams[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
hipLaunchKernelGGL(CopyKernel,
|
||||
dim3(links[i].numBlocksToUse, 1, 1),
|
||||
dim3(BLOCKSIZE, 1, 1),
|
||||
0,
|
||||
streams[i],
|
||||
gpuBlockParams[i]);
|
||||
}
|
||||
}
|
||||
HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
|
||||
}
|
||||
@@ -255,7 +296,7 @@ int main(int argc, char **argv)
|
||||
|
||||
// Validate that each link has transferred correctly
|
||||
for (int i = 0; i < numLinks; i++)
|
||||
CheckOrFill(N, linkDstMem[i], true);
|
||||
CheckOrFill(N, linkDstMem[i], true, useMemset, useHipCall);
|
||||
|
||||
// Report timings
|
||||
printf("%-*s", MAX_NAME_LEN, name);
|
||||
|
||||
@@ -35,6 +35,7 @@ THE SOFTWARE.
|
||||
#define MAX_NAME_LEN 64
|
||||
#define BLOCKSIZE 256
|
||||
#define COPY_UNROLL 4
|
||||
#define MEMSET_UNROLL 4
|
||||
|
||||
// Each link is defined between a source GPU and destination GPU
|
||||
struct Link
|
||||
@@ -64,6 +65,22 @@ CopyKernel(BlockParam* blockParams)
|
||||
Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
|
||||
}
|
||||
|
||||
// GPU set kernel
|
||||
__global__ void __launch_bounds__(BLOCKSIZE)
|
||||
MemsetKernel(BlockParam* blockParams)
|
||||
{
|
||||
// Collect the arguments for this block
|
||||
int N = blockParams[blockIdx.x].N;
|
||||
float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
|
||||
|
||||
// Use non-zero value
|
||||
#pragma unroll MEMSET_UNROLL
|
||||
for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
|
||||
{
|
||||
dst[tid] = 1234.0;
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to parse a link of link definitions
|
||||
void ParseLinks(char const* line, std::vector<Link>& links)
|
||||
{
|
||||
@@ -83,12 +100,27 @@ void ParseLinks(char const* line, std::vector<Link>& links)
|
||||
}
|
||||
|
||||
// Helper function to either fill a device pointer with pseudo-random data, or to check to see if it matches
|
||||
void CheckOrFill(int N, float* devPtr, bool doCheck)
|
||||
void CheckOrFill(int N, float* devPtr, bool doCheck, bool isMemset, bool isHipCall)
|
||||
{
|
||||
float* refBuffer = (float*)malloc(N * sizeof(float));
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
refBuffer[i] = i % 383 + 31;
|
||||
if (isMemset)
|
||||
{
|
||||
if (isHipCall)
|
||||
{
|
||||
memset(refBuffer, 42, N * sizeof(float));
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < N; i++)
|
||||
refBuffer[i] = 1234.0f;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < N; i++)
|
||||
refBuffer[i] = (i % 383 + 31);
|
||||
}
|
||||
|
||||
if (doCheck)
|
||||
{
|
||||
|
||||
Criar uma nova questão referindo esta
Bloquear um utilizador