2
0

Adding ability to switch between fine/coarse grain destination GPU memory

Adding ability to switch between memset/memcpy


[ROCm/rccl commit: 648c1ee7cc]
Este cometimento está contido em:
Gilbert Lee
2019-10-28 13:16:21 -07:00
cometido por gilbertlee-amd
ascendente b4ab922f94
cometimento a99accb2cb
2 ficheiros modificados com 86 adições e 13 eliminações
+51 -10
Ver ficheiro
@@ -46,7 +46,12 @@ int main(int argc, char **argv)
printf(" would define 2 links each using 3 threadblocks from GPU0 -> GPU1, and GPU1->GPU0\n");
printf("- N: (Optional) Number of bytes to transfer per link.\n");
printf(" If not specified, defaults to 2^28=256MB. Must be a multiple of 128 bytes\n");
printf("Set env var USE_MEMCPY_ASYNC to use hipMemcpyAsync instead of copy kernel\n");
printf("\n");
printf("Environment variables:\n");
printf("======================\n");
printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n");
printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
exit(0);
}
@@ -58,17 +63,31 @@ int main(int argc, char **argv)
printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 128\n", numBytesPerLink);
exit(1);
}
printf("Operating on %zu bytes per link (%zu floats)\n", numBytesPerLink, N);
bool useHipCall = getenv("USE_HIP_CALL");
bool useMemset = getenv("USE_MEMSET");
bool useCoarseMem = getenv("USE_COARSE_MEM");
printf("Running %s%s tests (control using USE_HIP_CALL/USE_MEMSET)\n",
useHipCall ? "hipMem" : "mem",
useMemset ? "set" : "cpy");
printf("Destination memory: %s-grained (control using USE_COARSE_MEM)\n",
useCoarseMem ? "coarse" : "fine");
if (useHipCall && !useMemset)
{
if (getenv("HSA_ENABLE_SDMA") && !strcmp(getenv("HSA_ENABLE_SDMA"), "0"))
printf("Using blit kernels for hipMemcpy. (HSA_ENABLE_SDMA=0)\n");
else
printf("Using DMA copy engines (disable by setting HSA_ENABLE_SDMA=0)\n");
}
// Currently an environment variable is required in order to enable fine-grained VRAM allocations
if (!getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
if (!useCoarseMem && !getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
{
printf("[ERROR] Currently you must set HSA_FORCE_FINE_GRAIN_PCIE=1 prior to execution\n");
exit(1);
}
bool useMemcpy = getenv("USE_MEMCPY_ASYNC");
printf("Using %s\n", useMemcpy ? "hipMemcpyAsync (USE_MEMCPY_ASYNC found) [# of blocks to use will be ignored]" : "copy kernel (USE_MEMCPY_ASYNC not found)");
// Collect the number of available GPUs on this machine
int numDevices;
HIP_CALL(hipGetDeviceCount(&numDevices));
@@ -160,11 +179,14 @@ int main(int argc, char **argv)
HIP_CALL(hipEventCreate(&stopEvents[i]));
HIP_CALL(hipMalloc((void **)&linkSrcMem[i], numBytesPerLink));
HIP_CALL(hipMalloc((void**)&gpuBlockParams[i], sizeof(BlockParam) * numLinks));
CheckOrFill(N, linkSrcMem[i], false);
CheckOrFill(N, linkSrcMem[i], false, useMemset, useHipCall);
// Allocate fine-grained GPU memory on destination GPU
// Allocate GPU memory on destination GPU
HIP_CALL(hipSetDevice(links[i].dstGpu));
HIP_CALL(hipExtMallocWithFlags((void**)&linkDstMem[i], numBytesPerLink, hipDeviceMallocFinegrained));
if (useCoarseMem)
HIP_CALL(hipMalloc((void**)&linkDstMem[i], numBytesPerLink));
else
HIP_CALL(hipExtMallocWithFlags((void**)&linkDstMem[i], numBytesPerLink, hipDeviceMallocFinegrained));
// Each block needs to know src/dst pointers and how many elements to transfer
// Figure out the sub-array each block does for this link
@@ -203,20 +225,39 @@ int main(int argc, char **argv)
{
HIP_CALL(hipSetDevice(links[i].srcGpu));
HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
if (useMemcpy)
if (useHipCall)
{
if (useMemset)
{
HIP_CALL(hipMemsetAsync(linkDstMem[i], 42, numBytesPerLink, streams[i]));
}
else
{
HIP_CALL(hipMemcpyAsync(linkDstMem[i], linkSrcMem[i],
numBytesPerLink, hipMemcpyDeviceToDevice,
streams[i]));
}
}
else
{
if (useMemset)
{
hipLaunchKernelGGL(MemsetKernel,
dim3(links[i].numBlocksToUse, 1, 1),
dim3(BLOCKSIZE, 1, 1),
0,
streams[i],
gpuBlockParams[i]);
}
else
{
hipLaunchKernelGGL(CopyKernel,
dim3(links[i].numBlocksToUse, 1, 1),
dim3(BLOCKSIZE, 1, 1),
0,
streams[i],
gpuBlockParams[i]);
}
}
HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
}
@@ -255,7 +296,7 @@ int main(int argc, char **argv)
// Validate that each link has transferred correctly
for (int i = 0; i < numLinks; i++)
CheckOrFill(N, linkDstMem[i], true);
CheckOrFill(N, linkDstMem[i], true, useMemset, useHipCall);
// Report timings
printf("%-*s", MAX_NAME_LEN, name);
+35 -3
Ver ficheiro
@@ -35,6 +35,7 @@ THE SOFTWARE.
#define MAX_NAME_LEN 64
#define BLOCKSIZE 256
#define COPY_UNROLL 4
#define MEMSET_UNROLL 4
// Each link is defined between a source GPU and destination GPU
struct Link
@@ -64,6 +65,22 @@ CopyKernel(BlockParam* blockParams)
Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
}
// GPU set kernel
__global__ void __launch_bounds__(BLOCKSIZE)
MemsetKernel(BlockParam* blockParams)
{
// Collect the arguments for this block
int N = blockParams[blockIdx.x].N;
float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
// Use non-zero value
#pragma unroll MEMSET_UNROLL
for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
{
dst[tid] = 1234.0;
}
}
// Helper function to parse a link of link definitions
void ParseLinks(char const* line, std::vector<Link>& links)
{
@@ -83,12 +100,27 @@ void ParseLinks(char const* line, std::vector<Link>& links)
}
// Helper function to either fill a device pointer with pseudo-random data, or to check to see if it matches
void CheckOrFill(int N, float* devPtr, bool doCheck)
void CheckOrFill(int N, float* devPtr, bool doCheck, bool isMemset, bool isHipCall)
{
float* refBuffer = (float*)malloc(N * sizeof(float));
for (int i = 0; i < N; i++)
refBuffer[i] = i % 383 + 31;
if (isMemset)
{
if (isHipCall)
{
memset(refBuffer, 42, N * sizeof(float));
}
else
{
for (int i = 0; i < N; i++)
refBuffer[i] = 1234.0f;
}
}
else
{
for (int i = 0; i < N; i++)
refBuffer[i] = (i % 383 + 31);
}
if (doCheck)
{