diff --git a/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp index 0c2c387c2f..cad5065e78 100644 --- a/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp +++ b/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -33,11 +33,13 @@ THE SOFTWARE. #ifdef __HIP_PLATFORM_HCC__ #include -#define USE_HCC_MEMTRACKER 0 #endif +#define USE_HCC_MEMTRACKER 0 /* Debug flag to show the memtracker periodically */ -int elementSizes[] = {16, 1024,524288}; + + +int elementSizes[] = {1, 16, 1024, 524288, 16*1000*1000}; int nSizes = sizeof(elementSizes) / sizeof(int); int enablePeers(int dev0, int dev1) @@ -57,8 +59,9 @@ int enablePeers(int dev0, int dev1) return 0; }; +// Set value of array to specified 32-bit integer: __global__ void -memsetIntKernel(/*hipLaunchParm lp,*/ int * ptr, const int val, size_t numElements) +memsetIntKernel(int * ptr, const int val, size_t numElements) { int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); int stride = hipBlockDim_x * hipGridDim_x ; @@ -68,7 +71,7 @@ memsetIntKernel(/*hipLaunchParm lp,*/ int * ptr, const int val, size_t numElemen }; __global__ void -memcpyIntKernel(/*hipLaunchParm lp, */const int * src, int* dst, size_t numElements) +memcpyIntKernel(const int * src, int* dst, size_t numElements) { int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); int stride = hipBlockDim_x * hipGridDim_x ; @@ -77,6 +80,9 @@ memcpyIntKernel(/*hipLaunchParm lp, */const int * src, int* dst, size_t numEleme } }; + +// CHeck arrays in reverse order, to more easily detect cases where +// the copy is "partially" done. void checkReverse(const int *ptr, int numElements, int expected) { for (int i=numElements-1; i>=0; i--) { if (ptr[i] != expected) { @@ -88,7 +94,8 @@ void checkReverse(const int *ptr, int numElements, int expected) { printf ("test: OK\n"); } -void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, + +void runTestImpl(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, int * dataGpu0_0, int * dataGpu0_1, int *dataGpu1, int *dataHost, int expected) { hipEvent_t e; @@ -96,7 +103,7 @@ void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_ HIPCHECK(hipEventCreateWithFlags(&e,0)); } const size_t sizeElements = numElements * sizeof(int); - printf ("test: runTest with %zu bytes %s with hostSync %s\n", sizeElements, stepAIsCopy ? "copy" : "kernel", hostSync ? "enabled" : "disabled"); + printf ("test: runTestImpl with %zu bytes %s with hostSync %s\n", sizeElements, stepAIsCopy ? "copy" : "kernel", hostSync ? "enabled" : "disabled"); hipStream_t stepAStream = gpu0Stream; @@ -127,9 +134,12 @@ void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_ HIPCHECK(hipStreamSynchronize(gpu0Stream)); checkReverse(dataHost, numElements, expected); + if(!hostSync) { + HIPCHECK(hipEventDestroy(e)); + } } -void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync, bool useMemcpy) +void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync) { const size_t sizeElements = numElements * sizeof(int); @@ -163,12 +173,15 @@ void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync, bool useMe #endif printf (" test: init complete\n"); - runTest(useMemcpy , hostSync, gpu0Stream, gpu1Stream, numElements, dataGpu0_0,dataGpu0_1, dataGpu1, dataHost, expected); + runTestImpl(true, hostSync, gpu0Stream, gpu1Stream, numElements, dataGpu0_0,dataGpu0_1, dataGpu1, dataHost, expected); HIPCHECK(hipFree(dataGpu0_0)); HIPCHECK(hipFree(dataGpu0_1)); HIPCHECK(hipFree(dataGpu1)); HIPCHECK(hipHostFree(dataHost)); + + HIPCHECK(hipStreamDestroy(gpu0Stream)); + HIPCHECK(hipStreamDestroy(gpu1Stream)); }; int main(int argc, char *argv[]) @@ -192,11 +205,9 @@ int main(int argc, char *argv[]) return -1; }; - for(int index = 1;index < nSizes;index++) { - testMultiGpu(dev0, dev1, elementSizes[index] , false /* GPU Synchronization*/, true); - testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/, true); - testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/, false); - testMultiGpu(dev0, dev1, elementSizes[index] , false /*Host Synchronization*/, false); + for(int index = 0;index < nSizes;index++) { + testMultiGpu(dev0, dev1, elementSizes[index] , false /*GPU Synchronization*/); + testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/); } diff --git a/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp b/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp new file mode 100644 index 0000000000..2e514e1e3a --- /dev/null +++ b/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp @@ -0,0 +1,391 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +//ROCM_TARGET=gfx900 hipcc --genco memcpyInt.device.cpp -o memcpyInt.hsaco +//hipcc copy_coherency.cpp -I ~/X/HIP/tests/src/ ~/X/HIP/tests/src/test_common.cpp + + +// TODO - add code object support here. +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * RUN: %t + * HIT_END + */ + + +// Test cache management (fences) and synchronization between kernel and copy commands. +// Exhaustively tests 3 command types (copy, kernel, module kernel), +// many sync types (see SyncType), followed by another command, across a sweep +// of data sizes designed to stress various levels of the memory hierarchy. + +#include "hip/hip_runtime.h" +#include "test_common.h" + +// TODO - turn this back on when test infra can copy the module files to use as test inputs. +#define SKIP_MODULE_KERNEL 1 + + +class MemcpyFunction +{ +public: + MemcpyFunction(const char *fileName, const char *functionName) { load(fileName, functionName); }; + void load(const char *fileName, const char *functionName); + void launch(int * dst, const int * src, size_t numElements, hipStream_t s); + +private: + hipFunction_t _function; + hipModule_t _module; +}; + + +void MemcpyFunction::load(const char *fileName, const char *functionName) +{ +#if SKIP_MODULE_KERNEL!=1 + HIPCHECK(hipModuleLoad(&_module, fileName)); + HIPCHECK(hipModuleGetFunction(&_function, _module, functionName)); +#endif +}; + + +void MemcpyFunction::launch(int * dst, const int * src, size_t numElements, hipStream_t s) +{ + struct { + uint32_t _hidden[6]; + int* _dst; + const int* _src; + size_t _numElements; + } args; + + args._dst = dst; + args._src = src; + args._numElements = numElements; + + size_t size = sizeof(args); + void *config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END + }; + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + HIPCHECK(hipModuleLaunchKernel(_function, blocks, 1, 1, threadsPerBlock, 1, 1, 0/*dynamicShared*/, s, NULL, (void**)&config)); +}; + +bool g_warnOnFail = true; +//int g_elementSizes[] = {1, 16, 1024, 524288, 16*1000*1000}; // TODO +int g_elementSizes[] = {128*1000, 256*1000, 16*1000*1000}; + +MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel"); + + + +// Set value of array to specified 32-bit integer: +__global__ void +memsetIntKernel(int * ptr, const int val, size_t numElements) +{ + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; + for (size_t i= gid; i< numElements; i+=stride){ + ptr[i] = val; + } +}; + +__global__ void +memcpyIntKernel(int *dst, const int * src, size_t numElements) +{ + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; + for (size_t i= gid; i< numElements; i+=stride){ + dst[i] = src[i]; + } +}; + + +// CHeck arrays in reverse order, to more easily detect cases where +// the copy is "partially" done. +void checkReverse(const int *ptr, int numElements, int expected) { + int mismatchCnt = 0; + for (int i=numElements-1; i>=0; i--) { + if (ptr[i] != expected) { + fprintf (stderr, "%s**error: i=%d, ptr[i] == (%x) , does not equal expected (%x)\n%s", KRED, i, ptr[i], expected, KNRM); + if (!g_warnOnFail) { + assert (ptr[i] == expected); + } + if (++mismatchCnt >= 10) { + break; + } + } + } + + fprintf (stderr, "test: OK\n"); +} + +#define ENUM_CASE_STR(x) case x: return #x + +enum CmdType { + COPY, + KERNEL, + MODULE_KERNEL, + MAX_CmdType +}; + + +const char * CmdTypeStr(CmdType c) +{ + switch(c) { + ENUM_CASE_STR(COPY); + ENUM_CASE_STR(KERNEL); + ENUM_CASE_STR(MODULE_KERNEL); + default: return "UNKNOWN"; + }; +} + + +enum SyncType { + NONE, + EVENT_QUERY, + EVENT_SYNC, + STREAM_WAIT_EVENT, + STREAM_QUERY, + STREAM_SYNC, + DEVICE_SYNC, + MAX_SyncType +}; + + +const char * SyncTypeStr(SyncType s) +{ + switch(s) { + ENUM_CASE_STR(NONE); + ENUM_CASE_STR(EVENT_QUERY); + ENUM_CASE_STR(EVENT_SYNC); + ENUM_CASE_STR(STREAM_WAIT_EVENT); + ENUM_CASE_STR(STREAM_QUERY); + ENUM_CASE_STR(STREAM_SYNC); + ENUM_CASE_STR(DEVICE_SYNC); + default: return "UNKNOWN"; + }; +}; + + +void runCmd(CmdType cmd, int *dst, const int *src, hipStream_t s, size_t numElements) +{ + switch (cmd) { + case COPY: + HIPCHECK(hipMemcpyAsync(dst, src, numElements*sizeof(int), hipMemcpyDeviceToDevice, s)); + break; + case KERNEL: + { + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, s, + dst, src, numElements); + } + break; + case MODULE_KERNEL: + g_moduleMemcpy.launch(dst, src, numElements, s); + break; + default: + failed("unknown cmd=%d type", cmd); + }; +} + +void resetInputs( int * Ad, int * Bd, int *Cd, int *Ch, size_t numElements, int expected) +{ + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, hipStream_t(0), + Ad, expected, numElements); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, hipStream_t(0), + Bd, 0xDEADBEEF, numElements); // poison with bad value to ensure is overwritten correctly + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, hipStream_t(0), + Bd, 0xF000BA55, numElements); // poison with bad value to ensure is overwritten correctly + memset(Ch, 13, numElements*sizeof(int)); // poison with bad value to ensure is overwritten correctly + HIPCHECK(hipDeviceSynchronize()); +} + +// Intended to test proper synchronization and cache flushing between CMDA and CMDB. +// CMD are of type CmdType. All command copy memory, using either hipMemcpyAsync or kernel implementations. +// CmdA copies from Ad to Bd, +// Some form of synchronization is applied. +// Then cmdB copies from Bd to Cd. +// +// Cd is then copied to host Ch using a memory copy. +// +// Correct result at the end is that Ch contains the contents originally in Ad (integer 0x42) +void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType, + hipStream_t stream1, hipStream_t stream2, int numElements, + int * Ad, int * Bd, int *Cd, int *Ch, + int expected) +{ + hipEvent_t e; + HIPCHECK(hipEventCreateWithFlags(&e,0)); + + resetInputs(Ad, Bd, Cd, Ch, numElements, expected); + + const size_t sizeElements = numElements * sizeof(int); + fprintf (stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n", + sizeElements, (double) (sizeElements/1024.0), CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType)); + + if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) { + fprintf (stderr, "warn: skipping since test infra does not yet support modules\n"); + return; + } + + + // Step A: + runCmd(cmdAType, Bd, Ad, stream1, numElements); + + + // Sync in-between? + switch (syncType) { + case NONE: + break; + case EVENT_QUERY: + { + hipError_t st = hipErrorNotReady; + HIPCHECK(hipEventRecord(e, stream1)); + do { + st = hipEventQuery(e); + } while (st == hipErrorNotReady); + HIPCHECK(st); + } + break; + case EVENT_SYNC: + HIPCHECK(hipEventRecord(e, stream1)); + HIPCHECK(hipEventSynchronize(e)); + break; + case STREAM_WAIT_EVENT: + HIPCHECK(hipEventRecord(e, stream1)); + HIPCHECK(hipStreamWaitEvent(stream2, e, 0)); + break; + case STREAM_QUERY: + { + hipError_t st = hipErrorNotReady; + do { + st = hipStreamQuery(stream1); + } while (st == hipErrorNotReady); + HIPCHECK(st); + } + break; + case STREAM_SYNC: + HIPCHECK(hipStreamSynchronize(stream1)); + break; + case DEVICE_SYNC: + HIPCHECK(hipDeviceSynchronize()); + break; + default: + fprintf(stderr, "warning: unknown sync type=%s", SyncTypeStr(syncType)); + return; // FIXME, this doesn't clean up + //failed("unknown sync type=%s", SyncTypeStr(syncType)); + }; + + + runCmd(cmdBType, Cd, Bd, stream2, numElements); + + + // Copy back to host, use async copy to avoid any extra synchronization that might mask issues. + HIPCHECK(hipMemcpyAsync(Ch, Cd, sizeElements, hipMemcpyDeviceToHost, stream2)); + HIPCHECK(hipStreamSynchronize(stream2)); + + checkReverse(Ch, numElements, expected); + + HIPCHECK(hipEventDestroy(e)); +}; + + +void testWrapper(size_t numElements) +{ + const size_t sizeElements = numElements * sizeof(int); + const int expected = 0x42; + int * Ad, * Bd, *Cd, *Ch; + + HIPCHECK(hipMalloc(&Ad, sizeElements)); + HIPCHECK(hipMalloc(&Bd, sizeElements)); + HIPCHECK(hipMalloc(&Cd, sizeElements)); + HIPCHECK(hipHostMalloc(&Ch, sizeElements)); // Ch is the end array + + + + hipStream_t stream1, stream2; + + HIPCHECK(hipStreamCreate(&stream1)); + HIPCHECK(hipStreamCreate(&stream2)); + + + HIPCHECK(hipDeviceSynchronize()); + fprintf (stderr, "test: init complete, start running tests\n"); + + + runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); + + for (int cmdA=0; cmdA + + + +extern "C" __global__ void +memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements) +{ + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; + for (size_t i= gid; i< numElements; i+=stride){ + dst[i] = src[i]; + } +}; diff --git a/hipamd/tests/src/test_common.h b/hipamd/tests/src/test_common.h index e3cc664d56..35f0aedb25 100644 --- a/hipamd/tests/src/test_common.h +++ b/hipamd/tests/src/test_common.h @@ -23,7 +23,6 @@ THE SOFTWARE. #include #include "hip/hip_runtime.h" -#include "hip/hip_texture_types.h" #include "hip/hip_runtime_api.h" #define HC __attribute__((hc))