diff --git a/projects/hip-tests/catch/perftests/memory/hipPerfMemset.cc b/projects/hip-tests/catch/perftests/memory/hipPerfMemset.cc index 7ae952e02c..e1e22aa149 100644 --- a/projects/hip-tests/catch/perftests/memory/hipPerfMemset.cc +++ b/projects/hip-tests/catch/perftests/memory/hipPerfMemset.cc @@ -27,16 +27,13 @@ #include -static size_t typeSizeList[] = { - 1, 2, 4, 8, 16, 32, 64, 128, -}; - static unsigned int sizeList[] = { 256, 512, 1024, 2048, 4096, 8192, }; static unsigned int eleNumList[] = { - 0x0020000, 0x0080000, 0x0200000, 0x0800000, 0x2000000, + 0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, + 0x200000, 0x400000, 0x800000, 0x1000000 }; typedef struct _dataType { @@ -46,21 +43,21 @@ typedef struct _dataType { int memsetD32val = 0xDEADBEEF; }dataType; -#define NUM_ITER 100 +#define NUM_ITER 1000 enum MemsetType { hipMemsetTypeDefault, hipMemsetTypeD8, hipMemsetTypeD16, - hipMemsetTypeD32 + hipMemsetTypeD32, + hipMemsetTypeMax + }; class hipPerfMemset { private: - unsigned int bufSize_; - unsigned int num_typeSize_; + uint64_t bufSize_; unsigned int num_elements_; - size_t testTypeSize_; unsigned int testNumEle_; unsigned int _numSubTests = 0; unsigned int _numSubTests2D = 0; @@ -69,13 +66,12 @@ class hipPerfMemset { public: hipPerfMemset() { - num_typeSize_ = sizeof(typeSizeList) / sizeof(size_t); - num_elements_ = sizeof(eleNumList) / sizeof(unsigned int); - _numSubTests = num_elements_ * num_typeSize_; + num_elements_ = sizeof(eleNumList) / sizeof(unsigned int); + _numSubTests = num_elements_ * hipMemsetTypeMax; - num_sizes_ = sizeof(sizeList) / sizeof(unsigned int); - _numSubTests2D = num_sizes_; - _numSubTests3D = _numSubTests2D; + num_sizes_ = sizeof(sizeList) / sizeof(unsigned int); + _numSubTests2D = num_sizes_; + _numSubTests3D = _numSubTests2D; } ~hipPerfMemset() {} @@ -124,20 +120,25 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type, bool async) { T *A_h, *A_d; - testTypeSize_ = typeSizeList[(test / num_elements_) % num_typeSize_]; testNumEle_ = eleNumList[test % num_elements_]; - bufSize_ = testNumEle_ * 4; + bufSize_ = testNumEle_ * sizeof(uint32_t); HIP_CHECK(hipMalloc(&A_d, bufSize_)); A_h = reinterpret_cast (malloc(bufSize_)); hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); // Warm-up - HIP_CHECK(hipMemset(reinterpret_cast(A_d), memsetval, bufSize_)); + if (async) { + HIP_CHECK(hipMemsetAsync((void *)A_d, memsetval, bufSize_, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + } else { + HIP_CHECK(hipMemset((void *)A_d, memsetval, bufSize_)); + HIP_CHECK(hipDeviceSynchronize()); + } auto start = std::chrono::steady_clock::now(); @@ -149,29 +150,28 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, } else if (type == hipMemsetTypeD8 && !async) { HIP_CHECK(hipMemsetD8((hipDeviceptr_t)A_d, memsetval, bufSize_)); } else if (type == hipMemsetTypeD8 && async) { - HIP_CHECK(hipMemsetD8Async((hipDeviceptr_t)A_d, memsetval, bufSize_)); + HIP_CHECK(hipMemsetD8Async((hipDeviceptr_t)A_d, memsetval, bufSize_, stream)); } else if (type == hipMemsetTypeD16 && !async) { - HIP_CHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, - bufSize_/sizeof(T))); + HIP_CHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); } else if (type == hipMemsetTypeD16 && async) { - HIP_CHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, - bufSize_/sizeof(T))); + HIP_CHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream)); } else if (type == hipMemsetTypeD32 && !async) { - HIP_CHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, - bufSize_/sizeof(T))); + HIP_CHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); } else if (type == hipMemsetTypeD32 && async) { - HIP_CHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, - bufSize_/sizeof(T))); + HIP_CHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream)); } } - - HIP_CHECK(hipDeviceSynchronize()); + if (async) { + HIPCHECK(hipStreamSynchronize(stream)); + } else { + HIPCHECK(hipDeviceSynchronize()); + } auto end = std::chrono::steady_clock::now(); HIP_CHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost) ); - for (int i = 0; i < bufSize_/testTypeSize_; i++) { + for (int i = 0; i < bufSize_ / sizeof(T); i++) { if (A_h[i] != memsetval) { INFO("mismatch at index " << i << " computed: " << static_cast (A_h[i]) << ", memsetval: " << @@ -188,9 +188,10 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, auto sec = diff.count(); auto perf = static_cast((bufSize_ * NUM_ITER * (1e-09)) / sec); - INFO("hipPerf1DMemset[" << test << "] " << (int)bufSize_/1024 << " Kb " - << std::setw(4) << " typeSize " << (int) testTypeSize_ << ":" - << std::setw(5) << perf << " GB/s \n"); + std::cout << "[" << std::setw(2) + << test << "] " << std::setw(5) << bufSize_/1024 + << " Kb " << std::setw(4) << " typeSize " << sizeof(T) << " : " + << std::setw(7) << perf << " GB/s \n"; } template @@ -215,11 +216,16 @@ void hipPerfMemset::run2D(unsigned int test, T memsetval, } hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); // Warm-up - HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH)); - + if (async) { + HIP_CHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + } else { + HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH)); + HIP_CHECK(hipDeviceSynchronize()); + } auto start = std::chrono::steady_clock::now(); for (uint i = 0; i < NUM_ITER; i++) { @@ -230,7 +236,11 @@ void hipPerfMemset::run2D(unsigned int test, T memsetval, } } - HIP_CHECK(hipStreamSynchronize(stream)); + if (async) { + HIP_CHECK(hipStreamSynchronize(stream)); + } else { + HIP_CHECK(hipDeviceSynchronize()); + } auto end = std::chrono::steady_clock::now(); @@ -251,9 +261,9 @@ void hipPerfMemset::run2D(unsigned int test, T memsetval, auto sec = diff.count(); auto perf = static_cast((sizeElements* NUM_ITER * (1e-09)) / sec); - INFO("hipPerf2DMemset[" << test << "] " <<" " << "(GB/s) for " << - (int)bufSize_ << " x " << bufSize_ << " bytes : " << std::setw(5) << - perf << "\n"); + std::cout << "hipPerf2DMemset" << (async ? "Async" : " ") << "[" << test << "] " + << " " << "(GB/s) for " << std::setw(5) << bufSize_ + << " x " << std::setw(5) << bufSize_ << " bytes : " << std::setw(7) << perf << "\n"; HIP_CHECK(hipStreamDestroy(stream)); HIP_CHECK(hipFree(A_d)); @@ -273,7 +283,7 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval, size_t elements = numW* numH* depth; hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); T *A_h; @@ -289,7 +299,13 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval, } // Warm up - HIP_CHECK(hipMemset3D(devPitchedPtr, memsetval, extent)); + if (async) { + HIP_CHECK(hipMemset3DAsync(devPitchedPtr, memsetval, extent, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + } else { + HIP_CHECK(hipMemset3D(devPitchedPtr, memsetval, extent)); + HIP_CHECK(hipDeviceSynchronize()); + } auto start = std::chrono::steady_clock::now(); @@ -301,7 +317,11 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval, } } - HIP_CHECK(hipStreamSynchronize(stream)); + if (async) { + HIP_CHECK(hipStreamSynchronize(stream)); + } else { + HIP_CHECK(hipDeviceSynchronize()); + } auto end = std::chrono::steady_clock::now(); @@ -330,9 +350,9 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval, auto sec = diff.count(); auto perf = static_cast((sizeElements * NUM_ITER * (1e-09)) / sec); - INFO("hipPerf3DMemset[" << test << "] " <<" " << "(GB/s) for " << - (int)bufSize_ << " x " << bufSize_ << " x " < -#include - -static size_t typeSizeList[] = { - 1, 2, 4, 8, 16, 32, 64, 128, -}; - -static unsigned int sizeList[] = { - 256, 512, 1024, 2048, 4096, 8192, -}; - -static unsigned int eleNumList[] = { - 0x0020000, 0x0080000, 0x0200000, 0x0800000, 0x2000000, -}; - -typedef struct _dataType { -char memsetval = 0x42; -char memsetD8val = 0xDE; -int16_t memsetD16val = 0xDEAD; -int memsetD32val = 0xDEADBEEF; -}dataType; - -#define NUM_ITER 100 - -enum MemsetType { - hipMemsetTypeDefault, - hipMemsetTypeD8, - hipMemsetTypeD16, - hipMemsetTypeD32 -}; - -using namespace std; - -class hipPerfMemset { - private: - unsigned int bufSize_; - unsigned int num_typeSize_; - unsigned int num_elements_; - size_t testTypeSize_; - unsigned int testNumEle_; - unsigned int _numSubTests = 0; - unsigned int _numSubTests2D = 0; - unsigned int _numSubTests3D = 0; - unsigned int num_sizes_ =0; - - public: - hipPerfMemset() { - num_typeSize_ = sizeof(typeSizeList) / sizeof(size_t); - num_elements_ = sizeof(eleNumList) / sizeof(unsigned int); - _numSubTests = num_elements_ * num_typeSize_; - - num_sizes_ = sizeof(sizeList) / sizeof(unsigned int); - _numSubTests2D = num_sizes_; - _numSubTests3D = _numSubTests2D; - }; - - ~hipPerfMemset() {}; - - void open(int deviceID); - - template - void run1D(unsigned int test, T memsetval, enum MemsetType type, bool async); - - template - void run2D(unsigned int test, T memsetval, enum MemsetType type, bool async); - - template - void run3D(unsigned int test, T memsetval, enum MemsetType type, bool async); - - uint getNumTests() { - return _numSubTests; - } - - uint getNumTests2D() { - return _numSubTests2D; - } - uint getNumTests3D() { - return _numSubTests3D; - } -}; - - -void hipPerfMemset::open(int deviceId) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } - - HIPCHECK(hipSetDevice(deviceId)); - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId - << std::endl; -} - -template -void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type, bool async) { - - T * A_h; - T * A_d; - - testTypeSize_ = typeSizeList[(test / num_elements_) % num_typeSize_]; - testNumEle_ = eleNumList[test % num_elements_]; - - bufSize_ = testNumEle_ * 4; - - HIPCHECK(hipMalloc(&A_d, bufSize_)); - - A_h = reinterpret_cast (malloc(bufSize_)); - - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - - // Warm-up - HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_)); - - auto start = chrono::steady_clock::now(); - - for (uint i = 0; i < NUM_ITER; i++) { - if (type == hipMemsetTypeDefault && !async) { - HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_)); - } - else if (type == hipMemsetTypeDefault && async) { - HIPCHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream)); - } - else if (type == hipMemsetTypeD8 && !async){ - HIPCHECK(hipMemsetD8((hipDeviceptr_t)A_d, memsetval, bufSize_)); - } - else if (type == hipMemsetTypeD8 && async) { - HIPCHECK(hipMemsetD8Async((hipDeviceptr_t)A_d, memsetval, bufSize_)); - } - else if (type == hipMemsetTypeD16 && !async) { - HIPCHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); - } - else if (type == hipMemsetTypeD16 && async) { - HIPCHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); - } - else if (type == hipMemsetTypeD32 && !async) { - HIPCHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); - } - else if (type == hipMemsetTypeD32 && async) { - HIPCHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); - } - } - - HIPCHECK(hipDeviceSynchronize()); - - auto end = chrono::steady_clock::now(); - - HIPCHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost) ); - - for (int i = 0; i < bufSize_/testTypeSize_; i++) { - if (A_h[i] != memsetval) { - cout << "mismatch at index " << i << " computed: " << static_cast (A_h[i]) - << ", memsetval: " << static_cast (memsetval) << endl; - break; - } - } - - HIPCHECK(hipFree(A_d)); - free(A_h); - - chrono::duration diff = end - start; - - auto sec = diff.count(); - - auto perf = static_cast((bufSize_ * NUM_ITER * (double)(1e-09)) / sec); - - cout << " hipPerf1DMemset[" << test << "] " << (int)bufSize_/1024 << " Kb " << setw(4) - << " typeSize " << (int) testTypeSize_ << ":" << setw(5) << perf << " GB/s " < -void hipPerfMemset::run2D(unsigned int test, T memsetval, enum MemsetType type, bool async) { - - bufSize_ = sizeList[test % num_sizes_]; - - size_t numH = bufSize_; - size_t numW = bufSize_; - size_t pitch_A; - size_t width = numW * sizeof(char); - size_t sizeElements = width * numH; - size_t elements = numW* numH; - - T * A_h; - T * A_d; - - HIPCHECK(hipMallocPitch(reinterpret_cast(&A_d), &pitch_A, width , - numH)); - A_h = reinterpret_cast(malloc(sizeElements)); - - for (size_t i=0; i < elements; i++) { - A_h[i] = 1; - } - - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - - // Warm-up - HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH)); - - - auto start = chrono::steady_clock::now(); - - for (uint i = 0; i < NUM_ITER; i++) { - if (type == hipMemsetTypeDefault && !async) { - HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH)); - } - else if (type == hipMemsetTypeDefault && async) { - HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream)); - } - } - - HIPCHECK(hipStreamSynchronize(stream)); - - auto end = chrono::steady_clock::now(); - - HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, - hipMemcpyDeviceToHost)); - - for (int i=0; i < elements; i++) { - if (A_h[i] != memsetval) { - cout << "mismatch at index " << i << " computed: " << static_cast (A_h[i]) - << ", memsetval: " << static_cast (memsetval) << endl; - break; - } - } - - chrono::duration diff = end - start; - - auto sec = diff.count(); - - auto perf = static_cast((sizeElements* NUM_ITER * (double)(1e-09)) / sec); - - cout << " hipPerf2DMemset[" << test << "] " <<" " << "(GB/s) for " << (int)bufSize_ - << " x " << bufSize_ << " bytes : " << setw(5)<< perf << endl; - - HIPCHECK(hipStreamDestroy(stream)); - HIPCHECK(hipFree(A_d)); - free(A_h); -} - -template -void hipPerfMemset::run3D(unsigned int test, T memsetval, enum MemsetType type, bool async) { - - bufSize_ = sizeList[test % num_sizes_]; - - size_t numH = bufSize_; - size_t numW = bufSize_; - size_t depth = 10; - size_t width = numW * sizeof(char); - size_t sizeElements = width * numH * depth; - size_t elements = numW* numH* depth; - - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - - T *A_h; - - hipExtent extent = make_hipExtent(width, numH, depth); - hipPitchedPtr devPitchedPtr; - - HIPCHECK(hipMalloc3D(&devPitchedPtr, extent)); - A_h = (char*)malloc(sizeElements); - HIPASSERT(A_h != NULL); - - for (size_t i=0; i (A_h[i]) - << ", memsetval: " << static_cast (memsetval) << endl; - break; - } - } - - chrono::duration diff = end - start; - - auto sec = diff.count(); - - auto perf = static_cast((sizeElements * NUM_ITER * (double)(1e-09)) / sec); - - cout << " hipPerf3DMemset[" << test << "] " <<" " << "(GB/s) for " << (int)bufSize_ - << " x " << bufSize_ << " x " < +#include + +static unsigned int sizeList[] = { + 256, 512, 1024, 2048, 4096, 8192, +}; + +static unsigned int eleNumList[] = { + 0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, + 0x200000, 0x400000, 0x800000, 0x1000000 +}; + +typedef struct _dataType { +char memsetval = 0x42; +char memsetD8val = 0xDE; +int16_t memsetD16val = 0xDEAD; +int memsetD32val = 0xDEADBEEF; +}dataType; + +#define NUM_ITER 1000 + +enum MemsetType { + hipMemsetTypeDefault, + hipMemsetTypeD8, + hipMemsetTypeD16, + hipMemsetTypeD32, + hipMemsetTypeMax + +}; + +using namespace std; + +class hipPerfMemset { + private: + uint64_t bufSize_; + unsigned int num_elements_; + unsigned int testNumEle_; + unsigned int _numSubTests = 0; + unsigned int _numSubTests2D = 0; + unsigned int _numSubTests3D = 0; + unsigned int num_sizes_ =0; + + public: + hipPerfMemset() { + num_elements_ = sizeof(eleNumList) / sizeof(unsigned int); + _numSubTests = num_elements_ * hipMemsetTypeMax; + + num_sizes_ = sizeof(sizeList) / sizeof(unsigned int); + _numSubTests2D = num_sizes_; + _numSubTests3D = _numSubTests2D; + }; + + ~hipPerfMemset() {}; + + void open(int deviceID); + + template + void run1D(unsigned int test, T memsetval, enum MemsetType type, bool async); + + template + void run2D(unsigned int test, T memsetval, enum MemsetType type, bool async); + + template + void run3D(unsigned int test, T memsetval, enum MemsetType type, bool async); + + uint getNumTests() { + return _numSubTests; + } + + uint getNumTests2D() { + return _numSubTests2D; + } + uint getNumTests3D() { + return _numSubTests3D; + } +}; + + +void hipPerfMemset::open(int deviceId) { + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + failed("No GPU!"); + } + + HIPCHECK(hipSetDevice(deviceId)); + hipDeviceProp_t props = {0}; + HIPCHECK(hipGetDeviceProperties(&props, deviceId)); + std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name + << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId + << std::endl; +} + +template +void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type, bool async) { + + T * A_h; + T * A_d; + + testNumEle_ = eleNumList[test % num_elements_]; + + bufSize_ = testNumEle_ * sizeof(uint32_t); + + HIPCHECK(hipMalloc(&A_d, bufSize_)); + + A_h = reinterpret_cast (malloc(bufSize_)); + + hipStream_t stream; + HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + + // Warm-up + if (async) { + HIPCHECK(hipMemsetAsync((void *)A_d, memsetval, bufSize_, stream)); + HIPCHECK(hipStreamSynchronize(stream)); + } else { + HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_)); + HIPCHECK(hipDeviceSynchronize()); + } + + auto start = chrono::high_resolution_clock::now(); + for (uint i = 0; i < NUM_ITER; i++) { + if (type == hipMemsetTypeDefault && !async) { + HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_)); + } + else if (type == hipMemsetTypeDefault && async) { + HIPCHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream)); + } + else if (type == hipMemsetTypeD8 && !async){ + HIPCHECK(hipMemsetD8((hipDeviceptr_t)A_d, memsetval, bufSize_)); + } + else if (type == hipMemsetTypeD8 && async) { + HIPCHECK(hipMemsetD8Async((hipDeviceptr_t)A_d, memsetval, bufSize_, stream)); + } + else if (type == hipMemsetTypeD16 && !async) { + HIPCHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); + } + else if (type == hipMemsetTypeD16 && async) { + HIPCHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream)); + } + else if (type == hipMemsetTypeD32 && !async) { + HIPCHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); + } + else if (type == hipMemsetTypeD32 && async) { + HIPCHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream)); + } + } + if (async) { + HIPCHECK(hipStreamSynchronize(stream)); + } else { + HIPCHECK(hipDeviceSynchronize()); + } + + auto end = chrono::high_resolution_clock::now(); + + HIPCHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost) ); + + for (int i = 0; i < bufSize_ / sizeof(T); i++) { + if (A_h[i] != memsetval) { + cout << "mismatch at index " << i << " computed: " << static_cast (A_h[i]) + << ", memsetval: " << static_cast (memsetval) << endl; + break; + } + } + + HIPCHECK(hipFree(A_d)); + free(A_h); + + auto diff = std::chrono::duration(end - start); + auto sec = diff.count(); + + auto perf = static_cast((bufSize_ * NUM_ITER * (double)(1e-09)) / sec); + + cout << "[" << setw(2) << test << "] " << setw(5) << bufSize_/1024 << " Kb " << setw(4) + << " typeSize " << (int)sizeof(T) << " : " << setw(7) << perf << " GB/s " << endl; +} + +template +void hipPerfMemset::run2D(unsigned int test, T memsetval, enum MemsetType type, bool async) { + + bufSize_ = sizeList[test % num_sizes_]; + + size_t numH = bufSize_; + size_t numW = bufSize_; + size_t pitch_A; + size_t width = numW * sizeof(char); + size_t sizeElements = width * numH; + size_t elements = numW* numH; + + T * A_h; + T * A_d; + + HIPCHECK(hipMallocPitch(reinterpret_cast(&A_d), &pitch_A, width , + numH)); + A_h = reinterpret_cast(malloc(sizeElements)); + + for (size_t i=0; i < elements; i++) { + A_h[i] = 1; + } + + hipStream_t stream; + HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + + // Warm-up + if (async) { + HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream)); + HIPCHECK(hipStreamSynchronize(stream)); + } else { + HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH)); + HIPCHECK(hipDeviceSynchronize()); + } + + auto start = chrono::steady_clock::now(); + + for (uint i = 0; i < NUM_ITER; i++) { + if (type == hipMemsetTypeDefault && !async) { + HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH)); + } + else if (type == hipMemsetTypeDefault && async) { + HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream)); + } + } + + if (async) { + HIPCHECK(hipStreamSynchronize(stream)); + } else { + HIPCHECK(hipDeviceSynchronize()); + } + + auto end = chrono::steady_clock::now(); + + HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, + hipMemcpyDeviceToHost)); + + for (int i=0; i < elements; i++) { + if (A_h[i] != memsetval) { + cout << "mismatch at index " << i << " computed: " << static_cast (A_h[i]) + << ", memsetval: " << static_cast (memsetval) << endl; + break; + } + } + + chrono::duration diff = end - start; + + auto sec = diff.count(); + + auto perf = static_cast((sizeElements* NUM_ITER * (double)(1e-09)) / sec); + + cout << " hipPerf2DMemset" << (async ? "Async" : " ") << "[" << test << "] " + << " " << "(GB/s) for " << setw(5) << bufSize_ + << " x " << setw(5) << bufSize_ << " bytes : " << setw(7) << perf << endl; + + HIPCHECK(hipStreamDestroy(stream)); + HIPCHECK(hipFree(A_d)); + free(A_h); +} + +template +void hipPerfMemset::run3D(unsigned int test, T memsetval, enum MemsetType type, bool async) { + + bufSize_ = sizeList[test % num_sizes_]; + + size_t numH = bufSize_; + size_t numW = bufSize_; + size_t depth = 10; + size_t width = numW * sizeof(char); + size_t sizeElements = width * numH * depth; + size_t elements = numW* numH* depth; + + hipStream_t stream; + HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + + T *A_h; + + hipExtent extent = make_hipExtent(width, numH, depth); + hipPitchedPtr devPitchedPtr; + + HIPCHECK(hipMalloc3D(&devPitchedPtr, extent)); + A_h = (char*)malloc(sizeElements); + HIPASSERT(A_h != NULL); + + for (size_t i=0; i (A_h[i]) + << ", memsetval: " << static_cast (memsetval) << endl; + break; + } + } + + chrono::duration diff = end - start; + + auto sec = diff.count(); + + auto perf = static_cast((sizeElements * NUM_ITER * (double)(1e-09)) / sec); + + cout << " hipPerf3DMemset" << (async ? "Async" : " ") << "[" << test << "] " << " " + << "(GB/s) for " << setw(5) << bufSize_ << " x " << setw(5) + << bufSize_ << " x " << depth << " bytes : " << setw(7) << perf << endl; + HIPCHECK(hipFree(devPitchedPtr.ptr)); + free(A_h); +} + +int main() { + hipPerfMemset hipPerfMemset; + + dataType pattern; + int deviceId = 0; + hipPerfMemset.open(deviceId); + MemsetType type; + + int numTests = hipPerfMemset.getNumTests(); + int numTests2D = hipPerfMemset.getNumTests2D(); + int numTests3D = hipPerfMemset.getNumTests3D(); + + + cout << "--------------------- 1D buffer -------------------" << endl; + bool async= false; + for (uint i = 0; i < 2 ; i++) { + cout << endl; + + for (auto testCase = 0; testCase < numTests; testCase++) { + if (testCase < sizeof(eleNumList) / sizeof(uint32_t)) { + cout << "API: hipMemsetD8" << (async ? "Async " : " "); + hipPerfMemset.run1D(testCase, pattern.memsetval, hipMemsetTypeD8, async); + } + + else if (testCase < 2 * sizeof(eleNumList) / sizeof(uint32_t)) { + cout << "API: hipMemsetD16" << (async ? "Async" : " "); + hipPerfMemset.run1D(testCase,pattern.memsetD16val, hipMemsetTypeD16, async); + } + + else if (testCase < 3 * sizeof(eleNumList) / sizeof(uint32_t)) { + cout << "API: hipMemsetD32" << (async ? "Async" : " "); + hipPerfMemset.run1D(testCase,pattern.memsetD32val, hipMemsetTypeD32, async); + } + + else { + cout << "API: hipMemset" << (async ? "Async " : " "); + hipPerfMemset.run1D(testCase,pattern.memsetval, hipMemsetTypeDefault, async); + } + } + async = true; + } + + cout << endl; + cout << "------------------ 2D buffer arrays ---------------" << endl; + + async = false; + for (uint i = 0; i < 2; i++) { + cout << endl; + for (uint test = 0; test < numTests2D; test++) { + hipPerfMemset.run2D(test, pattern.memsetval, hipMemsetTypeDefault, async); + } + async = true; + } + + cout << endl; + cout << "------------------ 3D buffer arrays ---------------" << endl; + + async = false; + for (uint i = 0; i < 2; i++) { + cout << endl; + for (uint test =0; test < numTests3D; test++) { + hipPerfMemset.run3D(test, pattern.memsetval, hipMemsetTypeDefault, async); + } + async = true; + } + + passed(); +}