From 3f5ddfa0ab62b2e8efec34cf5f9d68fb3a0231d4 Mon Sep 17 00:00:00 2001 From: Giovanni LB Date: Sun, 15 Oct 2023 01:57:53 -0300 Subject: [PATCH] Initial commit for counter correctness tests. Change-Id: I74caa4ab89bd765e59686cfbaaf1ce44ab10fe19 - adds counter correctness tests for GRBM, SQWAVES and SQ INSTRUCTIONs - pandas dependency - made it compatible with test infra. Change-Id: I74caa4ab89bd765e59686cfbaaf1ce44ab10fe19 --- tests-v2/featuretests/profiler/CMakeLists.txt | 44 +++ .../profiler/apps/goldentraces/pmc.txt | 1 + .../featuretests/profiler/apps/histogram.cpp | 281 ++++++++++++++++++ .../featuretests/profiler/apps/histogram.hpp | 181 +++++++++++ .../featuretests/profiler/apps/transpose.cpp | 127 ++++++++ .../featuretests/profiler/apps/vectoradd.cpp | 216 ++++++++++++++ .../profiler/counter_correctness_tests.cmake | 62 ++++ .../featuretests/profiler/test_histogram.py | 73 +++++ .../featuretests/profiler/test_vectoradd.py | 114 +++++++ 9 files changed, 1099 insertions(+) create mode 100644 tests-v2/featuretests/profiler/apps/goldentraces/pmc.txt create mode 100644 tests-v2/featuretests/profiler/apps/histogram.cpp create mode 100644 tests-v2/featuretests/profiler/apps/histogram.hpp create mode 100644 tests-v2/featuretests/profiler/apps/transpose.cpp create mode 100644 tests-v2/featuretests/profiler/apps/vectoradd.cpp create mode 100644 tests-v2/featuretests/profiler/counter_correctness_tests.cmake create mode 100644 tests-v2/featuretests/profiler/test_histogram.py create mode 100644 tests-v2/featuretests/profiler/test_vectoradd.py diff --git a/tests-v2/featuretests/profiler/CMakeLists.txt b/tests-v2/featuretests/profiler/CMakeLists.txt index 4296ccf3e9..f9f3855a04 100644 --- a/tests-v2/featuretests/profiler/CMakeLists.txt +++ b/tests-v2/featuretests/profiler/CMakeLists.txt @@ -172,6 +172,49 @@ endif() # COMPONENT tests) # endif() +# pmc correctness vectoradd +set_source_files_properties(apps/vectoradd.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +rocprofiler_featuretests_profiler_add_executable(pmc_vectoradd apps/vectoradd.cpp) +set_target_properties( + pmc_vectoradd PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps") +target_link_options(pmc_vectoradd PRIVATE "-Wl,--build-id=md5") +install( + TARGETS pmc_vectoradd + RUNTIME + DESTINATION + ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps + COMPONENT tests) + +# pmc correctness hstogram +set_source_files_properties(apps/histogram.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +rocprofiler_featuretests_profiler_add_executable(pmc_histogram apps/histogram.cpp) +set_target_properties( + pmc_histogram PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps") +target_link_options(pmc_histogram PRIVATE "-Wl,--build-id=md5") +install( + TARGETS pmc_histogram + RUNTIME + DESTINATION + ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps + COMPONENT tests) + +# pmc correctness transpose +set_source_files_properties(apps/transpose.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +rocprofiler_featuretests_profiler_add_executable(pmc_transpose apps/transpose.cpp) +set_target_properties( + pmc_transpose PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps") +target_link_options(pmc_transpose PRIVATE "-Wl,--build-id=md5") +install( + TARGETS pmc_transpose + RUNTIME + DESTINATION + ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps + COMPONENT tests) + + # hsa-mem_async_copy -- Not Enabled for Now set_source_files_properties(apps/async_mem_copy.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) @@ -381,4 +424,5 @@ find_package( else() # cmake based tests include(${CMAKE_CURRENT_LIST_DIR}/counter_validation_tests.cmake) + include(${CMAKE_CURRENT_LIST_DIR}/counter_correctness_tests.cmake) endif() diff --git a/tests-v2/featuretests/profiler/apps/goldentraces/pmc.txt b/tests-v2/featuretests/profiler/apps/goldentraces/pmc.txt new file mode 100644 index 0000000000..22e01acf81 --- /dev/null +++ b/tests-v2/featuretests/profiler/apps/goldentraces/pmc.txt @@ -0,0 +1 @@ +pmc: GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_INSTS_LDS SQ_INSTS_VALU SQ_INSTS_SALU SQ_INSTS_SMEM L2CacheHit TA_BUSY_max MemUnitBusy diff --git a/tests-v2/featuretests/profiler/apps/histogram.cpp b/tests-v2/featuretests/profiler/apps/histogram.cpp new file mode 100644 index 0000000000..e00b8b390a --- /dev/null +++ b/tests-v2/featuretests/profiler/apps/histogram.cpp @@ -0,0 +1,281 @@ +/********************************************************************** +Copyright �2015 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +� Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +� Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#include +#include "hip/hip_runtime.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "histogram.hpp" + +#define LINEAR_MEM_ACCESS + +#define BIN_SIZE 256 +#define SDK_SUCCESS 0 +#define SDK_FAILURE 1 +#define CHECK_ALLOCATION(x, msg) if(!(x)) { std::cout << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; } + + +/** + * @brief Calculates block-histogram bin whose bin size is 256 + * @param data input data pointer + * @param sharedArray shared array for thread-histogram bins + * @param binResult block-histogram array + */ + +__global__ +void histogram256( + unsigned int* data, + unsigned int* binResult) +{ + HIP_DYNAMIC_SHARED(unsigned char, sharedArray); + size_t localId = hipThreadIdx_x; + size_t globalId = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x; + size_t groupId = hipBlockIdx_x; + size_t groupSize = hipBlockDim_x; + int offSet1 = localId & 31; + int offSet2 = 4 * offSet1; //which element to access in one bank. + int offSet3 = localId >> 5; //bank number + /* initialize shared array to zero */ + uchar4 * input = (uchar4*)sharedArray; + for(int i = 0; i < 64; ++i) + input[groupSize * i + localId] = make_uchar4(0,0,0,0); + + __syncthreads(); + + + /* calculate thread-histograms */ + //128 accumulations per thread + for(int i = 0; i < 128; i++) + { +#ifdef LINEAR_MEM_ACCESS + uint value = data[groupId * (groupSize * (BIN_SIZE/2)) + i * groupSize + localId]; +#else + uint value = data[globalId + i*4096]; + +#endif // LINEAR_MEM_ACCESS + sharedArray[value * 128 + offSet2 + offSet3]++; + } + __syncthreads(); + + /* merge all thread-histograms into block-histogram */ + + uint4 binCount; + uint result; + uchar4 binVal; //Introduced uint4 for summation to avoid overflows + uint4 binValAsUint; + for(int i = 0; i < BIN_SIZE / groupSize; ++i) + { + int passNumber = BIN_SIZE / 2 * 32 * i + localId * 32 ; + binCount = make_uint4(0,0,0,0); + result= 0; + for(int j = 0; j < 32; ++j) + { + int bankNum = (j + offSet1) & 31; // this is bank number + binVal = input[passNumber +bankNum]; + + binValAsUint.x = (unsigned int)binVal.x; + binValAsUint.y = (unsigned int)binVal.y; + binValAsUint.z = (unsigned int)binVal.z; + binValAsUint.w = (unsigned int)binVal.w; + + binCount.x += binValAsUint.x; + binCount.y += binValAsUint.y; + binCount.z += binValAsUint.z; + binCount.w += binValAsUint.w; + + } + result = binCount.x + binCount.y + binCount.z + binCount.w; + binResult[groupId * BIN_SIZE + groupSize * i + localId ] = result; + } +} + +int +Histogram::calculateHostBin() +{ + for(int i = 0; i < height; ++i) + { + for(int j = 0; j < width; ++j) + { + hostBin[data[i * width + j]]++; + } + } + + return SDK_SUCCESS; +} + + +int +Histogram::setupHistogram() +{ + int i = 0; + + data = (unsigned int *)malloc(sizeof(unsigned int) * width * height); + + for(i = 0; i < width * height; i++) + { + data[i] = rand() % (unsigned int)(binSize); + } + + hostBin = (unsigned int*)malloc(binSize * sizeof(unsigned int)); + CHECK_ALLOCATION(hostBin, "Failed to allocate host memory. (hostBin)"); + + memset(hostBin, 0, binSize * sizeof(unsigned int)); + + deviceBin = (unsigned int*)malloc(binSize * sizeof(unsigned int)); + CHECK_ALLOCATION(deviceBin, "Failed to allocate host memory. (deviceBin)"); + midDeviceBin = (unsigned int*)malloc(sizeof(unsigned int) * binSize * subHistgCnt); + + memset(deviceBin, 0, binSize * sizeof(unsigned int)); + return SDK_SUCCESS; +} + +int +Histogram::setupHIP(void) +{ + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + cout << " System minor " << devProp.minor << endl; + cout << " System major " << devProp.major << endl; + cout << " agent prop name " << devProp.name << endl; + + return SDK_SUCCESS; +} + + +int +Histogram::runKernels(void) +{ + groupSize = 128; + globalThreads = (width * height) / (GROUP_ITERATIONS); + + localThreads = groupSize; + + + hipHostMalloc((void**)&dataBuf,sizeof(unsigned int) * width * height, hipHostMallocDefault); + unsigned int *din; + hipHostGetDevicePointer((void**)&din, dataBuf,0); + hipMemcpy(din, data,sizeof(unsigned int) * width * height, hipMemcpyHostToDevice); + + subHistgCnt = (width * height) / (groupSize * groupIterations); + + hipHostMalloc((void**)&midDeviceBinBuf,sizeof(unsigned int) * binSize * subHistgCnt, hipHostMallocDefault); + + hipLaunchKernelGGL(histogram256, + dim3(globalThreads/localThreads), + dim3(localThreads), + groupSize * binSize * sizeof(unsigned char), 0, + dataBuf ,midDeviceBinBuf); + + hipDeviceSynchronize(); + + hipMemcpy(midDeviceBin, midDeviceBinBuf,sizeof(unsigned int) * binSize * subHistgCnt, hipMemcpyDeviceToHost); + //printArray("midDeviceBin", midDeviceBin, sizeof(unsigned int) * binSize * subHistgCnt, 1); + // Clear deviceBin array + memset(deviceBin, 0, binSize * sizeof(unsigned int)); + + // Calculate final histogram bin + for(int i = 0; i < subHistgCnt; ++i) + { + for(int j = 0; j < binSize; ++j) + { + deviceBin[j] += midDeviceBin[i * binSize + j]; + } + } + + return SDK_SUCCESS; +} + +int +Histogram::setup() +{ + if(iterations < 1) + { + std::cout<<"Error, iterations cannot be 0 or negative. Exiting..\n"; + exit(0); + } + int status = 0; + + /* width must be multiples of binSize and + * height must be multiples of groupSize + */ + width = (width / binSize ? width / binSize: 1) * binSize; + height = (height / groupSize ? height / groupSize: 1) * groupSize; + + status = setupHIP(); + if(status != SDK_SUCCESS) + return status; + + status = setupHistogram(); + if(status != SDK_SUCCESS) + return status; + + return SDK_SUCCESS; +} + + +int Histogram::run() +{ + for(int i = 0; i < 2 && iterations != 1; i++) + if(runKernels() != SDK_SUCCESS) + return SDK_FAILURE; + + for(int i = 0; i < iterations; i++) + if(runKernels() != SDK_SUCCESS) + return SDK_FAILURE; + + return SDK_SUCCESS; +} + +int Histogram::cleanup() +{ + hipFree(dataBuf); + hipFree(midDeviceBinBuf); + + free(hostBin); + free(deviceBin); + + return SDK_SUCCESS; +} + +int +main(int argc, char * argv[]) +{ + int status = 0; + // Create MonteCalroAsian object + Histogram hipHistogram; + + // Setup + status = hipHistogram.setup(); + if(status != SDK_SUCCESS) + return status; + + // Run + if(hipHistogram.run() != SDK_SUCCESS) + return SDK_FAILURE; + + // Cleanup resources created + if(hipHistogram.cleanup() != SDK_SUCCESS) + return SDK_FAILURE; + + return SDK_SUCCESS; +} diff --git a/tests-v2/featuretests/profiler/apps/histogram.hpp b/tests-v2/featuretests/profiler/apps/histogram.hpp new file mode 100644 index 0000000000..302d1516ee --- /dev/null +++ b/tests-v2/featuretests/profiler/apps/histogram.hpp @@ -0,0 +1,181 @@ +/********************************************************************** +Copyright �2015 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +� Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +� Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + + +#ifndef HISTOGRAM_H_ +#define HISTOGRAM_H_ + + +#include +#include +#include +#include +#include + +using namespace std; + +#define SAMPLE_VERSION "HIP-Examples-Applications-v1.0" +#define WIDTH 1024 +#define HEIGHT 1024 +#define BIN_SIZE 256 +#define GROUP_SIZE 128 +#define GROUP_ITERATIONS (BIN_SIZE / 2)//This is done to avoid overflow in the kernel +#define SUB_HISTOGRAM_COUNT ((WIDTH * HEIGHT) /(GROUP_SIZE * GROUP_ITERATIONS)) + + +#ifndef __global__ +#define __global__ +#endif + +#ifndef HIP_DYNAMIC_SHARED +#define HIP_DYNAMIC_SHARED(t,x) t* x +#endif + +#ifndef hipThreadIdx_x +#define hipThreadIdx_x 0 +#endif + +#ifndef hipThreadIdx_y +#define hipThreadIdx_y 0 +#endif + +#ifndef hipThreadIdx_z +#define hipThreadIdx_z 0 +#endif + +#ifndef hipBlockIdx_x +#define hipBlockIdx_x 0 +#endif + +#ifndef hipBlockDim_x +#define hipBlockDim_x 0 +#endif + +#ifndef __syncthreads +#define __syncthreads() +#endif + +/** +* Histogram +* Class implements 256 Histogram bin implementation + +*/ + +class Histogram +{ + + int binSize; /**< Size of Histogram bin */ + int groupSize; /**< Number of threads in group */ + int subHistgCnt; /**< Sub histogram count */ + unsigned int *data; /**< input data initialized with normalized(0 - binSize) random values */ + int width; /**< width of the input */ + int height; /**< height of the input */ + unsigned int *hostBin; /**< Host result for histogram bin */ + unsigned int *midDeviceBin; /**< Intermittent sub-histogram bins */ + unsigned int *deviceBin; /**< Device result for histogram bin */ + + unsigned long totalLocalMemory; /**< Max local memory allowed */ + unsigned long usedLocalMemory; /**< Used local memory by kernel */ + + unsigned int* dataBuf; /**< CL memory buffer for data */ + unsigned int* midDeviceBinBuf; /**< CL memory buffer for intermittent device bin */ + + int iterations; /**< Number of iterations for kernel execution */ + unsigned int globalThreads; + unsigned int localThreads ; + int groupIterations; + + public: + + /** + * Constructor + * Initialize member variables + * @param name name of sample (string) + */ + Histogram() + : + binSize(BIN_SIZE), + groupSize(GROUP_SIZE), + subHistgCnt(SUB_HISTOGRAM_COUNT), + groupIterations(GROUP_ITERATIONS), + data(NULL), + hostBin(NULL), + midDeviceBin(NULL), + deviceBin(NULL), + iterations(1) + { + /* Set default values for width and height */ + width = WIDTH; + height = HEIGHT; + } + + + ~Histogram() + { + } + + /** + * Allocate and initialize required host memory with appropriate values + * @return SDK_SUCCESS on success and SDK_FAILURE on failure + */ + int setupHistogram(); + + /** + * HIP related initialisations. + * Set up Context, Device list, Command Queue, Memory buffers + * Build HIP kernel program executable + * @return SDK_SUCCESS on success and SDK_FAILURE on failure + */ + int setupHIP(); + + /** + * Set values for kernels' arguments, enqueue calls to the kernels + * on to the command queue, wait till end of kernel execution. + * Get kernel start and end time if timing is enabled + * @return SDK_SUCCESS on success and SDK_FAILURE on failure + */ + int runKernels(); + + + /** + * Override from SDKSample, adjust width and height + * of execution domain, perform all sample setup + * @return SDK_SUCCESS on success and SDK_FAILURE on failure + */ + int setup(); + + /** + * Override from SDKSample + * Run HIP Black-Scholes + * @return SDK_SUCCESS on success and SDK_FAILURE on failure + */ + int run(); + + /** + * Override from SDKSample + * Cleanup memory allocations + * @return SDK_SUCCESS on success and SDK_FAILURE on failure + */ + int cleanup(); + + private: + + /** + * Calculate histogram bin on host + */ + int calculateHostBin(); +}; +#endif diff --git a/tests-v2/featuretests/profiler/apps/transpose.cpp b/tests-v2/featuretests/profiler/apps/transpose.cpp new file mode 100644 index 0000000000..5824d79a38 --- /dev/null +++ b/tests-v2/featuretests/profiler/apps/transpose.cpp @@ -0,0 +1,127 @@ +/********************************************************************** +Copyright ©2023 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +• Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +• Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#include +#include +#include +#include +#include +#include + +#define TILE_DIM 16 +#define BLOCK_ROWS 16 + +__global__ void transposeNaive(float *odata, float *idata, int width, int height) { + int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; + + int index_in = xIndex + width * yIndex; + int index_out = yIndex + height * xIndex; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + odata[index_out + i] = idata[index_in + i * width]; + } +} + +__global__ void transposeCoalesced(float *odata, float *idata, int width, int height) { + __shared__ float tile[TILE_DIM][TILE_DIM]; + + int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; + int index_in = xIndex + (yIndex)*width; + + xIndex = blockIdx.y * TILE_DIM + threadIdx.x; + yIndex = blockIdx.x * TILE_DIM + threadIdx.y; + int index_out = xIndex + (yIndex)*height; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width]; + } + + __syncthreads(); + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; + } +} + +__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height) { + __shared__ float tile[TILE_DIM][TILE_DIM + 1]; + + int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; + int index_in = xIndex + (yIndex)*width; + + xIndex = blockIdx.y * TILE_DIM + threadIdx.x; + yIndex = blockIdx.x * TILE_DIM + threadIdx.y; + int index_out = xIndex + (yIndex)*height; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width]; + } + + __syncthreads(); + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; + } +} + +/** + * fillRandom + * fill array with random values + */ +__global__ void fillRandom(float* arrayPtr, int size) { + int id = threadIdx.x + blockDim.x * blockIdx.x; + uint32_t seed = 12345678; + while (id < size) { + seed = (seed<<3) ^ id; + arrayPtr[id] = seed; + id += blockDim.x * gridDim.x; + } +} + +int main(int argc, char * argv[]) +{ + float* input = nullptr; + float* output = nullptr; + size_t width = 1024; + size_t height = 1024; + + // Set input data to matrix A and matrix B + hipMalloc((void**)&input, width * height * sizeof(float)); + hipMalloc((void**)&output, width * height * sizeof(float)); + + fillRandom<<<256, 256>>>(input, width*height); + hipDeviceSynchronize(); + + dim3 block(width/TILE_DIM,height/TILE_DIM); + dim3 threads(TILE_DIM,TILE_DIM); + + transposeNoBankConflicts<<>>( + output, + input, + width, + height + ); + + hipDeviceSynchronize(); + + hipFree(input); + hipFree(output); + + return 0; +} \ No newline at end of file diff --git a/tests-v2/featuretests/profiler/apps/vectoradd.cpp b/tests-v2/featuretests/profiler/apps/vectoradd.cpp new file mode 100644 index 0000000000..aa40ac4d81 --- /dev/null +++ b/tests-v2/featuretests/profiler/apps/vectoradd.cpp @@ -0,0 +1,216 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include +#include +#include +#include +#include "hip/hip_runtime.h" +#include +#include + +#ifdef NDEBUG +#define HIP_ASSERT(x) x +#else +#define HIP_ASSERT(x) (assert((x)==hipSuccess)) +#endif + + +#define WIDTH (1024) +#define HEIGHT (1024) + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 64 +#define THREADS_PER_BLOCK_Y 1 +#define THREADS_PER_BLOCK_Z 1 + +// Computes vectorAdd with matrix-multiply +template +__global__ void addition_kernel( + T* __restrict__ a, + const float* __restrict__ b, + const float* __restrict__ c, + int width, + int height +) { + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x >= WIDTH || y >= HEIGHT) return; + int index = y * width + x; + + a[index] = b[index]+c[index]; +} + + +__global__ void subtract_kernel( + float* __restrict__ a, + const float* __restrict__ b, + const float* __restrict__ c, + int width, + int height +) { + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x >= WIDTH || y >= HEIGHT) return; + int index = y * width + x; + + a[index] = abs(b[index]-c[index]); +} + +__global__ void multiply_kernel( + float* __restrict__ a, + const float* __restrict__ b, + const float* __restrict__ c, + int width, + int height +) { + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x >= WIDTH || y >= HEIGHT) return; + int index = y * width + x; + + a[index] = (b[index]-1)*(c[index]-1)+1; +} + +__global__ void divide_kernel( + float* __restrict__ a, + const float* __restrict__ b, + const float* __restrict__ c, + int width, + int height +) { + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x >= WIDTH || y >= HEIGHT) return; + int index = y * width + x; + + a[index] = (b[index]-c[index]) / abs(c[index]+b[index]) + 1; +} + +using namespace std; + +void run(int NUM_QUEUE) { + std::vector hostA(NUM_QUEUE); + std::vector hostB(NUM_QUEUE); + std::vector hostC(NUM_QUEUE); + + std::vector deviceA(NUM_QUEUE); + std::vector deviceB(NUM_QUEUE); + std::vector deviceC(NUM_QUEUE); + + std::vector streams(NUM_QUEUE); + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + cout << " System minor " << devProp.minor << endl; + cout << " System major " << devProp.major << endl; + cout << " agent prop name " << devProp.name << endl; + cout << "hip Device prop succeeded " << endl ; + + int i; + int errors; + + for (int q=0; q 0) # GPU must always be active + assert np.all(count >= active) # Count always increments more than active + assert np.all( + count * 0.5 < active + ) # We can reasonably expect an active GPU during the kernel execution + + +def test_insts(csvfile): + waves = np.array( + csvfile["SQ_WAVES"] + ) # TODO: 256 for wave32, need to check for wave64 + lds = np.array(csvfile["SQ_INSTS_LDS"]) + valu = np.array(csvfile["SQ_INSTS_VALU"]) + salu = np.array(csvfile["SQ_INSTS_SALU"]) + smem = np.array(csvfile["SQ_INSTS_SMEM"]) + + assert np.all(waves == 256) or np.all(waves == 128) + + # Each have executes at least one of these + assert np.all(lds > waves) + assert np.all(valu > waves) + assert np.all(salu > waves) + assert np.all(smem >= waves) + + +def test_sqcycles(csvfile): + tabusy = np.array(csvfile["TA_BUSY_max"]) + grbm = np.array(csvfile["GRBM_GUI_ACTIVE"]) + waves = np.array(csvfile["SQ_WAVES"]) + wait_any = np.array(csvfile["SQ_WAIT_ANY"]) + wave_cycles = np.array(csvfile["SQ_WAVE_CYCLES"]) + vmem_cycles = np.array(csvfile["SQ_INST_CYCLES_VMEM"]) + + lds = np.array(csvfile["SQ_INSTS_LDS"]) + valu = np.array(csvfile["SQ_INSTS_VALU"]) + salu = np.array(csvfile["SQ_INSTS_SALU"]) + smem = np.array(csvfile["SQ_INSTS_SMEM"]) + + ALU = lds + valu + salu + smem + + assert np.all( + wave_cycles >= ALU + wait_any + ) # Each ALU inst takes at least one cycle + assert np.all(wave_cycles / grbm <= waves) # Mean occupancy cannot exceed waves + assert np.all(wait_any >= tabusy) # Waves are waiting for ta + assert np.all( + vmem_cycles >= waves + ) # Each wave takes at least one cycle to issue vmem + + +if __name__ == "__main__": + csv = pandas.read_csv(f"{output_folder}/{expected_filename}") + test_grbm(csv) + test_insts(csv) + # test_sqcycles(csv) + + # if its reached this point, then all tests apssed + print("Test Passed: All counter correctness tests passed.") diff --git a/tests-v2/featuretests/profiler/test_vectoradd.py b/tests-v2/featuretests/profiler/test_vectoradd.py new file mode 100644 index 0000000000..78f95f1258 --- /dev/null +++ b/tests-v2/featuretests/profiler/test_vectoradd.py @@ -0,0 +1,114 @@ +import numpy as np +import pandas +import os +import glob + +current_dir = os.getcwd() +rocprof = "rocprofv2" + +expected_filename = "pmc_1/results_vadd.csv" +output_folder = "/tmp/tests-v2/pmc" + + +def test_grbm(csvfile): + count = np.array(csvfile["GRBM_COUNT"]) + active = np.array(csvfile["GRBM_GUI_ACTIVE"]) + assert np.all(active > 0) # GPU must always be active + assert np.all(count >= active) # Count always increments more than active + assert np.all( + count * 0.8 < active + ) # We can reasonably expect an active GPU during the kernel execution + + +def test_sqwaves(csvfile): + waves = np.array( + csvfile["SQ_WAVES"] + ) # 1M threads == 32k waves for Wave32 and 16k waves for Wave64 + assert np.all(waves == 32768) or np.all(waves == 16384) + + +def test_insts(csvfile): + waves = np.array(csvfile["SQ_WAVES"]) + valu = np.array(csvfile["SQ_INSTS_VALU"]) + salu = np.array(csvfile["SQ_INSTS_SALU"]) + smem = np.array(csvfile["SQ_INSTS_SMEM"]) + lds = np.array(csvfile["SQ_INSTS_LDS"]) + + assert np.all(lds == 0) # Not used on vectoradd + + # VALU, SALU and SMEM must be divisible by SQ_Waves + assert np.all(valu % waves == 0) + assert np.all(salu % waves == 0) + assert np.all(smem % waves == 0) + + # Each have executes at least one of these + assert np.all(valu > waves) + assert np.all(salu > waves) + assert np.all(smem >= waves) + + # TODO: Check assembly for exact number! + + +def test_gl2c(csvfile): + waves = np.array(csvfile["SQ_WAVES"]) + read = np.array(csvfile["GL2C_MC_RDREQ_sum"]) + write = np.array(csvfile["GL2C_MC_WRREQ_sum"]) + hit = np.array(csvfile["GL2C_HIT_sum"]) + miss = np.array(csvfile["GL2C_MISS_sum"]) + hitrate = np.array(csvfile["L2CacheHit"]) + + assert np.all(write >= waves) # We do at least one write per wave + # TODO: Find out why the first kernel gets such a high write request count. + assert np.all(write < 2.5 * waves) # We do only one write (+ a little) per wave. + assert np.all(read >= 2 * waves) # We do at least 2 reads per wave (A=B+C) + assert np.all(read < 3 * waves) # We do only 2 reads (+ a little) per wave + + assert np.all(miss >= hit) # on Vadd we can't have more hits than misses + assert np.all(miss >= 2 * waves) # Each read misses at least once + assert np.all(miss < 4 * waves) # Can't miss too much + assert np.all(hit >= 0.5 * waves) # We have at least one hit per wave + + assert np.all(hitrate <= 50) # We always get more misses than hits + + +def test_ta(csvfile): + busy = np.array(csvfile["MemUnitBusy"]) + some_busy = np.array(csvfile["TA_BUSY_max"]) / np.array(csvfile["GRBM_GUI_ACTIVE"]) + + assert np.all(busy <= 100) # MemUnitBusy <= 100% + assert np.all(some_busy >= 1) # Some shader engine is using TA + + +def test_sqcycles(csvfile): + tabusy = np.array(csvfile["TA_BUSY_max"]) + grbm = np.array(csvfile["GRBM_GUI_ACTIVE"]) + waves = np.array(csvfile["SQ_WAVES"]) + ALU = np.array(csvfile["SQ_INSTS_VALU"]) + np.array(csvfile["SQ_INSTS_SALU"]) + wait_any = np.array(csvfile["SQ_WAIT_ANY"]) + wave_cycles = np.array(csvfile["SQ_WAVE_CYCLES"]) + vmem_cycles = np.array(csvfile["SQ_INST_CYCLES_VMEM"]) + + assert np.all( + wave_cycles >= ALU + wait_any + ) # Each ALU inst takes at least one cycle + assert np.all(wave_cycles / grbm <= waves) # Mean occupancy cannot exceed waves + assert np.all(wait_any >= tabusy) # Waves are waiting for ta + assert np.all( + vmem_cycles >= waves + ) # Each wave takes at least one cycle to issue vmem + assert np.all( + wait_any / wave_cycles >= 0.5 + ) # vectorAdd is very memory-bound. TODO: use number less arbitrary than 0.5 + + +if __name__ == "__main__": + csv = pandas.read_csv(f"{output_folder}/{expected_filename}") + test_grbm(csv) + test_sqwaves(csv) + test_insts(csv) + # test_gl2c(csv) + # test_ta(csv) + # test_sqcycles(csv) + + # if its reached this point, then all tests apssed + print("Test Passed: All counter correctness tests passed.")