Initial commit for counter correctness tests.

Change-Id: I74caa4ab89bd765e59686cfbaaf1ce44ab10fe19 - adds counter correctness tests for GRBM, SQWAVES and SQ INSTRUCTIONs - pandas dependency - made it compatible with test infra. Change-Id: I74caa4ab89bd765e59686cfbaaf1ce44ab10fe19
2023-10-15 01:57:53 -03:00
parent c1884d70fc
commit 3f5ddfa0ab
9 changed files with 1099 additions and 0 deletions
@@ -172,6 +172,49 @@ endif()
 #             COMPONENT tests)
 # endif()

+# pmc correctness vectoradd
+set_source_files_properties(apps/vectoradd.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
+rocprofiler_featuretests_profiler_add_executable(pmc_vectoradd apps/vectoradd.cpp)
+set_target_properties(
+    pmc_vectoradd PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+                             "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps")
+target_link_options(pmc_vectoradd PRIVATE "-Wl,--build-id=md5")
+install(
+    TARGETS pmc_vectoradd
+    RUNTIME
+        DESTINATION
+            ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps
+        COMPONENT tests)
+
+# pmc correctness hstogram
+set_source_files_properties(apps/histogram.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
+rocprofiler_featuretests_profiler_add_executable(pmc_histogram apps/histogram.cpp)
+set_target_properties(
+    pmc_histogram PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+                             "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps")
+target_link_options(pmc_histogram PRIVATE "-Wl,--build-id=md5")
+install(
+    TARGETS pmc_histogram
+    RUNTIME
+        DESTINATION
+            ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps
+        COMPONENT tests)
+
+# pmc correctness transpose
+set_source_files_properties(apps/transpose.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
+rocprofiler_featuretests_profiler_add_executable(pmc_transpose apps/transpose.cpp)
+set_target_properties(
+    pmc_transpose PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+                             "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps")
+target_link_options(pmc_transpose PRIVATE "-Wl,--build-id=md5")
+install(
+    TARGETS pmc_transpose
+    RUNTIME
+        DESTINATION
+            ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps
+        COMPONENT tests)
+
+
 # hsa-mem_async_copy -- Not Enabled for Now
 set_source_files_properties(apps/async_mem_copy.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT
                                                               1)
@@ -381,4 +424,5 @@ find_package(
    else()
        # cmake based tests
        include(${CMAKE_CURRENT_LIST_DIR}/counter_validation_tests.cmake)
+        include(${CMAKE_CURRENT_LIST_DIR}/counter_correctness_tests.cmake)
    endif()
@@ -0,0 +1 @@
+pmc: GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_INSTS_LDS SQ_INSTS_VALU SQ_INSTS_SALU SQ_INSTS_SMEM L2CacheHit TA_BUSY_max MemUnitBusy
@@ -0,0 +1,281 @@
+/**********************************************************************
+Copyright �2015 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+�   Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+�   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#include <math.h>
+#include "hip/hip_runtime.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <algorithm>
+#include <stdlib.h>
+#include <iostream>
+#include <unistd.h>
+#include <vector>
+
+#include "histogram.hpp"
+
+#define LINEAR_MEM_ACCESS
+
+#define BIN_SIZE 256
+#define SDK_SUCCESS 0
+#define SDK_FAILURE 1
+#define CHECK_ALLOCATION(x, msg) if(!(x)) { std::cout << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; }
+
+
+/**
+ * @brief   Calculates block-histogram bin whose bin size is 256
+ * @param   data  input data pointer
+ * @param   sharedArray shared array for thread-histogram bins
+ * @param   binResult block-histogram array
+ */
+
+__global__
+void histogram256(
+                  unsigned int* data,
+                  unsigned int* binResult)
+{
+    HIP_DYNAMIC_SHARED(unsigned char, sharedArray);
+    size_t localId = hipThreadIdx_x;
+    size_t globalId = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x;
+    size_t groupId = hipBlockIdx_x;
+    size_t groupSize = hipBlockDim_x;
+    int offSet1 = localId & 31;    
+    int offSet2 = 4 * offSet1;      //which element to access in one bank.
+    int offSet3 = localId >> 5;     //bank number
+    /* initialize shared array to zero */
+    uchar4 * input = (uchar4*)sharedArray;
+    for(int i = 0; i < 64; ++i)
+        input[groupSize * i + localId] = make_uchar4(0,0,0,0);
+
+    __syncthreads();
+
+
+    /* calculate thread-histograms */
+	//128 accumulations per thread
+	for(int i = 0; i < 128; i++)
+    {
+#ifdef LINEAR_MEM_ACCESS
+       uint value =  data[groupId * (groupSize * (BIN_SIZE/2)) + i * groupSize + localId]; 
+#else
+       uint  value = data[globalId + i*4096];
+
+#endif // LINEAR_MEM_ACCESS
+	   sharedArray[value * 128 + offSet2 + offSet3]++;
+    }
+    __syncthreads();
+    
+    /* merge all thread-histograms into block-histogram */
+
+	uint4 binCount;
+	uint result;
+	uchar4 binVal;	            //Introduced uint4 for summation to avoid overflows
+	uint4 binValAsUint;
+	for(int i = 0; i < BIN_SIZE / groupSize; ++i)
+    {
+        int passNumber = BIN_SIZE / 2 * 32 * i +  localId * 32 ;
+		binCount = make_uint4(0,0,0,0);
+		result= 0;
+        for(int j = 0; j < 32; ++j)
+		{
+			int bankNum = (j + offSet1) & 31;   // this is bank number
+            binVal = input[passNumber  +bankNum];
+
+            binValAsUint.x = (unsigned int)binVal.x;
+            binValAsUint.y = (unsigned int)binVal.y;
+            binValAsUint.z = (unsigned int)binVal.z;
+            binValAsUint.w = (unsigned int)binVal.w;
+
+            binCount.x += binValAsUint.x;
+            binCount.y += binValAsUint.y;
+            binCount.z += binValAsUint.z;
+            binCount.w += binValAsUint.w;
+
+		}
+        result = binCount.x + binCount.y + binCount.z + binCount.w;
+        binResult[groupId * BIN_SIZE + groupSize * i + localId ] = result;
+	}
+}
+
+int
+Histogram::calculateHostBin()
+{
+    for(int i = 0; i < height; ++i)
+    {
+        for(int j = 0; j < width; ++j)
+        {
+            hostBin[data[i * width + j]]++;
+        }
+    }
+
+    return SDK_SUCCESS;
+}
+
+
+int
+Histogram::setupHistogram()
+{
+    int i = 0;
+
+    data = (unsigned int *)malloc(sizeof(unsigned int) * width * height);
+
+    for(i = 0; i < width * height; i++)
+    {
+        data[i] = rand() % (unsigned int)(binSize);
+    }
+
+    hostBin = (unsigned int*)malloc(binSize * sizeof(unsigned int));
+    CHECK_ALLOCATION(hostBin, "Failed to allocate host memory. (hostBin)");
+
+    memset(hostBin, 0, binSize * sizeof(unsigned int));
+
+    deviceBin = (unsigned int*)malloc(binSize * sizeof(unsigned int));
+    CHECK_ALLOCATION(deviceBin, "Failed to allocate host memory. (deviceBin)");
+    midDeviceBin = (unsigned int*)malloc(sizeof(unsigned int) * binSize * subHistgCnt);
+
+    memset(deviceBin, 0, binSize * sizeof(unsigned int));
+    return SDK_SUCCESS;
+}
+
+int
+Histogram::setupHIP(void)
+{
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);
+    cout << " System minor " << devProp.minor << endl;
+    cout << " System major " << devProp.major << endl;
+    cout << " agent prop name " << devProp.name << endl;
+
+    return SDK_SUCCESS;
+}
+
+
+int
+Histogram::runKernels(void)
+{
+    groupSize = 128;
+    globalThreads = (width * height) / (GROUP_ITERATIONS);
+
+    localThreads = groupSize;
+
+
+    hipHostMalloc((void**)&dataBuf,sizeof(unsigned int) * width * height, hipHostMallocDefault);
+    unsigned int *din;
+    hipHostGetDevicePointer((void**)&din, dataBuf,0);
+    hipMemcpy(din, data,sizeof(unsigned int) * width * height, hipMemcpyHostToDevice);
+
+    subHistgCnt = (width * height) / (groupSize * groupIterations);
+
+    hipHostMalloc((void**)&midDeviceBinBuf,sizeof(unsigned int) * binSize * subHistgCnt, hipHostMallocDefault);
+
+    hipLaunchKernelGGL(histogram256,
+                    dim3(globalThreads/localThreads),
+                    dim3(localThreads),
+                    groupSize * binSize * sizeof(unsigned char), 0,
+                    dataBuf ,midDeviceBinBuf);
+
+    hipDeviceSynchronize();
+
+    hipMemcpy(midDeviceBin, midDeviceBinBuf,sizeof(unsigned int) * binSize * subHistgCnt, hipMemcpyDeviceToHost);
+        //printArray<unsigned int>("midDeviceBin", midDeviceBin, sizeof(unsigned int) * binSize * subHistgCnt, 1);
+    // Clear deviceBin array
+    memset(deviceBin, 0, binSize * sizeof(unsigned int));
+
+    // Calculate final histogram bin
+    for(int i = 0; i < subHistgCnt; ++i)
+    {
+        for(int j = 0; j < binSize; ++j)
+        {
+            deviceBin[j] += midDeviceBin[i * binSize + j];
+        }
+    }
+
+    return SDK_SUCCESS;
+}
+
+int
+Histogram::setup()
+{
+    if(iterations < 1)
+    {
+        std::cout<<"Error, iterations cannot be 0 or negative. Exiting..\n";
+        exit(0);
+    }
+    int status = 0;
+
+    /* width must be multiples of binSize and
+     * height must be multiples of groupSize
+     */
+    width = (width / binSize ? width / binSize: 1) * binSize;
+    height = (height / groupSize ? height / groupSize: 1) * groupSize;
+
+    status = setupHIP();
+    if(status != SDK_SUCCESS)
+        return status;
+
+    status = setupHistogram();
+    if(status != SDK_SUCCESS)
+        return status;
+
+    return SDK_SUCCESS;
+}
+
+
+int Histogram::run()
+{
+    for(int i = 0; i < 2 && iterations != 1; i++)
+        if(runKernels() != SDK_SUCCESS)
+            return SDK_FAILURE;
+
+    for(int i = 0; i < iterations; i++)
+        if(runKernels() != SDK_SUCCESS)
+            return SDK_FAILURE;
+
+    return SDK_SUCCESS;
+}
+
+int Histogram::cleanup()
+{
+    hipFree(dataBuf);
+    hipFree(midDeviceBinBuf);
+
+    free(hostBin);
+    free(deviceBin);
+
+    return SDK_SUCCESS;
+}
+
+int
+main(int argc, char * argv[])
+{
+    int status = 0;
+    // Create MonteCalroAsian object
+    Histogram hipHistogram;
+
+    // Setup
+    status = hipHistogram.setup();
+    if(status != SDK_SUCCESS)
+        return status;
+
+    // Run
+    if(hipHistogram.run() != SDK_SUCCESS)
+        return SDK_FAILURE;
+
+    // Cleanup resources created
+    if(hipHistogram.cleanup() != SDK_SUCCESS)
+        return SDK_FAILURE;
+
+    return SDK_SUCCESS;
+}
@@ -0,0 +1,181 @@
+/**********************************************************************
+Copyright �2015 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+�   Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+�   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+
+#ifndef HISTOGRAM_H_
+#define HISTOGRAM_H_
+
+
+#include <hip/hip_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+using namespace std;
+
+#define SAMPLE_VERSION "HIP-Examples-Applications-v1.0"
+#define WIDTH 1024
+#define HEIGHT 1024
+#define BIN_SIZE 256
+#define GROUP_SIZE 128
+#define GROUP_ITERATIONS (BIN_SIZE / 2)//This is done to avoid overflow in the kernel
+#define SUB_HISTOGRAM_COUNT ((WIDTH * HEIGHT) /(GROUP_SIZE * GROUP_ITERATIONS))
+
+
+#ifndef __global__
+#define __global__
+#endif
+
+#ifndef HIP_DYNAMIC_SHARED
+#define HIP_DYNAMIC_SHARED(t,x) t* x
+#endif
+
+#ifndef hipThreadIdx_x
+#define hipThreadIdx_x 0
+#endif
+
+#ifndef hipThreadIdx_y
+#define hipThreadIdx_y 0
+#endif
+
+#ifndef hipThreadIdx_z
+#define hipThreadIdx_z 0
+#endif
+
+#ifndef hipBlockIdx_x
+#define hipBlockIdx_x 0
+#endif
+
+#ifndef hipBlockDim_x
+#define hipBlockDim_x 0
+#endif
+
+#ifndef __syncthreads
+#define __syncthreads()
+#endif
+
+/**
+* Histogram
+* Class implements 256 Histogram bin implementation
+
+*/
+
+class Histogram
+{
+
+        int binSize;             /**< Size of Histogram bin */
+        int groupSize;           /**< Number of threads in group */
+        int subHistgCnt;         /**< Sub histogram count */
+        unsigned int *data;              /**< input data initialized with normalized(0 - binSize) random values */
+        int width;               /**< width of the input */
+        int height;              /**< height of the input */
+        unsigned int *hostBin;           /**< Host result for histogram bin */
+        unsigned int *midDeviceBin;      /**< Intermittent sub-histogram bins */
+        unsigned int *deviceBin;         /**< Device result for histogram bin */
+
+        unsigned long totalLocalMemory;      /**< Max local memory allowed */
+        unsigned long usedLocalMemory;       /**< Used local memory by kernel */
+
+        unsigned int* dataBuf;                 /**< CL memory buffer for data */
+        unsigned int* midDeviceBinBuf;         /**< CL memory buffer for intermittent device bin */
+
+        int iterations;                     /**< Number of iterations for kernel execution */
+        unsigned int globalThreads;
+        unsigned int localThreads ;
+        int groupIterations;
+
+    public:
+
+        /**
+        * Constructor
+        * Initialize member variables
+        * @param name name of sample (string)
+        */
+        Histogram()
+            :
+            binSize(BIN_SIZE),
+            groupSize(GROUP_SIZE),
+            subHistgCnt(SUB_HISTOGRAM_COUNT),
+            groupIterations(GROUP_ITERATIONS),
+            data(NULL),
+            hostBin(NULL),
+            midDeviceBin(NULL),
+            deviceBin(NULL),
+            iterations(1)
+        {
+            /* Set default values for width and height */
+            width = WIDTH;
+            height = HEIGHT;
+        }
+
+
+        ~Histogram()
+        {
+        }
+
+        /**
+        * Allocate and initialize required host memory with appropriate values
+        * @return SDK_SUCCESS on success and SDK_FAILURE on failure
+        */
+        int setupHistogram();
+
+        /**
+        * HIP related initialisations.
+        * Set up Context, Device list, Command Queue, Memory buffers
+        * Build HIP kernel program executable
+        * @return SDK_SUCCESS on success and SDK_FAILURE on failure
+        */
+        int setupHIP();
+
+        /**
+        * Set values for kernels' arguments, enqueue calls to the kernels
+        * on to the command queue, wait till end of kernel execution.
+        * Get kernel start and end time if timing is enabled
+        * @return SDK_SUCCESS on success and SDK_FAILURE on failure
+        */
+        int runKernels();
+
+
+        /**
+        * Override from SDKSample, adjust width and height
+        * of execution domain, perform all sample setup
+        * @return SDK_SUCCESS on success and SDK_FAILURE on failure
+        */
+        int setup();
+
+        /**
+        * Override from SDKSample
+        * Run HIP Black-Scholes
+        * @return SDK_SUCCESS on success and SDK_FAILURE on failure
+        */
+        int run();
+
+        /**
+        * Override from SDKSample
+        * Cleanup memory allocations
+        * @return SDK_SUCCESS on success and SDK_FAILURE on failure
+        */
+        int cleanup();
+
+    private:
+
+        /**
+        *  Calculate histogram bin on host
+        */
+        int calculateHostBin();
+};
+#endif
@@ -0,0 +1,127 @@
+/**********************************************************************
+Copyright ©2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+   Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#include <hip/hip_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <iostream>
+
+#define TILE_DIM 16
+#define BLOCK_ROWS 16
+
+__global__ void transposeNaive(float *odata, float *idata, int width, int height) {
+  int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
+  int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
+
+  int index_in = xIndex + width * yIndex;
+  int index_out = yIndex + height * xIndex;
+
+  for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+    odata[index_out + i] = idata[index_in + i * width];
+  }
+}
+
+__global__ void transposeCoalesced(float *odata, float *idata, int width, int height) {
+  __shared__ float tile[TILE_DIM][TILE_DIM];
+
+  int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
+  int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
+  int index_in = xIndex + (yIndex)*width;
+
+  xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
+  yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
+  int index_out = xIndex + (yIndex)*height;
+
+  for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+    tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width];
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+    odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
+  }
+}
+
+__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height) {
+  __shared__ float tile[TILE_DIM][TILE_DIM + 1];
+
+  int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
+  int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
+  int index_in = xIndex + (yIndex)*width;
+
+  xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
+  yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
+  int index_out = xIndex + (yIndex)*height;
+
+  for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+    tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width];
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+    odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
+  }
+}
+
+/**
+ * fillRandom
+ * fill array with random values
+ */
+__global__ void fillRandom(float* arrayPtr, int size) {
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    uint32_t seed = 12345678;
+    while (id < size) {
+        seed = (seed<<3) ^ id;
+        arrayPtr[id] = seed;
+        id += blockDim.x * gridDim.x;
+    }
+}
+
+int main(int argc, char * argv[])
+{
+    float* input = nullptr;
+    float* output = nullptr;
+    size_t width = 1024;
+    size_t height = 1024;
+
+    // Set input data to matrix A and matrix B
+    hipMalloc((void**)&input, width * height * sizeof(float));
+    hipMalloc((void**)&output, width * height * sizeof(float));
+
+    fillRandom<<<256, 256>>>(input, width*height);
+    hipDeviceSynchronize();
+
+    dim3 block(width/TILE_DIM,height/TILE_DIM);
+    dim3 threads(TILE_DIM,TILE_DIM);
+
+    transposeNoBankConflicts<<<block, threads>>>(
+        output,
+        input,
+        width,
+        height
+    );
+
+    hipDeviceSynchronize();
+
+    hipFree(input);
+    hipFree(output);
+
+    return 0;
+}
@@ -0,0 +1,216 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include <assert.h>
+#include <stdio.h>
+#include <algorithm>
+#include <stdlib.h>
+#include <iostream>
+#include "hip/hip_runtime.h"
+#include <unistd.h>
+#include <vector>
+
+#ifdef NDEBUG
+#define HIP_ASSERT(x) x
+#else
+#define HIP_ASSERT(x) (assert((x)==hipSuccess))
+#endif
+
+
+#define WIDTH     (1024)
+#define HEIGHT    (1024)
+
+#define NUM       (WIDTH*HEIGHT)
+
+#define THREADS_PER_BLOCK_X  64
+#define THREADS_PER_BLOCK_Y  1
+#define THREADS_PER_BLOCK_Z  1
+
+// Computes vectorAdd with matrix-multiply
+template<typename T>
+__global__ void addition_kernel(
+    T* __restrict__ a,
+    const float* __restrict__ b,
+    const float* __restrict__ c,
+    int width,
+    int height
+) {
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+    if (x >= WIDTH || y >= HEIGHT) return;
+    int index = y * width + x;
+
+    a[index] = b[index]+c[index];
+}
+
+
+__global__ void subtract_kernel(
+    float* __restrict__ a,
+    const float* __restrict__ b,
+    const float* __restrict__ c,
+    int width,
+    int height
+) {
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+    if (x >= WIDTH || y >= HEIGHT) return;
+    int index = y * width + x;
+
+    a[index] = abs(b[index]-c[index]);
+}
+
+__global__ void multiply_kernel(
+    float* __restrict__ a,
+    const float* __restrict__ b,
+    const float* __restrict__ c,
+    int width,
+    int height
+) {
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+    if (x >= WIDTH || y >= HEIGHT) return;
+    int index = y * width + x;
+
+    a[index] = (b[index]-1)*(c[index]-1)+1;
+}
+
+__global__ void divide_kernel(
+    float* __restrict__ a,
+    const float* __restrict__ b,
+    const float* __restrict__ c,
+    int width,
+    int height
+) {
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+    if (x >= WIDTH || y >= HEIGHT) return;
+    int index = y * width + x;
+
+    a[index] = (b[index]-c[index]) / abs(c[index]+b[index]) + 1;
+}
+
+using namespace std;
+
+void run(int NUM_QUEUE) {
+    std::vector<float*> hostA(NUM_QUEUE);
+    std::vector<float*> hostB(NUM_QUEUE);
+    std::vector<float*> hostC(NUM_QUEUE);
+
+    std::vector<float*> deviceA(NUM_QUEUE);
+    std::vector<float*> deviceB(NUM_QUEUE);
+    std::vector<float*> deviceC(NUM_QUEUE);
+
+    std::vector<hipStream_t> streams(NUM_QUEUE);
+
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);
+    cout << " System minor " << devProp.minor << endl;
+    cout << " System major " << devProp.major << endl;
+    cout << " agent prop name " << devProp.name << endl;
+    cout << "hip Device prop succeeded " << endl ;
+
+    int i;
+    int errors;
+
+    for (int q=0; q<NUM_QUEUE; q++) {
+        hipStreamCreateWithFlags(&streams[q], hipStreamNonBlocking);
+
+        HIP_ASSERT(hipHostMalloc(&hostA[q], NUM * sizeof(float), 0));
+        HIP_ASSERT(hipHostMalloc(&hostB[q], NUM * sizeof(float), 0));
+        HIP_ASSERT(hipHostMalloc(&hostC[q], NUM * sizeof(float), 0));
+        
+        // initialize the input data
+        for (i = 0; i < NUM; i++) {
+            hostB[q][i] = (float)i;
+            hostC[q][i] = (float)i*100.0f;
+        }
+
+        HIP_ASSERT(hipMalloc((void**)(&deviceA[q]), NUM * sizeof(float)));
+        HIP_ASSERT(hipMalloc((void**)(&deviceB[q]), NUM * sizeof(float)));
+        HIP_ASSERT(hipMalloc((void**)(&deviceC[q]), NUM * sizeof(float)));
+        
+        HIP_ASSERT(hipMemcpyAsync(deviceB[q], hostB[q], NUM*sizeof(float), hipMemcpyHostToDevice, streams[q]));
+        HIP_ASSERT(hipMemcpyAsync(deviceC[q], hostC[q], NUM*sizeof(float), hipMemcpyHostToDevice, streams[q]));
+    }
+hipDeviceSynchronize();
+
+    for (int RUN_I=0; RUN_I<2; RUN_I++)
+    {
+        int q = (4*RUN_I+0)%NUM_QUEUE;
+        hipLaunchKernelGGL(addition_kernel,
+                        dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
+                        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                        0, streams[q],
+                        deviceA[q], deviceB[q], deviceC[q] ,WIDTH ,HEIGHT);
+
+hipDeviceSynchronize();
+        q = (4*RUN_I+1)%NUM_QUEUE;
+        hipLaunchKernelGGL(subtract_kernel,
+                        dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
+                        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                        0, streams[q],
+                        deviceA[q], deviceB[q], deviceC[q] ,WIDTH ,HEIGHT);
+
+hipDeviceSynchronize();
+        q = (4*RUN_I+2)%NUM_QUEUE;
+        hipLaunchKernelGGL(multiply_kernel,
+                        dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
+                        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                        0, streams[q],
+                        deviceA[q], deviceB[q], deviceC[q] ,WIDTH ,HEIGHT);
+
+hipDeviceSynchronize();
+        q = (4*RUN_I+3)%NUM_QUEUE;
+        hipLaunchKernelGGL(divide_kernel,
+                        dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
+                        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                        0, streams[q],
+                        deviceB[q], deviceA[q], deviceC[q] ,WIDTH ,HEIGHT);
+hipDeviceSynchronize();
+    }
+
+    for (int q=0; q<NUM_QUEUE; q++)
+        HIP_ASSERT(hipMemcpyAsync(hostA[q], deviceA[q], NUM*sizeof(float), hipMemcpyDeviceToHost, streams[q]));
+
+
+    for (int q=0; q<NUM_QUEUE; q++) {
+        HIP_ASSERT(hipMemcpy(hostA[q], deviceA[q], NUM*sizeof(float), hipMemcpyDeviceToHost));
+        hipDeviceSynchronize();
+
+        HIP_ASSERT(hipFree(deviceA[q]));
+        HIP_ASSERT(hipFree(deviceB[q]));
+        HIP_ASSERT(hipFree(deviceC[q]));
+
+        hipHostFree(hostA[q]);
+        hipHostFree(hostB[q]);
+        hipHostFree(hostC[q]);
+        hipStreamDestroy(streams[q]);
+    }
+}
+
+int main() {
+    run(1);
+    return 0;
+}
@@ -0,0 +1,62 @@
+# counter correctness test
+add_test(
+    NAME pmc_correctnes_vadd_test
+    COMMAND
+        ${PROJECT_BINARY_DIR}/rocprofv2 -i
+        ${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps/goldentraces/pmc.txt -d
+        /tmp/tests-v2/pmc -o vadd tests-v2/featuretests/profiler/apps/pmc_vectoradd
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
+
+set_tests_properties(
+    pmc_correctnes_vadd_test PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT
+                                        "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
+
+add_test(
+    NAME pmc_correctnes_vadd_test_validation
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_vectoradd.py
+            "/tmp/tests-v2/pmc/pmc_1/results_vadd.csv"
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
+
+set_tests_properties(
+    pmc_correctnes_vadd_test_validation
+    PROPERTIES DEPENDS
+               pmc_correctnes_vadd_test
+               LABELS
+               "v2;validation"
+               PASS_REGULAR_EXPRESSION
+               "Test Passed"
+               FAIL_REGULAR_EXPRESSION
+               "Test Failed"
+               SKIP_REGULAR_EXPRESSION
+               "Skipped")
+
+add_test(
+    NAME pmc_correctnes_histogram_test
+    COMMAND
+        ${PROJECT_BINARY_DIR}/rocprofv2 -i
+        ${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps/goldentraces/pmc.txt -d
+        /tmp/tests-v2/pmc -o histo tests-v2/featuretests/profiler/apps/pmc_histogram
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
+
+set_tests_properties(
+    pmc_correctnes_histogram_test PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT
+                                             "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
+
+add_test(
+    NAME pmc_correctnes_histogram_test_validation
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_histogram.py
+            "/tmp/tests-v2/pmc/pmc_1/results_histo.csv"
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
+
+set_tests_properties(
+    pmc_correctnes_histogram_test_validation
+    PROPERTIES DEPENDS
+               pmc_correctnes_histogram_test
+               LABELS
+               "v2;validation"
+               PASS_REGULAR_EXPRESSION
+               "Test Passed"
+               FAIL_REGULAR_EXPRESSION
+               "Test Failed"
+               SKIP_REGULAR_EXPRESSION
+               "Skipped")
@@ -0,0 +1,73 @@
+import numpy as np
+import pandas
+import os
+import glob
+
+current_dir = os.getcwd()
+rocprof = "rocprofv2"
+
+expected_filename = "pmc_1/results_histo.csv"
+output_folder = "/tmp/tests-v2/pmc"
+
+
+def test_grbm(csvfile):
+    count = np.array(csvfile["GRBM_COUNT"])
+    active = np.array(csvfile["GRBM_GUI_ACTIVE"])
+    assert np.all(active > 0)  # GPU must always be active
+    assert np.all(count >= active)  # Count always increments more than active
+    assert np.all(
+        count * 0.5 < active
+    )  # We can reasonably expect an active GPU during the kernel execution
+
+
+def test_insts(csvfile):
+    waves = np.array(
+        csvfile["SQ_WAVES"]
+    )  # TODO: 256 for wave32, need to check for wave64
+    lds = np.array(csvfile["SQ_INSTS_LDS"])
+    valu = np.array(csvfile["SQ_INSTS_VALU"])
+    salu = np.array(csvfile["SQ_INSTS_SALU"])
+    smem = np.array(csvfile["SQ_INSTS_SMEM"])
+
+    assert np.all(waves == 256) or np.all(waves == 128)
+
+    # Each have executes at least one of these
+    assert np.all(lds > waves)
+    assert np.all(valu > waves)
+    assert np.all(salu > waves)
+    assert np.all(smem >= waves)
+
+
+def test_sqcycles(csvfile):
+    tabusy = np.array(csvfile["TA_BUSY_max"])
+    grbm = np.array(csvfile["GRBM_GUI_ACTIVE"])
+    waves = np.array(csvfile["SQ_WAVES"])
+    wait_any = np.array(csvfile["SQ_WAIT_ANY"])
+    wave_cycles = np.array(csvfile["SQ_WAVE_CYCLES"])
+    vmem_cycles = np.array(csvfile["SQ_INST_CYCLES_VMEM"])
+
+    lds = np.array(csvfile["SQ_INSTS_LDS"])
+    valu = np.array(csvfile["SQ_INSTS_VALU"])
+    salu = np.array(csvfile["SQ_INSTS_SALU"])
+    smem = np.array(csvfile["SQ_INSTS_SMEM"])
+
+    ALU = lds + valu + salu + smem
+
+    assert np.all(
+        wave_cycles >= ALU + wait_any
+    )  # Each ALU inst takes at least one cycle
+    assert np.all(wave_cycles / grbm <= waves)  # Mean occupancy cannot exceed waves
+    assert np.all(wait_any >= tabusy)  # Waves are waiting for ta
+    assert np.all(
+        vmem_cycles >= waves
+    )  # Each wave takes at least one cycle to issue vmem
+
+
+if __name__ == "__main__":
+    csv = pandas.read_csv(f"{output_folder}/{expected_filename}")
+    test_grbm(csv)
+    test_insts(csv)
+    # test_sqcycles(csv)
+
+    # if its reached this point, then all tests apssed
+    print("Test Passed: All counter correctness tests passed.")
@@ -0,0 +1,114 @@
+import numpy as np
+import pandas
+import os
+import glob
+
+current_dir = os.getcwd()
+rocprof = "rocprofv2"
+
+expected_filename = "pmc_1/results_vadd.csv"
+output_folder = "/tmp/tests-v2/pmc"
+
+
+def test_grbm(csvfile):
+    count = np.array(csvfile["GRBM_COUNT"])
+    active = np.array(csvfile["GRBM_GUI_ACTIVE"])
+    assert np.all(active > 0)  # GPU must always be active
+    assert np.all(count >= active)  # Count always increments more than active
+    assert np.all(
+        count * 0.8 < active
+    )  # We can reasonably expect an active GPU during the kernel execution
+
+
+def test_sqwaves(csvfile):
+    waves = np.array(
+        csvfile["SQ_WAVES"]
+    )  # 1M threads == 32k waves for Wave32 and 16k waves for Wave64
+    assert np.all(waves == 32768) or np.all(waves == 16384)
+
+
+def test_insts(csvfile):
+    waves = np.array(csvfile["SQ_WAVES"])
+    valu = np.array(csvfile["SQ_INSTS_VALU"])
+    salu = np.array(csvfile["SQ_INSTS_SALU"])
+    smem = np.array(csvfile["SQ_INSTS_SMEM"])
+    lds = np.array(csvfile["SQ_INSTS_LDS"])
+
+    assert np.all(lds == 0)  # Not used on vectoradd
+
+    # VALU, SALU and SMEM must be divisible by SQ_Waves
+    assert np.all(valu % waves == 0)
+    assert np.all(salu % waves == 0)
+    assert np.all(smem % waves == 0)
+
+    # Each have executes at least one of these
+    assert np.all(valu > waves)
+    assert np.all(salu > waves)
+    assert np.all(smem >= waves)
+
+    # TODO: Check assembly for exact number!
+
+
+def test_gl2c(csvfile):
+    waves = np.array(csvfile["SQ_WAVES"])
+    read = np.array(csvfile["GL2C_MC_RDREQ_sum"])
+    write = np.array(csvfile["GL2C_MC_WRREQ_sum"])
+    hit = np.array(csvfile["GL2C_HIT_sum"])
+    miss = np.array(csvfile["GL2C_MISS_sum"])
+    hitrate = np.array(csvfile["L2CacheHit"])
+
+    assert np.all(write >= waves)  # We do at least one write per wave
+    # TODO: Find out why the first kernel gets such a high write request count.
+    assert np.all(write < 2.5 * waves)  # We do only one write (+ a little) per wave.
+    assert np.all(read >= 2 * waves)  # We do at least 2 reads per wave (A=B+C)
+    assert np.all(read < 3 * waves)  # We do only 2 reads (+ a little) per wave
+
+    assert np.all(miss >= hit)  # on Vadd we can't have more hits than misses
+    assert np.all(miss >= 2 * waves)  # Each read misses at least once
+    assert np.all(miss < 4 * waves)  # Can't miss too much
+    assert np.all(hit >= 0.5 * waves)  # We have at least one hit per wave
+
+    assert np.all(hitrate <= 50)  # We always get more misses than hits
+
+
+def test_ta(csvfile):
+    busy = np.array(csvfile["MemUnitBusy"])
+    some_busy = np.array(csvfile["TA_BUSY_max"]) / np.array(csvfile["GRBM_GUI_ACTIVE"])
+
+    assert np.all(busy <= 100)  # MemUnitBusy <= 100%
+    assert np.all(some_busy >= 1)  # Some shader engine is using TA
+
+
+def test_sqcycles(csvfile):
+    tabusy = np.array(csvfile["TA_BUSY_max"])
+    grbm = np.array(csvfile["GRBM_GUI_ACTIVE"])
+    waves = np.array(csvfile["SQ_WAVES"])
+    ALU = np.array(csvfile["SQ_INSTS_VALU"]) + np.array(csvfile["SQ_INSTS_SALU"])
+    wait_any = np.array(csvfile["SQ_WAIT_ANY"])
+    wave_cycles = np.array(csvfile["SQ_WAVE_CYCLES"])
+    vmem_cycles = np.array(csvfile["SQ_INST_CYCLES_VMEM"])
+
+    assert np.all(
+        wave_cycles >= ALU + wait_any
+    )  # Each ALU inst takes at least one cycle
+    assert np.all(wave_cycles / grbm <= waves)  # Mean occupancy cannot exceed waves
+    assert np.all(wait_any >= tabusy)  # Waves are waiting for ta
+    assert np.all(
+        vmem_cycles >= waves
+    )  # Each wave takes at least one cycle to issue vmem
+    assert np.all(
+        wait_any / wave_cycles >= 0.5
+    )  # vectorAdd is very memory-bound. TODO: use number less arbitrary than 0.5
+
+
+if __name__ == "__main__":
+    csv = pandas.read_csv(f"{output_folder}/{expected_filename}")
+    test_grbm(csv)
+    test_sqwaves(csv)
+    test_insts(csv)
+    # test_gl2c(csv)
+    # test_ta(csv)
+    # test_sqcycles(csv)
+
+    # if its reached this point, then all tests apssed
+    print("Test Passed: All counter correctness tests passed.")
				`@@ -0,0 +1 @@`
				`pmc: GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_INSTS_LDS SQ_INSTS_VALU SQ_INSTS_SALU SQ_INSTS_SMEM L2CacheHit TA_BUSY_max MemUnitBusy`