Initial commit for counter correctness tests.

Change-Id: I74caa4ab89bd765e59686cfbaaf1ce44ab10fe19

- adds counter correctness tests for GRBM, SQWAVES and SQ INSTRUCTIONs
- pandas dependency
- made it compatible with test infra.

Change-Id: I74caa4ab89bd765e59686cfbaaf1ce44ab10fe19
This commit is contained in:
Giovanni LB
2023-10-15 01:57:53 -03:00
committed by Gopesh Bhardwaj
parent c1884d70fc
commit 3f5ddfa0ab
9 changed files with 1099 additions and 0 deletions
@@ -172,6 +172,49 @@ endif()
# COMPONENT tests)
# endif()
# pmc correctness vectoradd
set_source_files_properties(apps/vectoradd.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
rocprofiler_featuretests_profiler_add_executable(pmc_vectoradd apps/vectoradd.cpp)
set_target_properties(
pmc_vectoradd PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps")
target_link_options(pmc_vectoradd PRIVATE "-Wl,--build-id=md5")
install(
TARGETS pmc_vectoradd
RUNTIME
DESTINATION
${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps
COMPONENT tests)
# pmc correctness hstogram
set_source_files_properties(apps/histogram.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
rocprofiler_featuretests_profiler_add_executable(pmc_histogram apps/histogram.cpp)
set_target_properties(
pmc_histogram PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps")
target_link_options(pmc_histogram PRIVATE "-Wl,--build-id=md5")
install(
TARGETS pmc_histogram
RUNTIME
DESTINATION
${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps
COMPONENT tests)
# pmc correctness transpose
set_source_files_properties(apps/transpose.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
rocprofiler_featuretests_profiler_add_executable(pmc_transpose apps/transpose.cpp)
set_target_properties(
pmc_transpose PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps")
target_link_options(pmc_transpose PRIVATE "-Wl,--build-id=md5")
install(
TARGETS pmc_transpose
RUNTIME
DESTINATION
${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps
COMPONENT tests)
# hsa-mem_async_copy -- Not Enabled for Now
set_source_files_properties(apps/async_mem_copy.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT
1)
@@ -381,4 +424,5 @@ find_package(
else()
# cmake based tests
include(${CMAKE_CURRENT_LIST_DIR}/counter_validation_tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/counter_correctness_tests.cmake)
endif()
@@ -0,0 +1 @@
pmc: GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_INSTS_LDS SQ_INSTS_VALU SQ_INSTS_SALU SQ_INSTS_SMEM L2CacheHit TA_BUSY_max MemUnitBusy
@@ -0,0 +1,281 @@
/**********************************************************************
Copyright 2015 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************/
#include <math.h>
#include "hip/hip_runtime.h"
#include <assert.h>
#include <stdio.h>
#include <algorithm>
#include <stdlib.h>
#include <iostream>
#include <unistd.h>
#include <vector>
#include "histogram.hpp"
#define LINEAR_MEM_ACCESS
#define BIN_SIZE 256
#define SDK_SUCCESS 0
#define SDK_FAILURE 1
#define CHECK_ALLOCATION(x, msg) if(!(x)) { std::cout << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; }
/**
* @brief Calculates block-histogram bin whose bin size is 256
* @param data input data pointer
* @param sharedArray shared array for thread-histogram bins
* @param binResult block-histogram array
*/
__global__
void histogram256(
unsigned int* data,
unsigned int* binResult)
{
HIP_DYNAMIC_SHARED(unsigned char, sharedArray);
size_t localId = hipThreadIdx_x;
size_t globalId = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x;
size_t groupId = hipBlockIdx_x;
size_t groupSize = hipBlockDim_x;
int offSet1 = localId & 31;
int offSet2 = 4 * offSet1; //which element to access in one bank.
int offSet3 = localId >> 5; //bank number
/* initialize shared array to zero */
uchar4 * input = (uchar4*)sharedArray;
for(int i = 0; i < 64; ++i)
input[groupSize * i + localId] = make_uchar4(0,0,0,0);
__syncthreads();
/* calculate thread-histograms */
//128 accumulations per thread
for(int i = 0; i < 128; i++)
{
#ifdef LINEAR_MEM_ACCESS
uint value = data[groupId * (groupSize * (BIN_SIZE/2)) + i * groupSize + localId];
#else
uint value = data[globalId + i*4096];
#endif // LINEAR_MEM_ACCESS
sharedArray[value * 128 + offSet2 + offSet3]++;
}
__syncthreads();
/* merge all thread-histograms into block-histogram */
uint4 binCount;
uint result;
uchar4 binVal; //Introduced uint4 for summation to avoid overflows
uint4 binValAsUint;
for(int i = 0; i < BIN_SIZE / groupSize; ++i)
{
int passNumber = BIN_SIZE / 2 * 32 * i + localId * 32 ;
binCount = make_uint4(0,0,0,0);
result= 0;
for(int j = 0; j < 32; ++j)
{
int bankNum = (j + offSet1) & 31; // this is bank number
binVal = input[passNumber +bankNum];
binValAsUint.x = (unsigned int)binVal.x;
binValAsUint.y = (unsigned int)binVal.y;
binValAsUint.z = (unsigned int)binVal.z;
binValAsUint.w = (unsigned int)binVal.w;
binCount.x += binValAsUint.x;
binCount.y += binValAsUint.y;
binCount.z += binValAsUint.z;
binCount.w += binValAsUint.w;
}
result = binCount.x + binCount.y + binCount.z + binCount.w;
binResult[groupId * BIN_SIZE + groupSize * i + localId ] = result;
}
}
int
Histogram::calculateHostBin()
{
for(int i = 0; i < height; ++i)
{
for(int j = 0; j < width; ++j)
{
hostBin[data[i * width + j]]++;
}
}
return SDK_SUCCESS;
}
int
Histogram::setupHistogram()
{
int i = 0;
data = (unsigned int *)malloc(sizeof(unsigned int) * width * height);
for(i = 0; i < width * height; i++)
{
data[i] = rand() % (unsigned int)(binSize);
}
hostBin = (unsigned int*)malloc(binSize * sizeof(unsigned int));
CHECK_ALLOCATION(hostBin, "Failed to allocate host memory. (hostBin)");
memset(hostBin, 0, binSize * sizeof(unsigned int));
deviceBin = (unsigned int*)malloc(binSize * sizeof(unsigned int));
CHECK_ALLOCATION(deviceBin, "Failed to allocate host memory. (deviceBin)");
midDeviceBin = (unsigned int*)malloc(sizeof(unsigned int) * binSize * subHistgCnt);
memset(deviceBin, 0, binSize * sizeof(unsigned int));
return SDK_SUCCESS;
}
int
Histogram::setupHIP(void)
{
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
cout << " System minor " << devProp.minor << endl;
cout << " System major " << devProp.major << endl;
cout << " agent prop name " << devProp.name << endl;
return SDK_SUCCESS;
}
int
Histogram::runKernels(void)
{
groupSize = 128;
globalThreads = (width * height) / (GROUP_ITERATIONS);
localThreads = groupSize;
hipHostMalloc((void**)&dataBuf,sizeof(unsigned int) * width * height, hipHostMallocDefault);
unsigned int *din;
hipHostGetDevicePointer((void**)&din, dataBuf,0);
hipMemcpy(din, data,sizeof(unsigned int) * width * height, hipMemcpyHostToDevice);
subHistgCnt = (width * height) / (groupSize * groupIterations);
hipHostMalloc((void**)&midDeviceBinBuf,sizeof(unsigned int) * binSize * subHistgCnt, hipHostMallocDefault);
hipLaunchKernelGGL(histogram256,
dim3(globalThreads/localThreads),
dim3(localThreads),
groupSize * binSize * sizeof(unsigned char), 0,
dataBuf ,midDeviceBinBuf);
hipDeviceSynchronize();
hipMemcpy(midDeviceBin, midDeviceBinBuf,sizeof(unsigned int) * binSize * subHistgCnt, hipMemcpyDeviceToHost);
//printArray<unsigned int>("midDeviceBin", midDeviceBin, sizeof(unsigned int) * binSize * subHistgCnt, 1);
// Clear deviceBin array
memset(deviceBin, 0, binSize * sizeof(unsigned int));
// Calculate final histogram bin
for(int i = 0; i < subHistgCnt; ++i)
{
for(int j = 0; j < binSize; ++j)
{
deviceBin[j] += midDeviceBin[i * binSize + j];
}
}
return SDK_SUCCESS;
}
int
Histogram::setup()
{
if(iterations < 1)
{
std::cout<<"Error, iterations cannot be 0 or negative. Exiting..\n";
exit(0);
}
int status = 0;
/* width must be multiples of binSize and
* height must be multiples of groupSize
*/
width = (width / binSize ? width / binSize: 1) * binSize;
height = (height / groupSize ? height / groupSize: 1) * groupSize;
status = setupHIP();
if(status != SDK_SUCCESS)
return status;
status = setupHistogram();
if(status != SDK_SUCCESS)
return status;
return SDK_SUCCESS;
}
int Histogram::run()
{
for(int i = 0; i < 2 && iterations != 1; i++)
if(runKernels() != SDK_SUCCESS)
return SDK_FAILURE;
for(int i = 0; i < iterations; i++)
if(runKernels() != SDK_SUCCESS)
return SDK_FAILURE;
return SDK_SUCCESS;
}
int Histogram::cleanup()
{
hipFree(dataBuf);
hipFree(midDeviceBinBuf);
free(hostBin);
free(deviceBin);
return SDK_SUCCESS;
}
int
main(int argc, char * argv[])
{
int status = 0;
// Create MonteCalroAsian object
Histogram hipHistogram;
// Setup
status = hipHistogram.setup();
if(status != SDK_SUCCESS)
return status;
// Run
if(hipHistogram.run() != SDK_SUCCESS)
return SDK_FAILURE;
// Cleanup resources created
if(hipHistogram.cleanup() != SDK_SUCCESS)
return SDK_FAILURE;
return SDK_SUCCESS;
}
@@ -0,0 +1,181 @@
/**********************************************************************
Copyright 2015 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************/
#ifndef HISTOGRAM_H_
#define HISTOGRAM_H_
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
using namespace std;
#define SAMPLE_VERSION "HIP-Examples-Applications-v1.0"
#define WIDTH 1024
#define HEIGHT 1024
#define BIN_SIZE 256
#define GROUP_SIZE 128
#define GROUP_ITERATIONS (BIN_SIZE / 2)//This is done to avoid overflow in the kernel
#define SUB_HISTOGRAM_COUNT ((WIDTH * HEIGHT) /(GROUP_SIZE * GROUP_ITERATIONS))
#ifndef __global__
#define __global__
#endif
#ifndef HIP_DYNAMIC_SHARED
#define HIP_DYNAMIC_SHARED(t,x) t* x
#endif
#ifndef hipThreadIdx_x
#define hipThreadIdx_x 0
#endif
#ifndef hipThreadIdx_y
#define hipThreadIdx_y 0
#endif
#ifndef hipThreadIdx_z
#define hipThreadIdx_z 0
#endif
#ifndef hipBlockIdx_x
#define hipBlockIdx_x 0
#endif
#ifndef hipBlockDim_x
#define hipBlockDim_x 0
#endif
#ifndef __syncthreads
#define __syncthreads()
#endif
/**
* Histogram
* Class implements 256 Histogram bin implementation
*/
class Histogram
{
int binSize; /**< Size of Histogram bin */
int groupSize; /**< Number of threads in group */
int subHistgCnt; /**< Sub histogram count */
unsigned int *data; /**< input data initialized with normalized(0 - binSize) random values */
int width; /**< width of the input */
int height; /**< height of the input */
unsigned int *hostBin; /**< Host result for histogram bin */
unsigned int *midDeviceBin; /**< Intermittent sub-histogram bins */
unsigned int *deviceBin; /**< Device result for histogram bin */
unsigned long totalLocalMemory; /**< Max local memory allowed */
unsigned long usedLocalMemory; /**< Used local memory by kernel */
unsigned int* dataBuf; /**< CL memory buffer for data */
unsigned int* midDeviceBinBuf; /**< CL memory buffer for intermittent device bin */
int iterations; /**< Number of iterations for kernel execution */
unsigned int globalThreads;
unsigned int localThreads ;
int groupIterations;
public:
/**
* Constructor
* Initialize member variables
* @param name name of sample (string)
*/
Histogram()
:
binSize(BIN_SIZE),
groupSize(GROUP_SIZE),
subHistgCnt(SUB_HISTOGRAM_COUNT),
groupIterations(GROUP_ITERATIONS),
data(NULL),
hostBin(NULL),
midDeviceBin(NULL),
deviceBin(NULL),
iterations(1)
{
/* Set default values for width and height */
width = WIDTH;
height = HEIGHT;
}
~Histogram()
{
}
/**
* Allocate and initialize required host memory with appropriate values
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
*/
int setupHistogram();
/**
* HIP related initialisations.
* Set up Context, Device list, Command Queue, Memory buffers
* Build HIP kernel program executable
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
*/
int setupHIP();
/**
* Set values for kernels' arguments, enqueue calls to the kernels
* on to the command queue, wait till end of kernel execution.
* Get kernel start and end time if timing is enabled
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
*/
int runKernels();
/**
* Override from SDKSample, adjust width and height
* of execution domain, perform all sample setup
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
*/
int setup();
/**
* Override from SDKSample
* Run HIP Black-Scholes
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
*/
int run();
/**
* Override from SDKSample
* Cleanup memory allocations
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
*/
int cleanup();
private:
/**
* Calculate histogram bin on host
*/
int calculateHostBin();
};
#endif
@@ -0,0 +1,127 @@
/**********************************************************************
Copyright ©2023 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
• Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
• Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************/
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <iostream>
#define TILE_DIM 16
#define BLOCK_ROWS 16
__global__ void transposeNaive(float *odata, float *idata, int width, int height) {
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + width * yIndex;
int index_out = yIndex + height * xIndex;
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
odata[index_out + i] = idata[index_in + i * width];
}
}
__global__ void transposeCoalesced(float *odata, float *idata, int width, int height) {
__shared__ float tile[TILE_DIM][TILE_DIM];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)*width;
xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)*height;
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width];
}
__syncthreads();
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
}
}
__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height) {
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)*width;
xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)*height;
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width];
}
__syncthreads();
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
}
}
/**
* fillRandom
* fill array with random values
*/
__global__ void fillRandom(float* arrayPtr, int size) {
int id = threadIdx.x + blockDim.x * blockIdx.x;
uint32_t seed = 12345678;
while (id < size) {
seed = (seed<<3) ^ id;
arrayPtr[id] = seed;
id += blockDim.x * gridDim.x;
}
}
int main(int argc, char * argv[])
{
float* input = nullptr;
float* output = nullptr;
size_t width = 1024;
size_t height = 1024;
// Set input data to matrix A and matrix B
hipMalloc((void**)&input, width * height * sizeof(float));
hipMalloc((void**)&output, width * height * sizeof(float));
fillRandom<<<256, 256>>>(input, width*height);
hipDeviceSynchronize();
dim3 block(width/TILE_DIM,height/TILE_DIM);
dim3 threads(TILE_DIM,TILE_DIM);
transposeNoBankConflicts<<<block, threads>>>(
output,
input,
width,
height
);
hipDeviceSynchronize();
hipFree(input);
hipFree(output);
return 0;
}
@@ -0,0 +1,216 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <assert.h>
#include <stdio.h>
#include <algorithm>
#include <stdlib.h>
#include <iostream>
#include "hip/hip_runtime.h"
#include <unistd.h>
#include <vector>
#ifdef NDEBUG
#define HIP_ASSERT(x) x
#else
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
#endif
#define WIDTH (1024)
#define HEIGHT (1024)
#define NUM (WIDTH*HEIGHT)
#define THREADS_PER_BLOCK_X 64
#define THREADS_PER_BLOCK_Y 1
#define THREADS_PER_BLOCK_Z 1
// Computes vectorAdd with matrix-multiply
template<typename T>
__global__ void addition_kernel(
T* __restrict__ a,
const float* __restrict__ b,
const float* __restrict__ c,
int width,
int height
) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x >= WIDTH || y >= HEIGHT) return;
int index = y * width + x;
a[index] = b[index]+c[index];
}
__global__ void subtract_kernel(
float* __restrict__ a,
const float* __restrict__ b,
const float* __restrict__ c,
int width,
int height
) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x >= WIDTH || y >= HEIGHT) return;
int index = y * width + x;
a[index] = abs(b[index]-c[index]);
}
__global__ void multiply_kernel(
float* __restrict__ a,
const float* __restrict__ b,
const float* __restrict__ c,
int width,
int height
) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x >= WIDTH || y >= HEIGHT) return;
int index = y * width + x;
a[index] = (b[index]-1)*(c[index]-1)+1;
}
__global__ void divide_kernel(
float* __restrict__ a,
const float* __restrict__ b,
const float* __restrict__ c,
int width,
int height
) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x >= WIDTH || y >= HEIGHT) return;
int index = y * width + x;
a[index] = (b[index]-c[index]) / abs(c[index]+b[index]) + 1;
}
using namespace std;
void run(int NUM_QUEUE) {
std::vector<float*> hostA(NUM_QUEUE);
std::vector<float*> hostB(NUM_QUEUE);
std::vector<float*> hostC(NUM_QUEUE);
std::vector<float*> deviceA(NUM_QUEUE);
std::vector<float*> deviceB(NUM_QUEUE);
std::vector<float*> deviceC(NUM_QUEUE);
std::vector<hipStream_t> streams(NUM_QUEUE);
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
cout << " System minor " << devProp.minor << endl;
cout << " System major " << devProp.major << endl;
cout << " agent prop name " << devProp.name << endl;
cout << "hip Device prop succeeded " << endl ;
int i;
int errors;
for (int q=0; q<NUM_QUEUE; q++) {
hipStreamCreateWithFlags(&streams[q], hipStreamNonBlocking);
HIP_ASSERT(hipHostMalloc(&hostA[q], NUM * sizeof(float), 0));
HIP_ASSERT(hipHostMalloc(&hostB[q], NUM * sizeof(float), 0));
HIP_ASSERT(hipHostMalloc(&hostC[q], NUM * sizeof(float), 0));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[q][i] = (float)i;
hostC[q][i] = (float)i*100.0f;
}
HIP_ASSERT(hipMalloc((void**)(&deviceA[q]), NUM * sizeof(float)));
HIP_ASSERT(hipMalloc((void**)(&deviceB[q]), NUM * sizeof(float)));
HIP_ASSERT(hipMalloc((void**)(&deviceC[q]), NUM * sizeof(float)));
HIP_ASSERT(hipMemcpyAsync(deviceB[q], hostB[q], NUM*sizeof(float), hipMemcpyHostToDevice, streams[q]));
HIP_ASSERT(hipMemcpyAsync(deviceC[q], hostC[q], NUM*sizeof(float), hipMemcpyHostToDevice, streams[q]));
}
hipDeviceSynchronize();
for (int RUN_I=0; RUN_I<2; RUN_I++)
{
int q = (4*RUN_I+0)%NUM_QUEUE;
hipLaunchKernelGGL(addition_kernel,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, streams[q],
deviceA[q], deviceB[q], deviceC[q] ,WIDTH ,HEIGHT);
hipDeviceSynchronize();
q = (4*RUN_I+1)%NUM_QUEUE;
hipLaunchKernelGGL(subtract_kernel,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, streams[q],
deviceA[q], deviceB[q], deviceC[q] ,WIDTH ,HEIGHT);
hipDeviceSynchronize();
q = (4*RUN_I+2)%NUM_QUEUE;
hipLaunchKernelGGL(multiply_kernel,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, streams[q],
deviceA[q], deviceB[q], deviceC[q] ,WIDTH ,HEIGHT);
hipDeviceSynchronize();
q = (4*RUN_I+3)%NUM_QUEUE;
hipLaunchKernelGGL(divide_kernel,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, streams[q],
deviceB[q], deviceA[q], deviceC[q] ,WIDTH ,HEIGHT);
hipDeviceSynchronize();
}
for (int q=0; q<NUM_QUEUE; q++)
HIP_ASSERT(hipMemcpyAsync(hostA[q], deviceA[q], NUM*sizeof(float), hipMemcpyDeviceToHost, streams[q]));
for (int q=0; q<NUM_QUEUE; q++) {
HIP_ASSERT(hipMemcpy(hostA[q], deviceA[q], NUM*sizeof(float), hipMemcpyDeviceToHost));
hipDeviceSynchronize();
HIP_ASSERT(hipFree(deviceA[q]));
HIP_ASSERT(hipFree(deviceB[q]));
HIP_ASSERT(hipFree(deviceC[q]));
hipHostFree(hostA[q]);
hipHostFree(hostB[q]);
hipHostFree(hostC[q]);
hipStreamDestroy(streams[q]);
}
}
int main() {
run(1);
return 0;
}
@@ -0,0 +1,62 @@
# counter correctness test
add_test(
NAME pmc_correctnes_vadd_test
COMMAND
${PROJECT_BINARY_DIR}/rocprofv2 -i
${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps/goldentraces/pmc.txt -d
/tmp/tests-v2/pmc -o vadd tests-v2/featuretests/profiler/apps/pmc_vectoradd
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
set_tests_properties(
pmc_correctnes_vadd_test PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
add_test(
NAME pmc_correctnes_vadd_test_validation
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_vectoradd.py
"/tmp/tests-v2/pmc/pmc_1/results_vadd.csv"
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
set_tests_properties(
pmc_correctnes_vadd_test_validation
PROPERTIES DEPENDS
pmc_correctnes_vadd_test
LABELS
"v2;validation"
PASS_REGULAR_EXPRESSION
"Test Passed"
FAIL_REGULAR_EXPRESSION
"Test Failed"
SKIP_REGULAR_EXPRESSION
"Skipped")
add_test(
NAME pmc_correctnes_histogram_test
COMMAND
${PROJECT_BINARY_DIR}/rocprofv2 -i
${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps/goldentraces/pmc.txt -d
/tmp/tests-v2/pmc -o histo tests-v2/featuretests/profiler/apps/pmc_histogram
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
set_tests_properties(
pmc_correctnes_histogram_test PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
add_test(
NAME pmc_correctnes_histogram_test_validation
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_histogram.py
"/tmp/tests-v2/pmc/pmc_1/results_histo.csv"
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
set_tests_properties(
pmc_correctnes_histogram_test_validation
PROPERTIES DEPENDS
pmc_correctnes_histogram_test
LABELS
"v2;validation"
PASS_REGULAR_EXPRESSION
"Test Passed"
FAIL_REGULAR_EXPRESSION
"Test Failed"
SKIP_REGULAR_EXPRESSION
"Skipped")
@@ -0,0 +1,73 @@
import numpy as np
import pandas
import os
import glob
current_dir = os.getcwd()
rocprof = "rocprofv2"
expected_filename = "pmc_1/results_histo.csv"
output_folder = "/tmp/tests-v2/pmc"
def test_grbm(csvfile):
count = np.array(csvfile["GRBM_COUNT"])
active = np.array(csvfile["GRBM_GUI_ACTIVE"])
assert np.all(active > 0) # GPU must always be active
assert np.all(count >= active) # Count always increments more than active
assert np.all(
count * 0.5 < active
) # We can reasonably expect an active GPU during the kernel execution
def test_insts(csvfile):
waves = np.array(
csvfile["SQ_WAVES"]
) # TODO: 256 for wave32, need to check for wave64
lds = np.array(csvfile["SQ_INSTS_LDS"])
valu = np.array(csvfile["SQ_INSTS_VALU"])
salu = np.array(csvfile["SQ_INSTS_SALU"])
smem = np.array(csvfile["SQ_INSTS_SMEM"])
assert np.all(waves == 256) or np.all(waves == 128)
# Each have executes at least one of these
assert np.all(lds > waves)
assert np.all(valu > waves)
assert np.all(salu > waves)
assert np.all(smem >= waves)
def test_sqcycles(csvfile):
tabusy = np.array(csvfile["TA_BUSY_max"])
grbm = np.array(csvfile["GRBM_GUI_ACTIVE"])
waves = np.array(csvfile["SQ_WAVES"])
wait_any = np.array(csvfile["SQ_WAIT_ANY"])
wave_cycles = np.array(csvfile["SQ_WAVE_CYCLES"])
vmem_cycles = np.array(csvfile["SQ_INST_CYCLES_VMEM"])
lds = np.array(csvfile["SQ_INSTS_LDS"])
valu = np.array(csvfile["SQ_INSTS_VALU"])
salu = np.array(csvfile["SQ_INSTS_SALU"])
smem = np.array(csvfile["SQ_INSTS_SMEM"])
ALU = lds + valu + salu + smem
assert np.all(
wave_cycles >= ALU + wait_any
) # Each ALU inst takes at least one cycle
assert np.all(wave_cycles / grbm <= waves) # Mean occupancy cannot exceed waves
assert np.all(wait_any >= tabusy) # Waves are waiting for ta
assert np.all(
vmem_cycles >= waves
) # Each wave takes at least one cycle to issue vmem
if __name__ == "__main__":
csv = pandas.read_csv(f"{output_folder}/{expected_filename}")
test_grbm(csv)
test_insts(csv)
# test_sqcycles(csv)
# if its reached this point, then all tests apssed
print("Test Passed: All counter correctness tests passed.")
@@ -0,0 +1,114 @@
import numpy as np
import pandas
import os
import glob
current_dir = os.getcwd()
rocprof = "rocprofv2"
expected_filename = "pmc_1/results_vadd.csv"
output_folder = "/tmp/tests-v2/pmc"
def test_grbm(csvfile):
count = np.array(csvfile["GRBM_COUNT"])
active = np.array(csvfile["GRBM_GUI_ACTIVE"])
assert np.all(active > 0) # GPU must always be active
assert np.all(count >= active) # Count always increments more than active
assert np.all(
count * 0.8 < active
) # We can reasonably expect an active GPU during the kernel execution
def test_sqwaves(csvfile):
waves = np.array(
csvfile["SQ_WAVES"]
) # 1M threads == 32k waves for Wave32 and 16k waves for Wave64
assert np.all(waves == 32768) or np.all(waves == 16384)
def test_insts(csvfile):
waves = np.array(csvfile["SQ_WAVES"])
valu = np.array(csvfile["SQ_INSTS_VALU"])
salu = np.array(csvfile["SQ_INSTS_SALU"])
smem = np.array(csvfile["SQ_INSTS_SMEM"])
lds = np.array(csvfile["SQ_INSTS_LDS"])
assert np.all(lds == 0) # Not used on vectoradd
# VALU, SALU and SMEM must be divisible by SQ_Waves
assert np.all(valu % waves == 0)
assert np.all(salu % waves == 0)
assert np.all(smem % waves == 0)
# Each have executes at least one of these
assert np.all(valu > waves)
assert np.all(salu > waves)
assert np.all(smem >= waves)
# TODO: Check assembly for exact number!
def test_gl2c(csvfile):
waves = np.array(csvfile["SQ_WAVES"])
read = np.array(csvfile["GL2C_MC_RDREQ_sum"])
write = np.array(csvfile["GL2C_MC_WRREQ_sum"])
hit = np.array(csvfile["GL2C_HIT_sum"])
miss = np.array(csvfile["GL2C_MISS_sum"])
hitrate = np.array(csvfile["L2CacheHit"])
assert np.all(write >= waves) # We do at least one write per wave
# TODO: Find out why the first kernel gets such a high write request count.
assert np.all(write < 2.5 * waves) # We do only one write (+ a little) per wave.
assert np.all(read >= 2 * waves) # We do at least 2 reads per wave (A=B+C)
assert np.all(read < 3 * waves) # We do only 2 reads (+ a little) per wave
assert np.all(miss >= hit) # on Vadd we can't have more hits than misses
assert np.all(miss >= 2 * waves) # Each read misses at least once
assert np.all(miss < 4 * waves) # Can't miss too much
assert np.all(hit >= 0.5 * waves) # We have at least one hit per wave
assert np.all(hitrate <= 50) # We always get more misses than hits
def test_ta(csvfile):
busy = np.array(csvfile["MemUnitBusy"])
some_busy = np.array(csvfile["TA_BUSY_max"]) / np.array(csvfile["GRBM_GUI_ACTIVE"])
assert np.all(busy <= 100) # MemUnitBusy <= 100%
assert np.all(some_busy >= 1) # Some shader engine is using TA
def test_sqcycles(csvfile):
tabusy = np.array(csvfile["TA_BUSY_max"])
grbm = np.array(csvfile["GRBM_GUI_ACTIVE"])
waves = np.array(csvfile["SQ_WAVES"])
ALU = np.array(csvfile["SQ_INSTS_VALU"]) + np.array(csvfile["SQ_INSTS_SALU"])
wait_any = np.array(csvfile["SQ_WAIT_ANY"])
wave_cycles = np.array(csvfile["SQ_WAVE_CYCLES"])
vmem_cycles = np.array(csvfile["SQ_INST_CYCLES_VMEM"])
assert np.all(
wave_cycles >= ALU + wait_any
) # Each ALU inst takes at least one cycle
assert np.all(wave_cycles / grbm <= waves) # Mean occupancy cannot exceed waves
assert np.all(wait_any >= tabusy) # Waves are waiting for ta
assert np.all(
vmem_cycles >= waves
) # Each wave takes at least one cycle to issue vmem
assert np.all(
wait_any / wave_cycles >= 0.5
) # vectorAdd is very memory-bound. TODO: use number less arbitrary than 0.5
if __name__ == "__main__":
csv = pandas.read_csv(f"{output_folder}/{expected_filename}")
test_grbm(csv)
test_sqwaves(csv)
test_insts(csv)
# test_gl2c(csv)
# test_ta(csv)
# test_sqcycles(csv)
# if its reached this point, then all tests apssed
print("Test Passed: All counter correctness tests passed.")