Initial commit for counter correctness tests.
Change-Id: I74caa4ab89bd765e59686cfbaaf1ce44ab10fe19 - adds counter correctness tests for GRBM, SQWAVES and SQ INSTRUCTIONs - pandas dependency - made it compatible with test infra. Change-Id: I74caa4ab89bd765e59686cfbaaf1ce44ab10fe19
This commit is contained in:
committed by
Gopesh Bhardwaj
parent
c1884d70fc
commit
3f5ddfa0ab
@@ -172,6 +172,49 @@ endif()
|
||||
# COMPONENT tests)
|
||||
# endif()
|
||||
|
||||
# pmc correctness vectoradd
|
||||
set_source_files_properties(apps/vectoradd.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
|
||||
rocprofiler_featuretests_profiler_add_executable(pmc_vectoradd apps/vectoradd.cpp)
|
||||
set_target_properties(
|
||||
pmc_vectoradd PROPERTIES RUNTIME_OUTPUT_DIRECTORY
|
||||
"${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps")
|
||||
target_link_options(pmc_vectoradd PRIVATE "-Wl,--build-id=md5")
|
||||
install(
|
||||
TARGETS pmc_vectoradd
|
||||
RUNTIME
|
||||
DESTINATION
|
||||
${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps
|
||||
COMPONENT tests)
|
||||
|
||||
# pmc correctness hstogram
|
||||
set_source_files_properties(apps/histogram.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
|
||||
rocprofiler_featuretests_profiler_add_executable(pmc_histogram apps/histogram.cpp)
|
||||
set_target_properties(
|
||||
pmc_histogram PROPERTIES RUNTIME_OUTPUT_DIRECTORY
|
||||
"${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps")
|
||||
target_link_options(pmc_histogram PRIVATE "-Wl,--build-id=md5")
|
||||
install(
|
||||
TARGETS pmc_histogram
|
||||
RUNTIME
|
||||
DESTINATION
|
||||
${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps
|
||||
COMPONENT tests)
|
||||
|
||||
# pmc correctness transpose
|
||||
set_source_files_properties(apps/transpose.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
|
||||
rocprofiler_featuretests_profiler_add_executable(pmc_transpose apps/transpose.cpp)
|
||||
set_target_properties(
|
||||
pmc_transpose PROPERTIES RUNTIME_OUTPUT_DIRECTORY
|
||||
"${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps")
|
||||
target_link_options(pmc_transpose PRIVATE "-Wl,--build-id=md5")
|
||||
install(
|
||||
TARGETS pmc_transpose
|
||||
RUNTIME
|
||||
DESTINATION
|
||||
${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps
|
||||
COMPONENT tests)
|
||||
|
||||
|
||||
# hsa-mem_async_copy -- Not Enabled for Now
|
||||
set_source_files_properties(apps/async_mem_copy.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT
|
||||
1)
|
||||
@@ -381,4 +424,5 @@ find_package(
|
||||
else()
|
||||
# cmake based tests
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/counter_validation_tests.cmake)
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/counter_correctness_tests.cmake)
|
||||
endif()
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
pmc: GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_INSTS_LDS SQ_INSTS_VALU SQ_INSTS_SALU SQ_INSTS_SMEM L2CacheHit TA_BUSY_max MemUnitBusy
|
||||
@@ -0,0 +1,281 @@
|
||||
/**********************************************************************
|
||||
Copyright �2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
||||
|
||||
� Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
||||
� Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
********************************************************************/
|
||||
|
||||
#include <math.h>
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <stdlib.h>
|
||||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
|
||||
#include "histogram.hpp"
|
||||
|
||||
#define LINEAR_MEM_ACCESS
|
||||
|
||||
#define BIN_SIZE 256
|
||||
#define SDK_SUCCESS 0
|
||||
#define SDK_FAILURE 1
|
||||
#define CHECK_ALLOCATION(x, msg) if(!(x)) { std::cout << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; }
|
||||
|
||||
|
||||
/**
|
||||
* @brief Calculates block-histogram bin whose bin size is 256
|
||||
* @param data input data pointer
|
||||
* @param sharedArray shared array for thread-histogram bins
|
||||
* @param binResult block-histogram array
|
||||
*/
|
||||
|
||||
__global__
|
||||
void histogram256(
|
||||
unsigned int* data,
|
||||
unsigned int* binResult)
|
||||
{
|
||||
HIP_DYNAMIC_SHARED(unsigned char, sharedArray);
|
||||
size_t localId = hipThreadIdx_x;
|
||||
size_t globalId = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x;
|
||||
size_t groupId = hipBlockIdx_x;
|
||||
size_t groupSize = hipBlockDim_x;
|
||||
int offSet1 = localId & 31;
|
||||
int offSet2 = 4 * offSet1; //which element to access in one bank.
|
||||
int offSet3 = localId >> 5; //bank number
|
||||
/* initialize shared array to zero */
|
||||
uchar4 * input = (uchar4*)sharedArray;
|
||||
for(int i = 0; i < 64; ++i)
|
||||
input[groupSize * i + localId] = make_uchar4(0,0,0,0);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
/* calculate thread-histograms */
|
||||
//128 accumulations per thread
|
||||
for(int i = 0; i < 128; i++)
|
||||
{
|
||||
#ifdef LINEAR_MEM_ACCESS
|
||||
uint value = data[groupId * (groupSize * (BIN_SIZE/2)) + i * groupSize + localId];
|
||||
#else
|
||||
uint value = data[globalId + i*4096];
|
||||
|
||||
#endif // LINEAR_MEM_ACCESS
|
||||
sharedArray[value * 128 + offSet2 + offSet3]++;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
/* merge all thread-histograms into block-histogram */
|
||||
|
||||
uint4 binCount;
|
||||
uint result;
|
||||
uchar4 binVal; //Introduced uint4 for summation to avoid overflows
|
||||
uint4 binValAsUint;
|
||||
for(int i = 0; i < BIN_SIZE / groupSize; ++i)
|
||||
{
|
||||
int passNumber = BIN_SIZE / 2 * 32 * i + localId * 32 ;
|
||||
binCount = make_uint4(0,0,0,0);
|
||||
result= 0;
|
||||
for(int j = 0; j < 32; ++j)
|
||||
{
|
||||
int bankNum = (j + offSet1) & 31; // this is bank number
|
||||
binVal = input[passNumber +bankNum];
|
||||
|
||||
binValAsUint.x = (unsigned int)binVal.x;
|
||||
binValAsUint.y = (unsigned int)binVal.y;
|
||||
binValAsUint.z = (unsigned int)binVal.z;
|
||||
binValAsUint.w = (unsigned int)binVal.w;
|
||||
|
||||
binCount.x += binValAsUint.x;
|
||||
binCount.y += binValAsUint.y;
|
||||
binCount.z += binValAsUint.z;
|
||||
binCount.w += binValAsUint.w;
|
||||
|
||||
}
|
||||
result = binCount.x + binCount.y + binCount.z + binCount.w;
|
||||
binResult[groupId * BIN_SIZE + groupSize * i + localId ] = result;
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
Histogram::calculateHostBin()
|
||||
{
|
||||
for(int i = 0; i < height; ++i)
|
||||
{
|
||||
for(int j = 0; j < width; ++j)
|
||||
{
|
||||
hostBin[data[i * width + j]]++;
|
||||
}
|
||||
}
|
||||
|
||||
return SDK_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
Histogram::setupHistogram()
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
data = (unsigned int *)malloc(sizeof(unsigned int) * width * height);
|
||||
|
||||
for(i = 0; i < width * height; i++)
|
||||
{
|
||||
data[i] = rand() % (unsigned int)(binSize);
|
||||
}
|
||||
|
||||
hostBin = (unsigned int*)malloc(binSize * sizeof(unsigned int));
|
||||
CHECK_ALLOCATION(hostBin, "Failed to allocate host memory. (hostBin)");
|
||||
|
||||
memset(hostBin, 0, binSize * sizeof(unsigned int));
|
||||
|
||||
deviceBin = (unsigned int*)malloc(binSize * sizeof(unsigned int));
|
||||
CHECK_ALLOCATION(deviceBin, "Failed to allocate host memory. (deviceBin)");
|
||||
midDeviceBin = (unsigned int*)malloc(sizeof(unsigned int) * binSize * subHistgCnt);
|
||||
|
||||
memset(deviceBin, 0, binSize * sizeof(unsigned int));
|
||||
return SDK_SUCCESS;
|
||||
}
|
||||
|
||||
int
|
||||
Histogram::setupHIP(void)
|
||||
{
|
||||
hipDeviceProp_t devProp;
|
||||
hipGetDeviceProperties(&devProp, 0);
|
||||
cout << " System minor " << devProp.minor << endl;
|
||||
cout << " System major " << devProp.major << endl;
|
||||
cout << " agent prop name " << devProp.name << endl;
|
||||
|
||||
return SDK_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
Histogram::runKernels(void)
|
||||
{
|
||||
groupSize = 128;
|
||||
globalThreads = (width * height) / (GROUP_ITERATIONS);
|
||||
|
||||
localThreads = groupSize;
|
||||
|
||||
|
||||
hipHostMalloc((void**)&dataBuf,sizeof(unsigned int) * width * height, hipHostMallocDefault);
|
||||
unsigned int *din;
|
||||
hipHostGetDevicePointer((void**)&din, dataBuf,0);
|
||||
hipMemcpy(din, data,sizeof(unsigned int) * width * height, hipMemcpyHostToDevice);
|
||||
|
||||
subHistgCnt = (width * height) / (groupSize * groupIterations);
|
||||
|
||||
hipHostMalloc((void**)&midDeviceBinBuf,sizeof(unsigned int) * binSize * subHistgCnt, hipHostMallocDefault);
|
||||
|
||||
hipLaunchKernelGGL(histogram256,
|
||||
dim3(globalThreads/localThreads),
|
||||
dim3(localThreads),
|
||||
groupSize * binSize * sizeof(unsigned char), 0,
|
||||
dataBuf ,midDeviceBinBuf);
|
||||
|
||||
hipDeviceSynchronize();
|
||||
|
||||
hipMemcpy(midDeviceBin, midDeviceBinBuf,sizeof(unsigned int) * binSize * subHistgCnt, hipMemcpyDeviceToHost);
|
||||
//printArray<unsigned int>("midDeviceBin", midDeviceBin, sizeof(unsigned int) * binSize * subHistgCnt, 1);
|
||||
// Clear deviceBin array
|
||||
memset(deviceBin, 0, binSize * sizeof(unsigned int));
|
||||
|
||||
// Calculate final histogram bin
|
||||
for(int i = 0; i < subHistgCnt; ++i)
|
||||
{
|
||||
for(int j = 0; j < binSize; ++j)
|
||||
{
|
||||
deviceBin[j] += midDeviceBin[i * binSize + j];
|
||||
}
|
||||
}
|
||||
|
||||
return SDK_SUCCESS;
|
||||
}
|
||||
|
||||
int
|
||||
Histogram::setup()
|
||||
{
|
||||
if(iterations < 1)
|
||||
{
|
||||
std::cout<<"Error, iterations cannot be 0 or negative. Exiting..\n";
|
||||
exit(0);
|
||||
}
|
||||
int status = 0;
|
||||
|
||||
/* width must be multiples of binSize and
|
||||
* height must be multiples of groupSize
|
||||
*/
|
||||
width = (width / binSize ? width / binSize: 1) * binSize;
|
||||
height = (height / groupSize ? height / groupSize: 1) * groupSize;
|
||||
|
||||
status = setupHIP();
|
||||
if(status != SDK_SUCCESS)
|
||||
return status;
|
||||
|
||||
status = setupHistogram();
|
||||
if(status != SDK_SUCCESS)
|
||||
return status;
|
||||
|
||||
return SDK_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int Histogram::run()
|
||||
{
|
||||
for(int i = 0; i < 2 && iterations != 1; i++)
|
||||
if(runKernels() != SDK_SUCCESS)
|
||||
return SDK_FAILURE;
|
||||
|
||||
for(int i = 0; i < iterations; i++)
|
||||
if(runKernels() != SDK_SUCCESS)
|
||||
return SDK_FAILURE;
|
||||
|
||||
return SDK_SUCCESS;
|
||||
}
|
||||
|
||||
int Histogram::cleanup()
|
||||
{
|
||||
hipFree(dataBuf);
|
||||
hipFree(midDeviceBinBuf);
|
||||
|
||||
free(hostBin);
|
||||
free(deviceBin);
|
||||
|
||||
return SDK_SUCCESS;
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char * argv[])
|
||||
{
|
||||
int status = 0;
|
||||
// Create MonteCalroAsian object
|
||||
Histogram hipHistogram;
|
||||
|
||||
// Setup
|
||||
status = hipHistogram.setup();
|
||||
if(status != SDK_SUCCESS)
|
||||
return status;
|
||||
|
||||
// Run
|
||||
if(hipHistogram.run() != SDK_SUCCESS)
|
||||
return SDK_FAILURE;
|
||||
|
||||
// Cleanup resources created
|
||||
if(hipHistogram.cleanup() != SDK_SUCCESS)
|
||||
return SDK_FAILURE;
|
||||
|
||||
return SDK_SUCCESS;
|
||||
}
|
||||
@@ -0,0 +1,181 @@
|
||||
/**********************************************************************
|
||||
Copyright �2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
||||
|
||||
� Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
||||
� Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
********************************************************************/
|
||||
|
||||
|
||||
#ifndef HISTOGRAM_H_
|
||||
#define HISTOGRAM_H_
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define SAMPLE_VERSION "HIP-Examples-Applications-v1.0"
|
||||
#define WIDTH 1024
|
||||
#define HEIGHT 1024
|
||||
#define BIN_SIZE 256
|
||||
#define GROUP_SIZE 128
|
||||
#define GROUP_ITERATIONS (BIN_SIZE / 2)//This is done to avoid overflow in the kernel
|
||||
#define SUB_HISTOGRAM_COUNT ((WIDTH * HEIGHT) /(GROUP_SIZE * GROUP_ITERATIONS))
|
||||
|
||||
|
||||
#ifndef __global__
|
||||
#define __global__
|
||||
#endif
|
||||
|
||||
#ifndef HIP_DYNAMIC_SHARED
|
||||
#define HIP_DYNAMIC_SHARED(t,x) t* x
|
||||
#endif
|
||||
|
||||
#ifndef hipThreadIdx_x
|
||||
#define hipThreadIdx_x 0
|
||||
#endif
|
||||
|
||||
#ifndef hipThreadIdx_y
|
||||
#define hipThreadIdx_y 0
|
||||
#endif
|
||||
|
||||
#ifndef hipThreadIdx_z
|
||||
#define hipThreadIdx_z 0
|
||||
#endif
|
||||
|
||||
#ifndef hipBlockIdx_x
|
||||
#define hipBlockIdx_x 0
|
||||
#endif
|
||||
|
||||
#ifndef hipBlockDim_x
|
||||
#define hipBlockDim_x 0
|
||||
#endif
|
||||
|
||||
#ifndef __syncthreads
|
||||
#define __syncthreads()
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Histogram
|
||||
* Class implements 256 Histogram bin implementation
|
||||
|
||||
*/
|
||||
|
||||
class Histogram
|
||||
{
|
||||
|
||||
int binSize; /**< Size of Histogram bin */
|
||||
int groupSize; /**< Number of threads in group */
|
||||
int subHistgCnt; /**< Sub histogram count */
|
||||
unsigned int *data; /**< input data initialized with normalized(0 - binSize) random values */
|
||||
int width; /**< width of the input */
|
||||
int height; /**< height of the input */
|
||||
unsigned int *hostBin; /**< Host result for histogram bin */
|
||||
unsigned int *midDeviceBin; /**< Intermittent sub-histogram bins */
|
||||
unsigned int *deviceBin; /**< Device result for histogram bin */
|
||||
|
||||
unsigned long totalLocalMemory; /**< Max local memory allowed */
|
||||
unsigned long usedLocalMemory; /**< Used local memory by kernel */
|
||||
|
||||
unsigned int* dataBuf; /**< CL memory buffer for data */
|
||||
unsigned int* midDeviceBinBuf; /**< CL memory buffer for intermittent device bin */
|
||||
|
||||
int iterations; /**< Number of iterations for kernel execution */
|
||||
unsigned int globalThreads;
|
||||
unsigned int localThreads ;
|
||||
int groupIterations;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
* Initialize member variables
|
||||
* @param name name of sample (string)
|
||||
*/
|
||||
Histogram()
|
||||
:
|
||||
binSize(BIN_SIZE),
|
||||
groupSize(GROUP_SIZE),
|
||||
subHistgCnt(SUB_HISTOGRAM_COUNT),
|
||||
groupIterations(GROUP_ITERATIONS),
|
||||
data(NULL),
|
||||
hostBin(NULL),
|
||||
midDeviceBin(NULL),
|
||||
deviceBin(NULL),
|
||||
iterations(1)
|
||||
{
|
||||
/* Set default values for width and height */
|
||||
width = WIDTH;
|
||||
height = HEIGHT;
|
||||
}
|
||||
|
||||
|
||||
~Histogram()
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate and initialize required host memory with appropriate values
|
||||
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
|
||||
*/
|
||||
int setupHistogram();
|
||||
|
||||
/**
|
||||
* HIP related initialisations.
|
||||
* Set up Context, Device list, Command Queue, Memory buffers
|
||||
* Build HIP kernel program executable
|
||||
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
|
||||
*/
|
||||
int setupHIP();
|
||||
|
||||
/**
|
||||
* Set values for kernels' arguments, enqueue calls to the kernels
|
||||
* on to the command queue, wait till end of kernel execution.
|
||||
* Get kernel start and end time if timing is enabled
|
||||
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
|
||||
*/
|
||||
int runKernels();
|
||||
|
||||
|
||||
/**
|
||||
* Override from SDKSample, adjust width and height
|
||||
* of execution domain, perform all sample setup
|
||||
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
|
||||
*/
|
||||
int setup();
|
||||
|
||||
/**
|
||||
* Override from SDKSample
|
||||
* Run HIP Black-Scholes
|
||||
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
|
||||
*/
|
||||
int run();
|
||||
|
||||
/**
|
||||
* Override from SDKSample
|
||||
* Cleanup memory allocations
|
||||
* @return SDK_SUCCESS on success and SDK_FAILURE on failure
|
||||
*/
|
||||
int cleanup();
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* Calculate histogram bin on host
|
||||
*/
|
||||
int calculateHostBin();
|
||||
};
|
||||
#endif
|
||||
@@ -0,0 +1,127 @@
|
||||
/**********************************************************************
|
||||
Copyright ©2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
||||
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
********************************************************************/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <iostream>
|
||||
|
||||
#define TILE_DIM 16
|
||||
#define BLOCK_ROWS 16
|
||||
|
||||
__global__ void transposeNaive(float *odata, float *idata, int width, int height) {
|
||||
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
|
||||
int index_in = xIndex + width * yIndex;
|
||||
int index_out = yIndex + height * xIndex;
|
||||
|
||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
|
||||
odata[index_out + i] = idata[index_in + i * width];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void transposeCoalesced(float *odata, float *idata, int width, int height) {
|
||||
__shared__ float tile[TILE_DIM][TILE_DIM];
|
||||
|
||||
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
int index_in = xIndex + (yIndex)*width;
|
||||
|
||||
xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
|
||||
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
|
||||
int index_out = xIndex + (yIndex)*height;
|
||||
|
||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
|
||||
tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
|
||||
odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height) {
|
||||
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
|
||||
|
||||
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
int index_in = xIndex + (yIndex)*width;
|
||||
|
||||
xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
|
||||
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
|
||||
int index_out = xIndex + (yIndex)*height;
|
||||
|
||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
|
||||
tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
|
||||
odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* fillRandom
|
||||
* fill array with random values
|
||||
*/
|
||||
__global__ void fillRandom(float* arrayPtr, int size) {
|
||||
int id = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
uint32_t seed = 12345678;
|
||||
while (id < size) {
|
||||
seed = (seed<<3) ^ id;
|
||||
arrayPtr[id] = seed;
|
||||
id += blockDim.x * gridDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[])
|
||||
{
|
||||
float* input = nullptr;
|
||||
float* output = nullptr;
|
||||
size_t width = 1024;
|
||||
size_t height = 1024;
|
||||
|
||||
// Set input data to matrix A and matrix B
|
||||
hipMalloc((void**)&input, width * height * sizeof(float));
|
||||
hipMalloc((void**)&output, width * height * sizeof(float));
|
||||
|
||||
fillRandom<<<256, 256>>>(input, width*height);
|
||||
hipDeviceSynchronize();
|
||||
|
||||
dim3 block(width/TILE_DIM,height/TILE_DIM);
|
||||
dim3 threads(TILE_DIM,TILE_DIM);
|
||||
|
||||
transposeNoBankConflicts<<<block, threads>>>(
|
||||
output,
|
||||
input,
|
||||
width,
|
||||
height
|
||||
);
|
||||
|
||||
hipDeviceSynchronize();
|
||||
|
||||
hipFree(input);
|
||||
hipFree(output);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,216 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <stdlib.h>
|
||||
#include <iostream>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
|
||||
#ifdef NDEBUG
|
||||
#define HIP_ASSERT(x) x
|
||||
#else
|
||||
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
|
||||
#endif
|
||||
|
||||
|
||||
#define WIDTH (1024)
|
||||
#define HEIGHT (1024)
|
||||
|
||||
#define NUM (WIDTH*HEIGHT)
|
||||
|
||||
#define THREADS_PER_BLOCK_X 64
|
||||
#define THREADS_PER_BLOCK_Y 1
|
||||
#define THREADS_PER_BLOCK_Z 1
|
||||
|
||||
// Computes vectorAdd with matrix-multiply
|
||||
template<typename T>
|
||||
__global__ void addition_kernel(
|
||||
T* __restrict__ a,
|
||||
const float* __restrict__ b,
|
||||
const float* __restrict__ c,
|
||||
int width,
|
||||
int height
|
||||
) {
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x >= WIDTH || y >= HEIGHT) return;
|
||||
int index = y * width + x;
|
||||
|
||||
a[index] = b[index]+c[index];
|
||||
}
|
||||
|
||||
|
||||
__global__ void subtract_kernel(
|
||||
float* __restrict__ a,
|
||||
const float* __restrict__ b,
|
||||
const float* __restrict__ c,
|
||||
int width,
|
||||
int height
|
||||
) {
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x >= WIDTH || y >= HEIGHT) return;
|
||||
int index = y * width + x;
|
||||
|
||||
a[index] = abs(b[index]-c[index]);
|
||||
}
|
||||
|
||||
__global__ void multiply_kernel(
|
||||
float* __restrict__ a,
|
||||
const float* __restrict__ b,
|
||||
const float* __restrict__ c,
|
||||
int width,
|
||||
int height
|
||||
) {
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x >= WIDTH || y >= HEIGHT) return;
|
||||
int index = y * width + x;
|
||||
|
||||
a[index] = (b[index]-1)*(c[index]-1)+1;
|
||||
}
|
||||
|
||||
__global__ void divide_kernel(
|
||||
float* __restrict__ a,
|
||||
const float* __restrict__ b,
|
||||
const float* __restrict__ c,
|
||||
int width,
|
||||
int height
|
||||
) {
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x >= WIDTH || y >= HEIGHT) return;
|
||||
int index = y * width + x;
|
||||
|
||||
a[index] = (b[index]-c[index]) / abs(c[index]+b[index]) + 1;
|
||||
}
|
||||
|
||||
using namespace std;
|
||||
|
||||
void run(int NUM_QUEUE) {
|
||||
std::vector<float*> hostA(NUM_QUEUE);
|
||||
std::vector<float*> hostB(NUM_QUEUE);
|
||||
std::vector<float*> hostC(NUM_QUEUE);
|
||||
|
||||
std::vector<float*> deviceA(NUM_QUEUE);
|
||||
std::vector<float*> deviceB(NUM_QUEUE);
|
||||
std::vector<float*> deviceC(NUM_QUEUE);
|
||||
|
||||
std::vector<hipStream_t> streams(NUM_QUEUE);
|
||||
|
||||
hipDeviceProp_t devProp;
|
||||
hipGetDeviceProperties(&devProp, 0);
|
||||
cout << " System minor " << devProp.minor << endl;
|
||||
cout << " System major " << devProp.major << endl;
|
||||
cout << " agent prop name " << devProp.name << endl;
|
||||
cout << "hip Device prop succeeded " << endl ;
|
||||
|
||||
int i;
|
||||
int errors;
|
||||
|
||||
for (int q=0; q<NUM_QUEUE; q++) {
|
||||
hipStreamCreateWithFlags(&streams[q], hipStreamNonBlocking);
|
||||
|
||||
HIP_ASSERT(hipHostMalloc(&hostA[q], NUM * sizeof(float), 0));
|
||||
HIP_ASSERT(hipHostMalloc(&hostB[q], NUM * sizeof(float), 0));
|
||||
HIP_ASSERT(hipHostMalloc(&hostC[q], NUM * sizeof(float), 0));
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
hostB[q][i] = (float)i;
|
||||
hostC[q][i] = (float)i*100.0f;
|
||||
}
|
||||
|
||||
HIP_ASSERT(hipMalloc((void**)(&deviceA[q]), NUM * sizeof(float)));
|
||||
HIP_ASSERT(hipMalloc((void**)(&deviceB[q]), NUM * sizeof(float)));
|
||||
HIP_ASSERT(hipMalloc((void**)(&deviceC[q]), NUM * sizeof(float)));
|
||||
|
||||
HIP_ASSERT(hipMemcpyAsync(deviceB[q], hostB[q], NUM*sizeof(float), hipMemcpyHostToDevice, streams[q]));
|
||||
HIP_ASSERT(hipMemcpyAsync(deviceC[q], hostC[q], NUM*sizeof(float), hipMemcpyHostToDevice, streams[q]));
|
||||
}
|
||||
hipDeviceSynchronize();
|
||||
|
||||
for (int RUN_I=0; RUN_I<2; RUN_I++)
|
||||
{
|
||||
int q = (4*RUN_I+0)%NUM_QUEUE;
|
||||
hipLaunchKernelGGL(addition_kernel,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, streams[q],
|
||||
deviceA[q], deviceB[q], deviceC[q] ,WIDTH ,HEIGHT);
|
||||
|
||||
hipDeviceSynchronize();
|
||||
q = (4*RUN_I+1)%NUM_QUEUE;
|
||||
hipLaunchKernelGGL(subtract_kernel,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, streams[q],
|
||||
deviceA[q], deviceB[q], deviceC[q] ,WIDTH ,HEIGHT);
|
||||
|
||||
hipDeviceSynchronize();
|
||||
q = (4*RUN_I+2)%NUM_QUEUE;
|
||||
hipLaunchKernelGGL(multiply_kernel,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, streams[q],
|
||||
deviceA[q], deviceB[q], deviceC[q] ,WIDTH ,HEIGHT);
|
||||
|
||||
hipDeviceSynchronize();
|
||||
q = (4*RUN_I+3)%NUM_QUEUE;
|
||||
hipLaunchKernelGGL(divide_kernel,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, streams[q],
|
||||
deviceB[q], deviceA[q], deviceC[q] ,WIDTH ,HEIGHT);
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
|
||||
for (int q=0; q<NUM_QUEUE; q++)
|
||||
HIP_ASSERT(hipMemcpyAsync(hostA[q], deviceA[q], NUM*sizeof(float), hipMemcpyDeviceToHost, streams[q]));
|
||||
|
||||
|
||||
for (int q=0; q<NUM_QUEUE; q++) {
|
||||
HIP_ASSERT(hipMemcpy(hostA[q], deviceA[q], NUM*sizeof(float), hipMemcpyDeviceToHost));
|
||||
hipDeviceSynchronize();
|
||||
|
||||
HIP_ASSERT(hipFree(deviceA[q]));
|
||||
HIP_ASSERT(hipFree(deviceB[q]));
|
||||
HIP_ASSERT(hipFree(deviceC[q]));
|
||||
|
||||
hipHostFree(hostA[q]);
|
||||
hipHostFree(hostB[q]);
|
||||
hipHostFree(hostC[q]);
|
||||
hipStreamDestroy(streams[q]);
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
run(1);
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
# counter correctness test
|
||||
add_test(
|
||||
NAME pmc_correctnes_vadd_test
|
||||
COMMAND
|
||||
${PROJECT_BINARY_DIR}/rocprofv2 -i
|
||||
${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps/goldentraces/pmc.txt -d
|
||||
/tmp/tests-v2/pmc -o vadd tests-v2/featuretests/profiler/apps/pmc_vectoradd
|
||||
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
|
||||
|
||||
set_tests_properties(
|
||||
pmc_correctnes_vadd_test PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
|
||||
|
||||
add_test(
|
||||
NAME pmc_correctnes_vadd_test_validation
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_vectoradd.py
|
||||
"/tmp/tests-v2/pmc/pmc_1/results_vadd.csv"
|
||||
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
|
||||
|
||||
set_tests_properties(
|
||||
pmc_correctnes_vadd_test_validation
|
||||
PROPERTIES DEPENDS
|
||||
pmc_correctnes_vadd_test
|
||||
LABELS
|
||||
"v2;validation"
|
||||
PASS_REGULAR_EXPRESSION
|
||||
"Test Passed"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"Test Failed"
|
||||
SKIP_REGULAR_EXPRESSION
|
||||
"Skipped")
|
||||
|
||||
add_test(
|
||||
NAME pmc_correctnes_histogram_test
|
||||
COMMAND
|
||||
${PROJECT_BINARY_DIR}/rocprofv2 -i
|
||||
${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps/goldentraces/pmc.txt -d
|
||||
/tmp/tests-v2/pmc -o histo tests-v2/featuretests/profiler/apps/pmc_histogram
|
||||
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
|
||||
|
||||
set_tests_properties(
|
||||
pmc_correctnes_histogram_test PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
|
||||
|
||||
add_test(
|
||||
NAME pmc_correctnes_histogram_test_validation
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_histogram.py
|
||||
"/tmp/tests-v2/pmc/pmc_1/results_histo.csv"
|
||||
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
|
||||
|
||||
set_tests_properties(
|
||||
pmc_correctnes_histogram_test_validation
|
||||
PROPERTIES DEPENDS
|
||||
pmc_correctnes_histogram_test
|
||||
LABELS
|
||||
"v2;validation"
|
||||
PASS_REGULAR_EXPRESSION
|
||||
"Test Passed"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"Test Failed"
|
||||
SKIP_REGULAR_EXPRESSION
|
||||
"Skipped")
|
||||
@@ -0,0 +1,73 @@
|
||||
import numpy as np
|
||||
import pandas
|
||||
import os
|
||||
import glob
|
||||
|
||||
current_dir = os.getcwd()
|
||||
rocprof = "rocprofv2"
|
||||
|
||||
expected_filename = "pmc_1/results_histo.csv"
|
||||
output_folder = "/tmp/tests-v2/pmc"
|
||||
|
||||
|
||||
def test_grbm(csvfile):
|
||||
count = np.array(csvfile["GRBM_COUNT"])
|
||||
active = np.array(csvfile["GRBM_GUI_ACTIVE"])
|
||||
assert np.all(active > 0) # GPU must always be active
|
||||
assert np.all(count >= active) # Count always increments more than active
|
||||
assert np.all(
|
||||
count * 0.5 < active
|
||||
) # We can reasonably expect an active GPU during the kernel execution
|
||||
|
||||
|
||||
def test_insts(csvfile):
|
||||
waves = np.array(
|
||||
csvfile["SQ_WAVES"]
|
||||
) # TODO: 256 for wave32, need to check for wave64
|
||||
lds = np.array(csvfile["SQ_INSTS_LDS"])
|
||||
valu = np.array(csvfile["SQ_INSTS_VALU"])
|
||||
salu = np.array(csvfile["SQ_INSTS_SALU"])
|
||||
smem = np.array(csvfile["SQ_INSTS_SMEM"])
|
||||
|
||||
assert np.all(waves == 256) or np.all(waves == 128)
|
||||
|
||||
# Each have executes at least one of these
|
||||
assert np.all(lds > waves)
|
||||
assert np.all(valu > waves)
|
||||
assert np.all(salu > waves)
|
||||
assert np.all(smem >= waves)
|
||||
|
||||
|
||||
def test_sqcycles(csvfile):
|
||||
tabusy = np.array(csvfile["TA_BUSY_max"])
|
||||
grbm = np.array(csvfile["GRBM_GUI_ACTIVE"])
|
||||
waves = np.array(csvfile["SQ_WAVES"])
|
||||
wait_any = np.array(csvfile["SQ_WAIT_ANY"])
|
||||
wave_cycles = np.array(csvfile["SQ_WAVE_CYCLES"])
|
||||
vmem_cycles = np.array(csvfile["SQ_INST_CYCLES_VMEM"])
|
||||
|
||||
lds = np.array(csvfile["SQ_INSTS_LDS"])
|
||||
valu = np.array(csvfile["SQ_INSTS_VALU"])
|
||||
salu = np.array(csvfile["SQ_INSTS_SALU"])
|
||||
smem = np.array(csvfile["SQ_INSTS_SMEM"])
|
||||
|
||||
ALU = lds + valu + salu + smem
|
||||
|
||||
assert np.all(
|
||||
wave_cycles >= ALU + wait_any
|
||||
) # Each ALU inst takes at least one cycle
|
||||
assert np.all(wave_cycles / grbm <= waves) # Mean occupancy cannot exceed waves
|
||||
assert np.all(wait_any >= tabusy) # Waves are waiting for ta
|
||||
assert np.all(
|
||||
vmem_cycles >= waves
|
||||
) # Each wave takes at least one cycle to issue vmem
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
csv = pandas.read_csv(f"{output_folder}/{expected_filename}")
|
||||
test_grbm(csv)
|
||||
test_insts(csv)
|
||||
# test_sqcycles(csv)
|
||||
|
||||
# if its reached this point, then all tests apssed
|
||||
print("Test Passed: All counter correctness tests passed.")
|
||||
@@ -0,0 +1,114 @@
|
||||
import numpy as np
|
||||
import pandas
|
||||
import os
|
||||
import glob
|
||||
|
||||
current_dir = os.getcwd()
|
||||
rocprof = "rocprofv2"
|
||||
|
||||
expected_filename = "pmc_1/results_vadd.csv"
|
||||
output_folder = "/tmp/tests-v2/pmc"
|
||||
|
||||
|
||||
def test_grbm(csvfile):
|
||||
count = np.array(csvfile["GRBM_COUNT"])
|
||||
active = np.array(csvfile["GRBM_GUI_ACTIVE"])
|
||||
assert np.all(active > 0) # GPU must always be active
|
||||
assert np.all(count >= active) # Count always increments more than active
|
||||
assert np.all(
|
||||
count * 0.8 < active
|
||||
) # We can reasonably expect an active GPU during the kernel execution
|
||||
|
||||
|
||||
def test_sqwaves(csvfile):
|
||||
waves = np.array(
|
||||
csvfile["SQ_WAVES"]
|
||||
) # 1M threads == 32k waves for Wave32 and 16k waves for Wave64
|
||||
assert np.all(waves == 32768) or np.all(waves == 16384)
|
||||
|
||||
|
||||
def test_insts(csvfile):
|
||||
waves = np.array(csvfile["SQ_WAVES"])
|
||||
valu = np.array(csvfile["SQ_INSTS_VALU"])
|
||||
salu = np.array(csvfile["SQ_INSTS_SALU"])
|
||||
smem = np.array(csvfile["SQ_INSTS_SMEM"])
|
||||
lds = np.array(csvfile["SQ_INSTS_LDS"])
|
||||
|
||||
assert np.all(lds == 0) # Not used on vectoradd
|
||||
|
||||
# VALU, SALU and SMEM must be divisible by SQ_Waves
|
||||
assert np.all(valu % waves == 0)
|
||||
assert np.all(salu % waves == 0)
|
||||
assert np.all(smem % waves == 0)
|
||||
|
||||
# Each have executes at least one of these
|
||||
assert np.all(valu > waves)
|
||||
assert np.all(salu > waves)
|
||||
assert np.all(smem >= waves)
|
||||
|
||||
# TODO: Check assembly for exact number!
|
||||
|
||||
|
||||
def test_gl2c(csvfile):
|
||||
waves = np.array(csvfile["SQ_WAVES"])
|
||||
read = np.array(csvfile["GL2C_MC_RDREQ_sum"])
|
||||
write = np.array(csvfile["GL2C_MC_WRREQ_sum"])
|
||||
hit = np.array(csvfile["GL2C_HIT_sum"])
|
||||
miss = np.array(csvfile["GL2C_MISS_sum"])
|
||||
hitrate = np.array(csvfile["L2CacheHit"])
|
||||
|
||||
assert np.all(write >= waves) # We do at least one write per wave
|
||||
# TODO: Find out why the first kernel gets such a high write request count.
|
||||
assert np.all(write < 2.5 * waves) # We do only one write (+ a little) per wave.
|
||||
assert np.all(read >= 2 * waves) # We do at least 2 reads per wave (A=B+C)
|
||||
assert np.all(read < 3 * waves) # We do only 2 reads (+ a little) per wave
|
||||
|
||||
assert np.all(miss >= hit) # on Vadd we can't have more hits than misses
|
||||
assert np.all(miss >= 2 * waves) # Each read misses at least once
|
||||
assert np.all(miss < 4 * waves) # Can't miss too much
|
||||
assert np.all(hit >= 0.5 * waves) # We have at least one hit per wave
|
||||
|
||||
assert np.all(hitrate <= 50) # We always get more misses than hits
|
||||
|
||||
|
||||
def test_ta(csvfile):
|
||||
busy = np.array(csvfile["MemUnitBusy"])
|
||||
some_busy = np.array(csvfile["TA_BUSY_max"]) / np.array(csvfile["GRBM_GUI_ACTIVE"])
|
||||
|
||||
assert np.all(busy <= 100) # MemUnitBusy <= 100%
|
||||
assert np.all(some_busy >= 1) # Some shader engine is using TA
|
||||
|
||||
|
||||
def test_sqcycles(csvfile):
|
||||
tabusy = np.array(csvfile["TA_BUSY_max"])
|
||||
grbm = np.array(csvfile["GRBM_GUI_ACTIVE"])
|
||||
waves = np.array(csvfile["SQ_WAVES"])
|
||||
ALU = np.array(csvfile["SQ_INSTS_VALU"]) + np.array(csvfile["SQ_INSTS_SALU"])
|
||||
wait_any = np.array(csvfile["SQ_WAIT_ANY"])
|
||||
wave_cycles = np.array(csvfile["SQ_WAVE_CYCLES"])
|
||||
vmem_cycles = np.array(csvfile["SQ_INST_CYCLES_VMEM"])
|
||||
|
||||
assert np.all(
|
||||
wave_cycles >= ALU + wait_any
|
||||
) # Each ALU inst takes at least one cycle
|
||||
assert np.all(wave_cycles / grbm <= waves) # Mean occupancy cannot exceed waves
|
||||
assert np.all(wait_any >= tabusy) # Waves are waiting for ta
|
||||
assert np.all(
|
||||
vmem_cycles >= waves
|
||||
) # Each wave takes at least one cycle to issue vmem
|
||||
assert np.all(
|
||||
wait_any / wave_cycles >= 0.5
|
||||
) # vectorAdd is very memory-bound. TODO: use number less arbitrary than 0.5
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
csv = pandas.read_csv(f"{output_folder}/{expected_filename}")
|
||||
test_grbm(csv)
|
||||
test_sqwaves(csv)
|
||||
test_insts(csv)
|
||||
# test_gl2c(csv)
|
||||
# test_ta(csv)
|
||||
# test_sqcycles(csv)
|
||||
|
||||
# if its reached this point, then all tests apssed
|
||||
print("Test Passed: All counter correctness tests passed.")
|
||||
Reference in New Issue
Block a user