diff --git a/tests-v2/featuretests/profiler/CMakeLists.txt b/tests-v2/featuretests/profiler/CMakeLists.txt index f9f3855a04..4284b52890 100644 --- a/tests-v2/featuretests/profiler/CMakeLists.txt +++ b/tests-v2/featuretests/profiler/CMakeLists.txt @@ -214,6 +214,19 @@ install( ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps COMPONENT tests) +# ATT correctness vectoradd +set_source_files_properties(apps/att_vectoradd.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +rocprofiler_featuretests_profiler_add_executable(att_vectoradd apps/att_vectoradd.cpp) +set_target_properties( + att_vectoradd PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps") +target_link_options(att_vectoradd PRIVATE "-Wl,--build-id=md5") +install( + TARGETS att_vectoradd + RUNTIME + DESTINATION + ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps + COMPONENT tests) # hsa-mem_async_copy -- Not Enabled for Now set_source_files_properties(apps/async_mem_copy.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT @@ -425,4 +438,5 @@ find_package( # cmake based tests include(${CMAKE_CURRENT_LIST_DIR}/counter_validation_tests.cmake) include(${CMAKE_CURRENT_LIST_DIR}/counter_correctness_tests.cmake) + include(${CMAKE_CURRENT_LIST_DIR}/att_correctness_tests.cmake) endif() diff --git a/tests-v2/featuretests/profiler/apps/att_vectoradd.cpp b/tests-v2/featuretests/profiler/apps/att_vectoradd.cpp new file mode 100755 index 0000000000..33b67be4ce --- /dev/null +++ b/tests-v2/featuretests/profiler/apps/att_vectoradd.cpp @@ -0,0 +1,96 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include +#include + +#include +#include + +#include "hip/hip_runtime.h" + +#define WIDTH 256 +#define HEIGHT 128 + +#define NUM (WIDTH * HEIGHT) + +#define THREADS_PER_BLOCK_X 8 +#define THREADS_PER_BLOCK_Y 8 +#define THREADS_PER_BLOCK_Z 1 + +__global__ void vectoradd_att(float* __restrict__ a, const float* __restrict__ b, + const float* __restrict__ c, int width, int height) + { + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + int i = y * width + x; + if (i < width * height) + a[i] = b[i] + c[i]; +} + +int main() { + float* hostA; + float* hostB; + float* hostC; + + float* deviceA; + float* deviceB; + float* deviceC; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + hostA = reinterpret_cast(malloc(NUM * sizeof(float))); + hostB = reinterpret_cast(malloc(NUM * sizeof(float))); + hostC = reinterpret_cast(malloc(NUM * sizeof(float))); + + // initialize the input data + for (size_t i = 0; i < NUM; i++) { + hostB[i] = static_cast(i); + hostC[i] = static_cast(i) * 100.0f; + } + + hipMalloc(reinterpret_cast(&deviceA), NUM * sizeof(float)); + hipMalloc(reinterpret_cast(&deviceB), NUM * sizeof(float)); + hipMalloc(reinterpret_cast(&deviceC), NUM * sizeof(float)); + + hipMemcpy(deviceB, hostB, NUM * sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(deviceC, hostC, NUM * sizeof(float), hipMemcpyHostToDevice); + + hipLaunchKernelGGL(vectoradd_att, + dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, deviceA, deviceB, deviceC, WIDTH, HEIGHT); + + hipMemcpy(hostA, deviceA, NUM * sizeof(float), hipMemcpyDeviceToHost); + + hipFree(deviceA); + hipFree(deviceB); + hipFree(deviceC); + + free(hostA); + free(hostB); + free(hostC); + + return 0; +} diff --git a/tests-v2/featuretests/profiler/apps/goldentraces/att_vadd.txt b/tests-v2/featuretests/profiler/apps/goldentraces/att_vadd.txt new file mode 100644 index 0000000000..ba0230ad9c --- /dev/null +++ b/tests-v2/featuretests/profiler/apps/goldentraces/att_vadd.txt @@ -0,0 +1,5 @@ +att: TARGET_CU=1 +SIMD_SELECT=0x3 +SE_MASK=0x55555555 +BUFFER_SIZE=192 +ISA_CAPTURE_MODE=0 diff --git a/tests-v2/featuretests/profiler/att_correctness_tests.cmake b/tests-v2/featuretests/profiler/att_correctness_tests.cmake new file mode 100644 index 0000000000..bf7487b276 --- /dev/null +++ b/tests-v2/featuretests/profiler/att_correctness_tests.cmake @@ -0,0 +1,31 @@ +# counter correctness test +add_test( + NAME att_correctness_vectoradd_run + COMMAND + ${PROJECT_BINARY_DIR}/rocprofv2 -i + ${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps/goldentraces/att_vadd.txt + -d /tmp/tests-v2/att/ -o /tmp/tests-v2/att/vadd + --plugin att auto --mode csv tests-v2/featuretests/profiler/apps/att_vectoradd.cpp + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}") + +set_tests_properties( + att_correctness_vectoradd_run PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +add_test( + NAME att_correctness_vectoradd_parse + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_att_vectoradd.py + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}") + +set_tests_properties( + att_correctness_vectoradd_parse + PROPERTIES DEPENDS + att_correctness_vectoradd_run + LABELS + "v2;validation" + PASS_REGULAR_EXPRESSION + "Test Passed" + FAIL_REGULAR_EXPRESSION + "Test Failed" + SKIP_REGULAR_EXPRESSION + "Skipped") diff --git a/tests-v2/featuretests/profiler/test_att_vectoradd.py b/tests-v2/featuretests/profiler/test_att_vectoradd.py new file mode 100644 index 0000000000..8b92a69618 --- /dev/null +++ b/tests-v2/featuretests/profiler/test_att_vectoradd.py @@ -0,0 +1,72 @@ +import numpy as np +import pandas + +MAX_CU = 16 +MAX_WAVE_SIZE = 64 +MAXIMUM_ATT_HITS = 256*128//MAX_WAVE_SIZE//MAX_CU + +kernel_name = "vectoradd_att" +csv_filename = "att_output_" + kernel_name + "_v0.csv" +output_folder = "/tmp/tests-v2/att" + + +def test_hitcount(csv): + hits = {m: True for m in csv['Hitcount'] if m != 0} + print('hits', hits) + assert(len(hits) > 0) + assert(np.max([k for k in hits.keys()]) <= MAXIMUM_ATT_HITS) + + +def test_addr(csv): + addrs = np.array([int(addr, 16) for addr in csv['Addr'] if len(addr) != 0 and addr[0] != '-']) + print('addrs', addrs) + assert(addrs.max() - addrs.min() > 32) # 32 bytes is a safe minimum value + assert(addrs.max() - addrs.min() < 2**24) # Kernels are not anywhere near that large + + +def test_memory_list(csv): + inst_list = ' '.join(csv['Instruction']) + assert('vectoradd_' in inst_list) + assert('s_load_' in inst_list) + assert('_store_' in inst_list) + assert('s_waitcnt' in inst_list) + assert('v_add' in inst_list) + assert('global_load' in inst_list or 'buffer_load' in inst_list or 'flat_load' in inst_list) + + +def test_mean_cycles(csv): + cycles = np.array([c/float(h) for c, h in zip(csv['Cycles'], csv['Hitcount']) if c != 0]) + print('cycles', cycles) + assert(cycles.min() < 5) # Waves should have some instructions with very few cycles + assert(cycles.max() > 100) # s_waitcnt should have a large cost + assert(cycles.mean() > 1) # Minimum cost per inst is 1 + assert(np.median(cycles) <= 16) # Majority of instructions are not that expensive + + maxv = int(4*cycles.max()+5)//4 + histogram = np.histogram(cycles, range=[0,maxv], bins=max(maxv//8, 1))[0] + assert(histogram[0] == np.max(histogram)) # 1~8 cycles should be most common cost + + +def test_memory_cycles(csv): + is_memory_op = lambda s: ('waitcnt' in s) or ('_load_' in s) or ('_store_' in s) + + max_cycles = np.max(csv['Cycles']) + most_exp_inst = [f for f in csv[csv['Cycles'] == max_cycles]['Instruction']][0] + print('most_exp_inst', most_exp_inst) + assert(is_memory_op(most_exp_inst)) # Memory ops should be the most expensive insts + + memory_ops = [c for s, c in zip(csv['Instruction'],csv['Cycles']) if is_memory_op(s)] + print('memory_ops', memory_ops) # Memory ops should be more than half the total cycles + assert(np.sum(memory_ops) > np.sum(csv['Cycles'])*0.5) + + +if __name__ == "__main__": + csv = pandas.read_csv(f"{output_folder}/{csv_filename}") + + test_hitcount(csv) + test_addr(csv) + test_memory_list(csv) + test_mean_cycles(csv) + test_memory_cycles(csv) + + print("All ATT correctness tests passed.")