From c2caa5ae2ccb07358074ec0d58fb717d7e70d4fd Mon Sep 17 00:00:00 2001 From: Ramesh Errabolu Date: Tue, 12 Sep 2017 19:28:15 -0500 Subject: [PATCH] Benchmark copy of data from one pool to another pool either in one or both directions. Users can enumerate the pools reported by system to specify which pools serve as source / destination Change-Id: I8e6d0adb3743b3328dd3ce9152762ca840ea613b --- rocrtst/samples/rocm_async/CMakeLists.txt | 88 ++++ rocrtst/samples/rocm_async/Readme.txt | 66 +++ rocrtst/samples/rocm_async/base_test.cpp | 11 + rocrtst/samples/rocm_async/base_test.hpp | 52 ++ rocrtst/samples/rocm_async/common.cpp | 149 ++++++ rocrtst/samples/rocm_async/common.hpp | 48 ++ rocrtst/samples/rocm_async/hsatimer.cpp | 188 +++++++ rocrtst/samples/rocm_async/hsatimer.hpp | 72 +++ rocrtst/samples/rocm_async/main.cpp | 25 + rocrtst/samples/rocm_async/os.cpp | 49 ++ rocrtst/samples/rocm_async/os.hpp | 14 + rocrtst/samples/rocm_async/rocm_async.cpp | 478 ++++++++++++++++++ rocrtst/samples/rocm_async/rocm_async.hpp | 338 +++++++++++++ .../samples/rocm_async/rocm_async_parse.cpp | 206 ++++++++ .../samples/rocm_async/rocm_async_print.cpp | 177 +++++++ .../samples/rocm_async/rocm_async_report.cpp | 167 ++++++ .../rocm_async/rocm_async_topology.cpp | 114 +++++ .../samples/rocm_async/rocm_async_trans.cpp | 173 +++++++ .../rocm_async/rocm_async_validate.cpp | 155 ++++++ 19 files changed, 2570 insertions(+) create mode 100644 rocrtst/samples/rocm_async/CMakeLists.txt create mode 100644 rocrtst/samples/rocm_async/Readme.txt create mode 100644 rocrtst/samples/rocm_async/base_test.cpp create mode 100644 rocrtst/samples/rocm_async/base_test.hpp create mode 100644 rocrtst/samples/rocm_async/common.cpp create mode 100644 rocrtst/samples/rocm_async/common.hpp create mode 100644 rocrtst/samples/rocm_async/hsatimer.cpp create mode 100644 rocrtst/samples/rocm_async/hsatimer.hpp create mode 100644 rocrtst/samples/rocm_async/main.cpp create mode 100644 rocrtst/samples/rocm_async/os.cpp 
create mode 100755 rocrtst/samples/rocm_async/os.hpp create mode 100755 rocrtst/samples/rocm_async/rocm_async.cpp create mode 100644 rocrtst/samples/rocm_async/rocm_async.hpp create mode 100755 rocrtst/samples/rocm_async/rocm_async_parse.cpp create mode 100755 rocrtst/samples/rocm_async/rocm_async_print.cpp create mode 100755 rocrtst/samples/rocm_async/rocm_async_report.cpp create mode 100755 rocrtst/samples/rocm_async/rocm_async_topology.cpp create mode 100755 rocrtst/samples/rocm_async/rocm_async_trans.cpp create mode 100644 rocrtst/samples/rocm_async/rocm_async_validate.cpp diff --git a/rocrtst/samples/rocm_async/CMakeLists.txt b/rocrtst/samples/rocm_async/CMakeLists.txt new file mode 100644 index 0000000000..a1f836ff57 --- /dev/null +++ b/rocrtst/samples/rocm_async/CMakeLists.txt @@ -0,0 +1,88 @@ +cmake_minimum_required(VERSION 2.8.0) + +# +# Setup build environment +# +# 1) Setup env var ROCR_INC_DIR and ROCR_LIB_DIR to point to +# ROC Runtime header and libraries separately +# +# export ROCR_INC_DIR="Path to ROC Runtime header" +# +# export ROCR_LIB_DIR="Path to ROC Runtime libraries" +# +# 2) Make a new folder called build under root folder +# +# mkdir build +# +# 3) Enter into folder of build, and run CMAKE to generate makefile +# and make it +# +# cd build; cmake ..; make +# + +if(WIN32) + MESSAGE("Windows platfomr is not supported") + RETURN() +endif() + +if(NOT EXISTS $ENV{ROCR_INC_DIR}/hsa/hsa.h) + MESSAGE("ERROR: ROC Runtime headers can't be found under specified path") + RETURN() +endif() + +# +# Flag to enable / disable verbose output.
+# +SET( CMAKE_VERBOSE_MAKEFILE on ) + +# +# Set core runtime module name +# +set ( CORE_RUNTIME_NAME "hsa-runtime" ) +set ( CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64" ) +set ( CORE_RUNTIME_LIBRARY "lib${CORE_RUNTIME_TARGET}" ) + +if(NOT EXISTS $ENV{ROCR_LIB_DIR}/${CORE_RUNTIME_LIBRARY}.so) + MESSAGE("ERROR: ROC Runtime libraries can't be found under sprcified path") + RETURN() +endif() + +set(PROJECT_NAME "rocm_async") +set(TEST_NAME "${PROJECT_NAME}") +project (${PROJECT_NAME}) + +string(TOLOWER "${CMAKE_BUILD_TYPE}" tmp) +if("${tmp}" STREQUAL "debug") + set(ISDEBUG "1") + add_definitions(-DDEBUG) +endif() + +if(ISDEBUG) + set(CMAKE_CXX_FLAGS "-std=c++11 -O0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") +else() + set(CMAKE_CXX_FLAGS "-std=c++11 -O2") +endif() + +# +# Set the remaining compiler flags +# +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics") + +INCLUDE_DIRECTORIES($ENV{ROCR_INC_DIR}) + +LINK_DIRECTORIES($ENV{ROCR_LIB_DIR}) + +# Add sources that belong to the project +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} Src) + +add_executable(rocm_async ${Src}) +target_link_libraries(rocm_async hsa-runtime64) diff --git a/rocrtst/samples/rocm_async/Readme.txt b/rocrtst/samples/rocm_async/Readme.txt new file mode 100644 index 0000000000..50d362b63c --- /dev/null +++ b/rocrtst/samples/rocm_async/Readme.txt @@ -0,0 +1,66 @@ + +Introduction: +############# + +RocmAsync is designed to capture the performance characteristics of buffer +copying and kernel read/write operations. 
The help screen of the benchmark +shows various options one can use in initiating copy/read/write operations. +In addition one can also query the topology of the system in terms of memory +pools and their agents. + +Build Environment: +################## + +To be able to build RocmAsync, users must ensure that the build platform has +following conditions satisfied: +Build Procedure: +################ + +The following simply lists the steps to build RocmAsync + +--- Define following environment variable to specify location of header + and library files + + // Contains header files exported by ROC Runtime + ROCR_INC_DIR="Path of ROC Runtime Header Files" + + // Contains library files exported by ROC Runtime + ROCR_LIB_DIR="Path of ROC Runtime Library Files" + +--- Create a build directory. The location of build directory can be anywhere + in the file system as long as it has read / write / execute permissions for + the user invoking the commands. User can choose any valid filename for the + build directory as the examples below illustrate + + e.g. mkdir rocm_async/perfBuild + e.g. mkdir rocm_async-build + e.g. mkdir /rocmAsyncBuild + +--- Set working directory to be the new build directory + + e.g. cd rocm_async/perfBuild + e.g. cd rocm_async-build + e.g. cd /rocmAsyncBuild + +--- Invoke CMake to interpret build rules and generate native build files + The argument for cmake should be the root folder of RocmAsync test suite + + // Builds Release version (default) + e.g. cmake .../rocm_async + + // Builds Debug version + e.g. cmake -DCMAKE_BUILD_TYPE:STRING=Debug .../rocm_async + +--- Invoke the native build rules generated by cmake to build the various + object, library and executable files + + e.g. make + +--- Invoke the install command to copy build artifacts to pre-defined folders + of RocmAsync suite. Upon completion artifacts will be copied to the bin and + lib directories of build directory + + e.g.
make install + + @note: All executables will be found in /bin folder + diff --git a/rocrtst/samples/rocm_async/base_test.cpp b/rocrtst/samples/rocm_async/base_test.cpp new file mode 100644 index 0000000000..3213a920a9 --- /dev/null +++ b/rocrtst/samples/rocm_async/base_test.cpp @@ -0,0 +1,11 @@ +#include "base_test.hpp" + +// Constructor: stores the requested iteration count (the declaration defaults it to 10) +BaseTest::BaseTest(size_t num) + : num_iteration_(num), + err_(HSA_STATUS_SUCCESS) { + // err_ starts as success so it is never read before being assigned +} + +BaseTest::~BaseTest() {} + diff --git a/rocrtst/samples/rocm_async/base_test.hpp b/rocrtst/samples/rocm_async/base_test.hpp new file mode 100644 index 0000000000..a268270c3e --- /dev/null +++ b/rocrtst/samples/rocm_async/base_test.hpp @@ -0,0 +1,52 @@ + +#ifndef ROCM_ASYNC_BW_BASE_TEST_H_ +#define ROCM_ASYNC_BW_BASE_TEST_H_ + +#include "hsa/hsa.h" +#include +#include +#include + +using namespace std; + +// @Brief: An interface for tests to do some basic things, + +class BaseTest { + + public: + + BaseTest(size_t num = 10); + + virtual ~BaseTest(); + + // @Brief: Allows setup procedures to be completed + // before running the benchmark test case + virtual void SetUp() = 0; + + // @Brief: Launches the procedures of test scenario + virtual void Run() = 0; + + // @Brief: Allows clean up procedures to be invoked + virtual void Close() = 0; + + // @Brief: Display the results + virtual void Display() const = 0; + + // @Brief: Set number of iterations to run + void set_num_iteration(size_t num) { + num_iteration_ = num; + return; + } + + // @Brief: Pre-declare some variables for derivation, the + // derived class may declare more if needed + protected: + + // @Brief: Real iteration number + uint64_t num_iteration_; + + // @Brief: Status code + hsa_status_t err_; +}; + +#endif // ROCM_ASYNC_BW_BASE_TEST_H_ diff --git a/rocrtst/samples/rocm_async/common.cpp b/rocrtst/samples/rocm_async/common.cpp new file mode 100644 index 0000000000..7e92a2ef54 --- /dev/null +++ b/rocrtst/samples/rocm_async/common.cpp @@ -0,0 +1,149
@@ +#include "common.hpp" + +void error_check(hsa_status_t hsa_error_code, int line_num, const char* str) { + if (hsa_error_code != HSA_STATUS_SUCCESS && + hsa_error_code != HSA_STATUS_INFO_BREAK) { + printf("HSA Error Found! In file: %s; At line: %d\n", str, line_num); + const char* string = nullptr; + hsa_status_string(hsa_error_code, &string); + printf("Error: %s\n", string); + exit(EXIT_FAILURE); + } +} + +// So far, always find the first device +hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t hsa_device_type; + hsa_status_t hsa_error_code = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (hsa_device_type == HSA_DEVICE_TYPE_GPU) { + *((hsa_agent_t*)data) = agent; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t FindCpuDevice(hsa_agent_t agent, void* data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t hsa_device_type; + hsa_status_t hsa_error_code = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (hsa_device_type == HSA_DEVICE_TYPE_CPU) { + *((hsa_agent_t*)data) = agent; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t region, void* data) { + if (NULL == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; + + err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + ErrorCheck(err); + + err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + ErrorCheck(err); + + if ((HSA_AMD_SEGMENT_GLOBAL == segment) && + (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)) { 
+ *((hsa_amd_memory_pool_t*)data) = region; + } + + return HSA_STATUS_SUCCESS; +} + +double CalcMedian(vector scores) { + double median; + size_t size = scores.size(); + + if (size % 2 == 0) + median = (scores[size / 2 - 1] + scores[size / 2]) / 2; + else + median = scores[size / 2]; + + return median; +} + +double CalcMean(vector scores) { + double mean = 0; + size_t size = scores.size(); + + for (size_t i = 0; i < size; ++i) mean += scores[i]; + + return mean / size; +} + +double CalcStdDeviation(vector scores, int score_mean) { + double ret = 0.0; + for (size_t i = 0; i < scores.size(); ++i) { + ret += (scores[i] - score_mean) * (scores[i] - score_mean); + } + + ret /= scores.size(); + + return sqrt(ret); +} + +int CalcConcurrentQueues(vector scores) { + int num_of_concurrent_queues = 0; + vector execpted_exec_time_array; + + for (size_t i = 0; i < scores.size(); ++i) { + execpted_exec_time_array.push_back(scores[0] / (1 << i)); + } + + for (size_t i = 0; i < scores.size(); ++i) { + cout << "expected exe time = " << execpted_exec_time_array[i] << endl; + } + + for (size_t i = 1; i < scores.size(); ++i) { + if ((execpted_exec_time_array[i] - scores[i]) < + 0.1 * execpted_exec_time_array[i]) + ++num_of_concurrent_queues; + } + + return num_of_concurrent_queues; +} + +/** hsa_status_t FindHostRegion(hsa_region_t region, void *data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + bool is_host_region = false; + hsa_status_t hsa_error_code = hsa_region_get_info( + region, (hsa_region_info_t)HSA_EXT_REGION_INFO_HOST_ACCESS, &is_host_region + ); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (is_host_region) { + *((hsa_region_t*)data) = region; + } + + return HSA_STATUS_SUCCESS; +} */ diff --git a/rocrtst/samples/rocm_async/common.hpp b/rocrtst/samples/rocm_async/common.hpp new file mode 100644 index 0000000000..c0a42180dd --- /dev/null +++ b/rocrtst/samples/rocm_async/common.hpp @@ -0,0 +1,48 @@ +#ifndef 
ROCM_ASYNC_BW_COMMON_HPP +#define ROCM_ASYNC_BW_COMMON_HPP + +#include +#include +#include +#include +#include "hsa/hsa.h" +#include "hsa/hsa_ext_amd.h" + +using namespace std; + +#if defined(_MSC_VER) +#define ALIGNED_(x) __declspec(align(x)) +#else +#if defined(__GNUC__) +#define ALIGNED_(x) __attribute__((aligned(x))) +#endif // __GNUC__ +#endif // _MSC_VER + +#define MULTILINE(...) #__VA_ARGS__ + +#define HSA_ARGUMENT_ALIGN_BYTES 16 + +#define ErrorCheck(x) error_check(x, __LINE__, __FILE__) + +// @Brief: Check HSA API return value +void error_check(hsa_status_t hsa_error_code, int line_num, const char* str); + +// @Brief: Find the first avaliable GPU device +hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data); + +// @Brief: Find the first avaliable CPU device +hsa_status_t FindCpuDevice(hsa_agent_t agent, void* data); + +// @Brief: Find the agent's global region / pool +hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t region, void* data); + +// @Brief: Calculate the mean number of the vector +double CalcMean(vector scores); + +// @Brief: Calculate the Median valud of the vector +double CalcMedian(vector scores); + +// @Brief: Calculate the standard deviation of the vector +double CalcStdDeviation(vector scores, int score_mean); + +#endif // ROCM_ASYNC_BW_COMMON_HPP diff --git a/rocrtst/samples/rocm_async/hsatimer.cpp b/rocrtst/samples/rocm_async/hsatimer.cpp new file mode 100644 index 0000000000..bdb9b182eb --- /dev/null +++ b/rocrtst/samples/rocm_async/hsatimer.cpp @@ -0,0 +1,188 @@ +#include "hsatimer.hpp" + +#define NANOSECONDS_PER_SECOND 1000000000 + +PerfTimer::PerfTimer() { + freq_in_100mhz = MeasureTSCFreqHz(); +} + +PerfTimer::~PerfTimer() { + while (!_timers.empty()) { + Timer *temp = _timers.back(); + _timers.pop_back(); + delete temp; + } +} + +// Create a new timer instance and return its index +int PerfTimer::CreateTimer() { + + Timer *newTimer = new Timer; + newTimer->_start = 0.0; + newTimer->_clocks = 0.0; + + #ifdef _WIN32 + 
QueryPerformanceFrequency((LARGE_INTEGER *)&newTimer->_freq); + #endif + + #ifdef __linux__ + newTimer->_freq = NANOSECONDS_PER_SECOND; + #endif + + // Save the timer object in timer list + _timers.push_back(newTimer); + return (int)(_timers.size() - 1); +} + +int PerfTimer::StartTimer(int index) { + + if (index >= (int)_timers.size()) { + Error("Cannot reset timer. Invalid handle."); + return HSA_FAILURE; + } + + #ifdef _WIN32 + // General Windows timing method + #ifndef _AMD + long long tmpStart; + QueryPerformanceCounter((LARGE_INTEGER *)&(tmpStart)); + _ timers[index]->_start = (double)tmpStart; + // AMD Windows timing method + #else + #endif + #endif + + #ifdef __linux__ + // General Linux timing method + #ifndef _AMD + struct timespec s; + clock_gettime(CLOCK_MONOTONIC, &s); + _timers[index]->_start = + (long long)s.tv_sec * NANOSECONDS_PER_SECOND + (long long)s.tv_nsec; + // AMD Linux timing method + #else + unsigned int unused; + _timers[index]->_start = __rdtscp(&unused); + #endif + #endif + + return HSA_SUCCESS; +} + +int PerfTimer::StopTimer(int index) { + + long long n = 0; + if (index >= (int)_timers.size()) { + Error("Cannot reset timer. Invalid handle."); + return HSA_FAILURE; + } + + #ifdef _WIN32 + #ifndef _AMD + long long n1; + QueryPerformanceCounter((LARGE_INTEGER *)&(n1)); + n = n1; + // AMD Window Timing + #else + #endif + #endif + + #ifdef __linux__ + // General Linux timing method + #ifndef _AMD + struct timespec s; + clock_gettime(CLOCK_MONOTONIC, &s); + n = (long long)s.tv_sec * NANOSECONDS_PER_SECOND + (long long)s.tv_nsec; + // AMD Linux timing + #else + unsigned int unused; + n = __rdtscp(&unused); + #endif + #endif + + n -= _timers[index]->_start; + _timers[index]->_start = 0; + + #ifndef _AMD + _timers[index]->_clocks += n; + #endif + + #ifdef __linux__ + //_timers[index]->_clocks += 10 * n /freq_in_100mhz; // unit is ns + _timers[index]->_clocks += 1.0E-6 * 10 * n / freq_in_100mhz; // convert to ms + // cout << "_AMD is enabled!!!" 
<< endl; + #endif + + return HSA_SUCCESS; +} + +void PerfTimer::Error(string str) { cout << str << endl; } + +double PerfTimer::ReadTimer(int index) { + + if (index >= (int)_timers.size()) { + Error("Cannot read timer. Invalid handle."); + return HSA_FAILURE; + } + + double reading = double(_timers[index]->_clocks); + + reading = double(reading / _timers[index]->_freq); + + return reading; +} + +void PerfTimer::ResetTimer(int index) { + + // Check if index value is over the timer's size + if (index >= (int)_timers.size()) { + Error("Invalid index value\n"); + exit(1); + } + + _timers[index]->_clocks = 0.0; + _timers[index]->_start = 0.0; +} + +uint64_t PerfTimer::CoarseTimestampUs() { + + #ifdef _WIN32 + uint64_t freqHz, ticks; + QueryPerformanceFrequency((LARGE_INTEGER *)&freqHz); + QueryPerformanceCounter((LARGE_INTEGER *)&ticks); + + // Scale numerator and divisor until (ticks * 1000000) fits in uint64_t. + while (ticks > (1ULL << 44)) { + ticks /= 16; + freqHz /= 16; + } + + return (ticks * 1000000) / freqHz; + #endif + + #ifdef __linux__ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000; + #endif +} + +uint64_t PerfTimer::MeasureTSCFreqHz() { + + // Make a coarse interval measurement of TSC ticks for 1 gigacycles. + unsigned int unused; + uint64_t tscTicksEnd; + + uint64_t coarseBeginUs = CoarseTimestampUs(); + uint64_t tscTicksBegin = __rdtscp(&unused); + do { + tscTicksEnd = __rdtscp(&unused); + } while (tscTicksEnd - tscTicksBegin < 1000000000); + + uint64_t coarseEndUs = CoarseTimestampUs(); + + // Compute the TSC frequency and round to nearest 100MHz. 
+ uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000; + uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin; + return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs; +} diff --git a/rocrtst/samples/rocm_async/hsatimer.hpp b/rocrtst/samples/rocm_async/hsatimer.hpp new file mode 100644 index 0000000000..974efaa2bb --- /dev/null +++ b/rocrtst/samples/rocm_async/hsatimer.hpp @@ -0,0 +1,72 @@ + +#ifndef ROCM_ASYNC_BW_MYTIME_H_ +#define ROCM_ASYNC_BW_MYTIME_H_ + +// Will use AMD timer and general Linux timer based on users' +// need --> compilation flag. Support for windows platform is +// not currently available + +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + +#include + +#define HSA_FAILURE 1 +#define HSA_SUCCESS 0 + +class PerfTimer { + + private: + + struct Timer { + string name; /* < name name of time object*/ + long long _freq; /* < _freq frequency*/ + long long _clocks; /* < _clocks number of ticks at end*/ + long long _start; /* < _start start point ticks*/ + }; + + std::vector _timers; /*< _timers vector to Timer objects */ + double freq_in_100mhz; + + public: + + PerfTimer(); + ~PerfTimer(); + + private: + + // AMD timing method + uint64_t CoarseTimestampUs(); + uint64_t MeasureTSCFreqHz(); + + // General Linux timing method + + public: + + int CreateTimer(); + int StartTimer(int index); + int StopTimer(int index); + void ResetTimer(int index); + + public: + + // retrieve time + double ReadTimer(int index); + + // write into a file + double WriteTimer(int index); + + public: + void Error(string str); +}; + +#endif // ROCM_ASYNC_BW_MYTIME_H_ diff --git a/rocrtst/samples/rocm_async/main.cpp b/rocrtst/samples/rocm_async/main.cpp new file mode 100644 index 0000000000..1c2a4c7889 --- /dev/null +++ b/rocrtst/samples/rocm_async/main.cpp @@ -0,0 +1,25 @@ +#include +#include +#include "hsatimer.hpp" +#include "rocm_async.hpp" + +using namespace std; + +int main(int argc, 
char** argv) { + + // Create the Bandwidth test object + RocmAsync bw_test(argc, argv); + + // Initialize the Bandwidth test object + bw_test.SetUp(); + + // Run the Bandwidth tests requested by user + bw_test.Run(); + + // Display the time taken by various tests + bw_test.Display(); + + // Release the Bandwidth test object resources + bw_test.Close(); + return 0; +} diff --git a/rocrtst/samples/rocm_async/os.cpp b/rocrtst/samples/rocm_async/os.cpp new file mode 100644 index 0000000000..e83f5087fe --- /dev/null +++ b/rocrtst/samples/rocm_async/os.cpp @@ -0,0 +1,49 @@ + +// Compiling for Windows Platform +#ifdef _WIN32 + +#include "os.hpp" +#include +#include +#include + +void SetEnv(const char* env_var_name, const char* env_var_value) { + bool err = SetEnvironmentVariable(env_var_name, env_var_value); + if (false == err) { + printf("Set environment variable failed!\n"); + exit(1); + } + return; +} + +char* GetEnv(const char* env_var_name) { + char* buff; + DWORD char_count = GetEnvironmentVariable(env_var_name, NULL, 0); + if (char_count == 0) return NULL; + buff = (char*)malloc(sizeof(char) * char_count); + GetEnvironmentVariable(env_var_name, buff, char_count); + buff[char_count - 1] = '\0'; + return buff; +} + +#endif // End of Windows Code + +// Compiling for Linux Platform +#ifdef __linux__ + +#include "os.hpp" +#include + +void SetEnv(const char* env_var_name, const char* env_var_value) { + int err = setenv(env_var_name, env_var_value, 1); + if (0 != err) { + printf("Set environment variable failed!\n"); + exit(1); + } + return; +} + +char* GetEnv(const char* env_var_name) { return getenv(env_var_name); } + +#endif // End of Linux Code + diff --git a/rocrtst/samples/rocm_async/os.hpp b/rocrtst/samples/rocm_async/os.hpp new file mode 100755 index 0000000000..3b8607f507 --- /dev/null +++ b/rocrtst/samples/rocm_async/os.hpp @@ -0,0 +1,14 @@ + +#ifndef ROCM_ASYNC_BW_UTILS_OS_H_ +#define ROCM_ASYNC_BW_UTILS_OS_H_ + +#include + +// Set envriroment variable +void 
SetEnv(const char* env_var_name, const char* env_var_value); + +// Get the value of enviroment +char* GetEnv(const char* env_var_name); + + +#endif // ROCM_ASYNC_BW_UTILS_OS_H_ diff --git a/rocrtst/samples/rocm_async/rocm_async.cpp b/rocrtst/samples/rocm_async/rocm_async.cpp new file mode 100755 index 0000000000..27024db926 --- /dev/null +++ b/rocrtst/samples/rocm_async/rocm_async.cpp @@ -0,0 +1,478 @@ + +#include "common.hpp" +#include "rocm_async.hpp" + +#include +#include +#include +#include +#include +#include + +// The values are in megabytes at allocation time +const uint32_t RocmAsync::SIZE_LIST[] = { 64, 128, 256, 512 }; +//const uint32_t RocmAsync::SIZE_LIST[] = { 2, 4, 8, 16, 32, 64, 128, 256, 512 }; + +uint32_t RocmAsync::GetIterationNum() { + return num_iteration_ * 1.2 + 1; +} + +void RocmAsync::AcquireAccess(hsa_agent_t agent, void* ptr) { + err_ = hsa_amd_agents_allow_access(1, &agent, NULL, ptr); + ErrorCheck(err_); +} + +void RocmAsync::AllocateHostBuffers(bool bidir, uint32_t size, + void*& src_fwd, void*& dst_fwd, + void* buf_src_fwd, void* buf_dst_fwd, + hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd, + void*& src_rev, void*& dst_rev, + void* buf_src_rev, void* buf_dst_rev, + hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev, + hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev) { + + // Allocate host buffers and setup accessibility for copy operation + err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, (void**)&src_fwd); + ErrorCheck(err_); + AcquireAccess(src_agent_fwd, src_fwd); + AcquireAccess(cpu_agent_, buf_src_fwd); + + err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, (void**)&dst_fwd); + ErrorCheck(err_); + AcquireAccess(dst_agent_fwd, dst_fwd); + AcquireAccess(cpu_agent_, buf_dst_fwd); + + // Initialize host buffers to a determinate value + memset(src_fwd, 0x23, size); + memset(dst_fwd, 0x00, size); + + // Create a signal to wait on copy operation + // @TODO: replace it with a signal pool call + err_ = 
hsa_signal_create(1, 0, NULL, &signal_fwd); + ErrorCheck(err_); + + if (bidir == false) { + return; + } + + err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, (void**)&src_rev); + ErrorCheck(err_); + AcquireAccess(src_agent_rev, src_rev); + AcquireAccess(cpu_agent_, buf_src_rev); + + err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, (void**)&dst_rev); + ErrorCheck(err_); + AcquireAccess(dst_agent_rev, dst_rev); + AcquireAccess(cpu_agent_, buf_dst_rev); + + // Initialize host buffers to a determinate value + memset(src_rev, 0x23, size); + memset(dst_rev, 0x00, size); + + err_ = hsa_signal_create(1, 0, NULL, &signal_rev); + ErrorCheck(err_); +} + +void RocmAsync::AllocateCopyBuffers(bool bidir, uint32_t size, + void*& src_fwd, hsa_amd_memory_pool_t src_pool_fwd, + void*& dst_fwd, hsa_amd_memory_pool_t dst_pool_fwd, + hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd, + void*& src_rev, hsa_amd_memory_pool_t src_pool_rev, + void*& dst_rev, hsa_amd_memory_pool_t dst_pool_rev, + hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev, + hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev) { + + // Allocate buffers in src and dst pools for forward copy + err_ = hsa_amd_memory_pool_allocate(src_pool_fwd, size, 0, &src_fwd); + ErrorCheck(err_); + err_ = hsa_amd_memory_pool_allocate(dst_pool_fwd, size, 0, &dst_fwd); + ErrorCheck(err_); + + // Allocate buffers in src and dst pools for reverse copy + if (bidir) { + err_ = hsa_amd_memory_pool_allocate(src_pool_rev, size, 0, &src_rev); + ErrorCheck(err_); + err_ = hsa_amd_memory_pool_allocate(dst_pool_rev, size, 0, &dst_rev); + ErrorCheck(err_); + } + + // Acquire access to src and dst buffers for forward copy + AcquireAccess(src_agent_fwd, dst_fwd); + AcquireAccess(dst_agent_fwd, src_fwd); + + // Acquire access to src and dst buffers for reverse copy + if (bidir) { + AcquireAccess(src_agent_rev, dst_rev); + AcquireAccess(dst_agent_rev, src_rev); + } + + // Create a signal to wait on copy operation + // @TODO: replace 
it with a signal pool call + err_ = hsa_signal_create(1, 0, NULL, &signal_fwd); + ErrorCheck(err_); + if (bidir) { + err_ = hsa_signal_create(1, 0, NULL, &signal_rev); + ErrorCheck(err_); + } +} + +void RocmAsync::ReleaseBuffers(bool bidir, + void* src_fwd, void* src_rev, + void* dst_fwd, void* dst_rev, + hsa_signal_t signal_fwd, + hsa_signal_t signal_rev) { + + // Free the src and dst buffers used in forward copy + // including the signal used to wait + err_ = hsa_amd_memory_pool_free(src_fwd); + ErrorCheck(err_); + err_ = hsa_amd_memory_pool_free(dst_fwd); + ErrorCheck(err_); + err_ = hsa_signal_destroy(signal_fwd); + ErrorCheck(err_); + + // Free the src and dst buffers used in reverse copy + // including the signal used to wait + if (bidir) { + err_ = hsa_amd_memory_pool_free(src_rev); + ErrorCheck(err_); + err_ = hsa_amd_memory_pool_free(dst_rev); + ErrorCheck(err_); + err_ = hsa_signal_destroy(signal_rev); + ErrorCheck(err_); + } +} + +double RocmAsync::GetGpuCopyTime(bool bidir, + hsa_signal_t signal_fwd, + hsa_signal_t signal_rev) { + + // Obtain time taken for forward copy + hsa_amd_profiling_async_copy_time_t async_time_fwd = {0}; + err_= hsa_amd_profiling_get_async_copy_time(signal_fwd, &async_time_fwd); + ErrorCheck(err_); + if (bidir == false) { + return(async_time_fwd.end - async_time_fwd.start); + } + + hsa_amd_profiling_async_copy_time_t async_time_rev = {0}; + err_= hsa_amd_profiling_get_async_copy_time(signal_rev, &async_time_rev); + ErrorCheck(err_); + double start = min(async_time_fwd.start, async_time_rev.start); + double end = max(async_time_fwd.end, async_time_rev.end); + return(end - start); +} + +void RocmAsync::copy_buffer(void* dst, hsa_agent_t dst_agent, + void* src, hsa_agent_t src_agent, + size_t size, hsa_signal_t signal) { + + // Copy from src into dst buffer + err_ = hsa_amd_memory_async_copy(dst, dst_agent, + src, src_agent, + size, 0, NULL, signal); + ErrorCheck(err_); + + // Wait for the forward copy operation to complete + while 
(hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
+                                 uint64_t(-1), HSA_WAIT_STATE_ACTIVE));
+}
+
+void RocmAsync::RunCopyBenchmark(async_trans_t& trans) {
+
+  // Determine if this transaction is bidirectional
+  bool bidir = trans.copy.bidir_;
+
+  // Size buffers for the largest request (MB -> bytes). NOTE(review): 32-bit value wraps at 4096 MB
+  uint32_t size_len = size_list_.size();
+  uint32_t max_size = size_list_.back() * 1024 * 1024;
+
+  // Bind to resources such as pool and agents that are involved
+  // in both forward and reverse copy operations
+  void* buf_src_fwd = nullptr;
+  void* buf_dst_fwd = nullptr;
+  void* buf_src_rev = nullptr;
+  void* buf_dst_rev = nullptr;
+  void* host_src_fwd = nullptr;
+  void* host_dst_fwd = nullptr;
+  void* host_src_rev = nullptr;
+  void* host_dst_rev = nullptr;
+  hsa_signal_t signal_fwd = {0};
+  hsa_signal_t signal_rev = {0};
+  hsa_signal_t host_signal_fwd = {0};
+  hsa_signal_t host_signal_rev = {0};
+  hsa_amd_memory_pool_t src_pool_fwd = trans.copy.src_pool_;
+  hsa_amd_memory_pool_t dst_pool_fwd = trans.copy.dst_pool_;
+  hsa_amd_memory_pool_t src_pool_rev = dst_pool_fwd;
+  hsa_amd_memory_pool_t dst_pool_rev = src_pool_fwd;
+  hsa_agent_t src_agent_fwd = pool_list_[trans.copy.src_idx_].owner_agent_;
+  hsa_agent_t dst_agent_fwd = pool_list_[trans.copy.dst_idx_].owner_agent_;
+  hsa_agent_t src_agent_rev = dst_agent_fwd;
+  hsa_agent_t dst_agent_rev = src_agent_fwd;
+
+  // Allocate buffers and signal objects
+  AllocateCopyBuffers(bidir, max_size,
+                      buf_src_fwd, src_pool_fwd,
+                      buf_dst_fwd, dst_pool_fwd,
+                      src_agent_fwd, dst_agent_fwd,
+                      buf_src_rev, src_pool_rev,
+                      buf_dst_rev, dst_pool_rev,
+                      src_agent_rev, dst_agent_rev,
+                      signal_fwd, signal_rev);
+
+  if (verify_) {
+    AllocateHostBuffers(bidir, max_size,
+                        host_src_fwd, host_dst_fwd,
+                        buf_src_fwd, buf_dst_fwd,
+                        src_agent_fwd, dst_agent_fwd,
+                        host_src_rev, host_dst_rev,
+                        buf_src_rev, buf_dst_rev,
+                        src_agent_rev, dst_agent_rev,
+                        host_signal_fwd, host_signal_rev);
+
+    // Initialize source buffer with values from verification buffer
+    copy_buffer(buf_src_fwd, src_agent_fwd,
+                host_src_fwd, cpu_agent_,
+                max_size, host_signal_fwd);
+    ErrorCheck(err_);
+    if (bidir) {
+      copy_buffer(buf_src_rev, src_agent_rev,
+                  host_src_rev, cpu_agent_,
+                  max_size, host_signal_rev);
+      ErrorCheck(err_);
+    }
+  }
+
+  // Bind the number of iterations
+  uint32_t iterations = GetIterationNum();
+
+  // Iterate through the different buffer sizes to
+  // compute the bandwidth as determined by copy
+  for (uint32_t idx = 0; idx < size_len; idx++) {
+
+    // Byte count of this transfer; exceeding max_size should not be happening
+    uint32_t curr_size = size_list_[idx] * 1024 * 1024;
+    if (curr_size > max_size) {
+      break;
+    }
+
+    std::vector<double> cpu_time;
+    std::vector<double> gpu_time;
+    for (uint32_t it = 0; it < iterations; it++) {
+      #if DEBUG
+      printf(".");
+      fflush(stdout);
+      #endif
+
+      hsa_signal_store_relaxed(signal_fwd, 1);
+      if (bidir) {
+        hsa_signal_store_relaxed(signal_rev, 1);
+      }
+
+      if (verify_) {
+        AcquireAccess(src_agent_fwd, buf_dst_fwd);
+        AcquireAccess(dst_agent_fwd, buf_src_fwd);
+        if (bidir) {
+          AcquireAccess(src_agent_rev, buf_dst_rev);
+          AcquireAccess(dst_agent_rev, buf_src_rev);
+        }
+      }
+
+      // Create a timer object for this iteration
+      PerfTimer timer;
+      uint32_t index = timer.CreateTimer();
+
+      // Start the timer and launch forward copy operation
+      timer.StartTimer(index);
+      err_ = hsa_amd_memory_async_copy(buf_dst_fwd, dst_agent_fwd,
+                                       buf_src_fwd, src_agent_fwd,
+                                       curr_size, 0, nullptr, signal_fwd);
+      ErrorCheck(err_);
+
+      // Launch reverse copy operation if it is bidirectional
+      if (bidir) {
+        err_ = hsa_amd_memory_async_copy(buf_dst_rev, dst_agent_rev,
+                                         buf_src_rev, src_agent_rev,
+                                         curr_size, 0, nullptr, signal_rev);
+        ErrorCheck(err_);
+      }
+
+      // Wait for the forward copy operation to complete
+      while (hsa_signal_wait_acquire(signal_fwd, HSA_SIGNAL_CONDITION_LT, 1,
+                                     uint64_t(-1), HSA_WAIT_STATE_ACTIVE));
+
+      // Wait for the reverse copy operation to complete
+      if (bidir) {
+        while (hsa_signal_wait_acquire(signal_rev, HSA_SIGNAL_CONDITION_LT, 1,
+                                       uint64_t(-1), HSA_WAIT_STATE_ACTIVE));
+      }
+
+      // Stop the timer object
+      timer.StopTimer(index);
+
+      // Push the time taken for copy into a vector of copy times
+      cpu_time.push_back(timer.ReadTimer(index));
+
+      // Collect time from the signal(s)
+      if (trans.copy.uses_gpu_) {
+        double temp = GetGpuCopyTime(bidir, signal_fwd, signal_rev);
+        gpu_time.push_back(temp);
+      }
+
+      if (verify_) {
+
+        // Re-Establish access to destination buffer and host buffer
+        AcquireAccess(cpu_agent_, buf_dst_fwd);
+        AcquireAccess(dst_agent_fwd, host_dst_fwd);
+
+        // Copy the output of the copy operation back into the host dst buffer
+        hsa_signal_store_relaxed(host_signal_fwd, 1);
+        copy_buffer(host_dst_fwd, cpu_agent_,
+                    buf_dst_fwd, dst_agent_fwd,
+                    curr_size, host_signal_fwd);
+        ErrorCheck(err_);
+
+        // Compare output equals input; any mismatch is reported as a generic error
+        err_ = memcmp(host_src_fwd, host_dst_fwd, curr_size) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
+        ErrorCheck(err_);
+
+        if (bidir) {
+
+          // Re-Establish access to destination buffer and host buffer
+          AcquireAccess(cpu_agent_, buf_dst_rev);
+          AcquireAccess(dst_agent_rev, host_dst_rev);
+
+          hsa_signal_store_relaxed(host_signal_rev, 1);
+          copy_buffer(host_dst_rev, cpu_agent_,
+                      buf_dst_rev, dst_agent_rev,
+                      curr_size, host_signal_rev);
+          ErrorCheck(err_);
+
+          // Compare output equals input; any mismatch is reported as a generic error
+          err_ = memcmp(host_src_rev, host_dst_rev, curr_size) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
+          ErrorCheck(err_);
+        }
+      }
+    }
+    #if DEBUG
+    std::cout << std::endl;
+    #endif
+
+    // Get Cpu min copy time
+    trans.cpu_min_time_.push_back(GetMinTime(cpu_time));
+    // Get Cpu mean copy time and store to the array
+    trans.cpu_avg_time_.push_back(GetMeanTime(cpu_time));
+
+    if (trans.copy.uses_gpu_) {
+      // Get Gpu min copy time
+      trans.gpu_min_time_.push_back(GetMinTime(gpu_time));
+      // Get Gpu mean copy time and store to the array
+      trans.gpu_avg_time_.push_back(GetMeanTime(gpu_time));
+    }
+
+    // Clear the timing samples collected for this buffer size
+    cpu_time.clear();
+    gpu_time.clear();
+  }
+
+  // Free up buffers and signal objects used in copy operation
+  ReleaseBuffers(bidir, buf_src_fwd, buf_src_rev,
+                 buf_dst_fwd, buf_dst_rev,
+                 signal_fwd, signal_rev);
+
+  if (verify_) {
+    ReleaseBuffers(bidir, host_src_fwd, host_src_rev,
+                   host_dst_fwd, host_dst_rev, host_signal_fwd, host_signal_rev);
+  }
+}
+
+void RocmAsync::RunIOBenchmark(async_trans_t& trans) {
+
+  std::cout << "Unsupported Request - Read / Write" << std::endl;
+  exit(1);
+}
+
+void RocmAsync::Run() {
+
+  // Enable profiling of Async Copy Activity
+  err_ = hsa_amd_profiling_async_copy_enable(true);
+  ErrorCheck(err_);
+
+  // Iterate through the list of transactions and execute them
+  uint32_t trans_size = trans_list_.size();
+  for (uint32_t idx = 0; idx < trans_size; idx++) {
+    async_trans_t& trans = trans_list_[idx];
+    if ((trans.req_type_ == REQ_COPY_BIDIR) ||
+        (trans.req_type_ == REQ_COPY_UNIDIR)) {
+      RunCopyBenchmark(trans);
+    }
+    if ((trans.req_type_ == REQ_READ) ||
+        (trans.req_type_ == REQ_WRITE)) {
+      RunIOBenchmark(trans);
+    }
+  }
+
+  // Disable profiling of Async Copy Activity
+  err_ = hsa_amd_profiling_async_copy_enable(false);
+  ErrorCheck(err_);
+
+}
+
+void RocmAsync::Close() {
+  hsa_status_t status = hsa_shut_down();
+  ErrorCheck(status);
+  return;
+}
+
+// Sets up the bandwidth test object to enable running
+// the various test scenarios requested by user.
The
+// things this procedure takes care of are:
+//
+//    Parse user arguments
+//    Discover RocR Device Topology
+//    Determine validity of requested test scenarios
+//    Build the list of transactions to execute
+//    Miscellaneous
+//
+void RocmAsync::SetUp() {
+
+  // Parse user arguments
+  ParseArguments();
+
+  // Validate input parameters
+  bool status = ValidateArguments();
+  if (status == false) {
+    PrintHelpScreen();
+    exit(1);
+  }
+
+  // Build list of transactions (copy, read, write) to execute
+  status = BuildTransList();
+  if (status == false) {
+    PrintHelpScreen();
+    exit(1);
+  }
+
+  // Print Debug Info (list of transactions) when PRINT_DEBUG is set in the environment
+  char* print_debug = getenv("PRINT_DEBUG");
+  if (print_debug) {
+    //PrintAgentsList();
+    //PrintPoolsList();
+    PrintTransList();
+    //PrintTopology();
+  }
+}
+
+RocmAsync::RocmAsync(int argc, char** argv) : BaseTest() {
+  usr_argc_ = argc;
+  usr_argv_ = argv;
+  verify_ = false;
+  pool_index_ = 0;
+  agent_index_ = 0;
+  req_read_ = REQ_INVALID;
+  req_write_ = REQ_INVALID;
+  req_copy_bidir_ = REQ_INVALID;
+  req_copy_unidir_ = REQ_INVALID;
+}
+
+RocmAsync::~RocmAsync() { }
+
diff --git a/rocrtst/samples/rocm_async/rocm_async.hpp b/rocrtst/samples/rocm_async/rocm_async.hpp
new file mode 100644
index 0000000000..f230f90c30
--- /dev/null
+++ b/rocrtst/samples/rocm_async/rocm_async.hpp
@@ -0,0 +1,338 @@
+#ifndef __ROCM_ASYNC_BW_H__
+#define __ROCM_ASYNC_BW_H__
+
+#include "hsa/hsa.h"
+#include "base_test.hpp"
+#include "hsatimer.hpp"
+#include "common.hpp"
+#include <vector>
+
+using namespace std;
+
+// Structure to encapsulate a RocR agent and its index in a list
+typedef struct agent_info {
+
+  agent_info(hsa_agent_t agent,
+             uint32_t index, hsa_device_type_t device_type) {
+    agent_ = agent;
+    index_ = index;
+    device_type_ = device_type;
+  }
+
+  agent_info() {}
+
+  uint32_t index_;
+  hsa_agent_t agent_;
+  hsa_device_type_t device_type_;
+
+} agent_info_t;
+
+typedef struct pool_info {
+
+  pool_info(hsa_agent_t agent, uint32_t agent_index,
+            hsa_amd_memory_pool_t pool, hsa_amd_segment_t segment,
+            size_t size, uint32_t index, bool is_fine_grained,
+            bool is_kernarg, bool access_to_all,
+            hsa_amd_memory_pool_access_t owner_access) {
+
+    pool_ = pool;
+    index_ = index;
+    segment_ = segment;
+    owner_agent_ = agent;
+    agent_index_ = agent_index;
+    allocable_size_ = size;
+    is_kernarg_ = is_kernarg;
+    owner_access_ = owner_access;
+    access_to_all_ = access_to_all;
+    is_fine_grained_ = is_fine_grained;
+  }
+
+  pool_info() {}
+
+  uint32_t index_;
+  bool is_kernarg_;
+  bool access_to_all_;
+  bool is_fine_grained_;
+  size_t allocable_size_;
+  uint32_t agent_index_;
+  hsa_agent_t owner_agent_;
+  hsa_amd_segment_t segment_;
+  hsa_amd_memory_pool_t pool_;
+  hsa_amd_memory_pool_access_t owner_access_;
+
+} pool_info_t;
+
+// Used to print out topology info
+typedef struct agent_pool_info {
+
+  agent_pool_info() {}
+
+  agent_info agent;
+
+  vector<pool_info_t> pool_list;
+
+} agent_pool_info_t;
+
+typedef struct async_trans {
+
+  uint32_t req_type_;
+  union {
+    struct {
+      bool bidir_;
+      bool uses_gpu_;
+      uint32_t src_idx_;
+      uint32_t dst_idx_;
+      hsa_amd_memory_pool_t src_pool_;
+      hsa_amd_memory_pool_t dst_pool_;
+    } copy;
+    struct {
+      void* code_;
+      uint32_t agent_idx_;
+      hsa_agent_t agent_;
+      uint32_t pool_idx_;
+      hsa_amd_memory_pool_t pool_;
+    } kernel;
+  };
+
+  // Cpu BenchMark average copy time
+  vector<double> cpu_avg_time_;
+
+  // Cpu Min time
+  vector<double> cpu_min_time_;
+
+  // Gpu BenchMark average copy time
+  vector<double> gpu_avg_time_;
+
+  // Gpu Min time
+  vector<double> gpu_min_time_;
+
+  async_trans(uint32_t req_type) { req_type_ = req_type; }
+} async_trans_t;
+
+typedef enum Request_Type {
+
+  REQ_READ = 1,
+  REQ_WRITE = 2,
+  REQ_COPY_BIDIR = 3,
+  REQ_COPY_UNIDIR = 4,
+  REQ_INVALID = 5,
+
+} Request_Type;
+
+class RocmAsync : public BaseTest {
+
+ public:
+
+  // @brief: Constructor for test case of RocmAsync
+  RocmAsync(int argc, char** argv);
+
+  // @brief: Destructor for test case of RocmAsync
+  virtual ~RocmAsync();
+
+  // @brief: Setup the environment for measurement
+  virtual void SetUp();
+
+  // @brief: Core measurement execution
+  virtual void Run();
+
+  // @brief: Clean up and retrieve the resource
+  virtual void Close();
+
+  // @brief: Display the results
+  virtual void Display() const;
+
+ private:
+
+  // @brief: Print Help Menu Screen
+  void PrintHelpScreen();
+
+  // @brief: Discover the topology of pools on Rocm Platform
+  void DiscoverTopology();
+
+  // @brief: Print topology info
+  void PrintTopology();
+
+  // @brief: Print info on agents in system
+  void PrintAgentsList();
+
+  // @brief: Print info on memory pools in system
+  void PrintPoolsList();
+
+  // @brief: Parse the arguments provided by user to
+  // build list of transactions
+  void ParseArguments();
+
+  // @brief: Print the list of transactions
+  void PrintTransList();
+
+  // @brief: Run read/write requests of users
+  void RunIOBenchmark(async_trans_t& trans);
+
+  // @brief: Run copy requests of users
+  void RunCopyBenchmark(async_trans_t& trans);
+
+  // @brief: Get iteration number
+  uint32_t GetIterationNum();
+
+  // @brief: Get the mean copy time
+  double GetMeanTime(std::vector<double>& vec);
+
+  // @brief: Get the min copy time
+  double GetMinTime(std::vector<double>& vec);
+
+  // @brief: Display Benchmark result
+  void DisplayIOTime(async_trans_t& trans) const;
+  void DisplayCopyTime(async_trans_t& trans) const;
+
+ private:
+
+  // @brief: Validate the arguments passed in by user
+  bool ValidateArguments();
+  bool ValidateReadReq();
+  bool ValidateWriteReq();
+  bool ValidateReadOrWriteReq(vector<uint32_t>& in_list);
+
+  bool ValidateBidirCopyReq();
+  bool ValidateUnidirCopyReq();
+  bool ValidateCopyReq(vector<uint32_t>& in_list);
+  void PrintIOAccessError(uint32_t agent_idx, uint32_t pool_idx);
+  void PrintCopyAccessError(uint32_t src_pool_idx, uint32_t dst_pool_idx);
+
+  bool PoolIsPresent(vector<uint32_t>& in_list);
+  bool PoolIsDuplicated(vector<uint32_t>& in_list);
+
+  // @brief: Builds a list of transaction per user request
+  bool BuildTransList();
+  bool BuildReadTrans();
+  bool BuildWriteTrans();
+  bool BuildBidirCopyTrans();
+  bool BuildUnidirCopyTrans();
+  bool BuildReadOrWriteTrans(uint32_t req_type,
+                             vector<uint32_t>& in_list);
+  bool BuildCopyTrans(uint32_t req_type,
+                      vector<uint32_t>& src_list,
+                      vector<uint32_t>& dst_list);
+
+  void AllocateCopyBuffers(bool bidir, uint32_t size,
+                           void*& src_fwd, hsa_amd_memory_pool_t src_pool_fwd,
+                           void*& dst_fwd, hsa_amd_memory_pool_t dst_pool_fwd,
+                           hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd,
+                           void*& src_rev, hsa_amd_memory_pool_t src_pool_rev,
+                           void*& dst_rev, hsa_amd_memory_pool_t dst_pool_rev,
+                           hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev,
+                           hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev);
+  void ReleaseBuffers(bool bidir,
+                      void* src_fwd, void* src_rev,
+                      void* dst_fwd, void* dst_rev,
+                      hsa_signal_t signal_fwd, hsa_signal_t signal_rev);
+  double GetGpuCopyTime(bool bidir, hsa_signal_t signal_fwd, hsa_signal_t signal_rev);
+  void AllocateHostBuffers(bool bidir, uint32_t size,
+                           void*& src_fwd, void*& dst_fwd,
+                           void* buf_src_fwd, void* buf_dst_fwd,
+                           hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd,
+                           void*& src_rev, void*& dst_rev,
+                           void* buf_src_rev, void* buf_dst_rev,
+                           hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev,
+                           hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev);
+  void copy_buffer(void* dst, hsa_agent_t dst_agent,
+                   void* src, hsa_agent_t src_agent,
+                   size_t size, hsa_signal_t signal);
+
+  // @brief: Check if agent can access memory pool, if so, set
+  // access to the agent, if not, exit
+  void AcquireAccess(hsa_agent_t agent, void* ptr);
+
+  // Functions to find agents and memory pools and update
+  // relevant data structures used to maintain system topology
+  friend hsa_status_t AgentInfo(hsa_agent_t agent, void* data);
+  friend hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data);
+
+ protected:
+
+  // More variables declared for testing
+  // vector tran_;
+
+  // Used to help count agent_info
+  uint32_t agent_index_;
+
+  // List used to store agent info, indexed by agent_index_
+  vector<agent_info_t> agent_list_;
+
+  // Used to help count pool_info_t
+  uint32_t pool_index_;
+
+  // List used to store pool_info_t, indexed by pool_index_
+  vector<pool_info_t> pool_list_;
+
+  // List used to store agent_pool_info_t
+  vector<agent_pool_info_t> agent_pool_list_;
+
+  // List of pool indices involved in a bidirectional copy operation
+  // Size of the list cannot exceed the number of pools
+  // reported by the system
+  vector<uint32_t> bidir_list_;
+
+  // List of source pool indices in a unidirectional copy operation
+  // Size of the list cannot exceed the number of pools
+  // reported by the system
+  vector<uint32_t> src_list_;
+
+  // List of destination pool indices in a unidirectional copy operation
+  // Size of the list cannot exceed the number of pools
+  // reported by the system
+  vector<uint32_t> dst_list_;
+
+  // List of (pool, agent) index pairs involved in read operation.
+  // In each pair the first entry is the memory pool index
+  // while the second is the agent that executes the read operation
+  vector<uint32_t> read_list_;
+
+  // List of (pool, agent) index pairs involved in write operation.
+  // In each pair the first entry is the memory pool index
+  // while the second is the agent that executes the write operation
+  vector<uint32_t> write_list_;
+
+  // List of sizes to use in copy and read/write transactions
+  // Size is specified in terms of Megabytes
+  vector<uint32_t> size_list_;
+
+  // Type of service requested by user
+  uint32_t req_read_;
+  uint32_t req_write_;
+  uint32_t req_copy_bidir_;
+  uint32_t req_copy_unidir_;
+
+  // List used to store transactions per user request
+  vector<async_trans_t> trans_list_;
+
+  // Variable to store argument number
+
+  // Variable to store argument number
+
+  // Variable to store argument number
+  uint32_t usr_argc_;
+
+  // Pointer to store address of argument text
+  char** usr_argv_;
+
+  // BenchMark copy time
+  vector<double> op_time_;
+
+  // Min time
+  vector<double> min_time_;
+
+  // Determines if user has requested verification
+  bool verify_;
+
+  // CPU agent used for verification
+  hsa_agent_t cpu_agent_;
+
+  // System region
+  hsa_amd_memory_pool_t sys_pool_;
+
+  static const uint32_t SIZE_LIST[4];
+  //static const uint32_t SIZE_LIST[9];
+
+};
+
+#endif
+
diff --git a/rocrtst/samples/rocm_async/rocm_async_parse.cpp b/rocrtst/samples/rocm_async/rocm_async_parse.cpp
new file mode 100755
index 0000000000..2461896d84
--- /dev/null
+++ b/rocrtst/samples/rocm_async/rocm_async_parse.cpp
@@ -0,0 +1,206 @@
+#include "common.hpp"
+#include "rocm_async.hpp"
+
+#include <unistd.h>
+#include <algorithm>
+#include <sstream>
+
+// Parse option value string. The string has one or more decimal
+// values separated by comma - "3,6,9,12,15".
+static bool ParseOptionValue(char* value, vector<uint32_t>& value_list) {
+
+  // Capture the option value string
+  std::stringstream stream;
+  stream << value;
+
+  uint32_t token = 0x11231926;
+  do {
+
+    // Read the option value; reject empty, trailing-comma or non-numeric input
+    if (!(stream >> token)) return false;
+
+    // Update output list with values
+    value_list.push_back(token);
+
+    // Ignore the delimiter
+    if((stream.eof()) ||
+       (stream.peek() == ',')) {
+      stream.ignore();
+    } else {
+      return false;
+    }
+
+  } while (!stream.eof());
+
+  return true;
+}
+
+void RocmAsync::ParseArguments() {
+
+  bool print_help = false;
+  bool copy_all_bi = false;
+  bool copy_all_uni = false;
+  bool print_topology = false;
+
+  // This will suppress prints from getopt implementation
+  // In case of error, it will return the character '?' as
+  // return value.
+  opterr = 0;
+
+  int opt;
+  bool status;
+  while ((opt = getopt(usr_argc_, usr_argv_, "hvtaAb:s:d:r:w:m:")) != -1) {
+    switch (opt) {
+
+      // Print help screen
+      case 'h':
+        print_help = true;
+        break;
+
+      // Print system topology
+      case 't':
+        print_topology = true;
+        break;
+
+      // Set verification flag to true
+      case 'v':
+        verify_ = true;
+        break;
+
+      // Collect list of pools involved in bidirectional copy operation
+      case 'b':
+        status = ParseOptionValue(optarg, bidir_list_);
+        if (status) {
+          req_copy_bidir_ = REQ_COPY_BIDIR;
+          break;
+        }
+        print_help = true;
+        break;
+
+      // Collect list of source pools involved in unidirectional copy operation
+      case 's':
+        status = ParseOptionValue(optarg, src_list_);
+        if (status) {
+          req_copy_unidir_ = REQ_COPY_UNIDIR;
+          break;
+        }
+        print_help = true;
+        break;
+
+      // Collect list of destination pools involved in unidirectional copy operation
+      case 'd':
+        status = ParseOptionValue(optarg, dst_list_);
+        if (status) {
+          req_copy_unidir_ = REQ_COPY_UNIDIR;
+          break;
+        }
+        print_help = true;
+        break;
+
+      // Collect request to read a buffer
+      case 'r':
+        req_read_ = REQ_READ;
+        status = ParseOptionValue(optarg, read_list_);
+        if (status == false) {
+          print_help = true;
+        }
+        break;
+
+      // Collect request to write a buffer
+      case 'w':
+        req_write_ = REQ_WRITE;
+        status = ParseOptionValue(optarg, write_list_);
+        if (status == false) {
+          print_help = true;
+        }
+        break;
+
+      // Size of buffers to use in copy and read/write operations
+      case 'm':
+        status = ParseOptionValue(optarg, size_list_);
+        if (status == false) {
+          print_help = true;
+        }
+        break;
+
+      // Enable Unidirectional copy among all valid pools
+      case 'a':
+        copy_all_uni = true;
+        req_copy_unidir_ = REQ_COPY_UNIDIR;
+        break;
+
+      // Enable Bidirectional copy among all valid pools
+      case 'A':
+        copy_all_bi = true;
+        req_copy_bidir_ = REQ_COPY_BIDIR;
+        break;
+
+      // getopt implementation returns the value of the unknown
+      // option or an option with missing operand in the variable
+      // optopt
+      case '?':
+        std::cout << "Value of optopt is: " << static_cast<char>(optopt) << std::endl;
+        if ((optopt == 'b' || optopt == 's' || optopt == 'd' || optopt == 'r' || optopt == 'w' || optopt == 'm')) {
+          std::cout << "Error: Option -b -s -d -r -w and -m require argument" << std::endl;
+        }
+        print_help = true;
+        break;
+      default:
+        print_help = true;
+        break;
+    }
+  }
+
+  // Print help screen if user option has "-h" or an option failed to parse
+  if (print_help) {
+    PrintHelpScreen();
+    exit(0);
+  }
+
+  // Initialize Roc Runtime
+  err_ = hsa_init();
+  ErrorCheck(err_);
+
+  // Discover the topology of RocR agent in system
+  DiscoverTopology();
+
+  // Print system topology if user option has "-t"
+  if (print_topology) {
+    PrintTopology();
+    exit(0);
+  }
+
+  // Invalidate request if user has requested full
+  // copying for both unidirectional and bidirectional
+  if ((copy_all_bi) && (copy_all_uni)) {
+    PrintHelpScreen();
+    exit(0);
+  }
+
+  // Initialize pool list if full copying in unidirectional mode is enabled
+  if (copy_all_uni) {
+    uint32_t size = pool_list_.size();
+    for (uint32_t idx = 0; idx < size; idx++) {
+      src_list_.push_back(idx);
+      dst_list_.push_back(idx);
+    }
+  }
+
+  // Initialize pool list if full copying in bidirectional mode is enabled
+  if (copy_all_bi) {
+    uint32_t size = pool_list_.size();
+    for (uint32_t idx = 0; idx < size; idx++) {
+      bidir_list_.push_back(idx);
+    }
+  }
+
+  // Initialize the list of buffer sizes to use in copy/read/write operations
+  if (size_list_.size() == 0) {
+    uint32_t size_len = sizeof(SIZE_LIST)/sizeof(uint32_t);
+    for (uint32_t idx = 0; idx < size_len; idx++) {
+      size_list_.push_back(SIZE_LIST[idx]);
+    }
+  }
+  std::sort(size_list_.begin(), size_list_.end());
+}
+
diff --git a/rocrtst/samples/rocm_async/rocm_async_print.cpp b/rocrtst/samples/rocm_async/rocm_async_print.cpp
new file mode 100755
index 0000000000..4a25c82418
--- /dev/null
+++ b/rocrtst/samples/rocm_async/rocm_async_print.cpp
@@ -0,0 +1,177 @@
+#include "common.hpp"
+#include "rocm_async.hpp"
+
+// @brief: Print Help Menu Screen
+void RocmAsync::PrintHelpScreen() {
+
+  std::cout << std::endl;
+  std::cout << "Runs with following options:" << std::endl;
+  std::cout << std::endl;
+  std::cout << "\t -h Prints the help screen" << std::endl;
+  std::cout << "\t -g Prints Gpu times for transfers" << std::endl;
+  std::cout << "\t -t Prints system topology and its memory pools" << std::endl;
+  std::cout << "\t -m List of buffer sizes to use, specified in Megabytes" << std::endl;
+  std::cout << "\t -r List of pool,agent pairs engaged in Read operation" << std::endl;
+  std::cout << "\t -w List of pool,agent pairs engaged in Write operation" << std::endl;
+  std::cout << "\t -b List pools to use in bidirectional copy operations" << std::endl;
+  std::cout << "\t -s List of source pools to use in copy unidirectional operations" << std::endl;
+  std::cout << "\t -d List of destination pools to use in unidirectional copy operations" << std::endl;
+  std::cout << "\t -a Perform Unidirectional Copy involving all pool combinations" << std::endl;
+  std::cout << "\t -A Perform Bidirectional Copy involving all pool combinations" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << "\t @note 1: Removes copyReq(srcI, dstI) - where Src & Dst Pools are same" << std::endl;
+  std::cout << std::endl;
+  std::cout << "\t @note 2: Removes copyReq(srcI, dstJ) - where Src & Dst Pools are Cpu bound " << std::endl;
+  std::cout << std::endl;
+  std::cout << "\t @note 3: Treats copyReq(dstI, srcJ) as NOT EQUAL to copyReq(dstJ, srcI) " << std::endl;
+  std::cout << "\t Underlying copy engine could be different " << std::endl;
+  std::cout << std::endl;
+}
+
+// @brief: Print the topology of Memory Pools and Agents present in system
+void RocmAsync::PrintTopology() {
+
+  size_t count = agent_pool_list_.size();
+  std::cout << std::endl;
+  for (uint32_t idx = 0; idx < count; idx++) {
+    const agent_pool_info_t& node = agent_pool_list_.at(idx);
+
+    // Print agent info
+    std::cout << "Agent: " << node.agent.index_ << std::endl;
+    if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_)
+      std::cout << " Agent Device Type: CPU" << std::endl;
+    else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_)
+      std::cout << " Agent Device Type: GPU" << std::endl;
+
+    // Print pool info
+    size_t pool_count = node.pool_list.size();
+    for (uint32_t jdx = 0; jdx < pool_count; jdx++) {
+      std::cout << " Memory Pool: "
+                << node.pool_list.at(jdx).index_ << std::endl;
+      std::cout << " max allocable size in KB: "
+                << node.pool_list.at(jdx).allocable_size_ / 1024 << std::endl;
+      std::cout << " segment id: "
+                << node.pool_list.at(jdx).segment_ << std::endl;
+      std::cout << " is kernarg: "
+                << node.pool_list.at(jdx).is_kernarg_ << std::endl;
+      std::cout << " is fine-grained: "
+                << node.pool_list.at(jdx).is_fine_grained_ << std::endl;
+      std::cout << " accessible to owner: "
+                << node.pool_list.at(jdx).owner_access_ << std::endl;
+      std::cout << " accessible to all by default: "
+                << node.pool_list.at(jdx).access_to_all_ << std::endl;
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+}
+
+// @brief: Print info on agents in system
+void RocmAsync::PrintAgentsList() {
+
+  size_t count = agent_pool_list_.size();
+  for (uint32_t idx = 0; idx < count; idx++) {
+    std::cout << std::endl;
+    const agent_pool_info_t& node = agent_pool_list_.at(idx);
+    std::cout << "Agent: " << node.agent.index_ << std::endl;
+    if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_)
+      std::cout << " Agent Device Type: CPU" << std::endl;
+    else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_)
+      std::cout << " Agent Device Type: GPU" << std::endl;
+  }
+  std::cout << std::endl;
+}
+
+// @brief: Print info on memory pools in system
+void RocmAsync::PrintPoolsList() {
+
+  size_t pool_count = pool_list_.size();
+  for (uint32_t jdx = 0; jdx < pool_count; jdx++) {
+    std::cout << std::endl;
+    std::cout << "Memory Pool Idx: "
+              << pool_list_.at(jdx).index_ << std::endl;
+    std::cout << " max allocable size in KB: "
+              << pool_list_.at(jdx).allocable_size_ / 1024 << std::endl;
+    std::cout << " segment id: "
+              << pool_list_.at(jdx).segment_ << std::endl;
+    std::cout << " is kernarg: "
+              << pool_list_.at(jdx).is_kernarg_ << std::endl;
+    std::cout << " is fine-grained: "
+              << pool_list_.at(jdx).is_fine_grained_ << std::endl;
+    std::cout << " accessible to owner: "
+              << pool_list_.at(jdx).owner_access_ << std::endl;
+    std::cout << " accessible to all by default: "
+              << pool_list_.at(jdx).access_to_all_ << std::endl;
+  }
+  std::cout << std::endl;
+
+}
+
+// @brief: Print the list of transactions that will be executed
+void RocmAsync::PrintTransList() {
+
+  size_t count = trans_list_.size();
+  for (uint32_t idx = 0; idx < count; idx++) {
+    const async_trans_t& trans = trans_list_.at(idx);
+    std::cout << std::endl;
+    std::cout << " Transaction Id: " << idx << std::endl;
+    std::cout << " Transaction Type: " << trans.req_type_ << std::endl;
+    if ((trans.req_type_ == REQ_READ) || (trans.req_type_ == REQ_WRITE)) {
+      std::cout << "Rocm Kernel used by Transaction: " << trans.kernel.code_ << std::endl;
+      std::cout << "Rocm Memory Pool Used by Kernel: " << trans.kernel.pool_idx_ << std::endl;
+      std::cout << " Rocm Agent used for Execution: " << trans.kernel.agent_idx_ << std::endl;
+    }
+    if ((trans.req_type_ == REQ_COPY_BIDIR) || (trans.req_type_ == REQ_COPY_UNIDIR)) {
+      std::cout << " Src Memory Pool used in Copy: " << trans.copy.src_idx_ << std::endl;
+      std::cout << " Dst Memory Pool used in Copy: " << trans.copy.dst_idx_ << std::endl;
+    }
+
+  }
+  std::cout << std::endl;
+}
+
+// @brief: Prints error message when a request to copy between
+// source pool and destination pool is not possible
+void RocmAsync::PrintCopyAccessError(uint32_t src_idx, uint32_t dst_idx) {
+
+  // Retrieve agent index and device type of the Src memory pool's owner
+  uint32_t src_dev_idx = pool_list_[src_idx].agent_index_;
+  hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_;
+
+  // Retrieve agent index and device type of the Dst memory pool's owner
+  uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_;
+  hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_;
+
+  std::cout << std::endl;
+  std::cout << "Index of Src Pool: " << src_idx << std::endl;
+  std::cout << "Index of Dst Pool: " << dst_idx << std::endl;
+  std::cout << "Index of Src Pool's Agent: " << src_dev_idx << std::endl;
+  std::cout << "Index of Dst Pool's Agent: " << dst_dev_idx << std::endl;
+  std::cout << "Device Type of Src Pool's Agent: " << src_dev_type << std::endl;
+  std::cout << "Device Type of Dst Pool's Agent: " << dst_dev_type << std::endl;
+  std::cout << "Rocm Agent hosting Src Pool cannot ACCESS Dst Pool" << std::endl;
+  std::cout << std::endl;
+}
+
+// @brief: Prints error message when a request to read / write from
+// a pool by an agent is not possible
+void RocmAsync::PrintIOAccessError(uint32_t exec_idx, uint32_t pool_idx) {
+
+  // Retrieve device type of executing agent
+  hsa_device_type_t exec_dev_type = agent_list_[exec_idx].device_type_;
+
+  // Retrieve device type of memory pool's agent
+  uint32_t pool_dev_idx = pool_list_[pool_idx].agent_index_;
+  hsa_device_type_t pool_dev_type = agent_list_[pool_dev_idx].device_type_;
+
+  std::cout << std::endl;
+  std::cout << "Index of Executing Agent: " << exec_idx << std::endl;
+  std::cout << "Device Type of Executing Agent: " << exec_dev_type << std::endl;
+
+  std::cout << "Index of Buffer's Memory Pool: " << pool_idx << std::endl;
+  std::cout << "Index of Buffer Memory Pool's Agent: " << pool_dev_idx << std::endl;
+  std::cout << "Device Type of Buffer Memory Pool's Agent: " << pool_dev_type << std::endl;
+  std::cout << "Rocm Agent executing Read / Write request cannot ACCESS Buffer's Memory Pool" << std::endl;
+  std::cout << std::endl;
+}
diff --git a/rocrtst/samples/rocm_async/rocm_async_report.cpp b/rocrtst/samples/rocm_async/rocm_async_report.cpp
new file mode 100755
index 0000000000..b589fa574c
--- /dev/null
+++ b/rocrtst/samples/rocm_async/rocm_async_report.cpp
@@ -0,0 +1,167 @@
+#include "common.hpp"
+#include "rocm_async.hpp"
+
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+
+static void printRecord(uint32_t size, double avg_time,
+                        double bandwidth, double min_time,
+                        double peak_bandwidth) {
+
+  std::stringstream size_str;
+  size_str << size << " MB";
+
+  uint32_t format = 15;
+  std::cout.precision(3);
+  std::cout.width(format);
+  std::cout << size_str.str();
+  std::cout.width(format);
+  std::cout << (avg_time * 1e6);
+  std::cout.width(format);
+  std::cout << bandwidth;
+  std::cout.width(format);
+  std::cout << (min_time * 1e6);
+  std::cout.width(format);
+  std::cout << peak_bandwidth;
+  std::cout << std::endl;
+}
+
+static void printCopyBanner(uint32_t src_pool_id, uint32_t src_agent_type,
+                            uint32_t dst_pool_id, uint32_t dst_agent_type) {
+
+  std::stringstream src_type;
+  std::stringstream dst_type;
+  (src_agent_type == HSA_DEVICE_TYPE_CPU) ? src_type << "Cpu" : src_type << "Gpu";
+  (dst_agent_type == HSA_DEVICE_TYPE_CPU) ? dst_type << "Cpu" : dst_type << "Gpu";
+
+  std::cout << std::endl;
+  std::cout << "================";
+  std::cout << " Benchmark Result";
+  std::cout << " ================";
+  std::cout << std::endl;
+  std::cout << "================";
+  std::cout << " Src Pool Id: " << src_pool_id;
+  std::cout << " Src Agent Type: " << src_type.str();
+  std::cout << " ================";
+  std::cout << std::endl;
+  std::cout << "================";
+  std::cout << " Dst Pool Id: " << dst_pool_id;
+  std::cout << " Dst Agent Type: " << dst_type.str();
+  std::cout << " ================";
+  std::cout << std::endl;
+  std::cout << std::endl;
+
+  uint32_t format = 15;
+  std::cout.setf(std::ios::left);
+  std::cout.width(format);
+  std::cout << "Data Size";
+  std::cout.width(format);
+  std::cout << "Avg Time(us)";
+  std::cout.width(format);
+  std::cout << "Avg BW(GB/s)";
+  std::cout.width(format);
+  std::cout << "Min Time(us)";
+  std::cout.width(format);
+  std::cout << "Peak BW(GB/s)";
+  std::cout << std::endl;
+}
+
+double RocmAsync::GetMinTime(std::vector<double>& vec) {
+
+  std::sort(vec.begin(), vec.end());
+  return vec.at(0);
+}
+
+double RocmAsync::GetMeanTime(std::vector<double>& vec) {
+
+  std::sort(vec.begin(), vec.end());
+  if (!vec.empty()) vec.erase(vec.begin());  // drop the smallest sample
+  vec.erase(vec.begin(), vec.begin() + std::min<size_t>(vec.size(), static_cast<size_t>(num_iteration_ * 0.1)));  // drop next 10% smallest
+  if (vec.size() > num_iteration_) vec.erase(vec.begin() + num_iteration_, vec.end());  // keep at most num_iteration_ samples
+
+  double mean = 0.0;
+  int num = vec.size();
+  for (int it = 0; it < num; it++) {
+    mean += vec[it];
+  }
+  if (num > 0) mean /= num;  // an empty sample set yields 0.0 instead of NaN
+  return mean;
+}
+
+void RocmAsync::Display() const {
+
+  // Iterate through list of transactions and display its timing data
+  uint32_t trans_size = trans_list_.size();
+  if (trans_size == 0) {
+    std::cout << std::endl;
+    std::cout << " One or more of the requests wered filtered out " << std::endl;
+    std::cout << " i.e. No Valid Requests were Made or Remain" << std::endl;
+    std::cout << std::endl;
+    return;
+  }
+  for (uint32_t idx = 0; idx < trans_size; idx++) {
+    async_trans_t trans = trans_list_[idx];
+    if ((trans.req_type_ == REQ_COPY_BIDIR) ||
+        (trans.req_type_ == REQ_COPY_UNIDIR)) {
+      DisplayCopyTime(trans);
+    }
+    if ((trans.req_type_ == REQ_READ) ||
+        (trans.req_type_ == REQ_WRITE)) {
+      DisplayIOTime(trans);
+    }
+  }
+  std::cout << std::endl;
+}
+
+void RocmAsync::DisplayIOTime(async_trans_t& trans) const {
+
+}
+
+void RocmAsync::DisplayCopyTime(async_trans_t& trans) const {
+
+  // Get the frequency of Gpu Timestamping
+  uint64_t sys_freq = 0;
+  hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sys_freq);
+
+  // Print Benchmark Header
+  uint32_t src_idx = trans.copy.src_idx_;
+  uint32_t dst_idx = trans.copy.dst_idx_;
+  uint32_t src_dev_idx = pool_list_[src_idx].agent_index_;
+  hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_;
+  uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_;
+  hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_;
+  printCopyBanner(src_idx, src_dev_type, dst_idx, dst_dev_type);
+
+  double avg_time = 0;
+  double min_time = 0;
+  double bandwidth = 0;
+  uint64_t data_size = 0;
+  double peak_bandwidth = 0;
+  uint32_t size_len = size_list_.size();
+  for (uint32_t idx = 0; idx < size_len; idx++) {
+
+    // Adjust size of data involved in copy (64-bit so large bidirectional sizes do not wrap)
+    data_size = size_list_[idx];
+    if (trans.copy.bidir_ == true) {
+      data_size += size_list_[idx];
+    }
+    data_size = data_size * 1024 * 1024;
+
+    // Use Cpu timings when the copy does not involve a Gpu device
+    if (trans.copy.uses_gpu_ != true) {
+      avg_time = trans.cpu_avg_time_[idx];
+      min_time = trans.cpu_min_time_[idx];
+      bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000;
+      peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000;
+    } else {
+      avg_time = trans.gpu_avg_time_[idx] / sys_freq;
+      min_time = trans.gpu_min_time_[idx] / sys_freq;
+      bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000;
+      peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000;
+    }
+
+    printRecord(size_list_[idx], avg_time, bandwidth, min_time, peak_bandwidth);
+  }
+}
+
diff --git a/rocrtst/samples/rocm_async/rocm_async_topology.cpp b/rocrtst/samples/rocm_async/rocm_async_topology.cpp
new file mode 100755
index 0000000000..1e7af1abe3
--- /dev/null
+++ b/rocrtst/samples/rocm_async/rocm_async_topology.cpp
@@ -0,0 +1,114 @@
+#include "common.hpp"
+#include "rocm_async.hpp"
+
+// @brief: Helper method to iterate through the memory pools of
+// an agent and discover its properties
+hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) {
+
+  hsa_status_t status;
+  RocmAsync* asyncDrvr = reinterpret_cast<RocmAsync*>(data);
+
+  // Query pools' segment, report only pools from global segment
+  hsa_amd_segment_t segment;
+  status = hsa_amd_memory_pool_get_info(pool,
+                   HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
+  ErrorCheck(status);
+  if (HSA_AMD_SEGMENT_GLOBAL != segment) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Determine if allocation is allowed in this pool
+  // Report only pools that allow an allocation by user
+  bool alloc = false;
+  status = hsa_amd_memory_pool_get_info(pool,
+                   HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &alloc);
+  ErrorCheck(status);
+  if (alloc != true) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Query the max allocatable size
+  size_t max_size = 0;
+  status = hsa_amd_memory_pool_get_info(pool,
+                   HSA_AMD_MEMORY_POOL_INFO_SIZE, &max_size);
+  ErrorCheck(status);
+
+  // Determine if the pool is accessible to all agents
+  bool access_to_all = false;
+  status = hsa_amd_memory_pool_get_info(pool,
+                   HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &access_to_all);
+  ErrorCheck(status);
+
+  // Determine type of access to owner agent
+  hsa_amd_memory_pool_access_t owner_access;
+  hsa_agent_t agent = asyncDrvr->agent_list_.back().agent_;
+  status = hsa_amd_agent_memory_pool_get_info(agent, pool,
+                   HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &owner_access);
+  ErrorCheck(status);
+
+  // Determine if the pool is fine-grained or coarse-grained
+  uint32_t flag = 0;
+  status = hsa_amd_memory_pool_get_info(pool,
+                   HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
+  ErrorCheck(status);
+  bool is_kernarg = (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & flag);
+  bool is_fine_grained = (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & flag);
+
+  // Update the pool handle for system memory if kernarg is true
+  if (is_kernarg) {
+    asyncDrvr->sys_pool_ = pool;
+  }
+
+  // Create an instance of pool_info_t and add it to the list of pools
+  pool_info_t pool_info(agent, asyncDrvr->agent_index_, pool,
+                        segment, max_size, asyncDrvr->pool_index_,
+                        is_fine_grained, is_kernarg,
+                        access_to_all, owner_access);
+  asyncDrvr->pool_list_.push_back(pool_info);
+
+  // Also record the pool under the agent_pool_info_t entry of its agent
+  asyncDrvr->agent_pool_list_[asyncDrvr->agent_index_].pool_list.push_back(pool_info);
+  asyncDrvr->pool_index_++;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// @brief: Helper method to iterate through the agents of
+// a system and discover its properties
+hsa_status_t AgentInfo(hsa_agent_t agent, void* data) {
+
+  RocmAsync* asyncDrvr = reinterpret_cast<RocmAsync*>(data);
+
+  // Get the name of the agent
+  char agent_name[64];
+  hsa_status_t status;
+  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_name);
+  ErrorCheck(status);
+
+  // Get device type
+  hsa_device_type_t device_type;
+  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
+  ErrorCheck(status);
+
+  // Capture the handle of Cpu agent
+  if (device_type == HSA_DEVICE_TYPE_CPU) {
+    asyncDrvr->cpu_agent_ = agent;
+  }
+
+  asyncDrvr->agent_list_.push_back(agent_info(agent, asyncDrvr->agent_index_, device_type));
+
+  // Construct a new agent_pool_info structure and add it to the list
+  agent_pool_info node;
+  node.agent = asyncDrvr->agent_list_.back();
+  asyncDrvr->agent_pool_list_.push_back(node);
+
+  status = hsa_amd_agent_iterate_memory_pools(agent, MemPoolInfo, asyncDrvr);
+  asyncDrvr->agent_index_++;
+  ErrorCheck(status);
+  return HSA_STATUS_SUCCESS;
+}
+
+void RocmAsync::DiscoverTopology() {
+  err_ = hsa_iterate_agents(AgentInfo, this);
+}
+
diff --git a/rocrtst/samples/rocm_async/rocm_async_trans.cpp b/rocrtst/samples/rocm_async/rocm_async_trans.cpp
new file mode 100755
index 0000000000..33d0c54aad
--- /dev/null
+++ b/rocrtst/samples/rocm_async/rocm_async_trans.cpp
@@ -0,0 +1,173 @@
+#include "common.hpp"
+#include "rocm_async.hpp"
+
+bool RocmAsync::BuildReadOrWriteTrans(uint32_t req_type,
+                                      vector<uint32_t>& in_list) {
+
+  // Walk the list as (pool, agent) index pairs; a trailing unpaired entry is ignored
+  hsa_status_t status;
+  hsa_amd_memory_pool_access_t access;
+  uint32_t list_size = in_list.size();
+  for (uint32_t idx = 0; idx + 1 < list_size; idx+=2) {
+
+    uint32_t pool_idx = in_list[idx];
+    uint32_t exec_idx = in_list[idx + 1];
+
+    // Retrieve Roc runtime handles for memory pool and agent
+    hsa_agent_t exec_agent = agent_list_[exec_idx].agent_;
+    hsa_amd_memory_pool_t pool = pool_list_[pool_idx].pool_;
+
+    // Determine agent can access the memory pool
+    status = hsa_amd_agent_memory_pool_get_info(exec_agent, pool,
+                             HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
+    ErrorCheck(status);
+
+    // Determine if accessibility to agent is not denied
+    if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
+      PrintIOAccessError(exec_idx, pool_idx);
+      return false;
+    }
+
+    // Agent has access, build an instance of transaction
+    // and add it to the list of transactions
+    async_trans_t trans(req_type);
+    trans.kernel.code_ = nullptr;
+    trans.kernel.pool_ = pool;
+    trans.kernel.pool_idx_ = pool_idx;
+    trans.kernel.agent_ = exec_agent;
+    trans.kernel.agent_idx_ = exec_idx;
+    trans_list_.push_back(trans);
+  }
+  return true;
+}
+
+bool RocmAsync::BuildReadTrans() {
+  return BuildReadOrWriteTrans(REQ_READ, read_list_);
+}
+
+bool RocmAsync::BuildWriteTrans() {
+  return BuildReadOrWriteTrans(REQ_WRITE, write_list_);
+}
+
+bool
RocmAsync::BuildCopyTrans(uint32_t req_type, + vector& src_list, + vector& dst_list) { + + uint32_t src_size = src_list.size(); + uint32_t dst_size = dst_list.size(); + + hsa_status_t status; + hsa_amd_memory_pool_access_t access; + for (uint32_t idx = 0; idx < src_size; idx++) { + + // Retrieve Roc runtime handles for Src memory pool and agents + uint32_t src_idx = src_list[idx]; + hsa_agent_t src_agent = pool_list_[src_idx].owner_agent_; + hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_; + uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; + hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; + + for (uint32_t jdx = 0; jdx < dst_size; jdx++) { + + // Retrieve Roc runtime handles for Dst memory pool and agents + uint32_t dst_idx = dst_list[jdx]; + hsa_agent_t dst_agent = pool_list_[dst_idx].owner_agent_; + hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_; + uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; + hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; + + // Filter out transaction when Src & Dst pools belong to Cpu + if ((src_dev_type == HSA_DEVICE_TYPE_CPU) && + (dst_dev_type == HSA_DEVICE_TYPE_CPU)) { + continue; + } + + // Filter out transaction with same Src & Dst pools + if (src_idx == dst_idx) { + continue; + } + + // Determine if accessibility to src pool for dst agent is not denied + status = hsa_amd_agent_memory_pool_get_info(dst_agent, src_pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + ErrorCheck(status); + if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + PrintCopyAccessError(src_idx, dst_idx); + return false; + } + + // Determine if accessibility to dst pool for src agent is not denied + status = hsa_amd_agent_memory_pool_get_info(src_agent, dst_pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + ErrorCheck(status); + if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + return false; + } + + // Agents have access, build an instance of 
transaction + // and add it to the list of transactions + async_trans_t trans(req_type); + trans.copy.src_idx_ = src_idx; + trans.copy.dst_idx_ = dst_idx; + trans.copy.src_pool_ = src_pool; + trans.copy.dst_pool_ = dst_pool; + trans.copy.bidir_ = (req_type == REQ_COPY_BIDIR); + trans.copy.uses_gpu_ = ((src_dev_type == HSA_DEVICE_TYPE_GPU) || + (dst_dev_type == HSA_DEVICE_TYPE_GPU)); + trans_list_.push_back(trans); + } + } + return true; +} + +bool RocmAsync::BuildBidirCopyTrans() { + return BuildCopyTrans(REQ_COPY_BIDIR, bidir_list_, bidir_list_); +} + +bool RocmAsync::BuildUnidirCopyTrans() { + return BuildCopyTrans(REQ_COPY_UNIDIR, src_list_, dst_list_); +} + +// @brief: Builds a list of transaction per user request +bool RocmAsync::BuildTransList() { + + // Build list of Read transactions per user request + bool status = false; + if (req_read_ == REQ_READ) { + status = BuildReadTrans(); + if (status == false) { + return status; + } + } + + // Build list of Write transactions per user request + status = false; + if (req_write_ == REQ_WRITE) { + status = BuildWriteTrans(); + if (status == false) { + return status; + } + } + + // Build list of Bidirectional Copy transactions per user request + status = false; + if (req_copy_bidir_ == REQ_COPY_BIDIR) { + status = BuildBidirCopyTrans(); + if (status == false) { + return status; + } + } + + // Build list of Unidirectional Copy transactions per user request + status = false; + if (req_copy_unidir_ == REQ_COPY_UNIDIR) { + status = BuildUnidirCopyTrans(); + if (status == false) { + return status; + } + } + + // All of the transaction are built up + return true; +} + diff --git a/rocrtst/samples/rocm_async/rocm_async_validate.cpp b/rocrtst/samples/rocm_async/rocm_async_validate.cpp new file mode 100644 index 0000000000..81dd9e4cc2 --- /dev/null +++ b/rocrtst/samples/rocm_async/rocm_async_validate.cpp @@ -0,0 +1,155 @@ + +#include "common.hpp" +#include "rocm_async.hpp" + +#include +#include +#include +#include +#include + 
+// @brief: Determines if every entry of the input list matches the
+// index_ of some pool in pool_list_
+//
+// @param in_list List of pool indices to look up
+//
+// @return true if every entry is found (also for an empty list),
+// false as soon as one entry is not found
+bool RocmAsync::PoolIsPresent(vector<uint32_t>& in_list) {
+
+  uint32_t count = in_list.size();
+  uint32_t pool_count = pool_list_.size();
+  for (uint32_t idx1 = 0; idx1 < count; idx1++) {
+    bool is_present = false;
+    for (uint32_t idx2 = 0; idx2 < pool_count; idx2++) {
+      if (in_list[idx1] == pool_list_[idx2].index_) {
+        is_present = true;
+        break;
+      }
+    }
+    if (is_present == false) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// @brief: Checks the input list for entries that occur more than once
+//
+// @note: Despite its name the method returns true when NO entry is
+// duplicated and false when a duplicate is found. ValidateCopyReq
+// relies on this sense, so it is left as is
+//
+// @param in_list List of pool indices to check
+//
+// @return false if some value occurs more than once, true otherwise
+bool RocmAsync::PoolIsDuplicated(vector<uint32_t>& in_list) {
+
+  // Equality is symmetric, so it is enough to compare an entry
+  // with the entries that follow it
+  uint32_t count = in_list.size();
+  for (uint32_t idx1 = 0; idx1 < count; idx1++) {
+    for (uint32_t idx2 = idx1 + 1; idx2 < count; idx2++) {
+      if (in_list[idx1] == in_list[idx2]) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// @brief: Validates a Read or Write request
+//
+// @param in_list Flat list of tuples - a pool index followed by
+// an agent index
+//
+// @return false if the list has an odd number of entries, or a pool
+// index is not below pool_index_, or an agent index is not below
+// agent_index_; true otherwise
+bool RocmAsync::ValidateReadOrWriteReq(vector<uint32_t>& in_list) {
+
+  // Determine read / write request is even
+  // Request is specified as a list of memory
+  // pool, agent tuples - first element identifies
+  // memory pool while the second element denotes
+  // an agent
+  uint32_t list_size = in_list.size();
+  if ((list_size % 2) != 0) {
+    return false;
+  }
+
+  // Validate the list of pool-agent tuples
+  for (uint32_t idx = 0; idx < list_size; idx+=2) {
+    uint32_t pool_idx = in_list[idx];
+    uint32_t exec_idx = in_list[idx + 1];
+    // Determine the pool and agent exist in system
+    if ((pool_idx >= pool_index_) ||
+        (exec_idx >= agent_index_)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// @brief: Validates the Read request held in read_list_
+bool RocmAsync::ValidateReadReq() {
+  return ValidateReadOrWriteReq(read_list_);
+}
+
+// @brief: Validates the Write request held in write_list_
+bool RocmAsync::ValidateWriteReq() {
+  return ValidateReadOrWriteReq(write_list_);
+}
+
+// @brief: Validates a list of pools given for a Copy request
+//
+// @param in_list List of pool indices to validate
+//
+// @return true if the list is no longer than pool_list_, has no
+// duplicates and names only pools present in pool_list_
+bool RocmAsync::ValidateCopyReq(vector<uint32_t>& in_list) {
+
+  // Determine pool list length is valid
+  uint32_t count = in_list.size();
+  uint32_t pool_count = pool_list_.size();
+  if (count > pool_count) {
+    return false;
+  }
+
+  // Determine no pool is duplicated. PoolIsDuplicated
+  // returns false when it finds a duplicate
+  if (PoolIsDuplicated(in_list) == false) {
+    return false;
+  }
+
+  // Determine every pool is present in system
+  return PoolIsPresent(in_list);
+}
+
+// @brief: Validates the Bidirectional Copy request held in bidir_list_
+bool RocmAsync::ValidateBidirCopyReq() {
+  return ValidateCopyReq(bidir_list_);
+}
+
+// @brief: Validates the Unidirectional Copy request - both src_list_
+// and dst_list_ must be valid. dst_list_ is not examined if
+// src_list_ is found invalid
+bool RocmAsync::ValidateUnidirCopyReq() {
+  return ((ValidateCopyReq(src_list_)) && (ValidateCopyReq(dst_list_)));
+}
+
+// @brief: Validates the inputs of every operation requested by user
+//
+// @return false as soon as a requested operation is found to have
+// invalid inputs, true if all requested operations are well formed
+bool RocmAsync::ValidateArguments() {
+
+  // Determine if user has requested a READ
+  // operation and gave valid inputs
+  if ((req_read_ == REQ_READ) && (ValidateReadReq() == false)) {
+    return false;
+  }
+
+  // Determine if user has requested a WRITE
+  // operation and gave valid inputs
+  if ((req_write_ == REQ_WRITE) && (ValidateWriteReq() == false)) {
+    return false;
+  }
+
+  // Determine if user has requested a Copy
+  // operation that is bidirectional and gave
+  // valid inputs
+  if ((req_copy_bidir_ == REQ_COPY_BIDIR) &&
+      (ValidateBidirCopyReq() == false)) {
+    return false;
+  }
+
+  // Determine if user has requested a Copy
+  // operation that is unidirectional and gave
+  // valid inputs
+  if ((req_copy_unidir_ == REQ_COPY_UNIDIR) &&
+      (ValidateUnidirCopyReq() == false)) {
+    return false;
+  }
+
+  // All of the request are well formed
+  return true;
+}