diff --git a/rocrtst/samples/rocm_async/CMakeLists.txt b/rocrtst/samples/rocm_async/CMakeLists.txt new file mode 100644 index 0000000000..a1f836ff57 --- /dev/null +++ b/rocrtst/samples/rocm_async/CMakeLists.txt @@ -0,0 +1,88 @@ +cmake_minimum_required(VERSION 2.8.0) + +# +# Setup build environment +# +# 1) Setup env var ROCR_INC_DIR and ROCR_LIB_DIR to point to +# ROC Runtime header and libraries separately +# +# export ROCR_INC_DIR="Path to ROC Runtime header" +# +# export ROCR_LIB_DIR="Path to ROC Runtime libraries" +# +# 2) Make a new folder called build under root folder +# +# mkdir build +# +# 3) Enter into folder of build, and run CMAKE to generate makefile +# and make it +# +# cd build; cmake ..; make +# + +if(WIN32) + MESSAGE("Windows platfomr is not supported") + RETURN() +endif() + +if(NOT EXISTS $ENV{ROCR_INC_DIR}/hsa/hsa.h) + MESSAGE("ERROR: ROC Runtime headers can't be found under specified path") + RETURN() +endif() + +# +# Flag to enable / disable verbose output. +# +SET( CMAKE_VERBOSE_MAKEFILE on ) + +# +# Set core runtime module name +# +set ( CORE_RUNTIME_NAME "hsa-runtime" ) +set ( CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64" ) +set ( CORE_RUNTIME_LIBRARY "lib${CORE_RUNTIME_TARGET}" ) + +if(NOT EXISTS $ENV{ROCR_LIB_DIR}/${CORE_RUNTIME_LIBRARY}.so) + MESSAGE("ERROR: ROC Runtime libraries can't be found under sprcified path") + RETURN() +endif() + +set(PROJECT_NAME "rocm_async") +set(TEST_NAME "${PROJECT_NAME}") +project (${PROJECT_NAME}) + +string(TOLOWER "${CMAKE_BUILD_TYPE}" tmp) +if("${tmp}" STREQUAL "debug") + set(ISDEBUG "1") + add_definitions(-DDEBUG) +endif() + +if(ISDEBUG) + set(CMAKE_CXX_FLAGS "-std=c++11 -O0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") +else() + set(CMAKE_CXX_FLAGS "-std=c++11 -O2") +endif() + +# +# Set the remaining compiler flags +# +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") +set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -fexceptions") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics") + +INCLUDE_DIRECTORIES($ENV{ROCR_INC_DIR}) + +LINK_DIRECTORIES($ENV{ROCR_LIB_DIR}) + +# Add sources that belong to the project +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} Src) + +add_executable(rocm_async ${Src}) +target_link_libraries(rocm_async hsa-runtime64) diff --git a/rocrtst/samples/rocm_async/Readme.txt b/rocrtst/samples/rocm_async/Readme.txt new file mode 100644 index 0000000000..50d362b63c --- /dev/null +++ b/rocrtst/samples/rocm_async/Readme.txt @@ -0,0 +1,66 @@ + +Introduction: +############# + +RocmAsync is designed to capture the performance characteristics of buffer +copying and kernel read/write operations. The help screen of the benchmark +shows various options one can use in initiating copy/read/write operations. +In addition one can also query the topology of the system in terms of memory +pools and their agents + +Build Environment: +################## + +To be able to build RocmAsync, users must ensure that the build platform has +following conditions satisfied: +Build Procedure: +################ + +The following simply lists the steps to build RocmAsync + +--- Define following environment variable to specify location of header + and library files + + // Contains header files exported by ROC Runtime + ROCR_INC_DIR="Path of ROC Runtime Header Files" + + // Contains library files exported by ROC Runtime + ROCR_LIB_DIR="Path of ROC Runtime Library Files" + +--- Create a build directory. The location of build directory can be anywhere + in the file system as long as it has read / write / execute permissions for + the user invoking the commands. User can choose any valid filename for the + build directory as the examples below illustrate + + e.g. 
mkdir rocm_async/perfBuild + e.g. mkdir rocm_async-build + e.g. mkdir /rocmAsyncBuild + +--- Set working directory to be the new build directory + + e.g. cd rocm_async/perfBuild + e.g. cd rocm_async-build + e.g. cd /rocmAsyncBuild + +--- Invoke CMake to interpret build rules and generate native build files + The argument for cmake should be the root folder of RocmAsync test suite + + // Builds Release version (default) + e.g. cmake .../rocm_async + + // Builds Debug version + e.g. cmake -DCMAKE_BUILD_TYPE:STRING=Debug .../rocm_async + +--- Invoke the native build rules generated by cmake to build the various + object, library and executable files + + e.g. make + +--- Invoke the install command to copy build artifacts to pre-defined folders + of RocmAsync suite. Upon completion artifacts will be copied to the bin and + lib directories of build directory + + e.g. make install + + @note: All executables will be found in /bin folder + diff --git a/rocrtst/samples/rocm_async/base_test.cpp b/rocrtst/samples/rocm_async/base_test.cpp new file mode 100644 index 0000000000..3213a920a9 --- /dev/null +++ b/rocrtst/samples/rocm_async/base_test.cpp @@ -0,0 +1,11 @@ +#include "base_test.hpp" + +// Default Constructor +BaseTest::BaseTest(size_t num) { + + // Set num_iteration_ to the requested count (the declaration defaults it to 10) + num_iteration_ = num; +} + +BaseTest::~BaseTest() {} + diff --git a/rocrtst/samples/rocm_async/base_test.hpp b/rocrtst/samples/rocm_async/base_test.hpp new file mode 100644 index 0000000000..a268270c3e --- /dev/null +++ b/rocrtst/samples/rocm_async/base_test.hpp @@ -0,0 +1,52 @@ + +#ifndef ROCM_ASYNC_BW_BASE_TEST_H_ +#define ROCM_ASYNC_BW_BASE_TEST_H_ + +#include "hsa/hsa.h" +#include +#include +#include + +using namespace std; + +// @Brief: An interface for tests to do some basic things, + +class BaseTest { + + public: + + BaseTest(size_t num = 10); + + virtual ~BaseTest(); + + // @Brief: Allows setup procedures to be completed + // before running the benchmark test case + virtual 
void SetUp() = 0; + + // @Brief: Launches the procedures of test scenario + virtual void Run() = 0; + + // @Brief: Allows clean up procedures to be invoked + virtual void Close() = 0; + + // @Brief: Display the results + virtual void Display() const = 0; + + // @Brief: Set number of iterations to run + void set_num_iteration(size_t num) { + num_iteration_ = num; + return; + } + + // @Brief: Pre-declare some variables for derivation, the + // derived class may declare more if needed + protected: + + // @Brief: Real iteration number + uint64_t num_iteration_; + + // @Brief: Status code + hsa_status_t err_; +}; + +#endif // ROCM_ASYNC_BW_BASE_TEST_H_ diff --git a/rocrtst/samples/rocm_async/common.cpp b/rocrtst/samples/rocm_async/common.cpp new file mode 100644 index 0000000000..7e92a2ef54 --- /dev/null +++ b/rocrtst/samples/rocm_async/common.cpp @@ -0,0 +1,149 @@ +#include "common.hpp" + +void error_check(hsa_status_t hsa_error_code, int line_num, const char* str) { + if (hsa_error_code != HSA_STATUS_SUCCESS && + hsa_error_code != HSA_STATUS_INFO_BREAK) { // INFO_BREAK is what the Find* callbacks return to stop iterating + printf("HSA Error Found! 
In file: %s; At line: %d\n", str, line_num); + const char* string = nullptr; // may stay null if the status code is not recognized + hsa_status_string(hsa_error_code, &string); + printf("Error: %s\n", string ? string : "unknown status code"); + exit(EXIT_FAILURE); + } +} + +// So far, always find the first device +hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t hsa_device_type; + hsa_status_t hsa_error_code = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (hsa_device_type == HSA_DEVICE_TYPE_GPU) { + *((hsa_agent_t*)data) = agent; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t FindCpuDevice(hsa_agent_t agent, void* data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t hsa_device_type; + hsa_status_t hsa_error_code = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (hsa_device_type == HSA_DEVICE_TYPE_CPU) { + *((hsa_agent_t*)data) = agent; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t region, void* data) { + if (NULL == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; + + err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + ErrorCheck(err); + + err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + ErrorCheck(err); + + if ((HSA_AMD_SEGMENT_GLOBAL == segment) && + (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)) { + *((hsa_amd_memory_pool_t*)data) = region; + } + + return HSA_STATUS_SUCCESS; +} + +double CalcMedian(const vector<double>& scores) { + // No samples: report 0 instead of indexing an empty vector + size_t size = scores.size(); + if (size == 0) return 0.0; + + // Even count: average the two middle elements; odd count: take the middle one + if (size % 2 == 0) + return (scores[size / 2 - 1] + 
scores[size / 2]) / 2; + + return scores[size / 2]; +} + +double CalcMean(const vector<double>& scores) { + // No samples: report 0 instead of dividing by zero + if (scores.empty()) return 0.0; + + double mean = 0; + for (size_t i = 0; i < scores.size(); ++i) mean += scores[i]; + return mean / scores.size(); +} + +double CalcStdDeviation(const vector<double>& scores, double score_mean) { + // No samples: report 0 instead of dividing by zero + if (scores.empty()) return 0.0; + double ret = 0.0; + for (size_t i = 0; i < scores.size(); ++i) { + ret += (scores[i] - score_mean) * (scores[i] - score_mean); + } + + return sqrt(ret / scores.size()); +} + +int CalcConcurrentQueues(vector<double> scores) { + int num_of_concurrent_queues = 0; + vector<double> execpted_exec_time_array; + + for (size_t i = 0; i < scores.size(); ++i) { + execpted_exec_time_array.push_back(scores[0] / (1 << i)); + } + + for (size_t i = 0; i < scores.size(); ++i) { + cout << "expected exe time = " << execpted_exec_time_array[i] << endl; + } + + for (size_t i = 1; i < scores.size(); ++i) { + if ((execpted_exec_time_array[i] - scores[i]) < + 0.1 * execpted_exec_time_array[i]) + ++num_of_concurrent_queues; + } + + return num_of_concurrent_queues; +} + +/** hsa_status_t FindHostRegion(hsa_region_t region, void *data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + bool is_host_region = false; + hsa_status_t hsa_error_code = hsa_region_get_info( + region, (hsa_region_info_t)HSA_EXT_REGION_INFO_HOST_ACCESS, &is_host_region + ); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (is_host_region) { + *((hsa_region_t*)data) = region; + } + + return HSA_STATUS_SUCCESS; +} */ diff --git a/rocrtst/samples/rocm_async/common.hpp b/rocrtst/samples/rocm_async/common.hpp new file mode 100644 index 0000000000..c0a42180dd --- /dev/null +++ b/rocrtst/samples/rocm_async/common.hpp @@ -0,0 +1,48 @@ +#ifndef ROCM_ASYNC_BW_COMMON_HPP +#define ROCM_ASYNC_BW_COMMON_HPP + +#include +#include +#include +#include +#include "hsa/hsa.h" +#include "hsa/hsa_ext_amd.h" + +using namespace std; + +#if defined(_MSC_VER) +#define ALIGNED_(x) 
__declspec(align(x)) +#else +#if defined(__GNUC__) +#define ALIGNED_(x) __attribute__((aligned(x))) +#endif // __GNUC__ +#endif // _MSC_VER + +#define MULTILINE(...) #__VA_ARGS__ + +#define HSA_ARGUMENT_ALIGN_BYTES 16 + +#define ErrorCheck(x) error_check(x, __LINE__, __FILE__) + +// @Brief: Check HSA API return value +void error_check(hsa_status_t hsa_error_code, int line_num, const char* str); + +// @Brief: Find the first available GPU device +hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data); + +// @Brief: Find the first available CPU device +hsa_status_t FindCpuDevice(hsa_agent_t agent, void* data); + +// @Brief: Find the agent's global region / pool +hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t region, void* data); + +// @Brief: Calculate the mean of the vector (0 if it is empty) +double CalcMean(const vector<double>& scores); + +// @Brief: Calculate the median of the vector (0 if it is empty); indexes the middle element(s) as stored +double CalcMedian(const vector<double>& scores); + +// @Brief: Calculate the standard deviation of the vector around score_mean (0 if it is empty) +double CalcStdDeviation(const vector<double>& scores, double score_mean); + +#endif // ROCM_ASYNC_BW_COMMON_HPP diff --git a/rocrtst/samples/rocm_async/hsatimer.cpp b/rocrtst/samples/rocm_async/hsatimer.cpp new file mode 100644 index 0000000000..bdb9b182eb --- /dev/null +++ b/rocrtst/samples/rocm_async/hsatimer.cpp @@ -0,0 +1,188 @@ +#include "hsatimer.hpp" + +#define NANOSECONDS_PER_SECOND 1000000000 + +PerfTimer::PerfTimer() { + freq_in_100mhz = MeasureTSCFreqHz(); +} + +PerfTimer::~PerfTimer() { + while (!_timers.empty()) { + Timer *temp = _timers.back(); + _timers.pop_back(); + delete temp; + } +} + +// Create a new timer instance and return its index +int PerfTimer::CreateTimer() { + + Timer *newTimer = new Timer; + newTimer->_start = 0.0; + newTimer->_clocks = 0.0; + + #ifdef _WIN32 + QueryPerformanceFrequency((LARGE_INTEGER *)&newTimer->_freq); + #endif + + #ifdef __linux__ + newTimer->_freq = NANOSECONDS_PER_SECOND; + #endif + + // Save the timer object in timer list + _timers.push_back(newTimer); + return 
(int)(_timers.size() - 1); +} + +int PerfTimer::StartTimer(int index) { + + if (index < 0 || index >= (int)_timers.size()) { + Error("Cannot start timer. Invalid handle."); + return HSA_FAILURE; + } + + #ifdef _WIN32 + // General Windows timing method + #ifndef _AMD + long long tmpStart; + QueryPerformanceCounter((LARGE_INTEGER *)&(tmpStart)); + _timers[index]->_start = (double)tmpStart; + // AMD Windows timing method + #else + #endif + #endif + + #ifdef __linux__ + // General Linux timing method + #ifndef _AMD + struct timespec s; + clock_gettime(CLOCK_MONOTONIC, &s); + _timers[index]->_start = + (long long)s.tv_sec * NANOSECONDS_PER_SECOND + (long long)s.tv_nsec; + // AMD Linux timing method + #else + unsigned int unused; + _timers[index]->_start = __rdtscp(&unused); + #endif + #endif + + return HSA_SUCCESS; +} + +int PerfTimer::StopTimer(int index) { + + long long n = 0; + if (index < 0 || index >= (int)_timers.size()) { + Error("Cannot stop timer. Invalid handle."); + return HSA_FAILURE; + } + + #ifdef _WIN32 + #ifndef _AMD + long long n1; + QueryPerformanceCounter((LARGE_INTEGER *)&(n1)); + n = n1; + // AMD Window Timing + #else + #endif + #endif + + #ifdef __linux__ + // General Linux timing method + #ifndef _AMD + struct timespec s; + clock_gettime(CLOCK_MONOTONIC, &s); + n = (long long)s.tv_sec * NANOSECONDS_PER_SECOND + (long long)s.tv_nsec; + // AMD Linux timing + #else + unsigned int unused; + n = __rdtscp(&unused); + #endif + #endif + + n -= _timers[index]->_start; + _timers[index]->_start = 0; + + #ifndef _AMD + _timers[index]->_clocks += n; + #endif + + #if defined(__linux__) && defined(_AMD) + // TSC path only: n counts ticks here; without _AMD it was already added above + //_timers[index]->_clocks += 10 * n /freq_in_100mhz; // unit is ns + _timers[index]->_clocks += 1.0E-6 * 10 * n / freq_in_100mhz; // convert to ms + #endif + + return HSA_SUCCESS; +} + +void PerfTimer::Error(string str) { cout << str << endl; } + +double PerfTimer::ReadTimer(int index) { + + if (index < 0 || index >= (int)_timers.size()) { + Error("Cannot read timer. 
Invalid handle."); + return HSA_FAILURE; + } + + double reading = double(_timers[index]->_clocks); + + reading = double(reading / _timers[index]->_freq); + + return reading; +} + +void PerfTimer::ResetTimer(int index) { + + // Check that the index refers to an existing timer + if (index < 0 || index >= (int)_timers.size()) { + Error("Invalid index value\n"); + exit(1); + } + + _timers[index]->_clocks = 0.0; + _timers[index]->_start = 0.0; +} + +uint64_t PerfTimer::CoarseTimestampUs() { + + #ifdef _WIN32 + uint64_t freqHz, ticks; + QueryPerformanceFrequency((LARGE_INTEGER *)&freqHz); + QueryPerformanceCounter((LARGE_INTEGER *)&ticks); + + // Scale numerator and divisor until (ticks * 1000000) fits in uint64_t. + while (ticks > (1ULL << 44)) { + ticks /= 16; + freqHz /= 16; + } + + return (ticks * 1000000) / freqHz; + #endif + + #ifdef __linux__ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000; + #endif +} + +uint64_t PerfTimer::MeasureTSCFreqHz() { + + // Make a coarse interval measurement of TSC ticks for 1 gigacycles. + unsigned int unused; + uint64_t tscTicksEnd; + + uint64_t coarseBeginUs = CoarseTimestampUs(); + uint64_t tscTicksBegin = __rdtscp(&unused); + do { + tscTicksEnd = __rdtscp(&unused); + } while (tscTicksEnd - tscTicksBegin < 1000000000); + + uint64_t coarseEndUs = CoarseTimestampUs(); + + // Compute the TSC frequency and round to nearest 100MHz. 
+ uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000; + uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin; + return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs; +} diff --git a/rocrtst/samples/rocm_async/hsatimer.hpp b/rocrtst/samples/rocm_async/hsatimer.hpp new file mode 100644 index 0000000000..974efaa2bb --- /dev/null +++ b/rocrtst/samples/rocm_async/hsatimer.hpp @@ -0,0 +1,72 @@ + +#ifndef ROCM_ASYNC_BW_MYTIME_H_ +#define ROCM_ASYNC_BW_MYTIME_H_ + +// Will use AMD timer and general Linux timer based on users' +// need --> compilation flag. Support for windows platform is +// not currently available + +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + +#include + +#define HSA_FAILURE 1 +#define HSA_SUCCESS 0 + +class PerfTimer { + + private: + + struct Timer { + string name; /* < name name of time object*/ + long long _freq; /* < _freq frequency*/ + long long _clocks; /* < _clocks number of ticks at end*/ + long long _start; /* < _start start point ticks*/ + }; + + std::vector<Timer*> _timers; /*< _timers vector to Timer objects */ + double freq_in_100mhz; + + public: + + PerfTimer(); + ~PerfTimer(); + + private: + + // AMD timing method + uint64_t CoarseTimestampUs(); + uint64_t MeasureTSCFreqHz(); + + // General Linux timing method + + public: + + int CreateTimer(); + int StartTimer(int index); + int StopTimer(int index); + void ResetTimer(int index); + + public: + + // retrieve time + double ReadTimer(int index); + + // write into a file + double WriteTimer(int index); + + public: + void Error(string str); +}; + +#endif // ROCM_ASYNC_BW_MYTIME_H_ diff --git a/rocrtst/samples/rocm_async/main.cpp b/rocrtst/samples/rocm_async/main.cpp new file mode 100644 index 0000000000..1c2a4c7889 --- /dev/null +++ b/rocrtst/samples/rocm_async/main.cpp @@ -0,0 +1,25 @@ +#include +#include +#include "hsatimer.hpp" +#include "rocm_async.hpp" + +using namespace std; + +int main(int argc, 
char** argv) { + + // Create the Bandwidth test object + RocmAsync bw_test(argc, argv); + + // Initialize the Bandwidth test object + bw_test.SetUp(); + + // Run the Bandwidth tests requested by user + bw_test.Run(); + + // Display the time taken by various tests + bw_test.Display(); + + // Release the Bandwidth test object resources + bw_test.Close(); + return 0; +} diff --git a/rocrtst/samples/rocm_async/os.cpp b/rocrtst/samples/rocm_async/os.cpp new file mode 100644 index 0000000000..e83f5087fe --- /dev/null +++ b/rocrtst/samples/rocm_async/os.cpp @@ -0,0 +1,49 @@ + +// Compiling for Windows Platform +#ifdef _WIN32 + +#include "os.hpp" +#include +#include +#include + +void SetEnv(const char* env_var_name, const char* env_var_value) { + bool err = SetEnvironmentVariable(env_var_name, env_var_value); // despite the name, false means failure here + if (false == err) { + printf("Set environment variable failed!\n"); + exit(1); + } + return; +} + +char* GetEnv(const char* env_var_name) { + char* buff; + DWORD char_count = GetEnvironmentVariable(env_var_name, NULL, 0); // result is used as the buffer size below + if (char_count == 0) return NULL; + buff = (char*)malloc(sizeof(char) * char_count); + GetEnvironmentVariable(env_var_name, buff, char_count); + buff[char_count - 1] = '\0'; + return buff; // NOTE(review): malloc'd here, while the Linux variant returns getenv()'s pointer - callers cannot free uniformly +} + +#endif // End of Windows Code + +// Compiling for Linux Platform +#ifdef __linux__ + +#include "os.hpp" +#include + +void SetEnv(const char* env_var_name, const char* env_var_value) { + int err = setenv(env_var_name, env_var_value, 1); + if (0 != err) { + printf("Set environment variable failed!\n"); + exit(1); + } + return; +} + +char* GetEnv(const char* env_var_name) { return getenv(env_var_name); } + +#endif // End of Linux Code + diff --git a/rocrtst/samples/rocm_async/os.hpp b/rocrtst/samples/rocm_async/os.hpp new file mode 100755 index 0000000000..3b8607f507 --- /dev/null +++ b/rocrtst/samples/rocm_async/os.hpp @@ -0,0 +1,14 @@ + +#ifndef ROCM_ASYNC_BW_UTILS_OS_H_ +#define ROCM_ASYNC_BW_UTILS_OS_H_ + +#include + +// Set environment variable +void 
SetEnv(const char* env_var_name, const char* env_var_value); + +// Get the value of an environment variable +char* GetEnv(const char* env_var_name); + + +#endif // ROCM_ASYNC_BW_UTILS_OS_H_ diff --git a/rocrtst/samples/rocm_async/rocm_async.cpp b/rocrtst/samples/rocm_async/rocm_async.cpp new file mode 100755 index 0000000000..27024db926 --- /dev/null +++ b/rocrtst/samples/rocm_async/rocm_async.cpp @@ -0,0 +1,478 @@ + +#include "common.hpp" +#include "rocm_async.hpp" + +#include +#include +#include +#include +#include +#include + +// The values are in megabytes at allocation time +const uint32_t RocmAsync::SIZE_LIST[] = { 64, 128, 256, 512 }; +//const uint32_t RocmAsync::SIZE_LIST[] = { 2, 4, 8, 16, 32, 64, 128, 256, 512 }; + +uint32_t RocmAsync::GetIterationNum() { + return num_iteration_ * 1.2 + 1; // 20% more than the configured count, plus one +} + +void RocmAsync::AcquireAccess(hsa_agent_t agent, void* ptr) { + err_ = hsa_amd_agents_allow_access(1, &agent, NULL, ptr); + ErrorCheck(err_); +} + +void RocmAsync::AllocateHostBuffers(bool bidir, uint32_t size, + void*& src_fwd, void*& dst_fwd, + void* buf_src_fwd, void* buf_dst_fwd, + hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd, + void*& src_rev, void*& dst_rev, + void* buf_src_rev, void* buf_dst_rev, + hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev, + hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev) { + + // Allocate host buffers from sys_pool_ and setup accessibility for copy operation + err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, (void**)&src_fwd); + ErrorCheck(err_); + AcquireAccess(src_agent_fwd, src_fwd); + AcquireAccess(cpu_agent_, buf_src_fwd); + + err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, (void**)&dst_fwd); + ErrorCheck(err_); + AcquireAccess(dst_agent_fwd, dst_fwd); + AcquireAccess(cpu_agent_, buf_dst_fwd); + + // Initialize host buffers: source filled with 0x23, destination zeroed + memset(src_fwd, 0x23, size); + memset(dst_fwd, 0x00, size); + + // Create a signal to wait on copy operation + // @TODO: replace it with a signal pool call + err_ = 
hsa_signal_create(1, 0, NULL, &signal_fwd); + ErrorCheck(err_); + + if (bidir == false) { + return; // unidirectional: the reverse buffers and signal below are not set up + } + + err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, (void**)&src_rev); + ErrorCheck(err_); + AcquireAccess(src_agent_rev, src_rev); + AcquireAccess(cpu_agent_, buf_src_rev); + + err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, (void**)&dst_rev); + ErrorCheck(err_); + AcquireAccess(dst_agent_rev, dst_rev); + AcquireAccess(cpu_agent_, buf_dst_rev); + + // Initialize host buffers: source filled with 0x23, destination zeroed + memset(src_rev, 0x23, size); + memset(dst_rev, 0x00, size); + + err_ = hsa_signal_create(1, 0, NULL, &signal_rev); + ErrorCheck(err_); +} + +void RocmAsync::AllocateCopyBuffers(bool bidir, uint32_t size, + void*& src_fwd, hsa_amd_memory_pool_t src_pool_fwd, + void*& dst_fwd, hsa_amd_memory_pool_t dst_pool_fwd, + hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd, + void*& src_rev, hsa_amd_memory_pool_t src_pool_rev, + void*& dst_rev, hsa_amd_memory_pool_t dst_pool_rev, + hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev, + hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev) { + + // Allocate buffers in src and dst pools for forward copy + err_ = hsa_amd_memory_pool_allocate(src_pool_fwd, size, 0, &src_fwd); + ErrorCheck(err_); + err_ = hsa_amd_memory_pool_allocate(dst_pool_fwd, size, 0, &dst_fwd); + ErrorCheck(err_); + + // Allocate buffers in src and dst pools for reverse copy + if (bidir) { + err_ = hsa_amd_memory_pool_allocate(src_pool_rev, size, 0, &src_rev); + ErrorCheck(err_); + err_ = hsa_amd_memory_pool_allocate(dst_pool_rev, size, 0, &dst_rev); + ErrorCheck(err_); + } + + // Forward copy: give each agent access to the opposite side's buffer + AcquireAccess(src_agent_fwd, dst_fwd); + AcquireAccess(dst_agent_fwd, src_fwd); + + // Reverse copy: give each agent access to the opposite side's buffer + if (bidir) { + AcquireAccess(src_agent_rev, dst_rev); + AcquireAccess(dst_agent_rev, src_rev); + } + + // Create a signal to wait on copy operation + // @TODO: replace 
it with a signal pool call + err_ = hsa_signal_create(1, 0, NULL, &signal_fwd); + ErrorCheck(err_); + if (bidir) { + err_ = hsa_signal_create(1, 0, NULL, &signal_rev); + ErrorCheck(err_); + } +} + +void RocmAsync::ReleaseBuffers(bool bidir, + void* src_fwd, void* src_rev, + void* dst_fwd, void* dst_rev, + hsa_signal_t signal_fwd, + hsa_signal_t signal_rev) { + + // Free the src and dst buffers used in forward copy + // including the signal used to wait + err_ = hsa_amd_memory_pool_free(src_fwd); + ErrorCheck(err_); + err_ = hsa_amd_memory_pool_free(dst_fwd); + ErrorCheck(err_); + err_ = hsa_signal_destroy(signal_fwd); + ErrorCheck(err_); + + // Free the src and dst buffers used in reverse copy + // including the signal used to wait + if (bidir) { + err_ = hsa_amd_memory_pool_free(src_rev); + ErrorCheck(err_); + err_ = hsa_amd_memory_pool_free(dst_rev); + ErrorCheck(err_); + err_ = hsa_signal_destroy(signal_rev); + ErrorCheck(err_); + } +} + +double RocmAsync::GetGpuCopyTime(bool bidir, + hsa_signal_t signal_fwd, + hsa_signal_t signal_rev) { + + // Obtain time taken for forward copy + hsa_amd_profiling_async_copy_time_t async_time_fwd = {0}; + err_= hsa_amd_profiling_get_async_copy_time(signal_fwd, &async_time_fwd); + ErrorCheck(err_); + if (bidir == false) { + return(async_time_fwd.end - async_time_fwd.start); + } + + hsa_amd_profiling_async_copy_time_t async_time_rev = {0}; + err_= hsa_amd_profiling_get_async_copy_time(signal_rev, &async_time_rev); + ErrorCheck(err_); + double start = min(async_time_fwd.start, async_time_rev.start); + double end = max(async_time_fwd.end, async_time_rev.end); + return(end - start); // bidirectional: span from the earliest start to the latest end +} + +void RocmAsync::copy_buffer(void* dst, hsa_agent_t dst_agent, + void* src, hsa_agent_t src_agent, + size_t size, hsa_signal_t signal) { + + // Copy from src into dst buffer + err_ = hsa_amd_memory_async_copy(dst, dst_agent, + src, src_agent, + size, 0, NULL, signal); + ErrorCheck(err_); + + // Wait for the copy operation to complete (signal value drops below 1) + while 
(hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, + uint64_t(-1), HSA_WAIT_STATE_ACTIVE)); +} + +void RocmAsync::RunCopyBenchmark(async_trans_t& trans) { + + // Bind if this transaction is bidirectional + bool bidir = trans.copy.bidir_; + + // Size the buffers from the last size-list entry, scaled by 1024 * 1024 + uint32_t size_len = size_list_.size(); + uint32_t max_size = size_list_.back() * 1024 * 1024; + + // Bind to resources such as pool and agents that are involved + // in both forward and reverse copy operations + void* buf_src_fwd; + void* buf_dst_fwd; + void* buf_src_rev; + void* buf_dst_rev; + void* host_src_fwd; + void* host_dst_fwd; + void* host_src_rev; + void* host_dst_rev; + hsa_signal_t signal_fwd; + hsa_signal_t signal_rev; + hsa_signal_t host_signal_fwd; + hsa_signal_t host_signal_rev; + hsa_amd_memory_pool_t src_pool_fwd = trans.copy.src_pool_; + hsa_amd_memory_pool_t dst_pool_fwd = trans.copy.dst_pool_; + hsa_amd_memory_pool_t src_pool_rev = dst_pool_fwd; // reverse copy swaps the forward pools and agents + hsa_amd_memory_pool_t dst_pool_rev = src_pool_fwd; + hsa_agent_t src_agent_fwd = pool_list_[trans.copy.src_idx_].owner_agent_; + hsa_agent_t dst_agent_fwd = pool_list_[trans.copy.dst_idx_].owner_agent_; + hsa_agent_t src_agent_rev = dst_agent_fwd; + hsa_agent_t dst_agent_rev = src_agent_fwd; + + // Allocate buffers and signal objects + AllocateCopyBuffers(bidir, max_size, + buf_src_fwd, src_pool_fwd, + buf_dst_fwd, dst_pool_fwd, + src_agent_fwd, dst_agent_fwd, + buf_src_rev, src_pool_rev, + buf_dst_rev, dst_pool_rev, + src_agent_rev, dst_agent_rev, + signal_fwd, signal_rev); + + if (verify_) { + AllocateHostBuffers(bidir, max_size, + host_src_fwd, host_dst_fwd, + buf_src_fwd, buf_dst_fwd, + src_agent_fwd, dst_agent_fwd, + host_src_rev, host_dst_rev, + buf_src_rev, buf_dst_rev, + src_agent_rev, dst_agent_rev, + host_signal_fwd, host_signal_rev); + + // Initialize source buffer with values from verification buffer + copy_buffer(buf_src_fwd, src_agent_fwd, + host_src_fwd, cpu_agent_, 
+ max_size, host_signal_fwd); + ErrorCheck(err_); + if (bidir) { + copy_buffer(buf_src_rev, src_agent_rev, + host_src_rev, cpu_agent_, + max_size, host_signal_rev); + ErrorCheck(err_); + } + } + + // Bind the number of iterations; construct the timer once, since its constructor spins to calibrate + uint32_t iterations = GetIterationNum(); + PerfTimer timer; + + // Iterate through the different buffer sizes to + // compute the bandwidth as determined by copy + for (uint32_t idx = 0; idx < size_len; idx++) { + + // This should not be happening + uint32_t curr_size = size_list_[idx] * 1024 * 1024; + if (curr_size > max_size) { + break; + } + + std::vector<double> cpu_time; + std::vector<double> gpu_time; + for (uint32_t it = 0; it < iterations; it++) { + #if DEBUG + printf("."); + fflush(stdout); + #endif + + hsa_signal_store_relaxed(signal_fwd, 1); + if (bidir) { + hsa_signal_store_relaxed(signal_rev, 1); + } + + if (verify_) { + AcquireAccess(src_agent_fwd, buf_dst_fwd); + AcquireAccess(dst_agent_fwd, buf_src_fwd); + if (bidir) { + AcquireAccess(src_agent_rev, buf_dst_rev); + AcquireAccess(dst_agent_rev, buf_src_rev); + } + } + + // Create a fresh timer slot (zeroed counters) for this iteration + uint32_t index = timer.CreateTimer(); + + // Start the timer and launch forward copy operation + timer.StartTimer(index); + err_ = hsa_amd_memory_async_copy(buf_dst_fwd, dst_agent_fwd, + buf_src_fwd, src_agent_fwd, + curr_size, 0, NULL, signal_fwd); + ErrorCheck(err_); + + // Launch reverse copy operation if it is bidirectional + if (bidir) { + err_ = hsa_amd_memory_async_copy(buf_dst_rev, dst_agent_rev, + buf_src_rev, src_agent_rev, + curr_size, 0, NULL, signal_rev); + ErrorCheck(err_); + } + + // Wait for the forward copy operation to complete + while (hsa_signal_wait_acquire(signal_fwd, HSA_SIGNAL_CONDITION_LT, 1, + uint64_t(-1), HSA_WAIT_STATE_ACTIVE)); + + // Wait for the reverse copy operation to complete + if (bidir) { + while (hsa_signal_wait_acquire(signal_rev, HSA_SIGNAL_CONDITION_LT, 1, + uint64_t(-1), HSA_WAIT_STATE_ACTIVE)); + } + + // Stop the timer object + 
timer.StopTimer(index); + + // Push the time taken for copy into a vector of copy times + cpu_time.push_back(timer.ReadTimer(index)); + + // Collect time from the signal(s) + if (trans.copy.uses_gpu_) { + double temp = GetGpuCopyTime(bidir, signal_fwd, signal_rev); + gpu_time.push_back(temp); + } + + if (verify_) { + + // Re-Establish access to destination buffer and host buffer + AcquireAccess(cpu_agent_, buf_dst_fwd); + AcquireAccess(dst_agent_fwd, host_dst_fwd); + + // Init dst buffer with values from outbuffer of copy operation + hsa_signal_store_relaxed(host_signal_fwd, 1); + copy_buffer(host_dst_fwd, cpu_agent_, + buf_dst_fwd, dst_agent_fwd, + curr_size, host_signal_fwd); + ErrorCheck(err_); + + // Compare output equals input; a raw memcmp result of 1 would alias HSA_STATUS_INFO_BREAK and be ignored + err_ = memcmp(host_src_fwd, host_dst_fwd, curr_size) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; + ErrorCheck(err_); + + if (bidir) { + + // Re-Establish access to destination buffer and host buffer + AcquireAccess(cpu_agent_, buf_dst_rev); + AcquireAccess(dst_agent_rev, host_dst_rev); + + hsa_signal_store_relaxed(host_signal_rev, 1); + copy_buffer(host_dst_rev, cpu_agent_, + buf_dst_rev, dst_agent_rev, + curr_size, host_signal_rev); + ErrorCheck(err_); + + // Compare output equals input; map any mismatch to a real error status + err_ = memcmp(host_src_rev, host_dst_rev, curr_size) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; + ErrorCheck(err_); + } + } + } + #if DEBUG + std::cout << std::endl; + #endif + + // Get Cpu min copy time + trans.cpu_min_time_.push_back(GetMinTime(cpu_time)); + // Get Cpu mean copy time and store to the array + trans.cpu_avg_time_.push_back(GetMeanTime(cpu_time)); + + if (trans.copy.uses_gpu_) { + // Get Gpu min copy time + trans.gpu_min_time_.push_back(GetMinTime(gpu_time)); + // Get Gpu mean copy time and store to the array + trans.gpu_avg_time_.push_back(GetMeanTime(gpu_time)); + } + + // Clear the stack of cpu times + cpu_time.clear(); + gpu_time.clear(); + } + + // Free up buffers and signal objects used in copy operation + ReleaseBuffers(bidir, buf_src_fwd, buf_src_rev, + buf_dst_fwd, buf_dst_rev, 
signal_fwd, signal_rev);
+
+  if (verify_) {
+    ReleaseBuffers(bidir, host_src_fwd, host_src_rev,
+                   host_dst_fwd, host_dst_rev, host_signal_fwd, host_signal_rev);
+  }
+}
+
+// Handler for read / write transactions. Nothing is measured here:
+// it prints a notice and terminates the process with exit status 1.
+void RocmAsync::RunIOBenchmark(async_trans_t& trans) {
+
+  std::cout << "Unsupported Request - Read / Write" << std::endl;
+  exit(1);
+}
+
+// Executes every transaction in trans_list_. Profiling of async copy
+// activity is enabled before the first transaction and disabled after
+// the last one. Copy requests are dispatched to RunCopyBenchmark and
+// read / write requests to RunIOBenchmark.
+void RocmAsync::Run() {
+
+  // Enable profiling of Async Copy Activity
+  err_ = hsa_amd_profiling_async_copy_enable(true);
+  ErrorCheck(err_);
+
+  // Iterate through the list of transactions and execute them
+  uint32_t trans_size = trans_list_.size();
+  for (uint32_t idx = 0; idx < trans_size; idx++) {
+    async_trans_t& trans = trans_list_[idx];
+    if ((trans.req_type_ == REQ_COPY_BIDIR) ||
+        (trans.req_type_ == REQ_COPY_UNIDIR)) {
+      RunCopyBenchmark(trans);
+    }
+    if ((trans.req_type_ == REQ_READ) ||
+        (trans.req_type_ == REQ_WRITE)) {
+      RunIOBenchmark(trans);
+    }
+  }
+
+  // Disable profiling of Async Copy Activity
+  err_ = hsa_amd_profiling_async_copy_enable(false);
+  ErrorCheck(err_);
+
+}
+
+// Calls hsa_shut_down() and checks the status it returns
+void RocmAsync::Close() {
+  hsa_status_t status = hsa_shut_down();
+  ErrorCheck(status);
+  return;
+}
+
+// Sets up the bandwidth test object to enable running
+// the various test scenarios requested by user. The
+// things this procedure takes care of are:
+//
+//    Parse user arguments
+//    Discover RocR Device Topology
+//    Determine validity of requested test scenarios
+//    Build the list of transactions to execute
+//    Miscellaneous
+//
+// The help screen is printed and the process exits with status 1
+// when either validation or building the transaction list fails.
+void RocmAsync::SetUp() {
+
+  // Parse user arguments
+  ParseArguments();
+
+  // Validate input parameters
+  bool status = ValidateArguments();
+  if (status == false) {
+    PrintHelpScreen();
+    exit(1);
+  }
+
+  // Build list of transactions (copy, read, write) to execute
+  status = BuildTransList();
+  if (status == false) {
+    PrintHelpScreen();
+    exit(1);
+  }
+
+  // Print Debug Info - List of Agents, Pool, Transactions
+  // Enabled by defining the environment variable PRINT_DEBUG
+  char* print_debug = getenv("PRINT_DEBUG");
+  if (print_debug) {
+    //PrintAgentsList();
+    //PrintPoolsList();
+    PrintTransList();
+    //PrintTopology();
+  }
+}
+
+// Records the command line for later parsing and resets the pool and
+// agent counters, the verification flag and all request types.
+// NOTE(review): cpu_agent_ and sys_pool_ are not assigned here.
+RocmAsync::RocmAsync(int argc, char** argv) : BaseTest() {
+  usr_argc_ = argc;
+  usr_argv_ = argv;
+  verify_ = false;
+  pool_index_ = 0;
+  agent_index_ = 0;
+  req_read_ = REQ_INVALID;
+  req_write_ = REQ_INVALID;
+  req_copy_bidir_ = REQ_INVALID;
+  req_copy_unidir_ = REQ_INVALID;
+}
+
+RocmAsync::~RocmAsync() { }
+
diff --git a/rocrtst/samples/rocm_async/rocm_async.hpp b/rocrtst/samples/rocm_async/rocm_async.hpp
new file mode 100644
index 0000000000..f230f90c30
--- /dev/null
+++ b/rocrtst/samples/rocm_async/rocm_async.hpp
@@ -0,0 +1,338 @@
+#ifndef __ROCM_ASYNC_BW_H__
+#define __ROCM_ASYNC_BW_H__
+
+#include "hsa/hsa.h"
+#include "base_test.hpp"
+#include "hsatimer.hpp"
+#include "common.hpp"
+// NOTE(review): the header name of the following include is missing
+// in this copy of the patch - confirm against the original commit
+#include 
+
+using namespace std;
+
+// Structure to encapsulate a RocR agent and its index in a list
+typedef struct agent_info {
+
+  agent_info(hsa_agent_t agent,
+             uint32_t index, hsa_device_type_t device_type) {
+    agent_ = agent;
+    index_ = index;
+    device_type_ = device_type;
+  }
+
+  // Leaves all members unassigned
+  agent_info() {}
+
+  uint32_t index_;
+  hsa_agent_t agent_;
+  hsa_device_type_t device_type_;
+
+} agent_info_t;
+
+// Structure to encapsulate a RocR memory pool, the agent that owns
+// it and the properties queried for it
+typedef struct pool_info {
+
+  pool_info(hsa_agent_t agent, uint32_t 
agent_index, + hsa_amd_memory_pool_t pool, hsa_amd_segment_t segment, + size_t size, uint32_t index, bool is_fine_grained, + bool is_kernarg, bool access_to_all, + hsa_amd_memory_pool_access_t owner_access) { + + pool_ = pool; + index_ = index; + segment_ = segment; + owner_agent_ = agent; + agent_index_ = agent_index; + allocable_size_ = size; + is_kernarg_ = is_kernarg; + owner_access_ = owner_access; + access_to_all_ = access_to_all; + is_fine_grained_ = is_fine_grained; + } + + pool_info() {} + + uint32_t index_; + bool is_kernarg_; + bool access_to_all_; + bool is_fine_grained_; + size_t allocable_size_; + uint32_t agent_index_; + hsa_agent_t owner_agent_; + hsa_amd_segment_t segment_; + hsa_amd_memory_pool_t pool_; + hsa_amd_memory_pool_access_t owner_access_; + +} pool_info_t; + +// Used to print out topology info +typedef struct agent_pool_info { + + agent_pool_info() {} + + agent_info agent; + + vector pool_list; + +} agent_pool_info_t; + +typedef struct async_trans { + + uint32_t req_type_; + union { + struct { + bool bidir_; + bool uses_gpu_; + uint32_t src_idx_; + uint32_t dst_idx_; + hsa_amd_memory_pool_t src_pool_; + hsa_amd_memory_pool_t dst_pool_; + } copy; + struct { + void* code_; + uint32_t agent_idx_; + hsa_agent_t agent_; + uint32_t pool_idx_; + hsa_amd_memory_pool_t pool_; + } kernel; + }; + + // Cpu BenchMark average copy time + vector cpu_avg_time_; + + // Cpu Min time + vector cpu_min_time_; + + // Gpu BenchMark average copy time + vector gpu_avg_time_; + + // Gpu Min time + vector gpu_min_time_; + + async_trans(uint32_t req_type) { req_type_ = req_type; } +} async_trans_t; + +typedef enum Request_Type { + + REQ_READ = 1, + REQ_WRITE = 2, + REQ_COPY_BIDIR = 3, + REQ_COPY_UNIDIR = 4, + REQ_INVALID = 5, + +} Request_Type; + +class RocmAsync : public BaseTest { + + public: + + // @brief: Constructor for test case of RocmAsync + RocmAsync(int argc, char** argv); + + // @brief: Destructor for test case of RocmAsync + virtual ~RocmAsync(); + + // 
@brief: Setup the environment for measurement + virtual void SetUp(); + + // @brief: Core measurement execution + virtual void Run(); + + // @brief: Clean up and retrive the resource + virtual void Close(); + + // @brief: Display the results + virtual void Display() const; + + private: + + // @brief: Print Help Menu Screen + void PrintHelpScreen(); + + // @brief: Discover the topology of pools on Rocm Platform + void DiscoverTopology(); + + // @brief: Print topology info + void PrintTopology(); + + // @brief: Print info on agents in system + void PrintAgentsList(); + + // @brief: Print info on memory pools in system + void PrintPoolsList(); + + // @brief: Parse the arguments provided by user to + // build list of transactions + void ParseArguments(); + + // @brief: Print the list of transactions + void PrintTransList(); + + // @brief: Run read/write requests of users + void RunIOBenchmark(async_trans_t& trans); + + // @brief: Run copy requests of users + void RunCopyBenchmark(async_trans_t& trans); + + // @brief: Get iteration number + uint32_t GetIterationNum(); + + // @brief: Get the mean copy time + double GetMeanTime(std::vector& vec); + + // @brief: Get the min copy time + double GetMinTime(std::vector& vec); + + // @brief: Dispaly Benchmark result + void DisplayIOTime(async_trans_t& trans) const; + void DisplayCopyTime(async_trans_t& trans) const; + + private: + + // @brief: Validate the arguments passed in by user + bool ValidateArguments(); + bool ValidateReadReq(); + bool ValidateWriteReq(); + bool ValidateReadOrWriteReq(vector& in_list); + + bool ValidateBidirCopyReq(); + bool ValidateUnidirCopyReq(); + bool ValidateCopyReq(vector& in_list); + void PrintIOAccessError(uint32_t agent_idx, uint32_t pool_idx); + void PrintCopyAccessError(uint32_t src_pool_idx, uint32_t dst_pool_idx); + + bool PoolIsPresent(vector& in_list); + bool PoolIsDuplicated(vector& in_list); + + // @brief: Builds a list of transaction per user request + bool BuildTransList(); + bool 
BuildReadTrans(); + bool BuildWriteTrans(); + bool BuildBidirCopyTrans(); + bool BuildUnidirCopyTrans(); + bool BuildReadOrWriteTrans(uint32_t req_type, + vector& in_list); + bool BuildCopyTrans(uint32_t req_type, + vector& src_list, + vector& dst_list); + + void AllocateCopyBuffers(bool bidir, uint32_t size, + void*& src_fwd, hsa_amd_memory_pool_t src_pool_fwd, + void*& dst_fwd, hsa_amd_memory_pool_t dst_pool_fwd, + hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd, + void*& src_rev, hsa_amd_memory_pool_t src_pool_rev, + void*& dst_rev, hsa_amd_memory_pool_t dst_pool_rev, + hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev, + hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev); + void ReleaseBuffers(bool bidir, + void* src_fwd, void* src_rev, + void* dst_fwd, void* dst_rev, + hsa_signal_t signal_fwd, hsa_signal_t signal_rev); + double GetGpuCopyTime(bool bidir, hsa_signal_t signal_fwd, hsa_signal_t signal_rev); + void AllocateHostBuffers(bool bidir, uint32_t size, + void*& src_fwd, void*& dst_fwd, + void* buf_src_fwd, void* buf_dst_fwd, + hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd, + void*& src_rev, void*& dst_rev, + void* buf_src_rev, void* buf_dst_rev, + hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev, + hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev); + void copy_buffer(void* dst, hsa_agent_t dst_agent, + void* src, hsa_agent_t src_agent, + size_t size, hsa_signal_t signal); + + // @brief: Check if agent and access memory pool, if so, set + // access to the agent, if not, exit + void AcquireAccess(hsa_agent_t agent, void* ptr); + + // Functions to find agents and memory pools and udpate + // relevant data structures used to maintain system topology + friend hsa_status_t AgentInfo(hsa_agent_t agent, void* data); + friend hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data); + + protected: + + // More variables declared for testing + // vector tran_; + + // Used to help count agent_info + uint32_t agent_index_; + + // List 
used to store agent info, indexed by agent_index_ + vector agent_list_; + + // Used to help count pool_info_t + uint32_t pool_index_; + + // List used to store pool_info_t, indexed by pool_index_ + vector pool_list_; + + // List used to store agent_pool_info_t + vector agent_pool_list_; + + // List of agents involved in a bidrectional copy operation + // Size of the list cannot exceed the number of agents + // reported by the system + vector bidir_list_; + + // List of source agents in a unidrectional copy operation + // Size of the list cannot exceed the number of agents + // reported by the system + vector src_list_; + + // List of destination agents in a unidrectional copy operation + // Size of the list cannot exceed the number of agents + // reported by the system + vector dst_list_; + + // List of agents involved in read operation. Has + // two agents, the first agent hosts the memory pool + // while the second agent executes the read operation + vector read_list_; + + // List of agents involved in write operation. 
Has + // two agents, the first agent hosts the memory pool + // while the second agent executes the write operation + vector write_list_; + + // List of sizes to use in copy and read/write transactions + // Size is specified in terms of Megabytes + vector size_list_; + + // Type of service requested by user + uint32_t req_read_; + uint32_t req_write_; + uint32_t req_copy_bidir_; + uint32_t req_copy_unidir_; + + // List used to store transactions per user request + vector trans_list_; + + // Variable to store argument number + + // Variable to store argument number + + // Variable to store argument number + uint32_t usr_argc_; + + // Pointer to store address of argument text + char** usr_argv_; + + // BenchMark copy time + vector op_time_; + + // Min time + vector min_time_; + + // Determines if user has requested verification + bool verify_; + + // CPU agent used for verification + hsa_agent_t cpu_agent_; + + // System region + hsa_amd_memory_pool_t sys_pool_; + + static const uint32_t SIZE_LIST[4]; + //static const uint32_t SIZE_LIST[9]; + +}; + +#endif diff --git a/rocrtst/samples/rocm_async/rocm_async_parse.cpp b/rocrtst/samples/rocm_async/rocm_async_parse.cpp new file mode 100755 index 0000000000..2461896d84 --- /dev/null +++ b/rocrtst/samples/rocm_async/rocm_async_parse.cpp @@ -0,0 +1,206 @@ +#include "common.hpp" +#include "rocm_async.hpp" + +#include +#include +#include + +// Parse option value string. The string has one more decimal +// values separated by comma - "3,6,9,12,15". 
+static bool ParseOptionValue(char* value, vector<uint32_t>& value_list) {
+
+  // A missing option string cannot be parsed
+  if (value == nullptr) {
+    return false;
+  }
+
+  // Capture the option value string
+  std::stringstream stream;
+  stream << value;
+
+  uint32_t token = 0;
+  while (true) {
+
+    // Read the option value. Extraction fails for an empty string,
+    // for non-numeric text and after a trailing comma. Reject such
+    // input instead of recording a value the user never supplied
+    if (!(stream >> token)) {
+      return false;
+    }
+
+    // Update output list with values
+    value_list.push_back(token);
+
+    // The list is complete once the last number has been consumed
+    if (stream.eof()) {
+      return true;
+    }
+
+    // Only the delimiter may follow a value
+    if (stream.peek() != ',') {
+      return false;
+    }
+    stream.ignore();
+  }
+}
+
+// Parses the command line stored by the constructor, initializes the
+// Roc Runtime, discovers the system topology and fills in the lists
+// of pools / sizes the requested transactions will use. Prints the
+// help screen and exits when an option is unknown or malformed, and
+// prints the topology and exits when "-t" is given.
+void RocmAsync::ParseArguments() {
+
+  bool print_help = false;
+  bool copy_all_bi = false;
+  bool copy_all_uni = false;
+  bool print_topology = false;
+
+  // This will suppress prints from getopt implementation
+  // In case of error, it will return the character '?' as
+  // return value.
+  opterr = 0;
+
+  int opt;
+  bool status;
+  while ((opt = getopt(usr_argc_, usr_argv_, "hvtaAb:s:d:r:w:m:")) != -1) {
+    switch (opt) {
+
+      // Print help screen
+      case 'h':
+        print_help = true;
+        break;
+
+      // Print system topology
+      case 't':
+        print_topology = true;
+        break;
+
+      // Set verification flag to true
+      case 'v':
+        verify_ = true;
+        break;
+
+      // Collect list of agents involved in bidirectional copy operation
+      case 'b':
+        status = ParseOptionValue(optarg, bidir_list_);
+        if (status) {
+          req_copy_bidir_ = REQ_COPY_BIDIR;
+          break;
+        }
+        print_help = true;
+        break;
+
+      // Collect list of source pools involved in unidirectional copy operation
+      case 's':
+        status = ParseOptionValue(optarg, src_list_);
+        if (status) {
+          req_copy_unidir_ = REQ_COPY_UNIDIR;
+          break;
+        }
+        print_help = true;
+        break;
+
+      // Collect list of destination pools involved in unidirectional copy operation
+      case 'd':
+        status = ParseOptionValue(optarg, dst_list_);
+        if (status) {
+          req_copy_unidir_ = REQ_COPY_UNIDIR;
+          break;
+        }
+        print_help = true;
+        break;
+
+      // Collect request to read a buffer
+      case 'r':
+        req_read_ = REQ_READ;
+        status = ParseOptionValue(optarg, read_list_);
+        if (status == false) {
+          print_help = true;
+        }
+        break;
+
+      // Collect request to write a buffer
+      case 'w':
+        req_write_ = REQ_WRITE;
+        status = ParseOptionValue(optarg, write_list_);
+        if (status == false) {
+          print_help = true;
+        }
+        break;
+
+      // Size of buffers to use in copy and read/write operations
+      case 'm':
+        status = ParseOptionValue(optarg, size_list_);
+        if (status == false) {
+          print_help = true;
+        }
+        break;
+
+      // Enable Unidirectional copy among all valid pools
+      case 'a':
+        copy_all_uni = true;
+        req_copy_unidir_ = REQ_COPY_UNIDIR;
+        break;
+
+      // Enable Bidirectional copy among all valid pools
+      case 'A':
+        copy_all_bi = true;
+        req_copy_bidir_ = REQ_COPY_BIDIR;
+        break;
+
+      // getopt implementation returns the value of the unknown
+      // option or an option with missing operand in the variable
+      // optopt. Report that character, and name every option of the
+      // getopt string above that takes an argument
+      case '?':
+        std::cout << "Value of optopt is: " << static_cast<char>(optopt) << std::endl;
+        if ((optopt == 'b' || optopt == 's' || optopt == 'd' ||
+             optopt == 'r' || optopt == 'w' || optopt == 'm')) {
+          std::cout << "Error: Option -b -s -d -r -w and -m require argument" << std::endl;
+        }
+        print_help = true;
+        break;
+      default:
+        print_help = true;
+        break;
+    }
+  }
+
+  // Print help screen if user option has "-h"
+  if (print_help) {
+    PrintHelpScreen();
+    exit(0);
+  }
+
+  // Initialize Roc Runtime
+  err_ = hsa_init();
+  ErrorCheck(err_);
+
+  // Discover the topology of RocR agent in system
+  DiscoverTopology();
+
+  // Print system topology if user option has "-t"
+  if (print_topology) {
+    PrintTopology();
+    exit(0);
+  }
+
+  // Invalidate request if user has requested full
+  // copying for both unidirectional and bidirectional
+  if ((copy_all_bi) && (copy_all_uni)) {
+    PrintHelpScreen();
+    exit(0);
+  }
+
+  // Initialize pool list if full copying in unidirectional mode is enabled
+  if (copy_all_uni) {
+    uint32_t size = pool_list_.size();
+    for (uint32_t idx = 0; idx < size; idx++) {
+      src_list_.push_back(idx);
+      dst_list_.push_back(idx);
+    }
+  }
+
+  // Initialize pool list if full copying in bidirectional mode is enabled
+  if (copy_all_bi) {
+    uint32_t size = pool_list_.size();
+    for (uint32_t idx = 0; idx < size; idx++) {
+      bidir_list_.push_back(idx);
+    }
+  }
+
+  // Initialize the list of buffer sizes to use in copy/read/write operations
+  if (size_list_.size() == 0) {
+    uint32_t size_len = sizeof(SIZE_LIST)/sizeof(uint32_t);
+    for (uint32_t idx = 0; idx < size_len; idx++) {
+      size_list_.push_back(SIZE_LIST[idx]);
+    }
+  }
+  std::sort(size_list_.begin(), size_list_.end());
+}
+
diff --git a/rocrtst/samples/rocm_async/rocm_async_print.cpp b/rocrtst/samples/rocm_async/rocm_async_print.cpp
new file mode 100755
index 0000000000..4a25c82418
--- /dev/null
+++ b/rocrtst/samples/rocm_async/rocm_async_print.cpp
@@ -0,0 +1,177 @@
+#include "common.hpp"
+#include "rocm_async.hpp"
+
+// @Brief: Print Help Menu Screen
+// The options listed here mirror the getopt string used by
+// ParseArguments ("hvtaAb:s:d:r:w:m:")
+void RocmAsync::PrintHelpScreen() {
+
+  std::cout << std::endl;
+  std::cout << "Runs with following options:" << std::endl;
+  std::cout << std::endl;
+  std::cout << "\t -h Prints the help screen" << std::endl;
+  std::cout << "\t -v Run the test with verification of copied data" << std::endl;
+  std::cout << "\t -t Prints system topology and its memory pools" << std::endl;
+  std::cout << "\t -m List of buffer sizes to use, specified in Megabytes" << std::endl;
+  std::cout << "\t -r List of pool,agent pairs engaged in Read operation" << std::endl;
+  std::cout << "\t -w List of pool,agent pairs engaged in Write operation" << std::endl;
+  std::cout << "\t -b List pools to use in bidirectional copy operations" << std::endl;
+  std::cout << "\t -s List of source pools to use in copy unidirectional operations" << std::endl;
+  std::cout << "\t -d List of destination pools to use in unidirectional copy operations" << std::endl;
+  std::cout << "\t -a Perform Unidirectional Copy involving all pool combinations" << std::endl;
+  std::cout << "\t -A Perform Bidirectional Copy involving all pool combinations" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << "\t @note 1: Removes copyReq(srcI, dstI) - where Src & Dst Pools 
are same" << std::endl; + std::cout << std::endl; + std::cout << "\t @note 2: Removes copyReq(srcI, dstJ) - where Src & Dst Pools are Cpu bound " << std::endl; + std::cout << std::endl; + std::cout << "\t @note 3: Treats copyReq(dstI, srcJ) as NOT EQUAL to copyReq(dstJ, srcI) " << std::endl; + std::cout << "\t Underlying copy engine could be different " << std::endl; + std::cout << std::endl; +} + +// @brief: Print the topology of Memory Pools and Agents present in system +void RocmAsync::PrintTopology() { + + size_t count = agent_pool_list_.size(); + std::cout << std::endl; + for (uint32_t idx = 0; idx < count; idx++) { + agent_pool_info_t node = agent_pool_list_.at(idx); + + // Print agent info + std::cout << "Agent: " << node.agent.index_ << std::endl; + if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_) + std::cout << " Agent Device Type: CPU" << std::endl; + else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_) + std::cout << " Agent Device Type: GPU" << std::endl; + + // Print pool info + size_t pool_count = node.pool_list.size(); + for (uint32_t jdx = 0; jdx < pool_count; jdx++) { + std::cout << " Memory Pool: " + << node.pool_list.at(jdx).index_ << std::endl; + std::cout << " max allocable size in KB: " + << node.pool_list.at(jdx).allocable_size_ / 1024 << std::endl; + std::cout << " segment id: " + << node.pool_list.at(jdx).segment_ << std::endl; + std::cout << " is kernarg: " + << node.pool_list.at(jdx).is_kernarg_ << std::endl; + std::cout << " is fine-grained: " + << node.pool_list.at(jdx).is_fine_grained_ << std::endl; + std::cout << " accessible to owner: " + << node.pool_list.at(jdx).owner_access_ << std::endl; + std::cout << " accessible to all by default: " + << node.pool_list.at(jdx).access_to_all_ << std::endl; + } + std::cout << std::endl; + } + std::cout << std::endl; +} + +// @brief: Print info on agents in system +void RocmAsync::PrintAgentsList() { + + size_t count = agent_pool_list_.size(); + for (uint32_t idx = 0; idx < count; idx++) { 
+ std::cout << std::endl; + agent_pool_info_t node = agent_pool_list_.at(idx); + std::cout << "Agent: " << node.agent.index_ << std::endl; + if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_) + std::cout << " Agent Device Type: CPU" << std::endl; + else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_) + std::cout << " Agent Device Type: GPU" << std::endl; + } + std::cout << std::endl; +} + +// @brief: Print info on memory pools in system +void RocmAsync::PrintPoolsList() { + + size_t pool_count = pool_list_.size(); + for (uint32_t jdx = 0; jdx < pool_count; jdx++) { + std::cout << std::endl; + std::cout << "Memory Pool Idx: " + << pool_list_.at(jdx).index_ << std::endl; + std::cout << " max allocable size in KB: " + << pool_list_.at(jdx).allocable_size_ / 1024 << std::endl; + std::cout << " segment id: " + << pool_list_.at(jdx).segment_ << std::endl; + std::cout << " is kernarg: " + << pool_list_.at(jdx).is_kernarg_ << std::endl; + std::cout << " is fine-grained: " + << pool_list_.at(jdx).is_fine_grained_ << std::endl; + std::cout << " accessible to owner: " + << pool_list_.at(jdx).owner_access_ << std::endl; + std::cout << " accessible to all by default: " + << pool_list_.at(jdx).access_to_all_ << std::endl; + } + std::cout << std::endl; + +} + +// @brief: Print the list of transactions that will be executed +void RocmAsync::PrintTransList() { + + size_t count = trans_list_.size(); + for (uint32_t idx = 0; idx < count; idx++) { + async_trans_t trans = trans_list_.at(idx); + std::cout << std::endl; + std::cout << " Transaction Id: " << idx << std::endl; + std::cout << " Transaction Type: " << trans.req_type_ << std::endl; + if ((trans.req_type_ == REQ_READ) || (trans.req_type_ == REQ_WRITE)) { + std::cout << "Rocm Kernel used by Transaction: " << trans.kernel.code_ << std::endl; + std::cout << "Rocm Memory Pool Used by Kernel: " << trans.kernel.pool_idx_ << std::endl; + std::cout << " Rocm Agent used for Execution: " << trans.kernel.agent_idx_ << std::endl; + } 
+ if ((trans.req_type_ == REQ_COPY_BIDIR) || (trans.req_type_ == REQ_COPY_UNIDIR)) { + std::cout << " Src Memory Pool used in Copy: " << trans.copy.src_idx_ << std::endl; + std::cout << " Dst Memory Pool used in Copy: " << trans.copy.dst_idx_ << std::endl; + } + + } + std::cout << std::endl; +} + +// @brief: Prints error message when a request to copy between +// source pool and destination pool is not possible +void RocmAsync::PrintCopyAccessError(uint32_t src_idx, uint32_t dst_idx) { + + // Retrieve Roc runtime handles for Src memory pool and agents + uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; + hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; + + // Retrieve Roc runtime handles for Dst memory pool and agents + uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; + hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; + + std::cout << std::endl; + std::cout << "Index of Src Pool: " << src_idx << std::endl; + std::cout << "Index of Dst Pool: " << dst_idx << std::endl; + std::cout << "Index of Src Pool's Agent: " << src_dev_idx << std::endl; + std::cout << "Index of Dst Pool's Agent: " << dst_dev_idx << std::endl; + std::cout << "Device Type of Src Pool's Agent: " << src_dev_type << std::endl; + std::cout << "Device Type of Dst Pool's Agent: " << dst_dev_type << std::endl; + std::cout << "Rocm Agent hosting Src Pool cannot ACCESS Dst Pool" << std::endl; + std::cout << std::endl; +} + +// @brief: Prints error message when a request to read / write from +// a pool by an agent is not possible +void RocmAsync::PrintIOAccessError(uint32_t exec_idx, uint32_t pool_idx) { + + // Retrieve device type of executing agent + hsa_device_type_t exec_dev_type = agent_list_[exec_idx].device_type_; + + // Retrieve device type of memory pool's agent + uint32_t pool_dev_idx = pool_list_[pool_idx].agent_index_; + hsa_device_type_t pool_dev_type = agent_list_[pool_dev_idx].device_type_; + + std::cout << std::endl; + std::cout << 
"Index of Executing Agent: " << exec_idx << std::endl; + std::cout << "Device Type of Executing Agent: " << exec_dev_type << std::endl; + + std::cout << "Index of Buffer's Memory Pool: " << pool_idx << std::endl; + std::cout << "Index of Buffer Memory Pool's Agent: " << pool_dev_idx << std::endl; + std::cout << "Device Type of Buffer Memory Pool's Agent: " << pool_dev_type << std::endl; + std::cout << "Rocm Agent executing Read / Write request cannot ACCESS Buffer's Memory Pool" << std::endl; + std::cout << std::endl; +} diff --git a/rocrtst/samples/rocm_async/rocm_async_report.cpp b/rocrtst/samples/rocm_async/rocm_async_report.cpp new file mode 100755 index 0000000000..b589fa574c --- /dev/null +++ b/rocrtst/samples/rocm_async/rocm_async_report.cpp @@ -0,0 +1,167 @@ +#include "common.hpp" +#include "rocm_async.hpp" + +#include +#include +#include + +static void printRecord(uint32_t size, double avg_time, + double bandwidth, double min_time, + double peak_bandwidth) { + + std::stringstream size_str; + size_str << size << " MB"; + + uint32_t format = 15; + std::cout.precision(3); + std::cout.width(format); + std::cout << size_str.str(); + std::cout.width(format); + std::cout << (avg_time * 1e6); + std::cout.width(format); + std::cout << bandwidth; + std::cout.width(format); + std::cout << (min_time * 1e6); + std::cout.width(format); + std::cout << peak_bandwidth; + std::cout << std::endl; +} + +static void printCopyBanner(uint32_t src_pool_id, uint32_t src_agent_type, + uint32_t dst_pool_id, uint32_t dst_agent_type) { + + std::stringstream src_type; + std::stringstream dst_type; + (src_agent_type == 0) ? src_type << "Cpu" : src_type << "Gpu"; + (dst_agent_type == 0) ? 
dst_type << "Cpu" : dst_type << "Gpu";
+
+  std::cout << std::endl;
+  std::cout << "================";
+  std::cout << " Benchmark Result";
+  std::cout << " ================";
+  std::cout << std::endl;
+  std::cout << "================";
+  std::cout << " Src Pool Id: " << src_pool_id;
+  std::cout << " Src Agent Type: " << src_type.str();
+  std::cout << " ================";
+  std::cout << std::endl;
+  std::cout << "================";
+  std::cout << " Dst Pool Id: " << dst_pool_id;
+  std::cout << " Dst Agent Type: " << dst_type.str();
+  std::cout << " ================";
+  std::cout << std::endl;
+  std::cout << std::endl;
+
+  uint32_t format = 15;
+  std::cout.setf(ios::left);
+  std::cout.width(format);
+  std::cout << "Data Size";
+  std::cout.width(format);
+  std::cout << "Avg Time(us)";
+  std::cout.width(format);
+  std::cout << "Avg BW(GB/s)";
+  std::cout.width(format);
+  std::cout << "Min Time(us)";
+  std::cout.width(format);
+  std::cout << "Peak BW(GB/s)";
+  std::cout << std::endl;
+}
+
+// Returns the smallest sample of the list. The list is sorted in
+// place as a side effect and must not be empty.
+double RocmAsync::GetMinTime(std::vector<double>& vec) {
+
+  std::sort(vec.begin(), vec.end());
+  return vec.at(0);
+}
+
+// Returns the mean of the samples that remain after trimming the
+// sorted list: the fastest sample, then the next 10% of
+// num_iteration_ samples, and finally everything beyond the first
+// num_iteration_ samples are discarded. The list is modified in
+// place. Every trim step is limited to the samples actually present
+// and 0.0 is returned when no sample is left, so a list shorter than
+// expected can neither be erased out of range nor cause a division
+// by zero.
+double RocmAsync::GetMeanTime(std::vector<double>& vec) {
+
+  std::sort(vec.begin(), vec.end());
+
+  // Drop the fastest sample
+  if (!vec.empty()) {
+    vec.erase(vec.begin());
+  }
+
+  // Drop the next 10% of the iteration count
+  size_t head = static_cast<size_t>(num_iteration_ * 0.1);
+  head = std::min(head, vec.size());
+  vec.erase(vec.begin(), vec.begin() + head);
+
+  // Keep at most num_iteration_ samples
+  size_t keep = static_cast<size_t>(num_iteration_);
+  if (vec.size() > keep) {
+    vec.erase(vec.begin() + keep, vec.end());
+  }
+
+  if (vec.empty()) {
+    return 0.0;
+  }
+
+  double mean = 0.0;
+  size_t num = vec.size();
+  for (size_t it = 0; it < num; it++) {
+    mean += vec[it];
+  }
+  mean /= num;
+  return mean;
+}
+
+// Prints the timing data collected for every transaction, or a
+// notice when the list of transactions is empty
+void RocmAsync::Display() const {
+
+  // Iterate through list of transactions and display its timing data
+  uint32_t trans_size = trans_list_.size();
+  if (trans_size == 0) {
+    std::cout << std::endl;
+    std::cout << " One or more of the requests wered filtered out " << std::endl;
+    std::cout << " i.e. No Valid Requests were Made or Remain" << std::endl;
+    std::cout << std::endl;
+    return;
+  }
+  for (uint32_t idx = 0; idx < trans_size; idx++) {
+    // Work on a copy, the display helpers take a non-const reference
+    async_trans_t trans = trans_list_[idx];
+    if ((trans.req_type_ == REQ_COPY_BIDIR) ||
+        (trans.req_type_ == REQ_COPY_UNIDIR)) {
+      DisplayCopyTime(trans);
+    }
+    if ((trans.req_type_ == REQ_READ) ||
+        (trans.req_type_ == REQ_WRITE)) {
+      DisplayIOTime(trans);
+    }
+  }
+  std::cout << std::endl;
+}
+
+// Read / write transactions produce no timing data, nothing to print
+void RocmAsync::DisplayIOTime(async_trans_t& trans) const {
+
+}
+
+// Prints, for every buffer size, the average / minimum copy time of
+// a copy transaction and the bandwidth derived from them. Cpu times
+// are used when the copy does not involve a Gpu, otherwise the Gpu
+// times are scaled by the system timestamp frequency.
+void RocmAsync::DisplayCopyTime(async_trans_t& trans) const {
+
+  // Get the frequency of Gpu Timestamping
+  uint64_t sys_freq = 0;
+  hsa_status_t status;
+  status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sys_freq);
+  ErrorCheck(status);
+
+  // Print Benchmark Header
+  uint32_t src_idx = trans.copy.src_idx_;
+  uint32_t dst_idx = trans.copy.dst_idx_;
+  uint32_t src_dev_idx = pool_list_[src_idx].agent_index_;
+  hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_;
+  uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_;
+  hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_;
+  printCopyBanner(src_idx, src_dev_type, dst_idx, dst_dev_type);
+
+  double avg_time = 0;
+  double min_time = 0;
+  double bandwidth = 0;
+  // Byte count is kept in 64 bits: a size in Megabytes scaled to
+  // bytes (twice for a bidirectional copy) exceeds 32 bits at 4 GiB
+  uint64_t data_size = 0;
+  double peak_bandwidth = 0;
+  uint32_t size_len = size_list_.size();
+  for (uint32_t idx = 0; idx < size_len; idx++) {
+
+    // Adjust size of data involved in copy
+    data_size = size_list_[idx];
+    if (trans.copy.bidir_ == true) {
+      data_size += size_list_[idx];
+    }
+    data_size = data_size * 1024 * 1024;
+
+    // Copy operation does not involve a Gpu device
+    if (trans.copy.uses_gpu_ != true) {
+      avg_time = trans.cpu_avg_time_[idx];
+      min_time = trans.cpu_min_time_[idx];
+      bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000;
+      peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000;
+    } else {
+      avg_time = trans.gpu_avg_time_[idx] / sys_freq;
+      min_time = trans.gpu_min_time_[idx] / sys_freq;
+      bandwidth = 
(double)data_size / avg_time / 1000 / 1000 / 1000;
+      peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000;
+    }
+
+    printRecord(size_list_[idx], avg_time, bandwidth, min_time, peak_bandwidth);
+  }
+}
+
diff --git a/rocrtst/samples/rocm_async/rocm_async_topology.cpp b/rocrtst/samples/rocm_async/rocm_async_topology.cpp
new file mode 100755
index 0000000000..1e7af1abe3
--- /dev/null
+++ b/rocrtst/samples/rocm_async/rocm_async_topology.cpp
@@ -0,0 +1,114 @@
+#include "common.hpp"
+#include "rocm_async.hpp"
+
+// @brief: Helper method to iterate through the memory pools of
+// an agent and discover its properties. Invoked once per pool with
+// the RocmAsync object as user data. Pools outside the global
+// segment, or that do not allow allocation, are skipped. Always
+// returns HSA_STATUS_SUCCESS.
+hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) {
+
+  hsa_status_t status;
+  RocmAsync* asyncDrvr = reinterpret_cast<RocmAsync*>(data);
+
+  // Query pools' segment, report only pools from global segment
+  hsa_amd_segment_t segment;
+  status = hsa_amd_memory_pool_get_info(pool,
+                   HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
+  ErrorCheck(status);
+  if (HSA_AMD_SEGMENT_GLOBAL != segment) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Determine if allocation is allowed in this pool
+  // Report only pools that allow an allocation by user
+  bool alloc = false;
+  status = hsa_amd_memory_pool_get_info(pool,
+                   HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &alloc);
+  ErrorCheck(status);
+  if (alloc != true) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Query the max allocatable size
+  size_t max_size = 0;
+  status = hsa_amd_memory_pool_get_info(pool,
+                   HSA_AMD_MEMORY_POOL_INFO_SIZE, &max_size);
+  ErrorCheck(status);
+
+  // Determine if the pools is accessible to all agents
+  bool access_to_all = false;
+  status = hsa_amd_memory_pool_get_info(pool,
+                   HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &access_to_all);
+  ErrorCheck(status);
+
+  // Determine type of access to owner agent. The owner is the agent
+  // most recently added to the list by AgentInfo
+  hsa_amd_memory_pool_access_t owner_access;
+  hsa_agent_t agent = asyncDrvr->agent_list_.back().agent_;
+  status = hsa_amd_agent_memory_pool_get_info(agent, pool,
+                   HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &owner_access);
+  ErrorCheck(status);
+
+  // Determine if the pool is fine-grained or coarse-grained
+  uint32_t flag = 0;
+  status = hsa_amd_memory_pool_get_info(pool,
+                   HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
+  ErrorCheck(status);
+  bool is_kernarg = (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & flag);
+  bool is_fine_grained = (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & flag);
+
+  // Update the pool handle for system memory if kernarg is true
+  if (is_kernarg) {
+    asyncDrvr->sys_pool_ = pool;
+  }
+
+  // Create an instance of pool_info_t and add it to the list
+  pool_info_t pool_info(agent, asyncDrvr->agent_index_, pool,
+                        segment, max_size, asyncDrvr->pool_index_,
+                        is_fine_grained, is_kernarg,
+                        access_to_all, owner_access);
+  asyncDrvr->pool_list_.push_back(pool_info);
+
+  // Add the pool to the list of pools of its owning agent
+  asyncDrvr->agent_pool_list_[asyncDrvr->agent_index_].pool_list.push_back(pool_info);
+  asyncDrvr->pool_index_++;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// @brief: Helper method to iterate through the agents of
+// a system and discover its properties. Invoked once per agent with
+// the RocmAsync object as user data. Records the agent, then walks
+// its memory pools with MemPoolInfo.
+hsa_status_t AgentInfo(hsa_agent_t agent, void* data) {
+
+  RocmAsync* asyncDrvr = reinterpret_cast<RocmAsync*>(data);
+
+  // Get the name of the agent
+  char agent_name[64];
+  hsa_status_t status;
+  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_name);
+  ErrorCheck(status);
+
+  // Get device type
+  hsa_device_type_t device_type;
+  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
+  ErrorCheck(status);
+
+  // Capture the handle of Cpu agent
+  if (device_type == HSA_DEVICE_TYPE_CPU) {
+    asyncDrvr->cpu_agent_ = agent;
+  }
+
+  asyncDrvr->agent_list_.push_back(agent_info(agent, asyncDrvr->agent_index_, device_type));
+
+  // Construct a new agent_pool_info structure and add it to the list
+  agent_pool_info node;
+  node.agent = asyncDrvr->agent_list_.back();
+  asyncDrvr->agent_pool_list_.push_back(node);
+
+  // Discover the pools of this agent. A failure of the iteration
+  // must not go unnoticed, the topology would be incomplete
+  status = hsa_amd_agent_iterate_memory_pools(agent, MemPoolInfo, asyncDrvr);
+  ErrorCheck(status);
+  asyncDrvr->agent_index_++;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Discovers all agents of the system and their memory pools
+void RocmAsync::DiscoverTopology() {
+  err_ = hsa_iterate_agents(AgentInfo, this);
+  ErrorCheck(err_);
+}
+
diff --git a/rocrtst/samples/rocm_async/rocm_async_trans.cpp b/rocrtst/samples/rocm_async/rocm_async_trans.cpp
new file mode 100755
index 0000000000..33d0c54aad
--- /dev/null
+++ b/rocrtst/samples/rocm_async/rocm_async_trans.cpp
@@ -0,0 +1,173 @@
+#include "common.hpp"
+#include "rocm_async.hpp"
+
+// Builds one transaction of type req_type per (pool, agent) pair of
+// in_list and appends it to trans_list_. Returns false when the list
+// is not made of complete pairs, when an index does not refer to a
+// discovered pool / agent, or when the agent is never allowed to
+// access the pool.
+bool RocmAsync::BuildReadOrWriteTrans(uint32_t req_type,
+                                      vector<uint32_t>& in_list) {
+
+  // Validate the list of pool-agent tuples. Entries are consumed in
+  // pairs, an odd number of entries would read past the end
+  hsa_status_t status;
+  hsa_amd_memory_pool_access_t access;
+  uint32_t list_size = in_list.size();
+  if ((list_size % 2) != 0) {
+    return false;
+  }
+  for (uint32_t idx = 0; idx < list_size; idx+=2) {
+
+    uint32_t pool_idx = in_list[idx];
+    uint32_t exec_idx = in_list[idx + 1];
+
+    // Reject indices that do not refer to a discovered pool or agent
+    if ((pool_idx >= pool_list_.size()) ||
+        (exec_idx >= agent_list_.size())) {
+      return false;
+    }
+
+    // Retrieve Roc runtime handles for memory pool and agent
+    hsa_agent_t exec_agent = agent_list_[exec_idx].agent_;
+    hsa_amd_memory_pool_t pool = pool_list_[pool_idx].pool_;
+
+    // Determine agent can access the memory pool
+    status = hsa_amd_agent_memory_pool_get_info(exec_agent, pool,
+                   HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
+    ErrorCheck(status);
+
+    // Determine if accessibility to agent is not denied
+    if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
+      PrintIOAccessError(exec_idx, pool_idx);
+      return false;
+    }
+
+    // Agent has access, build an instance of transaction
+    // and add it to the list of transactions
+    async_trans_t trans(req_type);
+    trans.kernel.code_ = nullptr;
+    trans.kernel.pool_ = pool;
+    trans.kernel.pool_idx_ = pool_idx;
+    trans.kernel.agent_ = exec_agent;
+    trans.kernel.agent_idx_ = exec_idx;
+    trans_list_.push_back(trans);
+  }
+  return true;
+}
+
+// Builds the read transactions requested through read_list_
+bool RocmAsync::BuildReadTrans() {
+  return BuildReadOrWriteTrans(REQ_READ, read_list_);
+}
+
+// Builds the write transactions requested through write_list_
+bool RocmAsync::BuildWriteTrans() {
+  return BuildReadOrWriteTrans(REQ_WRITE, write_list_);
+}
+
+bool 
RocmAsync::BuildCopyTrans(uint32_t req_type, + vector& src_list, + vector& dst_list) { + + uint32_t src_size = src_list.size(); + uint32_t dst_size = dst_list.size(); + + hsa_status_t status; + hsa_amd_memory_pool_access_t access; + for (uint32_t idx = 0; idx < src_size; idx++) { + + // Retrieve Roc runtime handles for Src memory pool and agents + uint32_t src_idx = src_list[idx]; + hsa_agent_t src_agent = pool_list_[src_idx].owner_agent_; + hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_; + uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; + hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; + + for (uint32_t jdx = 0; jdx < dst_size; jdx++) { + + // Retrieve Roc runtime handles for Dst memory pool and agents + uint32_t dst_idx = dst_list[jdx]; + hsa_agent_t dst_agent = pool_list_[dst_idx].owner_agent_; + hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_; + uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; + hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; + + // Filter out transaction when Src & Dst pools belong to Cpu + if ((src_dev_type == HSA_DEVICE_TYPE_CPU) && + (dst_dev_type == HSA_DEVICE_TYPE_CPU)) { + continue; + } + + // Filter out transaction with same Src & Dst pools + if (src_idx == dst_idx) { + continue; + } + + // Determine if accessibility to src pool for dst agent is not denied + status = hsa_amd_agent_memory_pool_get_info(dst_agent, src_pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + ErrorCheck(status); + if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + PrintCopyAccessError(src_idx, dst_idx); + return false; + } + + // Determine if accessibility to dst pool for src agent is not denied + status = hsa_amd_agent_memory_pool_get_info(src_agent, dst_pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + ErrorCheck(status); + if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + return false; + } + + // Agents have access, build an instance of 
transaction + // and add it to the list of transactions + async_trans_t trans(req_type); + trans.copy.src_idx_ = src_idx; + trans.copy.dst_idx_ = dst_idx; + trans.copy.src_pool_ = src_pool; + trans.copy.dst_pool_ = dst_pool; + trans.copy.bidir_ = (req_type == REQ_COPY_BIDIR); + trans.copy.uses_gpu_ = ((src_dev_type == HSA_DEVICE_TYPE_GPU) || + (dst_dev_type == HSA_DEVICE_TYPE_GPU)); + trans_list_.push_back(trans); + } + } + return true; +} + +bool RocmAsync::BuildBidirCopyTrans() { + return BuildCopyTrans(REQ_COPY_BIDIR, bidir_list_, bidir_list_); +} + +bool RocmAsync::BuildUnidirCopyTrans() { + return BuildCopyTrans(REQ_COPY_UNIDIR, src_list_, dst_list_); +} + +// @brief: Builds a list of transaction per user request +bool RocmAsync::BuildTransList() { + + // Build list of Read transactions per user request + bool status = false; + if (req_read_ == REQ_READ) { + status = BuildReadTrans(); + if (status == false) { + return status; + } + } + + // Build list of Write transactions per user request + status = false; + if (req_write_ == REQ_WRITE) { + status = BuildWriteTrans(); + if (status == false) { + return status; + } + } + + // Build list of Bidirectional Copy transactions per user request + status = false; + if (req_copy_bidir_ == REQ_COPY_BIDIR) { + status = BuildBidirCopyTrans(); + if (status == false) { + return status; + } + } + + // Build list of Unidirectional Copy transactions per user request + status = false; + if (req_copy_unidir_ == REQ_COPY_UNIDIR) { + status = BuildUnidirCopyTrans(); + if (status == false) { + return status; + } + } + + // All of the transaction are built up + return true; +} + diff --git a/rocrtst/samples/rocm_async/rocm_async_validate.cpp b/rocrtst/samples/rocm_async/rocm_async_validate.cpp new file mode 100644 index 0000000000..81dd9e4cc2 --- /dev/null +++ b/rocrtst/samples/rocm_async/rocm_async_validate.cpp @@ -0,0 +1,155 @@ + +#include "common.hpp" +#include "rocm_async.hpp" + +#include +#include +#include +#include +#include + 
+// @brief: Checks that every index in the input list names a known pool
+//
+// @param in_list List of pool indices supplied by the user
+//
+// @return true if each entry equals the index_ of some element of
+// pool_list_ (trivially true for an empty list), false otherwise
+bool RocmAsync::PoolIsPresent(vector<uint32_t>& in_list) {
+
+  for (uint32_t wanted : in_list) {
+    bool is_present = false;
+    for (const auto& pool : pool_list_) {
+      if (wanted == pool.index_) {
+        is_present = true;
+        break;
+      }
+    }
+    if (!is_present) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// @brief: Checks that no pool index occurs more than once in the list
+//
+// NOTE: despite its name this method returns false when a duplicate IS
+// found and true when all entries are distinct. ValidateCopyReq() relies
+// on that convention, so it is kept as is
+//
+// @param in_list List of pool indices supplied by the user
+//
+// @return false if any two entries are equal, true otherwise
+bool RocmAsync::PoolIsDuplicated(vector<uint32_t>& in_list) {
+
+  size_t count = in_list.size();
+  for (size_t idx1 = 0; idx1 < count; idx1++) {
+    // Equality is symmetric, so comparing each entry with the entries
+    // that follow it covers every pair exactly once
+    for (size_t idx2 = idx1 + 1; idx2 < count; idx2++) {
+      if (in_list[idx1] == in_list[idx2]) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// @brief: Validates the argument list of a Read or Write request
+//
+// @param in_list Flat list of (memory pool, agent) tuples - the first
+// element of a tuple identifies a memory pool while the second element
+// denotes an agent
+//
+// @return false if the list has an odd number of elements or if any pool
+// or agent index is not below pool_index_ / agent_index_, true otherwise
+bool RocmAsync::ValidateReadOrWriteReq(vector<uint32_t>& in_list) {
+
+  // A well formed request has an even number of elements
+  size_t list_size = in_list.size();
+  if ((list_size % 2) != 0) {
+    return false;
+  }
+
+  // Validate the list of pool-agent tuples
+  for (size_t idx = 0; idx < list_size; idx+=2) {
+    uint32_t pool_idx = in_list[idx];
+    uint32_t exec_idx = in_list[idx + 1];
+    // Determine the pool and agent exist in system
+    if ((pool_idx >= pool_index_) ||
+        (exec_idx >= agent_index_)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// @brief: Validates the user's Read request
+bool RocmAsync::ValidateReadReq() {
+  return ValidateReadOrWriteReq(read_list_);
+}
+
+// @brief: Validates the user's Write request
+bool RocmAsync::ValidateWriteReq() {
+  return ValidateReadOrWriteReq(write_list_);
+}
+
+// @brief: Validates a list of pool indices given for a Copy request
+//
+// @param in_list List of pool indices supplied by the user
+//
+// @return true only if the list is no longer than pool_list_, has no
+// repeated entry and every entry names a known pool
+bool RocmAsync::ValidateCopyReq(vector<uint32_t>& in_list) {
+
+  // Determine pool list length is valid
+  if (in_list.size() > pool_list_.size()) {
+    return false;
+  }
+
+  // Determine no pool is duplicated. PoolIsDuplicated() returns
+  // false when it finds a duplicate
+  if (!PoolIsDuplicated(in_list)) {
+    return false;
+  }
+
+  // Determine every pool is present in system
+  return PoolIsPresent(in_list);
+}
+
+// @brief: Validates the user's bidirectional Copy request
+bool RocmAsync::ValidateBidirCopyReq() {
+  return ValidateCopyReq(bidir_list_);
+}
+
+// @brief: Validates the user's unidirectional Copy request; both the
+// source and the destination list must be valid
+bool RocmAsync::ValidateUnidirCopyReq() {
+  return ((ValidateCopyReq(src_list_)) && (ValidateCopyReq(dst_list_)));
+}
+
+// @brief: Validates the inputs of every operation the user has requested
+//
+// @return false as soon as one requested operation has invalid inputs,
+// true if all requested operations are well formed
+bool RocmAsync::ValidateArguments() {
+
+  // Determine if user has requested a READ
+  // operation and gave valid inputs
+  if ((req_read_ == REQ_READ) && !ValidateReadReq()) {
+    return false;
+  }
+
+  // Determine if user has requested a WRITE
+  // operation and gave valid inputs
+  if ((req_write_ == REQ_WRITE) && !ValidateWriteReq()) {
+    return false;
+  }
+
+  // Determine if user has requested a Copy
+  // operation that is bidirectional and gave
+  // valid inputs
+  if ((req_copy_bidir_ == REQ_COPY_BIDIR) && !ValidateBidirCopyReq()) {
+    return false;
+  }
+
+  // Determine if user has requested a Copy
+  // operation that is unidirectional and gave
+  // valid inputs
+  if ((req_copy_unidir_ == REQ_COPY_UNIDIR) && !ValidateUnidirCopyReq()) {
+    return false;
+  }
+
+  // All of the request are well formed
+  return true;
+}