Benchmark copy of data from one pool to another pool either in

one or both directions. Users can enumerate the pools reported
  by system to specify which pools serve as source / destination

Change-Id: I8e6d0adb3743b3328dd3ce9152762ca840ea613b
Этот коммит содержится в:
Ramesh Errabolu
2017-09-12 19:28:15 -05:00
родитель 34602f7e95
Коммит c2caa5ae2c
19 изменённых файлов: 2570 добавлений и 0 удалений
+88
Просмотреть файл
@@ -0,0 +1,88 @@
cmake_minimum_required(VERSION 2.8.0)
#
# Setup build environment
#
# 1) Set the env vars ROCR_INC_DIR and ROCR_LIB_DIR to point to the
# ROC Runtime headers and libraries, respectively
#
# export ROCR_INC_DIR="Path to ROC Runtime header"
#
# export ROCR_LIB_DIR="Path to ROC Runtime libraries"
#
# 2) Make a new folder called build under the root folder
#
# mkdir build
#
# 3) Enter into folder of build, and run CMAKE to generate makefile
# and make it
#
# cd build; cmake ..; make
#
# Only Linux is supported: stop processing this file on Windows.
# (The original called RRETURN(), which is not a CMake command.)
if(WIN32)
  MESSAGE("Windows platform is not supported")
  RETURN()
endif()
if(NOT EXISTS $ENV{ROCR_INC_DIR}/hsa/hsa.h)
MESSAGE("ERROR: ROC Runtime headers can't be found under specified path")
RETURN()
endif()
#
# Flag to enable / disable verbose output.
#
SET( CMAKE_VERBOSE_MAKEFILE on )
#
# Set core runtime module name
#
set ( CORE_RUNTIME_NAME "hsa-runtime" )
set ( CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64" )
set ( CORE_RUNTIME_LIBRARY "lib${CORE_RUNTIME_TARGET}" )
# Stop early when the runtime shared library is not where ROCR_LIB_DIR says.
if(NOT EXISTS $ENV{ROCR_LIB_DIR}/${CORE_RUNTIME_LIBRARY}.so)
  MESSAGE("ERROR: ROC Runtime libraries can't be found under specified path")
  RETURN()
endif()
# Name the project; TEST_NAME mirrors it
set(PROJECT_NAME "rocm_async")
set(TEST_NAME "${PROJECT_NAME}")
project (${PROJECT_NAME})
# Detect a Debug build with a case-insensitive match on CMAKE_BUILD_TYPE;
# Debug builds also get the DEBUG preprocessor definition
string(TOLOWER "${CMAKE_BUILD_TYPE}" tmp)
if("${tmp}" STREQUAL "debug")
set(ISDEBUG "1")
add_definitions(-DDEBUG)
endif()
# Debug: no optimization plus gdb symbols; otherwise -O2.
# NOTE(review): these assignments replace any CMAKE_CXX_FLAGS value that
# was set before this point.
if(ISDEBUG)
set(CMAKE_CXX_FLAGS "-std=c++11 -O0")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
else()
set(CMAKE_CXX_FLAGS "-std=c++11 -O2")
endif()
#
# Set the remaining compiler flags
#
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics")
# Header and library search paths come from the environment variables
# validated earlier in this file
INCLUDE_DIRECTORIES($ENV{ROCR_INC_DIR})
LINK_DIRECTORIES($ENV{ROCR_LIB_DIR})
# Add sources that belong to the project
# (every source file found in this directory is compiled into the executable)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} Src)
add_executable(rocm_async ${Src})
# Link against the ROC runtime library
target_link_libraries(rocm_async hsa-runtime64)
+66
Просмотреть файл
@@ -0,0 +1,66 @@
Introduction:
#############
RocmAsync is designed to capture the performance characteristics of buffer
copying and kernel read/write operations. The help screen of the benchmark
shows the various options one can use to initiate copy/read/write operations.
In addition, one can also query the topology of the system in terms of memory
pools and their agents.
Build Environment:
##################
To be able to build RocmAsync, users must ensure that the build platform has
following conditions satisfied:
Build Procedure:
################
The following simply lists the steps to build RocmAsync
--- Define following environment variable to specify location of header
and library files
// Contains header files exported by ROC Runtime
ROCR_INC_DIR="Path of ROC Runtime Header Files"
// Contains library files exported by ROC Runtime
ROCR_LIB_DIR="Path of ROC Runtime Library Files"
--- Create a build directory. The location of build directory can be anywhere
in the file system as long as it has read / write / execute permissions for
the user invoking the commands. User can choose any valid filename for the
build directory as the examples below illustrate
e.g. mkdir rocm_async/perfBuild
e.g. mkdir <parent_Of_rocm_async>/rocm_async-build
e.g. mkdir <user_home>/rocmAsyncBuild
--- Set working directory to be the new build directory
e.g. cd rocm_async/perfBuild
e.g. cd <parent_Of_rocm_async>/rocm_async-build
e.g. cd <user_home>/rocmAsyncBuild
--- Invoke Cmake to interpret build rules and generate native build files
The argument for cmake should be the root folder of RocmAsync test suite
// Builds Release version (default)
e.g. cmake .../rocm_async
// Builds Debug version
e.g. cmake -DCMAKE_BUILD_TYPE:STRING=Debug .../rocm_async
--- Invoke the native build rules generated by cmake to build the various
object, library and executable files
e.g. make
--- Invoke the install command to copy build artifacts to pre-defined folders
of RocmAsync suite. Upon completion artifacts will be copied to the bin and
lib directories of build directory
e.g. make install
@note: All executables will be found in <build_directory>/bin folder
+11
Просмотреть файл
@@ -0,0 +1,11 @@
#include "base_test.hpp"
// Constructor: records how many iterations the test should run.
// The default value for num is supplied by the declaration in the header.
BaseTest::BaseTest(size_t num) : num_iteration_(num) {}

// Destructor: nothing to release at this level.
BaseTest::~BaseTest() {}
+52
Просмотреть файл
@@ -0,0 +1,52 @@
#ifndef ROCM_ASYNC_BW_BASE_TEST_H_
#define ROCM_ASYNC_BW_BASE_TEST_H_
#include "hsa/hsa.h"
#include <iostream>
#include <string>
#include <vector>
using namespace std;
// @Brief: Abstract interface that every benchmark test implements
class BaseTest {
 public:
  BaseTest(size_t num = 10);
  virtual ~BaseTest();

  // @Brief: Hook for setup work that must finish before the
  // benchmark test case runs
  virtual void SetUp() = 0;

  // @Brief: Hook that executes the test scenario
  virtual void Run() = 0;

  // @Brief: Hook for clean up work
  virtual void Close() = 0;

  // @Brief: Hook that reports the results
  virtual void Display() const = 0;

  // @Brief: Override the number of iterations to run
  void set_num_iteration(size_t num) { num_iteration_ = num; }

  // @Brief: State shared with derived classes, which may
  // declare additional members of their own
 protected:
  // @Brief: Number of iterations to run
  uint64_t num_iteration_;

  // @Brief: Most recent HSA status code
  hsa_status_t err_;
};
#endif // ROCM_ASYNC_BW_BASE_TEST_H_
+149
Просмотреть файл
@@ -0,0 +1,149 @@
#include "common.hpp"

#include <algorithm>
// Terminates the process when an HSA call reported a failure.
//
// hsa_error_code: status returned by the HSA API call
// line_num:       source line of the call site (printed in the report)
// str:            source file name of the call site (printed in the report)
//
// HSA_STATUS_SUCCESS and HSA_STATUS_INFO_BREAK are both treated as
// non-errors; any other value prints a diagnostic and exits with
// EXIT_FAILURE.
void error_check(hsa_status_t hsa_error_code, int line_num, const char* str) {
  if (hsa_error_code == HSA_STATUS_SUCCESS ||
      hsa_error_code == HSA_STATUS_INFO_BREAK) {
    return;
  }

  printf("HSA Error Found! In file: %s; At line: %d\n", str, line_num);

  // The lookup itself can fail (e.g. for a value that is not a valid status
  // code), in which case the string pointer stays null and must not be
  // handed to printf's %s.
  const char* string = nullptr;
  hsa_status_t lookup = hsa_status_string(hsa_error_code, &string);
  if (lookup == HSA_STATUS_SUCCESS && string != nullptr) {
    printf("Error: %s\n", string);
  } else {
    printf("Error: unrecognized status code 0x%x\n",
           (unsigned int)hsa_error_code);
  }
  exit(EXIT_FAILURE);
}
// Agent-iteration callback that captures the first GPU agent it is shown.
// `data` must point at an hsa_agent_t that receives the match. Returns
// HSA_STATUS_INFO_BREAK once a GPU has been stored, HSA_STATUS_SUCCESS
// for non-GPU agents, or the error reported by the device-type query.
hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) {
  if (data == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  hsa_device_type_t device_type;
  hsa_status_t status =
      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
  if (status != HSA_STATUS_SUCCESS) {
    return status;
  }

  if (device_type != HSA_DEVICE_TYPE_GPU) {
    return HSA_STATUS_SUCCESS;
  }

  hsa_agent_t* found = (hsa_agent_t*)data;
  *found = agent;
  return HSA_STATUS_INFO_BREAK;
}
// Agent-iteration callback that captures the first CPU agent it is shown.
// `data` must point at an hsa_agent_t that receives the match. Returns
// HSA_STATUS_INFO_BREAK once a CPU has been stored, HSA_STATUS_SUCCESS
// for non-CPU agents, or the error reported by the device-type query.
hsa_status_t FindCpuDevice(hsa_agent_t agent, void* data) {
  if (data == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  hsa_device_type_t device_type;
  hsa_status_t status =
      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
  if (status != HSA_STATUS_SUCCESS) {
    return status;
  }

  if (device_type != HSA_DEVICE_TYPE_CPU) {
    return HSA_STATUS_SUCCESS;
  }

  hsa_agent_t* found = (hsa_agent_t*)data;
  *found = agent;
  return HSA_STATUS_INFO_BREAK;
}
// Memory-pool callback that records a fine-grained pool of the global
// segment into the hsa_amd_memory_pool_t that `data` points at.
// It always returns HSA_STATUS_SUCCESS after a match, so when it is shown
// several matching pools the last one wins. A failed pool query goes
// through ErrorCheck.
hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t region, void* data) {
  if (data == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  hsa_amd_segment_t pool_segment;
  hsa_status_t status = hsa_amd_memory_pool_get_info(
      region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &pool_segment);
  ErrorCheck(status);

  uint32_t global_flags;
  status = hsa_amd_memory_pool_get_info(
      region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
  ErrorCheck(status);

  const bool is_global = (pool_segment == HSA_AMD_SEGMENT_GLOBAL);
  if (is_global &&
      (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)) {
    hsa_amd_memory_pool_t* found = (hsa_amd_memory_pool_t*)data;
    *found = region;
  }
  return HSA_STATUS_SUCCESS;
}
// Returns the median of `scores`.
//
// The vector is taken by value, so it is sorted locally without touching
// the caller's data (the original indexed the middle of the unsorted
// input). For an even number of samples the two middle values are
// averaged. An empty vector yields 0.0 instead of indexing out of range.
double CalcMedian(std::vector<double> scores) {
  if (scores.empty()) {
    return 0.0;
  }

  std::sort(scores.begin(), scores.end());

  const size_t mid = scores.size() / 2;
  if (scores.size() % 2 == 0) {
    return (scores[mid - 1] + scores[mid]) / 2;
  }
  return scores[mid];
}
// Returns the arithmetic mean of `scores`: the sum of all samples
// divided by the sample count.
double CalcMean(std::vector<double> scores) {
  double total = 0;
  for (double sample : scores) {
    total += sample;
  }
  return total / scores.size();
}
// Returns the population standard deviation of `scores` around
// `score_mean`: the square root of the mean squared deviation.
// NOTE(review): score_mean is declared as int, so a fractional mean is
// truncated at the call; changing it needs a matching change to the
// declaration in the header.
double CalcStdDeviation(std::vector<double> scores, int score_mean) {
  double squared_sum = 0.0;
  for (double sample : scores) {
    squared_sum += (sample - score_mean) * (sample - score_mean);
  }
  squared_sum /= scores.size();
  return std::sqrt(squared_sum);
}
// Counts the entries of `scores` that keep up with ideal scaling.
//
// The expected time for entry i is scores[0] / 2^i. Each expected value is
// printed to stdout; then every entry after the first is counted when it
// is slower than its expected time by less than 10% of that expected time
// (entries faster than expected are counted too).
//
// The expected values are produced by repeated halving. The original
// computed scores[0] / (1 << i), where the int shift is undefined for
// i >= 31.
int CalcConcurrentQueues(std::vector<double> scores) {
  std::vector<double> expected_exec_time;
  expected_exec_time.reserve(scores.size());

  double expected = scores.empty() ? 0.0 : scores[0];
  for (size_t i = 0; i < scores.size(); ++i) {
    expected_exec_time.push_back(expected);
    expected /= 2;
  }

  for (size_t i = 0; i < expected_exec_time.size(); ++i) {
    std::cout << "expected exe time = " << expected_exec_time[i] << std::endl;
  }

  int num_of_concurrent_queues = 0;
  for (size_t i = 1; i < scores.size(); ++i) {
    if ((expected_exec_time[i] - scores[i]) < 0.1 * expected_exec_time[i]) {
      ++num_of_concurrent_queues;
    }
  }
  return num_of_concurrent_queues;
}
/** hsa_status_t FindHostRegion(hsa_region_t region, void *data) {
if (data == NULL) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
bool is_host_region = false;
hsa_status_t hsa_error_code = hsa_region_get_info(
region, (hsa_region_info_t)HSA_EXT_REGION_INFO_HOST_ACCESS, &is_host_region
);
if (hsa_error_code != HSA_STATUS_SUCCESS) {
return hsa_error_code;
}
if (is_host_region) {
*((hsa_region_t*)data) = region;
}
return HSA_STATUS_SUCCESS;
} */
+48
Просмотреть файл
@@ -0,0 +1,48 @@
#ifndef ROCM_ASYNC_BW_COMMON_HPP
#define ROCM_ASYNC_BW_COMMON_HPP
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cmath>
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
using namespace std;
#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__((aligned(x)))
#endif // __GNUC__
#endif // _MSC_VER
#define MULTILINE(...) #__VA_ARGS__
#define HSA_ARGUMENT_ALIGN_BYTES 16
#define ErrorCheck(x) error_check(x, __LINE__, __FILE__)
// @Brief: Check HSA API return value
void error_check(hsa_status_t hsa_error_code, int line_num, const char* str);
// @Brief: Find the first available GPU device
hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data);
// @Brief: Find the first available CPU device
hsa_status_t FindCpuDevice(hsa_agent_t agent, void* data);
// @Brief: Find the agent's global region / pool
hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t region, void* data);
// @Brief: Calculate the mean number of the vector
double CalcMean(vector<double> scores);
// @Brief: Calculate the median value of the vector
double CalcMedian(vector<double> scores);
// @Brief: Calculate the standard deviation of the vector
double CalcStdDeviation(vector<double> scores, int score_mean);
#endif // ROCM_ASYNC_BW_COMMON_HPP
+188
Просмотреть файл
@@ -0,0 +1,188 @@
#include "hsatimer.hpp"
#define NANOSECONDS_PER_SECOND 1000000000
// Constructor: measures the TSC frequency once and caches it for the
// tick conversion done in StopTimer.
PerfTimer::PerfTimer() : freq_in_100mhz(MeasureTSCFreqHz()) {}
// Destructor: frees every Timer created through CreateTimer, newest first.
PerfTimer::~PerfTimer() {
  for (size_t remaining = _timers.size(); remaining > 0; --remaining) {
    delete _timers[remaining - 1];
  }
  _timers.clear();
}
// Create a new timer instance and return its index
int PerfTimer::CreateTimer() {
Timer *newTimer = new Timer;
newTimer->_start = 0.0;
newTimer->_clocks = 0.0;
#ifdef _WIN32
QueryPerformanceFrequency((LARGE_INTEGER *)&newTimer->_freq);
#endif
#ifdef __linux__
newTimer->_freq = NANOSECONDS_PER_SECOND;
#endif
// Save the timer object in timer list
_timers.push_back(newTimer);
return (int)(_timers.size() - 1);
}
int PerfTimer::StartTimer(int index) {
if (index >= (int)_timers.size()) {
Error("Cannot reset timer. Invalid handle.");
return HSA_FAILURE;
}
#ifdef _WIN32
// General Windows timing method
#ifndef _AMD
long long tmpStart;
QueryPerformanceCounter((LARGE_INTEGER *)&(tmpStart));
_ timers[index]->_start = (double)tmpStart;
// AMD Windows timing method
#else
#endif
#endif
#ifdef __linux__
// General Linux timing method
#ifndef _AMD
struct timespec s;
clock_gettime(CLOCK_MONOTONIC, &s);
_timers[index]->_start =
(long long)s.tv_sec * NANOSECONDS_PER_SECOND + (long long)s.tv_nsec;
// AMD Linux timing method
#else
unsigned int unused;
_timers[index]->_start = __rdtscp(&unused);
#endif
#endif
return HSA_SUCCESS;
}
int PerfTimer::StopTimer(int index) {
long long n = 0;
if (index >= (int)_timers.size()) {
Error("Cannot reset timer. Invalid handle.");
return HSA_FAILURE;
}
#ifdef _WIN32
#ifndef _AMD
long long n1;
QueryPerformanceCounter((LARGE_INTEGER *)&(n1));
n = n1;
// AMD Window Timing
#else
#endif
#endif
#ifdef __linux__
// General Linux timing method
#ifndef _AMD
struct timespec s;
clock_gettime(CLOCK_MONOTONIC, &s);
n = (long long)s.tv_sec * NANOSECONDS_PER_SECOND + (long long)s.tv_nsec;
// AMD Linux timing
#else
unsigned int unused;
n = __rdtscp(&unused);
#endif
#endif
n -= _timers[index]->_start;
_timers[index]->_start = 0;
#ifndef _AMD
_timers[index]->_clocks += n;
#endif
#ifdef __linux__
//_timers[index]->_clocks += 10 * n /freq_in_100mhz; // unit is ns
_timers[index]->_clocks += 1.0E-6 * 10 * n / freq_in_100mhz; // convert to ms
// cout << "_AMD is enabled!!!" << endl;
#endif
return HSA_SUCCESS;
}
void PerfTimer::Error(string str) { cout << str << endl; }
double PerfTimer::ReadTimer(int index) {
if (index >= (int)_timers.size()) {
Error("Cannot read timer. Invalid handle.");
return HSA_FAILURE;
}
double reading = double(_timers[index]->_clocks);
reading = double(reading / _timers[index]->_freq);
return reading;
}
void PerfTimer::ResetTimer(int index) {
// Check if index value is over the timer's size
if (index >= (int)_timers.size()) {
Error("Invalid index value\n");
exit(1);
}
_timers[index]->_clocks = 0.0;
_timers[index]->_start = 0.0;
}
// Returns a timestamp in microseconds taken from the platform clock:
// the performance counter on Windows, CLOCK_MONOTONIC_RAW on Linux.
// NOTE(review): when neither _WIN32 nor __linux__ is defined the function
// body is empty and nothing is returned.
uint64_t PerfTimer::CoarseTimestampUs() {
#ifdef _WIN32
uint64_t freqHz, ticks;
QueryPerformanceFrequency((LARGE_INTEGER *)&freqHz);
QueryPerformanceCounter((LARGE_INTEGER *)&ticks);
// Scale numerator and divisor until (ticks * 1000000) fits in uint64_t.
while (ticks > (1ULL << 44)) {
ticks /= 16;
freqHz /= 16;
}
// ticks / freqHz is seconds; the factor 1000000 converts to microseconds
return (ticks * 1000000) / freqHz;
#endif
#ifdef __linux__
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
// Whole seconds and the nanosecond remainder, both expressed in microseconds
return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000;
#endif
}
// Estimates the TSC frequency by spinning until 1e9 TSC ticks have passed
// and timing that interval with the coarse clock.
// The result is the rounded number of ticks per 100 ns, i.e. the frequency
// expressed in units of 100 MHz.
uint64_t PerfTimer::MeasureTSCFreqHz() {
  const uint64_t kSpinTicks = 1000000000;
  unsigned int aux;

  // Bracket the spin with coarse timestamps
  uint64_t wall_begin_us = CoarseTimestampUs();
  uint64_t tsc_begin = __rdtscp(&aux);
  uint64_t tsc_end;
  do {
    tsc_end = __rdtscp(&aux);
  } while (tsc_end - tsc_begin < kSpinTicks);
  uint64_t wall_end_us = CoarseTimestampUs();

  // ticks * 10 / nanoseconds, with half the divisor added for rounding
  uint64_t elapsed_ns = (wall_end_us - wall_begin_us) * 1000;
  uint64_t elapsed_ticks = tsc_end - tsc_begin;
  return (elapsed_ticks * 10 + (elapsed_ns / 2)) / elapsed_ns;
}
+72
Просмотреть файл
@@ -0,0 +1,72 @@
#ifndef ROCM_ASYNC_BW_MYTIME_H_
#define ROCM_ASYNC_BW_MYTIME_H_
// Will use AMD timer and general Linux timer based on users'
// need --> compilation flag. Support for windows platform is
// not currently available
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <x86intrin.h>
#include <string.h>
#include <iostream>
#include <vector>
#include <string>
using namespace std;
#include <sys/time.h>
#define HSA_FAILURE 1
#define HSA_SUCCESS 0
// Collection of stopwatch-style timers addressed by integer handle.
//
// CreateTimer hands out a handle; StartTimer / StopTimer accumulate elapsed
// time into that timer; ReadTimer reports the total and ResetTimer clears
// it.
//
// The class owns the Timer objects behind raw pointers and deletes them in
// its destructor. Copying would make two objects delete the same pointers,
// so copy construction and copy assignment are disabled.
class PerfTimer {
 private:
  struct Timer {
    std::string name;  /* < name name of time object*/
    long long _freq;   /* < _freq frequency*/
    long long _clocks; /* < _clocks number of ticks at end*/
    long long _start;  /* < _start start point ticks*/
  };

  std::vector<Timer*> _timers; /*< _timers vector to Timer objects */

  // TSC frequency measured at construction, used by StopTimer
  double freq_in_100mhz;

 public:
  PerfTimer();
  ~PerfTimer();

  // Non-copyable: _timers holds owning raw pointers
  PerfTimer(const PerfTimer&) = delete;
  PerfTimer& operator=(const PerfTimer&) = delete;

 private:
  // AMD timing method
  uint64_t CoarseTimestampUs();
  uint64_t MeasureTSCFreqHz();

  // General Linux timing method

 public:
  int CreateTimer();
  int StartTimer(int index);
  int StopTimer(int index);
  void ResetTimer(int index);

 public:
  // retrieve time
  double ReadTimer(int index);

  // write into a file
  double WriteTimer(int index);

 public:
  void Error(std::string str);
};
#endif // ROCM_ASYNC_BW_MYTIME_H_
+25
Просмотреть файл
@@ -0,0 +1,25 @@
#include <unistd.h>
#include <iostream>
#include "hsatimer.hpp"
#include "rocm_async.hpp"
using namespace std;
// Entry point: drives one benchmark object through its whole life cycle.
int main(int argc, char** argv) {
  // The test object keeps the command line for later parsing
  RocmAsync bw_test(argc, argv);

  // Prepare the test
  bw_test.SetUp();

  // Execute the requested tests
  bw_test.Run();

  // Report the results
  bw_test.Display();

  // Release the resources held by the test
  bw_test.Close();

  return 0;
}
+49
Просмотреть файл
@@ -0,0 +1,49 @@
// Compiling for Windows Platform
#ifdef _WIN32
#include "os.hpp"
#include <stdio.h>
#include <stdlib.h>
#include <Windows.h>
// Windows: sets environment variable `env_var_name` to `env_var_value`.
// Prints a message and exits with status 1 when the call fails.
void SetEnv(const char* env_var_name, const char* env_var_value) {
  bool succeeded = SetEnvironmentVariable(env_var_name, env_var_value);
  if (succeeded) {
    return;
  }
  printf("Set environment variable failed!\n");
  exit(1);
}
char* GetEnv(const char* env_var_name) {
char* buff;
DWORD char_count = GetEnvironmentVariable(env_var_name, NULL, 0);
if (char_count == 0) return NULL;
buff = (char*)malloc(sizeof(char) * char_count);
GetEnvironmentVariable(env_var_name, buff, char_count);
buff[char_count - 1] = '\0';
return buff;
}
#endif // End of Windows Code
// Compiling for Linux Platform
#ifdef __linux__
#include "os.hpp"
#include <stdlib.h>
// Linux: sets environment variable `env_var_name` to `env_var_value`,
// replacing any existing value. Prints a message and exits with status 1
// when setenv reports failure.
void SetEnv(const char* env_var_name, const char* env_var_value) {
  if (setenv(env_var_name, env_var_value, 1) == 0) {
    return;
  }
  printf("Set environment variable failed!\n");
  exit(1);
}
// Linux: returns the getenv result for `env_var_name`; NULL when the
// variable is not set.
char* GetEnv(const char* env_var_name) {
  char* value = getenv(env_var_name);
  return value;
}
#endif // End of Linux Code
Исполняемый файл
+14
Просмотреть файл
@@ -0,0 +1,14 @@
#ifndef ROCM_ASYNC_BW_UTILS_OS_H_
#define ROCM_ASYNC_BW_UTILS_OS_H_
#include <stdio.h>
// Set an environment variable
void SetEnv(const char* env_var_name, const char* env_var_value);
// Get the value of an environment variable
char* GetEnv(const char* env_var_name);
#endif // ROCM_ASYNC_BW_UTILS_OS_H_
Исполняемый файл
+478
Просмотреть файл
@@ -0,0 +1,478 @@
#include "common.hpp"
#include "rocm_async.hpp"
#include <stdlib.h>
#include <assert.h>
#include <algorithm>
#include <unistd.h>
#include <cctype>
#include <sstream>
// The values are in megabytes at allocation time
const uint32_t RocmAsync::SIZE_LIST[] = { 64, 128, 256, 512 };
//const uint32_t RocmAsync::SIZE_LIST[] = { 2, 4, 8, 16, 32, 64, 128, 256, 512 };
// Returns the number of copy iterations to run per buffer size: the
// configured iteration count scaled by 1.2, plus one, truncated to an
// integer.
// NOTE(review): the extra iterations are presumably warm-up padding -
// confirm.
uint32_t RocmAsync::GetIterationNum() {
  const double padded = num_iteration_ * 1.2 + 1;
  return padded;
}
// Grants `agent` access to the buffer at `ptr`; a failure goes through
// ErrorCheck.
void RocmAsync::AcquireAccess(hsa_agent_t agent, void* ptr) {
  const hsa_agent_t agents[1] = { agent };
  err_ = hsa_amd_agents_allow_access(1, agents, NULL, ptr);
  ErrorCheck(err_);
}
// Allocates the host-side verification buffers from the system pool and
// creates the signals used to copy to / from them.
//
// For each direction, a source and a destination buffer of `size` bytes are
// allocated; the matching agent is granted access to the new host buffer
// and the CPU agent is granted access to the corresponding benchmark
// buffer. Source buffers are filled with 0x23 and destination buffers with
// 0x00. The reverse-direction resources are only set up when `bidir` is
// true.
void RocmAsync::AllocateHostBuffers(bool bidir, uint32_t size,
                                    void*& src_fwd, void*& dst_fwd,
                                    void* buf_src_fwd, void* buf_dst_fwd,
                                    hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd,
                                    void*& src_rev, void*& dst_rev,
                                    void* buf_src_rev, void* buf_dst_rev,
                                    hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev,
                                    hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev) {
  // Forward direction: host source buffer
  err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, &src_fwd);
  ErrorCheck(err_);
  AcquireAccess(src_agent_fwd, src_fwd);
  AcquireAccess(cpu_agent_, buf_src_fwd);

  // Forward direction: host destination buffer
  err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, &dst_fwd);
  ErrorCheck(err_);
  AcquireAccess(dst_agent_fwd, dst_fwd);
  AcquireAccess(cpu_agent_, buf_dst_fwd);

  // Known contents so a later comparison is meaningful
  memset(src_fwd, 0x23, size);
  memset(dst_fwd, 0x00, size);

  // Signal used to wait on the forward host copy
  // @TODO: replace it with a signal pool call
  err_ = hsa_signal_create(1, 0, NULL, &signal_fwd);
  ErrorCheck(err_);

  if (!bidir) {
    return;
  }

  // Reverse direction: host source buffer
  err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, &src_rev);
  ErrorCheck(err_);
  AcquireAccess(src_agent_rev, src_rev);
  AcquireAccess(cpu_agent_, buf_src_rev);

  // Reverse direction: host destination buffer
  err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, &dst_rev);
  ErrorCheck(err_);
  AcquireAccess(dst_agent_rev, dst_rev);
  AcquireAccess(cpu_agent_, buf_dst_rev);

  // Known contents so a later comparison is meaningful
  memset(src_rev, 0x23, size);
  memset(dst_rev, 0x00, size);

  // Signal used to wait on the reverse host copy
  err_ = hsa_signal_create(1, 0, NULL, &signal_rev);
  ErrorCheck(err_);
}
// Allocates the buffers and signals used by the copy benchmark.
//
// A source and a destination buffer of `size` bytes are allocated from the
// given pools for the forward copy and, when `bidir` is true, for the
// reverse copy as well. Each direction's source agent is then granted
// access to the destination buffer and the destination agent to the source
// buffer, and one completion signal per direction is created.
// The allocated pointers and created signals are returned through the
// reference parameters; any failing HSA call goes through ErrorCheck.
void RocmAsync::AllocateCopyBuffers(bool bidir, uint32_t size,
void*& src_fwd, hsa_amd_memory_pool_t src_pool_fwd,
void*& dst_fwd, hsa_amd_memory_pool_t dst_pool_fwd,
hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd,
void*& src_rev, hsa_amd_memory_pool_t src_pool_rev,
void*& dst_rev, hsa_amd_memory_pool_t dst_pool_rev,
hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev,
hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev) {
// Allocate buffers in src and dst pools for forward copy
err_ = hsa_amd_memory_pool_allocate(src_pool_fwd, size, 0, &src_fwd);
ErrorCheck(err_);
err_ = hsa_amd_memory_pool_allocate(dst_pool_fwd, size, 0, &dst_fwd);
ErrorCheck(err_);
// Allocate buffers in src and dst pools for reverse copy
if (bidir) {
err_ = hsa_amd_memory_pool_allocate(src_pool_rev, size, 0, &src_rev);
ErrorCheck(err_);
err_ = hsa_amd_memory_pool_allocate(dst_pool_rev, size, 0, &dst_rev);
ErrorCheck(err_);
}
// Acquire access to src and dst buffers for forward copy
// (each agent is given access to the buffer on the opposite side)
AcquireAccess(src_agent_fwd, dst_fwd);
AcquireAccess(dst_agent_fwd, src_fwd);
// Acquire access to src and dst buffers for reverse copy
if (bidir) {
AcquireAccess(src_agent_rev, dst_rev);
AcquireAccess(dst_agent_rev, src_rev);
}
// Create a signal to wait on copy operation
// (initial value 1)
// @TODO: replace it with a signal pool call
err_ = hsa_signal_create(1, 0, NULL, &signal_fwd);
ErrorCheck(err_);
if (bidir) {
err_ = hsa_signal_create(1, 0, NULL, &signal_rev);
ErrorCheck(err_);
}
}
// Frees the buffers and destroys the signal of the forward copy and, when
// `bidir` is true, those of the reverse copy as well. Any failing HSA call
// goes through ErrorCheck.
void RocmAsync::ReleaseBuffers(bool bidir,
                               void* src_fwd, void* src_rev,
                               void* dst_fwd, void* dst_rev,
                               hsa_signal_t signal_fwd,
                               hsa_signal_t signal_rev) {
  // Forward copy resources
  err_ = hsa_amd_memory_pool_free(src_fwd);
  ErrorCheck(err_);
  err_ = hsa_amd_memory_pool_free(dst_fwd);
  ErrorCheck(err_);
  err_ = hsa_signal_destroy(signal_fwd);
  ErrorCheck(err_);

  // Reverse copy resources exist only for bidirectional runs
  if (!bidir) {
    return;
  }

  err_ = hsa_amd_memory_pool_free(src_rev);
  ErrorCheck(err_);
  err_ = hsa_amd_memory_pool_free(dst_rev);
  ErrorCheck(err_);
  err_ = hsa_signal_destroy(signal_rev);
  ErrorCheck(err_);
}
double RocmAsync::GetGpuCopyTime(bool bidir,
hsa_signal_t signal_fwd,
hsa_signal_t signal_rev) {
// Obtain time taken for forward copy
hsa_amd_profiling_async_copy_time_t async_time_fwd = {0};
err_= hsa_amd_profiling_get_async_copy_time(signal_fwd, &async_time_fwd);
ErrorCheck(err_);
if (bidir == false) {
return(async_time_fwd.end - async_time_fwd.start);
}
hsa_amd_profiling_async_copy_time_t async_time_rev = {0};
err_= hsa_amd_profiling_get_async_copy_time(signal_rev, &async_time_rev);
ErrorCheck(err_);
double start = min(async_time_fwd.start, async_time_rev.start);
double end = max(async_time_fwd.end, async_time_rev.end);
return(end - start);
}
// Copies `size` bytes from `src` to `dst` with an asynchronous copy and
// blocks until `signal` drops below 1.
// The caller is expected to have set `signal` to 1 beforehand.
void RocmAsync::copy_buffer(void* dst, hsa_agent_t dst_agent,
                            void* src, hsa_agent_t src_agent,
                            size_t size, hsa_signal_t signal) {
  // Submit the copy without dependent signals
  err_ = hsa_amd_memory_async_copy(dst, dst_agent, src, src_agent,
                                   size, 0, NULL, signal);
  ErrorCheck(err_);

  // Keep waiting while the observed signal value is non-zero
  while (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
                                 uint64_t(-1), HSA_WAIT_STATE_ACTIVE) != 0) {
  }
}
// Runs the copy benchmark described by `trans` and stores the measured
// times in it.
//
// For every entry of size_list_ (in megabytes) the copy is repeated
// GetIterationNum() times. The CPU-side wall time of each iteration is
// recorded and, when trans.copy.uses_gpu_ is set, the time reported by the
// copy's profiling data. The minimum and mean of each series are appended
// to the trans.*_min_time_ / trans.*_avg_time_ vectors. With verify_ set,
// the copied data is read back into host buffers and compared with the
// source data after every iteration.
//
// Changes relative to the original:
//  - A data mismatch is now always fatal. The memcmp result used to be cast
//    to hsa_status_t, so a result of 1 equalled HSA_STATUS_INFO_BREAK and
//    was accepted as success.
//  - Pointers and signals of the unused direction start out as null / zero
//    handles instead of being passed around uninitialized.
//  - One PerfTimer is created up front and reset per iteration. Its
//    constructor calibrates the TSC by spinning for 1e9 ticks, which used
//    to happen on every iteration.
void RocmAsync::RunCopyBenchmark(async_trans_t& trans) {

  // Bind if this transaction is bidirectional
  bool bidir = trans.copy.bidir_;

  // Initialize size of buffer to equal the largest element of allocation
  uint32_t size_len = size_list_.size();
  uint32_t max_size = size_list_.back() * 1024 * 1024;

  // Bind to resources such as pool and agents that are involved
  // in both forward and reverse copy operations
  void* buf_src_fwd = NULL;
  void* buf_dst_fwd = NULL;
  void* buf_src_rev = NULL;
  void* buf_dst_rev = NULL;
  void* host_src_fwd = NULL;
  void* host_dst_fwd = NULL;
  void* host_src_rev = NULL;
  void* host_dst_rev = NULL;
  hsa_signal_t signal_fwd = {0};
  hsa_signal_t signal_rev = {0};
  hsa_signal_t host_signal_fwd = {0};
  hsa_signal_t host_signal_rev = {0};
  hsa_amd_memory_pool_t src_pool_fwd = trans.copy.src_pool_;
  hsa_amd_memory_pool_t dst_pool_fwd = trans.copy.dst_pool_;
  hsa_amd_memory_pool_t src_pool_rev = dst_pool_fwd;
  hsa_amd_memory_pool_t dst_pool_rev = src_pool_fwd;
  hsa_agent_t src_agent_fwd = pool_list_[trans.copy.src_idx_].owner_agent_;
  hsa_agent_t dst_agent_fwd = pool_list_[trans.copy.dst_idx_].owner_agent_;
  hsa_agent_t src_agent_rev = dst_agent_fwd;
  hsa_agent_t dst_agent_rev = src_agent_fwd;

  // Allocate buffers and signal objects
  AllocateCopyBuffers(bidir, max_size,
                      buf_src_fwd, src_pool_fwd,
                      buf_dst_fwd, dst_pool_fwd,
                      src_agent_fwd, dst_agent_fwd,
                      buf_src_rev, src_pool_rev,
                      buf_dst_rev, dst_pool_rev,
                      src_agent_rev, dst_agent_rev,
                      signal_fwd, signal_rev);

  if (verify_) {
    AllocateHostBuffers(bidir, max_size,
                        host_src_fwd, host_dst_fwd,
                        buf_src_fwd, buf_dst_fwd,
                        src_agent_fwd, dst_agent_fwd,
                        host_src_rev, host_dst_rev,
                        buf_src_rev, buf_dst_rev,
                        src_agent_rev, dst_agent_rev,
                        host_signal_fwd, host_signal_rev);

    // Initialize source buffer with values from verification buffer
    copy_buffer(buf_src_fwd, src_agent_fwd,
                host_src_fwd, cpu_agent_,
                max_size, host_signal_fwd);
    ErrorCheck(err_);
    if (bidir) {
      copy_buffer(buf_src_rev, src_agent_rev,
                  host_src_rev, cpu_agent_,
                  max_size, host_signal_rev);
      ErrorCheck(err_);
    }
  }

  // Bind the number of iterations
  uint32_t iterations = GetIterationNum();

  // Create the timer once; it is reset before every measurement
  PerfTimer timer;
  uint32_t index = timer.CreateTimer();

  // Iterate through the different buffer sizes to
  // compute the bandwidth as determined by copy
  for (uint32_t idx = 0; idx < size_len; idx++) {

    // This should not be happening
    uint32_t curr_size = size_list_[idx] * 1024 * 1024;
    if (curr_size > max_size) {
      break;
    }

    std::vector<double> cpu_time;
    std::vector<double> gpu_time;
    for (uint32_t it = 0; it < iterations; it++) {
#if DEBUG
      printf(".");
      fflush(stdout);
#endif

      // Re-arm the completion signals
      hsa_signal_store_relaxed(signal_fwd, 1);
      if (bidir) {
        hsa_signal_store_relaxed(signal_rev, 1);
      }

      if (verify_) {
        AcquireAccess(src_agent_fwd, buf_dst_fwd);
        AcquireAccess(dst_agent_fwd, buf_src_fwd);
        if (bidir) {
          AcquireAccess(src_agent_rev, buf_dst_rev);
          AcquireAccess(dst_agent_rev, buf_src_rev);
        }
      }

      // Start from a cleared timer for this iteration
      timer.ResetTimer(index);

      // Start the timer and launch forward copy operation
      timer.StartTimer(index);
      err_ = hsa_amd_memory_async_copy(buf_dst_fwd, dst_agent_fwd,
                                       buf_src_fwd, src_agent_fwd,
                                       curr_size, 0, NULL, signal_fwd);
      ErrorCheck(err_);

      // Launch reverse copy operation if it is bidirectional
      if (bidir) {
        err_ = hsa_amd_memory_async_copy(buf_dst_rev, dst_agent_rev,
                                         buf_src_rev, src_agent_rev,
                                         curr_size, 0, NULL, signal_rev);
        ErrorCheck(err_);
      }

      // Wait for the forward copy operation to complete
      while (hsa_signal_wait_acquire(signal_fwd, HSA_SIGNAL_CONDITION_LT, 1,
                                     uint64_t(-1), HSA_WAIT_STATE_ACTIVE));

      // Wait for the reverse copy operation to complete
      if (bidir) {
        while (hsa_signal_wait_acquire(signal_rev, HSA_SIGNAL_CONDITION_LT, 1,
                                       uint64_t(-1), HSA_WAIT_STATE_ACTIVE));
      }

      // Stop the timer object
      timer.StopTimer(index);

      // Push the time taken for copy into a vector of copy times
      cpu_time.push_back(timer.ReadTimer(index));

      // Collect time from the signal(s)
      if (trans.copy.uses_gpu_) {
        double temp = GetGpuCopyTime(bidir, signal_fwd, signal_rev);
        gpu_time.push_back(temp);
      }

      if (verify_) {

        // Re-Establish access to destination buffer and host buffer
        AcquireAccess(cpu_agent_, buf_dst_fwd);
        AcquireAccess(dst_agent_fwd, host_dst_fwd);

        // Init dst buffer with values from outbuffer of copy operation
        hsa_signal_store_relaxed(host_signal_fwd, 1);
        copy_buffer(host_dst_fwd, cpu_agent_,
                    buf_dst_fwd, dst_agent_fwd,
                    curr_size, host_signal_fwd);
        ErrorCheck(err_);

        // Compare output equals input; any difference is an error
        if (memcmp(host_src_fwd, host_dst_fwd, curr_size) != 0) {
          std::cout << "Verification failed: forward copy data mismatch"
                    << std::endl;
          err_ = HSA_STATUS_ERROR;
        } else {
          err_ = HSA_STATUS_SUCCESS;
        }
        ErrorCheck(err_);

        if (bidir) {

          // Re-Establish access to destination buffer and host buffer
          AcquireAccess(cpu_agent_, buf_dst_rev);
          AcquireAccess(dst_agent_rev, host_dst_rev);

          hsa_signal_store_relaxed(host_signal_rev, 1);
          copy_buffer(host_dst_rev, cpu_agent_,
                      buf_dst_rev, dst_agent_rev,
                      curr_size, host_signal_rev);
          ErrorCheck(err_);

          // Compare output equals input; any difference is an error
          if (memcmp(host_src_rev, host_dst_rev, curr_size) != 0) {
            std::cout << "Verification failed: reverse copy data mismatch"
                      << std::endl;
            err_ = HSA_STATUS_ERROR;
          } else {
            err_ = HSA_STATUS_SUCCESS;
          }
          ErrorCheck(err_);
        }
      }
    }
#if DEBUG
    std::cout << std::endl;
#endif

    // Get Cpu min copy time
    trans.cpu_min_time_.push_back(GetMinTime(cpu_time));
    // Get Cpu mean copy time and store to the array
    trans.cpu_avg_time_.push_back(GetMeanTime(cpu_time));

    if (trans.copy.uses_gpu_) {
      // Get Gpu min copy time
      trans.gpu_min_time_.push_back(GetMinTime(gpu_time));
      // Get Gpu mean copy time and store to the array
      trans.gpu_avg_time_.push_back(GetMeanTime(gpu_time));
    }

    // Clear the stack of cpu times
    cpu_time.clear();
    gpu_time.clear();
  }

  // Free up buffers and signal objects used in copy operation
  ReleaseBuffers(bidir, buf_src_fwd, buf_src_rev,
                 buf_dst_fwd, buf_dst_rev, signal_fwd, signal_rev);

  if (verify_) {
    ReleaseBuffers(bidir, host_src_fwd, host_src_rev,
                   host_dst_fwd, host_dst_rev, host_signal_fwd, host_signal_rev);
  }
}
// Read / write requests are not implemented: report that and exit with
// status 1.
void RocmAsync::RunIOBenchmark(async_trans_t& trans) {
  std::cout << "Unsupported Request - Read / Write";
  std::cout << std::endl;
  exit(1);
}
// Executes every transaction in trans_list_ with async-copy profiling
// switched on for the duration of the run.
void RocmAsync::Run() {
  // Profiling must be on for the copy times to be available
  err_ = hsa_amd_profiling_async_copy_enable(true);
  ErrorCheck(err_);

  // Dispatch each transaction on its request type
  const uint32_t trans_count = trans_list_.size();
  for (uint32_t idx = 0; idx < trans_count; ++idx) {
    async_trans_t& trans = trans_list_[idx];
    switch (trans.req_type_) {
      case REQ_COPY_BIDIR:
      case REQ_COPY_UNIDIR:
        RunCopyBenchmark(trans);
        break;
      case REQ_READ:
      case REQ_WRITE:
        RunIOBenchmark(trans);
        break;
      default:
        break;
    }
  }

  // Switch profiling off again
  err_ = hsa_amd_profiling_async_copy_enable(false);
  ErrorCheck(err_);
}
void RocmAsync::Close() {
hsa_status_t status = hsa_shut_down();
ErrorCheck(status);
return;
}
// Prepares the bandwidth test object for the run:
//
//    Parses the user arguments
//    Validates them
//    Builds the list of transactions to execute
//    Optionally prints debug information
//
// When validation or list construction fails, the help screen is printed
// and the process exits with status 1.
void RocmAsync::SetUp() {

  // Parse user arguments
  ParseArguments();

  // Validate the arguments, then build the list of transactions
  // (copy, read, write); the list is only built after validation passed
  if (!ValidateArguments() || !BuildTransList()) {
    PrintHelpScreen();
    exit(1);
  }

  // Debug output is enabled by the mere presence of PRINT_DEBUG
  if (getenv("PRINT_DEBUG") != NULL) {
    //PrintAgentsList();
    //PrintPoolsList();
    PrintTransList();
    //PrintTopology();
  }
}
// Constructor: stores the command line for later parsing and puts every
// request slot and option into its "nothing requested" state.
RocmAsync::RocmAsync(int argc, char** argv) : BaseTest() {
  // Command line as received by main
  usr_argv_ = argv;
  usr_argc_ = argc;

  // No request type has been selected yet
  req_copy_unidir_ = REQ_INVALID;
  req_copy_bidir_ = REQ_INVALID;
  req_write_ = REQ_INVALID;
  req_read_ = REQ_INVALID;

  // Counters start at zero; verification is off
  agent_index_ = 0;
  pool_index_ = 0;
  verify_ = false;
}

// Destructor: nothing to release here.
RocmAsync::~RocmAsync() { }
+338
Просмотреть файл
@@ -0,0 +1,338 @@
#ifndef __ROCM_ASYNC_BW_H__
#define __ROCM_ASYNC_BW_H__
#include "hsa/hsa.h"
#include "base_test.hpp"
#include "hsatimer.hpp"
#include "common.hpp"
#include <vector>
using namespace std;
// Pairs a RocR agent with its position in the agent list and its
// device type
typedef struct agent_info {

  agent_info(hsa_agent_t agent,
             uint32_t index, hsa_device_type_t device_type)
      : index_(index), agent_(agent), device_type_(device_type) {}

  // Leaves every field uninitialized
  agent_info() {}

  uint32_t index_;
  hsa_agent_t agent_;
  hsa_device_type_t device_type_;

} agent_info_t;
// Describes one memory pool: the pool handle, its owning agent and the
// properties recorded for it
typedef struct pool_info {

  pool_info(hsa_agent_t agent, uint32_t agent_index,
            hsa_amd_memory_pool_t pool, hsa_amd_segment_t segment,
            size_t size, uint32_t index, bool is_fine_grained,
            bool is_kernarg, bool access_to_all,
            hsa_amd_memory_pool_access_t owner_access)
      : index_(index),
        is_kernarg_(is_kernarg),
        access_to_all_(access_to_all),
        is_fine_grained_(is_fine_grained),
        allocable_size_(size),
        agent_index_(agent_index),
        owner_agent_(agent),
        segment_(segment),
        pool_(pool),
        owner_access_(owner_access) {}

  // Leaves every field uninitialized
  pool_info() {}

  uint32_t index_;
  bool is_kernarg_;
  bool access_to_all_;
  bool is_fine_grained_;
  size_t allocable_size_;
  uint32_t agent_index_;
  hsa_agent_t owner_agent_;
  hsa_amd_segment_t segment_;
  hsa_amd_memory_pool_t pool_;
  hsa_amd_memory_pool_access_t owner_access_;

} pool_info_t;
// Used to print out topology info
typedef struct agent_pool_info {
agent_pool_info() {}
agent_info agent;
vector<pool_info_t> pool_list;
} agent_pool_info_t;
// One unit of work for the benchmark. req_type_ selects which
// member of the anonymous union is meaningful: "copy" for copy
// requests, "kernel" for read / write requests. The union is
// not initialized by the constructor.
typedef struct async_trans {

  // Kind of request, one of the Request_Type values
  uint32_t req_type_;

  union {

    // Parameters of a copy request
    struct {
      bool bidir_;
      bool uses_gpu_;
      uint32_t src_idx_;
      uint32_t dst_idx_;
      hsa_amd_memory_pool_t src_pool_;
      hsa_amd_memory_pool_t dst_pool_;
    } copy;

    // Parameters of a read / write request
    struct {
      void* code_;
      uint32_t agent_idx_;
      hsa_agent_t agent_;
      uint32_t pool_idx_;
      hsa_amd_memory_pool_t pool_;
    } kernel;
  };

  // Average copy time as measured on the Cpu
  vector<double> cpu_avg_time_;

  // Minimum copy time as measured on the Cpu
  vector<double> cpu_min_time_;

  // Average copy time as measured on the Gpu
  vector<double> gpu_avg_time_;

  // Minimum copy time as measured on the Gpu
  vector<double> gpu_min_time_;

  // Creates a transaction of the given request type
  async_trans(uint32_t req_type) : req_type_(req_type) {}

} async_trans_t;
// Kinds of request a user can make. REQ_INVALID marks a
// request kind that has not been asked for.
typedef enum Request_Type {

  REQ_READ        = 1,   // Read from a memory pool
  REQ_WRITE       = 2,   // Write to a memory pool
  REQ_COPY_BIDIR  = 3,   // Bidirectional copy between pools
  REQ_COPY_UNIDIR = 4,   // Unidirectional copy between pools
  REQ_INVALID     = 5    // No request

} Request_Type;
// Benchmark driver. Parses the user's command line, discovers the
// agents and memory pools of the system, builds a list of
// transactions (copy / read / write) and reports their timings.
class RocmAsync : public BaseTest {

 public:

  // @brief: Constructor, captures the command line of the user
  RocmAsync(int argc, char** argv);

  // @brief: Destructor
  virtual ~RocmAsync();

  // @brief: Set up the environment for measurement
  virtual void SetUp();

  // @brief: Core measurement execution
  virtual void Run();

  // @brief: Clean up and release resources
  virtual void Close();

  // @brief: Display the results
  virtual void Display() const;

 private:

  // @brief: Print the help screen
  void PrintHelpScreen();

  // @brief: Discover the agents and memory pools of the platform
  void DiscoverTopology();

  // @brief: Print topology info
  void PrintTopology();

  // @brief: Print info on agents in system
  void PrintAgentsList();

  // @brief: Print info on memory pools in system
  void PrintPoolsList();

  // @brief: Parse the arguments provided by user and
  // record the requests they describe
  void ParseArguments();

  // @brief: Print the list of transactions
  void PrintTransList();

  // @brief: Run a read / write request
  void RunIOBenchmark(async_trans_t& trans);

  // @brief: Run a copy request
  void RunCopyBenchmark(async_trans_t& trans);

  // @brief: Get iteration number
  uint32_t GetIterationNum();

  // @brief: Get the mean copy time
  double GetMeanTime(std::vector<double>& vec);

  // @brief: Get the min copy time
  double GetMinTime(std::vector<double>& vec);

  // @brief: Display benchmark results of a transaction
  void DisplayIOTime(async_trans_t& trans) const;
  void DisplayCopyTime(async_trans_t& trans) const;

 private:

  // @brief: Validate the arguments passed in by user
  bool ValidateArguments();
  bool ValidateReadReq();
  bool ValidateWriteReq();
  bool ValidateReadOrWriteReq(vector<uint32_t>& in_list);
  bool ValidateBidirCopyReq();
  bool ValidateUnidirCopyReq();
  bool ValidateCopyReq(vector<uint32_t>& in_list);
  void PrintIOAccessError(uint32_t agent_idx, uint32_t pool_idx);
  void PrintCopyAccessError(uint32_t src_pool_idx, uint32_t dst_pool_idx);
  bool PoolIsPresent(vector<uint32_t>& in_list);
  bool PoolIsDuplicated(vector<uint32_t>& in_list);

  // @brief: Build the list of transactions per user request
  bool BuildTransList();
  bool BuildReadTrans();
  bool BuildWriteTrans();
  bool BuildBidirCopyTrans();
  bool BuildUnidirCopyTrans();
  bool BuildReadOrWriteTrans(uint32_t req_type,
                             vector<uint32_t>& in_list);
  bool BuildCopyTrans(uint32_t req_type,
                      vector<uint32_t>& src_list,
                      vector<uint32_t>& dst_list);

  // @brief: Buffer and signal management for copy operations
  void AllocateCopyBuffers(bool bidir, uint32_t size,
                           void*& src_fwd, hsa_amd_memory_pool_t src_pool_fwd,
                           void*& dst_fwd, hsa_amd_memory_pool_t dst_pool_fwd,
                           hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd,
                           void*& src_rev, hsa_amd_memory_pool_t src_pool_rev,
                           void*& dst_rev, hsa_amd_memory_pool_t dst_pool_rev,
                           hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev,
                           hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev);
  void ReleaseBuffers(bool bidir,
                      void* src_fwd, void* src_rev,
                      void* dst_fwd, void* dst_rev,
                      hsa_signal_t signal_fwd, hsa_signal_t signal_rev);
  double GetGpuCopyTime(bool bidir, hsa_signal_t signal_fwd, hsa_signal_t signal_rev);
  void AllocateHostBuffers(bool bidir, uint32_t size,
                           void*& src_fwd, void*& dst_fwd,
                           void* buf_src_fwd, void* buf_dst_fwd,
                           hsa_agent_t src_agent_fwd, hsa_agent_t dst_agent_fwd,
                           void*& src_rev, void*& dst_rev,
                           void* buf_src_rev, void* buf_dst_rev,
                           hsa_agent_t src_agent_rev, hsa_agent_t dst_agent_rev,
                           hsa_signal_t& signal_fwd, hsa_signal_t& signal_rev);
  void copy_buffer(void* dst, hsa_agent_t dst_agent,
                   void* src, hsa_agent_t src_agent,
                   size_t size, hsa_signal_t signal);

  // @brief: Acquire access to a buffer on behalf of an agent
  void AcquireAccess(hsa_agent_t agent, void* ptr);

  // Callbacks used to find agents and memory pools and update
  // the data structures used to maintain system topology
  friend hsa_status_t AgentInfo(hsa_agent_t agent, void* data);
  friend hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data);

 protected:

  // More variables declared for testing
  // vector<transaction> tran_;

  // Running count of agents discovered so far
  uint32_t agent_index_;

  // List used to store agent info, indexed by agent_index_
  vector<agent_info_t> agent_list_;

  // Running count of memory pools discovered so far
  uint32_t pool_index_;

  // List used to store pool_info_t, indexed by pool_index_
  vector<pool_info_t> pool_list_;

  // List used to store agent_pool_info_t
  vector<agent_pool_info_t> agent_pool_list_;

  // Indices of memory pools involved in bidirectional
  // copy operations
  vector<uint32_t> bidir_list_;

  // Indices of source memory pools of unidirectional
  // copy operations
  vector<uint32_t> src_list_;

  // Indices of destination memory pools of unidirectional
  // copy operations
  vector<uint32_t> dst_list_;

  // Read requests, stored as consecutive tuples: index of
  // the memory pool followed by index of the agent that
  // executes the read operation
  vector<uint32_t> read_list_;

  // Write requests, stored as consecutive tuples: index of
  // the memory pool followed by index of the agent that
  // executes the write operation
  vector<uint32_t> write_list_;

  // List of sizes to use in copy and read/write transactions
  // Size is specified in terms of Megabytes
  vector<uint32_t> size_list_;

  // Type of service requested by user
  uint32_t req_read_;
  uint32_t req_write_;
  uint32_t req_copy_bidir_;
  uint32_t req_copy_unidir_;

  // List used to store transactions per user request
  vector<async_trans_t> trans_list_;

  // Number of command line arguments
  uint32_t usr_argc_;

  // Command line argument strings
  char** usr_argv_;

  // BenchMark copy time
  vector<double> op_time_;

  // Min time
  vector<double> min_time_;

  // Determines if user has requested verification
  bool verify_;

  // CPU agent used for verification
  hsa_agent_t cpu_agent_;

  // System region
  hsa_amd_memory_pool_t sys_pool_;

  // Default buffer sizes, used when user does not specify any
  static const uint32_t SIZE_LIST[4];
  //static const uint32_t SIZE_LIST[9];

};
#endif
+206
Просмотреть файл
@@ -0,0 +1,206 @@
#include "common.hpp"
#include "rocm_async.hpp"
#include <algorithm>
#include <sstream>
#include <unistd.h>
// Parses an option value string made of one or more unsigned
// decimal values separated by commas - "3,6,9,12,15" - and
// appends each value to value_list in the order encountered.
//
// @param value      Null terminated option value string
// @param value_list List that receives the parsed values. Values
//                   parsed before a malformed element is found
//                   remain in the list.
//
// @return true if the whole string is a well formed list; false
//         if it is empty, has an element that is not a number
//         (including a trailing comma) or uses a delimiter
//         other than a comma
static bool ParseOptionValue(char* value, std::vector<uint32_t>& value_list) {

  // Streaming a null pointer is not allowed
  if (value == nullptr) {
    return false;
  }

  // Capture the option value string
  std::stringstream stream;
  stream << value;

  uint32_t token = 0;
  while (true) {

    // Read the next value. A failed extraction used to go unnoticed,
    // which let "" and "3," through with a bogus value in the list
    if (!(stream >> token)) {
      return false;
    }
    value_list.push_back(token);

    // Nothing left after the value, the list is complete
    const int next = stream.peek();
    if (next == std::char_traits<char>::eof()) {
      return true;
    }

    // The only thing allowed after a value is the delimiter
    if (next != ',') {
      return false;
    }
    stream.ignore();
  }
}
// @brief: Parses the command line captured by the constructor.
//
// Records which kinds of request (read, write, unidirectional and
// bidirectional copy) the user has made and the lists of indices
// and buffer sizes that go with them. In addition this method:
//
//    Prints the help screen and exits if "-h" is given or if
//    an option is unknown, malformed or is missing its operand
//    Initializes the Roc runtime and discovers system topology
//    Prints the topology and exits if "-t" is given
//    Prints the help screen and exits if both "-a" and "-A" are given
//    Expands "-a" / "-A" into lists covering all discovered pools
//    Falls back to the default SIZE_LIST if "-m" is not given
//    Sorts the list of buffer sizes in ascending order
//
void RocmAsync::ParseArguments() {

  bool print_help = false;
  bool copy_all_bi = false;
  bool copy_all_uni = false;
  bool print_topology = false;

  // This will suppress prints from getopt implementation
  // In case of error, it will return the character '?' as
  // return value.
  opterr = 0;

  int opt;
  bool status;
  while ((opt = getopt(usr_argc_, usr_argv_, "hvtaAb:s:d:r:w:m:")) != -1) {
    switch (opt) {

      // Print help screen
      case 'h':
        print_help = true;
        break;

      // Print system topology
      case 't':
        print_topology = true;
        break;

      // Set verification flag to true
      case 'v':
        verify_ = true;
        break;

      // Collect list of pools involved in bidirectional copy operation
      case 'b':
        status = ParseOptionValue(optarg, bidir_list_);
        if (status) {
          req_copy_bidir_ = REQ_COPY_BIDIR;
          break;
        }
        print_help = true;
        break;

      // Collect list of source pools involved in unidirectional copy operation
      case 's':
        status = ParseOptionValue(optarg, src_list_);
        if (status) {
          req_copy_unidir_ = REQ_COPY_UNIDIR;
          break;
        }
        print_help = true;
        break;

      // Collect list of destination pools involved in unidirectional copy operation
      case 'd':
        status = ParseOptionValue(optarg, dst_list_);
        if (status) {
          req_copy_unidir_ = REQ_COPY_UNIDIR;
          break;
        }
        print_help = true;
        break;

      // Collect request to read a buffer
      case 'r':
        req_read_ = REQ_READ;
        status = ParseOptionValue(optarg, read_list_);
        if (status == false) {
          print_help = true;
        }
        break;

      // Collect request to write a buffer
      case 'w':
        req_write_ = REQ_WRITE;
        status = ParseOptionValue(optarg, write_list_);
        if (status == false) {
          print_help = true;
        }
        break;

      // Size of buffers to use in copy and read/write operations
      case 'm':
        status = ParseOptionValue(optarg, size_list_);
        if (status == false) {
          print_help = true;
        }
        break;

      // Enable Unidirectional copy among all valid pools
      case 'a':
        copy_all_uni = true;
        req_copy_unidir_ = REQ_COPY_UNIDIR;
        break;

      // Enable Bidirectional copy among all valid pools
      case 'A':
        copy_all_bi = true;
        req_copy_bidir_ = REQ_COPY_BIDIR;
        break;

      // getopt implementation returns the value of the unknown
      // option or an option with missing operand in the variable
      // optopt. Report that character (not the '?' getopt returned)
      // and name every option that takes an operand, i.e. those
      // followed by ':' in the option string above
      case '?':
        std::cout << "Value of optopt is: " << static_cast<char>(optopt) << std::endl;
        if ((optopt == 'b') || (optopt == 's') || (optopt == 'd') ||
            (optopt == 'r') || (optopt == 'w') || (optopt == 'm')) {
          std::cout << "Error: Option -b -s -d -r -w and -m require argument" << std::endl;
        }
        print_help = true;
        break;

      default:
        print_help = true;
        break;
    }
  }

  // Print help screen if user option has "-h" or
  // if any of the options could not be processed
  if (print_help) {
    PrintHelpScreen();
    exit(0);
  }

  // Initialize Roc Runtime
  err_ = hsa_init();
  ErrorCheck(err_);

  // Discover the topology of RocR agent in system
  DiscoverTopology();

  // Print system topology if user option has "-t"
  if (print_topology) {
    PrintTopology();
    exit(0);
  }

  // Invalidate request if user has requested full
  // copying for both unidirectional and bidirectional
  if ((copy_all_bi) && (copy_all_uni)) {
    PrintHelpScreen();
    exit(0);
  }

  // Initialize pool list if full copying in unidirectional mode is enabled
  if (copy_all_uni) {
    uint32_t size = pool_list_.size();
    for (uint32_t idx = 0; idx < size; idx++) {
      src_list_.push_back(idx);
      dst_list_.push_back(idx);
    }
  }

  // Initialize pool list if full copying in bidirectional mode is enabled
  if (copy_all_bi) {
    uint32_t size = pool_list_.size();
    for (uint32_t idx = 0; idx < size; idx++) {
      bidir_list_.push_back(idx);
    }
  }

  // Initialize the list of buffer sizes to use in copy/read/write operations
  if (size_list_.size() == 0) {
    uint32_t size_len = sizeof(SIZE_LIST)/sizeof(uint32_t);
    for (uint32_t idx = 0; idx < size_len; idx++) {
      size_list_.push_back(SIZE_LIST[idx]);
    }
  }

  // Sizes are processed in ascending order
  std::sort(size_list_.begin(), size_list_.end());
}
+177
Просмотреть файл
@@ -0,0 +1,177 @@
#include "common.hpp"
#include "rocm_async.hpp"
// @brief: Print Help Menu Screen
//
// The options listed here are kept in step with the option string
// handed to getopt by ParseArguments ("hvtaAb:s:d:r:w:m:"): "-v" is
// accepted there and is therefore listed, while "-g" is not accepted
// and is therefore no longer advertised.
void RocmAsync::PrintHelpScreen() {

  std::cout << std::endl;
  std::cout << "Runs with following options:" << std::endl;
  std::cout << std::endl;
  std::cout << "\t -h Prints the help screen" << std::endl;
  std::cout << "\t -v Enables verification" << std::endl;
  std::cout << "\t -t Prints system topology and its memory pools" << std::endl;
  std::cout << "\t -m List of buffer sizes to use, specified in Megabytes" << std::endl;
  std::cout << "\t -r List of pool,agent pairs engaged in Read operation" << std::endl;
  std::cout << "\t -w List of pool,agent pairs engaged in Write operation" << std::endl;
  std::cout << "\t -b List pools to use in bidirectional copy operations" << std::endl;
  std::cout << "\t -s List of source pools to use in copy unidirectional operations" << std::endl;
  std::cout << "\t -d List of destination pools to use in unidirectional copy operations" << std::endl;
  std::cout << "\t -a Perform Unidirectional Copy involving all pool combinations" << std::endl;
  std::cout << "\t -A Perform Bidirectional Copy involving all pool combinations" << std::endl;
  std::cout << std::endl;

  std::cout << "\t @note 1: Removes copyReq(srcI, dstI) - where Src & Dst Pools are same" << std::endl;
  std::cout << std::endl;
  std::cout << "\t @note 2: Removes copyReq(srcI, dstJ) - where Src & Dst Pools are Cpu bound " << std::endl;
  std::cout << std::endl;
  std::cout << "\t @note 3: Treats copyReq(dstI, srcJ) as NOT EQUAL to copyReq(dstJ, srcI) " << std::endl;
  std::cout << "\t Underlying copy engine could be different " << std::endl;
  std::cout << std::endl;
}
// @brief: Print the topology of Memory Pools and Agents present in system
void RocmAsync::PrintTopology() {
size_t count = agent_pool_list_.size();
std::cout << std::endl;
for (uint32_t idx = 0; idx < count; idx++) {
agent_pool_info_t node = agent_pool_list_.at(idx);
// Print agent info
std::cout << "Agent: " << node.agent.index_ << std::endl;
if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_)
std::cout << " Agent Device Type: CPU" << std::endl;
else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_)
std::cout << " Agent Device Type: GPU" << std::endl;
// Print pool info
size_t pool_count = node.pool_list.size();
for (uint32_t jdx = 0; jdx < pool_count; jdx++) {
std::cout << " Memory Pool: "
<< node.pool_list.at(jdx).index_ << std::endl;
std::cout << " max allocable size in KB: "
<< node.pool_list.at(jdx).allocable_size_ / 1024 << std::endl;
std::cout << " segment id: "
<< node.pool_list.at(jdx).segment_ << std::endl;
std::cout << " is kernarg: "
<< node.pool_list.at(jdx).is_kernarg_ << std::endl;
std::cout << " is fine-grained: "
<< node.pool_list.at(jdx).is_fine_grained_ << std::endl;
std::cout << " accessible to owner: "
<< node.pool_list.at(jdx).owner_access_ << std::endl;
std::cout << " accessible to all by default: "
<< node.pool_list.at(jdx).access_to_all_ << std::endl;
}
std::cout << std::endl;
}
std::cout << std::endl;
}
// @brief: Print info on agents in system
void RocmAsync::PrintAgentsList() {
size_t count = agent_pool_list_.size();
for (uint32_t idx = 0; idx < count; idx++) {
std::cout << std::endl;
agent_pool_info_t node = agent_pool_list_.at(idx);
std::cout << "Agent: " << node.agent.index_ << std::endl;
if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_)
std::cout << " Agent Device Type: CPU" << std::endl;
else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_)
std::cout << " Agent Device Type: GPU" << std::endl;
}
std::cout << std::endl;
}
// @brief: Print info on memory pools in system
void RocmAsync::PrintPoolsList() {
size_t pool_count = pool_list_.size();
for (uint32_t jdx = 0; jdx < pool_count; jdx++) {
std::cout << std::endl;
std::cout << "Memory Pool Idx: "
<< pool_list_.at(jdx).index_ << std::endl;
std::cout << " max allocable size in KB: "
<< pool_list_.at(jdx).allocable_size_ / 1024 << std::endl;
std::cout << " segment id: "
<< pool_list_.at(jdx).segment_ << std::endl;
std::cout << " is kernarg: "
<< pool_list_.at(jdx).is_kernarg_ << std::endl;
std::cout << " is fine-grained: "
<< pool_list_.at(jdx).is_fine_grained_ << std::endl;
std::cout << " accessible to owner: "
<< pool_list_.at(jdx).owner_access_ << std::endl;
std::cout << " accessible to all by default: "
<< pool_list_.at(jdx).access_to_all_ << std::endl;
}
std::cout << std::endl;
}
// @brief: Print the list of transactions that will be executed
void RocmAsync::PrintTransList() {
size_t count = trans_list_.size();
for (uint32_t idx = 0; idx < count; idx++) {
async_trans_t trans = trans_list_.at(idx);
std::cout << std::endl;
std::cout << " Transaction Id: " << idx << std::endl;
std::cout << " Transaction Type: " << trans.req_type_ << std::endl;
if ((trans.req_type_ == REQ_READ) || (trans.req_type_ == REQ_WRITE)) {
std::cout << "Rocm Kernel used by Transaction: " << trans.kernel.code_ << std::endl;
std::cout << "Rocm Memory Pool Used by Kernel: " << trans.kernel.pool_idx_ << std::endl;
std::cout << " Rocm Agent used for Execution: " << trans.kernel.agent_idx_ << std::endl;
}
if ((trans.req_type_ == REQ_COPY_BIDIR) || (trans.req_type_ == REQ_COPY_UNIDIR)) {
std::cout << " Src Memory Pool used in Copy: " << trans.copy.src_idx_ << std::endl;
std::cout << " Dst Memory Pool used in Copy: " << trans.copy.dst_idx_ << std::endl;
}
}
std::cout << std::endl;
}
// @brief: Prints error message when a request to copy between
// source pool and destination pool is not possible
void RocmAsync::PrintCopyAccessError(uint32_t src_idx, uint32_t dst_idx) {
// Retrieve Roc runtime handles for Src memory pool and agents
uint32_t src_dev_idx = pool_list_[src_idx].agent_index_;
hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_;
// Retrieve Roc runtime handles for Dst memory pool and agents
uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_;
hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_;
std::cout << std::endl;
std::cout << "Index of Src Pool: " << src_idx << std::endl;
std::cout << "Index of Dst Pool: " << dst_idx << std::endl;
std::cout << "Index of Src Pool's Agent: " << src_dev_idx << std::endl;
std::cout << "Index of Dst Pool's Agent: " << dst_dev_idx << std::endl;
std::cout << "Device Type of Src Pool's Agent: " << src_dev_type << std::endl;
std::cout << "Device Type of Dst Pool's Agent: " << dst_dev_type << std::endl;
std::cout << "Rocm Agent hosting Src Pool cannot ACCESS Dst Pool" << std::endl;
std::cout << std::endl;
}
// @brief: Prints error message when a request to read / write from
// a pool by an agent is not possible
void RocmAsync::PrintIOAccessError(uint32_t exec_idx, uint32_t pool_idx) {
// Retrieve device type of executing agent
hsa_device_type_t exec_dev_type = agent_list_[exec_idx].device_type_;
// Retrieve device type of memory pool's agent
uint32_t pool_dev_idx = pool_list_[pool_idx].agent_index_;
hsa_device_type_t pool_dev_type = agent_list_[pool_dev_idx].device_type_;
std::cout << std::endl;
std::cout << "Index of Executing Agent: " << exec_idx << std::endl;
std::cout << "Device Type of Executing Agent: " << exec_dev_type << std::endl;
std::cout << "Index of Buffer's Memory Pool: " << pool_idx << std::endl;
std::cout << "Index of Buffer Memory Pool's Agent: " << pool_dev_idx << std::endl;
std::cout << "Device Type of Buffer Memory Pool's Agent: " << pool_dev_type << std::endl;
std::cout << "Rocm Agent executing Read / Write request cannot ACCESS Buffer's Memory Pool" << std::endl;
std::cout << std::endl;
}
+167
Просмотреть файл
@@ -0,0 +1,167 @@
#include "common.hpp"
#include "rocm_async.hpp"
#include <iomanip>
#include <sstream>
#include <algorithm>
// Prints one row of the results table on std::cout. Every column
// is 15 characters wide and numbers use a precision of 3.
//
// @param size           Buffer size, printed with the suffix " MB"
// @param avg_time       Average time; printed multiplied by 1e6
// @param bandwidth      Average bandwidth, printed as given
// @param min_time       Minimum time; printed multiplied by 1e6
// @param peak_bandwidth Peak bandwidth, printed as given
static void printRecord(uint32_t size, double avg_time,
                        double bandwidth, double min_time,
                        double peak_bandwidth) {

  const int column_width = 15;
  const std::string size_label = std::to_string(size) + " MB";

  // Precision is sticky, width has to be given for each column
  std::cout.precision(3);
  std::cout << std::setw(column_width) << size_label
            << std::setw(column_width) << (avg_time * 1e6)
            << std::setw(column_width) << bandwidth
            << std::setw(column_width) << (min_time * 1e6)
            << std::setw(column_width) << peak_bandwidth
            << std::endl;
}
// Prints the banner that precedes the results of a copy
// transaction, followed by the header row of the results table.
// Also switches std::cout to left alignment, which stays in
// effect after the call.
//
// @param src_pool_id    Index of the source pool
// @param src_agent_type Device type of the source pool's agent;
//                       0 is printed as "Cpu", anything else as "Gpu"
// @param dst_pool_id    Index of the destination pool
// @param dst_agent_type Device type of the destination pool's agent;
//                       0 is printed as "Cpu", anything else as "Gpu"
static void printCopyBanner(uint32_t src_pool_id, uint32_t src_agent_type,
                            uint32_t dst_pool_id, uint32_t dst_agent_type) {

  const char* src_kind = (src_agent_type == 0) ? "Cpu" : "Gpu";
  const char* dst_kind = (dst_agent_type == 0) ? "Cpu" : "Gpu";

  // Title line
  std::cout << std::endl;
  std::cout << "================" << " Benchmark Result"
            << " ================" << std::endl;

  // Source line
  std::cout << "================"
            << " Src Pool Id: " << src_pool_id
            << " Src Agent Type: " << src_kind
            << " ================" << std::endl;

  // Destination line
  std::cout << "================"
            << " Dst Pool Id: " << dst_pool_id
            << " Dst Agent Type: " << dst_kind
            << " ================" << std::endl;
  std::cout << std::endl;

  // Header row of the table, columns are left aligned
  const int column_width = 15;
  std::cout.setf(std::ios::left);
  std::cout << std::setw(column_width) << "Data Size"
            << std::setw(column_width) << "Avg Time(us)"
            << std::setw(column_width) << "Avg BW(GB/s)"
            << std::setw(column_width) << "Min Time(us)"
            << std::setw(column_width) << "Peak BW(GB/s)"
            << std::endl;
}
// @brief: Get the min copy time
//
// Sorts vec in place in ascending order (the caller sees the
// reordered list) and returns its first, i.e. smallest, element
double RocmAsync::GetMinTime(std::vector<double>& vec) {
  std::sort(vec.begin(), vec.end());
  const double smallest = vec.at(0);
  return smallest;
}
// @brief: Get the mean copy time
//
// Works on vec in place, the caller sees the trimmed list:
//
//    Sorts the samples in ascending order
//    Drops the smallest sample
//    Drops a further (num_iteration_ * 0.1) smallest samples
//    Keeps at most num_iteration_ of the samples that remain
//
// and returns the arithmetic mean of what is left. Every trimming
// step is clamped to the number of samples actually present, so a
// list shorter than expected no longer erases past its end.
//
// @return Mean of the retained samples, 0.0 if none remain
double RocmAsync::GetMeanTime(std::vector<double>& vec) {

  // Nothing to average
  if (vec.empty()) {
    return 0.0;
  }

  std::sort(vec.begin(), vec.end());

  // Drop the smallest sample
  vec.erase(vec.begin());

  // Drop the next ten percent, but never more than what is present
  const size_t drop_want = static_cast<size_t>(num_iteration_ * 0.1);
  const size_t drop_count = std::min(drop_want, vec.size());
  vec.erase(vec.begin(), vec.begin() + drop_count);

  // Drop whatever exceeds the number of iterations
  const size_t keep_count = static_cast<size_t>(num_iteration_);
  if (vec.size() > keep_count) {
    vec.erase(vec.begin() + keep_count, vec.end());
  }

  // Trimming may have consumed every sample
  if (vec.empty()) {
    return 0.0;
  }

  double mean = 0.0;
  const size_t num = vec.size();
  for (size_t it = 0; it < num; it++) {
    mean += vec[it];
  }
  mean /= num;
  return mean;
}
void RocmAsync::Display() const {
// Iterate through list of transactions and display its timing data
uint32_t trans_size = trans_list_.size();
if (trans_size == 0) {
std::cout << std::endl;
std::cout << " One or more of the requests wered filtered out " << std::endl;
std::cout << " i.e. No Valid Requests were Made or Remain" << std::endl;
std::cout << std::endl;
return;
}
for (uint32_t idx = 0; idx < trans_size; idx++) {
async_trans_t trans = trans_list_[idx];
if ((trans.req_type_ == REQ_COPY_BIDIR) ||
(trans.req_type_ == REQ_COPY_UNIDIR)) {
DisplayCopyTime(trans);
}
if ((trans.req_type_ == REQ_READ) ||
(trans.req_type_ == REQ_WRITE)) {
DisplayIOTime(trans);
}
}
std::cout << std::endl;
}
// @brief: Display timing data of a read / write transaction
//
// Placeholder, the body is empty and nothing is printed
void RocmAsync::DisplayIOTime(async_trans_t& trans) const {

}
// @brief: Display timing data of a copy transaction
//
// Prints a banner identifying the source and destination pools
// followed by one row per entry of size_list_ with the average
// and minimum time and the bandwidth derived from each of them.
//
// Times of transactions that do not use a Gpu are taken from the
// cpu_* lists as they are; times of transactions that use a Gpu
// are taken from the gpu_* lists and divided by the frequency
// reported for HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY.
//
// @param trans Copy transaction to report; it is only read
void RocmAsync::DisplayCopyTime(async_trans_t& trans) const {

  // Get the frequency of Gpu Timestamping
  uint64_t sys_freq = 0;
  hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sys_freq);

  // Print Benchmark Header
  uint32_t src_idx = trans.copy.src_idx_;
  uint32_t dst_idx = trans.copy.dst_idx_;
  uint32_t src_dev_idx = pool_list_[src_idx].agent_index_;
  hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_;
  uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_;
  hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_;
  printCopyBanner(src_idx, src_dev_type, dst_idx, dst_dev_type);

  uint32_t size_len = size_list_.size();
  for (uint32_t idx = 0; idx < size_len; idx++) {

    // Number of bytes moved. Sizes are given in Megabytes and a
    // bidirectional copy moves twice the data. The count is kept
    // in 64 bits as it does not fit in 32 bits once the size
    // reaches 4096 MB (2048 MB for a bidirectional copy)
    uint64_t data_size = size_list_[idx];
    if (trans.copy.bidir_ == true) {
      data_size += size_list_[idx];
    }
    data_size = data_size * 1024 * 1024;

    double avg_time = 0;
    double min_time = 0;
    if (trans.copy.uses_gpu_ != true) {
      // Copy operation does not involve a Gpu device
      avg_time = trans.cpu_avg_time_[idx];
      min_time = trans.cpu_min_time_[idx];
    } else {
      // Convert Gpu timestamp ticks using the timestamp frequency
      avg_time = trans.gpu_avg_time_[idx] / sys_freq;
      min_time = trans.gpu_min_time_[idx] / sys_freq;
    }

    // Bytes per unit of time scaled down by 10^9
    double bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000;
    double peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000;

    printRecord(size_list_[idx], avg_time, bandwidth, min_time, peak_bandwidth);
  }
}
+114
Просмотреть файл
@@ -0,0 +1,114 @@
#include "common.hpp"
#include "rocm_async.hpp"
// @brief: Callback invoked for each memory pool of an agent while
// the topology is being discovered. Records the properties of the
// pool in the RocmAsync object passed in via data.
//
// Pools that are not in the global segment, or that do not allow
// allocation by user, are skipped. Always returns
// HSA_STATUS_SUCCESS; failures of the queries are handed to
// ErrorCheck.
hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) {

  RocmAsync* driver = reinterpret_cast<RocmAsync*>(data);
  hsa_status_t rc;

  // Report only pools from global segment
  hsa_amd_segment_t pool_segment;
  rc = hsa_amd_memory_pool_get_info(pool,
                   HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &pool_segment);
  ErrorCheck(rc);
  if (pool_segment != HSA_AMD_SEGMENT_GLOBAL) {
    return HSA_STATUS_SUCCESS;
  }

  // Report only pools that allow an allocation by user
  bool can_alloc = false;
  rc = hsa_amd_memory_pool_get_info(pool,
                   HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &can_alloc);
  ErrorCheck(rc);
  if (!can_alloc) {
    return HSA_STATUS_SUCCESS;
  }

  // Size reported for the pool
  size_t pool_size = 0;
  rc = hsa_amd_memory_pool_get_info(pool,
                   HSA_AMD_MEMORY_POOL_INFO_SIZE, &pool_size);
  ErrorCheck(rc);

  // Is the pool accessible to all agents
  bool open_to_all = false;
  rc = hsa_amd_memory_pool_get_info(pool,
                   HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &open_to_all);
  ErrorCheck(rc);

  // Type of access of the owning agent. The owner is the agent
  // most recently added to the list of agents
  hsa_amd_memory_pool_access_t owner_rights;
  hsa_agent_t owner = driver->agent_list_.back().agent_;
  rc = hsa_amd_agent_memory_pool_get_info(owner, pool,
                   HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &owner_rights);
  ErrorCheck(rc);

  // Global flags tell if pool is kernarg and / or fine-grained
  uint32_t global_flags = 0;
  rc = hsa_amd_memory_pool_get_info(pool,
                   HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
  ErrorCheck(rc);
  const bool kernarg = (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0;
  const bool fine_grained = (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) != 0;

  // A kernarg pool is remembered as the system memory pool
  if (kernarg) {
    driver->sys_pool_ = pool;
  }

  // Record the pool in the list of all pools as well
  // as in the list of pools of the agent that owns it
  pool_info_t record(owner, driver->agent_index_, pool,
                     pool_segment, pool_size, driver->pool_index_,
                     fine_grained, kernarg,
                     open_to_all, owner_rights);
  driver->pool_list_.push_back(record);
  driver->agent_pool_list_[driver->agent_index_].pool_list.push_back(record);

  // Next pool gets the next index
  driver->pool_index_++;
  return HSA_STATUS_SUCCESS;
}
// @brief: Callback invoked for each agent of the system while the
// topology is being discovered. Records the agent in the RocmAsync
// object passed in via data and then iterates the memory pools of
// the agent so that MemPoolInfo can record them.
//
// Always returns HSA_STATUS_SUCCESS; failures of the runtime calls
// made here are handed to ErrorCheck.
hsa_status_t AgentInfo(hsa_agent_t agent, void* data) {

  RocmAsync* asyncDrvr = reinterpret_cast<RocmAsync*>(data);

  // Get the name of the agent. The name is queried
  // but is not used any further
  char agent_name[64];
  hsa_status_t status;
  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_name);
  ErrorCheck(status);

  // Get device type
  hsa_device_type_t device_type;
  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
  ErrorCheck(status);

  // Capture the handle of Cpu agent
  if (device_type == HSA_DEVICE_TYPE_CPU) {
    asyncDrvr->cpu_agent_ = agent;
  }

  asyncDrvr->agent_list_.push_back(agent_info(agent, asyncDrvr->agent_index_, device_type));

  // Construct a new agent_pool_info structure and add it to the list.
  // This has to happen before the pools are iterated as MemPoolInfo
  // adds pools to the entry at agent_index_
  agent_pool_info node;
  node.agent = asyncDrvr->agent_list_.back();
  asyncDrvr->agent_pool_list_.push_back(node);

  // Walk the pools of this agent. The outcome was previously
  // ignored, check it like every other runtime call
  status = hsa_amd_agent_iterate_memory_pools(agent, MemPoolInfo, asyncDrvr);
  ErrorCheck(status);

  // Next agent gets the next index
  asyncDrvr->agent_index_++;
  return HSA_STATUS_SUCCESS;
}
void RocmAsync::DiscoverTopology() {
err_ = hsa_iterate_agents(AgentInfo, this);
}
+173
Просмотреть файл
@@ -0,0 +1,173 @@
#include "common.hpp"
#include "rocm_async.hpp"
// @brief: Builds one transaction per (pool, agent) tuple of in_list
//
// in_list holds consecutive tuples: index of a memory pool followed
// by index of the agent that executes the request. The list is
// expected to have an even number of entries.
//
// @param req_type Request type to stamp on the transactions
// @param in_list  List of pool index, agent index tuples
//
// @return false, after printing an error, as soon as an agent is
//         found that is never allowed to access its pool;
//         true otherwise
bool RocmAsync::BuildReadOrWriteTrans(uint32_t req_type,
                                      vector<uint32_t>& in_list) {

  const uint32_t entry_count = in_list.size();
  for (uint32_t pos = 0; pos < entry_count; pos += 2) {

    // First element of tuple is pool, second is agent
    const uint32_t pool_idx = in_list[pos];
    const uint32_t exec_idx = in_list[pos + 1];

    // Roc runtime handles of memory pool and agent
    hsa_amd_memory_pool_t pool = pool_list_[pool_idx].pool_;
    hsa_agent_t exec_agent = agent_list_[exec_idx].agent_;

    // Query the type of access agent has to the pool
    hsa_amd_memory_pool_access_t access;
    hsa_status_t status = hsa_amd_agent_memory_pool_get_info(exec_agent, pool,
                              HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
    ErrorCheck(status);

    // Give up if access is denied
    if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
      PrintIOAccessError(exec_idx, pool_idx);
      return false;
    }

    // Access is possible, queue up a transaction
    async_trans_t trans(req_type);
    trans.kernel.code_ = nullptr;
    trans.kernel.agent_ = exec_agent;
    trans.kernel.agent_idx_ = exec_idx;
    trans.kernel.pool_ = pool;
    trans.kernel.pool_idx_ = pool_idx;
    trans_list_.push_back(trans);
  }
  return true;
}
// @brief: Builds the transactions for the read requests of user.
// Returns the outcome of BuildReadOrWriteTrans
bool RocmAsync::BuildReadTrans() {
  const bool built = BuildReadOrWriteTrans(REQ_READ, read_list_);
  return built;
}
// @brief: Builds the transactions for the write requests of user.
// Returns the outcome of BuildReadOrWriteTrans
bool RocmAsync::BuildWriteTrans() {
  const bool built = BuildReadOrWriteTrans(REQ_WRITE, write_list_);
  return built;
}
// @brief: Builds one copy transaction for each combination of a
// source pool from src_list and a destination pool from dst_list
//
// Combinations are silently left out when both pools belong to
// Cpu agents or when source and destination are the same pool.
//
// @param req_type Request type to stamp on the transactions, a
//                 value of REQ_COPY_BIDIR marks them bidirectional
// @param src_list Indices of source pools
// @param dst_list Indices of destination pools
//
// @return false, after printing an error, as soon as a combination
//         is found where the agent owning one pool is never allowed
//         to access the other pool; true otherwise
bool RocmAsync::BuildCopyTrans(uint32_t req_type,
                               vector<uint32_t>& src_list,
                               vector<uint32_t>& dst_list) {

  uint32_t src_size = src_list.size();
  uint32_t dst_size = dst_list.size();

  hsa_status_t status;
  hsa_amd_memory_pool_access_t access;
  for (uint32_t idx = 0; idx < src_size; idx++) {

    // Retrieve Roc runtime handles for Src memory pool and agents
    uint32_t src_idx = src_list[idx];
    hsa_agent_t src_agent = pool_list_[src_idx].owner_agent_;
    hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_;
    uint32_t src_dev_idx = pool_list_[src_idx].agent_index_;
    hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_;

    for (uint32_t jdx = 0; jdx < dst_size; jdx++) {

      // Retrieve Roc runtime handles for Dst memory pool and agents
      uint32_t dst_idx = dst_list[jdx];
      hsa_agent_t dst_agent = pool_list_[dst_idx].owner_agent_;
      hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_;
      uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_;
      hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_;

      // Filter out transaction when Src & Dst pools belong to Cpu
      if ((src_dev_type == HSA_DEVICE_TYPE_CPU) &&
          (dst_dev_type == HSA_DEVICE_TYPE_CPU)) {
        continue;
      }

      // Filter out transaction with same Src & Dst pools
      if (src_idx == dst_idx) {
        continue;
      }

      // Determine if accessibility to src pool for dst agent is not denied
      status = hsa_amd_agent_memory_pool_get_info(dst_agent, src_pool,
                                HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
      ErrorCheck(status);
      if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
        PrintCopyAccessError(src_idx, dst_idx);
        return false;
      }

      // Determine if accessibility to dst pool for src agent is not denied.
      // Report the pair that failed, as is done for the check above,
      // so that the user is not left with just the help screen
      status = hsa_amd_agent_memory_pool_get_info(src_agent, dst_pool,
                                HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
      ErrorCheck(status);
      if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
        PrintCopyAccessError(src_idx, dst_idx);
        return false;
      }

      // Agents have access, build an instance of transaction
      // and add it to the list of transactions
      async_trans_t trans(req_type);
      trans.copy.src_idx_ = src_idx;
      trans.copy.dst_idx_ = dst_idx;
      trans.copy.src_pool_ = src_pool;
      trans.copy.dst_pool_ = dst_pool;
      trans.copy.bidir_ = (req_type == REQ_COPY_BIDIR);
      trans.copy.uses_gpu_ = ((src_dev_type == HSA_DEVICE_TYPE_GPU) ||
                              (dst_dev_type == HSA_DEVICE_TYPE_GPU));
      trans_list_.push_back(trans);
    }
  }
  return true;
}
// @brief: Builds the transactions for bidirectional copy requests.
// The same list of pools serves as both source and destination
bool RocmAsync::BuildBidirCopyTrans() {
  const bool built = BuildCopyTrans(REQ_COPY_BIDIR, bidir_list_, bidir_list_);
  return built;
}
// @brief: Builds the transactions for unidirectional copy requests
// from the lists of source and destination pools
bool RocmAsync::BuildUnidirCopyTrans() {
  const bool built = BuildCopyTrans(REQ_COPY_UNIDIR, src_list_, dst_list_);
  return built;
}
// @brief: Builds the list of transactions per user request
//
// Handles, in this order, read, write, bidirectional copy and
// unidirectional copy requests. Kinds of request that the user
// has not made are skipped.
//
// @return false as soon as transactions for one kind of request
//         cannot be built, the remaining kinds are not attempted;
//         true otherwise
bool RocmAsync::BuildTransList() {

  // Read transactions
  if ((req_read_ == REQ_READ) && !BuildReadTrans()) {
    return false;
  }

  // Write transactions
  if ((req_write_ == REQ_WRITE) && !BuildWriteTrans()) {
    return false;
  }

  // Bidirectional Copy transactions
  if ((req_copy_bidir_ == REQ_COPY_BIDIR) && !BuildBidirCopyTrans()) {
    return false;
  }

  // Unidirectional Copy transactions
  if ((req_copy_unidir_ == REQ_COPY_UNIDIR) && !BuildUnidirCopyTrans()) {
    return false;
  }

  // Every requested kind of transaction has been built
  return true;
}
+155
Просмотреть файл
@@ -0,0 +1,155 @@
#include "common.hpp"
#include "rocm_async.hpp"
#include <assert.h>
#include <algorithm>
#include <unistd.h>
#include <cctype>
#include <sstream>
// @brief: Determines whether every entry of the input list equals the
// index_ of some element of pool_list_.
//
// @param in_list: pool indices to look up
//
// @return: true if each entry has a matching pool (trivially true for
// an empty list), false as soon as one entry has no match
bool RocmAsync::PoolIsPresent(vector<uint32_t>& in_list) {
  const uint32_t num_entries = in_list.size();
  const uint32_t num_pools = pool_list_.size();

  for (uint32_t entry = 0; entry < num_entries; ++entry) {
    // Walk the pools until one carries the requested index
    uint32_t pool = 0;
    while ((pool < num_pools) &&
           !(in_list[entry] == pool_list_[pool].index_)) {
      ++pool;
    }

    // Walked past the last pool: the requested index is unknown
    if (pool == num_pools) {
      return false;
    }
  }

  return true;
}
// @brief: Checks the input list for repeated entries.
//
// NOTE: the return value is inverted with respect to the name. The
// function returns TRUE when NO entry is duplicated and FALSE when at
// least one value appears more than once. ValidateCopyReq() depends
// on this convention.
//
// @param in_list: pool indices to examine
//
// @return: true if all entries are distinct (also for an empty or
// single element list), false if any value occurs twice
bool RocmAsync::PoolIsDuplicated(vector<uint32_t>& in_list) {
  const uint32_t num_entries = in_list.size();

  // Equality is symmetric, so comparing each entry only with the
  // entries that follow it covers every pair of distinct positions
  for (uint32_t first = 0; first < num_entries; ++first) {
    for (uint32_t second = first + 1; second < num_entries; ++second) {
      if (in_list[first] == in_list[second]) {
        return false;
      }
    }
  }

  return true;
}
// @brief: Validates a Read or Write request. The request is a flat
// list of (memory pool, agent) tuples: the element at an even position
// identifies a memory pool and the element following it identifies
// an agent.
//
// @param in_list: flattened list of pool-agent tuples
//
// @return: false if the list has an odd number of elements or if any
// pool / agent index is not below pool_index_ / agent_index_
// respectively, true otherwise (including an empty list)
bool RocmAsync::ValidateReadOrWriteReq(vector<uint32_t>& in_list) {
  const uint32_t num_elems = in_list.size();

  // An odd length means the last tuple is missing its agent
  if ((num_elems % 2) != 0) {
    return false;
  }

  // Examine one pool-agent tuple per iteration
  uint32_t pos = 0;
  while (pos < num_elems) {
    const uint32_t pool_idx = in_list[pos];
    const uint32_t exec_idx = in_list[pos + 1];

    // NOTE(review): pool_index_ and agent_index_ are presumably the
    // number of pools and agents found in the system - confirm
    const bool pool_known = (pool_idx < pool_index_);
    const bool exec_known = (exec_idx < agent_index_);
    if (!(pool_known && exec_known)) {
      return false;
    }

    pos += 2;
  }

  return true;
}
// @brief: Validates the pool-agent tuples held in read_list_.
//
// @return: result of ValidateReadOrWriteReq() applied to read_list_
bool RocmAsync::ValidateReadReq() {
  const bool valid = ValidateReadOrWriteReq(read_list_);
  return valid;
}
// @brief: Validates the pool-agent tuples held in write_list_.
//
// @return: result of ValidateReadOrWriteReq() applied to write_list_
bool RocmAsync::ValidateWriteReq() {
  const bool valid = ValidateReadOrWriteReq(write_list_);
  return valid;
}
// @brief: Validates a list of pools given for a copy request. The
// list is accepted when it is no longer than pool_list_, names no
// pool more than once and names only pools found in pool_list_.
//
// @param in_list: pool indices that make up the request
//
// @return: true if all three checks pass, false otherwise
bool RocmAsync::ValidateCopyReq(vector<uint32_t>& in_list) {
  const uint32_t num_entries = in_list.size();
  const uint32_t num_pools = pool_list_.size();

  // A list longer than the list of pools cannot be valid
  if (num_entries > num_pools) {
    return false;
  }

  // PoolIsDuplicated() yields true when the list has NO duplicates;
  // only then is each entry checked for presence in the system
  return PoolIsDuplicated(in_list) && PoolIsPresent(in_list);
}
// @brief: Validates the pool list of a bidirectional copy request.
//
// @return: result of ValidateCopyReq() applied to bidir_list_
bool RocmAsync::ValidateBidirCopyReq() {
  const bool valid = ValidateCopyReq(bidir_list_);
  return valid;
}
// @brief: Validates the pool lists of a unidirectional copy request.
// The destination list is examined only if the source list is valid.
//
// @return: true if both src_list_ and dst_list_ pass ValidateCopyReq()
bool RocmAsync::ValidateUnidirCopyReq() {
  if (!ValidateCopyReq(src_list_)) {
    return false;
  }
  return ValidateCopyReq(dst_list_);
}
// @brief: Validates the inputs of every kind of operation that has
// been requested. The kinds are checked in a fixed order: Read, Write,
// Bidirectional Copy, Unidirectional Copy. A kind that was not
// requested is skipped.
//
// @return: false as soon as the inputs of a requested kind turn out
// to be invalid, true otherwise (including the case where nothing
// was requested)
bool RocmAsync::ValidateArguments() {
  // READ operation and its inputs
  if ((req_read_ == REQ_READ) && !ValidateReadReq()) {
    return false;
  }

  // WRITE operation and its inputs
  if ((req_write_ == REQ_WRITE) && !ValidateWriteReq()) {
    return false;
  }

  // Bidirectional Copy operation and its inputs
  if ((req_copy_bidir_ == REQ_COPY_BIDIR) && !ValidateBidirCopyReq()) {
    return false;
  }

  // Unidirectional Copy operation and its inputs
  if ((req_copy_unidir_ == REQ_COPY_UNIDIR) && !ValidateUnidirCopyReq()) {
    return false;
  }

  // Every requested operation is well formed
  return true;
}