Added dispatch time, async copy and test template rocrtst tests

Change-Id: I57a844ee65c36bd61616ee6d60d358303f51db56


[ROCm/ROCR-Runtime commit: a12c5628ea]
Этот коммит содержится в:
Chris Freehill
2017-06-28 10:54:57 -05:00
родитель 4648eb66fd
Коммит bcd0bd4e38
73 изменённых файлов: 1592 добавлений и 9290 удалений
+2 -3
Просмотреть файл
@@ -50,11 +50,9 @@
namespace rocrtst {
BaseRocR::BaseRocR(void) {
num_iteration_ = 100;
signal_.handle = 0;
num_iteration_ = 1;
cpu_device_.handle = -1;
gpu_device1_.handle = -1;
region_.handle = 0;
device_pool_.handle = 0;
kern_arg_pool_.handle = 0;
main_queue_ = nullptr;
@@ -66,6 +64,7 @@ BaseRocR::BaseRocR(void) {
orig_hsa_enable_interrupt_ = GetEnv("HSA_ENABLE_INTERRUPT");
set_kernel_file_name("");
set_verbosity(0);
set_title("unset_title");
}
BaseRocR::~BaseRocR() {
Обычный файл → Исполняемый файл
-17
Просмотреть файл
@@ -105,13 +105,6 @@ class BaseRocR {
return kernel_object_;
}
void set_signal(hsa_signal_t sig) {
signal_.handle = sig.handle;
}
const hsa_signal_t& signal(void) const {
return signal_;
}
void set_profile(hsa_profile_t in_prof) {
profile_ = in_prof;
}
@@ -151,10 +144,6 @@ class BaseRocR {
return aql_;
}
hsa_region_t& region(void) {
return region_;
}
void set_num_iteration(int num) {
num_iteration_ = num;
}
@@ -237,16 +226,12 @@ class BaseRocR {
private:
uint64_t num_iteration_; ///< Number of times to execute test
hsa_signal_t signal_; ///< Completion signal used for kernel execution
hsa_queue_t* main_queue_; ///< AQL queue used for packets
hsa_agent_t gpu_device1_; ///< Handle to first GPU found
hsa_agent_t cpu_device_; ///< Handle to CPU
hsa_region_t region_; ///< TODO(cfreehil): delete this
hsa_amd_memory_pool_t device_pool_; ///< Memory pool on gpu pool list
hsa_amd_memory_pool_t cpu_pool_; ///< Memory pool on cpu pool list
@@ -255,8 +240,6 @@ class BaseRocR {
uint64_t kernel_object_; ///< Handle to kernel code
std::string brig_file_; // TODO(cfreehil): delete this
std::string kernel_file_name_; ///< Code object file name
std::string kernel_name_; ///< Kernel name
+58 -64
Просмотреть файл
@@ -70,6 +70,8 @@ namespace rocrtst {
} \
}
// Clean up some of the common handles and memory used by BaseRocR code, then
// shut down hsa. Restore HSA_ENABLE_INTERRUPT to original value, if necessary
hsa_status_t CommonCleanUp(BaseRocR* test) {
hsa_status_t err;
@@ -87,13 +89,9 @@ hsa_status_t CommonCleanUp(BaseRocR* test) {
test->set_main_queue(nullptr);
}
if (0 != test->signal().handle) {
hsa_signal_t sig;
sig.handle = 0;
err = hsa_signal_destroy(test->signal());
if (test->aql().completion_signal.handle != 0) {
err = hsa_signal_destroy(test->aql().completion_signal);
RET_IF_HSA_UTILS_ERR(err);
test->set_signal(sig);
}
err = hsa_shut_down();
@@ -122,7 +120,7 @@ static const char* PROFILE_STR[] = {"HSA_PROFILE_BASE", "HSA_PROFILE_FULL", };
/// \returns bool
/// - true Machine meets test requirements
/// - false Machine does not meet test requirements
static bool CheckProfileAndInform(BaseRocR* test) {
bool CheckProfileAndInform(BaseRocR* test) {
if (test->verbosity() > 0) {
std::cout << "Target HW Profile is "
<< PROFILE_STR[test->profile()] << std::endl;
@@ -162,6 +160,10 @@ static hsa_status_t ProcessIterateError(hsa_status_t err) {
return err;
}
// Find pools for cpu, gpu and for kernel arguments. These pools have
// common basic requirements, but are not suitable for all cases. When they
// are not suitable, set cpu_pool(), device_pool() and/or kern_arg_pool()
// yourself instead of using this function.
hsa_status_t SetPoolsTypical(BaseRocR* test) {
hsa_status_t err;
@@ -180,11 +182,9 @@ hsa_status_t SetPoolsTypical(BaseRocR* test) {
return HSA_STATUS_SUCCESS;
}
// Enable interrupts if necessary, and call hsa_init()
hsa_status_t InitAndSetupHSA(BaseRocR* test) {
hsa_agent_t gpu_device1;
hsa_agent_t cpu_device;
hsa_status_t err;
hsa_signal_t sig;
if (test->enable_interrupt()) {
SetEnv("HSA_ENABLE_INTERRUPT", "1");
@@ -193,6 +193,15 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) {
err = hsa_init();
RET_IF_HSA_UTILS_ERR(err);
return HSA_STATUS_SUCCESS;
}
// Attempt to find and set test->cpu_device and test->gpu_device1
hsa_status_t SetDefaultAgents(BaseRocR* test) {
hsa_agent_t gpu_device1;
hsa_agent_t cpu_device;
hsa_status_t err;
gpu_device1.handle = 0;
err = hsa_iterate_agents(FindGPUDevice, &gpu_device1);
RET_IF_HSA_UTILS_ERR(rocrtst::ProcessIterateError(err));
@@ -217,7 +226,7 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) {
char name[64] = {0};
err = hsa_agent_get_info(gpu_device1, HSA_AGENT_INFO_NAME, name);
RET_IF_HSA_UTILS_ERR(err);
std::cout << "The device name is " << name << std::endl;
std::cout << "The gpu device name is " << name << std::endl;
}
hsa_profile_t profile;
@@ -228,14 +237,11 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) {
if (!CheckProfileAndInform(test)) {
return HSA_STATUS_ERROR;
}
err = hsa_signal_create(1, 0, NULL, &sig);
RET_IF_HSA_UTILS_ERR(err);
test->set_signal(sig);
return HSA_STATUS_SUCCESS;
}
// See if the profile of the target matches any required profile by the
// test program.
bool CheckProfile(BaseRocR const* test) {
if (test->requires_profile() == -1) {
return true;
@@ -243,6 +249,19 @@ bool CheckProfile(BaseRocR const* test) {
return (test->requires_profile() == test->profile());
}
}
// Load the specified kernel code from the specified file, inspect and fill
// in BaseRocR member variables related to the kernel and executable.
// Required Input BaseRocR member variables:
// - gpu_device1()
// - kernel_file_name()
// - kernel_name()
//
// Written BaseRocR member variables:
// -kernel_object()
// -private_segment_size()
// -group_segment_size()
// -kernarg_size()
// -kernarg_align()
hsa_status_t LoadKernelFromObjFile(BaseRocR* test) {
hsa_status_t err;
hsa_code_object_reader_t code_obj_rdr = {0};
@@ -334,13 +353,16 @@ hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue,
return HSA_STATUS_SUCCESS;
}
void InitializeAQLPacket(const BaseRocR* test,
// Initialize the provided aql packet with standard default values, and
// values from provided BaseRocR object.
hsa_status_t InitializeAQLPacket(const BaseRocR* test,
hsa_kernel_dispatch_packet_t* aql) {
hsa_status_t err;
assert(aql != nullptr);
if (aql == nullptr) {
return;
return HSA_STATUS_ERROR;
}
aql->header = 0; // Set this right before doorbell ring
@@ -361,19 +383,25 @@ void InitializeAQLPacket(const BaseRocR* test,
// Pin kernel code and the kernel argument buffer to the aql packet->
aql->kernel_object = test->kernel_object();
aql->kernarg_address = NULL;
aql->completion_signal.handle = test->signal().handle;
// aql->kernarg_address may be filled in by AllocAndSetKernArgs() if it is
// called before this function, so we don't want to overwrite it; therefore
// we ignore it in this function.
return;
err = hsa_signal_create(1, 0, NULL, &aql->completion_signal);
return err;
}
void WriteAQLToQueue(BaseRocR* test) {
// Copy BaseRocR aql object values to the BaseRocR object queue in the
// specified queue position (ind)
hsa_kernel_dispatch_packet_t * WriteAQLToQueue(BaseRocR* test, uint64_t *ind) {
assert(test);
assert(test->main_queue());
void *queue_base = test->main_queue()->base_address;
const uint32_t queue_mask = test->main_queue()->size - 1;
uint64_t que_idx = hsa_queue_add_write_index_relaxed(test->main_queue(), 1);
*ind = que_idx;
hsa_kernel_dispatch_packet_t* staging_aql_packet = &test->aql();
hsa_kernel_dispatch_packet_t* queue_aql_packet;
@@ -395,8 +423,12 @@ void WriteAQLToQueue(BaseRocR* test) {
queue_aql_packet->kernel_object = staging_aql_packet->kernel_object;
queue_aql_packet->kernarg_address = staging_aql_packet->kernarg_address;
queue_aql_packet->completion_signal = staging_aql_packet->completion_signal;
return queue_aql_packet;
}
// Allocate a buffer in the kern_arg_pool for the kernel arguments and write
// the arguments to buffer
hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args, size_t arg_size) {
void* kern_arg_buf = nullptr;
hsa_status_t err;
@@ -421,56 +453,18 @@ hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args, size_t arg_size) {
assert(((uintptr_t)adj_kern_arg_buf + arg_size) <
((uintptr_t)kern_arg_buf + buf_size));
err = hsa_memory_copy_workaround_cpu(adj_kern_arg_buf, args, arg_size);
RET_IF_HSA_UTILS_ERR(err);
hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()};
err = hsa_amd_agents_allow_access(2, ag_list, NULL, kern_arg_buf);
RET_IF_HSA_UTILS_ERR(err);
err = hsa_memory_copy(adj_kern_arg_buf, args, arg_size);
RET_IF_HSA_UTILS_ERR(err);
test->aql().kernarg_address = adj_kern_arg_buf;
return HSA_STATUS_SUCCESS;
}
// Allocate len bytes from the given pool and grant the test's gpu_device1()
// and cpu_device() agents access to the new buffer.
//
// test   - test object supplying the two agents that are granted access
// len    - size in bytes of the buffer to allocate
// pool   - memory pool to allocate from
// buffer - receives the address of the allocation on success
//
// Returns the status of the first failing HSA call, or the (successful)
// status of hsa_amd_agents_allow_access() otherwise. If access cannot be
// granted, the allocation is released again and *buffer is reset to nullptr
// so the memory is not leaked and the caller never sees an unusable pointer.
hsa_status_t AllocAndAllowAccess(BaseRocR* test, size_t len,
                                 hsa_amd_memory_pool_t pool, void**buffer) {
  hsa_status_t err;

  err = hsa_amd_memory_pool_allocate(pool, len, 0, buffer);
  RET_IF_HSA_UTILS_ERR(err);

  hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()};
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, *buffer);

  if (err != HSA_STATUS_SUCCESS) {
    // Best-effort release; the allow_access failure is what gets reported.
    (void)hsa_amd_memory_pool_free(*buffer);
    *buffer = nullptr;
  }
  RET_IF_HSA_UTILS_ERR(err);

  return err;
}
// Fill count bytes starting at ptr with value by staging the pattern in a
// temporary buffer from the test's cpu_pool() and copying it to ptr with
// hsa_memory_copy_workaround_gen().
//
// ptr    - destination of the fill
// value  - fill value, passed straight to memset()
// count  - number of bytes to fill
// dst_ag - agent handed to the copy as the destination agent
// src_ag - agent handed to the copy as the source agent
// test   - test object supplying cpu_pool() and the agents granted access
//          to the staging buffer
//
// Returns HSA_STATUS_SUCCESS, or the status of the first failing step. The
// staging buffer is released on every path once it has been allocated.
hsa_status_t hsa_memory_fill_workaround_gen(void* ptr, uint32_t value,
            size_t count, hsa_agent_t dst_ag, hsa_agent_t src_ag, BaseRocR* test) {
  hsa_status_t err;
  void *tmp_mem;

  err = hsa_amd_memory_pool_allocate(test->cpu_pool(), count, 0, &tmp_mem);
  RET_IF_HSA_UTILS_ERR(err);

  hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()};
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, tmp_mem);

  // Only touch the staging buffer if both agents were granted access to it.
  if (err == HSA_STATUS_SUCCESS) {
    (void)memset(tmp_mem, value, count);
    err = hsa_memory_copy_workaround_gen(ptr, tmp_mem, count, dst_ag, src_ag);
  }

  // Free before reporting any error so a failure above cannot leak tmp_mem.
  hsa_amd_memory_pool_free(tmp_mem);
  RET_IF_HSA_UTILS_ERR(err);

  return HSA_STATUS_SUCCESS;
}
#undef RET_IF_HSA_UTILS_ERR
} // namespace rocrtst
+19 -19
Просмотреть файл
@@ -60,14 +60,16 @@ namespace rocrtst {
/// \param[in] test Test for which the kernel will be loaded.
/// \returns HSA_STATUS_SUCCESS if no errors
hsa_status_t LoadKernelFromObjFile(BaseRocR* test);
/// Do initialization tasks for HSA test program. This includes calling
/// hsa_init(), finding and setting the cpu and gpu agent member variables,
/// creating the signal needed for queueing AQL packets and checking
/// HW requirements.
/// Do initialization tasks for HSA test program.
/// \param[in] test Test to initialize
/// \returns HSA_STATUS_SUCCESS if no errors
hsa_status_t InitAndSetupHSA(BaseRocR* test);
/// Find and set the cpu and gpu agent member variables. Also checks that
/// gpu agent meets test requirements (e.g., FULL profile vs. BASE profile).
hsa_status_t SetDefaultAgents(BaseRocR* test);
/// For the provided device agent, create an AQL queue
/// \param[in] device Device for which a queue is to be created
/// \param[out] queue Address to which created queue pointer will be written
@@ -84,16 +86,16 @@ hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue,
/// be drawn.
/// \param[inout] aql Caller provided pointer to aql packet that will be
/// populated
/// \returns void
void InitializeAQLPacket(const BaseRocR* test,
/// \returns Appropriate hsa_status_t
hsa_status_t InitializeAQLPacket(const BaseRocR* test,
hsa_kernel_dispatch_packet_t* aql);
/// This function writes all of the aql packet fields to the queue besides
/// "setup" and "header". This assumes all the aql fields have been set
/// appropriately.
/// \param[in] test Test containing the queue and aql packet to be written.
/// \returns void
void WriteAQLToQueue(BaseRocR* test);
/// \returns Pointer to dispatch packet in queue that was written to
hsa_kernel_dispatch_packet_t* WriteAQLToQueue(BaseRocR* test, uint64_t *ind);
/// This function writes the first 32 bits of an aql packet to the provided
/// aql packet. This function is meant to be called immediately before
@@ -139,6 +141,15 @@ bool CheckProfile(BaseRocR const* test);
hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args,
size_t arg_size);
/// Verify that the machine running the test has the required profile.
/// This function will verify that the execution machine meets any specific
/// test requirement for a profile (HSA_PROFILE_BASE or HSA_PROFILE_FULL).
/// \param[in] test Test that provides profile requirements.
/// \returns bool
/// - true Machine meets test requirements
/// - false Machine does not meet test requirements
bool CheckProfileAndInform(BaseRocR* test);
/// This function will set the cpu and gpu memory pools to the type used in
/// many applications.
/// \param[in] test Test that provides profile requirements.
@@ -146,17 +157,6 @@ hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args,
/// error code otherwise.
hsa_status_t SetPoolsTypical(BaseRocR* test);
/// Allocate memory from a specified pool and grant both standard BaseRocR
/// agents access
/// \param[in] test Test having the agents to which access is granted
/// \param[in] len Size of the memory buffer to allocate
/// \param[in] pool Pool from which to allocate memory
/// \param[out] buffer Address of pointer which will point to newly allocated
/// memory upon return
/// \returns HSA_STATUS_SUCCESS if no errors
hsa_status_t AllocAndAllowAccess(BaseRocR* test, size_t len,
hsa_amd_memory_pool_t pool, void**buffer);
/// Work-around for hsa_amd_memory_fill, which is currently broken.
/// \param[in] ptr Pointer to start of memory location to be filled
/// \param[in] value Value to write to each byte of input buffer
-39
Просмотреть файл
@@ -341,45 +341,6 @@ hsa_status_t DumpPointerInfo(void* ptr) {
return HSA_STATUS_SUCCESS;
}
// Host-side fill: sets count bytes starting at ptr using memset(), which
// stores value converted to unsigned char in each byte. Always reports
// HSA_STATUS_SUCCESS.
hsa_status_t hsa_memory_fill_workaround_cpu(void* ptr, uint32_t value,
                                            size_t count) {
  // memset() only hands back its destination pointer; nothing to check.
  static_cast<void>(memset(ptr, value, count));

  return HSA_STATUS_SUCCESS;
}
// Host-side copy: copies size bytes from src to dst using memcpy().
// Always reports HSA_STATUS_SUCCESS.
hsa_status_t hsa_memory_copy_workaround_cpu(void* dst, const void *src,
                                            size_t size) {
  // memcpy() only hands back its destination pointer; nothing to check.
  static_cast<void>(memcpy(dst, src, size));

  return HSA_STATUS_SUCCESS;
}
// Copy size bytes from src (owned by src_ag) to dst (owned by dst_ag) with
// hsa_amd_memory_async_copy() and block until the copy's completion signal
// drops below 1.
//
// Returns the status of the first failing HSA call, HSA_STATUS_ERROR if the
// wait returns a non-zero signal value, or the status of the final
// hsa_signal_destroy() otherwise. The temporary signal is destroyed on every
// path once it has been created, so a failed copy or wait no longer leaks it.
hsa_status_t hsa_memory_copy_workaround_gen(void* dst, const void *src,
            size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) {
  hsa_signal_t s;
  hsa_status_t err;

  err = hsa_signal_create(1, 0, NULL, &s);
  RET_IF_HSA_COMMON_ERR(err);

  err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s);

  // Only wait on the signal if the copy was actually submitted.
  if (err == HSA_STATUS_SUCCESS &&
      hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1,
                                UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
    err = HSA_STATUS_ERROR;
    std::cout << "Async copy signal error" << std::endl;
  }

  // Destroy the signal before reporting any earlier failure.
  const hsa_status_t destroy_err = hsa_signal_destroy(s);
  RET_IF_HSA_COMMON_ERR(err);
  RET_IF_HSA_COMMON_ERR(destroy_err);

  return destroy_err;
}
/*! \brief Writes to the buffer and increments the write pointer to the
* buffer. Also, ensures that the argument is written to an
-30
Просмотреть файл
@@ -140,35 +140,5 @@ hsa_status_t DumpMemoryPoolInfo(const hsa_amd_memory_pool_t pool,
/// \returns HSA_STATUS_SUCCESS if there are no errors
hsa_status_t DumpPointerInfo(void* ptr);
/// This is a work-around for filling cpu-memory to be used until
/// hsa_amd_memory_fill is fixed. Should only be used for cpu memory.
/// \param[in] ptr Start address of memory to be filled.
/// \param[in] value Value to fill buffer with
/// \param[in] count Size of buffer to fill
/// \returns HSA_STATUS_SUCCESS if there are no errors
hsa_status_t hsa_memory_fill_workaround_cpu(void* ptr, uint32_t value,
size_t count);
/// This is a work-around for copying cpu-memory to be used until
/// hsa_amd_memory_copy is fixed. Should only be used for cpu memory.
/// \param[in] dst Destination address of memory to be copied
/// \param[in] src Source address of memory to be copied
/// \param[in] size Size of buffer to copy
/// \returns HSA_STATUS_SUCCESS if there are no errors
hsa_status_t hsa_memory_copy_workaround_cpu(void* dst, const void *src,
size_t size);
/// This is a work-around for copying memory to be used until
/// hsa_amd_memory_copy is fixed. Should be used when gpu local memory is
/// involved.
/// \param[in] dst Destination address of memory to be copied
/// \param[in] src Source address of memory to be copied
/// \param[in] size Size of buffer to copy
/// \param[in] dst_ag Destination agent handle
/// \param[in] src_ag Source agent handle
/// \returns HSA_STATUS_SUCCESS if there are no errors
hsa_status_t hsa_memory_copy_workaround_gen(void* dst, const void *src,
size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag);
} // namespace rocrtst
#endif // ROCRTST_COMMON_COMMON_H_
+6 -10
Просмотреть файл
@@ -52,10 +52,10 @@
#include <iostream>
#include <string>
#include <vector>
#include <numeric>
namespace rocrtst {
template<typename T>
void PrintArray(const std::string header, const T* data, const int width,
const int height) {
@@ -191,7 +191,7 @@ AlignUp(void* value, size_t alignment) {
alignment));
}
double CalcMedian(std::vector<double> scores) {
double CalcMedian(const std::vector<double> &scores) {
double median;
size_t size = scores.size();
@@ -204,15 +204,11 @@ double CalcMedian(std::vector<double> scores) {
return median;
}
double CalcMean(std::vector<double> scores) {
double mean = 0;
size_t size = scores.size();
double CalcMean(const std::vector<double> &scores) {
double mean;
for (size_t i = 0; i < size; ++i) {
mean += scores[i];
}
return mean / size;
mean = std::accumulate(scores.begin(), scores.end(), 0.0);
return mean/scores.size();
}
double CalcMean(const std::vector<double>& v1, const std::vector<double>& v2) {
+2 -2
Просмотреть файл
@@ -60,7 +60,7 @@ bool Compare(const double* refData, const double* data,
const int length, const double epsilon = 1e-6);
/// Calculate the mean number of the vector
double CalcMean(std::vector<double> scores);
double CalcMean(const std::vector<double> &scores);
/// Calculate the mean time of difference of the two vectors
double CalcMean(const std::vector<double>& v1, const std::vector<double>& v2);
@@ -68,7 +68,7 @@ double CalcMean(const std::vector<double>& v1, const std::vector<double>& v2);
/// Return the median value of a vector of doubles
/// \param[in] scores Vector of doubles
/// \returns double Median value of provided vector
double CalcMedian(std::vector<double> scores);
double CalcMedian(const std::vector<double> &scores);
/// Calculate the standard deviation of the vector
double CalcStdDeviation(std::vector<double> scores, int score_mean);
+4
Просмотреть файл
@@ -70,6 +70,7 @@ PreDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
dispParam->aql_translation_handle, true);
assert((status == HSA_STATUS_SUCCESS) &&
"Error in beginning Perf Cntr Session");
(void)status; // Avoid warning
}
static void
@@ -82,6 +83,7 @@ PostDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
dispParam->aql_translation_handle);
assert((status == HSA_STATUS_SUCCESS) &&
"Error in endning Perf Cntr Session");
(void)status; // Avoid warning
}
/// Constructor of the class
@@ -192,6 +194,8 @@ void RocrPerfCntrApp::RegisterCallbacks(hsa_queue_t* queue) {
status = hsa_ext_tools_set_callback_arguments(queue, &perfMgr_, &perfMgr_);
assert((status == HSA_STATUS_SUCCESS) &&
"Error in registering Pre & Post Dispatch Callback Params");
(void)status; // Avoid warning
return;
}
+1 -2
Просмотреть файл
@@ -176,8 +176,7 @@ uint64_t PerfTimer::MeasureTSCFreqHz() {
do {
tscTicksEnd = __rdtscp(&unused);
}
while (tscTicksEnd - tscTicksBegin < 1000000000);
} while (tscTicksEnd - tscTicksBegin < 1000000000);
uint64_t coarseEndUs = CoarseTimestampUs();
+1
Просмотреть файл
@@ -91,6 +91,7 @@ class PerfTimer {
void ResetTimer(int index);
/// Read the time value of the timer associated with the provided index.
/// Units are seconds
/// \param[in] index Index of the timer to read
/// \returns double Value of the timer
double ReadTimer(int index);
+5
Просмотреть файл
@@ -254,6 +254,11 @@ set(BITCODE_LIBS "${BITCODE_LIBS} ${BITCODE_PREF}/ocml.amdgcn.bc")
set(CL_FILE_LIST "${PROJECT_SOURCE_DIR}/binary_search/binary_search_kernels.cl")
process_sample("binary_search")
# P2P Memory Access
set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
set(CL_FILE_LIST "${PROJECT_SOURCE_DIR}/p2p_mem_access/p2p_mem_access_kernels.cl")
process_sample("p2p_mem_access")
# RocR Info
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/rocrinfo ROCR_INFO_SOURCES)
add_executable(rocrinfo ${ROCR_INFO_SOURCES})
+88 -20
Просмотреть файл
@@ -25,10 +25,6 @@ cmake_minimum_required(VERSION 2.8.0)
# 4) Set env. variable TARGET_DEVICE to indicate gpu type (e.g., gfx803,
# gfx900, ...)
#
# 5) Set env. variables AMDHSAFIN_DIR and AMDHSAFIN_TARGET to the
# directory containing the amd finalizer executable and version
# (e.g, 8:0:3) respectively.
#
# Building rocrtst Suite
#
# 1) Create build folder e.g. "rocrtst/build" - any name will do
@@ -91,6 +87,32 @@ else()
endif()
endif()
if (DEFINED ENV{OPENCL_DIR})
set(CLANG $ENV{OPENCL_DIR}/bin/x86_64/clang)
set(OPENCL_DIR $ENV{OPENCL_DIR})
if (NOT EXISTS ${CLANG})
message("ERROR: path to clang (${CLANG}) is not valid. Is env. variable OPENCL_DIR correct?")
return()
endif()
if (DEFINED ENV{OPENCL_VER})
set(OPENCL_VER $ENV{OPENCL_VER})
else()
message("OPENCL_VER environment variable is not set. Using default")
set(OPENCL_VER "2.0")
endif()
else()
message("WARNING: OPENCL_DIR environment variable is not set. Kernels will not be built.")
endif()
if (DEFINED ENV{TARGET_DEVICE})
set(TARGET_DEVICE $ENV{TARGET_DEVICE})
else()
message("ERROR: TARGET_DEVICE environment variable is not defined.")
message("Please define a valid clang target (e.g., gfx803, gfx900,...).")
return()
endif()
#
# Set Name for rocrtst Suite Project
#
@@ -105,17 +127,22 @@ project (${ROCRTST_SUITE_NAME})
# Build Type: Debug Vs Release, 32 Vs 64
# Compiler Version, etc
#
MESSAGE("")
MESSAGE("-------------IS64BIT: " ${IS64BIT})
MESSAGE("-----------BuildType: " ${BUILD_TYPE})
MESSAGE("------------Compiler: " ${CMAKE_CXX_COMPILER})
MESSAGE("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
MESSAGE("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
MESSAGE("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
MESSAGE("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
MESSAGE("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
MESSAGE("")
message("")
message("Build Configuration:")
message("-------------IS64BIT: " ${IS64BIT})
message("-----------BuildType: " ${BUILD_TYPE})
message("------------Compiler: " ${CMAKE_CXX_COMPILER})
message("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
message("-------Target Device: " ${TARGET_DEVICE})
message("----------Clang path: " ${CLANG})
message("-------OpenCL version " ${OPENCL_VER})
message("")
set(KERNELS_DIR ${PROJECT_SOURCE_DIR}/kernels)
#
# Set the build type based on user input
#
@@ -148,7 +175,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic")
#
@@ -164,7 +191,7 @@ endif()
# Add compiler flags to include symbol information for debug builds
#
if(ISDEBUG)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0")
endif()
MESSAGE("ISDEBUG STEP:Done")
@@ -201,10 +228,11 @@ MESSAGE(${ROCRTST_LIBS})
set(ROCRTST "rocrtst${ONLY64STR}")
#
# Sorce files for building rocrtst
# Source files for building rocrtst
#
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} performanceSources)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/test_common testCommonSources)
aux_source_directory(${ROCRTST_ROOT}/suites/test_common testCommonSources)
# Header file include path
@@ -212,11 +240,51 @@ include_directories(${ROCR_INC_DIR})
include_directories(${ROCRTST_ROOT})
include_directories(${ROCRTST_ROOT}/gtest/include)
# Build rules
# Use this function to build any samples that have kernels to be built.
#
# S_NAME is the sample's base name. The function defines a custom target
# "<S_NAME>_hsaco" that runs ${CLANG} to produce
# ${PROJECT_BINARY_DIR}/<S_NAME>_kernels.hsaco, and appends that target name
# to HSACO_TARG_LIST in the caller's scope.
#
# Variables read from the calling scope: CLANG, TARGET_DEVICE, OPENCL_DIR,
# OPENCL_VER, BITCODE_LIBS and CL_FILE_LIST.
function(build_kernel S_NAME)
  set(SNAME_KERNEL "${S_NAME}_kernels.hsaco")
  set(TARG_NAME "${S_NAME}_hsaco")
  set(HSACO_TARG_LIST ${HSACO_TARG_LIST} ${TARG_NAME} PARENT_SCOPE)
  # The target triple is <arch>-<vendor>-<os>; the AMD vendor field is "amd".
  separate_arguments(CLANG_ARG_LIST UNIX_COMMAND "-target amdgcn-amd-amdhsa -mcpu=${TARGET_DEVICE} -include ${OPENCL_DIR}/include/opencl-c.h ${BITCODE_LIBS} -cl-std=CL${OPENCL_VER} ${CL_FILE_LIST} -o ${PROJECT_BINARY_DIR}/${SNAME_KERNEL}")
  add_custom_target(${TARG_NAME} ${CLANG} ${CLANG_ARG_LIST}
                    COMMENT "BUILDING KERNEL..."
                    VERBATIM)
endfunction(build_kernel)
add_executable(${ROCRTST} ${performanceSources} ${common_srcs})
######################
# Kernel Build Section
######################
set(KERN_SUFFIX "kernels.hsaco")
set(BITCODE_PREF "-Xclang -mlink-bitcode-file -Xclang")
set(BITCODE_PREF "${BITCODE_PREF} ${OPENCL_DIR}/lib/x86_64/bitcode")
set(COMMON_BITCODE_LIBS "${BITCODE_PREF}/opencl.amdgcn.bc")
set(COMMON_BITCODE_LIBS "${COMMON_BITCODE_LIBS} ${BITCODE_PREF}/ockl.amdgcn.bc")
# To build kernels, repeat the pattern used below for the P2P kernel; this
# pattern sets the bitcode libraries required by the kernel which will be
# used in the build_kernel() call, which builds the kernel.
# Test Case Template example
set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
set(CL_FILE_LIST "${KERNELS_DIR}/test_case_template_kernels.cl")
build_kernel("test_case_template")
# P2P Memory Access
#set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
#set(CL_FILE_LIST "${KERNELS_DIR}/p2p_mem_access_kernels.cl")
#build_kernel("p2p_mem_access")
# Dispatch Time
set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
set(CL_FILE_LIST "${KERNELS_DIR}/dispatch_time_kernels.cl")
build_kernel("dispatch_time")
# Build rules
add_executable(${ROCRTST} ${performanceSources} ${common_srcs} ${testCommonSources})
target_link_libraries(${ROCRTST} ${ROCRTST_LIBS} c stdc++ dl pthread rt)
add_custom_target(rocrtst_kernels DEPENDS ${HSACO_TARG_LIST})
INSTALL(TARGETS ${ROCRTST}
ARCHIVE DESTINATION ${PROJECT_BINARY_DIR}/lib
LIBRARY DESTINATION ${PROJECT_BINARY_DIR}/lib
-258
Просмотреть файл
@@ -1,258 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "cp_process_time.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "common/os.h"
#include "gtest/gtest.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include "hsa/hsa_ext_finalize.h"
#include <algorithm>
static const uint64_t kKernelIterations = 10000;
static const uint64_t kTestBadValue = 1234567891234567891;
//Set up some expectations for reasonable processing times
//For gfx803, Overhead time had a max of 18.208uS and a min of 7.82uS
static const double kGfx803MinOverhead = 7.78;
static const double kGfx803MaxOverhead = 21.064;
static const double kOverheadToleranceFactor = 0.25;
// Construct the test on top of a default BaseRocR and start with no mean
// recorded. The kernel name is supplied later, in SetUp().
CpProcessTime::CpProcessTime() : BaseRocR() {
  mean_ = 0.0;
}

// Nothing to release here.
CpProcessTime::~CpProcessTime() {}
// Prepare the test: name the kernel file and kernel, initialize HSA, create
// the main queue on the first GPU with profiling enabled, load the kernel
// and fill in the staging AQL packet for a single work-item dispatch.
// Returns early, without asserting, if InitAndSetupHSA() does not succeed.
void CpProcessTime::SetUp() {
  set_kernel_file_name("simple_kernel.o");
  set_kernel_name("&__simple_kernel");

  if (rocrtst::InitAndSetupHSA(this) != HSA_STATUS_SUCCESS) {
    return;
  }

  // Queue on the first GPU; it becomes the test's main queue.
  hsa_queue_t* q = nullptr;
  rocrtst::CreateQueue(*gpu_device1(), &q);
  ASSERT_NE(q, nullptr);
  set_main_queue(q);

  // Profiling must be on for the queue so dispatch times can be read later.
  hsa_status_t err = hsa_amd_profiling_set_profiler_enabled(q, 1);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Load the kernel named above.
  err = rocrtst::LoadKernelFromObjFile(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Default packet values, then shrink the dispatch to one work-item.
  rocrtst::InitializeAQLPacket(this, &aql());
  aql().workgroup_size_x = 1;
  aql().grid_size_x = 1;
}
// Number of dispatches actually performed: 1.2 times num_iteration() plus
// one, truncated to an integer on return.
size_t CpProcessTime::RealIterationNum() {
  const double padded_count = num_iteration() * 1.2 + 1;
  return padded_count;
}
// Measure the per-dispatch overhead of launching the kernel.
// For each of RealIterationNum() dispatches the host wall-clock time from
// just before the packet header is written until the completion signal
// fires is recorded, and the profiled kernel execution time is subtracted
// from it. The retained samples are averaged into mean_.
// Does nothing if the target does not meet the test's profile requirement.
void CpProcessTime::Run() {
hsa_status_t err;
// Per-dispatch overhead samples, in microseconds
std::vector<double> timer;
if (!rocrtst::CheckProfile(this)) {
return;
}
hsa_agent_t* gpu_dev = gpu_device1();
hsa_agent_t* cpu_dev = cpu_device();
ASSERT_NE(gpu_dev, nullptr);
ASSERT_NE(cpu_dev, nullptr);
uint32_t it = RealIterationNum();
// Kernel argument block: two pointers to 64-bit values
typedef struct args_t {
uint64_t* iteration;
uint64_t* result;
} args;
err = rocrtst::SetPoolsTypical(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Both kernel-visible values are allocated from the cpu pool.
// NOTE(review): neither buffer is freed in this function - confirm whether
// they are released elsewhere.
uint64_t* iter = NULL;
uint64_t* result = NULL;
err = rocrtst::AllocAndAllowAccess(this, sizeof(uint64_t), cpu_pool(),
(void**)&iter);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = rocrtst::AllocAndAllowAccess(this, sizeof(uint64_t), cpu_pool(),
(void**)&result);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Seed result with a sentinel so a dispatch that never wrote it is detected
// by the EXPECT_EQ on *result below.
*iter = kKernelIterations;
*result = kTestBadValue;
args k_args;
k_args.iteration = (uint64_t*)iter;
k_args.result = (uint64_t*)result;
err = rocrtst::AllocAndSetKernArgs(this, &k_args, sizeof(args));
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
rocrtst::WriteAQLToQueue(this);
void * q_base_addr = main_queue()->base_address;
const uint32_t queue_mask = main_queue()->size - 1;
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
// aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
// HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
// aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
// HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
for (uint32_t i = 0; i < it; i++) {
// uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue());
// Reserve the next queue slot.
// NOTE(review): only the header and setup words are written to this slot
// below; this presumably relies on the rest of the slot already holding the
// dispatch packet - confirm against WriteAQLToQueue().
uint64_t que_idx = hsa_queue_add_write_index_relaxed(main_queue(), 1);
// Get time stamp and ring the doorbell to dispatch the kernel.
rocrtst::PerfTimer p_timer;
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
rocrtst::AtomicSetPacketHeader(aql_header, aql().setup,
&((hsa_kernel_dispatch_packet_t*)(q_base_addr))[que_idx & queue_mask]);
hsa_queue_store_write_index_relaxed(main_queue(), (que_idx + 1));
hsa_signal_store_relaxed(main_queue()->doorbell_signal, que_idx);
// Spin until the completion signal drops below 1
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
// hsa_signal_value_t value = hsa_signal_wait_scacquire(signal(),
// HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
// value should be 0, or we timed-out
//ASSERT_EQ(value, 0);
p_timer.StopTimer(id);
// Kernel execution time as reported by the profiler, in timestamp ticks
hsa_amd_profiling_dispatch_time_t dispatch_time;
err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(),
&dispatch_time);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
uint64_t ticks = dispatch_time.end - dispatch_time.start;
uint64_t freq;
err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Set the completion signal back to 1 so the next iteration's wait for a
// value below 1 does not return immediately.
hsa_signal_store_screlease(signal(), 1);
double execution_time = (double) ticks / freq * 1e6; //convert to us
// Host-measured time scaled by 1e6
// (NOTE(review): assumes ReadTimer() returns seconds - confirm)
double temp = p_timer.ReadTimer(id) * 1e6;
// Overhead = host-measured total minus profiled kernel execution time
double cp_time = temp - execution_time;
#ifdef DEBUG
std::cout << "Total:" << temp << "uS ";
std::cout << "Execution:" << execution_time << "uS ";
std::cout << "Overhead:" << cp_time << "uS ";
std::cout << "Overhead %:" << cp_time / execution_time * 100 << std::endl;
#endif
// The test expects *result to equal kKernelIterations after the dispatch.
EXPECT_EQ(kKernelIterations, *result);
timer.push_back(cp_time);
// Assume overhead will not deviate too much from previously recorded
// values. If this does happen and there is not a performance bug,
// modify these constants.
// This may need to be made specific to the gpu being used.
EXPECT_GT(cp_time, kGfx803MinOverhead * (1 - kOverheadToleranceFactor));
EXPECT_LT(cp_time, kGfx803MaxOverhead * (1 + kOverheadToleranceFactor));
// Clear the result so the next dispatch has to write it again
*result = 0;
}
// Discard the first sample, sort ascending, then keep only the first
// num_iteration() samples (i.e. drop the largest ones) before averaging.
timer.erase(timer.begin());
std::sort(timer.begin(), timer.end());
timer.erase(timer.begin() + num_iteration(), timer.end());
mean_ = rocrtst::CalcMean(timer);
return;
}
// Print the mean Command Processor processing time held in mean_.
// Nothing is printed when the profile check fails or when mean_ is still 0.0.
void CpProcessTime::DisplayResults() const {
  if (!rocrtst::CheckProfile(this) || mean_ == 0.0) {
    return;
  }

  // Same rule is printed above and below the result line.
  const char* const banner =
      "====================================================";

  std::cout << banner << std::endl;
  std::cout << "The average Command Processor processing time is: " << mean_
            << "us" << std::endl;
  std::cout << banner << std::endl;
}
void CpProcessTime::Close() {
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
-91
Просмотреть файл
@@ -1,91 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_CP_PROCESS_TIME_H__
#define __ROCRTST_SRC_CP_PROCESS_TIME_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "common/common.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include <vector>
//@Brief: Performance test that reports the mean Command Processor (CP)
// processing time of a kernel dispatch. For every dispatch, Run() subtracts
// the kernel execution time reported by the profiling API from the measured
// wall-clock time and averages the remainder into mean_.
class CpProcessTime: public rocrtst::BaseRocR, public PerfBase {
public:
//@Brief: Constructor
CpProcessTime();
//@Brief: Destructor
virtual ~CpProcessTime();
//@Brief: Set up the environment for the test
virtual void SetUp();
//@Brief: Run the test case and compute mean_
virtual void Run();
//@Brief: Print mean_ in microseconds; prints nothing when the profile
// check fails or when mean_ is 0.0
virtual void DisplayResults() const;
//@Brief: Clean up through rocrtst::CommonCleanUp() and assert it succeeded
virtual void Close();
private:
//@Brief: Get actual iteration number
// NOTE(review): presumably larger than num_iteration() so that Run() can
// discard samples; the definition is not visible here -- confirm.
virtual size_t RealIterationNum();
//@Brief: Store the size of queue
uint32_t queue_size_;
//@Brief: The mean CP processing time, in microseconds (0.0 until computed)
double mean_;
};
#endif
-220
Просмотреть файл
@@ -1,220 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "cu_masking.h"
#include "common/base_rocr_utils.h"
#include "gtest/gtest.h"
// Construct the test with every data member in a defined state.
// queue_size_, manual_input and group_input were previously left
// uninitialised; they are now zeroed together with the other members.
CuMasking::CuMasking() :
  BaseRocR(),
  queue_size_(0),
  mean_(0.0),
  cu_(nullptr),
  manual_input(0),
  group_input(0) {
  // Start from an all-zero AQL dispatch packet; SetUp() fills it in later.
  memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t));
  group_region_.handle = 0;
}

// Nothing to release here: clean-up is performed by Close().
CuMasking::~CuMasking() {
}
void CuMasking::SetUp() {
hsa_status_t err;
hsa_agent_t* gpu_dev = gpu_device1();
hsa_agent_t* cpu_dev = cpu_device();
set_kernel_file_name("cu_masking.o");
set_kernel_name("&main");
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
return;
}
// Create a queue
hsa_queue_t* q = nullptr;
rocrtst::CreateQueue(*gpu_dev, &q);
set_main_queue(q);
rocrtst::LoadKernelFromObjFile(this);
// Fill up the kernel packet except header
// aql().completion_signal=signal();
// TODO: Will delete manual_input later
uint32_t cu_count = 0;
err = hsa_agent_get_info(*gpu_dev,
(hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
std::cout << "CU# is: " << cu_count << std::endl;
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
rocrtst::InitializeAQLPacket(this, &aql());
aql().workgroup_size_x = 1024;
//manual_input * group_input; // workgroup_max_size;
aql().grid_size_x = (long long) 1024 * 640 * 640;
// TODO:Manully set the max cu number to 8, the api return 10
std::cout << "Grid size is: " << aql().grid_size_x << std::endl;
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev,
rocrtst::FindGlobalPool, &cpu_pool());
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
}
// Return the requested iteration count scaled by 1.2 plus one, truncated to
// an integer.
size_t CuMasking::RealIterationNum() {
  const double padded_count = num_iteration() * 1.2 + 1;
  return static_cast<size_t>(padded_count);
}
void CuMasking::Run() {
hsa_status_t err;
if (!rocrtst::CheckProfile(this)) {
return;
}
std::vector<double> timer;
typedef struct args_t {
uint32_t* iteration;
uint32_t* result;
} local_args;
uint32_t* iter = NULL;
uint32_t* result = NULL;
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(uint32_t), 0,
(void**) &iter);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(uint32_t), 0,
(void**) &result);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
*iter = 0xff;
*result = 0;
err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, iter);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, result);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
local_args* kernarg = NULL;
err = hsa_amd_memory_pool_allocate(cpu_pool(), kernarg_size(), 0,
(void**) &kernarg);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, kernarg);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
kernarg->iteration = iter;
kernarg->result = result;
aql().kernarg_address = kernarg;
// Obtain the current queue write inex.
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
// Write the aql packet at the calculate queue index address.
const uint32_t queue_mask = main_queue()->size - 1;
// Set CU mask
uint32_t cu_mask = 0;
#if 0
std::cout << "Enter cu mask value:" << std::endl;
ASSERT_NE(scanf("%d", &cu_mask), EOF);
#else
cu_mask = 0xAAAAAAAA;
#endif
std::cout << "Value of bit array is: 0x" << std::hex << cu_mask << std::endl;
err = hsa_amd_queue_cu_set_mask(main_queue(), 32, &cu_mask);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
void *q_base_addr = main_queue()->base_address;
// Write the aql packet at the calculate queue index address.
aql().completion_signal = signal();
((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask] = aql();
// Get timing stamp an ring the doorbell to dispatch the kernel.
rocrtst::PerfTimer p_timer;
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
p_timer.StopTimer(id);
hsa_signal_store_screlease(signal(), 1);
double t1 = p_timer.ReadTimer(id) * 1e6;
std::cout << "Execution time after setting cu masking: " << t1 << std::endl;
return;
}
// Print two separator rules when the profile check passes. This test has no
// summary value of its own; the timing is printed directly by Run().
void CuMasking::DisplayResults() const {
  if (rocrtst::CheckProfile(this)) {
    std::cout << "===================================================="
              << std::endl;
    std::cout << "====================================================="
              << std::endl;
  }
}
void CuMasking::Close() {
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
-103
Просмотреть файл
@@ -1,103 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_CU_MASKING_TIME_H__
#define __ROCRTST_SRC_CU_MASKING_TIME_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "common/common.h"
#include "common/hsatimer.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include "hsa/hsa_ext_finalize.h"
#include <algorithm>
#include <vector>
//@Brief: Test that applies a compute-unit (CU) mask to the dispatch queue,
// launches one kernel and prints how long that dispatch took.
class CuMasking: public rocrtst::BaseRocR, public PerfBase {
public:
//@Brief: Constructor
CuMasking();
//@Brief: Destructor
virtual ~CuMasking();
//@Brief: Set up the environment for the test
virtual void SetUp();
//@Brief: Run the test case: set the CU mask, dispatch once, print the time
virtual void Run();
//@Brief: Print separator rules; the timing itself is printed by Run()
virtual void DisplayResults() const;
//@Brief: Clean up through rocrtst::CommonCleanUp() and assert it succeeded
virtual void Close();
private:
//@Brief: Get actual iteration number (num_iteration() * 1.2 + 1)
virtual size_t RealIterationNum();
//@Brief: Store the size of queue
uint32_t queue_size_;
//@Brief: Mean time value; only reset to 0.0 by the constructor.
// NOTE(review): no other use is visible in this test -- confirm it is needed.
double mean_;
//@Brief: The group memory region (handle reset to 0 by the constructor)
hsa_region_t group_region_;
//@Brief: Pointer to cu_id array
// NOTE(review): only ever set to NULL in the visible code -- confirm usage.
uint32_t* cu_;
// NOTE(review): manual_input and group_input appear only in commented-out
// code in SetUp(), which carries a TODO to delete manual_input.
uint32_t manual_input;
uint32_t group_input;
};
#endif
-293
Просмотреть файл
@@ -1,293 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "device_load_bandwidth.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "common/os.h"
#include "gtest/gtest.h"
#include <algorithm>
// TODO: The validation code below is disabled because it still has problems that need debugging
#if 0
// Fill the input buffer with the pattern read back by the load kernel.
//
// The buffer is treated as num_ops consecutive groups of num_thrds entries,
// and entry t of every group receives (t << 2).
//
// @param in_data   Buffer with room for num_ops * num_thrds values.
// @param num_thrds Number of threads (entries per group).
// @param num_ops   Number of groups to write.
// @param num_loops Kernel loop count. The written data does not depend on it,
//                  except that nothing is written when it is 0. The previous
//                  version rewrote the same values num_loops times; one pass
//                  produces the identical buffer.
static void initGlobalReadBuffer(uint32_t* in_data, uint32_t num_thrds,
                                 uint32_t num_ops, uint32_t num_loops) {
  if (num_loops == 0) {
    return;
  }

  uint32_t val_idx = 0;

  for (uint32_t op = 0; op < num_ops; ++op) {
    // Write the value to be read by each thread: thread id shifted left by 2.
    for (uint32_t thrd = 0; thrd < num_thrds; ++thrd) {
      in_data[val_idx++] = thrd << 2;
    }
  }
}
// Check the kernel output buffer against the expected pattern.
//
// Entry t of data must equal (t << 2) * scale for every t < num_thrds. The
// first mismatch is reported on std::cout, prefixed with kernel_name, and
// ends the check.
//
// @return true when every entry matches, false at the first mismatch.
static bool verifyGlobalLoadKernel(uint32_t* data, uint32_t num_thrds,
                                   uint32_t scale, const char* kernel_name) {
  for (uint32_t thrd = 0; thrd < num_thrds; ++thrd) {
    const uint32_t expected = (thrd << 2) * scale;

    if (data[thrd] == expected) {
      continue;
    }

    std::cout << "Value expected = " << expected << std::endl;
    std::cout << "Value of data = " << data[thrd] << std::endl;
    std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: " << thrd
              << std::endl;
    std::cout << kernel_name << ": VALUE @ Bad index: " << data[thrd]
              << std::endl;
    std::cout << std::endl;
    return false;
  }

#ifdef DEBUG
  std::cout << kernel_name << ": Passed validation" << std::endl;
  std::cout << std::endl;
#endif

  return true;
}
#endif
// Constructor
// Puts every member into a defined state. in_data_ and out_data_ are now
// reset to nullptr: Close() passes both to hsa_amd_memory_pool_free()
// unconditionally, so they must not hold indeterminate values when Run()
// returned before allocating them.
DeviceLoadBandwidth::DeviceLoadBandwidth() :
  BaseRocR() {
  set_group_size(0);
  set_enable_interrupt(false);
  num_group_ = 0;
  num_cus_ = 0;
  kernel_loop_count_ = 0;
  mean_ = 0.0;
  data_size_ = 0;
  in_data_ = nullptr;
  out_data_ = nullptr;
  set_requires_profile(HSA_PROFILE_BASE);
}

// Destructor
// Nothing to release here: buffers are freed in Close().
DeviceLoadBandwidth::~DeviceLoadBandwidth() {
}
// Prepare the load-bandwidth test: choose the work-item configuration,
// initialise HSA, create the dispatch queue, load the kernel and size the
// AQL packet.
void DeviceLoadBandwidth::SetUp() {
  SetWorkItemNum();

  set_kernel_file_name("sysMemRead.o");
  set_kernel_name("&__SysMemLoad");

  if (rocrtst::InitAndSetupHSA(this) != HSA_STATUS_SUCCESS) {
    return;
  }

  // Create the dispatch queue on the first GPU and stop if none was created.
  hsa_queue_t* q = nullptr;
  rocrtst::CreateQueue(*gpu_device1(), &q);
  ASSERT_NE(q, nullptr);
  set_main_queue(q);

  rocrtst::LoadKernelFromObjFile(this);

  // One work-item per (CU, group, group member).
  const uint32_t grid_items = num_cus_ * num_group_ * group_size();

  // Fill in the part of the AQL packet that is known at this point.
  rocrtst::InitializeAQLPacket(this, &aql());
  aql().workgroup_size_x = group_size();
  aql().grid_size_x = grid_items;
}
// Run the test
void DeviceLoadBandwidth::Run() {
hsa_status_t err;
if (!rocrtst::CheckProfile(this)) {
return;
}
uint32_t total_workitems = num_cus_ * num_group_ * group_size();
uint32_t ops_thrd = 32;
uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint64_t);
uint64_t total_ops = (uint64_t) total_workitems * ops_thrd;
uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint64_t);
data_size_ = in_data_size;
err = rocrtst::SetPoolsTypical(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = rocrtst::AllocAndAllowAccess(this, in_data_size, device_pool(),
(void**)&in_data_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
//uint32_t out_data_size = total_workitems * sizeof(uint64_t);
uint32_t out_data_size = in_data_size;
err = rocrtst::AllocAndAllowAccess(this, out_data_size, device_pool(),
(void**)&out_data_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
#if 0
initGlobalReadBuffer(in_data_, total_workitems, ops_thrd, kernel_loop_count_);
#endif
struct local_args_t {
void* arg0;
void* arg1;
uint64_t arg2;
void* arg3;
} local_args;
local_args.arg0 = in_data_;
local_args.arg1 = in_data_ + total_ops;
local_args.arg2 = addr_step;
local_args.arg3 = out_data_;
// Copy the kernel args structure into a registered memory block
err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
std::vector<double> time;
rocrtst::WriteAQLToQueue(this);
// Write the aql packet at the calculated queue index address.
const uint32_t queue_mask = main_queue()->size - 1;
void * q_base = main_queue()->base_address;
for (uint32_t i = 0; i < num_iteration(); i++) {
uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue());
rocrtst::PerfTimer p_timer;
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
rocrtst::AtomicSetPacketHeader(aql_header, aql().setup,
&((hsa_kernel_dispatch_packet_t*)(q_base))[que_idx & queue_mask]);
hsa_signal_store_screlease(main_queue()->doorbell_signal, que_idx);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
p_timer.StopTimer(id);
#ifdef DEBUG
std::cout << "." << std::flush;
#endif
#if 0
// Verify the results
uint32_t scale = kernel_loop_count_ * ops_thrd;
verifyGlobalLoadKernel(out_data_, total_workitems, scale,
kernel_name().c_str());
#endif
time.push_back(p_timer.ReadTimer(id));
hsa_signal_store_screlease(signal(), 1);
}
#ifdef DEBUG
std::cout << std::endl;
#endif
time.erase(time.begin());
std::sort(time.begin(), time.end());
time.erase(time.begin() + num_iteration(), time.end());
mean_ = rocrtst::CalcMean(time);
return;
}
// Free the two data buffers allocated in Run(), then run the shared rocrtst
// clean-up. A failed free is reported but does not stop the clean-up; a
// failed CommonCleanUp() is asserted.
void DeviceLoadBandwidth::Close() {
  hsa_status_t err = hsa_amd_memory_pool_free(in_data_);
  EXPECT_EQ(err, HSA_STATUS_SUCCESS);

  err = hsa_amd_memory_pool_free(out_data_);
  EXPECT_EQ(err, HSA_STATUS_SUCCESS);

  err = rocrtst::CommonCleanUp(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Print the measured load bandwidth: data_size_ divided by the mean dispatch
// time, scaled down by 1024 three times. Nothing is printed when the profile
// check fails.
void DeviceLoadBandwidth::DisplayResults() const {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  const auto bandwidth = data_size_ / mean_ / 1024 / 1024 / 1024;

  std::cout << "=======================================" << std::endl;
  std::cout << "Device Load Bandwidth: ";
  std::cout << bandwidth << "(GB/S)" << std::endl;
  std::cout << "=======================================" << std::endl;
}
-219
Просмотреть файл
@@ -1,219 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "device_store_bandwidth.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "gtest/gtest.h"
// Constructor
// Zeroes every data member, resets the group size and requests the base
// HSA profile.
DeviceStoreBandwidth::DeviceStoreBandwidth() :
  BaseRocR(),
  num_group_(0),
  num_cus_(0),
  kernel_loop_count_(0),
  mean_(0.0),
  data_size_(0),
  in_data_(nullptr),
  out_data_(nullptr) {
  set_group_size(0);
  set_requires_profile(HSA_PROFILE_BASE);
}

// Destructor
// Nothing to release here: buffers are freed in Close().
DeviceStoreBandwidth::~DeviceStoreBandwidth() {
}
// Prepare the store-bandwidth test: choose the work-item configuration,
// initialise HSA, create the dispatch queue, load the kernel and size the
// AQL packet.
void DeviceStoreBandwidth::SetUp() {
  SetWorkItemNum();

  set_kernel_file_name("sysMemWrite.o");
  set_kernel_name("&__SysMemStore");

  if (rocrtst::InitAndSetupHSA(this) != HSA_STATUS_SUCCESS) {
    return;
  }

  // Create the dispatch queue on the first GPU and stop if none was created.
  hsa_queue_t* q = nullptr;
  rocrtst::CreateQueue(*gpu_device1(), &q);
  ASSERT_NE(q, nullptr);
  set_main_queue(q);

  rocrtst::LoadKernelFromObjFile(this);

  // One work-item per (CU, group, group member).
  const uint32_t grid_items = num_cus_ * num_group_ * group_size();

  // Fill in the part of the AQL packet that is known at this point.
  rocrtst::InitializeAQLPacket(this, &aql());
  aql().workgroup_size_x = group_size();
  aql().grid_size_x = grid_items;
}
// Run the test
void DeviceStoreBandwidth::Run() {
hsa_status_t err;
if (!rocrtst::CheckProfile(this)) {
return;
}
uint32_t total_workitems = num_cus_ * num_group_ * group_size();
uint32_t ops_thrd = 16;
uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t);
uint64_t total_ops = (uint64_t) total_workitems * kernel_loop_count_
* ops_thrd;
uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t);
data_size_ = in_data_size;
err = rocrtst::SetPoolsTypical(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = rocrtst::AllocAndAllowAccess(this, in_data_size, device_pool(),
(void**)&in_data_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
uint32_t out_data_size = total_workitems * sizeof(uint32_t);
err = rocrtst::AllocAndAllowAccess(this, out_data_size, device_pool(),
(void**)&out_data_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
struct local_args_t {
void* arg0;
void* arg1;
uint64_t arg2;
void* arg3;
} local_args;
local_args.arg0 = in_data_;
local_args.arg1 = in_data_ + total_ops;
local_args.arg2 = addr_step;
local_args.arg3 = out_data_;
// Copy the kernel args structure into a registered memory block
err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
std::vector<double> time;
rocrtst::WriteAQLToQueue(this);
for (uint32_t i = 0; i < num_iteration(); i++) {
uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue());
// Write the aql packet at the calculated queue index address.
const uint32_t queue_mask = main_queue()->size - 1;
rocrtst::PerfTimer p_timer;
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
void * q_base = main_queue()->base_address;
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
rocrtst::AtomicSetPacketHeader(aql_header, aql().setup,
&((hsa_kernel_dispatch_packet_t*)(q_base))[que_idx & queue_mask]);
hsa_signal_store_screlease(main_queue()->doorbell_signal, que_idx);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
p_timer.StopTimer(id);
#ifdef DEBUG
std::cout << "." << std::flush;
#endif
time.push_back(p_timer.ReadTimer(id));
hsa_signal_store_screlease(signal(), 1);
}
#ifdef DEBUG
std::cout << std::endl;
#endif
time.erase(time.begin());
mean_ = rocrtst::CalcMean(time);
return;
}
// Free the two data buffers allocated in Run(), then run the shared rocrtst
// clean-up. A failed free is reported but does not stop the clean-up; a
// failed CommonCleanUp() is asserted.
void DeviceStoreBandwidth::Close() {
  hsa_status_t err = hsa_amd_memory_pool_free(in_data_);
  EXPECT_EQ(err, HSA_STATUS_SUCCESS);

  err = hsa_amd_memory_pool_free(out_data_);
  EXPECT_EQ(err, HSA_STATUS_SUCCESS);

  err = rocrtst::CommonCleanUp(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Print the measured store bandwidth: data_size_ divided by the mean dispatch
// time, scaled down by 1024 three times. Nothing is printed when the profile
// check fails.
void DeviceStoreBandwidth::DisplayResults() const {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  const double bandwidth = data_size_ / mean_ / 1024 / 1024 / 1024;

  std::cout << "=======================================" << std::endl;
  std::cout << "Device Store Bandwidth: ";
  std::cout << bandwidth << "(GB/S)" << std::endl;
  std::cout << "=======================================" << std::endl;
}
-119
Просмотреть файл
@@ -1,119 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_INC_DEVICE_STORE_BANDWIDTH_H__
#define __ROCRTST_SRC_INC_DEVICE_STORE_BANDWIDTH_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include <stdio.h>
//@Brief: Performance test that measures device store bandwidth by timing
// repeated dispatches of the store kernel.
class DeviceStoreBandwidth: public rocrtst::BaseRocR, public PerfBase {
 public:
  //@Brief: Constructor
  DeviceStoreBandwidth();

  //@Brief: Destructor
  ~DeviceStoreBandwidth();

  //@Brief: Set up the testing environment
  virtual void SetUp();

  //@Brief: Run the test case
  virtual void Run();

  //@Brief: Close and clean up the test environment
  virtual void Close();

  //@Brief: Display store bandwidth
  virtual void DisplayResults() const;

  //@Brief: Set work-item configuration
  // With INTERACTIVE defined the four values are read from stdin; otherwise
  // fixed defaults are used.
  void SetWorkItemNum() {
#ifdef INTERACTIVE
    // The targets are uint32_t, so the unsigned conversion "%u" is used
    // instead of the signed "%d".
    // NOTE(review): assumes uint32_t is unsigned int on the supported
    // targets -- confirm, or switch to SCNu32 from <cinttypes>.
    uint32_t tmp = 0;
    printf("Please input the number of CUs you want to try:\n");
    scanf("%u", &num_cus_);
    printf("Please input the number of groups you want to try:\n");
    scanf("%u", &num_group_);
    printf("Please input the size of each group:\n");
    scanf("%u", &tmp);
    set_group_size(tmp);
    printf("Please input the number of kernel loop you want to try:\n");
    scanf("%u", &kernel_loop_count_);
#else
    num_cus_ = 32;
    num_group_ = 128;
    set_group_size(64);
    kernel_loop_count_ = 16;
#endif
    return;
  }

 private:
  //@Brief: number of group
  uint32_t num_group_;

  //@Brief: number of CUs
  uint32_t num_cus_;

  //@Brief: number of kernel loop
  uint32_t kernel_loop_count_;

  //@Brief: Mean execution time
  double mean_;

  //@Brief: data size for test, in bytes
  uint64_t data_size_;

  //@Brief: Device-pool buffers allocated in Run() and freed in Close()
  uint32_t* in_data_;
  uint32_t* out_data_;
};
#endif
+127 -120
Просмотреть файл
@@ -43,7 +43,10 @@
*
*/
#include "dispatch_time.h"
#include <algorithm>
#include <string>
#include "suites/performance/dispatch_time.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/os.h"
@@ -52,40 +55,68 @@
#include "gtest/gtest.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_finalize.h"
#include <algorithm>
DispatchTime::DispatchTime() :
BaseRocR() {
use_default_ = false;
launch_single_ = false;
DispatchTime::
DispatchTime(bool defaultInterrupt, bool launchSingleKernel) : TestBase(),
use_default_interupt_(defaultInterrupt),
launch_single_(launchSingleKernel) {
queue_size_ = 0;
num_batch_ = 100000;
memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t));
single_default_mean_ = 0.0;
single_interrupt_mean_ = 0.0;
multi_default_mean_ = 0.0;
multi_interrupt_mean_ = 0.0;
dispatch_time_mean_ = 0.0;
set_num_iteration(100);
set_kernel_file_name("dispatch_time_kernels.hsaco");
set_kernel_name("empty_kernel");
std::string name;
std::string desc;
name = "Average Dispatch Time";
desc = "This test measures the time to handle AQL packets that "
"do no work. Time is measured from when the packet is made available to"
" the Command Processor to when the target agent notifies the host that "
"the packet has been executed. ";
if (defaultInterrupt) {
name += ", Default Interrupts";
desc += "Interrupts are controlled by HSA_ENABLE_INTERRUPT environment "
"variable. ";
} else {
name += ", Interrupts Enabled";
desc += "Interrupts are enabled. ";
}
if (launchSingleKernel) {
name += ", Single Kernel";
desc += " One kernel at a time is and executed.";
} else {
name += ", Multiple Kernels";
desc += " Enough kernels to fill the queue are dispatched at one time";
}
set_title(name);
set_description(desc);
}
// Destructor: intentionally empty; no cleanup is performed here.
DispatchTime::~DispatchTime() {
}
void DispatchTime::SetUp() {
// If it indicates to use default signal, set env var properly
if (use_default_) {
hsa_status_t err;
// This need to happen before TestBase::SetUp()
if (use_default_interupt_) {
set_enable_interrupt(false);
}
else {
} else {
set_enable_interrupt(true);
}
set_kernel_file_name("empty_kernel.o");
set_kernel_name("&__Empty_kernel");
TestBase::SetUp();
// If it indicates to use default signal, set env var properly
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
return;
}
err = SetDefaultAgents(this);
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
hsa_agent_t* gpu_dev = gpu_device1();
@@ -105,24 +136,26 @@ void DispatchTime::SetUp() {
num_batch_ = num_batch_ > size ? size : num_batch_;
}
rocrtst::LoadKernelFromObjFile(this);
err = rocrtst::LoadKernelFromObjFile(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Fill up the kernel packet except header
rocrtst::InitializeAQLPacket(this, &aql());
err = rocrtst::InitializeAQLPacket(this, &aql());
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
aql().workgroup_size_x = 1;
aql().grid_size_x = 1;
}
void DispatchTime::Run() {
if (!rocrtst::CheckProfile(this)) {
return;
}
TestBase::Run();
if (launch_single_) {
RunSingle();
}
else {
} else {
RunMulti();
}
}
@@ -137,59 +170,59 @@ void DispatchTime::RunSingle() {
int it = RealIterationNum();
const uint32_t queue_mask = main_queue()->size - 1;
//queue should be empty
// queue should be empty
ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()),
hsa_queue_load_write_index_scacquire(main_queue()));
void *q_base_addr = main_queue()->base_address;
for (int i = 0; i < it; i++) {
//Obtain the current queue write index.
// Obtain the current queue write index.
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
ASSERT_LT(index, main_queue()->size + index);
//Write the aql packet at the calculated queue index address.
// Write the aql packet at the calculated queue index address.
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
//Get timing stamp and ring the doorbell to dispatch the kernel.
reinterpret_cast<hsa_kernel_dispatch_packet_t *>(
q_base_addr)[index & queue_mask] = aql();
// Get timing stamp and ring the doorbell to dispatch the kernel.
rocrtst::PerfTimer p_timer;
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
reinterpret_cast<hsa_kernel_dispatch_packet_t *>(
q_base_addr)[index & queue_mask].header |=
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
//Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(aql().completion_signal,
HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
}
p_timer.StopTimer(id);
timer.push_back(p_timer.ReadTimer(id));
hsa_signal_store_screlease(signal(), 1);
hsa_signal_store_screlease(aql().completion_signal, 1);
#ifdef DEBUG
std::cout << ".";
fflush(stdout);
#endif
if (verbosity() >= VERBOSE_PROGRESS) {
std::cout << ".";
fflush(stdout);
}
}
std::cout << std::endl;
if (verbosity() >= VERBOSE_PROGRESS) {
std::cout << std::endl;
}
//Abandon the first result and after sort, delete the last 2% value
// Abandon the first result and after sort, delete the last 2% value
timer.erase(timer.begin());
std::sort(timer.begin(), timer.end());
timer.erase(timer.begin() + num_iteration(), timer.end());
if (use_default_) {
single_default_mean_ = rocrtst::CalcMean(timer);
}
else {
single_interrupt_mean_ = rocrtst::CalcMean(timer);
}
dispatch_time_mean_ = rocrtst::CalcMean(timer);
return;
}
@@ -199,72 +232,69 @@ void DispatchTime::RunMulti() {
int it = RealIterationNum();
const uint32_t queue_mask = main_queue()->size - 1;
//queue should be empty
// queue should be empty
ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()),
hsa_queue_load_write_index_scacquire(main_queue()));
for (int i = 0; i < it; i++) {
uint64_t* index = (uint64_t*) malloc(sizeof(uint64_t) * num_batch_);
rocrtst::PerfTimer p_timer;
hsa_signal_store_screlease(signal(), num_batch_);
for (int i = 0; i < it; i++) {
uint64_t* index =
reinterpret_cast<uint64_t*>(malloc(sizeof(uint64_t) * num_batch_));
hsa_signal_store_screlease(aql().completion_signal, num_batch_);
for (uint32_t j = 0; j < num_batch_; j++) {
//index[j] = hsa_queue_add_write_index_scacq_screl(main_queue(), 1);
// index[j] = hsa_queue_add_write_index_scacq_screl(main_queue(), 1);
index[j] = hsa_queue_add_write_index_relaxed(main_queue(), 1);
//Write the aql packet at the calculated queue index address.
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
& queue_mask] = aql();
// Write the aql packet at the calculated queue index address.
(reinterpret_cast<hsa_kernel_dispatch_packet_t*>((
main_queue()->base_address)))[index[j] & queue_mask] = aql();
if (j == num_batch_ - 1) {
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
& queue_mask].header |= 1 << HSA_PACKET_HEADER_BARRIER;
//TODO: verify if the below is needed. I don't think it is. It should
// already be initialized to signal().
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
& queue_mask].completion_signal = signal();
(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
main_queue()->base_address))[index[j] & queue_mask].header |=
1 << HSA_PACKET_HEADER_BARRIER;
}
}
// Set the packet headers in reverse order; set all headers except the very
// first one, for now.
for (uint32_t j = num_batch_ - 1; j > 0; j--) {
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
& queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE;
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
(main_queue()->base_address))[index[j] & queue_mask].header |=
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
}
//Get timing stamp and ring the doorbell to dispatch the kernel.
rocrtst::PerfTimer p_timer;
// Get timing stamp and ring the doorbell to dispatch the kernel.
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
//Set the very first header...
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[0]
& queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE;
// Set the very first header...
(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
main_queue()->base_address))[index[0] & queue_mask].header |=
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
for (uint32_t j = 0; j < num_batch_; j++) {
hsa_signal_store_screlease(main_queue()->doorbell_signal, index[j]);
}
//Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0,
UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
;
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(aql().completion_signal,
HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) {
}
p_timer.StopTimer(id);
timer.push_back(p_timer.ReadTimer(id));
hsa_signal_store_screlease(signal(), 1);
hsa_signal_store_screlease(aql().completion_signal, 1);
free(index);
#ifdef DEBUG
std::cout << ".";
fflush(stdout);
#endif
if (verbosity() >= VERBOSE_PROGRESS) {
std::cout << ".";
fflush(stdout);
}
}
std::cout << std::endl;
@@ -275,57 +305,34 @@ void DispatchTime::RunMulti() {
timer.erase(timer.begin() + num_iteration(), timer.end());
if (use_default_) {
multi_default_mean_ = rocrtst::CalcMean(timer);
}
else {
multi_interrupt_mean_ = rocrtst::CalcMean(timer);
}
dispatch_time_mean_ = rocrtst::CalcMean(timer);
return;
}
void DispatchTime::DisplayResults() const {
// Displays information about the test. This override only forwards to
// TestBase::DisplayTestInfo() and adds no output of its own.
void DispatchTime::DisplayTestInfo(void) {
TestBase::DisplayTestInfo();
}
void DispatchTime::DisplayResults(void) const {
if (!rocrtst::CheckProfile(this)) {
return;
}
std::cout << "===================================================="
<< std::endl;
TestBase::DisplayResults();
if (use_default_) {
if (launch_single_) {
std::cout << "Single_Default: " << single_default_mean_ * 1e6
<< std::endl;
}
else {
std::cout << "Multi_Default: "
<< multi_default_mean_ * 1e6 / num_batch_ << std::endl;
}
}
else {
if (launch_single_) {
std::cout << "Single_Interrupt: " << single_interrupt_mean_ * 1e6
<< std::endl;
}
else {
std::cout << "Multi_Interrupt: "
<< multi_interrupt_mean_ * 1e6 / num_batch_ << std::endl;
}
std::cout << "Average Time to Completion: ";
if (launch_single_) {
std::cout << dispatch_time_mean_ * 1e6;
} else {
std::cout << dispatch_time_mean_ * 1e6 / num_batch_;
}
std::cout << "====================================================="
<< std::endl;
std::cout << " uS" << std::endl;
return;
}
// Tears down the test.
// NOTE(review): this span comes from a change listing without +/- markers.
// The CommonCleanUp()/ASSERT_EQ lines and the TestBase::Close() call may be
// the removed and added sides of a single change rather than code meant to
// run together -- confirm against the real file.
void DispatchTime::Close() {
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
TestBase::Close();
return;
}
+36 -51
Просмотреть файл
@@ -43,83 +43,68 @@
*
*/
#ifndef __ROCRTST_SRC_DISPATCH_TIME_H__
#define __ROCRTST_SRC_DISPATCH_TIME_H__
#include "perf_common/perf_base.h"
#ifndef ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_
#define ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_
#include <vector>
#include "suites/test_common/test_base.h"
#include "common/base_rocr.h"
#include "common/common.h"
#include "hsa/hsa.h"
#include <vector>
//@Brief: This class is defined to measure the mean latency of launching
//an empty kernel
// @Brief: This class is defined to measure the mean latency of launching
// an empty kernel
class DispatchTime: public rocrtst::BaseRocR, public PerfBase {
class DispatchTime : public TestBase {
public:
//@Brief: Constructor
DispatchTime();
// @Brief: Constructor
DispatchTime(bool defaultInterrupt, bool launchSingleKernel);
//@Brief: Destructor
virtual ~DispatchTime();
// @Brief: Destructor
virtual ~DispatchTime(void);
//@Brief: Set up the environment for the test
virtual void SetUp();
// @Brief: Set up the environment for the test
virtual void SetUp(void);
//@Brief: Run the test case
virtual void Run();
// @Brief: Run the test case
virtual void Run(void);
//@Brief: Display results we got
virtual void DisplayResults() const;
// @Brief: Display results we got
virtual void DisplayResults(void) const;
//@Brief: Clean up and close the runtime
virtual void Close();
// @Brief: Display information about what this test does
virtual void DisplayTestInfo(void);
//@Brief: Choose if use default signal or not
void UseDefaultSignal(bool use_default = true) {
use_default_ = use_default;
}
//@Brief; Choose to launch a single kernels or not
void LaunchSingleKernel(bool launch_single = true) {
launch_single_ = launch_single;
}
// @Brief: Clean up and close the runtime
virtual void Close(void);
private:
//@Brief: Get actual iteration number
virtual size_t RealIterationNum();
// @Brief: Get actual iteration number
virtual size_t RealIterationNum(void);
//@Brief: Launch single packet each time
virtual void RunSingle();
// @Brief: Launch single packet each time
virtual void RunSingle(void);
//@Brief: Launch multiple packets each time
virtual void RunMulti();
// @Brief: Launch multiple packets each time
virtual void RunMulti(void);
//@Brief: Indicate if use default signal or not
bool use_default_;
// @Brief: Indicate if use default signal or not
bool use_default_interupt_;
//@Brief: Indicate if launch single kernel or not
// @Brief: Indicate if launch single kernel or not
bool launch_single_;
//@Brief: Store the size of queue
// @Brief: Store the size of queue
uint32_t queue_size_;
//@Brief: Number of packets in a batch
// @Brief: Number of packets in a batch
uint32_t num_batch_;
//@Brief: Time of single default signal dispatch time
double single_default_mean_;
//@Brief: Time of single interrupt signal dispatch time
double single_interrupt_mean_;
//@Brief: Time of multi default signal dispatch time
double multi_default_mean_;
//@Brief: Time of multi interrupt signal dispatch time
double multi_interrupt_mean_;
// @Brief: Ave. dispatch time
double dispatch_time_mean_;
char* orig_iterrupt_env_;
};
#endif
#endif // ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_
-351
Просмотреть файл
@@ -1,351 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "flush_latency.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "common/os.h"
#include "gtest/gtest.h"
#include <algorithm>
// Base work-item count; FlushLatency::SetUp() scales it by 0.3 to obtain the
// dispatch grid size.
// NOTE(review): 1024 * 1204 looks like a typo for 1024 * 1024 -- confirm
// before changing, since it alters the grid size.
static const int kWorkItem = 1024 * 1204;
// Constructor: zeroes every test parameter and result, clears the group size
// held by the base class, and records HSA_PROFILE_BASE as the required
// profile.
FlushLatency::FlushLatency()
    : BaseRocR(),
      num_group_(0),
      num_cus_(0),
      kernel_loop_count_(0),
      mean_(0.0),
      data_size_(0) {
  set_group_size(0);
  set_requires_profile(HSA_PROFILE_BASE);
}
// Destructor: intentionally empty. Runtime resources are released through
// Close(), which calls rocrtst::CommonCleanUp().
FlushLatency::~FlushLatency() {
}
// Set up the test environment
void FlushLatency::SetUp() {
hsa_status_t err;
SetWorkItemNum();
set_kernel_file_name("flush_latency.o");
set_kernel_name("&main");
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
return;
}
hsa_agent_t* gpu_dev = gpu_device1();
//Create a queue with max number size
hsa_queue_t* q;
rocrtst::CreateQueue(*gpu_dev, &q);
set_main_queue(q);
//Enable profiling
err = hsa_amd_profiling_set_profiler_enabled(main_queue(), 1);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
rocrtst::LoadKernelFromObjFile(this);
uint32_t total_work_items = kWorkItem * 0.3;
//Fill up part of aql
rocrtst::InitializeAQLPacket(this, &aql());
aql().workgroup_size_x = group_size();
aql().grid_size_x = total_work_items;
return;
}
// Run the test.
// Dispatches the loaded kernel once as a warm-up, then 1000 times with an
// agent-scope release fence and 1000 times with a system-scope release fence.
// The profiled duration of every timed dispatch is recorded; the 50 lowest
// and 50 highest samples of each series are discarded and the result of
// rocrtst::CalcMean(time_none, time_release) is stored in mean_.
// Returns early without running when rocrtst::CheckProfile() fails. Any HSA
// call that fails aborts the test through a gtest assertion.
void FlushLatency::Run() {
hsa_status_t err;
hsa_amd_memory_pool_t cpu_pool;
if (!rocrtst::CheckProfile(this)) {
return;
}
hsa_agent_t* gpu_dev = gpu_device1();
hsa_agent_t* cpu_dev = cpu_device();
// Locate the GPU pool that receives the kernel output and a CPU pool for the
// kernel arguments. The iteration is expected to stop early with
// HSA_STATUS_INFO_BREAK once a pool has been stored in the out-parameter.
err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool,
&device_pool());
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
ASSERT_NE(device_pool().handle, 0);
cpu_pool.handle = 0;
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
&cpu_pool);
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
ASSERT_NE(cpu_pool.handle, 0);
#if DEBUG
std::cout << "Device Pool Properties:" << std::endl;
err = rocrtst::DumpMemoryPoolInfo(device_pool());
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
std::cout << "Global Pool Properties:" << std::endl;
err = rocrtst::DumpMemoryPoolInfo(cpu_pool);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
#endif
// Size in bytes of the output buffer handed to each dispatch.
uint32_t out_data_size = 1024 * 1024 * sizeof(uint32_t);
// Per-dispatch durations in microseconds for the two fence configurations.
std::vector<double> time_none;
std::vector<double> time_release;
// Raw tick deltas for the same dispatches; they are only appended to and are
// not read again in this function.
std::vector < uint64_t > time_none_stamp;
std::vector < uint64_t > time_release_stamp;
//Query system timestamp frequency
uint64_t freq;
err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// 'out' holds the previous iteration's output buffer so that it can be freed
// one iteration later.
void* out = NULL;
uint32_t* out_data;
const uint32_t queue_mask = main_queue()->size - 1;
// Kernel argument block: a single pointer to the output buffer.
typedef struct local_args_t {
void* arg0;
} args;
// Warm up
uint16_t header = 0;
header |= HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
aql().header = header;
// NOTE(review): the warm-up output buffer is overwritten by the first loop
// allocation below and no free for it is visible in this function.
err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
(void**) &out_data);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// NOTE(review): no free for any kern_ptr allocation (here or in the loops
// below) is visible in this function -- confirm whether this leaks.
args* kern_ptr = NULL;
err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0,
(void**) &kern_ptr);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
kern_ptr->arg0 = out_data;
aql().kernarg_address = kern_ptr;
// Obtain the current queue write index
int64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
void *q_base_addr = main_queue()->base_address;
// Write the aql packet at the calculated queue index address.
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
// Re-arm the completion signal for the next dispatch.
hsa_signal_store_screlease(signal(), 1);
// First timed series: same header as the warm-up (agent-scope release fence).
for (int i = 0; i < 1000; i++) {
err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
(void**) &out_data);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
args* kern_ptr = NULL;
err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0,
(void**) &kern_ptr);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
kern_ptr->arg0 = out_data;
aql().kernarg_address = kern_ptr;
// Obtain the current queue write index
int64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
// Write the aql packet at the calculated queue index address.
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
hsa_amd_profiling_dispatch_time_t dispatch_time;
err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(),
&dispatch_time);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// sys_start and sys_end are computed but not used afterwards; only the
// status of the conversions is checked.
uint64_t sys_start = 0;
uint64_t sys_end = 0;
err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
dispatch_time.start, &sys_start);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
dispatch_time.end, &sys_end);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// NOTE(review): the duration uses the raw dispatch_time ticks, not the
// converted system-domain values, yet is scaled by the system timestamp
// frequency -- confirm the two share a time base.
uint64_t stamp = dispatch_time.end - dispatch_time.start;
double execution_time = (double) stamp / freq * 1e6; // convert to us.
time_none.push_back(execution_time);
time_none_stamp.push_back(stamp);
hsa_signal_store_screlease(signal(), 1);
// Free the buffer used by the previous iteration, then keep this one.
if (out != NULL) {
err = hsa_memory_free(out);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
out = out_data;
out_data = NULL;
}
// Second timed series: the release fence is widened to system scope.
header = 0;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
aql().header = header;
for (int i = 0; i < 1000; i++) {
err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
(void**) &out_data);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
args* kern_ptr = NULL;
err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0,
(void**) &kern_ptr);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
kern_ptr->arg0 = out_data;
aql().kernarg_address = kern_ptr;
// Obtain the current queue write index
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
// Write the aql packet at the calculated queue index address.
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
// NOTE(review): unlike the first series, the signal is re-armed before the
// dispatch time is queried -- confirm the ordering is intentional.
hsa_signal_store_screlease(signal(), 1);
hsa_amd_profiling_dispatch_time_t dispatch_time;
err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(),
&dispatch_time);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
uint64_t sys_start = 0;
uint64_t sys_end = 0;
err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
dispatch_time.start, &sys_start);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
dispatch_time.end, &sys_end);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
uint64_t stamp = dispatch_time.end - dispatch_time.start;
double execution_time = (double) stamp / freq * 1e6; // convert to us.
time_release.push_back(execution_time);
time_release_stamp.push_back(stamp);
if (out != NULL) {
err = hsa_memory_free(out);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
out = out_data;
out_data = NULL;
}
// NOTE(review): the buffer still referenced by 'out' after the last iteration
// is not freed in this function.
// Sort each series and drop the 50 smallest and 50 largest samples.
std::sort(time_none.begin(), time_none.end());
std::sort(time_release.begin(), time_release.end());
time_none.erase(time_none.begin(), time_none.begin() + 50);
time_none.erase(time_none.end() - 50, time_none.end());
time_release.erase(time_release.begin(), time_release.begin() + 50);
time_release.erase(time_release.end() - 50, time_release.end());
// DisplayResults() reports mean_ as "Average cache flush overhead".
// NOTE(review): presumably the two-vector CalcMean averages the difference
// between the series -- its definition is not visible here.
mean_ = rocrtst::CalcMean(time_none, time_release);
return;
}
void FlushLatency::Close() {
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Print the measured average cache-flush overhead between two separator
// rules. Nothing is printed when the profile check fails.
void FlushLatency::DisplayResults() const {
  if (rocrtst::CheckProfile(this)) {
    static const char kRule[] = "=======================================";

    std::cout << std::endl << kRule << std::endl
              << "Average cache flush overhead: " << mean_ << "uS"
              << std::endl
              << kRule << std::endl;
  }
}
-122
Просмотреть файл
@@ -1,122 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_INC_FLUSH_LATENCY_H__
#define __ROCRTST_SRC_INC_FLUSH_LATENCY_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include <stdio.h>
//@Brief: Performance test declaration for the cache flush latency test. Only
// the work-item configuration logic is defined inline.
class FlushLatency: public rocrtst::BaseRocR, public PerfBase {
 public:
  //@Brief: Constructor
  FlushLatency();

  //@Brief: Destructor
  ~FlushLatency();

  //@Brief: Set up the testing environment
  virtual void SetUp();

  //@Brief: Run the test case
  virtual void Run();

  //@Brief: Close and clean up the test environment
  virtual void Close();

  //@Brief: Display the average cache flush overhead
  virtual void DisplayResults() const;

  //@Brief: Set work-item configuration
  // The built-in configuration is always applied first. When INTERACTIVE is
  // defined each value is then prompted for and replaced only if the input
  // parses. The group size always goes through set_group_size(), because
  // SetUp() reads it back with group_size(); the former private group_size_
  // copy was written in the default path but never read.
  void SetWorkItemNum() {
    num_cus_ = 32;
    num_group_ = 128;
    set_group_size(256);
    kernel_loop_count_ = 16;

#ifdef INTERACTIVE
    // Read through an unsigned int with "%u": the fields are uint32_t, so
    // scanning straight into them with "%d" is a format mismatch.
    unsigned int value = 0;

    printf("Please input the number of CUs you want to try:\n");
    if (scanf("%u", &value) == 1) {
      num_cus_ = value;
    }

    printf("Please input the number of groups you want to try:\n");
    if (scanf("%u", &value) == 1) {
      num_group_ = value;
    }

    printf("Please input the size of each group:\n");
    if (scanf("%u", &value) == 1) {
      set_group_size(value);
    }

    printf("Please input the number of kernel loop you want to try:\n");
    if (scanf("%u", &value) == 1) {
      kernel_loop_count_ = value;
    }
#endif

    return;
  }

 private:
  //@Brief: number of group
  uint32_t num_group_;

  //@Brief: number of CUs
  uint32_t num_cus_;

  //@Brief: number of kernel loop
  uint32_t kernel_loop_count_;

  //@Brief: Mean execution time
  double mean_;

  //@Brief: data size for test
  uint64_t data_size_;
};
#endif
-502
Просмотреть файл
@@ -1,502 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "gtest/gtest.h"
#include "hsa_info.h"
// Agent callback handed to hsa_iterate_agents() by HsaInfo::Run().
static hsa_status_t get_agent_info(hsa_agent_t, void*);
// Memory-pool callback; its definition and call site are outside this view.
static hsa_status_t get_pool_info(hsa_amd_memory_pool_t, void*);
// Incremented once per agent visited by get_agent_info().
static int agent_number = 0;
// Set by HsaInfo::SetUp() when the HSA_VENDOR_AMD environment variable
// starts with '1'.
static bool output_amd = false;
//@Brief: Map to store the peak FLOPS for different agent
// The keys must match the entries of agent_names exactly; "Spectre" was
// previously spelled with a stray space and therefore did not match.
// NOTE(review): the unit of the values is not stated here -- presumably GFLOPS.
std::map<std::string, double> flops_table = {
  {"Kaveri CPU", 118.4},
  {"Spectre", 737.0},
  {"Carrizo CPU", 67.2},
  {"Carrizo GPU", 819.2}
};
//@Brief: Vector to store the agent_names
// Every entry must match a key of flops_table exactly; "Carrizo CPU" was
// previously spelled with a stray space and therefore did not match.
std::vector<std::string> agent_names = {
  "Kaveri CPU",
  "Spectre",
  "Carrizo CPU",
  "Carrizo GPU"
};
// Constructor: only runs the BaseRocR default constructor; HsaInfo sets no
// state of its own here.
HsaInfo::HsaInfo() :
BaseRocR() {
}
// Destructor: intentionally empty.
HsaInfo::~HsaInfo() {
}
// Prepare the test: read the HSA_VENDOR_AMD switch, then bring up HSA.
void HsaInfo::SetUp() {
  // A value starting with '1' turns on the AMD-specific output flag; any
  // other value turns it off. When the variable is unset the flag keeps its
  // current value.
  char* const vendor_flag = rocrtst::GetEnv("HSA_VENDOR_AMD");
  if (vendor_flag != nullptr) {
    output_amd = (*vendor_flag == '1');
  }

  // Nothing further is done here whether or not initialisation succeeds.
  if (rocrtst::InitAndSetupHSA(this) != HSA_STATUS_SUCCESS) {
    return;
  }
}
// Query the system-wide HSA properties, print them, then walk every agent
// with get_agent_info(). All queries are made before anything is printed, so
// a failed query aborts the test without partial output.
void HsaInfo::Run() {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  hsa_status_t err;

  // Runtime version
  uint16_t version_major;
  uint16_t version_minor;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
  err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Timestamp frequency
  uint64_t timestamp_frequency = 0;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY,
                            &timestamp_frequency);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Maximum duration of a signal wait operation
  uint64_t signal_max_wait = 0;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT, &signal_max_wait);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Endianness of the system
  hsa_endianness_t endianness;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_ENDIANNESS, &endianness);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Machine model
  hsa_machine_model_t machine_model;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_MACHINE_MODEL, &machine_model);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Report everything that was collected.
  std::cout << "HSA System Info:" << std::endl
            << "Runtime Version: " << version_major << "." << version_minor
            << std::endl
            << "System Timestamp Frequency: " << timestamp_frequency / 1e6
            << "MHz" << std::endl
            << "Signal Max Wait Duration: " << signal_max_wait
            << "(number of timestamp)" << std::endl;

  // An unrecognised value prints the label only, with no line break.
  std::cout << "Machine Model: ";
  switch (machine_model) {
    case HSA_MACHINE_MODEL_SMALL:
      std::cout << "SMALL" << std::endl;
      break;
    case HSA_MACHINE_MODEL_LARGE:
      std::cout << "LARGE" << std::endl;
      break;
    default:
      break;
  }

  std::cout << "System Endianness: ";
  switch (endianness) {
    case HSA_ENDIANNESS_LITTLE:
      std::cout << "LITTLE" << std::endl;
      break;
    case HSA_ENDIANNESS_BIG:
      std::cout << "BIG" << std::endl;
      break;
    default:
      break;
  }

  std::cout << std::endl;

  // Iterate every agent and get their info
  err = hsa_iterate_agents(get_agent_info, NULL);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Helper for the HSA callbacks below: when `err` is not HSA_STATUS_SUCCESS,
// print the source line and file of the failing call and return `err` from
// the enclosing function. The macro expands to a braced block and the call
// sites in this file invoke it without a trailing semicolon. It is
// #undef'd after the last callback.
#define RET_IF_HSA_INFO_ERR(err) { \
if ((err) != HSA_STATUS_SUCCESS) { \
std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
__FILE__ << std::endl; \
return (err); \
} \
}
// Agent callback handed to hsa_iterate_agents: queries the attributes of
// `agent`, prints them to stdout, and then lists the agent's memory pools
// through get_pool_info.
//   agent - agent being visited
//   data  - unused
// Returns HSA_STATUS_SUCCESS when every query succeeded, otherwise the
// status of the first HSA call that failed.
static hsa_status_t get_agent_info(hsa_agent_t agent, void* data) {
  int pool_number = 0;
  hsa_status_t err;

  // Increase the number of agent
  agent_number++;

  // Get agent name and vendor
  char name[64];
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name);
  RET_IF_HSA_INFO_ERR(err)
  char vendor_name[64];
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_VENDOR_NAME, vendor_name);
  RET_IF_HSA_INFO_ERR(err)

  // Get agent feature
  hsa_agent_feature_t agent_feature;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FEATURE, &agent_feature);
  RET_IF_HSA_INFO_ERR(err)

  // Get profile supported by the agent
  hsa_profile_t agent_profile;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
  RET_IF_HSA_INFO_ERR(err)

  // Get floating-point rounding mode
  hsa_default_float_rounding_mode_t float_rounding_mode;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE,
                           &float_rounding_mode);
  RET_IF_HSA_INFO_ERR(err)

  // Get max number of queue
  uint32_t max_queue = 0;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUES_MAX, &max_queue);
  RET_IF_HSA_INFO_ERR(err)

  // Get queue min size
  uint32_t queue_min_size = 0;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE,
                           &queue_min_size);
  RET_IF_HSA_INFO_ERR(err)

  // Get queue max size
  uint32_t queue_max_size = 0;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
                           &queue_max_size);
  RET_IF_HSA_INFO_ERR(err)

  // Get queue type
  hsa_queue_type_t queue_type;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_TYPE, &queue_type);
  RET_IF_HSA_INFO_ERR(err)

  // Get agent node
  uint32_t node;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NODE, &node);
  RET_IF_HSA_INFO_ERR(err)

  // Get device type
  hsa_device_type_t device_type;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
  RET_IF_HSA_INFO_ERR(err)

  // Get cache size (one entry per cache level)
  uint32_t cache_size[4];
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_CACHE_SIZE, cache_size);
  RET_IF_HSA_INFO_ERR(err)

  // Get chip id
  uint32_t chip_id = 0;
  err = hsa_agent_get_info(agent,
                           (hsa_agent_info_t) HSA_AMD_AGENT_INFO_CHIP_ID,
                           &chip_id);
  RET_IF_HSA_INFO_ERR(err)

  // Get cacheline size
  uint32_t cacheline_size = 0;
  err = hsa_agent_get_info(agent,
                           (hsa_agent_info_t) HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
                           &cacheline_size);
  RET_IF_HSA_INFO_ERR(err)

  // Get Max clock frequency
  uint32_t max_clock_freq = 0;
  err = hsa_agent_get_info(agent,
                           (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
                           &max_clock_freq);
  RET_IF_HSA_INFO_ERR(err)

  // Get Agent BDFID. The attribute is documented as a uint32_t; reading it
  // into a narrower variable lets the runtime write past the end of it.
  uint32_t bdf_id = 0;
  err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_BDFID,
                           &bdf_id);
  RET_IF_HSA_INFO_ERR(err)

  // Get number of Compute Unit
  uint32_t compute_unit = 0;
  err = hsa_agent_get_info(agent,
                           (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
                           &compute_unit);
  RET_IF_HSA_INFO_ERR(err)

  // Print out the common results
  std::cout << std::endl;
  std::cout << "Agent #" << agent_number << ":" << std::endl;
  std::cout << "Agent Name: " << name << std::endl;
  std::cout << "Agent Vendor Name: " << vendor_name << std::endl;

  const bool kernel_dispatch =
      (agent_feature & HSA_AGENT_FEATURE_KERNEL_DISPATCH) != 0;
  const bool agent_dispatch =
      (agent_feature & HSA_AGENT_FEATURE_AGENT_DISPATCH) != 0;
  if (kernel_dispatch && agent_dispatch) {
    std::cout << "Agent Feature: KERNEL_DISPATCH & AGENT_DISPATCH"
              << std::endl;
  } else if (kernel_dispatch) {
    std::cout << "Agent Feature: KERNEL_DISPATCH" << std::endl;
  } else if (agent_dispatch) {
    std::cout << "Agent Feature: AGENT_DISPATCH" << std::endl;
  } else {
    std::cout << "Agent Feature: Not Supported" << std::endl;
  }

  if (HSA_PROFILE_BASE == agent_profile) {
    std::cout << "Agent Profile: BASE_PROFILE" << std::endl;
  } else if (HSA_PROFILE_FULL == agent_profile) {
    std::cout << "Agent Profile: FULL_PROFILE" << std::endl;
  } else {
    std::cout << "Agent Profile: Not Supported" << std::endl;
  }

  if (HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO == float_rounding_mode) {
    std::cout << "Agent Floating Rounding Mode: ZERO" << std::endl;
  } else if (HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR == float_rounding_mode) {
    std::cout << "Agent Floating Rounding Mode: NEAR" << std::endl;
  } else {
    std::cout << "Agent Floating Rounding Mode: Not Supported" << std::endl;
  }

  std::cout << "Agent Max Queue Number: " << max_queue << std::endl;
  std::cout << "Agent Queue Min Size: " << queue_min_size << std::endl;
  std::cout << "Agent Queue Max Size: " << queue_max_size << std::endl;

  if (HSA_QUEUE_TYPE_MULTI == queue_type) {
    std::cout << "Agent Queue Type: MULTI" << std::endl;
  } else if (HSA_QUEUE_TYPE_SINGLE == queue_type) {
    std::cout << "Agent Queue Type: SINGLE" << std::endl;
  } else {
    std::cout << "Agent Queue Type: Not Supported" << std::endl;
  }

  std::cout << "Agent Node: " << node << std::endl;

  if (HSA_DEVICE_TYPE_CPU == device_type) {
    std::cout << "Agent Device Type: CPU" << std::endl;
  } else if (HSA_DEVICE_TYPE_GPU == device_type) {
    std::cout << "Agent Device Type: GPU" << std::endl;
    // Get ISA info; the value is not printed, only the query is exercised
    hsa_isa_t agent_isa;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &agent_isa);
    RET_IF_HSA_INFO_ERR(err)
  } else {
    std::cout << "Agent Device Type: DSP" << std::endl;
  }

  // Cache levels that report a size of 0 are not listed
  std::cout << "Agent Cache Info:" << std::endl;
  for (int i = 0; i < 4; i++) {
    if (cache_size[i]) {
      std::cout << " $L" << i + 1 << ": " << cache_size[i] / 1024
                << "KB" << std::endl;
    }
  }

  std::cout << "Agent Chip ID: " << chip_id << std::endl;
  std::cout << "Agent Cacheline Size: " << cacheline_size << std::endl;
  std::cout << "Agent Max Clock Frequency: " << max_clock_freq << "MHz"
            << std::endl;
  std::cout << "Agent BDFID: " << bdf_id << std::endl;
  std::cout << "Agent Compute Unit: " << compute_unit << std::endl;

  // Output Peak FLOPS and Peak Bandwidth if Env var is set
  // TODO: Fan, need to add BW
  if (output_amd) {
    const std::string agent_name = name;
    for (size_t i = 0; i < agent_names.size(); i++) {
      if (agent_name.compare(agent_names[i]) != 0) {
        continue;
      }
      // find() instead of operator[]: a name without a table entry must not
      // insert a 0 entry into flops_table and print it as a real figure.
      const std::map<std::string, double>::const_iterator entry =
          flops_table.find(agent_name);
      if (entry != flops_table.end()) {
        std::cout << "Agent Peak GFLOPS: " << entry->second << std::endl;
      }
    }
  }

  // The remaining attributes are queried only for kernel agents
  if (kernel_dispatch) {
    // Get flag of fast_f16 operation
    bool fast_f16 = false;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FAST_F16_OPERATION,
                             &fast_f16);
    RET_IF_HSA_INFO_ERR(err)

    // Get wavefront size
    uint32_t wavefront_size = 0;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
                             &wavefront_size);
    RET_IF_HSA_INFO_ERR(err)

    // Get max total number of work-items in a workgroup
    uint32_t workgroup_max_size = 0;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
                             &workgroup_max_size);
    RET_IF_HSA_INFO_ERR(err)

    // Get max number of work-items of each dimension of a work-group
    uint16_t workgroup_max_dim[3];
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                             workgroup_max_dim);
    RET_IF_HSA_INFO_ERR(err)

    // Get max number of a grid per dimension
    hsa_dim3_t grid_max_dim;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM,
                             &grid_max_dim);
    RET_IF_HSA_INFO_ERR(err)

    // Get max total number of work-items in a grid
    uint32_t grid_max_size = 0;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
                             &grid_max_size);
    RET_IF_HSA_INFO_ERR(err)

    // Get max number of fbarriers per work group
    uint32_t fbarrier_max_size = 0;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE,
                             &fbarrier_max_size);
    RET_IF_HSA_INFO_ERR(err)

    // Print info for kernel agent
    if (fast_f16) {
      std::cout << "Agent Fast F16 Operation: TRUE" << std::endl;
    }
    std::cout << "Agent Wavefront Size: " << wavefront_size << std::endl;
    std::cout << "Agent Workgroup Max Size: " << workgroup_max_size
              << std::endl;
    std::cout << "Agent Workgroup Max Size Per Dimension: " << std::endl;
    for (int i = 0; i < 3; i++) {
      std::cout << " Dim[" << i << "]: " << workgroup_max_dim[i]
                << std::endl;
    }
    std::cout << "Agent Grid Max Size: " << grid_max_size << std::endl;

    // Waves per CU come from the runtime's agent attribute (SWDEV-97044)
    uint32_t waves_per_cu = 0;
    err = hsa_agent_get_info(agent,
                             (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
                             &waves_per_cu);
    RET_IF_HSA_INFO_ERR(err)
    std::cout << "Agent Waves Per CU: " << waves_per_cu << std::endl;
    std::cout << "Agent Max Work-item Per CU: "
              << wavefront_size * waves_per_cu << std::endl;

    // Read the named fields rather than reinterpreting the struct as an array
    const uint32_t grid_dims[3] = {grid_max_dim.x, grid_max_dim.y,
                                   grid_max_dim.z};
    std::cout << "Agent Grid Max Size per Dimension:" << std::endl;
    for (int i = 0; i < 3; i++) {
      std::cout << " Dim[" << i << "] " << grid_dims[i] << std::endl;
    }
    std::cout << "Agent Max number Of fbarriers Per Workgroup: "
              << fbarrier_max_size << std::endl;
  }

  // Get pool info; pool_number is advanced by get_pool_info for each pool
  std::cout << "Agent Pool Info:" << std::endl;
  err = hsa_amd_agent_iterate_memory_pools(agent, get_pool_info, &pool_number);
  RET_IF_HSA_INFO_ERR(err)
  return HSA_STATUS_SUCCESS;
}
// Memory-pool callback: advances the caller's pool counter (received through
// `data` as an int*), prints the pool's ordinal, and dumps the pool details
// with rocrtst::DumpMemoryPoolInfo. Returns the status of that dump.
hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) {
  int& pool_ordinal = *reinterpret_cast<int*>(data);
  ++pool_ordinal;
  std::cout << " Pool #" << pool_ordinal << ":" << std::endl;

  hsa_status_t err = rocrtst::DumpMemoryPoolInfo(pool, 4);
  RET_IF_HSA_INFO_ERR(err)
  return err;
}
#undef RET_IF_HSA_INFO_ERR
// All output is produced during Run(); this only performs the profile check,
// and the result of that check changes nothing here.
void HsaInfo::DisplayResults() const {
  static_cast<void>(rocrtst::CheckProfile(this));
}
// Runs the common rocrtst clean-up for this test and asserts it succeeded.
void HsaInfo::Close() {
  hsa_status_t err = rocrtst::CommonCleanUp(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
-328
Просмотреть файл
@@ -1,328 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "image_bandwidth.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/hsatimer.h"
#include "gtest/gtest.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_image.h"
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
// Builds the image bandwidth test for 2D RGBA / unsigned-int8 images.
//   num - number of timed iterations to keep per image size. It was
//         previously accepted and then ignored, which left the iteration
//         count at whatever the base class defaulted to.
ImageBandwidth::ImageBandwidth(size_t num) :
  BaseRocR(), import_bandwidth_ {0.0}, export_bandwidth_ {0.0},
  copy_bandwidth_ {0.0} {
  // Qualified so the call resolves to the rocrtst base-class setter.
  BaseRocR::set_num_iteration(static_cast<int>(num));
  format_.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
  format_.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
  geometry_ = HSA_EXT_IMAGE_GEOMETRY_2D;
  set_requires_profile(HSA_PROFILE_FULL);
}

// Nothing is released in the destructor.
ImageBandwidth::~ImageBandwidth() {
}
// Edge length, in pixels, of the square image measured at each step.
const size_t ImageBandwidth::Size[10] = {
  32, 64, 128, 256, 512,
  1024, 2048, 4096, 8192, 16384
};

// Row label for each step: the byte size of a Size[i] x Size[i] image at
// four bytes per pixel.
const char* const ImageBandwidth::Str[10] = {
  "4K", "16K", "64K", "256K", "1M",
  "4M", "16M", "64M", "256M", "1G"
};
// Initializes HSA and looks up the memory pool used for every allocation
// made in Run(), storing it in cpu_pool().
void ImageBandwidth::SetUp() {
  if (rocrtst::InitAndSetupHSA(this) != HSA_STATUS_SUCCESS) {
    return;
  }

  hsa_agent_t* gpu_dev = gpu_device1();

  // Find the global region
  // NOTE(review): ImageLoadBandwidth::Run expects HSA_STATUS_INFO_BREAK from
  // the same FindGlobalPool iteration, and walks the CPU agent rather than
  // the GPU agent - confirm which status and which agent are intended here.
  hsa_status_t err = hsa_amd_agent_iterate_memory_pools(
      *gpu_dev, rocrtst::FindGlobalPool, &cpu_pool());
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
void ImageBandwidth::Run() {
hsa_status_t err;
if (!rocrtst::CheckProfile(this)) {
return;
}
hsa_agent_t* gpu_dev = gpu_device1();
for (int i = 0; i < 10; i++) {
// Create timer for import, export and copy tests
rocrtst::PerfTimer import_timer;
rocrtst::PerfTimer export_timer;
rocrtst::PerfTimer copy_timer;
std::vector<double> import_image;
std::vector<double> export_image;
std::vector<double> copy_image;
// Allocate image buffer in host memory
uint32_t* image_buffer = NULL;
err = hsa_amd_memory_pool_allocate(cpu_pool(),
Size[i] * Size[i] * sizeof(uint32_t),
0, (void**) &image_buffer);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// rocrtst::CommonCleanUp the image buffer
for (uint32_t j = 0; j < Size[i] * Size[i]; j++) {
image_buffer[j] = 0x10101010;
}
// Prepare for 2D image creation
hsa_ext_image_t image_handle;
hsa_ext_image_descriptor_t image_descriptor;
image_descriptor.geometry = geometry_;
image_descriptor.width = Size[i];
image_descriptor.height = Size[i];
image_descriptor.depth = 1;
image_descriptor.array_size = 0;
image_descriptor.format = format_;
// Check if device_ supports at least read and write operation on
// image format
uint32_t capability_mask;
err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D,
&format_, &capability_mask);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_WRITE)) {
std::cout <<
"Device does not support read and write operation on this kind of image!"
<< std::endl;
ASSERT_NE(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_WRITE, 0);
}
// Get image info
hsa_ext_image_data_info_t image_info;
err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor,
HSA_ACCESS_PERMISSION_RW, &image_info);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Allocate memory for image
uintptr_t ptr_temp = 0;
err = hsa_amd_memory_pool_allocate(cpu_pool(),
image_info.size + image_info.alignment, 0, (void**) &ptr_temp);
// Align the image address
uintptr_t mul = ptr_temp / image_info.alignment;
void* ptr_image = (void*) ((mul + 1) * image_info.alignment);
// rocrtst::CommonCleanUp the image to 0
hsa_amd_memory_fill(ptr_image, 0, image_info.size);
// Create image handle
err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image,
HSA_ACCESS_PERMISSION_RW, &image_handle);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Set import image region
hsa_dim3_t range = {(uint32_t) Size[i], (uint32_t) Size[i], 1};
hsa_ext_image_region_t image_region;
hsa_dim3_t image_offset = {0, 0, 0};
image_region.offset = image_offset;
image_region.range = range;
size_t iterations = RealIterationNum();
for (uint32_t it = 0; it < iterations; it++) {
// Create a timer
int index = import_timer.CreateTimer();
// Stamp at the beginning
import_timer.StartTimer(index);
// Import image from host
err = hsa_ext_image_import(*gpu_dev, image_buffer, 0, 0, image_handle,
&image_region);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Stamp in the end
import_timer.StopTimer(index);
import_image.push_back(import_timer.ReadTimer(index));
}
// Reset image_buffer
hsa_amd_memory_fill(image_buffer, 0, Size[i] * Size[i] * sizeof(uint32_t));
for (uint32_t it = 0; it < iterations; it++) {
// Export image
// Stamp at the beginning
int index = export_timer.CreateTimer();
export_timer.StartTimer(index);
err = hsa_ext_image_export(*gpu_dev, image_handle, image_buffer, 0, 0,
&image_region);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
export_timer.StopTimer(index);
export_image.push_back(export_timer.ReadTimer(index));
// Check if the value is correct
for (uint32_t j = 0; j < Size[i] * Size[i]; j++) {
ASSERT_EQ(image_buffer[j], 0x10101010);
}
}
// Create another image for copy
// Allocate memory for image
uintptr_t ptr_temp2 = 0;
err = hsa_amd_memory_pool_allocate(cpu_pool(),
image_info.size + image_info.alignment, 0, (void**) &ptr_temp2);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Align the image address
mul = ptr_temp2 / image_info.alignment;
void* ptr_image2 = (void*) ((mul + 1) * image_info.alignment);
// rocrtst::CommonCleanUp the image to 0
hsa_amd_memory_fill(ptr_image2, 0, image_info.size);
// Create image handle
hsa_ext_image_t image_handle_copy;
err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image2,
HSA_ACCESS_PERMISSION_RW, &image_handle_copy);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
for (uint32_t it = 0; it < iterations; it++) {
// Stamp at the beginning
int index = copy_timer.CreateTimer();
copy_timer.StartTimer(index);
err = hsa_ext_image_copy(*gpu_dev, image_handle, &image_offset,
image_handle_copy, &image_offset, &range);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Stamp in the end
copy_timer.StopTimer(index);
copy_image.push_back(copy_timer.ReadTimer(index));
// Check if image data is correct
hsa_amd_memory_fill(image_buffer, 0,
Size[i] * Size[i] * sizeof(uint32_t));
// Export image
err = hsa_ext_image_export(*gpu_dev, image_handle_copy, image_buffer,
0, 0, &image_region);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Check if the value is correct
for (uint32_t j = 0; j < Size[i] * Size[i]; j++) {
ASSERT_EQ(image_buffer[j], 0x10101010);
}
}
// Calculate Bandwidth
import_bandwidth_[i] = CalculateBandwidth(import_image, Size[i]);
export_bandwidth_[i] = CalculateBandwidth(export_image, Size[i]);
copy_bandwidth_[i] = CalculateBandwidth(copy_image, Size[i]);
}
}
// Converts a set of timing samples into a bandwidth figure in GB/s.
//   vec  - timing samples for one operation; modified in place (the warm-up
//          sample is dropped, the rest sorted and trimmed)
//   size - edge length of the square image in pixels (4 bytes per pixel)
// Returns size*size*4 bytes divided by the mean of the kept samples, scaled
// to GB, or 0.0 when no sample is left to average.
double ImageBandwidth::CalculateBandwidth(std::vector<double>& vec,
    size_t size) {
  // Nothing to average; also keeps the erase below off an empty vector.
  if (vec.empty()) {
    return 0.0;
  }

  // Delete the first timer result, which is warm up test
  vec.erase(vec.begin());

  // Sort the results
  std::sort(vec.begin(), vec.end());

  // Keep the fastest num_iteration() results (drops the slowest ~20%).
  // Clamped so a short sample set never forms an iterator past end().
  const size_t keep = std::min<size_t>(vec.size(), num_iteration());
  vec.erase(vec.begin() + keep, vec.end());

  if (vec.empty()) {
    return 0.0;
  }

  double mean = 0.0;
  for (size_t index = 0; index < vec.size(); index++) {
    mean += vec[index];
  }
  mean /= vec.size();

  return static_cast<double>(size) * size * 4 / mean / 1024 / 1024 / 1024;
}
// Prints the import / export / copy bandwidth table, one row per image size,
// with a separator line after every row. Prints nothing when the profile
// check fails.
void ImageBandwidth::DisplayResults() const {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  printf("==================================================="
         "=========================\n");
  printf(" Size Import Export Copy\n");

  for (int row = 0; row < 10; ++row) {
    printf(" %s %f(GB/s) %f(GB/s) %f(GB/s)\n",
           Str[row], import_bandwidth_[row], export_bandwidth_[row],
           copy_bandwidth_[row]);
    printf("================================================="
           "===========================\n");
  }
}
// Runs the common rocrtst clean-up for this test and asserts it succeeded.
void ImageBandwidth::Close() {
  hsa_status_t err = rocrtst::CommonCleanUp(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Number of samples actually taken per operation: the configured iteration
// count plus 20%, plus one more; the fractional part is dropped on
// conversion to size_t.
size_t ImageBandwidth::RealIterationNum() {
  const double padded_count = num_iteration() * 1.2 + 1;
  return padded_count;
}
-99
Просмотреть файл
@@ -1,99 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_IMAGE_BANDWIDTH_H__
#define __ROCRTST_SRC_IMAGE_BANDWIDTH_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_image.h"
#include <vector>
// Performance test that measures host-to-image import, image-to-host export
// and image-to-image copy bandwidth over ten square image sizes.
class ImageBandwidth: public rocrtst::BaseRocR, public PerfBase {
public:
//@Brief: Constructor for test case of ImageBandwidth.
// num is the iteration-count argument (default 100); see the constructor
// definition for how it is applied.
ImageBandwidth(size_t num = 100);
//@Brief: Destructor
virtual ~ImageBandwidth();
//@Brief: Set up the environment for measurement
virtual void SetUp();
//@Brief: Core measurement execution
virtual void Run();
//@Brief: Clean up and release the resources
virtual void Close();
//@Brief: Display results
virtual void DisplayResults() const;
private:
//@Brief: Image edge lengths in pixels (Size) and the matching row labels
// used in the results table (Str)
static const size_t Size[10];
static const char* const Str[10];
//@Brief: Get the number of samples actually taken per operation
size_t RealIterationNum();
//@Brief: Calculate bandwidth from the timing samples in vec for a square
// image with edge length size; vec is modified by the call
double CalculateBandwidth(std::vector<double>& vec, size_t size);
protected:
//@Brief: Bandwidth results, one entry per image size
double import_bandwidth_[10];
double export_bandwidth_[10];
double copy_bandwidth_[10];
//@Brief: Image format
hsa_ext_image_format_t format_;
//@Brief: Image geometry
hsa_ext_image_geometry_t geometry_;
};
#endif
-270
Просмотреть файл
@@ -1,270 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "image_load_bandwidth.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/hsatimer.h"
#include "common/helper_funcs.h"
#include "gtest/gtest.h"
#include "hsa/hsa_ext_image.h"
#include <stdio.h>
#include <vector>
// Starts with no measured bandwidth and no image size, and marks the test
// as requiring the HSA full profile.
ImageLoadBandwidth::ImageLoadBandwidth() : BaseRocR() {
  image_size_ = 0;
  load_bandwidth_ = 0.0;
  set_requires_profile(HSA_PROFILE_FULL);
}

// Nothing is released in the destructor.
ImageLoadBandwidth::~ImageLoadBandwidth() {}
// Set up the environment
void ImageLoadBandwidth::SetUp() {
hsa_agent_t* gpu_dev = gpu_device1();
set_kernel_file_name("load_2d_image.o");
set_kernel_name("&__OpenCL_load_2d_image_kernel");
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
return;
}
//Create a queue with max number size
hsa_queue_t* q = main_queue();
rocrtst::CreateQueue(*gpu_dev, &q);
rocrtst::LoadKernelFromObjFile(this);
//Fill up part of aql
rocrtst::InitializeAQLPacket(this, &aql());
aql().setup = 0;
aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
return;
}
// Run the test
void ImageLoadBandwidth::Run() {
hsa_agent_t* gpu_dev = gpu_device1();
hsa_agent_t* cpu_dev = cpu_device();
hsa_status_t err;
if (!rocrtst::CheckProfile(this)) {
return;
}
hsa_ext_image_descriptor_t image_descriptor;
image_descriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
image_descriptor.width = 256;
image_descriptor.height = 256;
image_descriptor.depth = 1;
image_descriptor.array_size = 0;
image_descriptor.format.channel_type =
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
image_descriptor.format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
hsa_ext_image_format_t image_format;
image_format.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
image_format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
// Check if device_ supports at least read only operation on image format
uint32_t capability_mask;
err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D,
&image_format, &capability_mask);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY)) {
ASSERT_FALSE(
"Device does not support read and write operation on this kind of image!");
}
// Get image info
hsa_ext_image_data_info_t image_info;
err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor,
HSA_ACCESS_PERMISSION_RO, &image_info);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
image_size_ = image_info.size;
std::vector<double> time;
for (uint32_t i = 0; i < num_iteration(); i++) {
#ifdef DEBUG
std::cout << ".";
fflush(stdout);
#endif
// Allocate memory space for image
// Find the global region
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
&cpu_pool());
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
uintptr_t ptr_temp = 0;
err = hsa_amd_memory_pool_allocate(cpu_pool(),
image_info.size + image_info.alignment,
0, (void**) &ptr_temp);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, (void*) ptr_temp);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Align the image address
uintptr_t mul = ptr_temp / image_info.alignment;
void* ptr_image = (void*) ((mul + 1) * image_info.alignment);
// rocrtst::CommonCleanUp the image memory to 1
err = hsa_amd_memory_fill(ptr_image, 1, image_info.size);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Create image handle
hsa_ext_image_t image_handle;
err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image,
HSA_ACCESS_PERMISSION_RO, &image_handle);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Allocate and initialize the kernel argument
typedef struct args_t {
uint64_t arg0;
int* arg1;
int istart;
int iend;
int istep;
} args;
int local_out = 5;
int istart = 0;
int iend = 64;
int istep = 1;
args* kern_ptr = NULL;
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
(void**) &kern_ptr);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
kern_ptr->arg0 = image_handle.handle;
kern_ptr->arg1 = &local_out;
kern_ptr->istart = istart;
kern_ptr->iend = iend;
kern_ptr->istep = istep;
aql().kernarg_address = kern_ptr;
// Obtain the current queue write index
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
void *q_base_addr = main_queue()->base_address;
// Write the aql packet at the calculated queue index address.
const uint32_t queue_mask = main_queue()->size - 1;
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
rocrtst::PerfTimer p_timer;
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
hsa_signal_store_release(main_queue()->doorbell_signal, index);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
p_timer.StopTimer(id);
time.push_back(p_timer.ReadTimer(id));
hsa_signal_store_release(signal(), 1);
err = hsa_ext_image_destroy(*gpu_dev, image_handle);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_memory_deregister(ptr_image, image_info.size);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
hsa_amd_memory_pool_free((void*) ptr_temp);
}
// Calculte the mean load time
time.erase(time.begin());
#ifdef DEBUG
for (uint32_t i = 0; i < time.size(); i++) {
std::cout << time[i] << std::endl;
}
#endif
double mean_time = rocrtst::CalcMean(time);
load_bandwidth_ = image_size_ / mean_time / 1024 / 1024 / 1024;
}
// Runs the common rocrtst clean-up for this test and asserts it succeeded.
void ImageLoadBandwidth::Close() {
  hsa_status_t err = rocrtst::CommonCleanUp(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Prints the measured image size and load bandwidth under a header line.
// Prints nothing when the profile check fails.
void ImageLoadBandwidth::DisplayResults() const {
  const bool profile_ok = rocrtst::CheckProfile(this);
  if (profile_ok) {
    std::cout << "======================================"
              "======================================" << std::endl;
    std::cout << " Image Size(bytes): LoadBandwidth(GB/S): " << std::endl;
    std::cout << " " << image_size_ << " " << load_bandwidth_ << std::endl;
  }
}
-271
Просмотреть файл
@@ -1,271 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "image_store_bandwidth.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "gtest/gtest.h"
#include "hsa/hsa_ext_image.h"
#include <stdio.h>
#include <vector>
// Constructor: start with zeroed results and record that this test requires
// HSA_PROFILE_FULL.
ImageStoreBandwidth::ImageStoreBandwidth() :
    BaseRocR() {
  store_bandwidth_ = 0.0;
  image_size_ = 0;
  set_requires_profile(HSA_PROFILE_FULL);
}
// Destructor: intentionally empty, this class releases nothing here.
ImageStoreBandwidth::~ImageStoreBandwidth() {}
// Set up the environment
void ImageStoreBandwidth::SetUp() {
set_kernel_file_name("store_2d_image.o");
set_kernel_name("&__OpenCL_store_2d_image_kernel");
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
return;
}
hsa_agent_t* gpu_dev = gpu_device1();
//Create a queue with max number size
hsa_queue_t* q = nullptr;
rocrtst::CreateQueue(*gpu_dev, &q);
set_main_queue(q);
rocrtst::LoadKernelFromObjFile(this);
//Fill up part of aql
rocrtst::InitializeAQLPacket(this, &aql());
aql().setup = 0;
aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
return;
}
// Run the test
void ImageStoreBandwidth::Run() {
hsa_status_t err;
if (!rocrtst::CheckProfile(this)) {
return;
}
hsa_agent_t* gpu_dev = gpu_device1();
hsa_agent_t* cpu_dev = cpu_device();
hsa_ext_image_descriptor_t image_descriptor;
image_descriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
image_descriptor.width = 256;
image_descriptor.height = 256;
image_descriptor.depth = 1;
image_descriptor.array_size = 0;
image_descriptor.format.channel_type =
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
image_descriptor.format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
hsa_ext_image_format_t image_format;
image_format.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
image_format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
// Check if device_ supports at least read only operation on image format
uint32_t capability_mask;
err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D,
&image_format, &capability_mask);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY)) {
std::cout <<
"Device does not support read and write operation on this kind of image!"
<< std::endl;
ASSERT_NE(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY, 0);
}
// Get image info
hsa_ext_image_data_info_t image_info;
err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor,
HSA_ACCESS_PERMISSION_RW, &image_info);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
image_size_ = image_info.size;
std::vector<double> time;
for (uint32_t i = 0; i < num_iteration(); i++) {
#ifdef DEBUG
std::cout << ".";
fflush(stdout);
#endif
// Allocate memory space for image
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
&cpu_pool());
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
uintptr_t ptr_temp = 0;
err = hsa_amd_memory_pool_allocate(cpu_pool(),
image_info.size + image_info.alignment,
0, (void**) &ptr_temp);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Align the image address
uintptr_t mul = ptr_temp / image_info.alignment;
void* ptr_image = (void*) ((mul + 1) * image_info.alignment);
// rocrtst::CommonCleanUp the image memory to 0
err = hsa_amd_memory_fill(ptr_image, 0, image_info.size);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Create image handle
hsa_ext_image_t image_handle;
err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image,
HSA_ACCESS_PERMISSION_RO, &image_handle);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Allocate and initialize the kernel argument
typedef struct args_t {
uint64_t arg0;
int istart;
int iend;
int istep;
} args;
//int local_out = 5;
int istart = 0;
int iend = 64;
int istep = 1;
args* kern_ptr = NULL;
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
(void**) &kern_ptr);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
kern_ptr->arg0 = image_handle.handle;
kern_ptr->istart = istart;
kern_ptr->iend = iend;
kern_ptr->istep = istep;
aql().kernarg_address = kern_ptr;
// Obtain the current queue write index
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
void *q_base_addr = main_queue()->base_address;
// Write the aql packet at the calculated queue index address.
const uint32_t queue_mask = main_queue()->size - 1;
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
rocrtst::PerfTimer p_timer;
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
hsa_signal_store_release(main_queue()->doorbell_signal, index);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
p_timer.StopTimer(id);
time.push_back(p_timer.ReadTimer(id));
hsa_signal_store_release(signal(), 1);
err = hsa_ext_image_destroy(*gpu_dev, image_handle);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_memory_deregister(ptr_image, image_info.size);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
hsa_amd_memory_pool_free(reinterpret_cast<void*>(ptr_temp));
}
// Calculte the mean load time
time.erase(time.begin());
#ifdef DEBUG
for (size_t i = 0; i < time.size(); i++) {
std::cout << time[i] << std::endl;
}
#endif
double mean_time = rocrtst::CalcMean(time);
std::cout << "mean time: " << mean_time << std::endl;
store_bandwidth_ = image_size_ / mean_time / 1024 / 1024 / 1024;
}
void ImageStoreBandwidth::Close() {
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
void ImageStoreBandwidth::DisplayResults() const {
if (!rocrtst::CheckProfile(this)) {
return;
}
std::cout << "============================================="
"===============================" << std::endl;
std::cout << " Image Size(bytes): StoreBandwidth(GB/S): "
<< std::cout;
std::cout << " " << image_size_ << " "
<< store_bandwidth_ << std::endl;
}
Двоичный файл не отображается.
@@ -43,43 +43,8 @@
*
*/
#ifndef __ROCRTST_SRC_HSA_INFO_H__
#define __ROCRTST_SRC_HSA_INFO_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "common/common.h"
#include "common/os.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include <iostream>
#include <map>
#include <string>
#include <vector>
//@Brief: Test class intended to report information in the style of clinfo.
// It derives from both rocrtst::BaseRocR and PerfBase and declares the
// set-up / run / display / close member functions below.
class HsaInfo: public rocrtst::BaseRocR, public PerfBase {
public:
//@Brief: Constructor
HsaInfo();
//@Brief: Destructor
virtual ~HsaInfo();
//@Brief: Set up the environment for the test
virtual void SetUp();
//@Brief: Run the test case
virtual void Run();
//@Brief: Display the results that were collected
virtual void DisplayResults() const;
//@Brief: Clean up and close the runtime
virtual void Close();
};
#endif
/* Kernel that takes no arguments and performs no work. */
__kernel void empty_kernel(void) {}
-12
Просмотреть файл
@@ -1,12 +0,0 @@
module &m:1:0:$full:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
// Kernel with no arguments; its only instruction is ret.
prog kernel &__Empty_kernel()
{
ret;
};
-88
Просмотреть файл
@@ -1,88 +0,0 @@
module &m:1:0:$full:$large:$default;
/* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
*
* HSAF is granting you permission to use this software and documentation (if
* any) (collectively, the "Materials") pursuant to the terms and conditions
* of the Software License Agreement included with the Materials. If you do
* not have a copy of the Software License Agreement, contact the HSA Foundation for a copy.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
/**
 * @brief HSAIL kernel in which every work-item performs one 32-bit store.
 * Each work-item takes its flattened absolute id, multiplies it by four
 * (the size of a u32) and adds the result to the base address passed in
 * outAddr. It then stores the constant 1 at that address in the global
 * segment. No memory is read apart from the kernel argument.
 *
 * @NOTE: The RTI pragmas below carry the name __SysMemLoad although the
 * kernel itself is named &main.
 *
 * @param outAddr base byte address of the output buffer. The kernel writes
 * one u32 per work-item, so presumably the caller provides at least
 * (number of work-items) * sizeof(uint32_t) bytes -- confirm against the
 * host code.
 *
 */
prog kernel &main(kernarg_u64 %outAddr) {
pragma "AMD RTI", "ARGSTART:__SysMemLoad";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "uavid:8";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "ARGEND:__SysMemLoad";
// Load the base address of the output buffer
ld_kernarg_u64 $d0, [%outAddr];
// Compute the flattened absolute id of the current work-item
// and shift it left by two to turn it into a byte offset
// (one u32 slot per work-item)
workitemflatabsid_u32 $s0;
shl_u32 $s0, $s0, 2;
cvt_u64_u32 $d4, $s0;
// Add the byte offset to the base address to obtain
// the effective address of this work-item's slot
add_u64 $d0, $d0, $d4;
// Store the constant 1 into the slot
mov_u32 $s2, 1;
st_global_u32 $s2, [$d0];
};
-88
Просмотреть файл
@@ -1,88 +0,0 @@
module &m:1:0:$base:$large:$default;
/* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
*
* HSAF is granting you permission to use this software and documentation (if
* any) (collectively, the "Materials") pursuant to the terms and conditions
* of the Software License Agreement included with the Materials. If you do
* not have a copy of the Software License Agreement, contact the HSA Foundation for a copy.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
/**
 * @brief HSAIL kernel in which every work-item performs one 32-bit store.
 * Each work-item takes its flattened absolute id, multiplies it by four
 * (the size of a u32) and adds the result to the base address passed in
 * outAddr. It then stores the constant 1 at that address in the global
 * segment. No memory is read apart from the kernel argument.
 *
 * @NOTE: The RTI pragmas below carry the name __SysMemLoad although the
 * kernel itself is named &main.
 *
 * @param outAddr base byte address of the output buffer. The kernel writes
 * one u32 per work-item, so presumably the caller provides at least
 * (number of work-items) * sizeof(uint32_t) bytes -- confirm against the
 * host code.
 *
 */
prog kernel &main(kernarg_u64 %outAddr) {
pragma "AMD RTI", "ARGSTART:__SysMemLoad";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "uavid:8";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "ARGEND:__SysMemLoad";
// Load the base address of the output buffer
ld_kernarg_u64 $d0, [%outAddr];
// Compute the flattened absolute id of the current work-item
// and shift it left by two to turn it into a byte offset
// (one u32 slot per work-item)
workitemflatabsid_u32 $s0;
shl_u32 $s0, $s0, 2;
cvt_u64_u32 $d4, $s0;
// Add the byte offset to the base address to obtain
// the effective address of this work-item's slot
add_u64 $d0, $d0, $d4;
// Store the constant 1 into the slot
mov_u32 $s2, 1;
st_global_u32 $s2, [$d0];
};
-109
Просмотреть файл
@@ -1,109 +0,0 @@
module &m:1:0:$full:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
// Image load kernel.
// Each work-item starts at the texel addressed by its absolute ids
// (dimension 0 -> $s1, dimension 1 -> $s0). One pass of the loop issues
// eight image loads; between loads both coordinates are advanced by 64 and
// masked with 255. One channel of every load is added to the accumulator
// $s9, which is stored to the address given by %result at the end of each
// pass. The loop counter starts at %istart, is incremented by %istep at the
// top of each pass, and the loop repeats while the counter is below %iend.
prog kernel &__OpenCL_load_2d_image_kernel(
kernarg_rwimg %input,
kernarg_u64 %result,
kernarg_u32 %istart,
kernarg_u32 %iend,
kernarg_u32 %istep)
{
pragma "AMD RTI", "ARGSTART:__OpenCL_load_2d_image_kernel";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "uavid:8";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "ARGEND:__OpenCL_load_2d_image_kernel";
@__OpenCL_load_2d_image_kernel_entry:
// BB#0: // %entry
workitemabsid_u32 $s0, 1;
workitemabsid_u32 $s1, 0;
ld_kernarg_rwimg $d5, [%input];
ld_kernarg_u32 $s2, [%istart];
ld_kernarg_u32 $s3, [%iend];
ld_kernarg_u32 $s4, [%istep];
add_u32 $s9, 0, 0; // reset the accumulator s9 to zero
@loop:
// advance the loop counter by istep
add_u32 $s2, $s2, $s4;
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0); //(coordWidth, coordHeight)
add_u32 $s9, $s9, $s5;
//move both coordinates by 64 (mod 256) so the next load hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
add_u32 $s9, $s9, $s6;
//move both coordinates by 64 (mod 256) so the next load hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
add_u32 $s9, $s9, $s7;
//move both coordinates by 64 (mod 256) so the next load hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
add_u32 $s9, $s9, $s8;
//move both coordinates by 64 (mod 256) so the next load hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
add_u32 $s9, $s9, $s5;
//move both coordinates by 64 (mod 256) so the next load hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
add_u32 $s9, $s9, $s6;
//move both coordinates by 64 (mod 256) so the next load hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
add_u32 $s9, $s9, $s7;
//move both coordinates by 64 (mod 256) so the next load hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
// the result address is re-read from the kernel arguments on every pass
ld_kernarg_align(8)_width(all)_u64 $d4, [%result];
add_u32 $s9, $s9, $s8;
// write the accumulated value out so the loads have an observable use
st_u32 $s9, [$d4];
//repeat while the loop counter is still below iend
cmp_lt_b1_u32 $c0, $s2, $s3;
cbr_b1 $c0, @loop;
};
-37
Просмотреть файл
@@ -1,37 +0,0 @@
module &m:1:0:$full:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
/* This kernel takes two memory locations: %iteration is the address of a
   32-bit iteration count, and %results is the address where the result is
   stored.
   The kernel counts up in a loop until the counter reaches the value read
   from %iteration, and stores the final counter value at %results.
   A successful run is when the value stored at %iteration is the same as
   the value stored at %results.
   The counter is incremented before it is compared, so the loop body runs
   at least once: an iteration count of 0 yields a result of 1.
*/
prog kernel &__simple_kernel(
    kernarg_u64 %iteration,
    kernarg_u64 %results)
{
    // Fetch both addresses from the kernel arguments.
    ld_kernarg_align(8)_width(all)_u64 $d1, [%iteration];
    ld_kernarg_align(8)_width(all)_u64 $d2, [%results];
    // $s1 = requested iteration count, $s2 = iterations executed so far.
    ld_global_u32 $s1, [$d1];
    mov_u32 $s2, 0;
@loop:
    add_u32 $s2, $s2, 1;
    cmp_lt_b1_u32 $c0, $s2, $s1;
    cbr_b1 $c0, @loop;
    // Publish the number of iterations that were executed.
    st_global_u32 $s2, [$d2];
    ret;
};
-28
Просмотреть файл
@@ -1,28 +0,0 @@
module &m:1:0:$base:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
// Counts loop iterations: reads a 32-bit count from the address in
// %iteration, increments a counter until it is no longer below that count,
// and stores the counter at the address in %results. The counter is
// incremented before the comparison, so the loop body runs at least once.
prog kernel &__simple_kernel(
kernarg_u64 %iteration,
kernarg_u64 %results)
{
// fetch both addresses from the kernel arguments
ld_kernarg_align(8)_width(all)_u64 $d1, [%iteration];
ld_kernarg_align(8)_width(all)_u64 $d2, [%results];
// $s1 = requested iteration count, $s2 = iterations executed so far
ld_global_u32 $s1, [$d1];
mov_u32 $s2, 0;
@loop:
add_u32 $s2, $s2, 1;
cmp_lt_b1_u32 $c0, $s2, $s1;
cbr_b1 $c0, @loop;
// publish the number of iterations that were executed
st_global_u32 $s2, [$d2];
ret;
};
-105
Просмотреть файл
@@ -1,105 +0,0 @@
module &m:1:0:$full:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
// Image store kernel.
// Each work-item starts at the texel addressed by its absolute ids
// (dimension 0 -> $s1, dimension 1 -> $s0). One pass of the loop issues
// eight image stores; between stores both coordinates are advanced by 64 and
// masked with 255. The stored value $s5 is written to all four channels and
// is changed before every store. The loop counter starts at %istart, is
// incremented by %istep at the top of each pass, and the loop repeats while
// the counter is below %iend.
prog kernel &__OpenCL_store_2d_image_kernel(
kernarg_rwimg %output,
kernarg_u32 %istart,
kernarg_u32 %iend,
kernarg_u32 %istep)
{
pragma "AMD RTI", "ARGSTART:__OpenCL_store_2d_image_kernel";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "uavid:8";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "ARGEND:__OpenCL_store_2d_image_kernel";
@__OpenCL_store_2d_image_kernel_entry:
// BB#0: // %entry
workitemabsid_u32 $s0, 1;
workitemabsid_u32 $s1, 0;
ld_kernarg_rwimg $d5, [%output];
ld_kernarg_u32 $s2, [%istart];
ld_kernarg_u32 $s3, [%iend];
ld_kernarg_u32 $s4, [%istep];
// $s5 holds the value that is written to the image
mov_b32 $s5, 0;
@loop:
// advance the loop counter by istep and bump the stored value
add_u32 $s2, $s2, $s4;
add_u32 $s5, $s5, 1;
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
//move both coordinates by 64 (mod 256) so the next store hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
add_u32 $s5, $s5, $s2;
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
//move both coordinates by 64 (mod 256) so the next store hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
add_u32 $s5, $s5, $s2;
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
//move both coordinates by 64 (mod 256) so the next store hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
add_u32 $s5, $s5, $s2;
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
//move both coordinates by 64 (mod 256) so the next store hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
add_u32 $s5, $s5, $s2;
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
//move both coordinates by 64 (mod 256) so the next store hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
add_u32 $s5, $s5, $s2;
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
//move both coordinates by 64 (mod 256) so the next store hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
add_u32 $s5, $s5, $s2;
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
//move both coordinates by 64 (mod 256) so the next store hits a different texel
add_u32 $s1, $s1, 64;
and_b32 $s1, $s1, 255;
add_u32 $s0, $s0, 64;
and_b32 $s0, $s0, 255;
add_u32 $s5, $s5, $s2;
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
//repeat while the loop counter is still below iend
cmp_lt_b1_u32 $c0, $s2, $s3;
cbr_b1 $c0, @loop;
ret;
};
-237
Просмотреть файл
@@ -1,237 +0,0 @@
module &m:1:0:$full:$large:$default;
/* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
*
* HSAF is granting you permission to use this software and documentation (if
* any) (collectively, the "Materials") pursuant to the terms and conditions
* of the Software License Agreement included with the Materials. If you do
* not have a copy of the Software License Agreement, contact the HSA Foundation for a copy.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
/**
 * @brief HSAIL kernel to benchmark READ accesses to system memory.
 * The kernel is given an input buffer from which each thread will
 * read. The thread reads from multiple locations of the input buffer.
 * The first location is determined by the work-item id; every following
 * location is addrStep bytes further on. With addrStep set to the total
 * number of work-items * sizeof(uint32_t) and a global work grid of 16
 * work-items, the elements read by the thread with absolute id 4 would be
 * 4, 20, 36, 52, etc.
 *
 * @NOTE: The kernel body performs 32 reads per thread, so the buffer must be
 * large enough for 32 reads by each thread. With addrStep as described
 * above, a dispatch of 8 work-items needs a buffer of at least
 * 8 * 32 * sizeof(uint32_t) bytes.
 *
 * @param bufStart beginning byte address of user buffer in system memory
 * from which kernel threads could read
 *
 * @param bufEnd byte address that follows the end of user buffer. Accessing
 * memory at bufEnd is illegal. The kernel loads this argument but never
 * compares against it, so no bounds check is performed.
 *
 * @param addrStep size by which to increment byte address following each read
 * operation. The value represents total number of work-items * sizeof(uint32_t)
 *
 * @param outAddr base address of an output buffer. Each thread stores the sum
 * of the values it read at outAddr + (work-item id * sizeof(uint32_t)). This
 * is to ensure compiler and finalizer do not eliminate code because the
 * values being read are not used in any meaningful way.
 *
 */
prog kernel &__SysMemLoad(kernarg_u64 %bufStart,
kernarg_u64 %bufEnd,
kernarg_u64 %addrStep,
kernarg_u64 %outAddr) {
pragma "AMD RTI", "ARGSTART:__SysMemLoad";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "uavid:8";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "ARGEND:__SysMemLoad";
// Retrieve the values of input arguments
// bufStart refers to the starting byte address
// bufEnd refers to the end of byte address (loaded into $d1, not used below)
// addrStep refers to the product of total number
// of work-items in the grid * sizeof(uint32_t)
ld_kernarg_u64 $d0, [%bufStart];
ld_kernarg_u64 $d1, [%bufEnd];
ld_kernarg_u64 $d2, [%addrStep];
ld_kernarg_u64 $d3, [%outAddr];
// Compute the absolute id of current thread
// and shift it left by two to get the byte offset
// into the user buffer for the first Read operation
workitemflatabsid_u32 $s0;
shl_u32 $s0, $s0, 2;
cvt_u64_u32 $d4, $s0;
// Add the offset to the base addresses of the input and
// output buffers to obtain the effective addresses
add_u64 $d0, $d0, $d4;
add_u64 $d3, $d3, $d4;
// Initialize thread's read accumulator to zero
mov_u32 $s2, 0;
// NOTE(review): no branch targets this label, so the reads below
// execute exactly once per thread
@loop:
// Read thirty-two values with a stride of addrStep bytes;
// each value is added to the accumulator $s2 before the
// address is advanced
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
// Update this thread's slot of the output buffer with
// the sum of the values read from the input buffer
st_global_u32 $s2, [$d3];
};
-237
Просмотреть файл
@@ -1,237 +0,0 @@
module &m:1:0:$base:$large:$default;
/* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
*
* HSAF is granting you permission to use this software and documentation (if
* any) (collectively, the "Materials") pursuant to the terms and conditions
* of the Software License Agreement included with the Materials. If you do
* not have a copy of the Software License Agreement, contact the HSA Foundation for a copy.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
/**
* @brief Hsail kernel to benchmark READ accesses to system memory.
* The kernel is given a input buffer from which each each thread will
* read. The thread will read from multiple locations of the input buffer.
* The locations to read from is determined by the work-item Id, the function
* being work-item Id modulo total number of work-items in the global work grid.
* So given a global work grid of 16 work-items the reads by a thread with absolute
* id 4 would be 4, 20, 36, 52, etc.
*
* @NOTE: A constraint imposed by the kernel is that the buffer size be large
* enough to support 16 reads by each thread. So a dispatch of 8 work-items
* should allocate enough buffer for 8 * 16 * sizeof(uint32_t).
*
* @param bufStart beginning byte address of user buffer in system memory
* from which kernel threads could read
*
* @param bufEnd byte address that follows the end of user buffer. Accessing
* memory at bufEnd is illegal
*
* @param addrStep size by which to increment byte address following each read
* operation. The value represents total number of work-items * sizeof(uint32_t)
*
* @param outAddr argument that is passed by the user to be updated with values
* read by the kernel threads. This is ensure compiler and finalizer do not eliminate
* code because the values being read are not used in any meaningfule way.
*
*/
// Kernel: each work-item starts at bufStart + 4 * (flat absolute id), reads
// u32 values at successive addrStep strides, sums them, and writes the sum
// to outAddr + 4 * (flat absolute id).
prog kernel &__SysMemLoad(kernarg_u64 %bufStart,
kernarg_u64 %bufEnd,
kernarg_u64 %addrStep,
kernarg_u64 %outAddr) {
// Directives for Compiler
pragma "AMD RTI", "ARGSTART:__SysMemLoad";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "uavid:8";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "ARGEND:__SysMemLoad";
// Retrieve the values of input arguments
// bufStart refers to the starting byte address
// bufEnd refers to the end of byte address
// addrStep refers to the product of total number
// of work-items in the grid * sizeof(uint32_t)
// Register map: $d0 = bufStart, $d1 = bufEnd, $d2 = addrStep, $d3 = outAddr
// NOTE(review): $d1 (bufEnd) is loaded but never read again in this kernel.
ld_kernarg_u64 $d0, [%bufStart];
ld_kernarg_u64 $d1, [%bufEnd];
ld_kernarg_u64 $d2, [%addrStep];
ld_kernarg_u64 $d3, [%outAddr];
// Compute the absolute id of current thread
// and shift it by two to get index into user
// buffer to access for Read operation
workitemflatabsid_u32 $s0;
shl_u32 $s0, $s0, 2;
cvt_u64_u32 $d4, $s0;
// Add index to base address of user buffer to obtain
// effective address for access
// (the same byte offset is applied to the output address in $d3)
add_u64 $d0, $d0, $d4;
add_u64 $d3, $d3, $d4;
// Initialize thread's read accumulator to zero
mov_u32 $s2, 0;
@loop:
// Read values with a stride that is
// determined by the total number of work-items
// in the global grid
// NOTE(review): nothing in this kernel branches back to @loop, so the
// sequence below executes exactly once. It contains 32 unrolled
// load / accumulate / advance triples, whereas the surrounding
// documentation speaks of sixteen reads per work-item. Confirm the
// intended read count and whether an end-of-buffer test against bufEnd
// was meant to follow the unrolled body.
// Reads 1-8
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
// Reads 9-16
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
// Reads 17-24
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
// Reads 25-32
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
ld_global_u32 $s1, [$d0];
add_u32 $s2, $s1, $s2;
add_u64 $d0, $d0, $d2;
// Update output buffer with the sum of the values read
// from input buffer (one u32 slot per work-item)
st_global_u32 $s2, [$d3];
};
-105
Просмотреть файл
@@ -1,105 +0,0 @@
module &m:1:0:$full:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
// Kernel: each work-item starts at bufStart + 4 * (flat absolute id) and
// repeatedly stores a u32 at successive addrStep strides until its address
// reaches bufEnd. The value stored is the work-item's own byte offset
// (flat absolute id << 2).
prog kernel &__SysMemStore(kernarg_u64 %bufStart,
kernarg_u64 %bufEnd,
kernarg_u64 %addrStep,
kernarg_u64 %deadArg) {
// Directives for Compiler
pragma "AMD RTI", "ARGSTART:__SysMemStore";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "uavid:8";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "ARGEND:__SysMemStore";
// Retrieve the values of input arguments
// bufStart refers to the starting byte address
// bufEnd refers to the end of byte address
// addrStep refers to the product of total number
// of work-items in the grid * sizeof(uint32_t)
// Register map: $d0 = bufStart, $d1 = bufEnd, $d2 = addrStep, $d3 = deadArg
// ($d3 is loaded but not used anywhere else in this kernel)
ld_kernarg_u64 $d0, [%bufStart];
ld_kernarg_u64 $d1, [%bufEnd];
ld_kernarg_u64 $d2, [%addrStep];
ld_kernarg_u64 $d3, [%deadArg];
// Compute the absolute id of current thread
// and shift it by two to get index into user
// buffer to access for Write operation
workitemflatabsid_u32 $s0;
shl_u32 $s0, $s0, 2;
// Convert the thread id into a 64-bit number
// and add it to the starting address of user
// buffer to obtain effective address for access
cvt_u64_u32 $d4, $s0;
add_u64 $d0, $d0, $d4;
@loop:
// Write sixteen values with a stride that is
// determined by the total number of work-items
// in the global grid
// Stores 1-8
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
// Stores 9-16
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
// Loop until we hit end of buffer [%bufEnd]
// NOTE(review): the bound is tested only after a full batch of sixteen
// stores, so the buffer presumably has to be a whole multiple of
// 16 * addrStep bytes to avoid writing past bufEnd - confirm with the host
// code that sizes the buffer.
cmp_lt_b1_u64 $c0, $d0, $d1;
cbr_b1 $c0, @loop;
};
-105
Просмотреть файл
@@ -1,105 +0,0 @@
module &m:1:0:$base:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
// Kernel: each work-item starts at bufStart + 4 * (flat absolute id) and
// repeatedly stores a u32 at successive addrStep strides until its address
// reaches bufEnd. The value stored is the work-item's own byte offset
// (flat absolute id << 2).
prog kernel &__SysMemStore(kernarg_u64 %bufStart,
kernarg_u64 %bufEnd,
kernarg_u64 %addrStep,
kernarg_u64 %deadArg) {
// Directives for Compiler
pragma "AMD RTI", "ARGSTART:__SysMemStore";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "uavid:8";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "ARGEND:__SysMemStore";
// Retrieve the values of input arguments
// bufStart refers to the starting byte address
// bufEnd refers to the end of byte address
// addrStep refers to the product of total number
// of work-items in the grid * sizeof(uint32_t)
// Register map: $d0 = bufStart, $d1 = bufEnd, $d2 = addrStep, $d3 = deadArg
// ($d3 is loaded but not used anywhere else in this kernel)
ld_kernarg_u64 $d0, [%bufStart];
ld_kernarg_u64 $d1, [%bufEnd];
ld_kernarg_u64 $d2, [%addrStep];
ld_kernarg_u64 $d3, [%deadArg];
// Compute the absolute id of current thread
// and shift it by two to get index into user
// buffer to access for Write operation
workitemflatabsid_u32 $s0;
shl_u32 $s0, $s0, 2;
// Convert the thread id into a 64-bit number
// and add it to the starting address of user
// buffer to obtain effective address for access
cvt_u64_u32 $d4, $s0;
add_u64 $d0, $d0, $d4;
@loop:
// Write sixteen values with a stride that is
// determined by the total number of work-items
// in the global grid
// Stores 1-8
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
// Stores 9-16
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
st_global_u32 $s0, [$d0];
add_u64 $d0, $d0, $d2;
// Loop until we hit end of buffer [%bufEnd]
// NOTE(review): the bound is tested only after a full batch of sixteen
// stores, so the buffer presumably has to be a whole multiple of
// 16 * addrStep bytes to avoid writing past bufEnd - confirm with the host
// code that sizes the buffer.
cmp_lt_b1_u64 $c0, $d0, $d1;
cbr_b1 $c0, @loop;
};
@@ -43,40 +43,12 @@
*
*/
#ifndef __ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__
#define __ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "hsa/hsa.h"
// Test class for image store bandwidth. It derives from both the shared
// rocrtst::BaseRocR test state and PerfBase, and declares the usual
// SetUp / Run / DisplayResults / Close test phases.
class ImageStoreBandwidth: public rocrtst::BaseRocR, public PerfBase {
public:
//@Brief: Constructor
ImageStoreBandwidth();
//@Brief: Destructor
~ImageStoreBandwidth();
//@Brief: Set up the test environment; call before Run()
virtual void SetUp();
//@Brief: Run the actual testing
virtual void Run();
//@Brief: Clean up the test environment
virtual void Close();
//@Brief: Display results (const: does not modify the test object)
virtual void DisplayResults() const;
private:
//@Brief: Image Store Bandwidth result
// NOTE(review): units are not stated in this header - confirm in the .cc
double store_bandwidth_;
//@Brief: Image size
// NOTE(review): not visible here whether this is bytes or pixels - confirm
size_t image_size_;
};
#endif //__ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__
/**
 * Element-wise square: dstArray[i] = srcArray[i] * srcArray[i].
 *
 * @param dstArray  output buffer, at least sz elements
 * @param srcArray  input buffer, at least sz elements
 * @param sz        number of valid elements; work-items whose global id is
 *                  outside [0, sz) do nothing. A non-positive sz makes the
 *                  whole dispatch a no-op.
 */
__kernel void
square(__global int *dstArray, __global const int *srcArray, const int sz) {
  // get_global_id() returns size_t; keep the full width instead of
  // truncating to 32 bits.
  const size_t id = get_global_id(0);

  // Test the sign of sz before comparing against the unsigned id: a negative
  // sz would otherwise be converted to a huge unsigned bound and let every
  // work-item index out of range.
  if (sz > 0 && id < (size_t)sz) {
    const int value = srcArray[id];
    dstArray[id] = value * value;
  }
}
-53
Просмотреть файл
@@ -1,53 +0,0 @@
module &m:1:0:$full:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
// Kernel: treats the u32 at [%buf] as a counter. If the counter is already
// >= %num the work-item returns immediately. Otherwise the work-item whose
// absolute id in dimension 0 is zero increments the counter in memory until
// it reaches %num, while every other work-item re-reads the counter until it
// observes a value >= %num.
prog kernel &__OpenCL_vec_assign_kernel(
kernarg_u64 %buf,
kernarg_u32 %num)
{
pragma "AMD RTI", "ARGSTART:__OpenCL_vec_assign_kernel";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "uavid:8";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "ARGEND:__OpenCL_vec_assign_kernel";
@__OpenCL_vec_assign_kernel_entry:
// BB#0: // %entry
// $d0 = address of the counter, $s1 = current counter value, $s0 = num.
// The comparisons below are signed (s32).
ld_kernarg_align(8)_width(all)_u64 $d0, [%buf];
ld_global_u32 $s1, [$d0];
ld_kernarg_align(4)_width(all)_u32 $s0, [%num];
cmp_ge_b1_s32 $c0, $s1, $s0;
cbr_b1 $c0, @BB0_4;
// BB#1: // %while.body.lr.ph
// Work-item with absolute id 0 takes the incrementing loop (@BB0_2);
// all others fall through to the polling loop (@BB0_3).
workitemabsid_u32 $s1, 0;
cmp_eq_b1_s32 $c0, $s1, 0;
cbr_b1 $c0, @BB0_2;
@BB0_3:
// %while.cond.backedge
// Polling loop: re-read the counter until it is no longer < num.
// NOTE(review): these are plain (non-atomic) global loads; confirm the
// memory model guarantees the polling work-items observe the updates.
ld_global_u32 $s1, [$d0];
cmp_lt_b1_s32 $c0, $s1, $s0;
cbr_b1 $c0, @BB0_3;
br @BB0_4;
@BB0_2:
// %while.cond.backedge.us
// Incrementing loop: load, add 1, store back, reload and compare with num.
ld_global_u32 $s1, [$d0];
add_u32 $s1, $s1, 1;
st_global_u32 $s1, [$d0];
ld_global_u32 $s1, [$d0];
cmp_lt_b1_s32 $c0, $s1, $s0;
cbr_b1 $c0, @BB0_2;
@BB0_4:
// %while.end
ret;
};
-108
Просмотреть файл
@@ -1,108 +0,0 @@
module &m:1:0:$full:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
// Kernel: tiled transpose through group (local) memory.
// With lx/ly = work-item id in dims 0/1, gx/gy = absolute id in dims 0/1
// (plus the global offsets), wg0/wg1 = work-group id in dims 0/1 and
// bs = blockSize, each work-item:
//   1. loads inBuf[gy * width + gx] and stores it to localBuf[lx * bs + ly];
//   2. waits at a barrier;
//   3. loads localBuf[ly * bs + lx] and stores it to
//      outBuf[(wg0 * bs + ly) * height + (wg1 * bs + lx)].
// All buffer elements are 32-bit (indices are scaled by 4 bytes).
prog kernel &__OpenCL_matrixTranspose_kernel(
kernarg_u64 %__global_offset_0,
kernarg_u64 %__global_offset_1,
kernarg_u64 %__global_offset_2,
kernarg_u64 %__printf_buffer,
kernarg_u64 %__vqueue_pointer,
kernarg_u64 %__aqlwrap_pointer,
kernarg_u64 %inBuf,
kernarg_u64 %outBuf,
kernarg_u64 %localBuf,
kernarg_u32 %blockSize,
kernarg_u32 %width,
kernarg_u32 %height)
{
pragma "AMD RTI", "ARGSTART:__OpenCL_matrixTranspose_kernel";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "memory:private:0";
pragma "AMD RTI", "memory:region:0";
pragma "AMD RTI", "memory:local:0";
pragma "AMD RTI", "value:__global_offset_0:u64:1:1:0";
pragma "AMD RTI", "value:__global_offset_1:u64:1:1:16";
pragma "AMD RTI", "value:__global_offset_2:u64:1:1:32";
pragma "AMD RTI", "pointer:__printf_buffer:u8:1:1:48:uav:7:1:RW:0:0:0";
pragma "AMD RTI", "value:__vqueue_pointer:u64:1:1:64";
pragma "AMD RTI", "value:__aqlwrap_pointer:u64:1:1:80";
pragma "AMD RTI", "pointer:inBuf:u32:1:1:96:uav:7:4:RW:0:1:0";
pragma "AMD RTI", "pointer:outBuf:u32:1:1:112:uav:7:4:RW:0:1:0";
pragma "AMD RTI", "pointer:localBuf:u32:1:1:128:l:7:4:RW:0:0:0";
pragma "AMD RTI", "value:blockSize:u32:1:1:144";
pragma "AMD RTI", "value:width:u32:1:1:160";
pragma "AMD RTI", "value:height:u32:1:1:176";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "enqueue_kernel:0";
pragma "AMD RTI", "kernel_index:0";
pragma "AMD RTI", "reflection:0:size_t";
pragma "AMD RTI", "reflection:1:size_t";
pragma "AMD RTI", "reflection:2:size_t";
pragma "AMD RTI", "reflection:3:size_t";
pragma "AMD RTI", "reflection:4:size_t";
pragma "AMD RTI", "reflection:5:size_t";
pragma "AMD RTI", "reflection:6:uint*";
pragma "AMD RTI", "reflection:7:uint*";
pragma "AMD RTI", "reflection:8:uint*";
pragma "AMD RTI", "reflection:9:uint";
pragma "AMD RTI", "reflection:10:uint";
pragma "AMD RTI", "reflection:11:uint";
pragma "AMD RTI", "ARGEND:__OpenCL_matrixTranspose_kernel";
@__OpenCL_matrixTranspose_kernel_entry:
// BB#0: // %entry
// $s0 = ly, $s1 = bs, $s2 = lx
workitemid_u32 $s0, 1;
ld_kernarg_align(4)_width(all)_u32 $s1, [%blockSize];
workitemid_u32 $s2, 0;
// $d1 = lx * bs + ly : element index of the local-memory write slot
mad_u32 $s3, $s2, $s1, $s0;
cvt_u64_u32 $d1, $s3;
// $d0 = gx = absolute id (dim 0) + __global_offset_0
workitemabsid_u32 $s3, 0;
cvt_u64_u32 $d0, $s3;
ld_kernarg_align(8)_width(all)_u64 $d2, [%__global_offset_0];
add_u64 $d0, $d0, $d2;
// $s5 = absolute id (dim 1), $s4 = wg0, $s3 = wg1
workitemabsid_u32 $s5, 1;
workgroupid_u32 $s4, 0;
workgroupid_u32 $s3, 1;
// Scale the local write index to a byte offset (4-byte elements)
shl_u64 $d1, $d1, 2;
// $s3 = wg1 * bs + lx, $s4 = wg0 * bs + ly : transposed output coordinates
mad_u32 $s3, $s3, $s1, $s2;
mad_u32 $s4, $s4, $s1, $s0;
// $s5 = gx (truncated to 32 bits), $s6 = gy = absolute id (dim 1) + __global_offset_1
cvt_u64_u32 $d2, $s5;
ld_kernarg_align(8)_width(all)_u64 $d3, [%__global_offset_1];
cvt_u32_u64 $s5, $d0;
add_u64 $d0, $d2, $d3;
cvt_u32_u64 $s6, $d0;
// $s7 = width, $d0 = localBuf base, $s8 = height
ld_kernarg_align(4)_width(all)_u32 $s7, [%width];
ld_kernarg_align(8)_width(all)_u64 $d0, [%localBuf];
ld_kernarg_align(4)_width(all)_u32 $s8, [%height];
// $s3 = (wg0 * bs + ly) * height + (wg1 * bs + lx) : output element index
mad_u32 $s3, $s4, $s8, $s3;
// $s4 = group-segment address of the local write slot
add_u64 $d1, $d0, $d1;
cvt_u32_u64 $s4, $d1;
// $s5 = gy * width + gx : input element index; $d2 = its byte address in inBuf
mad_u32 $s5, $s6, $s7, $s5;
cvt_u64_u32 $d1, $s5;
shl_u64 $d2, $d1, 2;
ld_kernarg_align(8)_width(all)_u64 $d1, [%outBuf];
ld_kernarg_align(8)_width(all)_u64 $d3, [%inBuf];
add_u64 $d2, $d3, $d2;
// Stage the input element into local memory
ld_global_align(4)_u32 $s5, [$d2];
st_group_align(4)_u32 $s5, [$s4];
// $d1 = byte address of the output element in outBuf
cvt_u64_u32 $d2, $s3;
shl_u64 $d2, $d2, 2;
add_u64 $d1, $d1, $d2;
// $s0 = group-segment address of localBuf[ly * bs + lx] (the transposed slot)
mad_u32 $s0, $s0, $s1, $s2;
cvt_u64_u32 $d2, $s0;
shl_u64 $d2, $d2, 2;
add_u64 $d0, $d0, $d2;
cvt_u32_u64 $s0, $d0;
// Barrier between the local-memory writes above and the read below
barrier;
ld_group_align(4)_u32 $s0, [$s0];
st_global_align(4)_u32 $s0, [$d1];
ret;
};
-34
Просмотреть файл
@@ -1,34 +0,0 @@
module &m:1:0:$full:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
// Kernel: each work-item copies one 32-bit element, b[i] = a[i], where i is
// the work-item's absolute id in dimension 0. There is no bounds check in
// the kernel itself.
prog kernel &__vector_copy_kernel(
kernarg_u64 %a,
kernarg_u64 %b)
{
pragma "AMD RTI", "ARGSTART:__vector_copy_kernel";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "uavid:8";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "ARGEND:__vector_copy_kernel";
@__vector_copy_kernel_entry:
// BB#0: // %entry
// $d0 = byte offset of element i (id sign-extended to 64 bits, times 4)
workitemabsid_u32 $s0, 0;
cvt_s64_s32 $d0, $s0;
shl_u64 $d0, $d0, 2;
// $d1 = destination address (b + offset)
ld_kernarg_align(8)_width(all)_u64 $d1, [%b];
add_u64 $d1, $d1, $d0;
// $d0 = source address (a + offset)
ld_kernarg_align(8)_width(all)_u64 $d2, [%a];
add_u64 $d0, $d2, $d0;
// Copy one u32 from source to destination
ld_global_u32 $s0, [$d0];
st_global_u32 $s0, [$d1];
ret;
};
-64
Просмотреть файл
@@ -1,64 +0,0 @@
module &m:1:0:$base:$large:$default;
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
decl prog function &abort()();
// Kernel: each work-item copies one 32-bit element, out[i] = in[i], where i
// is the work-item's absolute id in dimension 0. There is no bounds check in
// the kernel itself.
prog kernel &__vector_copy_kernel(
kernarg_u64 %in,
kernarg_u64 %out)
{
@__vector_copy_kernel_entry:
// BB#0: // %entry
// $d0 = byte offset of element i (id sign-extended to 64 bits, times 4)
workitemabsid_u32 $s0, 0;
cvt_s64_s32 $d0, $s0;
shl_u64 $d0, $d0, 2;
// $d1 = destination address (out + offset)
ld_kernarg_align(8)_width(all)_u64 $d1, [%out];
add_u64 $d1, $d1, $d0;
// $d0 = source address (in + offset)
ld_kernarg_align(8)_width(all)_u64 $d2, [%in];
add_u64 $d0, $d2, $d0;
// Copy one u32 from source to destination
ld_global_u32 $s0, [$d0];
st_global_u32 $s0, [$d1];
ret;
};
-64
Просмотреть файл
@@ -1,64 +0,0 @@
module &m:1:0:$full:$large:$default;
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
decl prog function &abort()();
// Kernel: each work-item copies one 32-bit element, out[i] = in[i], where i
// is the work-item's absolute id in dimension 0. There is no bounds check in
// the kernel itself.
prog kernel &__vector_copy_kernel(
kernarg_u64 %in,
kernarg_u64 %out)
{
@__vector_copy_kernel_entry:
// BB#0: // %entry
// $d0 = byte offset of element i (id sign-extended to 64 bits, times 4)
workitemabsid_u32 $s0, 0;
cvt_s64_s32 $d0, $s0;
shl_u64 $d0, $d0, 2;
// $d1 = destination address (out + offset)
ld_kernarg_align(8)_width(all)_u64 $d1, [%out];
add_u64 $d1, $d1, $d0;
// $d0 = source address (in + offset)
ld_kernarg_align(8)_width(all)_u64 $d2, [%in];
add_u64 $d0, $d2, $d0;
// Copy one u32 from source to destination
ld_global_u32 $s0, [$d0];
st_global_u32 $s0, [$d1];
ret;
};
Обычный файл → Исполняемый файл
+57 -210
Просмотреть файл
@@ -43,238 +43,85 @@
*
*/
#include "cp_process_time.h"
#include "cu_masking.h"
#include "device_load_bandwidth.h"
#include "device_store_bandwidth.h"
#include "dispatch_time.h"
#include "flush_latency.h"
#include "gtest/gtest.h"
#include "hsa_info.h"
#include "image_bandwidth.h"
#include "image_load_bandwidth.h"
#include "image_store_bandwidth.h"
#include "matrix_transpose.h"
#include "memory_copy.h"
#include "memory_allocation.h"
#include "memory_async_copy.h"
#include "queue_concurrency.h"
#include "queue_create_destroy_latency.h"
#include "system_load_bandwidth.h"
#include "system_store_bandwidth.h"
#include "vector_copy.h"
#include "suites/performance/dispatch_time.h"
#include "suites/performance/memory_async_copy.h"
#include "suites/performance/test_case_template.h"
#include "suites/performance/main.h"
#include "suites/test_common/test_common.h"
/**
* Try to order tests from fastest running to slowest running.
*/
static uint32_t sRocrTstOptVerbosity = 1;
static uint32_t sRocrTestOptIterations = 0;
// Display HSA system information first.
TEST(rocrtst, Feature_Hsa_Info) {
HsaInfo hi;
hi.SetUp();
hi.Run();
hi.Close();
// Drive one test object through the standard life cycle:
// configure -> announce -> SetUp -> Run -> DisplayResults -> Close.
// Verbosity always comes from the command-line option; the iteration count
// is overridden only when the option holds a non-zero value.
static void RunTest(TestBase *test) {
  test->set_verbosity(sRocrTstOptVerbosity);

  const uint32_t requested_iterations = sRocrTestOptIterations;
  if (requested_iterations != 0) {
    test->set_num_iteration(requested_iterations);
  }

  test->DisplayTestInfo();

  test->SetUp();
  test->Run();
  test->DisplayResults();
  test->Close();
}
// Requires HSA_PROFILE_FULL
TEST(rocrtst, Perf_Image_Store_Bandwidth) {
ImageStoreBandwidth isb;
isb.SetUp();
isb.Run();
isb.DisplayResults();
isb.Close();
// TEST ENTRY TEMPLATE:
// TEST(rocrtst, Perf_<test name>) {
// <Test Implementation class> <test_obj>;
//
// // Copy and modify implementation of RunTest() if you need to deviate
// // from the standard pattern implemented there.
// RunTest(&<test_obj>);
// }
// Runs the TestExample test object through the standard RunTest() sequence.
TEST(rocrtst, Test_Example) {
TestExample tst;
RunTest(&tst);
}
// Requires HSA_PROFILE_FULL
TEST(rocrtst, Perf_Image_Load_Bandwidth) {
ImageLoadBandwidth ilb;
ilb.SetUp();
ilb.Run();
ilb.DisplayResults();
ilb.Close();
// Runs the MemoryAsyncCopy test with its default configuration through the
// standard RunTest() sequence. The commented-out setters below show how to
// widen or narrow what is measured.
TEST(rocrtst, Perf_Memory_Async_Copy) {
MemoryAsyncCopy mac;
// To do full test, uncomment this:
// mac.set_full_test(true);
// To test only 1 path, add lines like this:
// mac.set_src_pool(<src pool id>);
// mac.set_dst_pool(<dst pool id>);
// The default is to and from the cpu to 1 gpu, and to/from a gpu to
// another gpu
RunTest(&mac);
}
// Requires HSA_PROFILE_FULL
TEST(rocrtst, Perf_Image_Bandwidth) {
ImageBandwidth ib;
ib.SetUp();
ib.Run();
ib.DisplayResults();
ib.Close();
}
// Requires HSA_PROFILE_FULL
TEST(rocrtst, Perf_Queue_Concurrency) {
QueueConcurrency mc;
mc.SetUp();
mc.Run();
mc.DisplayResults();
mc.Close();
}
TEST(rocrtst, Feature_Cu_Masking) {
CuMasking cm;
cm.SetUp();
cm.Run();
cm.Close();
}
TEST(rocrtst, Perf_Flush_Latency) {
FlushLatency fl;
fl.SetUp();
fl.Run();
fl.DisplayResults();
fl.Close();
}
// This test apparently has some sort of memory bounds overwrite
// issue with the out_data_ buffer. Commenting out the free of
// out_data_ avoids the problem. Left uncommented, a crash will
// occur immediately or some time after.
TEST(rocrtst, DISABLED_Perf_Device_Memory_Store_Bandwidth) {
DeviceStoreBandwidth slb;
slb.SetUp();
slb.Run();
slb.DisplayResults();
slb.Close();
}
// This test apparently has some sort of memory bounds overwrite
// issue with the out_data_ buffer. Commenting out the free of
// out_data_ avoids the problem. Left uncommented, a crash will
// occur immediately or some time after.
TEST(rocrtst, DISABLED_Perf_Device_Memory_Load_Bandwidth) {
DeviceLoadBandwidth slb;
slb.SetUp();
slb.Run();
slb.DisplayResults();
slb.Close();
}
TEST(rocrtst, Perf_Dispatch_Time_Single_SpinWait) {
DispatchTime dt;
dt.set_num_iteration(100);
dt.UseDefaultSignal(true);
dt.LaunchSingleKernel(true);
dt.SetUp();
dt.Run();
dt.DisplayResults();
dt.Close();
DispatchTime dt(true, true);
RunTest(&dt);
}
TEST(rocrtst, Perf_Dispatch_Time_Single_Interrupt) {
DispatchTime dt;
dt.UseDefaultSignal(false);
dt.LaunchSingleKernel(true);
dt.SetUp();
dt.Run();
dt.DisplayResults();
dt.Close();
DispatchTime dt(false, true);
RunTest(&dt);
}
TEST(rocrtst, Perf_Dispatch_Time_Multi_SpinWait) {
DispatchTime dt;
dt.UseDefaultSignal(true);
dt.LaunchSingleKernel(false);
dt.SetUp();
dt.Run();
dt.DisplayResults();
dt.Close();
DispatchTime dt(true, false);
RunTest(&dt);
}
TEST(rocrtst, Perf_Dispatch_Time_Multi_Interrupt) {
DispatchTime dt;
dt.UseDefaultSignal(false);
dt.LaunchSingleKernel(false);
dt.SetUp();
dt.Run();
dt.DisplayResults();
dt.Close();
DispatchTime dt(false, false);
RunTest(&dt);
}
TEST(rocrtst, DISABLED_Perf_CpProcessTime) {
CpProcessTime cpt;
cpt.set_num_iteration(10);
cpt.SetUp();
cpt.Run();
cpt.DisplayResults();
cpt.Close();
}
TEST(rocrtst, Perf_Memory_Allocation) {
MemoryAllocation ma(10);
ma.SetUp();
ma.Run();
ma.DisplayResults();
ma.Close();
}
#if MEM_POOL_FILL_BUG
TEST(rocrtst, Perf_Queue_Latency) {
QueueLatency ql;
ql.set_num_iteration(10);
ql.SetUp();
ql.Run();
ql.DisplayResults();
ql.Close();
}
TEST(rocrtst, Perf_System_Memory_Load_Bandwidth) {
SystemLoadBandwidth slb;
slb.SetUp();
slb.Run();
slb.DisplayResults();
slb.Close();
}
TEST(rocrtst, Perf_System_Memory_Store_Bandwidth) {
SystemStoreBandwidth ssb;
ssb.SetUp();
ssb.Run();
ssb.DisplayResults();
ssb.Close();
}
TEST(rocrtst, Perf_Memory_Copy) {
MemoryCopy mc;
mc.set_num_iteration(10);
mc.SetUp();
mc.Run();
mc.DisplayResults();
mc.Close();
}
#endif
#if 0
// These tests were not complete. Needs research/work.
TEST(rocrtst, Feature_Vector_Copy) {
VectorCopy vc;
vc.SetUp();
vc.Run();
vc.Close();
}
TEST(rocrtst, Perf_Matrix_Transpose) {
MatrixTranspose mt;
mt.SetUp();
mt.Run();
mt.DisplayResults();
mt.Close();
}
#endif
//#if NEED_TO_MAKE_BATCH
TEST(rocrtst, Perf_Memory_Async_Copy) {
MemoryAsyncCopy mac;
mac.set_num_iteration(10);
mac.SetUp();
mac.Run();
mac.DisplayResults();
mac.Close();
}
//#endif
// Program entry point. googletest consumes its own flags first; the
// remaining arguments are then parsed into the file-level verbosity and
// iteration options. A non-zero result from ProcessCmdline() ends the
// program with exit status 1; otherwise all registered tests are run.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);

  RocrtstOptions opts(&sRocrTstOptVerbosity, &sRocrTestOptIterations);

  return ProcessCmdline(&opts, argc, argv) ? 1 : RUN_ALL_TESTS();
}
-289
Просмотреть файл
@@ -1,289 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "matrix_transpose.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include "hsa/hsa_ext_finalize.h"
#include "gtest/gtest.h"
#include <stdlib.h>
#include <algorithm>
static const unsigned int NUM_BLOCK_SIZES = 2;
static const unsigned int blockSizes[NUM_BLOCK_SIZES] = {8, 16};
static const unsigned int NUM_MATRIX_DIMS = 2;
static const unsigned int matrixDims[NUM_MATRIX_DIMS] = {1024, 64};
// Construct the test with no buffers attached and every dimension/result
// zeroed. The real values are assigned later (see InitializeData() and
// SetUp()).
MatrixTranspose::MatrixTranspose(void) : BaseRocR() {
  // Buffers allocated from the HSA memory pool
  in_buffer_ = nullptr;
  out_buffer_ = nullptr;

  // Host-side buffers
  in_buffer_sys_ = nullptr;
  out_buffer_sys_ = nullptr;

  // Problem geometry
  block_size_ = 0;
  width_ = 0;
  height_ = 0;
  buf_size_ = 0;

  // Measured result
  time_mean_ = 0.0;
}
// Destructor; performs no cleanup of its own.
MatrixTranspose::~MatrixTranspose(void) {
}
// Prepare everything Run() needs: host data, HSA runtime, pool buffers,
// queue, kernel code object and a 2-D AQL dispatch packet.
// Returns early (without failing the test) if InitAndSetupHSA() does not
// succeed; any failing ASSERT_EQ also returns immediately.
void MatrixTranspose::SetUp(void) {
hsa_status_t err;
// Choose the matrix dimensions and fill the host-side buffers
InitializeData();
set_kernel_file_name("transpose_kernel.o");
set_kernel_name("&__OpenCL_matrixTranspose_kernel");
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
return;
}
hsa_agent_t* gpu_dev = gpu_device1();
hsa_agent_t* cpu_dev = cpu_device();
// Locate a pool on the CPU agent; the callback is expected to stop the
// iteration, hence the check for HSA_STATUS_INFO_BREAK rather than SUCCESS.
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
&cpu_pool());
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
// Allocate input and output buffers of buf_size_ bytes from that pool
err = hsa_amd_memory_pool_allocate(cpu_pool(), buf_size_, 0,
(void**) &in_buffer_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_memory_pool_allocate(cpu_pool(), buf_size_, 0,
(void**) &out_buffer_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Grant the GPU agent access to both pool buffers
err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, in_buffer_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, out_buffer_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Create a queue
// NOTE(review): the results of CreateQueue() and LoadKernelFromObjFile()
// are not inspected here - confirm whether they report failures that
// should abort the set-up.
hsa_queue_t* q = nullptr;
rocrtst::CreateQueue(*gpu_dev, &q);
set_main_queue(q);
rocrtst::LoadKernelFromObjFile(this);
// Fill up aql packet
rocrtst::InitializeAQLPacket(this, &aql());
// Two dispatch dimensions: block_size_ x block_size_ work-groups over a
// width_ x height_ grid
aql().setup = 0;
aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
aql().workgroup_size_x = block_size_;
aql().workgroup_size_y = block_size_;
aql().grid_size_x = width_;
aql().grid_size_y = height_;
// One uint of group memory per work-item in the work-group
aql().group_segment_size = sizeof(uint) * block_size_ * block_size_;
// Debug
#ifdef DEBUG
std::cout << "workgroup size: " << block_size_ << ", " << block_size_
<< ", " << 1 << std::endl;
std::cout << "grid size: " << aql().grid_size_x << ", " <<
aql().grid_size_y << ", " << aql().grid_size_z << std::endl;
std::cout << "group segment size: " << aql().group_segment_size << std::endl;
#endif
}
void MatrixTranspose::Run(void) {
hsa_status_t err;
hsa_agent_t* gpu_dev = gpu_device1();
if (!rocrtst::CheckProfile(this)) {
return;
}
// Allocate kernel parameter
typedef struct args_t {
uint* offset_0;
uint* offset_1;
uint* offset_2;
uint* printf_buffer;
uint* vqueue_buffer;
uint* aqlwrap_pointer;
uint* in_buf;
uint* out_buf;
uint* local_buf;
uint iblock_size;
uint iwidth;
uint iheight;
} args;
args* kern_ptr = NULL;
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
(void**) &kern_ptr);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, kern_ptr);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
kern_ptr->offset_0 = 0;
kern_ptr->offset_1 = 0;
kern_ptr->offset_2 = 0;
kern_ptr->printf_buffer = 0;
kern_ptr->vqueue_buffer = 0;
kern_ptr->aqlwrap_pointer = 0;
kern_ptr->in_buf = in_buffer_sys_;
kern_ptr->out_buf = out_buffer_sys_;
kern_ptr->local_buf = 0;
kern_ptr->iblock_size = block_size_;
kern_ptr->iwidth = width_;
kern_ptr->iheight = height_;
aql().kernarg_address = kern_ptr;
//Obtain the current queue write index.
uint64_t idx = hsa_queue_add_write_index_relaxed(main_queue(), 1);
((hsa_kernel_dispatch_packet_t*)(main_queue()->base_address))[idx] = aql();
rocrtst::PerfTimer p_timer;
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
((hsa_kernel_dispatch_packet_t*)(main_queue()->base_address))[idx].header |=
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
hsa_signal_store_release(main_queue()->doorbell_signal, idx);
//Wait on the dispatch signal until the kernel is finished.
hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE);
p_timer.StopTimer(id);
hsa_amd_profiling_dispatch_time_t dispatch_time;
err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(), &dispatch_time);
uint64_t stamp = dispatch_time.end - dispatch_time.start;
uint64_t freq;
err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
std::cout << "Kernel time is: " <<
(double) stamp / (double) freq * 1000.0 << std::endl;
hsa_signal_store_release(signal(), 1);
// Verify Results
VerifyResults (out_buffer_sys_);
// Abandon the first result which is warm up
time_mean_ = p_timer.ReadTimer(id); //rocrtst::CalcMean(timer);
}
// Print the measured transpose time. Nothing is printed when
// CheckProfile() rejects the current profile.
void MatrixTranspose::DisplayResults(void) const {
  if (rocrtst::CheckProfile(this)) {
    std::cout << "============================================" << std::endl;
    std::cout << "Matrix Transpose Mean Time: " << time_mean_ << std::endl;
  }
}
void MatrixTranspose::Close(void) {
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
void MatrixTranspose::InitializeData(void) {
// int openTest = 1;
block_size_ = 16; //blockSizes[openTest % NUM_BLOCK_SIZES];
width_ = 1920; //matrixDims[openTest / NUM_BLOCK_SIZES];
height_ = width_;
buf_size_ = width_ * height_ * sizeof(uint);
in_buffer_sys_ = (uint*) aligned_alloc(256, buf_size_);
SetData (in_buffer_sys_);
out_buffer_sys_ = (uint*) aligned_alloc(256, buf_size_);
FillData(out_buffer_sys_, 0xdeadbeef);
return;
}
// Writes to every element of a height_ x width_ row-major matrix the value
// row * width_ + column, i.e. the element's own linear index.
void MatrixTranspose::SetData(uint* buffer) {
  for (unsigned int row = 0; row < height_; ++row) {
    uint* const row_start = buffer + row * width_;
    for (unsigned int col = 0; col < width_; ++col) {
      row_start[col] = row * width_ + col;
    }
  }
}
// Stores `val` into the first width_ * height_ elements of `buffer`.
void MatrixTranspose::FillData(uint* buffer, unsigned int val) {
  unsigned int idx = 0;
  while (idx < width_ * height_) {
    buffer[idx] = val;
    ++idx;
  }
}
// Checks that `buffer` holds the transpose of the matrix written by SetData().
//
// Element (i, j) of the output, stored at buffer[i * height_ + j], must equal
// j * width_ + i. The first mismatch raises a fatal gtest failure, which
// returns from this function, so "PASSED!" is printed only when every element
// matches.
void MatrixTranspose::VerifyResults(uint* buffer) {
  for (unsigned int i = 0; i < width_; i++) {
    for (unsigned int j = 0; j < height_; j++) {
      ASSERT_EQ(*(buffer + i * height_ + j), j * width_ + i)
          << "transposed element mismatch at output row " << i
          << ", column " << j;
    }
  }

  std::cout << "PASSED!" << std::endl;
}
-101
Просмотреть файл
@@ -1,101 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_MATRIX_TRANSPOSE_H__
#define __ROCRTST_SRC_MATRIX_TRANSPOSE_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "hsa/hsa.h"
// Performance test that dispatches a matrix-transpose kernel, times it and
// verifies the transposed output on the host.
class MatrixTranspose: public rocrtst::BaseRocR, public PerfBase {
public:
// @brief Default constructor.
MatrixTranspose();
// @brief Destructor.
~MatrixTranspose();
// @brief Prepares the test before Run() is called.
virtual void SetUp();
// @brief Runs the measurement.
virtual void Run();
// @brief Runs the common rocrtst clean-up and asserts that it succeeded.
virtual void Close();
// @brief Prints the stored mean time (time_mean_); prints nothing when
// rocrtst::CheckProfile() rejects the test.
virtual void DisplayResults() const;
private:
// @brief Fills a height_ x width_ row-major matrix so that every element
// holds its own linear index.
virtual void SetData(uint* buffer);
// @brief Sets the first width_ * height_ elements of buffer to val.
virtual void FillData(uint* buffer, unsigned int val);
// @brief Asserts that buffer holds the transpose of the SetData() pattern
// and prints "PASSED!" on success.
virtual void VerifyResults(uint* buffer);
// @brief Sets block size and matrix dimensions, then allocates and
// initializes the host input/output buffers.
virtual void InitializeData();
// NOTE(review): in_buffer_/out_buffer_ are not referenced in the visible part
// of the implementation -- presumably device-side buffers; confirm.
uint* in_buffer_;
uint* out_buffer_;
// Host input matrix, allocated and filled in InitializeData().
uint* in_buffer_sys_;
// Host output matrix, allocated and pre-filled with a sentinel in
// InitializeData(); checked by VerifyResults().
uint* out_buffer_sys_;
// Matrix width in elements.
unsigned int width_;
// Matrix height in elements (set equal to width_ by InitializeData()).
unsigned int height_;
// Size of one matrix buffer in bytes.
unsigned int buf_size_;
// Block size passed to the kernel as iblock_size.
unsigned int block_size_;
// Timer reading stored after the dispatch; printed by DisplayResults().
double time_mean_;
// NOTE(review): not referenced in the visible part of the implementation.
hsa_barrier_and_packet_t bpkt;
};
#endif //__ROCRTST_SRC_MATRIX_TRANSPOSE_H__
-198
Просмотреть файл
@@ -1,198 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "memory_allocation.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "hsa/hsa.h"
#include "gtest/gtest.h"
#include <algorithm>
// Constructs the allocation benchmark.
//
// @param num_iters Number of measured iterations per data size. The value is
//                  forwarded to BaseRocR::set_num_iteration(); previously the
//                  argument was accepted but silently ignored, so callers
//                  could not change the iteration count through it.
MemoryAllocation::MemoryAllocation(uint32_t num_iters) :
  BaseRocR(), ptr(nullptr), allocation_time_ {0.0}, mem_pool_flag_(0) {
  set_num_iteration(num_iters);
}
// Nothing to release here; the destructor performs no work of its own.
MemoryAllocation::~MemoryAllocation() = default;
// Printable label for each entry of MemoryAllocation::Size (same index).
const char* MemoryAllocation::Str[16] = {
  "64K", "128K", "256K", "512K",
  "1M", "2M", "4M", "8M",
  "16M", "32M", "64M", "128M",
  "256M", "512M", "1G", "2G"
};
// Allocation sizes in bytes: successive powers of two from 64 KiB to 2 GiB.
const size_t MemoryAllocation::Size[16] = {
  static_cast<size_t>(1) << 16,  // 64K
  static_cast<size_t>(1) << 17,  // 128K
  static_cast<size_t>(1) << 18,  // 256K
  static_cast<size_t>(1) << 19,  // 512K
  static_cast<size_t>(1) << 20,  // 1M
  static_cast<size_t>(1) << 21,  // 2M
  static_cast<size_t>(1) << 22,  // 4M
  static_cast<size_t>(1) << 23,  // 8M
  static_cast<size_t>(1) << 24,  // 16M
  static_cast<size_t>(1) << 25,  // 32M
  static_cast<size_t>(1) << 26,  // 64M
  static_cast<size_t>(1) << 27,  // 128M
  static_cast<size_t>(1) << 28,  // 256M
  static_cast<size_t>(1) << 29,  // 512M
  static_cast<size_t>(1) << 30,  // 1G
  static_cast<size_t>(1) << 31   // 2G
};
void MemoryAllocation::SetUp() {
hsa_status_t err;
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
return;
}
hsa_agent_t* cpu_dev = cpu_device();
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
&cpu_pool());
EXPECT_EQ(err, HSA_STATUS_INFO_BREAK);
if (err != HSA_STATUS_INFO_BREAK) {
std::cout << "Unable to find global pool. Test will not be run."
<< std::endl;
return;
}
//At this point, cpu_pool() should be in the global segment
err = hsa_amd_memory_pool_get_info(cpu_pool(),
(hsa_amd_memory_pool_info_t) HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
&mem_pool_flag_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Times hsa_amd_memory_pool_allocate() on the CPU pool for each of the 16
// sizes in Size[], RealIterationNum() times per size, and stores the value
// returned by GetMeanTime() for each size in allocation_time_[].
//
// Only the allocation call is inside the timed region; every block is freed
// immediately afterwards. Does nothing when the profile check fails or no CPU
// pool was found during SetUp().
void MemoryAllocation::Run() {
  if (!rocrtst::CheckProfile(this) || cpu_pool().handle == 0) {
    return;
  }

  const size_t iterations = RealIterationNum();

  for (int i = 0; i < 16; ++i) {
    std::vector<double> time;

    for (uint32_t it = 0; it < iterations; ++it) {
#if DEBUG
      std::cout << "." << std::flush;
#endif
      rocrtst::PerfTimer allocation_timer;
      const int index = allocation_timer.CreateTimer();

      allocation_timer.StartTimer(index);
      hsa_status_t err =
          hsa_amd_memory_pool_allocate(cpu_pool(), Size[i], 0, &ptr);
      allocation_timer.StopTimer(index);
      ASSERT_EQ(err, HSA_STATUS_SUCCESS);

      // Release the block straight away so the next iteration starts clean.
      err = hsa_amd_memory_pool_free(ptr);
      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
      ptr = nullptr;

      // Record this iteration's timer reading.
      time.push_back(allocation_timer.ReadTimer(index));
    }

#if DEBUG
    std::cout << std::endl;
#endif
    // Reduce the samples to a single figure for this size.
    allocation_time_[i] = GetMeanTime(time);
  }
}
// Number of iterations actually executed per size: 1.2 times the configured
// iteration count plus one, truncated to an integer. GetMeanTime() discards
// the surplus samples.
size_t MemoryAllocation::RealIterationNum() {
  return static_cast<size_t>(num_iteration() * 1.2 + 1);
}
// Returns a trimmed mean of the samples in `vec`.
//
// The samples are sorted ascending; the smallest one is dropped, then a
// further 10% of num_iteration() of the smallest ones, and finally everything
// beyond the first num_iteration() remaining samples. The mean of what is
// left is returned.
//
// @param vec Timing samples; sorted and trimmed in place.
// @return Mean of the retained samples, or 0.0 when none are retained.
//
// Every erase is clamped to the current vector size, so a vector shorter than
// RealIterationNum() (including an empty one) no longer triggers undefined
// behaviour, and an empty result no longer yields 0/0 = NaN.
double MemoryAllocation::GetMeanTime(std::vector<double>& vec) {
  typedef std::vector<double>::difference_type Offset;

  if (vec.empty()) {
    return 0.0;
  }

  std::sort(vec.begin(), vec.end());

  // Drop the single smallest sample.
  vec.erase(vec.begin());

  // Drop the next 10% (relative to the configured iteration count).
  const size_t low_trim =
      std::min(static_cast<size_t>(num_iteration() * 0.1), vec.size());
  vec.erase(vec.begin(), vec.begin() + static_cast<Offset>(low_trim));

  // Keep at most num_iteration() samples.
  const size_t keep = static_cast<size_t>(num_iteration());
  if (vec.size() > keep) {
    vec.erase(vec.begin() + static_cast<Offset>(keep), vec.end());
  }

  if (vec.empty()) {
    return 0.0;
  }

  double mean = 0.0;
  for (const double sample : vec) {
    mean += sample;
  }
  return mean / vec.size();
}
// Prints one table row per size: label, stored allocation time, and the
// figure 2 * size / time scaled down by 1024^3 (labelled BandWidth(GB/s)).
// Prints nothing when the profile check fails.
void MemoryAllocation::DisplayResults() const {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  printf("==============================================\n");
  printf(" Data Size Allocation_time BandWidth(GB/s)\n");

  for (int row = 0; row < 16; ++row) {
    const double elapsed = allocation_time_[row];
    const double rate = 2 * Size[row] / elapsed / 1024 / 1024 / 1024;
    printf(" %9s %15.6f %15.6f\n", Str[row], elapsed, rate);
  }

  printf("==============================================\n");
}
void MemoryAllocation::Close() {
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
return;
}
-98
Просмотреть файл
@@ -1,98 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_MEMORY_MEM_ALLOCATION_H__
#define __ROCRTST_SRC_MEMORY_MEM_ALLOCATION_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "common/hsatimer.h"
#include "hsa/hsa.h"
#include <vector>
// Performance test that measures how long hsa_amd_memory_pool_allocate()
// takes on the CPU memory pool for 16 sizes ranging from 64K to 2G.
class MemoryAllocation: public rocrtst::BaseRocR, public PerfBase {
public:
// @brief Constructor.
// @param num_iters Requested iteration count (defaults to 100).
MemoryAllocation(uint32_t num_iters = 100);
// @brief Destructor.
virtual ~MemoryAllocation();
// @brief Initializes HSA, finds a global CPU memory pool and reads its
// global flags.
virtual void SetUp();
// @brief Times the allocation of every size in Size[] and stores the
// per-size result in allocation_time_[].
virtual void Run();
// @brief Prints a table with one row per size.
virtual void DisplayResults() const;
// @brief Runs the common rocrtst clean-up and asserts that it succeeded.
virtual void Close();
protected:
// @brief Pointer to the block most recently returned by the HSA memory
// allocation API; reset to null after the block is freed.
void* ptr;
// @brief Per-size result of GetMeanTime(), indexed like Size[] and Str[].
double allocation_time_[16];
private:
// @brief Allocation sizes in bytes and their printable labels (same index).
static const size_t Size[16];
static const char* Str[16];
// Global flags of the CPU pool, read in SetUp() with
// HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS.
uint32_t mem_pool_flag_;
// @brief Number of iterations actually executed per size
// (num_iteration() * 1.2 + 1).
size_t RealIterationNum();
// @brief Sorts and trims the samples in vec in place and returns the mean
// of the retained ones.
double GetMeanTime(std::vector<double>& vec);
};
#endif
Обычный файл → Исполняемый файл
+392 -520
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+120 -137
Просмотреть файл
@@ -43,199 +43,182 @@
*
*/
#ifndef __ROCRTST_SRC_MEMORY_ASYNC_COPY_H__
#define __ROCRTST_SRC_MEMORY_ASYNC_COPY_H__
#ifndef ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_
#define ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_
#include <vector>
#include <algorithm>
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "common/common.h"
#include "common/hsatimer.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include <unistd.h>
#include <algorithm>
#include <vector>
#include <cctype>
#include "suites/test_common/test_base.h"
extern int mac_argc;
extern char** mac_argv;
// Kind of copy a transaction performs.
// NOTE(review): the names presumably stand for host-to-device,
// device-to-host and peer-to-peer -- confirm against the implementation.
typedef enum TransType {H2D = 0, D2H, P2P} TransType;
typedef struct transaction {
typedef struct Transaction {
int src;
int dst;
hsa_signal_t signal;
size_t size;
size_t num_dep_signal;
hsa_signal_t* dep_signal;
} transaction;
size_t max_size; // Max. amount of kBytes to copy
TransType type;
// BenchMark copy time
std::vector<double> *benchmark_copy_time;
// Min time
std::vector<double> *min_time;
} Transaction;
typedef struct agent_info {
agent_info(hsa_agent_t agent, int index, hsa_device_type_t device_type) {
agent_ = agent;
index_ = index;
device_type_ = device_type;
}
agent_info() {
}
hsa_agent_t agent_;
int index_;
hsa_device_type_t device_type_;
} agent_info;
class AgentInfo {
public:
AgentInfo(hsa_agent_t agent, int index, hsa_device_type_t device_type) {
agent_ = agent;
index_ = index;
device_type_ = device_type;
}
AgentInfo() {}
~AgentInfo() {}
hsa_agent_t agent(void) const {return agent_;}
hsa_device_type_t device_type(void) const {return device_type_;}
hsa_agent_t agent_;
int index_;
private:
hsa_device_type_t device_type_;
};
class PoolInfo {
public:
PoolInfo(hsa_amd_memory_pool_t pool, int index,
hsa_amd_segment_t segment, bool is_fine_graind, size_t size,
AgentInfo *agent_info) {
pool_ = pool;
index_ = index;
segment_ = segment;
is_fine_grained_ = is_fine_graind;
allocable_size_ = size;
owner_agent_info_ = agent_info;
}
PoolInfo() {}
~PoolInfo() {}
AgentInfo* owner_agent_info(void) const {return owner_agent_info_;}
hsa_amd_memory_pool_t pool_;
int index_;
hsa_amd_segment_t segment_;
bool is_fine_grained_;
size_t allocable_size_;
private:
AgentInfo *owner_agent_info_;
};
typedef struct region_info {
region_info(hsa_amd_memory_pool_t region, int index,
hsa_amd_segment_t segment, bool is_fine_graind, size_t size,
hsa_agent_t agent) {
region_ = region;
index_ = index;
segment_ = segment;
is_fine_grained_ = is_fine_graind;
allocable_size_ = size;
owner_agent_ = agent;
}
region_info() {
}
hsa_amd_memory_pool_t region_;
int index_;
hsa_amd_segment_t segment_;
bool is_fine_grained_;
size_t allocable_size_;
hsa_agent_t owner_agent_;
} region_info;
// Used to print out topology info
// Groups one agent with the memory regions recorded for it.
typedef struct node_info {
// Default constructor: members are default-constructed.
node_info() {
}
// Agent this node describes.
agent_info agent;
// Regions recorded for the agent.
std::vector<region_info> region;
} node_info;
// Groups one agent with the memory pools recorded for it.
typedef struct NodeInfo {
// Agent this node describes.
AgentInfo agent;
// Pools recorded for the agent.
std::vector<PoolInfo> pool;
} NodeInfo;
hsa_status_t AgentInfo(hsa_agent_t agent, void* data);
hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data);
class MemoryAsyncCopy: public rocrtst::BaseRocR, public PerfBase {
class MemoryAsyncCopy : public TestBase {
public:
MemoryAsyncCopy();
//@Brief: Destructor for test case of MemoryAsyncCopy
// @Brief: Destructor for test case of MemoryAsyncCopy
virtual ~MemoryAsyncCopy();
//@Brief: Setup the environment for measurement
// @Brief: Setup the environment for measurement
virtual void SetUp();
//@Brief: Core measurement execution
// @Brief: Core measurement execution
virtual void Run();
//@Brief: Clean up and retrieve the resource
// @Brief: Clean up and retrieve the resource
virtual void Close();
//@Brief: Display results
// @Brief: Display results
virtual void DisplayResults() const;
// There are 3 levels of testing, from quickest/very specific to
// longest/most complete:
// 1. to and from a specified source to a specified target
// 2. to and from the cpu to 1 gpu, and to/from a gpu to another gpu
// (if available)
// 3. to and from the cpu to 1 gpu and, to/from every gpu to every
// other gpu
// The default is #2 above. If *both* a source and dest. are set for #1
// above, then that overrides both #2 and #3
void set_src_pool(int pool_id) {src_pool_id_ = pool_id;}
void set_dst_pool(int pool_id) {dst_pool_id_ = pool_id;}
void set_full_test(bool full_test) {do_full_test_ = full_test;}
int pool_index(void) const {return pool_index_;}
void set_pool_index(int i) {pool_index_ = i;}
int agent_index(void) const {return agent_index_;}
void set_agent_index(int i) {agent_index_ = i;}
std::vector<PoolInfo *> *pool_info(void) {return &pool_info_;}
std::vector<AgentInfo *> *agent_info(void) {return &agent_info_;}
std::vector<NodeInfo> *node_info(void) {return &node_info_;}
// @Brief: Display information about what this test does
virtual void DisplayTestInfo(void);
private:
//@Brief: Get real iteration number
virtual size_t RealIterationNum();
// @Brief: Get real iteration number
virtual size_t RealIterationNum(void);
//@Brief: Get the mean copy time
virtual double GetMeanTime(std::vector<double>& vec);
// @Brief: Get the mean copy time
double GetMeanTime(std::vector<double>* vec);
//@Brief: Get the min copy time
virtual double GetMinTime(std::vector<double>& vec);
// @Brief: Find and print out the needed topology info
void FindTopology(void);
//@Brief: Find and print out the needed topology info
void FindTopology();
// @Brief: Run for Benchmark mode with verification
void RunBenchmarkWithVerification(Transaction *t);
//@Brief: Parse the argument and interact with the user
// to fill the vectors.
void ParseArgument();
// @Brief: Display Benchmark result
void DisplayBenchmark(Transaction *t) const;
//@Brief: Run for Benchmark mode
void RunBenchmark();
// @Brief: Print topology info
void PrintTopology(void);
//@Brief: Run for Benchmark mode with verification
void RunBenchmarkWithVerification();
void ConstructTransactionList(void);
//@Brief: Display Benchmark result
void DisplayBenchmark();
// @Brief: Find system region
void FindSystemPool(void);
//@Brief: Run user defined
void RunNormal();
//@Brief: Print topology info
void PrintTopology();
//@Brief: Find system region
void FindSystemRegion();
//@Brief: Check whether the agent can access the memory pool; if so, grant
//access to the agent; if not, exit
void AcquireAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool, void* ptr);
friend hsa_status_t AgentInfo(hsa_agent_t agent, void* data);
friend hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data);
protected:
// More variables declared for testing
std::vector<transaction> tran_;
std::vector<Transaction> tran_;
// Variable used to store agent info, indexed by agent_index_
std::vector<agent_info> agent_info_;
std::vector<AgentInfo *> agent_info_;
// Variable used to store region info, indexed by region_index_
std::vector<region_info> region_info_;
// Variable used to store region info, indexed by pool_index_
std::vector<PoolInfo *> pool_info_;
// Variable to store argument number
int argc_;
// Pointer to store address of argument text
char** argv_;
// To store node info
std::vector<NodeInfo> node_info_;
// Variable to help count agent index
int agent_index_;
// Variable to help count region index
int region_index_;
// BenchMark mode by default
bool bench_mark_mode_;
// BenchMark copy time
std::vector<double> benchmark_copy_time_;
// Min time
std::vector<double> min_time_;
// User define copy time
double user_copy_time_;
int pool_index_;
// Verification result
bool verified_;
// If it needs verification
bool verification_;
// To store node info
std::vector<node_info> node_info_;
// Store the testing level
int src_pool_id_;
int dst_pool_id_;
bool do_full_test_;
// System region
hsa_amd_memory_pool_t sys_region_;
hsa_amd_memory_pool_t sys_pool_;
// CPU agent used for verification
hsa_agent_t cpu_agent_;
constexpr const static char* help_info =
MULTILINE(. / memory_async_copy - f source_region - t dst_region - s data_size_in_KB - r[y | n] - i iteration_number - b\n\
\n\
-h Help info \n\
-f Memory Pool where data copy from \n\
-t Memory Pool where data copy to \n\
-s Size of copy data, 256MB by default \n\
-r If wants to add more copy \n\
-i Iteration number for each copy \n\
-b Enable benchmark mode \n\
Note : -f - t must be specified\n);
rocrtst::PerfTimer copy_timer_;
};
#endif
#endif // ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_
-411
Просмотреть файл
@@ -1,411 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "memory_copy.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "hsa/hsa.h"
#include "gtest/gtest.h"
#include <algorithm>
// Constructs the copy benchmark with all buffer pointers cleared and the
// required profile set to HSA_PROFILE_BASE.
// NOTE(review): the `num` argument is not used anywhere in this constructor --
// confirm whether it was meant to configure the iteration count.
MemoryCopy::MemoryCopy(size_t num)
    : BaseRocR() {
  // Host-side buffers.
  ptr_src_ = nullptr;
  ptr_dst_ = nullptr;

  // Device-side buffers.
  ptr_dev_src_ = nullptr;
  ptr_dev_dst_ = nullptr;

  device_region_.handle = 0;
  set_requires_profile(HSA_PROFILE_BASE);
}
// The destructor performs no work of its own; buffers are released in Close().
MemoryCopy::~MemoryCopy() = default;
// Printable label for each entry of MemoryCopy::Size (same index).
const char* MemoryCopy::Str[16] = {
  "64K", "128K", "256K", "512K",
  "1M", "2M", "4M", "8M",
  "16M", "32M", "64M", "128M",
  "256M", "512M", "1G", "2G"
};
// Copy sizes in bytes: successive powers of two from 64 KiB to 2 GiB.
const size_t MemoryCopy::Size[16] = {
  static_cast<size_t>(1) << 16,  // 64K
  static_cast<size_t>(1) << 17,  // 128K
  static_cast<size_t>(1) << 18,  // 256K
  static_cast<size_t>(1) << 19,  // 512K
  static_cast<size_t>(1) << 20,  // 1M
  static_cast<size_t>(1) << 21,  // 2M
  static_cast<size_t>(1) << 22,  // 4M
  static_cast<size_t>(1) << 23,  // 8M
  static_cast<size_t>(1) << 24,  // 16M
  static_cast<size_t>(1) << 25,  // 32M
  static_cast<size_t>(1) << 26,  // 64M
  static_cast<size_t>(1) << 27,  // 128M
  static_cast<size_t>(1) << 28,  // 256M
  static_cast<size_t>(1) << 29,  // 512M
  static_cast<size_t>(1) << 30,  // 1G
  static_cast<size_t>(1) << 31   // 2G
};
// Initializes HSA, locates the CPU and GPU memory pools, allocates the four
// benchmark buffers and initializes the host source buffer.
//
// Buffers:
//   ptr_src_, ptr_dst_         - Size[12] (256M) each, from the CPU pool
//   ptr_dev_src_, ptr_dev_dst_ - Size[11] (128M) each, from the GPU pool
//
// Every byte of ptr_src_ is set to 1, which is what the DEBUG verification in
// Run() expects. hsa_amd_memory_fill() writes `count` 32-bit words, so the
// pattern is 0x01010101 and the count is the buffer size divided by
// sizeof(uint32_t). Passing the byte count with value 1 (as before) wrote
// four times past the end of the buffer and left only every fourth byte
// equal to 1.
void MemoryCopy::SetUp() {
  hsa_status_t err;

  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
    return;
  }

  hsa_agent_t* gpu_dev = gpu_device1();
  hsa_agent_t* cpu_dev = cpu_device();

  // Find a global memory pool on the CPU agent for the host buffers. The
  // iteration is expected to stop early with HSA_STATUS_INFO_BREAK.
  err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
                                           &cpu_pool());
  ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
  ASSERT_NE(cpu_pool().handle, 0);

  // Find a memory pool on the first GPU for the device buffers.
  err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool,
                                           &device_pool());
  ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
  ASSERT_NE(device_pool().handle, 0);

  // Allocate the host buffers (256M each).
  err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[12], 0, &ptr_src_);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
  err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[12], 0, &ptr_dst_);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Allocate the device buffers (128M each).
  err = hsa_amd_memory_pool_allocate(device_pool(), Size[11], 0, &ptr_dev_src_);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
  err = hsa_amd_memory_pool_allocate(device_pool(), Size[11], 0, &ptr_dev_dst_);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Assign ownership of the device buffers to the GPU.
  err = hsa_memory_assign_agent(ptr_dev_src_, *gpu_dev,
                                HSA_ACCESS_PERMISSION_RW);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
  err = hsa_memory_assign_agent(ptr_dev_dst_, *gpu_dev,
                                HSA_ACCESS_PERMISSION_RW);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Initialize the host source buffer so that every byte equals 1.
  const uint32_t fill_pattern = 0x01010101u;
  err = hsa_amd_memory_fill(ptr_src_, fill_pattern,
                            Size[12] / sizeof(uint32_t));
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Check if the initialization is correct
#if DEBUG
  std::cout << "Value after setting source buffer is: "
            << (int)((uint8_t*)ptr_src_)[0] << std::endl;
#endif
}
void MemoryCopy::Run() {
hsa_status_t err;
if (!rocrtst::CheckProfile(this)) {
return;
}
uint32_t iterations = RealIterationNum();
//Iteration over the different data size on system memory
for (int i = 0; i < 13; i++) {
std::vector<double> time;
for (uint32_t it = 0; it < iterations; it++) {
#if DEBUG
std::cout << ".";
fflush(stdout);
#endif
rocrtst::PerfTimer copy_timer;
int index = copy_timer.CreateTimer();
copy_timer.StartTimer(index);
err = hsa_memory_copy(ptr_dst_, ptr_src_, Size[i]);
copy_timer.StopTimer(index);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Push the result back to vector time
time.push_back(copy_timer.ReadTimer(index));
#if DEBUG
//Check if the data copied is correct
uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
for (uint32_t j = 0; j < Size[i]; j++) {
ASSERT_EQ(temp_ptr[j], 1);
}
#endif
}
#if DEBUG
std::cout << std::endl;
#endif
//Get mean copy time and store to the array
sys2sys_copy_time_.push_back(GetMeanTime(time));
}
//Copy from system memory to device memory
for (int i = 0; i < 12; i++) {
std::vector<double> time;
for (uint32_t it = 0; it < iterations; it++) {
#if DEBUG
std::cout << ".";
fflush(stdout);
#endif
rocrtst::PerfTimer copy_timer;
int index = copy_timer.CreateTimer();
copy_timer.StartTimer(index);
err = hsa_memory_copy(ptr_dev_src_, ptr_src_, Size[i]);
copy_timer.StopTimer(index);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Push the result back to vector time
time.push_back(copy_timer.ReadTimer(index));
#if DEBUG
//Check if the data copied is correct
uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
for (uint32_t j = 0; j < Size[i]; j++) {
ASSERT_EQ(temp_ptr[j], 1);
}
#endif
}
#if DEBUG
std::cout << std::endl;
#endif
//Get mean copy time and store to the array
sys2dev_copy_time_.push_back(GetMeanTime(time));
}
//Copy from device memory to device memory
for (int i = 0; i < 12; i++) {
std::vector<double> time;
for (uint32_t it = 0; it < iterations; it++) {
#if DEBUG
std::cout << ".";
fflush(stdout);
#endif
rocrtst::PerfTimer copy_timer;
int index = copy_timer.CreateTimer();
copy_timer.StartTimer(index);
err = hsa_memory_copy(ptr_dev_dst_, ptr_dev_src_, Size[i]);
copy_timer.StopTimer(index);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Push the result back to vector time
time.push_back(copy_timer.ReadTimer(index));
#if DEBUG
//Check if the data copied is correct
uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
for (uint32_t j = 0; j < Size[i]; j++) {
ASSERT_EQ(temp_ptr[j], 1);
}
#endif
}
#if DEBUG
std::cout << std::endl;
#endif
//Get mean copy time and store to the array
dev2dev_copy_time_.push_back(GetMeanTime(time));
}
//Copy from device memory to system memory
for (int i = 0; i < 12; i++) {
std::vector<double> time;
for (uint32_t it = 0; it < iterations; it++) {
#if DEBUG
std::cout << ".";
fflush(stdout);
#endif
rocrtst::PerfTimer copy_timer;
int index = copy_timer.CreateTimer();
copy_timer.StartTimer(index);
err = hsa_memory_copy(ptr_dst_, ptr_dev_src_, Size[i]);
copy_timer.StopTimer(index);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Push the result back to vector time
time.push_back(copy_timer.ReadTimer(index));
#if DEBUG
//Check if the data copied is correct
uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
for (uint32_t j = 0; j < Size[i]; j++) {
if (temp_ptr[j] != 1) {
ASSERT_EQ(temp_ptr[j], 1);
}
}
#endif
}
#if DEBUG
std::cout << std::endl;
#endif
//Get mean copy time and store to the array
dev2sys_copy_time_.push_back(GetMeanTime(time));
}
}
// Number of iterations actually executed per size: 1.2 times the configured
// iteration count plus one, truncated to an integer. GetMeanTime() discards
// the surplus samples.
size_t MemoryCopy::RealIterationNum() {
  return static_cast<size_t>(num_iteration() * 1.2 + 1);
}
// Returns a trimmed mean of the samples in `vec`.
//
// The samples are sorted ascending; the smallest one is dropped, then a
// further 10% of num_iteration() of the smallest ones, and finally everything
// beyond the first num_iteration() remaining samples. The mean of what is
// left is returned.
//
// @param vec Timing samples; sorted and trimmed in place.
// @return Mean of the retained samples, or 0.0 when none are retained.
//
// Every erase is clamped to the current vector size, so a vector shorter than
// RealIterationNum() (including an empty one) no longer triggers undefined
// behaviour, and an empty result no longer yields 0/0 = NaN.
double MemoryCopy::GetMeanTime(std::vector<double>& vec) {
  typedef std::vector<double>::difference_type Offset;

  if (vec.empty()) {
    return 0.0;
  }

  std::sort(vec.begin(), vec.end());

  // Drop the single smallest sample.
  vec.erase(vec.begin());

  // Drop the next 10% (relative to the configured iteration count).
  const size_t low_trim =
      std::min(static_cast<size_t>(num_iteration() * 0.1), vec.size());
  vec.erase(vec.begin(), vec.begin() + static_cast<Offset>(low_trim));

  // Keep at most num_iteration() samples.
  const size_t keep = static_cast<size_t>(num_iteration());
  if (vec.size() > keep) {
    vec.erase(vec.begin() + static_cast<Offset>(keep), vec.end());
  }

  if (vec.empty()) {
    return 0.0;
  }

  double mean = 0.0;
  for (const double sample : vec) {
    mean += sample;
  }
  return mean / vec.size();
}
void MemoryCopy::DisplayResults() const {
if (!rocrtst::CheckProfile(this)) {
return;
}
printf(
"================ System to System ==================================\n");
printf(" Data Size BandWidth(GB/s)\n");
//Output the BW of system memory to system memory
for (int i = 0; i < 13; i++) {
double band_width = (double) Size[i] / sys2sys_copy_time_[i] / 1024 / 1024
/ 1024 * 2;
#ifdef DEBUG
printf("size: %zu time: %f\n", Size[i], sys2sys_copy_time_[i]);
#endif
printf(" %s %lf\n", Str[i], band_width);
}
printf(
"================ System to Device ===================================\n");
for (int i = 0; i < 12; i++) {
double band_width = (double) Size[i] / sys2dev_copy_time_[i] / 1024 / 1024
/ 1024 * 2;
#ifdef DEBUG
printf("size: %zu time: %f\n", Size[i], sys2dev_copy_time_[i]);
#endif
printf(" %s %lf\n", Str[i], band_width);
}
printf(
"================ Device to Device ===================================\n");
for (int i = 0; i < 12; i++) {
double band_width = (double) Size[i] / dev2dev_copy_time_[i] / 1024 / 1024
/ 1024 * 2;
#ifdef DEBUG
printf("size: %zu time: %f\n", Size[i], dev2dev_copy_time_[i]);
#endif
printf(" %s %lf\n", Str[i], band_width);
}
printf(
"================ Device to System ===================================\n");
for (int i = 0; i < 12; i++) {
double band_width = (double) Size[i] / dev2sys_copy_time_[i] / 1024 / 1024
/ 1024 * 2;
#ifdef DEBUG
printf("size: %zu time: %f\n", Size[i], dev2sys_copy_time_[i]);
#endif
printf(" %s %lf\n", Str[i], band_width);
}
printf("===================================================\n");
return;
}
void MemoryCopy::Close() {
hsa_status_t err;
//Free the memory allocated
err = hsa_memory_free(ptr_src_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_memory_free(ptr_dst_);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
ptr_src_ = NULL;
ptr_dst_ = NULL;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
return;
}
-109
Просмотреть файл
@@ -1,109 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_MEMORY_MEM_COPY_H__
#define __ROCRTST_SRC_MEMORY_MEM_COPY_H__
#include "common/base_rocr.h"
#include "perf_common/perf_base.h"
#include "hsa/hsa.h"
#include "common/hsatimer.h"
#include <vector>
// Memory-copy bandwidth test. Per-size copy times are kept for four
// directions (system-to-system, system-to-device, device-to-system and
// device-to-device) and printed as bandwidth tables by DisplayResults().
class MemoryCopy: public rocrtst::BaseRocR, public PerfBase {
public:
//@Brief: Constructor for test case of MemoryCopy.
// NOTE(review): `num` (default 100) is presumably the iteration count --
// confirm against the constructor definition, which is not visible here.
MemoryCopy(size_t num = 100);
//@Brief: Destructor for test case of MemoryCopy
virtual ~MemoryCopy();
//@Brief: Set up the environment for measurement
virtual void SetUp();
//@Brief: Core measurement execution
virtual void Run();
//@Brief: Free the test buffers and run the common clean-up
virtual void Close();
//@Brief: Print one bandwidth table per copy direction
virtual void DisplayResults() const;
private:
//@Brief: Copy sizes under test (Size) and the label printed for each size
// (Str); entries with the same index belong together.
static const size_t Size[16];
static const char* Str[16];
//@Brief: Get real iteration number
virtual size_t RealIterationNum();
//@Brief: Get the mean of the copy times stored in vec
virtual double GetMeanTime(std::vector<double>& vec);
protected:
//@Brief: More variables declared for testing
//@Brief: Source pointer from which data is copied
void* ptr_src_;
//@Brief: Destination pointer to which data is copied
void* ptr_dst_;
//@Brief: Pointers to device memory
void* ptr_dev_src_;
void* ptr_dev_dst_;
//@Brief: Timer results for each copy direction, indexed like Size/Str
std::vector<double> sys2sys_copy_time_;
std::vector<double> sys2dev_copy_time_;
std::vector<double> dev2sys_copy_time_;
std::vector<double> dev2dev_copy_time_;
//@Brief: Device memory region
hsa_region_t device_region_;
};
#endif
-284
Просмотреть файл
@@ -1,284 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "queue_concurrency.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "common/os.h"
#include "hsa/hsa_ext_finalize.h"
#include "gtest/gtest.h"
#include <math.h>
#include <thread>
// Build the test with eight zero-initialised timing slots (one per worker
// thread started by Run()), interrupts enabled and the full HSA profile
// required.
QueueConcurrency::QueueConcurrency()
    : BaseRocR(), execution_time_(8) {
  std_time_ = 0.0;
  queue_num_ = 0;

  set_enable_interrupt(true);
  set_requires_profile(HSA_PROFILE_FULL);
}
// Nothing to release here; test resources are cleaned up in Close().
QueueConcurrency::~QueueConcurrency() = default;
// Initialise HSA, load the kernel and measure a reference dispatch time.
//
// The kernel is dispatched twice on a queue created here; the wall-clock time
// of the second dispatch is stored in std_time_. The queue is destroyed
// before returning. Returns early (without measuring) if HSA initialisation
// fails; an ASSERT_EQ failure also aborts the remaining steps.
void QueueConcurrency::SetUp() {
  hsa_status_t err;

  set_kernel_file_name("test_kernel.o");
  set_kernel_name("&__OpenCL_vec_assign_kernel");

  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
    return;
  }

  rocrtst::LoadKernelFromObjFile(this);

  hsa_agent_t* gpu_dev = gpu_device1();

  // Fill in the part of the AQL packet which is the same across the threads.
  rocrtst::InitializeAQLPacket(this, &aql());

  // Create the queue for the reference dispatches. CreateQueue() writes the
  // new handle into the local `q`, so every access below goes through `q`.
  // (The previous code dispatched to and destroyed main_queue(), which this
  // function never updates, leaving the queue created here unused and leaked.)
  hsa_queue_t* q = main_queue();
  rocrtst::CreateQueue(*gpu_dev, &q);

  for (int i = 0; i < 2; i++) {
    // Output of kernel
    int output = 0;

    // Iteration number
    int iterations = 1024 * 1024;  // * 1024;

    struct ALIGNED_(16)
    args_t {
      void* arg0;
      int arg1;
    } local_args;

    local_args.arg0 = (void*) &output;
    local_args.arg1 = iterations;

    err = hsa_memory_register(&local_args, sizeof(local_args));
    ASSERT_EQ(err, HSA_STATUS_SUCCESS);

    // Obtain the current queue write index.
    uint64_t index = hsa_queue_add_write_index_relaxed(q, 1);

    // Write the aql packet at the calculated queue index address.
    const uint32_t queue_mask = q->size - 1;
    hsa_kernel_dispatch_packet_t* pkt_addr =
        (hsa_kernel_dispatch_packet_t*) (q->base_address);
    (pkt_addr)[index & queue_mask] = aql();
    (pkt_addr)[index & queue_mask].completion_signal = signal();
    (pkt_addr)[index & queue_mask].kernarg_address = &local_args;

    // Start timing, then publish the packet type and ring the doorbell to
    // dispatch the kernel. The header is written last, after the rest of the
    // packet is in place.
    rocrtst::PerfTimer p_timer;
    int id = p_timer.CreateTimer();
    p_timer.StartTimer(id);

    (pkt_addr)[index & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
        << HSA_PACKET_HEADER_TYPE;
    hsa_signal_store_screlease(q->doorbell_signal, index);

    // Wait on the dispatch signal until the kernel is finished.
    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
      ;

    p_timer.StopTimer(id);

    // Re-arm the completion signal for the next dispatch.
    hsa_signal_store_screlease(signal(), 1);

    // Only the second dispatch is recorded; the first one is not kept.
    if (1 == i) {
      std_time_ = p_timer.ReadTimer(id);
    }
  }

  // Destroy the queue created above.
  err = hsa_queue_destroy(q);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Run ThreadFunc() on eight concurrent threads, wait for all of them, then
// derive queue_num_ from the recorded timings. Does nothing when
// rocrtst::CheckProfile() rejects this test.
void QueueConcurrency::Run() {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  // One worker per entry of execution_time_; the worker index is passed to
  // ThreadFunc() as its thread ID.
  const int kWorkerCount = 8;
  std::vector<std::thread> workers;

  for (int id = 0; id < kWorkerCount; ++id) {
    workers.emplace_back(&QueueConcurrency::ThreadFunc, this, id);
  }

  // Block until every worker has finished.
  for (std::thread& worker : workers) {
    worker.join();
  }

  CalculateQueueNum();
}
// Count the threads whose measured time is within 10% of the time expected
// from thread 0's measurement, and add that count to queue_num_.
void QueueConcurrency::CalculateQueueNum() {
  for (int id = 0; id < 8; ++id) {
    // ThreadFunc() gives thread `id` 1/2^id of thread 0's kernel iterations,
    // so the expected time is thread 0's time divided by 2^id.
    const double expected_time = execution_time_[0] / (1 << id);

    // sqrt of the squared difference, i.e. its magnitude.
    const double delta = expected_time - execution_time_[id];
    const double deviation = sqrt(delta * delta);

    if (deviation < 0.1 * expected_time) {
      ++queue_num_;
    }
  }
}
// Print the eight per-thread timings and the derived concurrent-queue count,
// then assert that the count equals 3. Prints nothing when
// rocrtst::CheckProfile() rejects this test.
void QueueConcurrency::DisplayResults() const {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  for (int id = 0; id != 8; ++id) {
    std::cout << execution_time_[id] << std::endl;
  }

  std::cout << "Number of Concurrent Queue is: " << queue_num_ << std::endl;

  ASSERT_EQ(queue_num_, 3);
}
void QueueConcurrency::Close() {
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Worker body: create a private queue and completion signal, dispatch the
// kernel num_iteration() times and store the mean dispatch time in
// execution_time_[threadID].
//
// threadID: index of this worker; it selects the result slot and halves the
//           kernel iteration count for each step (1024*1024 / 2^threadID).
//
// The first sample is treated as warm-up and dropped, but only when other
// samples remain. When no dispatch was timed at all the result slot is left
// untouched.
//
// NOTE(review): the signal and the queue created here are not destroyed in
// this function -- confirm whether clean-up happens elsewhere.
void QueueConcurrency::ThreadFunc(int threadID) {
  // Define local queue and signal
  hsa_queue_t* queue;
  hsa_signal_t signal;
  hsa_status_t err;
  hsa_agent_t* gpu_dev = gpu_device1();

  // Create a signal
  err = hsa_signal_create(1, 0, NULL, &signal);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  rocrtst::CreateQueue(*gpu_dev, &queue);

  std::vector<double> time;

  for (uint32_t i = 0; i < num_iteration(); i++) {
    // Output of kernel
    int output = 0;

    // Iteration number
    int iterations = 1024 * 1024 / (1 << threadID);

    struct ALIGNED_(16)
    args_t {
      void* arg0;
      int arg1;
    } local_args;

    local_args.arg0 = (void*) &output;
    local_args.arg1 = iterations;

    err = hsa_memory_register(&local_args, sizeof(local_args));
    ASSERT_EQ(err, HSA_STATUS_SUCCESS);

    // Obtain the current queue write index.
    uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);

    // Write the aql packet at the calculated queue index address.
    const uint32_t queue_mask = queue->size - 1;
    hsa_kernel_dispatch_packet_t* pkt_addr =
        (hsa_kernel_dispatch_packet_t*) (queue->base_address);
    (pkt_addr)[index & queue_mask] = aql();
    (pkt_addr)[index & queue_mask].completion_signal = signal;
    (pkt_addr)[index & queue_mask].kernarg_address = &local_args;

    // Start timing, then publish the packet type and ring the doorbell to
    // dispatch the kernel. The header is written last, after the rest of the
    // packet is in place.
    rocrtst::PerfTimer p_timer;
    int id = p_timer.CreateTimer();
    p_timer.StartTimer(id);

    (pkt_addr)[index & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
        << HSA_PACKET_HEADER_TYPE;
    hsa_signal_store_screlease(queue->doorbell_signal, index);

    // Wait on the dispatch signal until the kernel is finished.
    while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
      ;

    p_timer.StopTimer(id);

    // Re-arm the completion signal for the next dispatch.
    hsa_signal_store_screlease(signal, 1);

    time.push_back(p_timer.ReadTimer(id));

    EXPECT_EQ(output, iterations);

    // Provisional result; replaced by the mean below unless a later
    // assertion makes this function return early.
    if (1 == i) {
      execution_time_[threadID] = p_timer.ReadTimer(id);
    }
  }

  // Nothing was timed (zero iterations): erasing from an empty vector would
  // be undefined behaviour and there is no mean to report.
  if (time.empty()) {
    return;
  }

  // Drop the warm-up sample, but keep it when it is the only one so that the
  // mean is never taken over an empty set.
  if (time.size() > 1) {
    time.erase(time.begin());
  }

  execution_time_[threadID] = rocrtst::CalcMean(time);
}
-271
Просмотреть файл
@@ -1,271 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "queue_create_destroy_latency.h"
#include "common/hsatimer.h"
#include "common/common.h"
#include "common/base_rocr_utils.h"
#include "common/helper_funcs.h"
#include "hsa/hsa_ext_amd.h"
#include "hsa/hsa_ext_finalize.h"
#include "gtest/gtest.h"
#include <stdio.h>
// Edge length of the test grid: the buffers and the dispatch grid in this
// file are sized from kGridDimension * kGridDimension.
static constexpr int kGridDimension = 1024;
// Construct the test case: no queue limit known yet and no buffers allocated.
QueueLatency::QueueLatency()
    : BaseRocR(), max_queue_(0), in_(nullptr), out_(nullptr) {
}
// Destroy the test case. The in_/out_ buffers are released by Close(), not
// here.
QueueLatency::~QueueLatency() = default;
void QueueLatency::Close() {
hsa_memory_free (in_);
hsa_memory_free (out_);
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
return;
}
// Set up the environment: initialise HSA, query the GPU's queue limit,
// allocate the input/output buffers from the CPU pool, load the kernel,
// prepare the AQL packet and initialise the input buffer.
//
// Returns early if HSA initialisation fails; an ASSERT_EQ failure also aborts
// the remaining steps.
void QueueLatency::SetUp() {
  hsa_status_t err;

  // We get hangs with vector_copy
  set_kernel_file_name("vector_copy.o");
  set_kernel_name("&__vector_copy_kernel");

  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
    return;
  }

  hsa_agent_t* gpu_dev = gpu_device1();
  hsa_agent_t* cpu_dev = cpu_device();

  // Get the max queue which can be active for GPU device
  err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_QUEUES_MAX, &max_queue_);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Find the CPU memory pool selected by rocrtst::FindGlobalPool.
  err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
                                           &cpu_pool());
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // The size itself is not used; the query only has to succeed.
  size_t pool_size;
  err = hsa_amd_memory_pool_get_info(cpu_pool(), HSA_AMD_MEMORY_POOL_INFO_SIZE,
                                     &pool_size);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Each buffer holds kGridDimension^2 elements of 4 bytes.
  const int element_count = kGridDimension * kGridDimension;
  const int buffer_bytes = element_count * 4;

  err = hsa_amd_memory_pool_allocate(cpu_pool(), buffer_bytes, 0,
                                     (void**) &in_);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  err = hsa_amd_memory_pool_allocate(cpu_pool(), buffer_bytes, 0,
                                     (void**) &out_);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  rocrtst::LoadKernelFromObjFile(this);

  // Fill up the aql packet
  rocrtst::InitializeAQLPacket(this, &aql());
  aql().grid_size_x = element_count;

  // Initialise the input buffer. hsa_amd_memory_fill() counts uint32_t
  // elements, not bytes, so the element count is passed; passing the byte
  // size (as before) would write four times past the end of in_.
  err = hsa_amd_memory_fill(in_, 1, element_count);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
void QueueLatency::Run() {
hsa_agent_t* gpu_dev = gpu_device1();
hsa_status_t err;
if (!rocrtst::CheckProfile(this)) {
return;
}
// The outer for loop iterator represents the predefined queue number
// After creating a queue, launch a kernel to train the queue, then destroy
// TODO:Hardcode max_queue_ to 100
max_queue_ = 20;
for (uint32_t pre_defined_num = 0; pre_defined_num < max_queue_;
pre_defined_num++) {
#ifdef DEBUG
std::cout << "Existing queue number: " << pre_defined_num << std::endl;
#endif
// vector to store the creation and destruction time
std::vector<double> creation;
std::vector<double> destruction;
// Create pre_defined_num queues first
hsa_queue_t* q;
for (uint32_t i = 0; i < pre_defined_num; i++) {
q = main_queue();
rocrtst::CreateQueue(*gpu_dev, &q);
queues_.push_back(q);
}
for (uint32_t i = 0; i < num_iteration(); i++) {
rocrtst::PerfTimer p_timer;
int id = p_timer.CreateTimer();
uint32_t size = 0;
err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &size);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
p_timer.StartTimer(id);
hsa_queue_t* q = main_queue();
err = hsa_queue_create(*gpu_dev, size, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
UINT32_MAX, UINT32_MAX, &q);
p_timer.StopTimer(id);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
creation.push_back(p_timer.ReadTimer(id));
p_timer.ResetTimer(id);
// Launch a kernel to the currently created queue
// Allocate kernel parameter
typedef struct args_t {
void* in_buf;
void* out_buf;
} args;
args* kern_ptr = NULL;
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
(void**) &kern_ptr);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
kern_ptr->in_buf = in_;
kern_ptr->out_buf = out_;
aql().kernarg_address = kern_ptr;
// Obtain the current queue write index.
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
// Write the aql packet at the calculated queue index address.
const uint32_t queue_mask = main_queue()->size - 1;
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index
& queue_mask] = aql();
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index
& queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE;
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
hsa_signal_store_screlease(signal(), 1);
// Destroy the queue and record the timer
p_timer.StartTimer(id);
err = hsa_queue_destroy(main_queue());
p_timer.StopTimer(id);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
destruction.push_back(p_timer.ReadTimer(id));
}
#ifdef DEBUG
std::cout << std::endl;
#endif
// Destroy the predefined queue
for (uint32_t i = 0; i < pre_defined_num; i++) {
ASSERT_EQ(queues_.size(), pre_defined_num);
err = hsa_queue_destroy(queues_[i]);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Clear the queue vector
queues_.clear();
// Get the mean creation and detruction time and push back
double creation_mean = rocrtst::CalcMean(creation);
double destruction_mean = rocrtst::CalcMean(destruction);
construction_mean_.push_back(creation_mean);
destruction_mean_.push_back(destruction_mean);
}
}
// Print the mean queue creation and destruction time for each count of
// already-existing queues. Prints nothing when rocrtst::CheckProfile()
// rejects this test.
//
// Only rows that actually have a recorded result are printed: Run() may stop
// early, in which case the result vectors hold fewer than max_queue_ entries
// and indexing up to max_queue_ would read past their end.
void QueueLatency::DisplayResults() const {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  printf("======================================================\n");
  printf(" Existing queue# Creation Destroy\n");

  for (uint32_t i = 0;
       i < max_queue_ && i < construction_mean_.size()
           && i < destruction_mean_.size();
       i++) {
    // Values are scaled by 1e3 for the "ms" column
    // (NOTE(review): assumes the timer reports seconds -- confirm).
    // %u matches the unsigned row index.
    printf(" %u, %fms %fms\n", i,
           construction_mean_[i] * 1e3, destruction_mean_[i] * 1e3);
  }
}
-95
Просмотреть файл
@@ -1,95 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__
#define __ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include <vector>
// Test that measures how long HSA queue creation and destruction take while
// a varying number of other queues already exist. Run() collects the mean
// times and DisplayResults() prints them.
class QueueLatency: public rocrtst::BaseRocR, public PerfBase {
public:
//@Brief: Constructor
QueueLatency();
//@Brief: Destructor
~QueueLatency();
//@Brief: Set up the test environment
virtual void SetUp();
//@Brief: Run the test
virtual void Run();
//@Brief: Free the test buffers and run the common clean-up
virtual void Close();
//@Brief: Display results
virtual void DisplayResults() const;
private:
//@Brief: A vector to store the pointers to the queues that exist in the
// background while a creation/destruction measurement is taken
std::vector<hsa_queue_t*> queues_;
//@Brief: Mean queue construction and destruction times; Run() appends one
// entry to each vector per background-queue count
std::vector<double> construction_mean_;
std::vector<double> destruction_mean_;
//@Brief: Number of background-queue counts to measure. SetUp() reads the
// device's queue limit into it; Run() then overrides it with a fixed value.
uint32_t max_queue_;
//@Brief: Pointers to the input (in_) and output (out_) buffers handed to
// the kernel
uint8_t* in_;
uint8_t* out_;
};
#endif //__ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__
-281
Просмотреть файл
@@ -1,281 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "system_load_bandwidth.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "common/os.h"
#include "gtest/gtest.h"
#include <algorithm>
#if 0
// Populate the input buffer: for every loop and every operation, write one
// value per thread, equal to the thread index shifted left by 2.
//
// in_data must have room for num_loops * num_ops * num_thrds values.
static void initGlobalReadBuffer(uint32_t* in_data, uint32_t num_thrds,
                                 uint32_t num_ops, uint32_t num_loops) {
  uint32_t slot = 0;  // next position to write in in_data

  for (uint32_t loop = 0; loop < num_loops; ++loop) {
    for (uint32_t op = 0; op < num_ops; ++op) {
      // Write the value to be read by each thread
      for (uint32_t thrd = 0; thrd < num_thrds; ++thrd) {
        in_data[slot++] = thrd << 2;
      }
    }
  }
}
// Verify kernel operation, i.e. validate the data in the output buffer.
//
// Each of the num_thrds entries of `data` must equal (index << 2) * scale.
// Returns true when every entry matches. On the first mismatch the offending
// index and value are printed (prefixed with kernel_name) and false is
// returned. (Previously the function returned true even after a mismatch.)
//
// print_debug: when set, the expected and actual value of every entry are
//              printed as they are checked.
static bool verifyGlobalLoadKernel(uint32_t* data, uint32_t num_thrds,
                                   uint32_t scale, const char* kernel_name,
                                   bool print_debug) {
  for (uint32_t idx = 0; idx < num_thrds; idx++) {
    const uint32_t valid_value = (idx << 2) * scale;

    if (print_debug) {
      std::cout << "Value expected = " << valid_value << std::endl;
      std::cout << "Value of data = " << data[idx] << std::endl;
    }

    if (data[idx] != valid_value) {
      std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: " << idx
                << std::endl;
      std::cout << kernel_name << ": VALUE @ Bad index: " << data[idx]
                << std::endl;
      std::cout << std::endl;
      return false;
    }
  }

#ifdef DEBUG
  std::cout << kernel_name << ": Passed validation" << std::endl;
  std::cout << std::endl;
#endif
  return true;
}
#endif
// Constructor: all work-item settings and results start at zero, and
// interrupts are disabled.
SystemLoadBandwidth::SystemLoadBandwidth()
    : BaseRocR(),
      num_group_(0),
      num_cus_(0),
      kernel_loop_count_(0),
      mean_(0.0),
      data_size_(0) {
  set_group_size(0);
  set_enable_interrupt(0);
}
// Destructor: nothing to release here.
SystemLoadBandwidth::~SystemLoadBandwidth() = default;
// Set up the test environment
void SystemLoadBandwidth::SetUp() {
set_kernel_file_name("sysMemRead.o");
set_kernel_name("&__SysMemLoad");
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
return;
}
hsa_agent_t* gpu_dev = gpu_device1();
SetWorkItemNum();
//Create a queue with max number size
hsa_queue_t* q = main_queue();
rocrtst::CreateQueue(*gpu_dev, &q);
rocrtst::LoadKernelFromObjFile(this);
uint32_t total_work_items = num_cus_ * num_group_ * group_size();
//Fill up part of aql
rocrtst::InitializeAQLPacket(this, &aql());
aql().workgroup_size_x = group_size();
aql().grid_size_x = total_work_items;
return;
}
// Run the test: allocate the input/output buffers and the kernel-argument
// block from the pool selected by rocrtst::FindStandardPool, dispatch the
// kernel repeatedly while timing each dispatch, and store the mean of the
// num_iteration() fastest samples (after dropping the first, warm-up
// dispatch) in mean_. data_size_ is set to the input buffer size in bytes.
//
// Does nothing when rocrtst::CheckProfile() rejects this test; an ASSERT_EQ
// failure aborts the run at that point.
//
// NOTE(review): the buffers are cleared with memset() from the host, which
// assumes the pool memory is host-accessible -- confirm. None of the three
// allocations is freed in this function.
// NOTE(review): dispatches go through main_queue(), but SetUp() passes the
// address of a local pointer to CreateQueue() -- confirm main_queue() is
// valid here.
void SystemLoadBandwidth::Run() {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  uint32_t total_workitems = num_cus_ * num_group_ * group_size();

  hsa_agent_t* gpu_dev = gpu_device1();
  hsa_status_t err;

  // Every work-item performs ops_thrd loads of one uint32_t each.
  uint32_t ops_thrd = 32;
  uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t);
  uint64_t total_ops = (uint64_t) total_workitems * ops_thrd;
  uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t);

  err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool,
                                           &device_pool());
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  int32_t* in_data = NULL;
  err = hsa_amd_memory_pool_allocate(device_pool(), in_data_size, 0,
                                     (void**) &in_data);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
  memset(in_data, 0, in_data_size);

  uint32_t out_data_size = total_workitems * sizeof(uint32_t);
  uint32_t* out_data = NULL;
  err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
                                     (void**) &out_data);
  // Check this allocation like the others; without the check a failed
  // allocation would be followed by a memset() through an invalid pointer.
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
  memset(out_data, 0, out_data_size);

  data_size_ = in_data_size;

  // initGlobalReadBuffer (in_data, total_workitems, ops_thrd,
  // kernel_loop_count_);

  typedef struct local_args_t {
    void* arg0;
    void* arg1;
    uint64_t arg2;
    void* arg3;
  } args;

  args* kern_ptr = NULL;
  err = hsa_amd_memory_pool_allocate(device_pool(), sizeof(args), 0,
                                     (void**) &kern_ptr);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // in_data is 32 bit ptr, so adding total_ops yields the end of the buffer
  kern_ptr->arg0 = in_data;
  kern_ptr->arg1 = in_data + total_ops;
  kern_ptr->arg2 = addr_step;
  kern_ptr->arg3 = out_data;

  aql().kernarg_address = kern_ptr;

  std::vector<double> time;

  // Run 20% more dispatches than requested, plus one warm-up dispatch; the
  // surplus is trimmed after sorting below.
  int it = num_iteration() * 1.2 + 1;

  void *q_base_addr = main_queue()->base_address;

  for (int i = 0; i < it; i++) {
    // Obtain the current queue write index
    uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);

    // Write the aql packet at the calculated queue index address.
    const uint32_t queue_mask = main_queue()->size - 1;
    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();

    // Start timing, then publish the packet type and ring the doorbell.
    rocrtst::PerfTimer p_timer;
    int id = p_timer.CreateTimer();
    p_timer.StartTimer(id);

    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
        HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
    hsa_signal_store_screlease(main_queue()->doorbell_signal, index);

    // Wait on the dispatch signal until the kernel is finished.
    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
      ;

    p_timer.StopTimer(id);

#if DEBUG
    std::cout << ".";
    std::cout.flush();
#endif

    // Verify the results
    // uint32_t scale = kernel_loop_count_ * ops_thrd;
    // verifyGlobalLoadKernel(out_data, total_workitems, scale,
    //                        kernel_name_.c_str(), false);
    time.push_back(p_timer.ReadTimer(id));

    // Re-arm the completion signal for the next dispatch.
    hsa_signal_store_screlease(signal(), 1);
  }

  // Drop the warm-up sample, then keep only the num_iteration() fastest
  // samples.
  time.erase(time.begin());
  std::sort(time.begin(), time.end());
  time.erase(time.begin() + num_iteration(), time.end());

  mean_ = rocrtst::CalcMean(time);
}
void SystemLoadBandwidth::Close() {
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Print the measured load bandwidth: bytes read per dispatch (data_size_)
// divided by the mean dispatch time (mean_), scaled by 1024^3. Prints nothing
// when rocrtst::CheckProfile() rejects this test.
void SystemLoadBandwidth::DisplayResults() const {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  std::cout << "=======================================" << std::endl;
  // The label used to contain a printf-style "%f", which iostreams print
  // verbatim; the value is now followed by its unit instead.
  std::cout << "System Load Bandwidth: "
            << data_size_ / mean_ / 1024 / 1024 / 1024 << "(GB/S)"
            << std::endl;
}
-119
Просмотреть файл
@@ -1,119 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_INC_SYSTEM_LOAD_BANDWIDTH_H__
#define __ROCRTST_SRC_INC_SYSTEM_LOAD_BANDWIDTH_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include <stdio.h>
// Performance test class for the system-memory load bandwidth measurement.
// It adds the work-item configuration (CU count, group count, kernel loop
// count) and the measured results (mean time, data size) on top of the
// shared rocrtst::BaseRocR state and the PerfBase interface.
class SystemLoadBandwidth: public rocrtst::BaseRocR, public PerfBase {
 public:
  //@Brief: Constructor
  SystemLoadBandwidth();

  //@Brief: Destructor
  ~SystemLoadBandwidth();

  //@Brief: Set up the testing environment
  virtual void SetUp();

  //@Brief: Run the test case
  virtual void Run();

  //@Brief: Close and clean up the test environment
  virtual void Close();

  //@Brief: Display load bandwidth
  virtual void DisplayResults() const;

  //@Brief: Set work-item configuration
  // When INTERACTIVE is defined the values are read from stdin; otherwise a
  // fixed default configuration is used.
  void SetWorkItemNum() {
#ifdef INTERACTIVE
    // Values are read into an unsigned int so that the "%u" conversion
    // matches the argument type exactly. A setting is only updated when
    // scanf() actually converted a value; otherwise it is left untouched.
    unsigned int value = 0;

    printf("Please input the number of CUs you want to try:\n");
    if (scanf("%u", &value) == 1) {
      num_cus_ = value;
    }

    printf("Please input the number of groups you want to try:\n");
    if (scanf("%u", &value) == 1) {
      num_group_ = value;
    }

    printf("Please input the size of each group:\n");
    if (scanf("%u", &value) == 1) {
      set_group_size(value);
    }

    printf("Please input the number of kernel loop you want to try:\n");
    if (scanf("%u", &value) == 1) {
      kernel_loop_count_ = value;
    }
#else
    num_cus_ = 32;
    num_group_ = 128;
    set_group_size(256);
    kernel_loop_count_ = 16;
#endif
  }

 private:
  //@Brief: number of group
  uint32_t num_group_;

  //@Brief: number of CUs
  uint32_t num_cus_;

  //@Brief: number of kernel loop
  uint32_t kernel_loop_count_;

  //@Brief: Mean execution time
  double mean_;

  //@Brief: data size for test
  uint64_t data_size_;
};
#endif
-243
Просмотреть файл
@@ -1,243 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "system_store_bandwidth.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "gtest/gtest.h"
// Validate the data in the output buffer of the global-store kernel.
//
// Every element data[i], for i in [0, num_thrds), is expected to hold
// (i << 2).
//
// @param data        buffer to check; must hold at least num_thrds elements
// @param num_thrds   number of elements to check
// @param loop_cnt    kernel loop count; nothing is checked when it is 0
// @param ops_loop    operations per loop; nothing is checked when it is 0
// @param kernel_name name used as a prefix for diagnostic output
// @param print_debug currently unused; kept for interface compatibility
// @return true when all checked elements match (or nothing was checked),
//         false on the first mismatch, which is also reported on std::cout
static bool verifyGlobalStoreKernel(uint32_t* data, uint32_t num_thrds,
                                    uint32_t loop_cnt, uint32_t ops_loop,
                                    const char* kernel_name,
                                    bool print_debug) {
  (void)print_debug;

  // The expected value depends only on the element index, not on the loop
  // or operation index, so one pass over the buffer covers everything the
  // nested loop/op iteration would look at.
  // NOTE(review): if each loop/op iteration was meant to validate its own
  // segment of the buffer, the indexing needs to be extended -- confirm
  // against the kernel source.
  if (loop_cnt == 0 || ops_loop == 0) {
    return true;
  }

  for (uint32_t idx = 0; idx < num_thrds; idx++) {
    if (data[idx] != (idx << 2)) {
      std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: "
                << idx << std::endl;
      std::cout << kernel_name << ": VALUE @ Bad index: " << data[idx]
                << std::endl;
      // Report the failure to the caller instead of claiming success.
      return false;
    }
  }

#ifdef DEBUG
  std::cout << kernel_name << ": Passed validation" << std::endl;
  std::cout << std::endl;
#endif

  return true;
}
// Constructor: every test parameter and result starts out zeroed. The
// work-item configuration is filled in later by SetWorkItemNum(), which
// SetUp() calls.
SystemStoreBandwidth::SystemStoreBandwidth()
    : BaseRocR(),
      num_group_(0),
      num_cus_(0),
      kernel_loop_count_(0),
      mean_(0.0),
      data_size_(0) {
  set_group_size(0);
}
// Destructor: nothing to release here; cleanup is done in Close().
SystemStoreBandwidth::~SystemStoreBandwidth() = default;
// Set up the test environment
void SystemStoreBandwidth::SetUp() {
set_kernel_file_name("sysMemWrite.o");
set_kernel_name("&__SysMemStore");
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
return;
}
hsa_agent_t* gpu_dev = gpu_device1();
SetWorkItemNum();
//Create a queue with max number size
hsa_queue_t* q = nullptr;
rocrtst::CreateQueue(*gpu_dev, &q);
set_main_queue(q);
rocrtst::LoadKernelFromObjFile(this);
uint32_t total_work_items = num_cus_ * num_group_ * group_size();
//Fill up part of aql
rocrtst::InitializeAQLPacket(this, &aql());
aql().workgroup_size_x = group_size();
aql().grid_size_x = total_work_items;
return;
}
// Run the test
void SystemStoreBandwidth::Run() {
hsa_status_t err;
if (!rocrtst::CheckProfile(this)) {
return;
}
uint32_t total_workitems = num_cus_ * num_group_ * group_size();
hsa_agent_t* gpu_dev = gpu_device1();
uint32_t ops_thrd = 16;
uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t);
uint64_t total_ops = (uint64_t) total_workitems * kernel_loop_count_
* ops_thrd;
uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t);
err = hsa_amd_agent_iterate_memory_pools(*gpu_dev,
rocrtst::FindStandardPool, &device_pool());
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
uint32_t* in_data = NULL;
err = hsa_amd_memory_pool_allocate(device_pool(), in_data_size, 0,
(void**) &in_data);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
//memset(in_data, 0, in_data_size);
err = hsa_amd_memory_fill(in_data, 0, in_data_size);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
uint32_t out_data_size = total_workitems * sizeof(uint32_t);
uint32_t* out_data = NULL;
err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
(void**) &out_data);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
//memset(out_data, 0, out_data_size);
err = hsa_amd_memory_fill(out_data, 0, out_data_size);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
data_size_ = in_data_size;
typedef struct local_args_t {
void* arg0;
void* arg1;
uint64_t arg2;
void* arg3;
} args;
// in_data is 32 bit ptr, so adding total_ops
args* kern_ptr = NULL;
err = hsa_amd_memory_pool_allocate(device_pool(), sizeof(args), 0,
(void**) &kern_ptr);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
kern_ptr->arg0 = in_data;
kern_ptr->arg1 = in_data + total_ops;
kern_ptr->arg2 = addr_step;
kern_ptr->arg3 = out_data;
aql().kernarg_address = kern_ptr;
std::vector<double> time;
void *q_base_addr = main_queue()->base_address;
for (uint32_t i = 0; i < num_iteration(); i++) {
// Obtain the current queue write index
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
// Write the aql packet at the calculated queue index address.
const uint32_t queue_mask = main_queue()->size - 1;
((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask] = aql();
rocrtst::PerfTimer p_timer;
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask].header |=
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
;
p_timer.StopTimer(id);
// Verify the results
verifyGlobalStoreKernel(in_data, total_workitems, kernel_loop_count_,
ops_thrd, kernel_name().c_str(), false);
time.push_back(p_timer.ReadTimer(id));
hsa_signal_store_screlease(signal(), 1);
}
time.erase(time.begin());
mean_ = rocrtst::CalcMean(time);
return;
}
void SystemStoreBandwidth::Close() {
hsa_status_t err;
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
return;
}
// Print the measured store bandwidth: data_size_ divided by the mean
// dispatch time, scaled by 1024^3. Nothing is printed when the profile
// check fails.
void SystemStoreBandwidth::DisplayResults() const {
  if (!rocrtst::CheckProfile(this)) {
    return;
  }

  std::cout << "=======================================" << std::endl;
  // This is the store test, and stream output needs no printf-style "%f"
  // placeholder.
  std::cout << "System Store Bandwidth (GB/S): "
            << data_size_ / mean_ / 1024 / 1024 / 1024 << std::endl;
}
-121
Просмотреть файл
@@ -1,121 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_INC_SYSTEM_STORE_BANDWIDTH_H__
#define __ROCRTST_SRC_INC_SYSTEM_STORE_BANDWIDTH_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include <stdio.h>
// Performance test class for the system-memory store bandwidth measurement.
// It adds the work-item configuration (CU count, group count, kernel loop
// count) and the measured results (mean time, data size) on top of the
// shared rocrtst::BaseRocR state and the PerfBase interface.
//
// The work-group size lives in the base class (set_group_size() /
// group_size()): the implementation file reads it through group_size(), so
// it must not be kept in a separate member of this class.
class SystemStoreBandwidth: public rocrtst::BaseRocR, public PerfBase {
 public:
  //@Brief: Constructor
  SystemStoreBandwidth();

  //@Brief: Destructor
  ~SystemStoreBandwidth();

  //@Brief: Set up the testing environment
  virtual void SetUp();

  //@Brief: Run the test case
  virtual void Run();

  //@Brief: Close and clean up the test environment
  virtual void Close();

  //@Brief: Display store bandwidth
  virtual void DisplayResults() const;

  //@Brief: Set work-item configuration
  // When INTERACTIVE is defined the values are read from stdin; otherwise a
  // fixed default configuration is used.
  void SetWorkItemNum() {
#ifdef INTERACTIVE
    // Values are read into an unsigned int so that the "%u" conversion
    // matches the argument type exactly. A setting is only updated when
    // scanf() actually converted a value; otherwise it is left untouched.
    unsigned int value = 0;

    printf("Please input the number of CUs you want to try:\n");
    if (scanf("%u", &value) == 1) {
      num_cus_ = value;
    }

    printf("Please input the number of groups you want to try:\n");
    if (scanf("%u", &value) == 1) {
      num_group_ = value;
    }

    printf("Please input the size of each group:\n");
    if (scanf("%u", &value) == 1) {
      set_group_size(value);
    }

    printf("Please input the number of kernel loop you want to try:\n");
    if (scanf("%u", &value) == 1) {
      kernel_loop_count_ = value;
    }
#else
    num_cus_ = 32;
    num_group_ = 128;
    // Store the group size where group_size() reads it, matching the
    // interactive path above.
    set_group_size(256);
    kernel_loop_count_ = 16;
#endif
  }

 private:
  //@Brief: number of group
  uint32_t num_group_;

  //@Brief: number of CUs
  uint32_t num_cus_;

  //@Brief: number of kernel loop
  uint32_t kernel_loop_count_;

  //@Brief: Mean execution time
  double mean_;

  //@Brief: data size for test
  uint64_t data_size_;
};
#endif
+395
Просмотреть файл
@@ -0,0 +1,395 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
// The purpose of this test is to provide an example of the use of the
// common RocrTest classes and utilities that are used in many examples.
// It can be used as a template to start off with when writing new tests.
// In many cases, the existing boilerplate code will be sufficient as is.
// Otherwise, the boilerplate code can be either supplemented or replaced
// by your own code in your example, as necessary.
//
// The comments provided are focused more on the use of the common rocrtst
// utilities and boilerplate code, rather than the example app. itself.
//
// The boilerplate code includes code for:
// * hsa initialization and clean up
// * code to load pre-built kernels
// * creating queues
// * populating AQL packets
// * checking for required profiles
// * finding cpu and gpu agents (callbacks for common use cases)
// * finding pools (having common requirements)
// * allocating and setting kernel arguments
// * somewhat standardized output
// * handling additional command line arguments, beyond google-test arguments
// * support for various level of verbosity, controlled from command line arg
// * support for building OpenCL kernels
// * timer support
//
// Overview of RocrTst code organization:
// Classes:
// * class BaseRocR (base_rocr.h) -- base class for all rocrtst examples and
// tests. Most of the rocrtst common utilities act on BaseRocR objects
//
// * TestBase (test_base.h) -- derives from BaseRocR and is the base class
// for all tests under <rocrtst root>/suites. The TestBase method
// implementations typically perform actions that are required for most/all
// tests, and should therefore be called from the derived implementations of
// those methods.
//
// Utilities:
// * <rocrtst root>/common/base_rocr_utils.<cc/h> contains a set of utilities
// that act on BaseRocR objects.
//
// * <rocrtst root>/common/common.<cc/h> contain other non-BaseRocR utilities
//
// Special Files:
// * main.cc -- The main google test file from which the tests are invoked.
// There should be an entry for each test to be run there.
//
// * kernels -- OpenCL kernel source files should go in the kernels directory
//
// * CMakeLists.txt -- Host code (*.cc and *.h files) should build without
// modifying the CMakeLists.txt file, if the files are placed in the
// "performance" directory. However, an entry is needed for OpenCL kernels.
// For each kernel to be built, the bitcode libraries must be indicated
// before the call to "build_kernel()" is made. See existing code for
// examples.
#include <algorithm>
#include <iostream>
#include <vector>
#include "suites/performance/test_case_template.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "gtest/gtest.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_finalize.h"
// Number of uint32_t elements in each of the source and destination
// buffers; Run() also uses it as the workgroup size and the grid size.
static const uint32_t kNumBufferElements = 256;
// If `err` is not HSA_STATUS_SUCCESS, print the line, file, status value and
// the message obtained from hsa_status_string() to std::cout, then return
// `err` from the enclosing function (which therefore must return a type
// compatible with the status). Note that `err` is evaluated more than once,
// so it should be a plain variable rather than a call expression.
// The macro is #undef'd at the end of this file.
#define RET_IF_HSA_ERR(err) { \
if ((err) != HSA_STATUS_SUCCESS) { \
const char* msg = 0; \
hsa_status_string(err, &msg); \
std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
__FILE__ << ". Call returned " << err << std::endl; \
std::cout << msg << std::endl; \
return (err); \
} \
}
// Many test cases want to perform an operation on memory sizes of various
// granularities.
// The tables below are compiled out (#if 0) and kept only as a starting
// point: Str[] holds the printable label for the size, in bytes, stored at
// the same index in Size[]; kMaxCopySize is the last (largest) entry.
#if 0
static const int kNumGranularity = 20;
const char* Str[kNumGranularity] = {"1k", "2K", "4K", "8K", "16K", "32K",
"64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M",
"64M", "128M", "256M", "512M"};
const size_t Size[kNumGranularity] = {
1024, 2*1024, 4*1024, 8*1024, 16*1024, 32*1024, 64*1024, 128*1024,
256*1024, 512*1024, 1024*1024, 2048*1024, 4096*1024, 8*1024*1024,
16*1024*1024, 32*1024*1024, 64*1024*1024, 128*1024*1024, 256*1024*1024,
512*1024*1024};
static const int kMaxCopySize = Size[kNumGranularity - 1];
#endif
// Constructor: sets the default iteration count, the title/description
// shown for the test, and the kernel file and kernel function to load.
// The result and buffer members are given defined values so that
// DisplayResults() and Close() never read indeterminate data when SetUp()
// or Run() returned early.
TestExample::TestExample(void) :
    TestBase(),
    time_mean_(0.0),
    src_buffer_(nullptr),
    dst_buffer_(nullptr) {
  set_num_iteration(10);  // Number of iterations to execute of the main test;
                          // This is a default value which can be overridden
                          // on the command line.
  set_title("Test Case Example");
  set_description("Put a description of the test case here. Line breaks "
      "will be taken care of on output, not here.");

  set_kernel_file_name("test_case_template_kernels.hsaco");
  set_kernel_name("square");  // kernel function name

#if 0
  // Set required profile to HSA_PROFILE_FULL or HSA_PROFILE_BASE if it
  // matters for this test. If either profile is fine, then leave with
  // default
  set_requires_profile(<value>);
#endif
}
// Destructor: nothing to do here; the buffers are released in Close().
TestExample::~TestExample() = default;
// Any 1-time setup involving member variables used in the rest of the test
// should be done here.
void TestExample::SetUp(void) {
  hsa_status_t err;

  // TestBase::SetUp() will set HSA_ENABLE_INTERRUPT if enable_interrupt() is
  // true, and call hsa_init(). It also prints the SetUp header.
  TestBase::SetUp();

  // SetDefaultAgents(this) will assign the first CPU and GPU found on
  // iterating through the agents and assign them to cpu_device_ and
  // gpu_device1_, respectively (cpu_device() and gpu_device1()). These
  // BaseRocR member variables are used in some utilities. Additionally,
  // SetDefaultAgents() checks the profile of the gpu and compares this
  // to any required profile.
  //
  // If SetDefaultAgents() is not used, if the profile of the target GPU
  // matters for this test, it should be set with set_profile() and
  // CheckProfileAndInform() should be called to check if it is the
  // required profile
  err = rocrtst::SetDefaultAgents(this);
  ASSERT_EQ(HSA_STATUS_SUCCESS, err);

  hsa_agent_t* gpu_dev = gpu_device1();

  // Find and assign HSA_AMD_SEGMENT_GLOBAL pools for cpu, gpu and a kern_arg
  // pool
  err = rocrtst::SetPoolsTypical(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Create a queue
  hsa_queue_t* q = nullptr;
  rocrtst::CreateQueue(*gpu_dev, &q);
  ASSERT_NE(q, nullptr);
  set_main_queue(q);

  err = rocrtst::LoadKernelFromObjFile(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Fill up the kernel packet (except header) with some values we've
  // collected so far, and some reasonable default values; this should be after
  // LoadKernelFromObjFile(). AllocAndSetKernArgs() will fill in the kern_args
  err = rocrtst::InitializeAQLPacket(this, &aql());
  ASSERT_EQ(HSA_STATUS_SUCCESS, err);

  hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};

  // Allocate a few buffers for our example
  err = hsa_amd_memory_pool_allocate(cpu_pool(),
                                     kNumBufferElements*sizeof(uint32_t),
                                  0, reinterpret_cast<void**>(&src_buffer_));
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  err = hsa_amd_agents_allow_access(2, ag_list, NULL, src_buffer_);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Initialize the source buffer
  for (uint32_t i = 0; i < kNumBufferElements; ++i) {
    reinterpret_cast<uint32_t *>(src_buffer_)[i] = i;
  }

  err = hsa_amd_memory_pool_allocate(cpu_pool(),
                                     kNumBufferElements*sizeof(uint32_t),
                                  0, reinterpret_cast<void**>(&dst_buffer_));
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  err = hsa_amd_agents_allow_access(2, ag_list, NULL, dst_buffer_);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);

  // Set up Kernel arguments
  // See the meta-data for the compiled OpenCL kernel code to ascertain
  // the sizes, padding and alignment required for kernel arguments.
  // This can be seen by executing
  // $ amdgcn-amd-amdhsa-readelf -aw ./test_case_template_kernels.hsaco
  // The kernel code will expect the following arguments aligned as shown.
  struct __attribute__((aligned(16))) local_args_t {
    uint32_t* dstArray;
    uint32_t* srcArray;
    uint32_t size;
    uint32_t pad;
    uint64_t global_offset_x;
    uint64_t global_offset_y;
    uint64_t global_offset_z;
  } local_args;

  local_args.dstArray = reinterpret_cast<uint32_t *>(dst_buffer_);
  local_args.srcArray = reinterpret_cast<uint32_t *>(src_buffer_);
  local_args.size = kNumBufferElements;
  // The whole struct is handed to AllocAndSetKernArgs() below, so give the
  // explicit padding field a defined value as well.
  local_args.pad = 0;
  local_args.global_offset_x = 0;
  local_args.global_offset_y = 0;
  local_args.global_offset_z = 0;

  err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// This wrapper atomically writes the provided header and setup to the
// provided AQL packet. The provided AQL packet address should be in the
// queue memory space.
//
// header ends up in the low 16 bits and setup in the high 16 bits of the
// first 32-bit word of the packet, which is stored with release ordering.
static inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
                                  hsa_kernel_dispatch_packet_t* queue_packet) {
  // Widen to uint32_t before shifting: a uint16_t operand is promoted to a
  // signed int, and shifting a value with bit 15 set left by 16 would
  // overflow that signed type.
  const uint32_t header_and_setup =
      static_cast<uint32_t>(header) | (static_cast<uint32_t>(setup) << 16);

  __atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
                   header_and_setup, __ATOMIC_RELEASE);
}
// Number of iterations Run() actually executes: the requested count scaled
// by 1.2, plus one. The extra samples are there because some of the
// initial and final ones are discarded when the statistics are calculated.
uint32_t TestExample::RealIterationNum(void) {
  const double padded_count = num_iteration() * 1.2 + 1;
  return padded_count;
}
// Check that every element of the result array holds the square of its
// index.
//
// @param ar  array to check; must hold at least sz elements
// @param sz  number of elements to check
// @return true when ar[i] == i*i for all i in [0, sz) (trivially true for
//         sz == 0), false at the first mismatch
static bool VerifyResult(uint32_t *ar, size_t sz) {
  // Start at index 0; starting at sz would skip the whole array and report
  // success without looking at a single element.
  for (size_t i = 0; i < sz; ++i) {
    if (i*i != ar[i]) {
      return false;
    }
  }

  return true;
}
// Core of the test: dispatch the kernel RealIterationNum() times, timing
// each dispatch from just before the packet header is published until the
// completion signal drops below 1, verify the destination buffer after each
// dispatch, and store the mean of the retained samples in time_mean_.
void TestExample::Run(void) {
// Compare required profile for this test case with what we're actually
// running on
if (!rocrtst::CheckProfile(this)) {
return;
}
TestBase::Run();
// Override whatever we need to...
aql().workgroup_size_x = kNumBufferElements;
aql().grid_size_x = kNumBufferElements;
std::vector<double> timer;
int it = RealIterationNum();
hsa_kernel_dispatch_packet_t *queue_aql_packet;
rocrtst::PerfTimer p_timer;
uint64_t index;
for (int i = 0; i < it; i++) {
// Presumably copies the AQL packet prepared so far (aql()) into the queue,
// except the setup and header fields, and reports the queue index that was
// used through `index` -- confirm against WriteAQLToQueue().
queue_aql_packet = WriteAQLToQueue(this, &index);
// NOTE(review): this compares against base_address + index without
// applying a queue-size mask; verify this still holds once index reaches
// the queue size.
ASSERT_EQ(queue_aql_packet,
reinterpret_cast<hsa_kernel_dispatch_packet_t *>
(main_queue()->base_address) + index);
// Header: kernel-dispatch packet type with system-scope acquire and
// release fences.
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
// Create and start a timer for this iteration
int id = p_timer.CreateTimer();
p_timer.StartTimer(id);
// Publish header and setup last, in one atomic store, then ring the
// doorbell with the index of the packet.
AtomicSetPacketHeader(aql_header, aql().setup, queue_aql_packet);
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
// Wait on the dispatch signal until the kernel is finished.
while (hsa_signal_wait_scacquire(aql().completion_signal,
HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
}
// Stop the timer
p_timer.StopTimer(id);
// Store time for later analysis
timer.push_back(p_timer.ReadTimer(id));
// Reset the completion signal to 1 so the next iteration can wait on it
// again.
hsa_signal_store_screlease(aql().completion_signal, 1);
ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t *>(dst_buffer_),
kNumBufferElements));
// Pay attention to verbosity level for things like progress output
if (verbosity() >= VERBOSE_PROGRESS) {
std::cout << ".";
fflush(stdout);
}
}
if (verbosity() >= VERBOSE_PROGRESS) {
std::cout << std::endl;
}
// Discard the first sample, sort the rest in ascending order and keep only
// the num_iteration() smallest samples; the larger surplus samples produced
// by the extra iterations are dropped before averaging.
timer.erase(timer.begin());
std::sort(timer.begin(), timer.end());
timer.erase(timer.begin() + num_iteration(), timer.end());
time_mean_ = rocrtst::CalcMean(timer);
}
// Display information about this test. Nothing test-specific is added
// here; the call is forwarded to the TestBase implementation.
void TestExample::DisplayTestInfo(void) {
TestBase::DisplayTestInfo();
}
// Print the results of the run: the common TestBase output followed by the
// mean dispatch time scaled by 1e6 and labelled in microseconds. Nothing
// is printed when the required profile does not match what we are running
// on.
void TestExample::DisplayResults(void) const {
  if (rocrtst::CheckProfile(this)) {
    TestBase::DisplayResults();

    const double mean_scaled = time_mean_ * 1e6;
    std::cout << "The average time was: " << mean_scaled << " uS" << std::endl;
  }
}
// Release the source and destination buffers, then let TestBase finish the
// teardown.
void TestExample::Close() {
  hsa_status_t err = hsa_amd_memory_pool_free(src_buffer_);
  ASSERT_EQ(HSA_STATUS_SUCCESS, err);

  err = hsa_amd_memory_pool_free(dst_buffer_);
  ASSERT_EQ(HSA_STATUS_SUCCESS, err);

  // TestBase::Close() closes handles opened within rocrtst utility calls
  // and calls hsa_shut_down(), so it has to come after all other hsa
  // cleanup.
  TestBase::Close();
}
#undef RET_IF_HSA_ERR
@@ -43,40 +43,41 @@
*
*/
#ifndef __ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__
#define __ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__
#ifndef ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_
#define ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include "perf_common/perf_base.h"
#include "suites/test_common/test_base.h"
class ImageLoadBandwidth: public rocrtst::BaseRocR, public PerfBase {
class TestExample : public TestBase {
public:
//@Brief: Constructor
ImageLoadBandwidth();
TestExample();
//@Brief: Destructor
~ImageLoadBandwidth();
// @Brief: Destructor for test case of TestExample
virtual ~TestExample();
//@Brief: Set up the test environment
// @Brief: Setup the environment for measurement
virtual void SetUp();
//@Brief: Run the actual testing
// @Brief: Core measurement execution
virtual void Run();
//@Brief: Clean up the test environment
// @Brief: Clean up and release the resources
virtual void Close();
//@Brief: Display results
// @Brief: Display results
virtual void DisplayResults() const;
private:
//@Brief: Image Load Bandwidth
double load_bandwidth_;
// @Brief: Display information about what this test does
virtual void DisplayTestInfo(void);
//@Brief: Image size
size_t image_size_;
private:
uint32_t RealIterationNum(void);
double time_mean_;
void *src_buffer_;
void *dst_buffer_;
};
#endif //__ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__
#endif // ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_
-279
Просмотреть файл
@@ -1,279 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "vector_copy.h"
#include "common/base_rocr_utils.h"
#include "gtest/gtest.h"
// Copy vector buffer size, in bytes (it is passed as the allocation size to
// hsa_amd_memory_pool_allocate() in Run()).
static const size_t BUFFER_SIZE = 1024 * 1024 * 4;
// NOTE(review): not referenced in the visible part of this file --
// presumably a host-side output buffer; confirm where it is used.
static char* gCPUOutput = nullptr;
// Queue write index read in SetUp(); Run() uses it when advancing the write
// index and ringing the doorbell.
static uint64_t gQueueIndex = 0;
// Constructor: selects the kernel to run and starts without a kernel
// argument buffer.
VectorCopy::VectorCopy()
    : BaseRocR() {
  kernarg_address = nullptr;
  set_kernel_name("&__vector_copy_kernel");
}
// Destructor: nothing to release here.
VectorCopy::~VectorCopy() = default;
// Find coarse grained system memory.
//
// Memory-pool iteration callback. When `pool` is in the global segment and
// has the coarse-grained flag set, it is stored through `data` (which must
// point to an hsa_amd_memory_pool_t) and HSA_STATUS_INFO_BREAK is returned
// to stop the iteration. Pools that do not match return
// HSA_STATUS_SUCCESS so the iteration continues; a failing query returns
// its error status.
static hsa_status_t get_sys_coarse_grained_memory_pool(
    hsa_amd_memory_pool_t pool, void* data) {
  hsa_amd_segment_t segment;
  hsa_status_t err = hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);

  // Do not look at `segment` unless the query succeeded; it would be
  // uninitialized otherwise.
  if (HSA_STATUS_SUCCESS != err) {
    return err;
  }

  if (HSA_AMD_SEGMENT_GLOBAL != segment) {
    return HSA_STATUS_SUCCESS;
  }

  hsa_amd_memory_pool_global_flag_t flags;
  err = hsa_amd_memory_pool_get_info(pool,
                             HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);

  if (HSA_STATUS_SUCCESS == err
      && (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) {
    hsa_amd_memory_pool_t* ret = static_cast<hsa_amd_memory_pool_t*>(data);
    *ret = pool;
    return HSA_STATUS_INFO_BREAK;
  }

  return err;
}
// Find the dGPU's local memory pool.
//
// Memory-pool iteration callback. With the memory pool API each agent only
// reports its own memory pools, so a coarse grained pool in the global
// segment is the one we are looking for. On a match the pool is stored
// through `data` (which must point to an hsa_amd_memory_pool_t) and
// HSA_STATUS_INFO_BREAK is returned; non-matching pools return
// HSA_STATUS_SUCCESS and a failing query returns its error status.
static hsa_status_t get_local_memory_pool(hsa_amd_memory_pool_t pool,
    void* data) {
  hsa_amd_segment_t pool_segment;
  hsa_status_t status = hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &pool_segment);
  if (status != HSA_STATUS_SUCCESS) {
    return status;
  }

  if (pool_segment != HSA_AMD_SEGMENT_GLOBAL) {
    return HSA_STATUS_SUCCESS;
  }

  hsa_amd_memory_pool_global_flag_t pool_flags;
  status = hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &pool_flags);
  if (status != HSA_STATUS_SUCCESS) {
    return status;
  }

  if (!(pool_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) {
    return HSA_STATUS_SUCCESS;
  }

  *((hsa_amd_memory_pool_t*) data) = pool;
  return HSA_STATUS_INFO_BREAK;
}
void VectorCopy::SetUp() {
hsa_status_t err;
hsa_agent_t* gpu_dev = gpu_device1();
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
return;
}
//Create a queue with max number size
hsa_queue_t* q;
rocrtst::CreateQueue(*gpu_dev, &q);
set_main_queue(q);
rocrtst::LoadKernelFromObjFile(this);
// Obtain the current queue write index.
gQueueIndex = hsa_queue_load_write_index_scacquire(main_queue());
rocrtst::InitializeAQLPacket(this, &aql());
uint16_t header = 0;
header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
aql().grid_size_x = (uint32_t)(1024 * 1024);
aql().kernarg_address = (void*) kernarg_address;
// Find system memory pool for kernarg allocation.
// hsa_amd_memory_pool_t sys_coarse_grained_pool;
err = hsa_amd_agent_iterate_memory_pools(cpus[0],
get_sys_coarse_grained_memory_pool, &sys_coarse_grained_pool_);
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
// Get local memory pool of the first GPU.
// hsa_amd_memory_pool_t gpu_pool_;
err = hsa_amd_agent_iterate_memory_pools(gpus[0], get_local_memory_pool,
&gpu_pool_);
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
return;
}
void VectorCopy::Run() {
hsa_status_t err;
void* in;
void* out;
if (!rocrtst::CheckProfile(this)) {
return;
}
// Allocate vector on the first GPU local memory as input.
err = hsa_amd_memory_pool_allocate(gpu_pool_, BUFFER_SIZE, 0, &in);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
std::cout << "Allocating " << BUFFER_SIZE <<
" Bytes of local memory on the first GPU, address = " <<
in << std::endl;
// rocrtst::CommonCleanUp input buffer on the first GPU to 1 for each byte.
err = hsa_amd_memory_fill(in, 0x01010101, BUFFER_SIZE / 4);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Allocate vector on the first GPU local memory as output
err = hsa_amd_memory_pool_allocate(gpu_pool_, BUFFER_SIZE, 0, &out);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
std::cout << "Allocating " << BUFFER_SIZE <<
" Bytes of local memory on the second GPU, address = " <<
out << std::endl;
// rocrtst::CommonCleanUp output buffer on the first GPU to 0.
err = hsa_amd_memory_fill(out, 0x00000000, BUFFER_SIZE / 4);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
typedef struct args_t {
void* in;
void* out;
} args;
args* kargs;
kargs->in = in;
kargs->out = out;
// Allocate the kernel argument buffer from the system memory pool.
err = hsa_amd_memory_pool_allocate(sys_coarse_grained_pool_, kernarg_size(),
0, &kernarg_address);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
memcpy(kernarg_address, &kargs, sizeof(args));
// Map kernarg space to the first GPU
err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, kernarg_address);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
/*
* Increment the write index and ring the doorbell to dispatch the kernel.
*/
hsa_queue_store_write_index_screlease(main_queue(), gQueueIndex + 1);
hsa_signal_store_relaxed(main_queue()->doorbell_signal, gQueueIndex);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Wait on the dispatch completion signal until the kernel is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED))
;
// Reset signal value for future usage to copy output.
hsa_signal_store_screlease(signal(), 1);
// Allocate vector on the system memory pool.
err = hsa_amd_memory_pool_allocate(sys_coarse_grained_pool_, BUFFER_SIZE, 0,
(void**) &gCPUOutput);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Allow the first GPU to access the output
err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, gCPUOutput);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
//Copy the output from GPU to the CPU buffer for validation
err = hsa_amd_memory_async_copy(gCPUOutput, cpus[0], out, gpus[0],
BUFFER_SIZE, 0, NULL, signal());
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Wait on the completion signal until the async copy is finished.
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED))
;
for (uint32_t i = 0; i < BUFFER_SIZE; i++) {
ASSERT_EQ(gCPUOutput[i], 1);
}
return;
}
// Releases what the test acquired: the kernarg buffer, the completion
// signal, the main queue and the host-side output buffer, then runs the
// common rocrtst cleanup. Each step is asserted, so the first failing call
// ends the function and the remaining resources are not released.
void VectorCopy::Close() {
hsa_status_t err;
// Cleanup all allocated resources.
// kernarg_address and gCPUOutput are the buffers allocated in Run().
err = hsa_amd_memory_pool_free(kernarg_address);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_signal_destroy(signal());
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_queue_destroy(main_queue());
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_memory_pool_free(gCPUOutput);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = rocrtst::CommonCleanUp(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
return;
}
// This test has no results to print. The profile check is still performed,
// but its outcome does not change anything here.
void VectorCopy::DisplayResults() const {
  (void)rocrtst::CheckProfile(this);
}
-109
Просмотреть файл
@@ -1,109 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_VECTOR_COPY_H__
#define __ROCRTST_SRC_VECTOR_COPY_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "common/common.h"
#include "common/hsatimer.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include "hsa/hsa_ext_finalize.h"
#include <algorithm>
#include <vector>
//@Brief: Vector copy test. Run() fills a buffer in GPU local memory,
//dispatches the loaded kernel with {in, out} arguments, copies the output
//buffer back to host memory and checks that every element equals 1.
class VectorCopy: public rocrtst::BaseRocR, public PerfBase {
public:
//@Brief: Constructor
VectorCopy();
//@Brief: Destructor
virtual ~VectorCopy();
//@Brief: Set up the environment for the test (HSA init, queue, kernel,
//memory pool lookup)
virtual void SetUp();
//@Brief: Run the test case
virtual void Run();
//@Brief: Only performs the profile check; this test prints no results
virtual void DisplayResults() const;
//@Brief: Clean up and close the runtime
virtual void Close();
private:
//@Brief: Store the size of queue
uint32_t queue_size_;
//@Brief: Kernel argument buffer; allocated in Run() from
//sys_coarse_grained_pool_ and freed in Close()
void* kernarg_address;
//@Brief: The mean time of CP Processing
double mean_;
//@Brief: The group memory region
hsa_region_t group_region_;
//@Brief: Pool selected by get_local_memory_pool() for gpus[0] in SetUp()
hsa_amd_memory_pool_t gpu_pool_;
//@Brief: Pool selected by get_sys_coarse_grained_memory_pool() for cpus[0]
//in SetUp(); used for the kernarg and host output buffers
hsa_amd_memory_pool_t sys_coarse_grained_pool_;
//@Brief: CPU and GPU agents; only element [0] of each is used by
//SetUp()/Run()
std::vector<hsa_agent_t> cpus;
std::vector<hsa_agent_t> gpus;
//@Brief: Pointer to cu_id array
uint32_t* cu_;
uint32_t manual_input;
uint32_t group_input;
};
#endif
-106
Просмотреть файл
@@ -1,106 +0,0 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef __ROCRTST_SRC_VECTOR_COPY_P2P_H__
#define __ROCRTST_SRC_VECTOR_COPY_P2P_H__
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "common/common.h"
#include "common/hsatimer.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include "hsa/hsa_ext_finalize.h"
#include <algorithm>
#include <vector>
//@Brief: This class is defined to measure the mean latency of launching
//an empty kernel
// NOTE(review): the description above is word-for-word the same as the one
// on VectorCopy and does not match the class name - presumably copied from
// another test; confirm against the implementation.
class VectorCopyP2P: public rocrtst::BaseRocR, public PerfBase {
public:
//@Brief: Constructor
VectorCopyP2P();
//@Brief: Destructor
virtual ~VectorCopyP2P();
//@Brief: Set up the environment for the test
virtual void SetUp();
//@Brief: Run the test case
virtual void Run();
//@Brief: Display results we got
virtual void DisplayResults() const;
//@Brief: Clean up and close the runtime
virtual void Close();
private:
//@Brief: Get actual iteration number
virtual size_t RealIterationNum();
//@Brief: Create Queue
virtual void CreateQueue();
//@Brief: Store the size of queue
uint32_t queue_size_;
//@Brief: The mean time of CP Processing
double mean_;
//@Brief: The group memory region
hsa_region_t group_region_;
//@Brief: Pointer to cu_id array
uint32_t* cu_;
uint32_t manual_input;
uint32_t group_input;
};
#endif
+141
Просмотреть файл
@@ -0,0 +1,141 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <assert.h>
#include "suites/test_common/test_base.h"
#include "common/base_rocr_utils.h"
#include "gtest/gtest.h"
// Output width in characters; TestBase::set_description() wraps the
// description at this width minus 4.
static const int kOutputLineLength = 80;
// Marker placed on both sides of a section label by MakeHeaderStr().
static const char kLabelDelimiter[] = "####";
// Section labels printed by the TestBase methods below.
static const char kDescriptionLabel[] = "TEST DESCRIPTION";
static const char kTitleLabel[] = "TEST NAME";
static const char kSetupLabel[] = "TEST SETUP";
static const char kRunLabel[] = "TEST EXECUTION";
static const char kCloseLabel[] = "TEST CLEAN UP";
static const char kResultsLabel[] = "TEST RESULTS";
// Constructs the test with an empty description.
TestBase::TestBase() {
set_description("");
}
// Nothing to release here.
TestBase::~TestBase() {
}
// Builds a section banner of the form "<delim> <inStr> <delim>", where
// <delim> is kLabelDelimiter, and stores it in *outStr. Any previous content
// of *outStr is discarded. Both pointers must be non-null.
static void MakeHeaderStr(const char *inStr, std::string *outStr) {
  assert(outStr != nullptr);
  assert(inStr != nullptr);

  outStr->assign(kLabelDelimiter);
  outStr->append(" ").append(inStr).append(" ").append(kLabelDelimiter);
}
// Prints the "TEST SETUP" banner, then initializes HSA for this test via
// rocrtst::InitAndSetupHSA(); a non-success status is a test failure.
void TestBase::SetUp(void) {
  std::string banner;
  MakeHeaderStr(kSetupLabel, &banner);
  printf("\n\t%s\n", banner.c_str());

  hsa_status_t err = rocrtst::InitAndSetupHSA(this);
  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
}
// Prints the "TEST EXECUTION" banner and does nothing else.
void TestBase::Run(void) {
  std::string banner;
  MakeHeaderStr(kRunLabel, &banner);

  printf("\n\t%s\n", banner.c_str());
}
// Prints the "TEST CLEAN UP" banner, then runs rocrtst::CommonCleanUp() for
// this test; a non-success status is a test failure.
void TestBase::Close(void) {
  std::string banner;
  MakeHeaderStr(kCloseLabel, &banner);
  printf("\n\t%s\n", banner.c_str());

  hsa_status_t err = rocrtst::CommonCleanUp(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
// Prints the "TEST RESULTS" banner and does nothing else.
void TestBase::DisplayResults(void) const {
  std::string banner;
  MakeHeaderStr(kResultsLabel, &banner);

  printf("\n\t%s\n", banner.c_str());
}
// Prints a separator line followed by the test's title. The description is
// printed as well when verbosity is VERBOSE_STANDARD or higher.
void TestBase::DisplayTestInfo(void) {
  printf("#########################################"
         "######################################\n");

  std::string heading;

  MakeHeaderStr(kTitleLabel, &heading);
  printf("\n\t%s\n%s\n", heading.c_str(), title().c_str());

  if (verbosity() < VERBOSE_STANDARD) {
    return;
  }

  MakeHeaderStr(kDescriptionLabel, &heading);
  printf("\n\t%s\n%s\n", heading.c_str(), description().c_str());
}
// Stores the test description, word-wrapped for display.
//
// Selected spaces are replaced by '\n' so that lines fit in roughly
// kOutputLineLength - 4 columns. For text whose words all fit within that
// width the break positions are the same as before.
//
// A word longer than the line width used to make find_last_of() return npos,
// or a space on an earlier line, so replace() threw std::out_of_range or
// re-broke an earlier line. Now the overlong word stays on its own line and
// the break is made at the next space, if there is one.
//
// @param d  Description text. Newlines already present in it are not taken
//           into account when measuring line length.
void TestBase::set_description(std::string d) {
  const size_t max_len = kOutputLineLength - 4;

  description_ = d;

  size_t line_start = 0;   // Index of the first character of the current line
  size_t limit = max_len;  // Right-most index at which this line may break

  while (limit < description_.size()) {
    // Prefer the last space that still fits on the current line.
    size_t brk = description_.find_last_of(' ', limit);

    if (brk == std::string::npos || brk < line_start) {
      // No space on this line: break at the first space after the limit.
      brk = description_.find(' ', limit + 1);

      if (brk == std::string::npos) {
        break;  // The rest is one unbreakable word.
      }
    }

    description_[brk] = '\n';
    line_start = brk + 1;
    limit = brk + max_len;
  }
}
@@ -42,52 +42,43 @@
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_
#define ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_
#ifndef __ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__
#define __ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__
#include "perf_common/perf_base.h"
#include <string>
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include <vector>
class QueueConcurrency: public rocrtst::BaseRocR, public PerfBase {
class TestBase : public rocrtst::BaseRocR {
public:
//@Brief: Constructor
QueueConcurrency();
//@Brief: Destructor
~QueueConcurrency();
TestBase(void);
//@Brief: Set up the test environmnet
void SetUp();
virtual ~TestBase(void);
//@Brief: Run the test
void Run();
enum VerboseLevel {VERBOSE_MIN = 0, VERBOSE_STANDARD, VERBOSE_PROGRESS};
//@Brief: Clean up and close
void Close();
// @Brief: Before run the core measure codes, do something to set up
// i.e. init runtime, prepare packet...
virtual void SetUp(void);
void DisplayResults() const;
// @Brief: Core measurement codes executing here
virtual void Run(void);
// @Brief: Do something clean up
virtual void Close(void);
// @Brief: Display the results
virtual void DisplayResults(void) const;
// @Brief: Display information about the test
virtual void DisplayTestInfo(void);
const std::string & description(void) const {return description_;}
void set_description(std::string d);
private:
//@Brief: Thread function
void ThreadFunc(int i);
//@Brief: Calculate the concurrent queue number
void CalculateQueueNum();
//@Brief: Vector to store execution time
std::vector<double> execution_time_;
//@Brief: Number of concurrent queues
size_t queue_num_;
//@Brief: Store the standard execution time
double std_time_;
std::string description_;
};
#endif //__ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__
#endif // ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_
@@ -43,77 +43,79 @@
*
*/
#ifndef __ROCRTST_SRC_INC_DEVICE_LOAD_BANDWIDTH_H__
#define __ROCRTST_SRC_INC_DEVICE_LOAD_BANDWIDTH_H__
#include <assert.h>
#include <stdint.h>
#include <iostream>
#include <getopt.h>
#include "perf_common/perf_base.h"
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include <stdio.h>
#include "suites/test_common/test_common.h"
class DeviceLoadBandwidth: public rocrtst::BaseRocR, public PerfBase {
public:
//@Brief: Constructor
DeviceLoadBandwidth();
RocrtstOptions::RocrtstOptions(uint32_t *verb, uint32_t *iter) {
assert(verb != nullptr);
assert(iter != nullptr);
//@Brief: Destructor
~DeviceLoadBandwidth();
verbosity_ = verb;
iterations_ = iter;
}
//@Brief: Set up the testing environment
virtual void SetUp();
RocrtstOptions::~RocrtstOptions() {
}
//@Brief: Run the test case
virtual void Run();
static const struct option long_options[] = {
{"iterations", required_argument, nullptr, 'i'},
{"verbose", no_argument, nullptr, 'v'},
//@Brief: Close and clean up the test enrionment
virtual void Close();
//@Brief: Display load bandwidth
virtual void DisplayResults() const;
//@Brief: Set work-item configuration.
// When INTERACTIVE is defined the values are read from stdin; otherwise
// fixed defaults are used (16 CUs, 128 groups, group size 64, 16 kernel
// loops).
// The targets are uint32_t, so the scanf conversions use %u; the previous %d
// expects an int*.
// NOTE(review): the scanf results are not checked, so a failed read leaves
// the corresponding value untouched.
void SetWorkItemNum() {
#ifdef INTERACTIVE
  uint32_t tmp;
  printf("Please input the number of CUs you want to try:\n");
  scanf("%u", &num_cus_);
  printf("Please input the number of groups you want to try:\n");
  scanf("%u", &num_group_);
  printf("Please input the size of each group:\n");
  scanf("%u", &tmp);
  set_group_size(tmp);
  printf("Please input the number of kernel loop you want to try:\n");
  scanf("%u", &kernel_loop_count_);
#else
  num_cus_ = 16;
  num_group_ = 128;
  set_group_size(64);
  kernel_loop_count_ = 16;
#endif
  return;
}
private:
//@Brief: number of group
uint32_t num_group_;
//@Brief: number of CUs
uint32_t num_cus_;
//@Brief: number of kernel loop
uint32_t kernel_loop_count_;
//@Brief: Mean execution time
double mean_;
//@Brief: data size for test
uint64_t data_size_;
uint32_t* in_data_;
uint32_t* out_data_;
{nullptr, 0, nullptr, 0}
};
static const char* short_options = "i:v:r";
#endif
// Writes the command-line usage text for the rocrtst options to std::cout.
// NOTE(review): the text advertises "--verbosity" and "--rocrtst_help", but
// the long_options table in this file defines only "iterations" and
// "verbose" (the latter with no_argument); -r exists only as a short option.
// Confirm which spelling is intended.
static void PrintHelp(void) {
std::cout <<
// "Required Arguments:\n"
// "--kernel, -k <path to kernel obj. file>\n"
"Optional RocRTst Arguments:\n"
"--iterations, -i <number of iterations to execute>; override default, "
"which varies for each test\n"
"--rocrtst_help, -r print this help message\n"
"--verbosity, -v <verbosity level>\n"
" Verbosity levels:\n"
" 0 -- minimal; just summary information\n"
" 1 -- intermediate; show intermediate values such as intermediate "
"perf. data\n"
" 2 -- progress; show progress displays\n"
" >= 3 -- more debug output\n";
}
// Parses the rocrtst command-line options and stores the values through the
// pointers held by *test.
//
// @param test      Receives the parsed iteration count and verbosity; must
//                  not be null.
// @param arg_cnt   Argument count, as passed to main().
// @param arg_list  Argument vector, as passed to main().
// @return 0 when all options were consumed; 1 when the help text was printed
//         (help requested, unknown option, or an option given without its
//         value).
//
// Note: std::stoi() still throws if the supplied value is not a number.
uint32_t ProcessCmdline(RocrtstOptions* test, int arg_cnt, char** arg_list) {
  assert(test != nullptr);

  int opt_index = -1;

  while (true) {
    const int opt = getopt_long(arg_cnt, arg_list, short_options,
                                long_options, &opt_index);

    if (opt == -1) {
      break;
    }

    switch (opt) {
      case 'i':
      case 'v':
        // "--verbose" is declared with no_argument in long_options, so
        // optarg can be null here; passing null to std::stoi() would build
        // a std::string from a null pointer.
        if (optarg == nullptr) {
          PrintHelp();
          return 1;
        }

        if (opt == 'i') {
          *test->iterations_ = std::stoi(optarg);
        } else {
          *test->verbosity_ = std::stoi(optarg);
        }
        break;

      case 'r':
      default:
        PrintHelp();
        return 1;
    }
  }

  return 0;
}
@@ -43,24 +43,19 @@
*
*/
#ifndef ROCRTST_SUITES_TEST_COMMON_TEST_COMMON_H_
#define ROCRTST_SUITES_TEST_COMMON_TEST_COMMON_H_
#ifndef ROCRTST_SUITES_PERFORMANCE_PERF_COMMON_PERF_BASE_H_
#define ROCRTST_SUITES_PERFORMANCE_PERF_COMMON_PERF_BASE_H_
class PerfBase {
class RocrtstOptions {
public:
// @Brief: Before run the core measure codes, do something to set up
// i.e. init runtime, prepare packet...
virtual void SetUp(void) = 0;
RocrtstOptions(uint32_t *verb, uint32_t *iter);
// @Brief: Core measurement codes executing here
virtual void Run(void) = 0;
~RocrtstOptions(void);
// @Brief: Do something clean up
virtual void Close(void) = 0;
// @Brief: Display the results
virtual void DisplayResults(void) const = 0;
uint32_t *verbosity_;
uint32_t *iterations_;
};
#endif // ROCRTST_SUITES_PERFORMANCE_PERF_COMMON_PERF_BASE_H_
uint32_t ProcessCmdline(RocrtstOptions* test, int arg_cnt, char** arg_list);
#endif // ROCRTST_SUITES_TEST_COMMON_TEST_COMMON_H_