From a12c5628ea8690f85b75e03e93eabac4e532af09 Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Wed, 28 Jun 2017 10:54:57 -0500 Subject: [PATCH] Added dispatch time, async copy and test template rocrtst tests Change-Id: I57a844ee65c36bd61616ee6d60d358303f51db56 --- rocrtst/common/base_rocr.cc | 5 +- rocrtst/common/base_rocr.h | 17 - rocrtst/common/base_rocr_utils.cc | 122 ++- rocrtst/common/base_rocr_utils.h | 38 +- rocrtst/common/common.cc | 39 - rocrtst/common/common.h | 30 - rocrtst/common/helper_funcs.cc | 16 +- rocrtst/common/helper_funcs.h | 4 +- rocrtst/common/hsa_perf_cntrs.cc | 4 + rocrtst/common/hsatimer.cc | 3 +- rocrtst/common/hsatimer.h | 1 + rocrtst/samples/CMakeLists.txt | 5 + rocrtst/suites/performance/CMakeLists.txt | 108 ++- rocrtst/suites/performance/cp_process_time.cc | 258 ----- rocrtst/suites/performance/cp_process_time.h | 91 -- rocrtst/suites/performance/cu_masking.cc | 220 ----- rocrtst/suites/performance/cu_masking.h | 103 -- .../performance/device_load_bandwidth.cc | 293 ------ .../performance/device_store_bandwidth.cc | 219 ----- .../performance/device_store_bandwidth.h | 119 --- rocrtst/suites/performance/dispatch_time.cc | 247 ++--- rocrtst/suites/performance/dispatch_time.h | 87 +- rocrtst/suites/performance/flush_latency.cc | 351 ------- rocrtst/suites/performance/flush_latency.h | 122 --- rocrtst/suites/performance/hsa_info.cc | 502 ---------- rocrtst/suites/performance/image_bandwidth.cc | 328 ------- rocrtst/suites/performance/image_bandwidth.h | 99 -- .../performance/image_load_bandwidth.cc | 270 ------ .../performance/image_store_bandwidth.cc | 271 ------ .../performance/kernels/cu_masking.brig | Bin 1200 -> 0 bytes .../dispatch_time_kernels.cl} | 45 +- .../performance/kernels/empty_kernel.hsail | 12 - .../performance/kernels/flush_latency.hsail | 88 -- .../kernels/flush_latency_base.hsail | 88 -- .../performance/kernels/load_2d_image.hsail | 109 --- .../performance/kernels/simple_kernel.hsail | 37 - 
.../kernels/simple_kernel_base.hsail | 28 - .../performance/kernels/store_2d_image.hsail | 105 -- .../performance/kernels/sysMemRead.hsail | 237 ----- .../performance/kernels/sysMemRead_base.hsail | 237 ----- .../performance/kernels/sysMemWrite.hsail | 105 -- .../kernels/sysMemWrite_base.hsail | 105 -- .../test_case_template_kernels.cl} | 46 +- .../performance/kernels/test_kernel.hsail | 53 - .../kernels/transpose_kernel.hsail | 108 --- .../performance/kernels/vector_copy.hsail | 34 - .../kernels/vector_copy_base.hsail | 64 -- .../kernels/vector_copy_full.hsail | 64 -- rocrtst/suites/performance/main.cc | 267 ++--- .../suites/performance/matrix_transpose.cc | 289 ------ rocrtst/suites/performance/matrix_transpose.h | 101 -- .../suites/performance/memory_allocation.cc | 198 ---- .../suites/performance/memory_allocation.h | 98 -- .../suites/performance/memory_async_copy.cc | 912 ++++++++---------- .../suites/performance/memory_async_copy.h | 257 +++-- rocrtst/suites/performance/memory_copy.cc | 411 -------- rocrtst/suites/performance/memory_copy.h | 109 --- .../suites/performance/queue_concurrency.cc | 284 ------ .../queue_create_destroy_latency.cc | 271 ------ .../queue_create_destroy_latency.h | 95 -- .../performance/system_load_bandwidth.cc | 281 ------ .../performance/system_load_bandwidth.h | 119 --- .../performance/system_store_bandwidth.cc | 243 ----- .../performance/system_store_bandwidth.h | 121 --- .../suites/performance/test_case_template.cc | 395 ++++++++ ..._load_bandwidth.h => test_case_template.h} | 39 +- rocrtst/suites/performance/vector_copy.cc | 279 ------ rocrtst/suites/performance/vector_copy.h | 109 --- .../performance/vector_copy_peer_to_peer.h | 106 -- rocrtst/suites/test_common/test_base.cc | 141 +++ .../test_base.h} | 63 +- .../test_common.cc} | 132 +-- .../perf_base.h => test_common/test_common.h} | 25 +- 73 files changed, 1592 insertions(+), 9290 deletions(-) mode change 100644 => 100755 rocrtst/common/base_rocr.h delete mode 100755 
rocrtst/suites/performance/cp_process_time.cc delete mode 100755 rocrtst/suites/performance/cp_process_time.h delete mode 100644 rocrtst/suites/performance/cu_masking.cc delete mode 100755 rocrtst/suites/performance/cu_masking.h delete mode 100755 rocrtst/suites/performance/device_load_bandwidth.cc delete mode 100755 rocrtst/suites/performance/device_store_bandwidth.cc delete mode 100755 rocrtst/suites/performance/device_store_bandwidth.h delete mode 100755 rocrtst/suites/performance/flush_latency.cc delete mode 100755 rocrtst/suites/performance/flush_latency.h delete mode 100755 rocrtst/suites/performance/hsa_info.cc delete mode 100755 rocrtst/suites/performance/image_bandwidth.cc delete mode 100755 rocrtst/suites/performance/image_bandwidth.h delete mode 100755 rocrtst/suites/performance/image_load_bandwidth.cc delete mode 100755 rocrtst/suites/performance/image_store_bandwidth.cc delete mode 100644 rocrtst/suites/performance/kernels/cu_masking.brig rename rocrtst/suites/performance/{hsa_info.h => kernels/dispatch_time_kernels.cl} (72%) delete mode 100755 rocrtst/suites/performance/kernels/empty_kernel.hsail delete mode 100755 rocrtst/suites/performance/kernels/flush_latency.hsail delete mode 100755 rocrtst/suites/performance/kernels/flush_latency_base.hsail delete mode 100755 rocrtst/suites/performance/kernels/load_2d_image.hsail delete mode 100755 rocrtst/suites/performance/kernels/simple_kernel.hsail delete mode 100755 rocrtst/suites/performance/kernels/simple_kernel_base.hsail delete mode 100755 rocrtst/suites/performance/kernels/store_2d_image.hsail delete mode 100755 rocrtst/suites/performance/kernels/sysMemRead.hsail delete mode 100755 rocrtst/suites/performance/kernels/sysMemRead_base.hsail delete mode 100755 rocrtst/suites/performance/kernels/sysMemWrite.hsail delete mode 100755 rocrtst/suites/performance/kernels/sysMemWrite_base.hsail rename rocrtst/suites/performance/{image_store_bandwidth.h => kernels/test_case_template_kernels.cl} (72%) delete mode 
100755 rocrtst/suites/performance/kernels/test_kernel.hsail delete mode 100755 rocrtst/suites/performance/kernels/transpose_kernel.hsail delete mode 100755 rocrtst/suites/performance/kernels/vector_copy.hsail delete mode 100755 rocrtst/suites/performance/kernels/vector_copy_base.hsail delete mode 100755 rocrtst/suites/performance/kernels/vector_copy_full.hsail mode change 100644 => 100755 rocrtst/suites/performance/main.cc delete mode 100755 rocrtst/suites/performance/matrix_transpose.cc delete mode 100755 rocrtst/suites/performance/matrix_transpose.h delete mode 100755 rocrtst/suites/performance/memory_allocation.cc delete mode 100755 rocrtst/suites/performance/memory_allocation.h mode change 100644 => 100755 rocrtst/suites/performance/memory_async_copy.cc delete mode 100755 rocrtst/suites/performance/memory_copy.cc delete mode 100644 rocrtst/suites/performance/memory_copy.h delete mode 100755 rocrtst/suites/performance/queue_concurrency.cc delete mode 100755 rocrtst/suites/performance/queue_create_destroy_latency.cc delete mode 100755 rocrtst/suites/performance/queue_create_destroy_latency.h delete mode 100755 rocrtst/suites/performance/system_load_bandwidth.cc delete mode 100755 rocrtst/suites/performance/system_load_bandwidth.h delete mode 100755 rocrtst/suites/performance/system_store_bandwidth.cc delete mode 100755 rocrtst/suites/performance/system_store_bandwidth.h create mode 100755 rocrtst/suites/performance/test_case_template.cc rename rocrtst/suites/performance/{image_load_bandwidth.h => test_case_template.h} (76%) delete mode 100644 rocrtst/suites/performance/vector_copy.cc delete mode 100755 rocrtst/suites/performance/vector_copy.h delete mode 100755 rocrtst/suites/performance/vector_copy_peer_to_peer.h create mode 100755 rocrtst/suites/test_common/test_base.cc rename rocrtst/suites/{performance/queue_concurrency.h => test_common/test_base.h} (70%) rename rocrtst/suites/{performance/device_load_bandwidth.h => test_common/test_common.cc} (55%) rename 
rocrtst/suites/{performance/perf_common/perf_base.h => test_common/test_common.h} (78%) diff --git a/rocrtst/common/base_rocr.cc b/rocrtst/common/base_rocr.cc index a7aa71649a..4c95e4cf85 100755 --- a/rocrtst/common/base_rocr.cc +++ b/rocrtst/common/base_rocr.cc @@ -50,11 +50,9 @@ namespace rocrtst { BaseRocR::BaseRocR(void) { - num_iteration_ = 100; - signal_.handle = 0; + num_iteration_ = 1; cpu_device_.handle = -1; gpu_device1_.handle = -1; - region_.handle = 0; device_pool_.handle = 0; kern_arg_pool_.handle = 0; main_queue_ = nullptr; @@ -66,6 +64,7 @@ BaseRocR::BaseRocR(void) { orig_hsa_enable_interrupt_ = GetEnv("HSA_ENABLE_INTERRUPT"); set_kernel_file_name(""); set_verbosity(0); + set_title("unset_title"); } BaseRocR::~BaseRocR() { diff --git a/rocrtst/common/base_rocr.h b/rocrtst/common/base_rocr.h old mode 100644 new mode 100755 index f96b2a9e00..121c5318df --- a/rocrtst/common/base_rocr.h +++ b/rocrtst/common/base_rocr.h @@ -105,13 +105,6 @@ class BaseRocR { return kernel_object_; } - void set_signal(hsa_signal_t sig) { - signal_.handle = sig.handle; - } - const hsa_signal_t& signal(void) const { - return signal_; - } - void set_profile(hsa_profile_t in_prof) { profile_ = in_prof; } @@ -151,10 +144,6 @@ class BaseRocR { return aql_; } - hsa_region_t& region(void) { - return region_; - } - void set_num_iteration(int num) { num_iteration_ = num; } @@ -237,16 +226,12 @@ class BaseRocR { private: uint64_t num_iteration_; ///< Number of times to execute test - hsa_signal_t signal_; ///< Completion signal used for kernel execution - hsa_queue_t* main_queue_; ///< AQL queue used for packets hsa_agent_t gpu_device1_; ///< Handle to first GPU found hsa_agent_t cpu_device_; ///< Handle to CPU - hsa_region_t region_; ///< TODO(cfreehil): delete this - hsa_amd_memory_pool_t device_pool_; ///< Memory pool on gpu pool list hsa_amd_memory_pool_t cpu_pool_; ///< Memory pool on cpu pool list @@ -255,8 +240,6 @@ class BaseRocR { uint64_t kernel_object_; ///< Handle to 
kernel code - std::string brig_file_; // TODO(cfreehil): delete this - std::string kernel_file_name_; ///< Code object file name std::string kernel_name_; ///< Kernel name diff --git a/rocrtst/common/base_rocr_utils.cc b/rocrtst/common/base_rocr_utils.cc index 05bc1a9c28..bba6391419 100755 --- a/rocrtst/common/base_rocr_utils.cc +++ b/rocrtst/common/base_rocr_utils.cc @@ -70,6 +70,8 @@ namespace rocrtst { } \ } +// Clean up some of the common handles and memory used by BaseRocR code, then +// shut down hsa. Restore HSA_ENABLE_INTERRUPT to original value, if necessary hsa_status_t CommonCleanUp(BaseRocR* test) { hsa_status_t err; @@ -87,13 +89,9 @@ hsa_status_t CommonCleanUp(BaseRocR* test) { test->set_main_queue(nullptr); } - if (0 != test->signal().handle) { - hsa_signal_t sig; - sig.handle = 0; - - err = hsa_signal_destroy(test->signal()); + if (test->aql().completion_signal.handle != 0) { + err = hsa_signal_destroy(test->aql().completion_signal); RET_IF_HSA_UTILS_ERR(err); - test->set_signal(sig); } err = hsa_shut_down(); @@ -122,7 +120,7 @@ static const char* PROFILE_STR[] = {"HSA_PROFILE_BASE", "HSA_PROFILE_FULL", }; /// \returns bool /// - true Machine meets test requirements /// - false Machine does not meet test requirements -static bool CheckProfileAndInform(BaseRocR* test) { +bool CheckProfileAndInform(BaseRocR* test) { if (test->verbosity() > 0) { std::cout << "Target HW Profile is " << PROFILE_STR[test->profile()] << std::endl; @@ -162,6 +160,10 @@ static hsa_status_t ProcessIterateError(hsa_status_t err) { return err; } +// Find pools for cpu, gpu and for kernel arguments. These pools have +// common basic requirements, but are not suitable for all cases. In +// that case, set cpu_pool(), device_pool() and/or kern_arg_pool() +// yourself instead of using this function. 
hsa_status_t SetPoolsTypical(BaseRocR* test) { hsa_status_t err; @@ -180,11 +182,9 @@ hsa_status_t SetPoolsTypical(BaseRocR* test) { return HSA_STATUS_SUCCESS; } +// Enable interrupts if necessary, and call hsa_init() hsa_status_t InitAndSetupHSA(BaseRocR* test) { - hsa_agent_t gpu_device1; - hsa_agent_t cpu_device; hsa_status_t err; - hsa_signal_t sig; if (test->enable_interrupt()) { SetEnv("HSA_ENABLE_INTERRUPT", "1"); @@ -193,6 +193,15 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) { err = hsa_init(); RET_IF_HSA_UTILS_ERR(err); + return HSA_STATUS_SUCCESS; +} + +// Attempt to find and set test->cpu_device and test->gpu_device1 +hsa_status_t SetDefaultAgents(BaseRocR* test) { + hsa_agent_t gpu_device1; + hsa_agent_t cpu_device; + hsa_status_t err; + gpu_device1.handle = 0; err = hsa_iterate_agents(FindGPUDevice, &gpu_device1); RET_IF_HSA_UTILS_ERR(rocrtst::ProcessIterateError(err)); @@ -217,7 +226,7 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) { char name[64] = {0}; err = hsa_agent_get_info(gpu_device1, HSA_AGENT_INFO_NAME, name); RET_IF_HSA_UTILS_ERR(err); - std::cout << "The device name is " << name << std::endl; + std::cout << "The gpu device name is " << name << std::endl; } hsa_profile_t profile; @@ -228,14 +237,11 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) { if (!CheckProfileAndInform(test)) { return HSA_STATUS_ERROR; } - - err = hsa_signal_create(1, 0, NULL, &sig); - RET_IF_HSA_UTILS_ERR(err); - test->set_signal(sig); - return HSA_STATUS_SUCCESS; } +// See if the profile of the target matches any required profile by the +// test program. bool CheckProfile(BaseRocR const* test) { if (test->requires_profile() == -1) { return true; @@ -243,6 +249,19 @@ bool CheckProfile(BaseRocR const* test) { return (test->requires_profile() == test->profile()); } } +// Load the specified kernel code from the specified file, inspect and fill +// in BaseRocR member variables related to the kernel and executable. 
+// Required Input BaseRocR member variables: +// - gpu_device1() +// - kernel_file_name() +// - kernel_name() +// +// Written BaseRocR member variables: +// -kernel_object() +// -private_segment_size() +// -group_segment_size() +// -kernarg_size() +// -kernarg_align() hsa_status_t LoadKernelFromObjFile(BaseRocR* test) { hsa_status_t err; hsa_code_object_reader_t code_obj_rdr = {0}; @@ -334,13 +353,16 @@ hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue, return HSA_STATUS_SUCCESS; } - -void InitializeAQLPacket(const BaseRocR* test, +// Initialize the provided aql packet with standard default values, and +// values from provided BaseRocR object. +hsa_status_t InitializeAQLPacket(const BaseRocR* test, hsa_kernel_dispatch_packet_t* aql) { + hsa_status_t err; + assert(aql != nullptr); if (aql == nullptr) { - return; + return HSA_STATUS_ERROR; } aql->header = 0; // Set this right before doorbell ring @@ -361,19 +383,25 @@ void InitializeAQLPacket(const BaseRocR* test, // Pin kernel code and the kernel argument buffer to the aql packet-> aql->kernel_object = test->kernel_object(); - aql->kernarg_address = NULL; - aql->completion_signal.handle = test->signal().handle; + // aql->kernarg_address may be filled in by AllocAndSetKernArgs() if it is + // called before this function, so we don't want overwrite it, therefore + // we ignore it in this function. 
- return; + err = hsa_signal_create(1, 0, NULL, &aql->completion_signal); + + return err; } -void WriteAQLToQueue(BaseRocR* test) { +// Copy BaseRocR aql object values to the BaseRocR object queue in the +// specified queue position (ind) +hsa_kernel_dispatch_packet_t * WriteAQLToQueue(BaseRocR* test, uint64_t *ind) { assert(test); assert(test->main_queue()); void *queue_base = test->main_queue()->base_address; const uint32_t queue_mask = test->main_queue()->size - 1; uint64_t que_idx = hsa_queue_add_write_index_relaxed(test->main_queue(), 1); + *ind = que_idx; hsa_kernel_dispatch_packet_t* staging_aql_packet = &test->aql(); hsa_kernel_dispatch_packet_t* queue_aql_packet; @@ -395,8 +423,12 @@ void WriteAQLToQueue(BaseRocR* test) { queue_aql_packet->kernel_object = staging_aql_packet->kernel_object; queue_aql_packet->kernarg_address = staging_aql_packet->kernarg_address; queue_aql_packet->completion_signal = staging_aql_packet->completion_signal; + + return queue_aql_packet; } +// Allocate a buffer in the kern_arg_pool for the kernel arguments and write +// the arguments to buffer hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args, size_t arg_size) { void* kern_arg_buf = nullptr; hsa_status_t err; @@ -421,56 +453,18 @@ hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args, size_t arg_size) { assert(((uintptr_t)adj_kern_arg_buf + arg_size) < ((uintptr_t)kern_arg_buf + buf_size)); - err = hsa_memory_copy_workaround_cpu(adj_kern_arg_buf, args, arg_size); - RET_IF_HSA_UTILS_ERR(err); - hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()}; err = hsa_amd_agents_allow_access(2, ag_list, NULL, kern_arg_buf); RET_IF_HSA_UTILS_ERR(err); + err = hsa_memory_copy(adj_kern_arg_buf, args, arg_size); + RET_IF_HSA_UTILS_ERR(err); + test->aql().kernarg_address = adj_kern_arg_buf; return HSA_STATUS_SUCCESS; } -hsa_status_t AllocAndAllowAccess(BaseRocR* test, size_t len, - hsa_amd_memory_pool_t pool, void**buffer) { - hsa_status_t err; - - err = 
hsa_amd_memory_pool_allocate(pool, len, 0, buffer); - RET_IF_HSA_UTILS_ERR(err); - - hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()}; - err = hsa_amd_agents_allow_access(2, ag_list, NULL, *buffer); - RET_IF_HSA_UTILS_ERR(err); - - return err; -} - -hsa_status_t hsa_memory_fill_workaround_gen(void* ptr, uint32_t value, - size_t count, hsa_agent_t dst_ag, hsa_agent_t src_ag, BaseRocR* test) { - - hsa_status_t err; - - void *tmp_mem; - - err = hsa_amd_memory_pool_allocate(test->cpu_pool(), count, 0, &tmp_mem); - RET_IF_HSA_UTILS_ERR(err); - - hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()}; - err = hsa_amd_agents_allow_access(2, ag_list, NULL, tmp_mem); - RET_IF_HSA_UTILS_ERR(err); - - (void)memset(tmp_mem, value, count); - - err = hsa_memory_copy_workaround_gen(ptr, tmp_mem, count, dst_ag, src_ag); - RET_IF_HSA_UTILS_ERR(err); - - hsa_amd_memory_pool_free(tmp_mem); - - return HSA_STATUS_SUCCESS; -} - #undef RET_IF_HSA_UTILS_ERR } // namespace rocrtst diff --git a/rocrtst/common/base_rocr_utils.h b/rocrtst/common/base_rocr_utils.h index a1f0c73612..d083608314 100755 --- a/rocrtst/common/base_rocr_utils.h +++ b/rocrtst/common/base_rocr_utils.h @@ -60,14 +60,16 @@ namespace rocrtst { /// \param[in] test Test for which the kernel will be loaded. /// \returns HSA_STATUS_SUCCESS if no errors hsa_status_t LoadKernelFromObjFile(BaseRocR* test); -/// Do initialization tasks for HSA test program. This includes calling -/// hsa_init(), finding and setting the cpu and gpu agent member variables, -/// creating the signal needed for queueing AQL packets and checking -/// HW requirements. + +/// Do initialization tasks for HSA test program. /// \param[in] test Test to initialize /// \returns HSA_STATUS_SUCCESS if no errors hsa_status_t InitAndSetupHSA(BaseRocR* test); +/// Find and set the cpu and gpu agent member variables. Also checks that +/// gpu agent meets test requirements (e.g., FULL profile vs. BASE profile). 
+hsa_status_t SetDefaultAgents(BaseRocR* test); + /// For the provided device agent, create an AQL queue /// \param[in] device Device for which a queue is to be created /// \param[out] queue Address to which created queue pointer will be written @@ -84,16 +86,16 @@ hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue, /// be drawn. /// \param[inout] aql Caller provided pointer to aql packet that will be /// populated -/// \returns void -void InitializeAQLPacket(const BaseRocR* test, +/// \returns Appropriate hsa_status_t +hsa_status_t InitializeAQLPacket(const BaseRocR* test, hsa_kernel_dispatch_packet_t* aql); /// This function writes all of the aql packet fields to the queue besides /// "setup" and "header". This assumes all the aql fields have be set /// appropriately. /// \param[in] test Test containing the queue and aql packet to be written. -/// \returns void -void WriteAQLToQueue(BaseRocR* test); +/// \returns Pointer to dispatch packet in queue that was written to +hsa_kernel_dispatch_packet_t* WriteAQLToQueue(BaseRocR* test, uint64_t *ind); /// This function writes the first 32 bits of an aql packet to the provided /// aql packet. This function is meant to be called immediately before @@ -139,6 +141,15 @@ bool CheckProfile(BaseRocR const* test); hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args, size_t arg_size); +/// Verify that the machine running the test has the required profile. +/// This function will verify that the execution machine meets any specific +/// test requirement for a profile (HSA_PROFILE_BASE or HSA_PROFILE_FULL). +/// \param[in] test Test that provides profile requirements. +/// \returns bool +/// - true Machine meets test requirements +/// - false Machine does not meet test requirements +bool CheckProfileAndInform(BaseRocR* test); + /// This function will set the cpu and gpu memory pools to the type used in /// many applications. /// \param[in] test Test that provides profile requirements. 
@@ -146,17 +157,6 @@ hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args, /// error code otherwise. hsa_status_t SetPoolsTypical(BaseRocR* test); -/// Allocate memory from a specified pool and grant both standard BaseRocR -/// agents access -/// \param[in] test Test having the agents to which access is granted -/// \param[in] len Size of the memory buffer to allocate -/// \pool[in] Pool from which to allocate memory -/// \buffer[out] Address of pointer which will point to newly allocated memory -/// upon return -/// \returns HSA_STATUS_OK if no errors -hsa_status_t AllocAndAllowAccess(BaseRocR* test, size_t len, - hsa_amd_memory_pool_t pool, void**buffer); - /// Work-around for hsa_amd_memory_fill, which is currently broken. /// \param[in] ptr Pointer to start of memory location to be filled /// \param[in] value Value to write to each byte of input buffer diff --git a/rocrtst/common/common.cc b/rocrtst/common/common.cc index 0625e0c2b1..1ee4355e49 100755 --- a/rocrtst/common/common.cc +++ b/rocrtst/common/common.cc @@ -341,45 +341,6 @@ hsa_status_t DumpPointerInfo(void* ptr) { return HSA_STATUS_SUCCESS; } -hsa_status_t hsa_memory_fill_workaround_cpu(void* ptr, uint32_t value, - size_t count) { - (void)memset(ptr, value, count); - - return HSA_STATUS_SUCCESS; -} - -hsa_status_t hsa_memory_copy_workaround_cpu(void* dst, const void *src, - size_t size) { - (void)memcpy(dst, src, size); - - return HSA_STATUS_SUCCESS; -} - -hsa_status_t hsa_memory_copy_workaround_gen(void* dst, const void *src, - size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) { - hsa_signal_t s; - hsa_status_t err; - - err = hsa_signal_create(1, 0, NULL, &s); - RET_IF_HSA_COMMON_ERR(err); - - err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s); - RET_IF_HSA_COMMON_ERR(err); - - if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, - UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) { - err = HSA_STATUS_ERROR; - std::cout << "Async copy signal error" << std::endl; - - 
RET_IF_HSA_COMMON_ERR(err); - } - - err = hsa_signal_destroy(s); - - RET_IF_HSA_COMMON_ERR(err); - - return err; -} /*! \brief Writes to the buffer and increments the write pointer to the * buffer. Also, ensures that the argument is written to an diff --git a/rocrtst/common/common.h b/rocrtst/common/common.h index f82aea202f..08a59fa736 100755 --- a/rocrtst/common/common.h +++ b/rocrtst/common/common.h @@ -140,35 +140,5 @@ hsa_status_t DumpMemoryPoolInfo(const hsa_amd_memory_pool_t pool, /// \returns HSA_STATUS_SUCCESS if there are no errors hsa_status_t DumpPointerInfo(void* ptr); -/// This is a work-around for filling cpu-memory to be used until -/// hsa_amd_memory_fill is fixed. Should only be used for cpu memory. -/// \param[in] ptr Start address of memory to be filled. -/// \param[in] value Value to fill buffer with -/// \param[in] count Size of buffer to fill -/// \returns HSA_STATUS_SUCCESS if there are no errors -hsa_status_t hsa_memory_fill_workaround_cpu(void* ptr, uint32_t value, - size_t count); - -/// This is a work-around for copying cpu-memory to be used until -/// hsa_amd_memory_copy is fixed. Should only be used for cpu memory. -/// \param[in] dst Destination address of memory to be copied -/// \param[in] src Source address of memory to be copied -/// \param[in] size Size of buffer to fill -/// \returns HSA_STATUS_SUCCESS if there are no errors -hsa_status_t hsa_memory_copy_workaround_cpu(void* dst, const void *src, - size_t size); - -/// This is a work-around for copying memory to be used until -/// hsa_amd_memory_copy is fixed. Should be used when gpu local memory is -/// involved. 
-/// \param[in] dst Destination address of memory to be copied -/// \param[in] src Source address of memory to be copied -/// \param[in] size Size of buffer to fill -/// \param[in] dst_ag Destination agent handle -/// \param[in] src_ag Source agent handle -/// \returns HSA_STATUS_SUCCESS if there are no errors -hsa_status_t hsa_memory_copy_workaround_gen(void* dst, const void *src, - size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag); - } // namespace rocrtst #endif // ROCRTST_COMMON_COMMON_H_ diff --git a/rocrtst/common/helper_funcs.cc b/rocrtst/common/helper_funcs.cc index e0af455863..6e86f6bbc5 100755 --- a/rocrtst/common/helper_funcs.cc +++ b/rocrtst/common/helper_funcs.cc @@ -52,10 +52,10 @@ #include #include #include +#include namespace rocrtst { - template void PrintArray(const std::string header, const T* data, const int width, const int height) { @@ -191,7 +191,7 @@ AlignUp(void* value, size_t alignment) { alignment)); } -double CalcMedian(std::vector scores) { +double CalcMedian(const std::vector &scores) { double median; size_t size = scores.size(); @@ -204,15 +204,11 @@ double CalcMedian(std::vector scores) { return median; } -double CalcMean(std::vector scores) { - double mean = 0; - size_t size = scores.size(); +double CalcMean(const std::vector &scores) { + double mean; - for (size_t i = 0; i < size; ++i) { - mean += scores[i]; - } - - return mean / size; + mean = std::accumulate(scores.begin(), scores.end(), 0.0); + return mean/scores.size(); } double CalcMean(const std::vector& v1, const std::vector& v2) { diff --git a/rocrtst/common/helper_funcs.h b/rocrtst/common/helper_funcs.h index 06008d77fb..5ed8114e95 100755 --- a/rocrtst/common/helper_funcs.h +++ b/rocrtst/common/helper_funcs.h @@ -60,7 +60,7 @@ bool Compare(const double* refData, const double* data, const int length, const double epsilon = 1e-6); /// Calculate the mean number of the vector -double CalcMean(std::vector scores); +double CalcMean(const std::vector &scores); /// Calculate 
the mean time of difference of the two vectors double CalcMean(const std::vector& v1, const std::vector& v2); @@ -68,7 +68,7 @@ double CalcMean(const std::vector& v1, const std::vector& v2); /// Return the median value of a vector of doubles /// \param[in] scores Vector of doubles /// \returns double Median value of provided vector -double CalcMedian(std::vector scores); +double CalcMedian(const std::vector &scores); /// Calculate the standard deviation of the vector double CalcStdDeviation(std::vector scores, int score_mean); diff --git a/rocrtst/common/hsa_perf_cntrs.cc b/rocrtst/common/hsa_perf_cntrs.cc index d35433de93..fc2a79c66b 100755 --- a/rocrtst/common/hsa_perf_cntrs.cc +++ b/rocrtst/common/hsa_perf_cntrs.cc @@ -70,6 +70,7 @@ PreDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) { dispParam->aql_translation_handle, true); assert((status == HSA_STATUS_SUCCESS) && "Error in beginning Perf Cntr Session"); + (void)status; // Avoid warning } static void @@ -82,6 +83,7 @@ PostDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) { dispParam->aql_translation_handle); assert((status == HSA_STATUS_SUCCESS) && "Error in endning Perf Cntr Session"); + (void)status; // Avoid warning } /// Constructor of the class @@ -192,6 +194,8 @@ void RocrPerfCntrApp::RegisterCallbacks(hsa_queue_t* queue) { status = hsa_ext_tools_set_callback_arguments(queue, &perfMgr_, &perfMgr_); assert((status == HSA_STATUS_SUCCESS) && "Error in registering Pre & Post Dispatch Callback Params"); + + (void)status; // Avoid warning return; } diff --git a/rocrtst/common/hsatimer.cc b/rocrtst/common/hsatimer.cc index 1e7eef2092..2610ecdd06 100755 --- a/rocrtst/common/hsatimer.cc +++ b/rocrtst/common/hsatimer.cc @@ -176,8 +176,7 @@ uint64_t PerfTimer::MeasureTSCFreqHz() { do { tscTicksEnd = __rdtscp(&unused); - } - while (tscTicksEnd - tscTicksBegin < 1000000000); + } while (tscTicksEnd - tscTicksBegin < 1000000000); uint64_t coarseEndUs = 
CoarseTimestampUs(); diff --git a/rocrtst/common/hsatimer.h b/rocrtst/common/hsatimer.h index 72b7ba190f..8d12b768eb 100755 --- a/rocrtst/common/hsatimer.h +++ b/rocrtst/common/hsatimer.h @@ -91,6 +91,7 @@ class PerfTimer { void ResetTimer(int index); /// Read the time value of the timer associated with the provided index. + /// Units are seconds /// \param[in] index Index of the timer to read /// \returns double Value of the timer double ReadTimer(int index); diff --git a/rocrtst/samples/CMakeLists.txt b/rocrtst/samples/CMakeLists.txt index bd47600822..a118c68ad5 100755 --- a/rocrtst/samples/CMakeLists.txt +++ b/rocrtst/samples/CMakeLists.txt @@ -254,6 +254,11 @@ set(BITCODE_LIBS "${BITCODE_LIBS} ${BITCODE_PREF}/ocml.amdgcn.bc") set(CL_FILE_LIST "${PROJECT_SOURCE_DIR}/binary_search/binary_search_kernels.cl") process_sample("binary_search") +# P2P Memory Access +set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}") +set(CL_FILE_LIST "${PROJECT_SOURCE_DIR}/p2p_mem_access/p2p_mem_access_kernels.cl") +process_sample("p2p_mem_access") + # RocR Info aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/rocrinfo ROCR_INFO_SOURCES) add_executable(rocrinfo ${ROCR_INFO_SOURCES}) diff --git a/rocrtst/suites/performance/CMakeLists.txt b/rocrtst/suites/performance/CMakeLists.txt index 21629ff95f..129ca7c519 100755 --- a/rocrtst/suites/performance/CMakeLists.txt +++ b/rocrtst/suites/performance/CMakeLists.txt @@ -25,10 +25,6 @@ cmake_minimum_required(VERSION 2.8.0) # 4) Set env. variable TARGET_DEVICE to indicate gpu type (e.g., gfx803, # gfx900, ...) # -# 5) Set env. variables AMDHSAFIN_DIR and and AMDHSAFIN_TARGET to the -# directory containing the amd finalizer executable and version -# (e.g, 8:0:3) respectively. -# # Building rocrtst Suite # # 1) Create build folder e.g. 
"rocrtst/build" - any name will do @@ -91,6 +87,32 @@ else() endif() endif() +if (DEFINED ENV{OPENCL_DIR}) + set(CLANG $ENV{OPENCL_DIR}/bin/x86_64/clang) + set(OPENCL_DIR $ENV{OPENCL_DIR}) + if (NOT EXISTS ${CLANG}) + message("ERROR: path to clang (${CLANG}) is not valid. Is env. variable OPENCL_DIR correct?") + return() + endif() + + if (DEFINED ENV{OPENCL_VER}) + set(OPENCL_VER $ENV{OPENCL_VER}) + else() + message("OPENCL_VER environment variable is not set. Using default") + set(OPENCL_VER "2.0") + endif() +else() + message("WARNING: OPENCL_DIR environment variable is not set. Kernels will not be built.") +endif() + +if (DEFINED ENV{TARGET_DEVICE}) + set(TARGET_DEVICE $ENV{TARGET_DEVICE}) +else() + message("ERROR: TARGET_DEVICE environment variable is not defined.") + message("Please define a valid clang target (e.g., gfx803, gfx900,...).") + return() +endif() + # # Set Name for rocrtst Suite Project # @@ -105,17 +127,22 @@ project (${ROCRTST_SUITE_NAME}) # Build Type: Debug Vs Release, 32 Vs 64 # Compiler Version, etc # -MESSAGE("") -MESSAGE("-------------IS64BIT: " ${IS64BIT}) -MESSAGE("-----------BuildType: " ${BUILD_TYPE}) -MESSAGE("------------Compiler: " ${CMAKE_CXX_COMPILER}) -MESSAGE("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION}) -MESSAGE("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR}) -MESSAGE("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR}) -MESSAGE("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) -MESSAGE("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) -MESSAGE("") +message("") +message("Build Configuration:") +message("-------------IS64BIT: " ${IS64BIT}) +message("-----------BuildType: " ${BUILD_TYPE}) +message("------------Compiler: " ${CMAKE_CXX_COMPILER}) +message("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION}) +message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR}) +message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR}) +message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) +message("--------Proj Exe Dir: " 
${PROJECT_BINARY_DIR}/bin) +message("-------Target Device: " ${TARGET_DEVICE}) +message("----------Clang path: " ${CLANG}) +message("-------OpenCL version " ${OPENCL_VER}) +message("") +set(KERNELS_DIR ${PROJECT_SOURCE_DIR}/kernels) # # Set the build type based on user input # @@ -148,7 +175,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic") +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic") # @@ -164,7 +191,7 @@ endif() # Add compiler flags to include symbol information for debug builds # if(ISDEBUG) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0") endif() MESSAGE("ISDEBUG STEP:Done") @@ -201,10 +228,11 @@ MESSAGE(${ROCRTST_LIBS}) set(ROCRTST "rocrtst${ONLY64STR}") # -# Sorce files for building rocrtst +# Source files for building rocrtst # aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} performanceSources) - +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/test_common testCommonSources) +aux_source_directory(${ROCRTST_ROOT}/suites/test_common testCommonSources) # Header file include path @@ -212,11 +240,51 @@ include_directories(${ROCR_INC_DIR}) include_directories(${ROCRTST_ROOT}) include_directories(${ROCRTST_ROOT}/gtest/include) -# Build rules +# Use this function to build any samples that have kernels to be built +function(build_kernel S_NAME) + set(SNAME_KERNEL "${S_NAME}_kernels.hsaco") + set(TARG_NAME "${S_NAME}_hsaco") + set(HSACO_TARG_LIST ${HSACO_TARG_LIST} ${TARG_NAME} PARENT_SCOPE) + separate_arguments(CLANG_ARG_LIST UNIX_COMMAND "-target amdgcn-amdh-amdhsa -mcpu=${TARGET_DEVICE} -include ${OPENCL_DIR}/include/opencl-c.h ${BITCODE_LIBS} -cl-std=CL${OPENCL_VER} ${CL_FILE_LIST} -o ${PROJECT_BINARY_DIR}/${SNAME_KERNEL}") + add_custom_target(${TARG_NAME} 
${CLANG} ${CLANG_ARG_LIST} + COMMENT "BUILDING KERNEL..." + VERBATIM) +endfunction(build_kernel) -add_executable(${ROCRTST} ${performanceSources} ${common_srcs}) +###################### +# Kernel Build Section +###################### +set(KERN_SUFFIX "kernels.hsaco") +set(BITCODE_PREF "-Xclang -mlink-bitcode-file -Xclang") +set(BITCODE_PREF "${BITCODE_PREF} ${OPENCL_DIR}/lib/x86_64/bitcode") + +set(COMMON_BITCODE_LIBS "${BITCODE_PREF}/opencl.amdgcn.bc") +set(COMMON_BITCODE_LIBS "${COMMON_BITCODE_LIBS} ${BITCODE_PREF}/ockl.amdgcn.bc") + +# To build kernels, repeat the pattern used below for the P2P kernel; this +# pattern sets the bitcode libraries required by the kernel which will be +# used in the build_kernel() call, which builds the kernel. + +# Test Case Template example +set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}") +set(CL_FILE_LIST "${KERNELS_DIR}/test_case_template_kernels.cl") +build_kernel("test_case_template") + +# P2P Memory Access +#set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}") +#set(CL_FILE_LIST "${KERNELS_DIR}/p2p_mem_access_kernels.cl") +#build_kernel("p2p_mem_access") + +# Dispatch Time +set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}") +set(CL_FILE_LIST "${KERNELS_DIR}/dispatch_time_kernels.cl") +build_kernel("dispatch_time") + +# Build rules +add_executable(${ROCRTST} ${performanceSources} ${common_srcs} ${testCommonSources}) target_link_libraries(${ROCRTST} ${ROCRTST_LIBS} c stdc++ dl pthread rt) +add_custom_target(rocrtst_kernels DEPENDS ${HSACO_TARG_LIST}) INSTALL(TARGETS ${ROCRTST} ARCHIVE DESTINATION ${PROJECT_BINARY_DIR}/lib LIBRARY DESTINATION ${PROJECT_BINARY_DIR}/lib diff --git a/rocrtst/suites/performance/cp_process_time.cc b/rocrtst/suites/performance/cp_process_time.cc deleted file mode 100755 index a393c617d1..0000000000 --- a/rocrtst/suites/performance/cp_process_time.cc +++ /dev/null @@ -1,258 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * 
============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "cp_process_time.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/helper_funcs.h" -#include "common/hsatimer.h" -#include "common/os.h" -#include "gtest/gtest.h" -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" -#include "hsa/hsa_ext_finalize.h" -#include - -static const uint64_t kKernelIterations = 10000; -static const uint64_t kTestBadValue = 1234567891234567891; -//Set up some expectations for reasonable processing times -//For gfx803, Overhead time had a max of 18.208uS and a min of 7.82uS -static const double kGfx803MinOverhead = 7.78; -static const double kGfx803MaxOverhead = 21.064; -static const double kOverheadToleranceFactor = 0.25; - -CpProcessTime::CpProcessTime() : - BaseRocR() { - // kernel_name_ = "&__simple_kernel"; - mean_ = 0.0; -} - -CpProcessTime::~CpProcessTime() { -} - -void CpProcessTime::SetUp() { - hsa_status_t err; - set_kernel_file_name("simple_kernel.o"); - set_kernel_name("&__simple_kernel"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - hsa_agent_t* gpu_dev = gpu_device1(); - - // Create a queue - hsa_queue_t* q = nullptr; - rocrtst::CreateQueue(*gpu_dev, &q); - ASSERT_NE(q, nullptr); - set_main_queue(q); - - // Set profiling - err = hsa_amd_profiling_set_profiler_enabled(q, 1); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Load and finalize the kernel - err = rocrtst::LoadKernelFromObjFile(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - rocrtst::InitializeAQLPacket(this, &aql()); - aql().workgroup_size_x = 1; - aql().grid_size_x = 1; -} - -size_t CpProcessTime::RealIterationNum() { - return num_iteration() * 1.2 + 1; -} - -void CpProcessTime::Run() { - hsa_status_t err; - std::vector timer; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - hsa_agent_t* cpu_dev = cpu_device(); - - ASSERT_NE(gpu_dev, nullptr); - ASSERT_NE(cpu_dev, nullptr); - uint32_t it = RealIterationNum(); - - typedef struct 
args_t { - uint64_t* iteration; - uint64_t* result; - } args; - - err = rocrtst::SetPoolsTypical(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - uint64_t* iter = NULL; - uint64_t* result = NULL; - err = rocrtst::AllocAndAllowAccess(this, sizeof(uint64_t), cpu_pool(), - (void**)&iter); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = rocrtst::AllocAndAllowAccess(this, sizeof(uint64_t), cpu_pool(), - (void**)&result); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - *iter = kKernelIterations; - *result = kTestBadValue; - - args k_args; - - k_args.iteration = (uint64_t*)iter; - k_args.result = (uint64_t*)result; - - err = rocrtst::AllocAndSetKernArgs(this, &k_args, sizeof(args)); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - rocrtst::WriteAQLToQueue(this); - - void * q_base_addr = main_queue()->base_address; - const uint32_t queue_mask = main_queue()->size - 1; - uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH; -// aql_header |= HSA_FENCE_SCOPE_SYSTEM << -// HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; -// aql_header |= HSA_FENCE_SCOPE_SYSTEM << -// HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; - - for (uint32_t i = 0; i < it; i++) { - // uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue()); - uint64_t que_idx = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - //Get timing stamp an ring the doorbell to dispatch the kernel. 
- rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - - rocrtst::AtomicSetPacketHeader(aql_header, aql().setup, - &((hsa_kernel_dispatch_packet_t*)(q_base_addr))[que_idx & queue_mask]); - - hsa_queue_store_write_index_relaxed(main_queue(), (que_idx + 1)); - hsa_signal_store_relaxed(main_queue()->doorbell_signal, que_idx); - - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; -// hsa_signal_value_t value = hsa_signal_wait_scacquire(signal(), -// HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); - // value should be 0, or we timed-out - //ASSERT_EQ(value, 0); - - p_timer.StopTimer(id); - - hsa_amd_profiling_dispatch_time_t dispatch_time; - err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(), - &dispatch_time); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - uint64_t ticks = dispatch_time.end - dispatch_time.start; - uint64_t freq; - - err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - hsa_signal_store_screlease(signal(), 1); - - double execution_time = (double) ticks / freq * 1e6; //convert to us - double temp = p_timer.ReadTimer(id) * 1e6; - double cp_time = temp - execution_time; - -#ifdef DEBUG - std::cout << "Total:" << temp << "uS "; - std::cout << "Execution:" << execution_time << "uS "; - std::cout << "Overhead:" << cp_time << "uS "; - std::cout << "Overhead %:" << cp_time / execution_time * 100 << std::endl; -#endif - - EXPECT_EQ(kKernelIterations, *result); - timer.push_back(cp_time); - - //Assume overhead will not deviate too much from previously recorded - // values. 
If this does happen and there is not a performance bug, - // modify these constants - - //This may need to be made specific to the gpu being used - EXPECT_GT(cp_time, kGfx803MinOverhead * (1 - kOverheadToleranceFactor)); - EXPECT_LT(cp_time, kGfx803MaxOverhead * (1 + kOverheadToleranceFactor)); - - *result = 0; - } - - //Abandon the first result and after sort, delete the last 2% value - timer.erase(timer.begin()); - std::sort(timer.begin(), timer.end()); - - timer.erase(timer.begin() + num_iteration(), timer.end()); - mean_ = rocrtst::CalcMean(timer); - - return; -} - -void CpProcessTime::DisplayResults() const { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - if (mean_ == 0.0) { - return; - } - - std::cout << "====================================================" - << std::endl; - std::cout << "The average Command Processor processing time is: " << mean_ - << "us" << std::endl; - std::cout << "====================================================" - << std::endl; - return; -} - -void CpProcessTime::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} diff --git a/rocrtst/suites/performance/cp_process_time.h b/rocrtst/suites/performance/cp_process_time.h deleted file mode 100755 index 6abec7d9bb..0000000000 --- a/rocrtst/suites/performance/cp_process_time.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#ifndef __ROCRTST_SRC_CP_PROCESS_TIME_H__ -#define __ROCRTST_SRC_CP_PROCESS_TIME_H__ -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "common/common.h" -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" -#include - -//@Brief: This class is defined to measure the mean latency of launching -//an empty kernel - -class CpProcessTime: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - CpProcessTime(); - - //@Brief: Destructor - virtual ~CpProcessTime(); - - //@Brief: Set up the environment for the test - virtual void SetUp(); - - //@Brief: Run the test case - virtual void Run(); - - //@Brief: Display results we got - virtual void DisplayResults() const; - - //@Brief: Clean up and close the runtime - virtual void Close(); - - private: - //@Brief: Get actual iteration number - virtual size_t RealIterationNum(); - - //@Brief: Store the size of queue - uint32_t queue_size_; - - //@Brief: The mean time of CP Processing - double mean_; - -}; - -#endif - diff --git a/rocrtst/suites/performance/cu_masking.cc b/rocrtst/suites/performance/cu_masking.cc deleted file mode 100644 index 29f11f377a..0000000000 --- a/rocrtst/suites/performance/cu_masking.cc +++ /dev/null @@ -1,220 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "cu_masking.h" -#include "common/base_rocr_utils.h" -#include "gtest/gtest.h" - -CuMasking::CuMasking() : - BaseRocR() { - memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t)); - mean_ = 0.0; - group_region_.handle = 0; - cu_ = NULL; -} - -CuMasking::~CuMasking() { -} - -void CuMasking::SetUp() { - hsa_status_t err; - - hsa_agent_t* gpu_dev = gpu_device1(); - hsa_agent_t* cpu_dev = cpu_device(); - - set_kernel_file_name("cu_masking.o"); - set_kernel_name("&main"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - // Create a queue - hsa_queue_t* q = nullptr; - rocrtst::CreateQueue(*gpu_dev, &q); - set_main_queue(q); - - rocrtst::LoadKernelFromObjFile(this); - - // Fill up the kernel packet except header - // aql().completion_signal=signal(); - // TODO: Will delete manual_input later - uint32_t cu_count = 0; - err = hsa_agent_get_info(*gpu_dev, - (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - std::cout << "CU# is: " << cu_count << std::endl; - - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - rocrtst::InitializeAQLPacket(this, &aql()); - aql().workgroup_size_x = 1024; - - //manual_input * group_input; // workgroup_max_size; - aql().grid_size_x = (long long) 1024 * 640 * 640; - - // TODO:Manully set the max cu number to 8, the api return 10 - std::cout << "Grid size is: " << aql().grid_size_x << std::endl; - - err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, - rocrtst::FindGlobalPool, &cpu_pool()); - ASSERT_EQ(err, HSA_STATUS_INFO_BREAK); -} - -size_t CuMasking::RealIterationNum() { - return num_iteration() * 1.2 + 1; -} - -void CuMasking::Run() { - hsa_status_t err; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - std::vector timer; - - typedef struct args_t { - uint32_t* iteration; - uint32_t* result; - } local_args; - - uint32_t* iter = NULL; - uint32_t* result = NULL; - err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(uint32_t), 0, - 
(void**) &iter); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(uint32_t), 0, - (void**) &result); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - *iter = 0xff; - *result = 0; - - err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, iter); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, result); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - local_args* kernarg = NULL; - err = hsa_amd_memory_pool_allocate(cpu_pool(), kernarg_size(), 0, - (void**) &kernarg); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, kernarg); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - kernarg->iteration = iter; - kernarg->result = result; - - aql().kernarg_address = kernarg; - - // Obtain the current queue write inex. - uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - // Write the aql packet at the calculate queue index address. - const uint32_t queue_mask = main_queue()->size - 1; - - // Set CU mask - uint32_t cu_mask = 0; -#if 0 - std::cout << "Enter cu mask value:" << std::endl; - ASSERT_NE(scanf("%d", &cu_mask), EOF); -#else - cu_mask = 0xAAAAAAAA; -#endif - - std::cout << "Value of bit array is: 0x" << std::hex << cu_mask << std::endl; - err = hsa_amd_queue_cu_set_mask(main_queue(), 32, &cu_mask); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - void *q_base_addr = main_queue()->base_address; - // Write the aql packet at the calculate queue index address. - aql().completion_signal = signal(); - ((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask] = aql(); - - // Get timing stamp an ring the doorbell to dispatch the kernel. 
- rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |= - HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - hsa_signal_store_screlease(main_queue()->doorbell_signal, index); - - // Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - p_timer.StopTimer(id); - - hsa_signal_store_screlease(signal(), 1); - - double t1 = p_timer.ReadTimer(id) * 1e6; - std::cout << "Execution time after setting cu masking: " << t1 << std::endl; - - return; -} - -void CuMasking::DisplayResults() const { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - std::cout << "====================================================" - << std::endl; - - std::cout << "=====================================================" - << std::endl; - return; -} - -void CuMasking::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} diff --git a/rocrtst/suites/performance/cu_masking.h b/rocrtst/suites/performance/cu_masking.h deleted file mode 100755 index e6826d9572..0000000000 --- a/rocrtst/suites/performance/cu_masking.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#ifndef __ROCRTST_SRC_CU_MASKING_TIME_H__ -#define __ROCRTST_SRC_CU_MASKING_TIME_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "common/common.h" -#include "common/hsatimer.h" -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" -#include "hsa/hsa_ext_finalize.h" -#include -#include - -//@Brief: This class is defined to measure the mean latency of launching -//an empty kernel - -class CuMasking: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - CuMasking(); - - //@Brief: Destructor - virtual ~CuMasking(); - - //@Brief: Set up the environment for the test - virtual void SetUp(); - - //@Brief: Run the test case - virtual void Run(); - - //@Brief: Display results we got - virtual void DisplayResults() const; - - //@Brief: Clean up and close the runtime - virtual void Close(); - - private: - //@Brief: Get actual iteration number - virtual size_t RealIterationNum(); - - //@Brief: Store the size of queue - uint32_t queue_size_; - - //@Brief: The mean time of CP Processing - double mean_; - - //@Brief: The group memory region - hsa_region_t group_region_; - - //@Brief: Pointer to cu_id array - uint32_t* cu_; - - uint32_t manual_input; - uint32_t group_input; -}; - -#endif - diff --git a/rocrtst/suites/performance/device_load_bandwidth.cc b/rocrtst/suites/performance/device_load_bandwidth.cc deleted file mode 100755 index 5cfcf829b8..0000000000 --- a/rocrtst/suites/performance/device_load_bandwidth.cc +++ /dev/null @@ -1,293 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "device_load_bandwidth.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/helper_funcs.h" -#include "common/hsatimer.h" -#include "common/os.h" -#include "gtest/gtest.h" -#include - -// TODO: The validation code has problems to debug -#if 0 -static void initGlobalReadBuffer(uint32_t* in_data, uint32_t num_thrds, - uint32_t num_ops, uint32_t num_loops) { - - // Populate input buffer with thread Id left shifted by 2. - uint32_t value = 0; - uint32_t val_idx; - - for (uint32_t idx1 = 0; idx1 < num_loops; idx1++) { - val_idx = 0; - for (uint32_t idx2 = 0; idx2 < num_ops; idx2++) { - // Write the value to be read by each thread - for (uint32_t idx3 = 0; idx3 < num_thrds; idx3++) { - value = idx3 << 2; - in_data[val_idx++] = value; - } - } - } - - return; -} - -static bool verifyGlobalLoadKernel(uint32_t* data, uint32_t num_thrds, - uint32_t scale, const char* kernel_name) { - - // Verify kernel operation i.e. validate the data in the output buffer. - uint32_t valid_value = 0; - - for (uint32_t idx = 0; idx < num_thrds; idx++) { - - valid_value = (idx << 2) * scale; - - - if (data[idx] != valid_value) { - std::cout << "Value expected = " << valid_value << std::endl; - std::cout << "Value of data = " << data[idx] << std::endl; - - std::cout << kernel_name << ": VALIDATION FAILED ! 
Bad index: " << idx - << std::endl; - std::cout << kernel_name << ": VALUE @ Bad index: " << data[idx] - << std::endl; - std::cout << std::endl; - return false; - } - } - -#ifdef DEBUG - std::cout << kernel_name << ": Passed validation" << std::endl; - std::cout << std::endl; -#endif - - return true; -} -#endif - -// Constructor -DeviceLoadBandwidth::DeviceLoadBandwidth() : - BaseRocR() { - - set_group_size(0); - set_enable_interrupt(false); - - num_group_ = 0; - num_cus_ = 0; - - kernel_loop_count_ = 0; - mean_ = 0.0; - data_size_ = 0; - - set_requires_profile (HSA_PROFILE_BASE); -} - -// Destructor -DeviceLoadBandwidth::~DeviceLoadBandwidth() { -} - -// Set up the test environment -void DeviceLoadBandwidth::SetUp() { - SetWorkItemNum(); - - set_kernel_file_name("sysMemRead.o"); - set_kernel_name("&__SysMemLoad"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - - //Create a queue with max number size - hsa_queue_t* q = nullptr; - rocrtst::CreateQueue(*gpu_dev, &q); - ASSERT_NE(q, nullptr); - set_main_queue(q); - - rocrtst::LoadKernelFromObjFile(this); - - uint32_t total_work_items = num_cus_ * num_group_ * group_size(); - - //Fill up part of aql - rocrtst::InitializeAQLPacket(this, &aql()); - aql().workgroup_size_x = group_size(); - aql().grid_size_x = total_work_items; - - return; -} - -// Run the test -void DeviceLoadBandwidth::Run() { - hsa_status_t err; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - uint32_t total_workitems = num_cus_ * num_group_ * group_size(); - - uint32_t ops_thrd = 32; - uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint64_t); - uint64_t total_ops = (uint64_t) total_workitems * ops_thrd; - uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint64_t); - - data_size_ = in_data_size; - - err = rocrtst::SetPoolsTypical(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = rocrtst::AllocAndAllowAccess(this, in_data_size, device_pool(), - 
(void**)&in_data_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - //uint32_t out_data_size = total_workitems * sizeof(uint64_t); - uint32_t out_data_size = in_data_size; - - err = rocrtst::AllocAndAllowAccess(this, out_data_size, device_pool(), - (void**)&out_data_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - -#if 0 - initGlobalReadBuffer(in_data_, total_workitems, ops_thrd, kernel_loop_count_); -#endif - - struct local_args_t { - void* arg0; - void* arg1; - uint64_t arg2; - void* arg3; - } local_args; - - local_args.arg0 = in_data_; - local_args.arg1 = in_data_ + total_ops; - local_args.arg2 = addr_step; - local_args.arg3 = out_data_; - - // Copy the kernel args structure into a registered memory block - err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args)); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - std::vector time; - - rocrtst::WriteAQLToQueue(this); - // Write the aql packet at the calculated queue index address. - const uint32_t queue_mask = main_queue()->size - 1; - void * q_base = main_queue()->base_address; - - for (uint32_t i = 0; i < num_iteration(); i++) { - uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue()); - - rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - - uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH; - rocrtst::AtomicSetPacketHeader(aql_header, aql().setup, - &((hsa_kernel_dispatch_packet_t*)(q_base))[que_idx & queue_mask]); - hsa_signal_store_screlease(main_queue()->doorbell_signal, que_idx); - - // Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - p_timer.StopTimer(id); - -#ifdef DEBUG - std::cout << "." 
<< std::flush; -#endif - -#if 0 - // Verify the results - uint32_t scale = kernel_loop_count_ * ops_thrd; - verifyGlobalLoadKernel(out_data_, total_workitems, scale, - kernel_name().c_str()); -#endif - time.push_back(p_timer.ReadTimer(id)); - - hsa_signal_store_screlease(signal(), 1); - } - -#ifdef DEBUG - std::cout << std::endl; -#endif - - time.erase(time.begin()); - std::sort(time.begin(), time.end()); - time.erase(time.begin() + num_iteration(), time.end()); - mean_ = rocrtst::CalcMean(time); - - return; -} - -void DeviceLoadBandwidth::Close() { - hsa_status_t err; - - err = hsa_amd_memory_pool_free(in_data_); - EXPECT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_memory_pool_free(out_data_); - EXPECT_EQ(err, HSA_STATUS_SUCCESS); - - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - return; -} - -void DeviceLoadBandwidth::DisplayResults() const { - if (!rocrtst::CheckProfile(this)) { - return; - } - - std::cout << "=======================================" << std::endl; - std::cout << "Device Load Bandwidth: "; - std::cout << data_size_ / mean_ / 1024 / 1024 / 1024 << "(GB/S)" << std::endl; - std::cout << "=======================================" << std::endl; - - return; -} diff --git a/rocrtst/suites/performance/device_store_bandwidth.cc b/rocrtst/suites/performance/device_store_bandwidth.cc deleted file mode 100755 index d2d51075d2..0000000000 --- a/rocrtst/suites/performance/device_store_bandwidth.cc +++ /dev/null @@ -1,219 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "device_store_bandwidth.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/helper_funcs.h" -#include "common/hsatimer.h" -#include "gtest/gtest.h" - -// Constructor -DeviceStoreBandwidth::DeviceStoreBandwidth() : - BaseRocR() { - - set_group_size(0); - num_group_ = 0; - num_cus_ = 0; - - kernel_loop_count_ = 0; - mean_ = 0.0; - data_size_ = 0; - set_requires_profile (HSA_PROFILE_BASE); - in_data_ = nullptr; - out_data_ = nullptr; -} - -// Destructor -DeviceStoreBandwidth::~DeviceStoreBandwidth() { -} - -// Set up the test environment -void DeviceStoreBandwidth::SetUp() { - SetWorkItemNum(); - - set_kernel_file_name("sysMemWrite.o"); - set_kernel_name("&__SysMemStore"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - - //Create a queue with max number size - hsa_queue_t* q = nullptr; - rocrtst::CreateQueue(*gpu_dev, &q); - ASSERT_NE(q, nullptr); - set_main_queue(q); - - rocrtst::LoadKernelFromObjFile(this); - - uint32_t total_work_items = num_cus_ * num_group_ * group_size(); - - //Fill up part of aql - rocrtst::InitializeAQLPacket(this, &aql()); - aql().workgroup_size_x = group_size(); - aql().grid_size_x = total_work_items; - - return; -} - -// Run the test -void DeviceStoreBandwidth::Run() { - hsa_status_t err; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - uint32_t total_workitems = num_cus_ * num_group_ * group_size(); - - uint32_t ops_thrd = 16; - uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t); - uint64_t total_ops = (uint64_t) total_workitems * kernel_loop_count_ - * ops_thrd; - uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t); - - data_size_ = in_data_size; - - err = rocrtst::SetPoolsTypical(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = rocrtst::AllocAndAllowAccess(this, in_data_size, device_pool(), - (void**)&in_data_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - uint32_t 
out_data_size = total_workitems * sizeof(uint32_t); - - err = rocrtst::AllocAndAllowAccess(this, out_data_size, device_pool(), - (void**)&out_data_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - struct local_args_t { - void* arg0; - void* arg1; - uint64_t arg2; - void* arg3; - } local_args; - - local_args.arg0 = in_data_; - local_args.arg1 = in_data_ + total_ops; - local_args.arg2 = addr_step; - local_args.arg3 = out_data_; - - // Copy the kernel args structure into a registered memory block - err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args)); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - std::vector time; - - rocrtst::WriteAQLToQueue(this); - - for (uint32_t i = 0; i < num_iteration(); i++) { - uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue()); - - // Write the aql packet at the calculated queue index address. - const uint32_t queue_mask = main_queue()->size - 1; - - rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - - void * q_base = main_queue()->base_address; - uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH; - rocrtst::AtomicSetPacketHeader(aql_header, aql().setup, - &((hsa_kernel_dispatch_packet_t*)(q_base))[que_idx & queue_mask]); - hsa_signal_store_screlease(main_queue()->doorbell_signal, que_idx); - - // Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - p_timer.StopTimer(id); - -#ifdef DEBUG - std::cout << "." 
<< std::flush; -#endif - - time.push_back(p_timer.ReadTimer(id)); - - hsa_signal_store_screlease(signal(), 1); - } - -#ifdef DEBUG - std::cout << std::endl; -#endif - - time.erase(time.begin()); - mean_ = rocrtst::CalcMean(time); - - return; -} - -void DeviceStoreBandwidth::Close() { - hsa_status_t err; - - err = hsa_amd_memory_pool_free(in_data_); - EXPECT_EQ(err, HSA_STATUS_SUCCESS); - err = hsa_amd_memory_pool_free(out_data_); - EXPECT_EQ(err, HSA_STATUS_SUCCESS); - - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - return; -} - -void DeviceStoreBandwidth::DisplayResults() const { - if (!rocrtst::CheckProfile(this)) { - return; - } - std::cout << "=======================================" << std::endl; - std::cout << "Device Store Bandwidth: "; - std::cout << data_size_ / mean_ / 1024 / 1024 / 1024 << "(GB/S)" << std::endl; - std::cout << "=======================================" << std::endl; - return; -} diff --git a/rocrtst/suites/performance/device_store_bandwidth.h b/rocrtst/suites/performance/device_store_bandwidth.h deleted file mode 100755 index 4aa032751e..0000000000 --- a/rocrtst/suites/performance/device_store_bandwidth.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#ifndef __ROCRTST_SRC_INC_DEVICE_STORE_BANDWIDTH_H__ -#define __ROCRTST_SRC_INC_DEVICE_STORE_BANDWIDTH_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "hsa/hsa.h" -#include - -class DeviceStoreBandwidth: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - DeviceStoreBandwidth(); - - //@Brief: Destructor - ~DeviceStoreBandwidth(); - - //@Brief: Set up the testing environment - virtual void SetUp(); - - //@Brief: Run the test case - virtual void Run(); - - //@Brief: Close and clean up the test enrionment - virtual void Close(); - - //@Brief: Display load bandwidth - virtual void DisplayResults() const; - - //@Brief: Set work-item configuration - void SetWorkItemNum() { -#ifdef INTERACTIVE - uint32_t tmp; - printf("Please input the number of CUs you want to try:\n"); - scanf("%d", &num_cus_); - - printf("Please input the number of groups you want to try:\n"); - scanf("%d", &num_group_); - - printf("Please input the size of each group:\n"); - scanf("%d", &tmp); - set_group_size(tmp); - - printf("Please input the number of kernel loop you want to try:\n"); - scanf("%d", &kernel_loop_count_); -#else - num_cus_ = 32; - num_group_ = 128; - set_group_size(64); - kernel_loop_count_ = 16; -#endif - return; - } - - private: - //@Brief: number of group - uint32_t num_group_; - - //@Brief: number of CUs - uint32_t num_cus_; - - //@Brief: number of kernel loop - uint32_t kernel_loop_count_; - - //@Brief: Mean execution time - double mean_; - - //@Brief: data size for test - uint64_t data_size_; - uint32_t* in_data_; - uint32_t* out_data_; -}; - -#endif - diff --git a/rocrtst/suites/performance/dispatch_time.cc b/rocrtst/suites/performance/dispatch_time.cc index 3b4a9262b4..400c314906 100755 --- a/rocrtst/suites/performance/dispatch_time.cc +++ b/rocrtst/suites/performance/dispatch_time.cc @@ -43,7 +43,10 @@ * */ -#include "dispatch_time.h" +#include +#include + +#include 
"suites/performance/dispatch_time.h" #include "common/base_rocr_utils.h" #include "common/common.h" #include "common/os.h" @@ -52,40 +55,68 @@ #include "gtest/gtest.h" #include "hsa/hsa.h" #include "hsa/hsa_ext_finalize.h" -#include -DispatchTime::DispatchTime() : - BaseRocR() { - use_default_ = false; - launch_single_ = false; +DispatchTime:: +DispatchTime(bool defaultInterrupt, bool launchSingleKernel) : TestBase(), + use_default_interupt_(defaultInterrupt), + launch_single_(launchSingleKernel) { queue_size_ = 0; num_batch_ = 100000; memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t)); - single_default_mean_ = 0.0; - single_interrupt_mean_ = 0.0; - multi_default_mean_ = 0.0; - multi_interrupt_mean_ = 0.0; + dispatch_time_mean_ = 0.0; + set_num_iteration(100); + + set_kernel_file_name("dispatch_time_kernels.hsaco"); + set_kernel_name("empty_kernel"); + + std::string name; + std::string desc; + + name = "Average Dispatch Time"; + desc = "This test measures the time to handle AQL packets that " + "do no work. Time is measured from when the packet is made available to" + " the Command Processor to when the target agent notifies the host that " + "the packet has been executed. "; + + if (defaultInterrupt) { + name += ", Default Interrupts"; + desc += "Interrupts are controlled by HSA_ENABLE_INTERRUPT environment " + "variable. "; + } else { + name += ", Interrupts Enabled"; + desc += "Interrupts are enabled. 
"; + } + + if (launchSingleKernel) { + name += ", Single Kernel"; + desc += " One kernel at a time is and executed."; + } else { + name += ", Multiple Kernels"; + desc += " Enough kernels to fill the queue are dispatched at one time"; + } + + set_title(name); + set_description(desc); } DispatchTime::~DispatchTime() { - } void DispatchTime::SetUp() { - // If it indicates to use default signal, set env var properly - if (use_default_) { + hsa_status_t err; + + // This need to happen before TestBase::SetUp() + if (use_default_interupt_) { set_enable_interrupt(false); - } - else { + } else { set_enable_interrupt(true); } - set_kernel_file_name("empty_kernel.o"); - set_kernel_name("&__Empty_kernel"); + TestBase::SetUp(); + // If it indicates to use default signal, set env var properly - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } + err = SetDefaultAgents(this); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); hsa_agent_t* gpu_dev = gpu_device1(); @@ -105,24 +136,26 @@ void DispatchTime::SetUp() { num_batch_ = num_batch_ > size ? 
size : num_batch_; } - rocrtst::LoadKernelFromObjFile(this); + err = rocrtst::LoadKernelFromObjFile(this); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); // Fill up the kernel packet except header - rocrtst::InitializeAQLPacket(this, &aql()); + err = rocrtst::InitializeAQLPacket(this, &aql()); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + aql().workgroup_size_x = 1; aql().grid_size_x = 1; } void DispatchTime::Run() { - if (!rocrtst::CheckProfile(this)) { return; } + TestBase::Run(); if (launch_single_) { RunSingle(); - } - else { + } else { RunMulti(); } } @@ -137,59 +170,59 @@ void DispatchTime::RunSingle() { int it = RealIterationNum(); const uint32_t queue_mask = main_queue()->size - 1; - //queue should be empty + // queue should be empty ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()), hsa_queue_load_write_index_scacquire(main_queue())); void *q_base_addr = main_queue()->base_address; for (int i = 0; i < it; i++) { - //Obtain the current queue write index. + // Obtain the current queue write index. uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); ASSERT_LT(index, main_queue()->size + index); - //Write the aql packet at the calculated queue index address. + // Write the aql packet at the calculated queue index address. - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql(); - - //Get timing stamp and ring the doorbell to dispatch the kernel. + reinterpret_cast( + q_base_addr)[index & queue_mask] = aql(); + // Get timing stamp and ring the doorbell to dispatch the kernel. 
rocrtst::PerfTimer p_timer; int id = p_timer.CreateTimer(); p_timer.StartTimer(id); - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |= - HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; + reinterpret_cast( + q_base_addr)[index & queue_mask].header |= + HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; + hsa_signal_store_screlease(main_queue()->doorbell_signal, index); - //Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; + // Wait on the dispatch signal until the kernel is finished. + while (hsa_signal_wait_scacquire(aql().completion_signal, + HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) { + } + p_timer.StopTimer(id); timer.push_back(p_timer.ReadTimer(id)); - hsa_signal_store_screlease(signal(), 1); + hsa_signal_store_screlease(aql().completion_signal, 1); -#ifdef DEBUG - std::cout << "."; - fflush(stdout); -#endif + if (verbosity() >= VERBOSE_PROGRESS) { + std::cout << "."; + fflush(stdout); + } } - std::cout << std::endl; + if (verbosity() >= VERBOSE_PROGRESS) { + std::cout << std::endl; + } - //Abandon the first result and after sort, delete the last 2% value + // Abandon the first result and after sort, delete the last 2% value timer.erase(timer.begin()); std::sort(timer.begin(), timer.end()); timer.erase(timer.begin() + num_iteration(), timer.end()); - if (use_default_) { - single_default_mean_ = rocrtst::CalcMean(timer); - } - else { - single_interrupt_mean_ = rocrtst::CalcMean(timer); - } + dispatch_time_mean_ = rocrtst::CalcMean(timer); return; } @@ -199,72 +232,69 @@ void DispatchTime::RunMulti() { int it = RealIterationNum(); const uint32_t queue_mask = main_queue()->size - 1; - //queue should be empty + // queue should be empty ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()), hsa_queue_load_write_index_scacquire(main_queue())); - for (int i = 0; i < 
it; i++) { - uint64_t* index = (uint64_t*) malloc(sizeof(uint64_t) * num_batch_); + rocrtst::PerfTimer p_timer; - hsa_signal_store_screlease(signal(), num_batch_); + for (int i = 0; i < it; i++) { + uint64_t* index = + reinterpret_cast(malloc(sizeof(uint64_t) * num_batch_)); + + hsa_signal_store_screlease(aql().completion_signal, num_batch_); for (uint32_t j = 0; j < num_batch_; j++) { - //index[j] = hsa_queue_add_write_index_scacq_screl(main_queue(), 1); + // index[j] = hsa_queue_add_write_index_scacq_screl(main_queue(), 1); index[j] = hsa_queue_add_write_index_relaxed(main_queue(), 1); - //Write the aql packet at the calculated queue index address. - ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j] - & queue_mask] = aql(); + // Write the aql packet at the calculated queue index address. + (reinterpret_cast(( + main_queue()->base_address)))[index[j] & queue_mask] = aql(); if (j == num_batch_ - 1) { - ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j] - & queue_mask].header |= 1 << HSA_PACKET_HEADER_BARRIER; - - //TODO: verify if the below is needed. I don't think it is. It should - // already be initialized to signal(). - ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j] - & queue_mask].completion_signal = signal(); + (reinterpret_cast( + main_queue()->base_address))[index[j] & queue_mask].header |= + 1 << HSA_PACKET_HEADER_BARRIER; } } // Set packet header reversly; set all headers except the very first // one, for now. for (uint32_t j = num_batch_ - 1; j > 0; j--) { - - ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j] - & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH - << HSA_PACKET_HEADER_TYPE; + reinterpret_cast( + (main_queue()->base_address))[index[j] & queue_mask].header |= + HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; } - //Get timing stamp and ring the doorbell to dispatch the kernel. 
- rocrtst::PerfTimer p_timer; + // Get timing stamp and ring the doorbell to dispatch the kernel. int id = p_timer.CreateTimer(); p_timer.StartTimer(id); - //Set the very first header... - ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[0] - & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH - << HSA_PACKET_HEADER_TYPE; + // Set the very first header... + (reinterpret_cast( + main_queue()->base_address))[index[0] & queue_mask].header |= + HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; for (uint32_t j = 0; j < num_batch_; j++) { hsa_signal_store_screlease(main_queue()->doorbell_signal, index[j]); } - //Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0, - UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) - ; + // Wait on the dispatch signal until the kernel is finished. + while (hsa_signal_wait_scacquire(aql().completion_signal, + HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) { + } p_timer.StopTimer(id); timer.push_back(p_timer.ReadTimer(id)); - hsa_signal_store_screlease(signal(), 1); + hsa_signal_store_screlease(aql().completion_signal, 1); free(index); -#ifdef DEBUG - std::cout << "."; - fflush(stdout); -#endif + if (verbosity() >= VERBOSE_PROGRESS) { + std::cout << "."; + fflush(stdout); + } } std::cout << std::endl; @@ -275,57 +305,34 @@ void DispatchTime::RunMulti() { timer.erase(timer.begin() + num_iteration(), timer.end()); - if (use_default_) { - multi_default_mean_ = rocrtst::CalcMean(timer); - } - else { - multi_interrupt_mean_ = rocrtst::CalcMean(timer); - } + dispatch_time_mean_ = rocrtst::CalcMean(timer); return; } -void DispatchTime::DisplayResults() const { +void DispatchTime::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} +void DispatchTime::DisplayResults(void) const { if (!rocrtst::CheckProfile(this)) { return; } - std::cout << "====================================================" - << std::endl; + 
TestBase::DisplayResults(); - if (use_default_) { - if (launch_single_) { - std::cout << "Single_Default: " << single_default_mean_ * 1e6 - << std::endl; - } - else { - std::cout << "Multi_Default: " - << multi_default_mean_ * 1e6 / num_batch_ << std::endl; - } - } - else { - if (launch_single_) { - std::cout << "Single_Interrupt: " << single_interrupt_mean_ * 1e6 - << std::endl; - } - else { - std::cout << "Multi_Interrupt: " - << multi_interrupt_mean_ * 1e6 / num_batch_ << std::endl; - } + std::cout << "Average Time to Completion: "; + if (launch_single_) { + std::cout << dispatch_time_mean_ * 1e6; + } else { + std::cout << dispatch_time_mean_ * 1e6 / num_batch_; } - std::cout << "=====================================================" - << std::endl; - + std::cout << " uS" << std::endl; return; } void DispatchTime::Close() { - hsa_status_t err; - - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - + TestBase::Close(); return; } diff --git a/rocrtst/suites/performance/dispatch_time.h b/rocrtst/suites/performance/dispatch_time.h index 559cd5733f..7df879ed9a 100755 --- a/rocrtst/suites/performance/dispatch_time.h +++ b/rocrtst/suites/performance/dispatch_time.h @@ -43,83 +43,68 @@ * */ -#ifndef __ROCRTST_SRC_DISPATCH_TIME_H__ -#define __ROCRTST_SRC_DISPATCH_TIME_H__ -#include "perf_common/perf_base.h" +#ifndef ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_ +#define ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_ +#include + +#include "suites/test_common/test_base.h" #include "common/base_rocr.h" #include "common/common.h" #include "hsa/hsa.h" -#include -//@Brief: This class is defined to measure the mean latency of launching -//an empty kernel +// @Brief: This class is defined to measure the mean latency of launching +// an empty kernel -class DispatchTime: public rocrtst::BaseRocR, public PerfBase { +class DispatchTime : public TestBase { public: - //@Brief: Constructor - DispatchTime(); + // @Brief: Constructor + DispatchTime(bool 
defaultInterrupt, bool launchSingleKernel); - //@Brief: Destructor - virtual ~DispatchTime(); + // @Brief: Destructor + virtual ~DispatchTime(void); - //@Brief: Set up the environment for the test - virtual void SetUp(); + // @Brief: Set up the environment for the test + virtual void SetUp(void); - //@Brief: Run the test case - virtual void Run(); + // @Brief: Run the test case + virtual void Run(void); - //@Brief: Display results we got - virtual void DisplayResults() const; + // @Brief: Display results we got + virtual void DisplayResults(void) const; - //@Brief: Clean up and close the runtime - virtual void Close(); + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); - //@Brief: Choose if use default signal or not - void UseDefaultSignal(bool use_default = true) { - use_default_ = use_default; - } - - //@Brief; Choose to launch a single kernels or not - void LaunchSingleKernel(bool launch_single = true) { - launch_single_ = launch_single; - } + // @Brief: Clean up and close the runtime + virtual void Close(void); private: - //@Brief: Get actual iteration number - virtual size_t RealIterationNum(); + // @Brief: Get actual iteration number + virtual size_t RealIterationNum(void); - //@Brief: Launch single packet each time - virtual void RunSingle(); + // @Brief: Launch single packet each time + virtual void RunSingle(void); - //@Brief: Launch multiple packets each time - virtual void RunMulti(); + // @Brief: Launch multiple packets each time + virtual void RunMulti(void); - //@Brief: Indicate if use default signal or not - bool use_default_; + // @Brief: Indicate if use default signal or not + bool use_default_interupt_; - //@Brief: Indicate if launch single kernel or not + // @Brief: Indicate if launch single kernel or not bool launch_single_; - //@Brief: Store the size of queue + // @Brief: Store the size of queue uint32_t queue_size_; - //@Brief: Number of packets in a batch + // @Brief: Number of packets in a batch 
uint32_t num_batch_; - //@Brief: Time of single default signal dispatch time - double single_default_mean_; - - //@Brief: Time of single interrupt signal dispatch time - double single_interrupt_mean_; - - //@Brief: Time of multi default signal dispatch time - double multi_default_mean_; - - //@Brief: Time of multi interrupt signal dispatch time - double multi_interrupt_mean_; + // @Brief: Ave. dispatch time + double dispatch_time_mean_; char* orig_iterrupt_env_; }; -#endif +#endif // ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_ diff --git a/rocrtst/suites/performance/flush_latency.cc b/rocrtst/suites/performance/flush_latency.cc deleted file mode 100755 index 298aefb780..0000000000 --- a/rocrtst/suites/performance/flush_latency.cc +++ /dev/null @@ -1,351 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. 
- * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#include "flush_latency.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/helper_funcs.h" -#include "common/hsatimer.h" -#include "common/os.h" -#include "gtest/gtest.h" -#include - -static const int kWorkItem = 1024 * 1204; -// Constructor -FlushLatency::FlushLatency() : - BaseRocR() { - set_group_size(0); - num_group_ = 0; - num_cus_ = 0; - - kernel_loop_count_ = 0; - mean_ = 0.0; - data_size_ = 0; - - set_requires_profile (HSA_PROFILE_BASE); -} - -// Destructor -FlushLatency::~FlushLatency() { -} - -// Set up the test environment -void FlushLatency::SetUp() { - hsa_status_t err; - - SetWorkItemNum(); - - set_kernel_file_name("flush_latency.o"); - set_kernel_name("&main"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - - //Create a queue with max number size - hsa_queue_t* q; - rocrtst::CreateQueue(*gpu_dev, &q); - set_main_queue(q); - - //Enable profiling - err = hsa_amd_profiling_set_profiler_enabled(main_queue(), 1); - ASSERT_EQ(err, 
HSA_STATUS_SUCCESS); - - rocrtst::LoadKernelFromObjFile(this); - - uint32_t total_work_items = kWorkItem * 0.3; - - //Fill up part of aql - rocrtst::InitializeAQLPacket(this, &aql()); - aql().workgroup_size_x = group_size(); - aql().grid_size_x = total_work_items; - - return; -} - -// Run the test -void FlushLatency::Run() { - hsa_status_t err; - hsa_amd_memory_pool_t cpu_pool; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - hsa_agent_t* cpu_dev = cpu_device(); - - err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool, - &device_pool()); - ASSERT_EQ(err, HSA_STATUS_INFO_BREAK); - - ASSERT_NE(device_pool().handle, 0); - - cpu_pool.handle = 0; - err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool, - &cpu_pool); - ASSERT_EQ(err, HSA_STATUS_INFO_BREAK); - - ASSERT_NE(cpu_pool.handle, 0); - -#if DEBUG - std::cout << "Device Pool Properties:" << std::endl; - err = rocrtst::DumpMemoryPoolInfo(device_pool()); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - std::cout << "Global Pool Properties:" << std::endl; - err = rocrtst::DumpMemoryPoolInfo(cpu_pool); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -#endif - uint32_t out_data_size = 1024 * 1024 * sizeof(uint32_t); - - std::vector time_none; - std::vector time_release; - - std::vector < uint64_t > time_none_stamp; - std::vector < uint64_t > time_release_stamp; - - //Query system timestamp frequency - uint64_t freq; - err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - void* out = NULL; - uint32_t* out_data; - const uint32_t queue_mask = main_queue()->size - 1; - typedef struct local_args_t { - void* arg0; - } args; - - // Warm up - uint16_t header = 0; - header |= HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; - header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - 
aql().header = header; - - err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0, - (void**) &out_data); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - args* kern_ptr = NULL; - err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0, - (void**) &kern_ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - kern_ptr->arg0 = out_data; - - aql().kernarg_address = kern_ptr; - - // Obtain the current queue write index - int64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - void *q_base_addr = main_queue()->base_address; - // Write the aql packet at the calculated queue index address. - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql(); - - hsa_signal_store_screlease(main_queue()->doorbell_signal, index); - - // Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - hsa_signal_store_screlease(signal(), 1); - - for (int i = 0; i < 1000; i++) { - err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0, - (void**) &out_data); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - args* kern_ptr = NULL; - err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0, - (void**) &kern_ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - kern_ptr->arg0 = out_data; - - aql().kernarg_address = kern_ptr; - - // Obtain the current queue write index - int64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - // Write the aql packet at the calculated queue index address. - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql(); - - hsa_signal_store_screlease(main_queue()->doorbell_signal, index); - - // Wait on the dispatch signal until the kernel is finished. 
- while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - hsa_amd_profiling_dispatch_time_t dispatch_time; - err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(), - &dispatch_time); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - uint64_t sys_start = 0; - uint64_t sys_end = 0; - err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev, - dispatch_time.start, &sys_start); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev, - dispatch_time.end, &sys_end); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - uint64_t stamp = dispatch_time.end - dispatch_time.start; - double execution_time = (double) stamp / freq * 1e6; // convert to us. - - time_none.push_back(execution_time); - time_none_stamp.push_back(stamp); - - hsa_signal_store_screlease(signal(), 1); - - if (out != NULL) { - err = hsa_memory_free(out); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - } - - out = out_data; - out_data = NULL; - } - - header = 0; - header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; - header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - aql().header = header; - - for (int i = 0; i < 1000; i++) { - err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0, - (void**) &out_data); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - args* kern_ptr = NULL; - err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0, - (void**) &kern_ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - kern_ptr->arg0 = out_data; - - aql().kernarg_address = kern_ptr; - - // Obtain the current queue write index - uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - // Write the aql packet at the calculated queue index address. 
- ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql(); - - hsa_signal_store_screlease(main_queue()->doorbell_signal, index); - - // Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - hsa_signal_store_screlease(signal(), 1); - - hsa_amd_profiling_dispatch_time_t dispatch_time; - err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(), - &dispatch_time); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - uint64_t sys_start = 0; - uint64_t sys_end = 0; - err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev, - dispatch_time.start, &sys_start); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev, - dispatch_time.end, &sys_end); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - uint64_t stamp = dispatch_time.end - dispatch_time.start; - double execution_time = (double) stamp / freq * 1e6; // convert to us. 
- time_release.push_back(execution_time); - time_release_stamp.push_back(stamp); - - if (out != NULL) { - err = hsa_memory_free(out); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - } - - out = out_data; - out_data = NULL; - } - - std::sort(time_none.begin(), time_none.end()); - std::sort(time_release.begin(), time_release.end()); - - time_none.erase(time_none.begin(), time_none.begin() + 50); - time_none.erase(time_none.end() - 50, time_none.end()); - time_release.erase(time_release.begin(), time_release.begin() + 50); - time_release.erase(time_release.end() - 50, time_release.end()); - - mean_ = rocrtst::CalcMean(time_none, time_release); - - return; -} - -void FlushLatency::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} - -void FlushLatency::DisplayResults() const { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - std::cout << std::endl << "=======================================" - << std::endl; - std::cout << "Average cache flush overhead: " << mean_ << "uS" - << std::endl; - std::cout << "=======================================" << std::endl; - return; -} diff --git a/rocrtst/suites/performance/flush_latency.h b/rocrtst/suites/performance/flush_latency.h deleted file mode 100755 index 4d4a25fa2d..0000000000 --- a/rocrtst/suites/performance/flush_latency.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#ifndef __ROCRTST_SRC_INC_FLUSH_LATENCY_H__ -#define __ROCRTST_SRC_INC_FLUSH_LATENCY_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "hsa/hsa.h" -#include - -class FlushLatency: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - FlushLatency(); - - //@Brief: Destructor - ~FlushLatency(); - - //@Brief: Set up the testing environment - virtual void SetUp(); - - //@Brief: Run the test case - virtual void Run(); - - //@Brief: Close and clean up the test enrionment - virtual void Close(); - - //@Brief: Display load bandwidth - virtual void DisplayResults() const; - - //@Brief: Set work-item configuration - void SetWorkItemNum() { -#ifdef INTERACTIVE - uint32_t tmp; - printf("Please input the number of CUs you want to try:\n"); - int i; - i = scanf("%d", &num_cus_); - - printf("Please input the number of groups you want to try:\n"); - i = scanf("%d", &num_group_); - - printf("Please input the size of each group:\n"); - i = scanf("%d", &tmp); - set_group_size(tmp); - - printf("Please input the number of kernel loop you want to try:\n"); - i = scanf("%d", &kernel_loop_count_); -#else - num_cus_ = 32; - num_group_ = 128; - group_size_ = 256; - kernel_loop_count_ = 16; -#endif - return; - } - - private: - //@Brief: number of work item in one group - uint32_t group_size_; - - //@Brief: number of group - uint32_t num_group_; - - //@Brief: number of CUs - uint32_t num_cus_; - - //@Brief: number of kernel loop - uint32_t kernel_loop_count_; - - //@Brief: Mean execution time - double mean_; - - //@Brief: data size for test - uint64_t data_size_; - -}; - -#endif - diff --git a/rocrtst/suites/performance/hsa_info.cc b/rocrtst/suites/performance/hsa_info.cc deleted file mode 100755 index fc7848e358..0000000000 --- a/rocrtst/suites/performance/hsa_info.cc +++ /dev/null @@ -1,502 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release 
License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "gtest/gtest.h" -#include "hsa_info.h" - -static hsa_status_t get_agent_info(hsa_agent_t, void*); - -static hsa_status_t get_pool_info(hsa_amd_memory_pool_t, void*); - -static int agent_number = 0; -static bool output_amd = false; - -//@Brief: Map to store the peak FLOPS for different agent -std::map flops_table = { {"Kaveri CPU", 118.4}, { - "S pectre", 737.0 - }, {"Carrizo CPU", 67.2}, {"Carrizo GPU", 819.2} -}; - -//@Brief: Vector to store the agent_names -std::vector agent_names = {"Kaveri CPU", "Spectre", - "Carri zo CPU", "Carrizo GPU" - }; - -HsaInfo::HsaInfo() : - BaseRocR() { -} - -HsaInfo::~HsaInfo() { -} - -void HsaInfo::SetUp() { - // Get Env Var to determine if output AMD specific info - char* EnvVar = rocrtst::GetEnv("HSA_VENDOR_AMD"); - - if (NULL != EnvVar) { - output_amd = ('1' == *EnvVar); - } - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } -} - -void HsaInfo::Run() { - hsa_status_t err; - // Get the system info first - // Get version info - uint16_t major, minor; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &major); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &minor); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Get timestamp frequency - uint64_t timestamp_frequency = 0; - err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, - ×tamp_frequency); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Get maximum duration of a signal wait operation - uint64_t max_wait = 0; - err = hsa_system_get_info(HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT, &max_wait); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Get Endianness of the system - hsa_endianness_t endianness; - err = hsa_system_get_info(HSA_SYSTEM_INFO_ENDIANNESS, &endianness); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Get machine model info - hsa_machine_model_t 
machine_model; - err = hsa_system_get_info(HSA_SYSTEM_INFO_MACHINE_MODEL, &machine_model); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Print out the results - std::cout << "HSA System Info:" << std::endl; - std::cout << "Runtime Version: " << major << - "." << minor << std::endl; - std::cout << "System Timestamp Frequency: " << - timestamp_frequency / 1e6 << "MHz" << std::endl; - - std::cout << "Signal Max Wait Duration: " << max_wait - << "(number of timestamp)" << std::endl; - std::cout << "Machine Model: "; - - if (HSA_MACHINE_MODEL_SMALL == machine_model) { - std::cout << "SMALL" << std::endl; - } - else if (HSA_MACHINE_MODEL_LARGE == machine_model) { - std::cout << "LARGE" << std::endl; - } - - std::cout << "System Endianness: "; - - if (HSA_ENDIANNESS_LITTLE == endianness) { - std::cout << "LITTLE" << std::endl; - } - else if (HSA_ENDIANNESS_BIG == endianness) { - std::cout << "BIG" << std::endl; - } - - std::cout << std::endl; - - // Iterate every agent and get their info - err = hsa_iterate_agents(get_agent_info, NULL); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - return; - -} - -#define RET_IF_HSA_INFO_ERR(err) { \ - if ((err) != HSA_STATUS_SUCCESS) { \ - std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \ - __FILE__ << std::endl; \ - return (err); \ - } \ -} - -static hsa_status_t get_agent_info(hsa_agent_t agent, void* data) { - int pool_number = 0; - hsa_status_t err; - { - // Increase the number of agent - agent_number++; - - // Get agent name and vendor - char name[64]; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); - RET_IF_HSA_INFO_ERR(err) - char vendor_name[64]; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_VENDOR_NAME, &vendor_name); - RET_IF_HSA_INFO_ERR(err) - - // Get agent feature - hsa_agent_feature_t agent_feature; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FEATURE, &agent_feature); - RET_IF_HSA_INFO_ERR(err) - - // Get profile supported by the agent - hsa_profile_t agent_profile; - err = 
hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile); - RET_IF_HSA_INFO_ERR(err) - - // Get floating-point rounding mode - hsa_default_float_rounding_mode_t float_rounding_mode; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE, - &float_rounding_mode); - RET_IF_HSA_INFO_ERR(err) - - // Get max number of queue - uint32_t max_queue = 0; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUES_MAX, &max_queue); - RET_IF_HSA_INFO_ERR(err) - - // Get queue min size - uint32_t queue_min_size = 0; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, - &queue_min_size); - RET_IF_HSA_INFO_ERR(err) - - // Get queue max size - uint32_t queue_max_size = 0; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, - &queue_max_size); - RET_IF_HSA_INFO_ERR(err) - - // Get queue type - hsa_queue_type_t queue_type; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_TYPE, &queue_type); - RET_IF_HSA_INFO_ERR(err) - - // Get agent node - uint32_t node; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NODE, &node); - RET_IF_HSA_INFO_ERR(err) - - // Get device type - hsa_device_type_t device_type; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); - RET_IF_HSA_INFO_ERR(err) - - // Get cache size - uint32_t cache_size[4]; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_CACHE_SIZE, cache_size); - RET_IF_HSA_INFO_ERR(err) - - // Get chip id - uint32_t chip_id = 0; - err = hsa_agent_get_info(agent, - (hsa_agent_info_t) HSA_AMD_AGENT_INFO_CHIP_ID, - &chip_id); - RET_IF_HSA_INFO_ERR(err) - - // Get cacheline size - uint32_t cacheline_size = 0; - err = hsa_agent_get_info(agent, - (hsa_agent_info_t) HSA_AMD_AGENT_INFO_CACHELINE_SIZE, - &cacheline_size); - RET_IF_HSA_INFO_ERR(err) - - // Get Max clock frequency - uint32_t max_clock_freq = 0; - err = hsa_agent_get_info(agent, - (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, - &max_clock_freq); - RET_IF_HSA_INFO_ERR(err) - - // Get Agent BDFID - 
uint16_t bdf_id = 1; - err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_BDFID, - &bdf_id); - RET_IF_HSA_INFO_ERR(err) - - // Get number of Compute Unit - uint32_t compute_unit = 0; - err = hsa_agent_get_info(agent, - (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &compute_unit); - RET_IF_HSA_INFO_ERR(err) - - // Print out the common results - std::cout << std::endl; - std::cout << "Agent #" << agent_number << ":" << std::endl; - std::cout << "Agent Name: " << name << - std::endl; - std::cout << "Agent Vendor Name: " << - vendor_name << std::endl; - - if (agent_feature & HSA_AGENT_FEATURE_KERNEL_DISPATCH - && agent_feature & HSA_AGENT_FEATURE_AGENT_DISPATCH) - std::cout << "Agent Feature: KERNEL_DISPATCH & AGENT_DISPATCH" - << std::endl; - else if (agent_feature & HSA_AGENT_FEATURE_KERNEL_DISPATCH) { - std::cout << "Agent Feature: KERNEL_DISPATCH" << std::endl; - } - else if (agent_feature & HSA_AGENT_FEATURE_AGENT_DISPATCH) { - std::cout << "Agent Feature: AGENT_DISPATCH" << std::endl; - } - else { - std::cout << "Agent Feature: Not Supported" << std::endl; - } - - if (HSA_PROFILE_BASE == agent_profile) { - std::cout << "Agent Profile: BASE_PROFILE" << std::endl; - } - else if (HSA_PROFILE_FULL == agent_profile) { - std::cout << "Agent Profile: FULL_PROFILE" << std::endl; - } - else { - std::cout << "Agent Profile: Not Supported" << std::endl; - } - - if (HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO == float_rounding_mode) { - std::cout << "Agent Floating Rounding Mode: ZERO" << std::endl; - } - else if (HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR == float_rounding_mode) { - std::cout << "Agent Floating Rounding Mode: NEAR" << std::endl; - } - else { - std::cout << "Agent Floating Rounding Mode: Not Supported" << std::endl; - } - - std::cout << "Agent Max Queue Number: " << max_queue << std::endl; - std::cout << "Agent Queue Min Size: " << queue_min_size << std::endl; - std::cout << "Agent Queue Max Size: " << queue_max_size << std::endl; - - if 
(HSA_QUEUE_TYPE_MULTI == queue_type) { - std::cout << "Agent Queue Type: MULTI" << std::endl; - } - else if (HSA_QUEUE_TYPE_SINGLE == queue_type) { - std::cout << "Agent Queue Type: SINGLE" << std::endl; - } - else { - std::cout << "Agent Queue Type: Not Supported" << std::endl; - } - - std::cout << "Agent Node: " << node << std::endl; - - if (HSA_DEVICE_TYPE_CPU == device_type) { - std::cout << "Agent Device Type: CPU" << std::endl; - } - else if (HSA_DEVICE_TYPE_GPU == device_type) { - std::cout << "Agent Device Type: GPU" << std::endl; - // Get ISA info - hsa_isa_t agent_isa; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &agent_isa); - RET_IF_HSA_INFO_ERR(err) - } - else { - std::cout << "Agent Device Type: DSP" << std::endl; - } - - std::cout << "Agent Cache Info:" << std::endl; - - for (int i = 0; i < 4; i++) { - if (cache_size[i]) { - std::cout << " $L" << i + 1 << ": " << cache_size[i] / 1024 - << "KB" << std::endl; - } - } - - std::cout << "Agent Chip ID: " << chip_id << std::endl; - std::cout << "Agent Cacheline Size: " << cacheline_size << std::endl; - std::cout << "Agent Max Clock Frequency: " << max_clock_freq << "MHz" - << std::endl; - std::cout << "Agent BDFID: " << bdf_id << std::endl; - std::cout << "Agent Compute Unit: " << compute_unit << std::endl; - - // Output Peak FLOPS and Peak Bandwidth if Env var is set - // TODO: Fan, need to add BW - if (output_amd) { - std::string agent_name = name; - - for (size_t i = 0; i < agent_names.size(); i++) { - if (agent_name.compare(agent_names[i]) == 0) - std::cout << "Agent Peak GFLOPS: " << flops_table[agent_name] - << std::endl; - } - } - - // Check if the agent is kernel agent - if (agent_feature & HSA_AGENT_FEATURE_KERNEL_DISPATCH) { - - // Get flaf of fast_f16 operation - bool fast_f16; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FAST_F16_OPERATION, - &fast_f16); - RET_IF_HSA_INFO_ERR(err) - - // Get wavefront size - uint32_t wavefront_size = 0; - err = hsa_agent_get_info(agent, 
HSA_AGENT_INFO_WAVEFRONT_SIZE, - &wavefront_size); - RET_IF_HSA_INFO_ERR(err) - - // Get max total number of work-items in a workgroup - uint32_t workgroup_max_size = 0; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, - &workgroup_max_size); - RET_IF_HSA_INFO_ERR(err) - - // Get max number of work-items of each dimension of a work-group - uint16_t workgroup_max_dim[3]; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, - &workgroup_max_dim); - RET_IF_HSA_INFO_ERR(err) - - // Get max number of a grid per dimension - hsa_dim3_t grid_max_dim; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, - &grid_max_dim); - RET_IF_HSA_INFO_ERR(err) - - // Get max total number of work-items in a grid - uint32_t grid_max_size = 0; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_SIZE, - &grid_max_size); - RET_IF_HSA_INFO_ERR(err) - - // Get max number of fbarriers per work group - uint32_t fbarrier_max_size = 0; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, - &fbarrier_max_size); - RET_IF_HSA_INFO_ERR(err) - - // Print info for kernel agent - if (true == fast_f16) { - std::cout << "Agent Fast F16 Operation: TRUE" << - std::endl; - } - - std::cout << "Agent Wavefront Size: " << - wavefront_size << std::endl; - std::cout << "Agent Workgroup Max Size: " << - workgroup_max_size << std::endl; - std::cout << - "Agent Workgroup Max Size Per Dimension: " << - std::endl; - - for (int i = 0; i < 3; i++) { - std::cout << " Dim[" << i << - "]: " << workgroup_max_dim[i] << - std::endl; - } - - std::cout << "Agent Grid Max Size: " << - grid_max_size << std::endl; - - // Stop using the above kmt functions as per SWDEV-97044 - // - uint32_t waves_per_cu = 0; - err = hsa_agent_get_info(agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, - &waves_per_cu); - RET_IF_HSA_INFO_ERR(err) - std::cout << "Agent Waves Per CU: " << - waves_per_cu << std::endl; - std::cout << "Agent Max Work-item Per CU: " - << 
wavefront_size* waves_per_cu << std::endl; - - std::cout << "Agent Grid Max Size per Dimension:" << std::endl; - - for (int i = 0; i < 3; i++) { - std::cout << " Dim[" << i << - "] " - << reinterpret_cast(&grid_max_dim)[i] << std::endl; - } - - std::cout << "Agent Max number Of fbarriers Per Workgroup: " - << fbarrier_max_size << std::endl; - } - } - - // Get pool info - std::cout << "Agent Pool Info:" << std::endl; - err = hsa_amd_agent_iterate_memory_pools(agent, get_pool_info, &pool_number); - RET_IF_HSA_INFO_ERR(err) - - return HSA_STATUS_SUCCESS; -} - -// Implement region iteration function -hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) { - hsa_status_t err; - int* p_int = reinterpret_cast(data); - (*p_int)++; - - std::cout << " Pool #" << *p_int << ":" << std::endl; - - err = rocrtst::DumpMemoryPoolInfo(pool, 4); - RET_IF_HSA_INFO_ERR(err) - - return err; -} - -#undef RET_IF_HSA_INFO_ERR - -void HsaInfo::DisplayResults() const { - if (!rocrtst::CheckProfile(this)) { - return; - } - - return; -} - -void HsaInfo::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - return; -} - diff --git a/rocrtst/suites/performance/image_bandwidth.cc b/rocrtst/suites/performance/image_bandwidth.cc deleted file mode 100755 index 482870ee8e..0000000000 --- a/rocrtst/suites/performance/image_bandwidth.cc +++ /dev/null @@ -1,328 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "image_bandwidth.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/hsatimer.h" -#include "gtest/gtest.h" -#include "hsa/hsa.h" -#include "hsa/hsa_ext_image.h" -#include -#include -#include - -ImageBandwidth::ImageBandwidth(size_t num) : - BaseRocR(), import_bandwidth_ {0.0}, export_bandwidth_ {0.0}, - copy_bandwidth_ {0.0} { - format_.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - format_.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; - geometry_ = HSA_EXT_IMAGE_GEOMETRY_2D; - - set_requires_profile (HSA_PROFILE_FULL); -} - -ImageBandwidth::~ImageBandwidth() { -} - -const size_t ImageBandwidth::Size[10] = {32, 64, 128, 256, 512, 1024, 2048, - 4096, 8192, 16384 - }; -const char* const ImageBandwidth::Str[10] = {"4K", "16K", "64K", "256K", "1M", - "4M", "16M", "64M", "256M", "1G" - }; - -void ImageBandwidth::SetUp() { - hsa_status_t err; - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - - // Find the global region - err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindGlobalPool, - &cpu_pool()); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} - -void ImageBandwidth::Run() { - hsa_status_t err; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - - for (int i = 0; i < 10; i++) { - // Create timer for import, export and copy tests - rocrtst::PerfTimer import_timer; - rocrtst::PerfTimer export_timer; - rocrtst::PerfTimer copy_timer; - std::vector import_image; - std::vector export_image; - std::vector copy_image; - // Allocate image buffer in host memory - uint32_t* image_buffer = NULL; - err = hsa_amd_memory_pool_allocate(cpu_pool(), - Size[i] * Size[i] * sizeof(uint32_t), - 0, (void**) &image_buffer); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // rocrtst::CommonCleanUp the image buffer - for (uint32_t j = 0; j < Size[i] * Size[i]; j++) { - image_buffer[j] = 0x10101010; - } - - // 
Prepare for 2D image creation - hsa_ext_image_t image_handle; - - hsa_ext_image_descriptor_t image_descriptor; - image_descriptor.geometry = geometry_; - image_descriptor.width = Size[i]; - image_descriptor.height = Size[i]; - image_descriptor.depth = 1; - image_descriptor.array_size = 0; - image_descriptor.format = format_; - - // Check if device_ supports at least read and write operation on - // image format - uint32_t capability_mask; - err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D, - &format_, &capability_mask); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_WRITE)) { - std::cout << - "Device does not support read and write operation on this kind of image!" - << std::endl; - ASSERT_NE(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_WRITE, 0); - } - - // Get image info - hsa_ext_image_data_info_t image_info; - err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor, - HSA_ACCESS_PERMISSION_RW, &image_info); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Allocate memory for image - uintptr_t ptr_temp = 0; - err = hsa_amd_memory_pool_allocate(cpu_pool(), - image_info.size + image_info.alignment, 0, (void**) &ptr_temp); - - // Align the image address - uintptr_t mul = ptr_temp / image_info.alignment; - void* ptr_image = (void*) ((mul + 1) * image_info.alignment); - - // rocrtst::CommonCleanUp the image to 0 - hsa_amd_memory_fill(ptr_image, 0, image_info.size); - - // Create image handle - err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image, - HSA_ACCESS_PERMISSION_RW, &image_handle); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Set import image region - hsa_dim3_t range = {(uint32_t) Size[i], (uint32_t) Size[i], 1}; - - hsa_ext_image_region_t image_region; - hsa_dim3_t image_offset = {0, 0, 0}; - image_region.offset = image_offset; - image_region.range = range; - - size_t iterations = RealIterationNum(); - - for (uint32_t it = 0; it < iterations; it++) { - // Create a timer 
- int index = import_timer.CreateTimer(); - - // Stamp at the beginning - import_timer.StartTimer(index); - - // Import image from host - err = hsa_ext_image_import(*gpu_dev, image_buffer, 0, 0, image_handle, - &image_region); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Stamp in the end - import_timer.StopTimer(index); - import_image.push_back(import_timer.ReadTimer(index)); - } - - // Reset image_buffer - hsa_amd_memory_fill(image_buffer, 0, Size[i] * Size[i] * sizeof(uint32_t)); - - for (uint32_t it = 0; it < iterations; it++) { - // Export image - // Stamp at the beginning - int index = export_timer.CreateTimer(); - export_timer.StartTimer(index); - - err = hsa_ext_image_export(*gpu_dev, image_handle, image_buffer, 0, 0, - &image_region); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - export_timer.StopTimer(index); - export_image.push_back(export_timer.ReadTimer(index)); - - // Check if the value is correct - for (uint32_t j = 0; j < Size[i] * Size[i]; j++) { - ASSERT_EQ(image_buffer[j], 0x10101010); - } - } - - // Create another image for copy - // Allocate memory for image - uintptr_t ptr_temp2 = 0; - err = hsa_amd_memory_pool_allocate(cpu_pool(), - image_info.size + image_info.alignment, 0, (void**) &ptr_temp2); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Align the image address - mul = ptr_temp2 / image_info.alignment; - void* ptr_image2 = (void*) ((mul + 1) * image_info.alignment); - - // rocrtst::CommonCleanUp the image to 0 - hsa_amd_memory_fill(ptr_image2, 0, image_info.size); - - // Create image handle - hsa_ext_image_t image_handle_copy; - err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image2, - HSA_ACCESS_PERMISSION_RW, &image_handle_copy); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - for (uint32_t it = 0; it < iterations; it++) { - // Stamp at the beginning - int index = copy_timer.CreateTimer(); - copy_timer.StartTimer(index); - - err = hsa_ext_image_copy(*gpu_dev, image_handle, &image_offset, - image_handle_copy, &image_offset, &range); - 
ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Stamp in the end - copy_timer.StopTimer(index); - copy_image.push_back(copy_timer.ReadTimer(index)); - - // Check if image data is correct - hsa_amd_memory_fill(image_buffer, 0, - Size[i] * Size[i] * sizeof(uint32_t)); - - // Export image - err = hsa_ext_image_export(*gpu_dev, image_handle_copy, image_buffer, - 0, 0, &image_region); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Check if the value is correct - for (uint32_t j = 0; j < Size[i] * Size[i]; j++) { - ASSERT_EQ(image_buffer[j], 0x10101010); - } - - } - - // Calculate Bandwidth - import_bandwidth_[i] = CalculateBandwidth(import_image, Size[i]); - export_bandwidth_[i] = CalculateBandwidth(export_image, Size[i]); - copy_bandwidth_[i] = CalculateBandwidth(copy_image, Size[i]); - } -} - -double ImageBandwidth::CalculateBandwidth(std::vector& vec, - size_t size) { - double mean = 0.0; - - // Delete the first timer result, which is warm up test - vec.erase(vec.begin()); - - // Sort the results - std::sort(vec.begin(), vec.end()); - - // Delete the last 20% of the results - - vec.erase(vec.begin() + num_iteration(), vec.end()); - - int num = vec.size(); - - for (int index = 0; index < num; index++) { - mean += vec[index]; - } - - mean /= num; - - return (double) size * size * 4 / mean / 1024 / 1024 / 1024; -} - -void ImageBandwidth::DisplayResults() const { - if (!rocrtst::CheckProfile(this)) { - return; - } - - fprintf(stdout, "===================================================" - "=========================\n"); - - fprintf(stdout, - " Size Import Export Copy\n"); - - for (int i = 0; i < 10; i++) { - fprintf(stdout, - " %s %f(GB/s) %f(GB/s) %f(GB/s)\n", - Str[i], import_bandwidth_[i], export_bandwidth_[i], - copy_bandwidth_[i]); - fprintf(stdout, "=================================================" - "===========================\n"); - } -} - -void ImageBandwidth::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} 
- -size_t ImageBandwidth::RealIterationNum() { - return num_iteration() * 1.2 + 1; -} diff --git a/rocrtst/suites/performance/image_bandwidth.h b/rocrtst/suites/performance/image_bandwidth.h deleted file mode 100755 index 2e28e31a39..0000000000 --- a/rocrtst/suites/performance/image_bandwidth.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#ifndef __ROCRTST_SRC_IMAGE_BANDWIDTH_H__ -#define __ROCRTST_SRC_IMAGE_BANDWIDTH_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "hsa/hsa.h" -#include "hsa/hsa_ext_image.h" -#include - -class ImageBandwidth: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor for test case of ImageBandwidth - ImageBandwidth(size_t num = 100); - - //@Brief: Destructor - virtual ~ImageBandwidth(); - - //@Brief: Setup the environment for measurement - virtual void SetUp(); - - //@Brief: Core measurement execution - virtual void Run(); - - //@Brief: Clean up and retrive the resource - virtual void Close(); - - //@Brief: Display results - virtual void DisplayResults() const; - - private: - //@Brief: Define image size and corresponding string - static const size_t Size[10]; - static const char* const Str[10]; - - //@Brief: Get actual iteration number - size_t RealIterationNum(); - - //@Brief: Calculate Bandwidth - double CalculateBandwidth(std::vector& vec, size_t size); - - protected: - //@Brief: bandwidth data - double import_bandwidth_[10]; - double export_bandwidth_[10]; - double copy_bandwidth_[10]; - - //@Brief: Image format - hsa_ext_image_format_t format_; - - //@Brief: Image geometry - hsa_ext_image_geometry_t geometry_; -}; - -#endif diff --git a/rocrtst/suites/performance/image_load_bandwidth.cc b/rocrtst/suites/performance/image_load_bandwidth.cc deleted file mode 100755 index 33ec707d9d..0000000000 --- 
a/rocrtst/suites/performance/image_load_bandwidth.cc +++ /dev/null @@ -1,270 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#include "image_load_bandwidth.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/hsatimer.h" -#include "common/helper_funcs.h" -#include "gtest/gtest.h" -#include "hsa/hsa_ext_image.h" -#include -#include - -// Constructor of the class -ImageLoadBandwidth::ImageLoadBandwidth() : - BaseRocR() { - load_bandwidth_ = 0.0; - image_size_ = 0; - - set_requires_profile (HSA_PROFILE_FULL); -} - -// Destructor of the class -ImageLoadBandwidth::~ImageLoadBandwidth() { - -} - -// Set up the environment -void ImageLoadBandwidth::SetUp() { - hsa_agent_t* gpu_dev = gpu_device1(); - - set_kernel_file_name("load_2d_image.o"); - set_kernel_name("&__OpenCL_load_2d_image_kernel"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - //Create a queue with max number size - hsa_queue_t* q = main_queue(); - rocrtst::CreateQueue(*gpu_dev, &q); - - rocrtst::LoadKernelFromObjFile(this); - - //Fill up part of aql - rocrtst::InitializeAQLPacket(this, &aql()); - aql().setup = 0; - aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - - return; -} - -// Run the test -void ImageLoadBandwidth::Run() { - hsa_agent_t* gpu_dev = gpu_device1(); - hsa_agent_t* cpu_dev = cpu_device(); - - hsa_status_t err; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - hsa_ext_image_descriptor_t image_descriptor; - image_descriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D; - image_descriptor.width = 256; - image_descriptor.height = 256; - image_descriptor.depth = 1; - image_descriptor.array_size = 0; - image_descriptor.format.channel_type = - HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; - image_descriptor.format.channel_order = 
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - - hsa_ext_image_format_t image_format; - image_format.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; - image_format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - - // Check if device_ supports at least read only operation on image format - uint32_t capability_mask; - err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D, - &image_format, &capability_mask); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY)) { - ASSERT_FALSE( - "Device does not support read and write operation on this kind of image!"); - } - - // Get image info - hsa_ext_image_data_info_t image_info; - err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor, - HSA_ACCESS_PERMISSION_RO, &image_info); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - image_size_ = image_info.size; - - std::vector time; - - for (uint32_t i = 0; i < num_iteration(); i++) { -#ifdef DEBUG - std::cout << "."; - fflush(stdout); -#endif - // Allocate memory space for image - // Find the global region - err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool, - &cpu_pool()); - ASSERT_EQ(err, HSA_STATUS_INFO_BREAK); - uintptr_t ptr_temp = 0; - err = hsa_amd_memory_pool_allocate(cpu_pool(), - image_info.size + image_info.alignment, - 0, (void**) &ptr_temp); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, (void*) ptr_temp); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Align the image address - uintptr_t mul = ptr_temp / image_info.alignment; - void* ptr_image = (void*) ((mul + 1) * image_info.alignment); - - // rocrtst::CommonCleanUp the image memory to 1 - err = hsa_amd_memory_fill(ptr_image, 1, image_info.size); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Create image handle - hsa_ext_image_t image_handle; - err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image, - HSA_ACCESS_PERMISSION_RO, &image_handle); - ASSERT_EQ(err, 
HSA_STATUS_SUCCESS); - - // Allocate and initialize the kernel argument - typedef struct args_t { - uint64_t arg0; - int* arg1; - int istart; - int iend; - int istep; - } args; - - int local_out = 5; - int istart = 0; - int iend = 64; - int istep = 1; - - args* kern_ptr = NULL; - err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0, - (void**) &kern_ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - kern_ptr->arg0 = image_handle.handle; - kern_ptr->arg1 = &local_out; - kern_ptr->istart = istart; - kern_ptr->iend = iend; - kern_ptr->istep = istep; - - aql().kernarg_address = kern_ptr; - - // Obtain the current queue write index - uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - void *q_base_addr = main_queue()->base_address; - - // Write the aql packet at the calculated queue index address. - const uint32_t queue_mask = main_queue()->size - 1; - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql(); - - rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |= - HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - hsa_signal_store_release(main_queue()->doorbell_signal, index); - - // Wait on the dispatch signal until the kernel is finished. 
- while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - p_timer.StopTimer(id); - - time.push_back(p_timer.ReadTimer(id)); - - hsa_signal_store_release(signal(), 1); - - err = hsa_ext_image_destroy(*gpu_dev, image_handle); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_memory_deregister(ptr_image, image_info.size); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - hsa_amd_memory_pool_free((void*) ptr_temp); - } - - // Calculte the mean load time - time.erase(time.begin()); -#ifdef DEBUG - - for (uint32_t i = 0; i < time.size(); i++) { - std::cout << time[i] << std::endl; - } - -#endif - double mean_time = rocrtst::CalcMean(time); - load_bandwidth_ = image_size_ / mean_time / 1024 / 1024 / 1024; - -} - -void ImageLoadBandwidth::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} - -void ImageLoadBandwidth::DisplayResults() const { - if (!rocrtst::CheckProfile(this)) { - return; - } - - std::cout << "======================================" - "======================================" << std::endl; - std::cout << " Image Size(bytes): LoadBandwidth(GB/S): " - << std::endl; - std::cout << " " << image_size_ << " " - << load_bandwidth_ << std::endl; -} - diff --git a/rocrtst/suites/performance/image_store_bandwidth.cc b/rocrtst/suites/performance/image_store_bandwidth.cc deleted file mode 100755 index ea30a620c4..0000000000 --- a/rocrtst/suites/performance/image_store_bandwidth.cc +++ /dev/null @@ -1,271 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. 
- * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "image_store_bandwidth.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/helper_funcs.h" -#include "common/hsatimer.h" -#include "gtest/gtest.h" -#include "hsa/hsa_ext_image.h" -#include -#include - -// Constructor of the class -ImageStoreBandwidth::ImageStoreBandwidth() : - BaseRocR() { - store_bandwidth_ = 0.0; - store_bandwidth_ = 0.0; - image_size_ = 0; - - set_requires_profile (HSA_PROFILE_FULL); -} - -// Destructor of the class -ImageStoreBandwidth::~ImageStoreBandwidth() { - -} - -// Set up the environment -void ImageStoreBandwidth::SetUp() { - - set_kernel_file_name("store_2d_image.o"); - set_kernel_name("&__OpenCL_store_2d_image_kernel"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - - //Create a queue with max number size - hsa_queue_t* q = nullptr; - rocrtst::CreateQueue(*gpu_dev, &q); - set_main_queue(q); - - rocrtst::LoadKernelFromObjFile(this); - - //Fill up part of aql - rocrtst::InitializeAQLPacket(this, &aql()); - aql().setup = 0; - aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - - return; -} - -// Run the test -void ImageStoreBandwidth::Run() { - hsa_status_t err; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - hsa_agent_t* cpu_dev = cpu_device(); - - hsa_ext_image_descriptor_t image_descriptor; - image_descriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D; - image_descriptor.width = 256; - image_descriptor.height = 256; - image_descriptor.depth = 1; - image_descriptor.array_size = 0; - image_descriptor.format.channel_type = - HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; - image_descriptor.format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - - hsa_ext_image_format_t image_format; - image_format.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; - image_format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - - // Check if device_ supports at 
least read only operation on image format - uint32_t capability_mask; - err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D, - &image_format, &capability_mask); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY)) { - std::cout << - "Device does not support read and write operation on this kind of image!" - << std::endl; - ASSERT_NE(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY, 0); - } - - // Get image info - hsa_ext_image_data_info_t image_info; - err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor, - HSA_ACCESS_PERMISSION_RW, &image_info); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - image_size_ = image_info.size; - - std::vector time; - - for (uint32_t i = 0; i < num_iteration(); i++) { -#ifdef DEBUG - std::cout << "."; - fflush(stdout); -#endif - // Allocate memory space for image - err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool, - &cpu_pool()); - ASSERT_EQ(err, HSA_STATUS_INFO_BREAK); - - uintptr_t ptr_temp = 0; - err = hsa_amd_memory_pool_allocate(cpu_pool(), - image_info.size + image_info.alignment, - 0, (void**) &ptr_temp); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Align the image address - uintptr_t mul = ptr_temp / image_info.alignment; - void* ptr_image = (void*) ((mul + 1) * image_info.alignment); - - // rocrtst::CommonCleanUp the image memory to 0 - err = hsa_amd_memory_fill(ptr_image, 0, image_info.size); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Create image handle - hsa_ext_image_t image_handle; - err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image, - HSA_ACCESS_PERMISSION_RO, &image_handle); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Allocate and initialize the kernel argument - typedef struct args_t { - uint64_t arg0; - int istart; - int iend; - int istep; - } args; - - //int local_out = 5; - int istart = 0; - int iend = 64; - int istep = 1; - - args* kern_ptr = NULL; - err = hsa_amd_memory_pool_allocate(cpu_pool(), 
sizeof(args), 0, - (void**) &kern_ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - kern_ptr->arg0 = image_handle.handle; - kern_ptr->istart = istart; - kern_ptr->iend = iend; - kern_ptr->istep = istep; - - aql().kernarg_address = kern_ptr; - - // Obtain the current queue write index - uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - void *q_base_addr = main_queue()->base_address; - // Write the aql packet at the calculated queue index address. - const uint32_t queue_mask = main_queue()->size - 1; - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql(); - - rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |= - HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - hsa_signal_store_release(main_queue()->doorbell_signal, index); - - // Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - p_timer.StopTimer(id); - - time.push_back(p_timer.ReadTimer(id)); - - hsa_signal_store_release(signal(), 1); - - err = hsa_ext_image_destroy(*gpu_dev, image_handle); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_memory_deregister(ptr_image, image_info.size); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - hsa_amd_memory_pool_free(reinterpret_cast(ptr_temp)); - } - - // Calculte the mean load time - time.erase(time.begin()); -#ifdef DEBUG - - for (size_t i = 0; i < time.size(); i++) { - std::cout << time[i] << std::endl; - } - -#endif - double mean_time = rocrtst::CalcMean(time); - std::cout << "mean time: " << mean_time << std::endl; - - store_bandwidth_ = image_size_ / mean_time / 1024 / 1024 / 1024; -} - -void ImageStoreBandwidth::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} - -void ImageStoreBandwidth::DisplayResults() const { - if 
(!rocrtst::CheckProfile(this)) { - return; - } - - std::cout << "=============================================" - "===============================" << std::endl; - - std::cout << " Image Size(bytes): StoreBandwidth(GB/S): " - << std::cout; - std::cout << " " << image_size_ << " " - << store_bandwidth_ << std::endl; -} - diff --git a/rocrtst/suites/performance/kernels/cu_masking.brig b/rocrtst/suites/performance/kernels/cu_masking.brig deleted file mode 100644 index bec66be1b1b2e09dbbc1edaafcd6373992e58be1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1200 zcmb`GFKkm$6vof{LnvEAo~W3Zcp=?F!bHUcA&8D-X_r7k0J(&{>@{nXwr^RBLsTM= zhzSI8asq*@Kp>C^1QLNjA`$$)_wMbQ&T^8k-?`s+?>YD0ciVlwmp*!S&xC7HLx|9_Rg~Dpg>T)t+U?b)o?md-UK^9c^hT zqBw=+=?Haj0WQHOH~tKN1>b$2dc|dyxYlR#){Zt9k+}i zib_v+r&#qGR!cjmeR;9n(HyT$uW6CZ(88eFX5GT{o<-$-`}_jGX19`$u(S~W3wguh zbRXqAIZsN@#S)9s7iho8b#C?T;{GA0M;X4ZCioh4a%z+_ch`j-H$#^ -#include -#include -#include - -//@Brief: This is trying to replicate clinfo - -class HsaInfo: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - HsaInfo(); - - //@Brief: Destructor - virtual ~HsaInfo(); - - //@Brief: Set up the environment for the test - virtual void SetUp(); - - //@Brief: Run the test case - virtual void Run(); - - //@Brief: Display results we got - virtual void DisplayResults() const; - - //@Brief: Clean up and close the runtime - virtual void Close(); - -}; - -#endif - +__kernel void +empty_kernel(void) { + return; +} + diff --git a/rocrtst/suites/performance/kernels/empty_kernel.hsail b/rocrtst/suites/performance/kernels/empty_kernel.hsail deleted file mode 100755 index 9736e413a9..0000000000 --- a/rocrtst/suites/performance/kernels/empty_kernel.hsail +++ /dev/null @@ -1,12 +0,0 @@ -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - -prog kernel &__Empty_kernel() -{ - - ret; -}; - diff --git 
a/rocrtst/suites/performance/kernels/flush_latency.hsail b/rocrtst/suites/performance/kernels/flush_latency.hsail deleted file mode 100755 index 21ed473d0c..0000000000 --- a/rocrtst/suites/performance/kernels/flush_latency.hsail +++ /dev/null @@ -1,88 +0,0 @@ -module &m:1:0:$full:$large:$default; - -/* Copyright 2014 HSA Foundation Inc. All Rights Reserved. - * - * HSAF is granting you permission to use this software and documentation (if - * any) (collectively, the "Materials") pursuant to the terms and conditions - * of the Software License Agreement included with the Materials. If you do - * not have a copy of the Software License Agreement, contact the HSA Foundation for a copy. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. - */ - -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - - -/** - * @brief Hsail kernel to benchmark READ accesses to system memory. - * The kernel is given a input buffer from which each each thread will - * read. 
The thread will read from multiple locations of the input buffer. - * The locations to read from is determined by the work-item Id, the function - * being work-item Id modulo total number of work-items in the global work grid. - * So given a global work grid of 16 work-items the reads by a thread with absolute - * id 4 would be 4, 20, 36, 52, etc. - * - * @NOTE: A constraint imposed by the kernel is that the buffer size be large - * enough to support 16 reads by each thread. So a dispatch of 8 work-items - * should allocate enough buffer for 8 * 16 * sizeof(uint32_t). - * - * @param bufStart beginning byte address of user buffer in system memory - * from which kernel threads could read - * - * @param bufEnd byte address that follows the end of user buffer. Accessing - * memory at bufEnd is illegal - * - * @param addrStep size by which to increment byte address following each read - * operation. The value represents total number of work-items * sizeof(uint32_t) - * - * @param outAddr argument that is passed by the user to be updated with values - * read by the kernel threads. This is ensure compiler and finalizer do not eliminate - * code because the values being read are not used in any meaningfule way. 
- * - */ -prog kernel &main(kernarg_u64 %outAddr) { - - pragma "AMD RTI", "ARGSTART:__SysMemLoad"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "uavid:8"; - pragma "AMD RTI", "privateid:8"; - pragma "AMD RTI", "ARGEND:__SysMemLoad"; - - ld_kernarg_u64 $d0, [%outAddr]; - - // Compute the absolute id of current thread - // and shift it by two to get index into user - // buffer to access for Read operation - workitemflatabsid_u32 $s0; - shl_u32 $s0, $s0, 2; - cvt_u64_u32 $d4, $s0; - - // Add index to base address of user buffer to obtain - // effective address for access - add_u64 $d0, $d0, $d4; - - mov_u32 $s2, 1; - - st_global_u32 $s2, [$d0]; - -}; - diff --git a/rocrtst/suites/performance/kernels/flush_latency_base.hsail b/rocrtst/suites/performance/kernels/flush_latency_base.hsail deleted file mode 100755 index 015614252e..0000000000 --- a/rocrtst/suites/performance/kernels/flush_latency_base.hsail +++ /dev/null @@ -1,88 +0,0 @@ -module &m:1:0:$base:$large:$default; - -/* Copyright 2014 HSA Foundation Inc. All Rights Reserved. - * - * HSAF is granting you permission to use this software and documentation (if - * any) (collectively, the "Materials") pursuant to the terms and conditions - * of the Software License Agreement included with the Materials. If you do - * not have a copy of the Software License Agreement, contact the HSA Foundation for a copy. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. - */ - -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - - -/** - * @brief Hsail kernel to benchmark READ accesses to system memory. - * The kernel is given a input buffer from which each each thread will - * read. The thread will read from multiple locations of the input buffer. - * The locations to read from is determined by the work-item Id, the function - * being work-item Id modulo total number of work-items in the global work grid. - * So given a global work grid of 16 work-items the reads by a thread with absolute - * id 4 would be 4, 20, 36, 52, etc. - * - * @NOTE: A constraint imposed by the kernel is that the buffer size be large - * enough to support 16 reads by each thread. So a dispatch of 8 work-items - * should allocate enough buffer for 8 * 16 * sizeof(uint32_t). - * - * @param bufStart beginning byte address of user buffer in system memory - * from which kernel threads could read - * - * @param bufEnd byte address that follows the end of user buffer. Accessing - * memory at bufEnd is illegal - * - * @param addrStep size by which to increment byte address following each read - * operation. 
The value represents total number of work-items * sizeof(uint32_t) - * - * @param outAddr argument that is passed by the user to be updated with values - * read by the kernel threads. This is ensure compiler and finalizer do not eliminate - * code because the values being read are not used in any meaningfule way. - * - */ -prog kernel &main(kernarg_u64 %outAddr) { - - pragma "AMD RTI", "ARGSTART:__SysMemLoad"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "uavid:8"; - pragma "AMD RTI", "privateid:8"; - pragma "AMD RTI", "ARGEND:__SysMemLoad"; - - ld_kernarg_u64 $d0, [%outAddr]; - - // Compute the absolute id of current thread - // and shift it by two to get index into user - // buffer to access for Read operation - workitemflatabsid_u32 $s0; - shl_u32 $s0, $s0, 2; - cvt_u64_u32 $d4, $s0; - - // Add index to base address of user buffer to obtain - // effective address for access - add_u64 $d0, $d0, $d4; - - mov_u32 $s2, 1; - - st_global_u32 $s2, [$d0]; - -}; - diff --git a/rocrtst/suites/performance/kernels/load_2d_image.hsail b/rocrtst/suites/performance/kernels/load_2d_image.hsail deleted file mode 100755 index 637c14b273..0000000000 --- a/rocrtst/suites/performance/kernels/load_2d_image.hsail +++ /dev/null @@ -1,109 +0,0 @@ -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - -prog kernel &__OpenCL_load_2d_image_kernel( - kernarg_rwimg %input, - kernarg_u64 %result, - kernarg_u32 %istart, - kernarg_u32 %iend, - kernarg_u32 %istep) -{ - pragma "AMD RTI", "ARGSTART:__OpenCL_load_2d_image_kernel"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "uavid:8"; - pragma "AMD RTI", 
"privateid:8"; - pragma "AMD RTI", "ARGEND:__OpenCL_load_2d_image_kernel"; - -@__OpenCL_load_2d_image_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 1; - workitemabsid_u32 $s1, 0; - ld_kernarg_rwimg $d5, [%input]; - ld_kernarg_u32 $s2, [%istart]; - ld_kernarg_u32 $s3, [%iend]; - ld_kernarg_u32 $s4, [%istep]; - - add_u32 $s9, 0, 0; // reset s9 to zero -@loop: - add_u32 $s2, $s2, $s4; - - ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0); //(coordWidth, coordHeight) - add_u32 $s9, $s9, $s5; - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0); - add_u32 $s9, $s9, $s6; - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0); - add_u32 $s9, $s9, $s7; - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0); - add_u32 $s9, $s9, $s8; - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0); - add_u32 $s9, $s9, $s5; - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0); - add_u32 $s9, $s9, $s6; - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0); - add_u32 $s9, $s9, $s7; - - //force to 
retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0); - ld_kernarg_align(8)_width(all)_u64 $d4, [%result]; - add_u32 $s9, $s9, $s8; - - st_u32 $s9, [$d4]; - -//loop until we hit condition - cmp_lt_b1_u32 $c0, $s2, $s3; - cbr_b1 $c0, @loop; -}; diff --git a/rocrtst/suites/performance/kernels/simple_kernel.hsail b/rocrtst/suites/performance/kernels/simple_kernel.hsail deleted file mode 100755 index 063f9ece3c..0000000000 --- a/rocrtst/suites/performance/kernels/simple_kernel.hsail +++ /dev/null @@ -1,37 +0,0 @@ -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - - -/* This function takes in 2 memory locations, one storing a number of - iterations to execute, and the other a place to store a result. - The function iterates through a loop "iteration" times, and stores - the number of iterations executed in the "results" location. - A successful run is when the value stored in %iteration is the - same as the value store in %results. 
-*/ - -prog kernel &__simple_kernel( - kernarg_u64 %iteration, - kernarg_u64 %results) -{ - ret; - ld_kernarg_align(8)_width(all)_u64 $d1, [%iteration]; - ld_kernarg_align(8)_width(all)_u64 $d2, [%results]; - - ld_global_u32 $s1, [$d1]; - mov_u32 $s2, 0; - - -@loop: - add_u32 $s2, $s2, 1; - cmp_lt_b1_u32 $c0, $s2, $s1; - cbr_b1 $c0, @loop; - - st_global_u32 $s2, [$d2]; - - ret; -}; - diff --git a/rocrtst/suites/performance/kernels/simple_kernel_base.hsail b/rocrtst/suites/performance/kernels/simple_kernel_base.hsail deleted file mode 100755 index 0ee7207b2a..0000000000 --- a/rocrtst/suites/performance/kernels/simple_kernel_base.hsail +++ /dev/null @@ -1,28 +0,0 @@ -module &m:1:0:$base:$large:$default; -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - -prog kernel &__simple_kernel( - kernarg_u64 %iteration, - kernarg_u64 %results) -{ - - ld_kernarg_align(8)_width(all)_u64 $d1, [%iteration]; - ld_kernarg_align(8)_width(all)_u64 $d2, [%results]; - - ld_global_u32 $s1, [$d1]; - mov_u32 $s2, 0; - - -@loop: - add_u32 $s2, $s2, 1; - cmp_lt_b1_u32 $c0, $s2, $s1; - cbr_b1 $c0, @loop; - - st_global_u32 $s2, [$d2]; - - ret; -}; - diff --git a/rocrtst/suites/performance/kernels/store_2d_image.hsail b/rocrtst/suites/performance/kernels/store_2d_image.hsail deleted file mode 100755 index b24bdebb14..0000000000 --- a/rocrtst/suites/performance/kernels/store_2d_image.hsail +++ /dev/null @@ -1,105 +0,0 @@ -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - -prog kernel &__OpenCL_store_2d_image_kernel( - kernarg_rwimg %output, - kernarg_u32 %istart, - kernarg_u32 %iend, - kernarg_u32 %istep) -{ - pragma "AMD RTI", "ARGSTART:__OpenCL_store_2d_image_kernel"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "uavid:8"; - pragma "AMD RTI", 
"privateid:8"; - pragma "AMD RTI", "ARGEND:__OpenCL_store_2d_image_kernel"; - -@__OpenCL_store_2d_image_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 1; - workitemabsid_u32 $s1, 0; - ld_kernarg_rwimg $d5, [%output]; - ld_kernarg_u32 $s2, [%istart]; - ld_kernarg_u32 $s3, [%iend]; - ld_kernarg_u32 $s4, [%istep]; - - mov_b32 $s5, 0; -@loop: - add_u32 $s2, $s2, $s4; - add_u32 $s5, $s5, 1; - stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0); - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - add_u32 $s5, $s5, $s2; - stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0); - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - add_u32 $s5, $s5, $s2; - stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0); - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - add_u32 $s5, $s5, $s2; - stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0); - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - add_u32 $s5, $s5, $s2; - stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0); - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - add_u32 $s5, $s5, $s2; - stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0); - - //force to retrieve different image elements - add_u32 $s1, $s1, 64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - add_u32 $s5, $s5, $s2; - stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0); - - //force to retrieve different image elements - add_u32 $s1, $s1, 
64; - and_b32 $s1, $s1, 255; - add_u32 $s0, $s0, 64; - and_b32 $s0, $s0, 255; - - add_u32 $s5, $s5, $s2; - stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0); - -//loop until we hit condition - cmp_lt_b1_u32 $c0, $s2, $s3; - cbr_b1 $c0, @loop; - ret; -}; diff --git a/rocrtst/suites/performance/kernels/sysMemRead.hsail b/rocrtst/suites/performance/kernels/sysMemRead.hsail deleted file mode 100755 index bfdb35de7c..0000000000 --- a/rocrtst/suites/performance/kernels/sysMemRead.hsail +++ /dev/null @@ -1,237 +0,0 @@ -module &m:1:0:$full:$large:$default; - -/* Copyright 2014 HSA Foundation Inc. All Rights Reserved. - * - * HSAF is granting you permission to use this software and documentation (if - * any) (collectively, the "Materials") pursuant to the terms and conditions - * of the Software License Agreement included with the Materials. If you do - * not have a copy of the Software License Agreement, contact the HSA Foundation for a copy. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. 
- */ - -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - - -/** - * @brief Hsail kernel to benchmark READ accesses to system memory. - * The kernel is given a input buffer from which each each thread will - * read. The thread will read from multiple locations of the input buffer. - * The locations to read from is determined by the work-item Id, the function - * being work-item Id modulo total number of work-items in the global work grid. - * So given a global work grid of 16 work-items the reads by a thread with absolute - * id 4 would be 4, 20, 36, 52, etc. - * - * @NOTE: A constraint imposed by the kernel is that the buffer size be large - * enough to support 16 reads by each thread. So a dispatch of 8 work-items - * should allocate enough buffer for 8 * 16 * sizeof(uint32_t). - * - * @param bufStart beginning byte address of user buffer in system memory - * from which kernel threads could read - * - * @param bufEnd byte address that follows the end of user buffer. Accessing - * memory at bufEnd is illegal - * - * @param addrStep size by which to increment byte address following each read - * operation. The value represents total number of work-items * sizeof(uint32_t) - * - * @param outAddr argument that is passed by the user to be updated with values - * read by the kernel threads. This is ensure compiler and finalizer do not eliminate - * code because the values being read are not used in any meaningfule way. 
- * - */ -prog kernel &__SysMemLoad(kernarg_u64 %bufStart, - kernarg_u64 %bufEnd, - kernarg_u64 %addrStep, - kernarg_u64 %outAddr) { - - pragma "AMD RTI", "ARGSTART:__SysMemLoad"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "uavid:8"; - pragma "AMD RTI", "privateid:8"; - pragma "AMD RTI", "ARGEND:__SysMemLoad"; - - // Retrieve the values of input arguments - // bufStart refers to the starting byte address - // bufEnd refers to the end of byte address - // addrStep refers to the product of total number - // of work-items in the grid * sizeof(uint32_t) - ld_kernarg_u64 $d0, [%bufStart]; - ld_kernarg_u64 $d1, [%bufEnd]; - ld_kernarg_u64 $d2, [%addrStep]; - ld_kernarg_u64 $d3, [%outAddr]; - - // Compute the absolute id of current thread - // and shift it by two to get index into user - // buffer to access for Read operation - workitemflatabsid_u32 $s0; - shl_u32 $s0, $s0, 2; - cvt_u64_u32 $d4, $s0; - - // Add index to base address of user buffer to obtain - // effective address for access - add_u64 $d0, $d0, $d4; - add_u64 $d3, $d3, $d4; - - // Initialize thread's read accumulator to zero - mov_u32 $s2, 0; - -@loop: - - // Read sixteeen values with a stride that is - // determined by the total number of work-items - // in the global grid - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 
$s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - // Update 
output buffer with values read - // from input buffer - st_global_u32 $s2, [$d3]; - -}; - diff --git a/rocrtst/suites/performance/kernels/sysMemRead_base.hsail b/rocrtst/suites/performance/kernels/sysMemRead_base.hsail deleted file mode 100755 index 264a194c92..0000000000 --- a/rocrtst/suites/performance/kernels/sysMemRead_base.hsail +++ /dev/null @@ -1,237 +0,0 @@ -module &m:1:0:$base:$large:$default; - -/* Copyright 2014 HSA Foundation Inc. All Rights Reserved. - * - * HSAF is granting you permission to use this software and documentation (if - * any) (collectively, the "Materials") pursuant to the terms and conditions - * of the Software License Agreement included with the Materials. If you do - * not have a copy of the Software License Agreement, contact the HSA Foundation for a copy. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. - */ - -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - - -/** - * @brief Hsail kernel to benchmark READ accesses to system memory. 
- * The kernel is given a input buffer from which each each thread will - * read. The thread will read from multiple locations of the input buffer. - * The locations to read from is determined by the work-item Id, the function - * being work-item Id modulo total number of work-items in the global work grid. - * So given a global work grid of 16 work-items the reads by a thread with absolute - * id 4 would be 4, 20, 36, 52, etc. - * - * @NOTE: A constraint imposed by the kernel is that the buffer size be large - * enough to support 16 reads by each thread. So a dispatch of 8 work-items - * should allocate enough buffer for 8 * 16 * sizeof(uint32_t). - * - * @param bufStart beginning byte address of user buffer in system memory - * from which kernel threads could read - * - * @param bufEnd byte address that follows the end of user buffer. Accessing - * memory at bufEnd is illegal - * - * @param addrStep size by which to increment byte address following each read - * operation. The value represents total number of work-items * sizeof(uint32_t) - * - * @param outAddr argument that is passed by the user to be updated with values - * read by the kernel threads. This is ensure compiler and finalizer do not eliminate - * code because the values being read are not used in any meaningfule way. 
- * - */ -prog kernel &__SysMemLoad(kernarg_u64 %bufStart, - kernarg_u64 %bufEnd, - kernarg_u64 %addrStep, - kernarg_u64 %outAddr) { - - pragma "AMD RTI", "ARGSTART:__SysMemLoad"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "uavid:8"; - pragma "AMD RTI", "privateid:8"; - pragma "AMD RTI", "ARGEND:__SysMemLoad"; - - // Retrieve the values of input arguments - // bufStart refers to the starting byte address - // bufEnd refers to the end of byte address - // addrStep refers to the product of total number - // of work-items in the grid * sizeof(uint32_t) - ld_kernarg_u64 $d0, [%bufStart]; - ld_kernarg_u64 $d1, [%bufEnd]; - ld_kernarg_u64 $d2, [%addrStep]; - ld_kernarg_u64 $d3, [%outAddr]; - - // Compute the absolute id of current thread - // and shift it by two to get index into user - // buffer to access for Read operation - workitemflatabsid_u32 $s0; - shl_u32 $s0, $s0, 2; - cvt_u64_u32 $d4, $s0; - - // Add index to base address of user buffer to obtain - // effective address for access - add_u64 $d0, $d0, $d4; - add_u64 $d3, $d3, $d4; - - // Initialize thread's read accumulator to zero - mov_u32 $s2, 0; - -@loop: - - // Read sixteeen values with a stride that is - // determined by the total number of work-items - // in the global grid - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 
$s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - ld_global_u32 $s1, [$d0]; - add_u32 $s2, $s1, $s2; - add_u64 $d0, $d0, $d2; - - // Update 
output buffer with values read - // from input buffer - st_global_u32 $s2, [$d3]; - -}; - diff --git a/rocrtst/suites/performance/kernels/sysMemWrite.hsail b/rocrtst/suites/performance/kernels/sysMemWrite.hsail deleted file mode 100755 index 97a83e6105..0000000000 --- a/rocrtst/suites/performance/kernels/sysMemWrite.hsail +++ /dev/null @@ -1,105 +0,0 @@ -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - -prog kernel &__SysMemStore(kernarg_u64 %bufStart, - kernarg_u64 %bufEnd, - kernarg_u64 %addrStep, - kernarg_u64 %deadArg) { - - // Directives for Compiler - pragma "AMD RTI", "ARGSTART:__SysMemStore"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "uavid:8"; - pragma "AMD RTI", "privateid:8"; - pragma "AMD RTI", "ARGEND:__SysMemStore"; - - // Retrieve the values of input arguments - // bufStart refers to the starting byte address - // bufEnd refers to the end of byte address - // addrStep refers to the product of total number - // of work-items in the grid * sizeof(uint32_t) - ld_kernarg_u64 $d0, [%bufStart]; - ld_kernarg_u64 $d1, [%bufEnd]; - ld_kernarg_u64 $d2, [%addrStep]; - ld_kernarg_u64 $d3, [%deadArg]; - - // Compute the absolute id of current thread - // and shift it by two to get index into user - // buffer to access for Write operation - workitemflatabsid_u32 $s0; - shl_u32 $s0, $s0, 2; - - // Convert the thread id into a 64-bit number - // and add it to the starting address of user - // buffer to obtain effective address for access - cvt_u64_u32 $d4, $s0; - add_u64 $d0, $d0, $d4; - - -@loop: - - // Write sixteeen values with a stride that is - // determined by the total number of work-items - // in the global grid - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - 
st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - // Loop until we hit end of buffer [%bufEnd] - cmp_lt_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @loop; - -}; - diff --git a/rocrtst/suites/performance/kernels/sysMemWrite_base.hsail b/rocrtst/suites/performance/kernels/sysMemWrite_base.hsail deleted file mode 100755 index e2f304fe1d..0000000000 --- a/rocrtst/suites/performance/kernels/sysMemWrite_base.hsail +++ /dev/null @@ -1,105 +0,0 @@ -module &m:1:0:$base:$large:$default; -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - -prog kernel &__SysMemStore(kernarg_u64 %bufStart, - kernarg_u64 %bufEnd, - kernarg_u64 %addrStep, - kernarg_u64 %deadArg) { - - // Directives for Compiler - pragma "AMD RTI", "ARGSTART:__SysMemStore"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "uavid:8"; - pragma "AMD RTI", "privateid:8"; - pragma "AMD RTI", "ARGEND:__SysMemStore"; - - // Retrieve the values of input arguments - // bufStart refers to the starting byte address - // bufEnd refers to the end of byte address - // addrStep refers to the product of total number - // of work-items in the grid * 
sizeof(uint32_t) - ld_kernarg_u64 $d0, [%bufStart]; - ld_kernarg_u64 $d1, [%bufEnd]; - ld_kernarg_u64 $d2, [%addrStep]; - ld_kernarg_u64 $d3, [%deadArg]; - - // Compute the absolute id of current thread - // and shift it by two to get index into user - // buffer to access for Write operation - workitemflatabsid_u32 $s0; - shl_u32 $s0, $s0, 2; - - // Convert the thread id into a 64-bit number - // and add it to the starting address of user - // buffer to obtain effective address for access - cvt_u64_u32 $d4, $s0; - add_u64 $d0, $d0, $d4; - - -@loop: - - // Write sixteeen values with a stride that is - // determined by the total number of work-items - // in the global grid - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - st_global_u32 $s0, [$d0]; - add_u64 $d0, $d0, $d2; - - // Loop until we hit end of buffer [%bufEnd] - cmp_lt_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @loop; - -}; - diff --git a/rocrtst/suites/performance/image_store_bandwidth.h b/rocrtst/suites/performance/kernels/test_case_template_kernels.cl similarity index 72% rename from rocrtst/suites/performance/image_store_bandwidth.h rename to rocrtst/suites/performance/kernels/test_case_template_kernels.cl index 6de0d9f860..b7408570f5 100755 --- 
a/rocrtst/suites/performance/image_store_bandwidth.h +++ b/rocrtst/suites/performance/kernels/test_case_template_kernels.cl @@ -43,40 +43,12 @@ * */ -#ifndef __ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__ -#define __ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "hsa/hsa.h" - -class ImageStoreBandwidth: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - ImageStoreBandwidth(); - - //@Brief: Destructor - ~ImageStoreBandwidth(); - - //@Brief: Set up the test environment - virtual void SetUp(); - - //@Brief: Run the actual testing - virtual void Run(); - - //@Brief: Clean up the test environment - virtual void Close(); - - //@Brief: Display results - virtual void DisplayResults() const; - - private: - //@Brief: Image Store Bandwidth - double store_bandwidth_; - - //@Brief: Image size - size_t image_size_; -}; - -#endif //__ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__ - + __kernel void +square(__global int *dstArray, __global const int *srcArray, const int sz) { + unsigned int id = get_global_id(0); + if (id < sz) { + dstArray[id] = srcArray[id] * srcArray[id]; + } + return; +} + diff --git a/rocrtst/suites/performance/kernels/test_kernel.hsail b/rocrtst/suites/performance/kernels/test_kernel.hsail deleted file mode 100755 index 7c8587b213..0000000000 --- a/rocrtst/suites/performance/kernels/test_kernel.hsail +++ /dev/null @@ -1,53 +0,0 @@ -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - -prog kernel &__OpenCL_vec_assign_kernel( - kernarg_u64 %buf, - kernarg_u32 %num) -{ - pragma "AMD RTI", "ARGSTART:__OpenCL_vec_assign_kernel"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "uavid:8"; - pragma "AMD RTI", "privateid:8"; - pragma "AMD RTI", 
"ARGEND:__OpenCL_vec_assign_kernel"; - -@__OpenCL_vec_assign_kernel_entry: - // BB#0: // %entry - ld_kernarg_align(8)_width(all)_u64 $d0, [%buf]; - ld_global_u32 $s1, [$d0]; - ld_kernarg_align(4)_width(all)_u32 $s0, [%num]; - cmp_ge_b1_s32 $c0, $s1, $s0; - cbr_b1 $c0, @BB0_4; - // BB#1: // %while.body.lr.ph - workitemabsid_u32 $s1, 0; - cmp_eq_b1_s32 $c0, $s1, 0; - cbr_b1 $c0, @BB0_2; - -@BB0_3: - // %while.cond.backedge - ld_global_u32 $s1, [$d0]; - cmp_lt_b1_s32 $c0, $s1, $s0; - cbr_b1 $c0, @BB0_3; - br @BB0_4; - -@BB0_2: - // %while.cond.backedge.us - ld_global_u32 $s1, [$d0]; - add_u32 $s1, $s1, 1; - st_global_u32 $s1, [$d0]; - ld_global_u32 $s1, [$d0]; - cmp_lt_b1_s32 $c0, $s1, $s0; - cbr_b1 $c0, @BB0_2; - -@BB0_4: - // %while.end - ret; -}; - diff --git a/rocrtst/suites/performance/kernels/transpose_kernel.hsail b/rocrtst/suites/performance/kernels/transpose_kernel.hsail deleted file mode 100755 index b29b30d8b7..0000000000 --- a/rocrtst/suites/performance/kernels/transpose_kernel.hsail +++ /dev/null @@ -1,108 +0,0 @@ -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - -prog kernel &__OpenCL_matrixTranspose_kernel( - kernarg_u64 %__global_offset_0, - kernarg_u64 %__global_offset_1, - kernarg_u64 %__global_offset_2, - kernarg_u64 %__printf_buffer, - kernarg_u64 %__vqueue_pointer, - kernarg_u64 %__aqlwrap_pointer, - kernarg_u64 %inBuf, - kernarg_u64 %outBuf, - kernarg_u64 %localBuf, - kernarg_u32 %blockSize, - kernarg_u32 %width, - kernarg_u32 %height) -{ - pragma "AMD RTI", "ARGSTART:__OpenCL_matrixTranspose_kernel"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "memory:private:0"; - pragma "AMD RTI", "memory:region:0"; - pragma "AMD RTI", "memory:local:0"; - pragma "AMD RTI", "value:__global_offset_0:u64:1:1:0"; - pragma "AMD RTI", "value:__global_offset_1:u64:1:1:16"; - pragma "AMD RTI", 
"value:__global_offset_2:u64:1:1:32"; - pragma "AMD RTI", "pointer:__printf_buffer:u8:1:1:48:uav:7:1:RW:0:0:0"; - pragma "AMD RTI", "value:__vqueue_pointer:u64:1:1:64"; - pragma "AMD RTI", "value:__aqlwrap_pointer:u64:1:1:80"; - pragma "AMD RTI", "pointer:inBuf:u32:1:1:96:uav:7:4:RW:0:1:0"; - pragma "AMD RTI", "pointer:outBuf:u32:1:1:112:uav:7:4:RW:0:1:0"; - pragma "AMD RTI", "pointer:localBuf:u32:1:1:128:l:7:4:RW:0:0:0"; - pragma "AMD RTI", "value:blockSize:u32:1:1:144"; - pragma "AMD RTI", "value:width:u32:1:1:160"; - pragma "AMD RTI", "value:height:u32:1:1:176"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "privateid:8"; - pragma "AMD RTI", "enqueue_kernel:0"; - pragma "AMD RTI", "kernel_index:0"; - pragma "AMD RTI", "reflection:0:size_t"; - pragma "AMD RTI", "reflection:1:size_t"; - pragma "AMD RTI", "reflection:2:size_t"; - pragma "AMD RTI", "reflection:3:size_t"; - pragma "AMD RTI", "reflection:4:size_t"; - pragma "AMD RTI", "reflection:5:size_t"; - pragma "AMD RTI", "reflection:6:uint*"; - pragma "AMD RTI", "reflection:7:uint*"; - pragma "AMD RTI", "reflection:8:uint*"; - pragma "AMD RTI", "reflection:9:uint"; - pragma "AMD RTI", "reflection:10:uint"; - pragma "AMD RTI", "reflection:11:uint"; - pragma "AMD RTI", "ARGEND:__OpenCL_matrixTranspose_kernel"; - -@__OpenCL_matrixTranspose_kernel_entry: - // BB#0: // %entry - workitemid_u32 $s0, 1; - ld_kernarg_align(4)_width(all)_u32 $s1, [%blockSize]; - workitemid_u32 $s2, 0; - mad_u32 $s3, $s2, $s1, $s0; - cvt_u64_u32 $d1, $s3; - workitemabsid_u32 $s3, 0; - cvt_u64_u32 $d0, $s3; - ld_kernarg_align(8)_width(all)_u64 $d2, [%__global_offset_0]; - add_u64 $d0, $d0, $d2; - workitemabsid_u32 $s5, 1; - workgroupid_u32 $s4, 0; - workgroupid_u32 $s3, 1; - shl_u64 $d1, $d1, 2; - mad_u32 $s3, $s3, $s1, $s2; - mad_u32 $s4, $s4, $s1, $s0; - cvt_u64_u32 $d2, $s5; - ld_kernarg_align(8)_width(all)_u64 $d3, [%__global_offset_1]; - cvt_u32_u64 $s5, $d0; - add_u64 $d0, $d2, $d3; - 
cvt_u32_u64 $s6, $d0; - ld_kernarg_align(4)_width(all)_u32 $s7, [%width]; - ld_kernarg_align(8)_width(all)_u64 $d0, [%localBuf]; - ld_kernarg_align(4)_width(all)_u32 $s8, [%height]; - mad_u32 $s3, $s4, $s8, $s3; - add_u64 $d1, $d0, $d1; - cvt_u32_u64 $s4, $d1; - mad_u32 $s5, $s6, $s7, $s5; - cvt_u64_u32 $d1, $s5; - shl_u64 $d2, $d1, 2; - ld_kernarg_align(8)_width(all)_u64 $d1, [%outBuf]; - ld_kernarg_align(8)_width(all)_u64 $d3, [%inBuf]; - add_u64 $d2, $d3, $d2; - ld_global_align(4)_u32 $s5, [$d2]; - st_group_align(4)_u32 $s5, [$s4]; - cvt_u64_u32 $d2, $s3; - shl_u64 $d2, $d2, 2; - add_u64 $d1, $d1, $d2; - mad_u32 $s0, $s0, $s1, $s2; - cvt_u64_u32 $d2, $s0; - shl_u64 $d2, $d2, 2; - add_u64 $d0, $d0, $d2; - cvt_u32_u64 $s0, $d0; - barrier; - ld_group_align(4)_u32 $s0, [$s0]; - st_global_align(4)_u32 $s0, [$d1]; - ret; -}; - diff --git a/rocrtst/suites/performance/kernels/vector_copy.hsail b/rocrtst/suites/performance/kernels/vector_copy.hsail deleted file mode 100755 index 79c2bb0708..0000000000 --- a/rocrtst/suites/performance/kernels/vector_copy.hsail +++ /dev/null @@ -1,34 +0,0 @@ -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; -extension "IMAGE"; - -decl prog function &abort()(); - -prog kernel &__vector_copy_kernel( - kernarg_u64 %a, - kernarg_u64 %b) -{ - pragma "AMD RTI", "ARGSTART:__vector_copy_kernel"; - pragma "AMD RTI", "version:3:1:104"; - pragma "AMD RTI", "device:generic"; - pragma "AMD RTI", "uniqueid:1024"; - pragma "AMD RTI", "function:1:0"; - pragma "AMD RTI", "memory:64bitABI"; - pragma "AMD RTI", "uavid:8"; - pragma "AMD RTI", "privateid:8"; - pragma "AMD RTI", "ARGEND:__vector_copy_kernel"; - -@__vector_copy_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_s64_s32 $d0, $s0; - shl_u64 $d0, $d0, 2; - ld_kernarg_align(8)_width(all)_u64 $d1, [%b]; - add_u64 $d1, $d1, $d0; - ld_kernarg_align(8)_width(all)_u64 $d2, [%a]; - add_u64 $d0, $d2, $d0; - ld_global_u32 $s0, [$d0]; - st_global_u32 $s0, [$d1]; - ret; -}; - diff 
--git a/rocrtst/suites/performance/kernels/vector_copy_base.hsail b/rocrtst/suites/performance/kernels/vector_copy_base.hsail deleted file mode 100755 index 6a3a1572d8..0000000000 --- a/rocrtst/suites/performance/kernels/vector_copy_base.hsail +++ /dev/null @@ -1,64 +0,0 @@ -module &m:1:0:$base:$large:$default; - -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - - -decl prog function &abort()(); - -prog kernel &__vector_copy_kernel( - kernarg_u64 %in, - kernarg_u64 %out) -{ -@__vector_copy_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_s64_s32 $d0, $s0; - shl_u64 $d0, $d0, 2; - ld_kernarg_align(8)_width(all)_u64 $d1, [%out]; - add_u64 $d1, $d1, $d0; - ld_kernarg_align(8)_width(all)_u64 $d2, [%in]; - add_u64 $d0, $d2, $d0; - ld_global_u32 $s0, [$d0]; - st_global_u32 $s0, [$d1]; - ret; -}; diff --git a/rocrtst/suites/performance/kernels/vector_copy_full.hsail b/rocrtst/suites/performance/kernels/vector_copy_full.hsail deleted file mode 100755 index 07872eeac3..0000000000 --- a/rocrtst/suites/performance/kernels/vector_copy_full.hsail +++ /dev/null @@ -1,64 +0,0 @@ -module &m:1:0:$full:$large:$default; - -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. 
-// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. 
-// -//////////////////////////////////////////////////////////////////////////////// - - -decl prog function &abort()(); - -prog kernel &__vector_copy_kernel( - kernarg_u64 %in, - kernarg_u64 %out) -{ -@__vector_copy_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_s64_s32 $d0, $s0; - shl_u64 $d0, $d0, 2; - ld_kernarg_align(8)_width(all)_u64 $d1, [%out]; - add_u64 $d1, $d1, $d0; - ld_kernarg_align(8)_width(all)_u64 $d2, [%in]; - add_u64 $d0, $d2, $d0; - ld_global_u32 $s0, [$d0]; - st_global_u32 $s0, [$d1]; - ret; -}; diff --git a/rocrtst/suites/performance/main.cc b/rocrtst/suites/performance/main.cc old mode 100644 new mode 100755 index b5a14ed4eb..36bab14d5a --- a/rocrtst/suites/performance/main.cc +++ b/rocrtst/suites/performance/main.cc @@ -43,238 +43,85 @@ * */ -#include "cp_process_time.h" -#include "cu_masking.h" -#include "device_load_bandwidth.h" -#include "device_store_bandwidth.h" -#include "dispatch_time.h" -#include "flush_latency.h" #include "gtest/gtest.h" -#include "hsa_info.h" -#include "image_bandwidth.h" -#include "image_load_bandwidth.h" -#include "image_store_bandwidth.h" -#include "matrix_transpose.h" -#include "memory_copy.h" -#include "memory_allocation.h" -#include "memory_async_copy.h" -#include "queue_concurrency.h" -#include "queue_create_destroy_latency.h" -#include "system_load_bandwidth.h" -#include "system_store_bandwidth.h" -#include "vector_copy.h" +#include "suites/performance/dispatch_time.h" +#include "suites/performance/memory_async_copy.h" +#include "suites/performance/test_case_template.h" +#include "suites/performance/main.h" +#include "suites/test_common/test_common.h" -/** - * Try to order tests from fastest running to slowest running. - */ +static uint32_t sRocrTstOptVerbosity = 1; +static uint32_t sRocrTestOptIterations = 0; -// DisplayResultsResults HSA system information first. 
-TEST(rocrtst, Feature_Hsa_Info) { - HsaInfo hi; - hi.SetUp(); - hi.Run(); - hi.Close(); +static void RunTest(TestBase *test) { + test->set_verbosity(sRocrTstOptVerbosity); + + if (sRocrTestOptIterations) { + test->set_num_iteration(sRocrTestOptIterations); + } + test->DisplayTestInfo(); + test->SetUp(); + test->Run(); + test->DisplayResults(); + test->Close(); + + return; } -// Requires HSA_PFOFILE_FULL -TEST(rocrtst, Perf_Image_Store_Bandwidth) { - ImageStoreBandwidth isb; - isb.SetUp(); - isb.Run(); - isb.DisplayResults(); - isb.Close(); +// TEST ENTRY TEMPLATE: +// TEST(rocrtst, Perf_) { +// ; +// +// // Copy and modify implementation of RunTest() if you need to deviate +// // from the standard pattern implemented there. +// RunTest(&); +// } + +TEST(rocrtst, Test_Example) { + TestExample tst; + RunTest(&tst); } -// Requires HSA_PFOFILE_FULL -TEST(rocrtst, Perf_Image_Load_Bandwidth) { - ImageLoadBandwidth ilb; - ilb.SetUp(); - ilb.Run(); - ilb.DisplayResults(); - ilb.Close(); +TEST(rocrtst, Perf_Memory_Async_Copy) { + MemoryAsyncCopy mac; + // To do full test, uncomment this: + // mac.set_full_test(true); + // To test only 1 path, add lines like this: + // mac.set_src_pool(); + // mac.set_dst_pool(); + // The default is to and from the cpu to 1 gpu, and to/from a gpu to + // another gpu + RunTest(&mac); } -// Requires HSA_PFOFILE_FULL -TEST(rocrtst, Perf_Image_Bandwidth) { - ImageBandwidth ib; - ib.SetUp(); - ib.Run(); - ib.DisplayResults(); - ib.Close(); -} - -// Requires HSA_PFOFILE_FULL -TEST(rocrtst, Perf_Queue_Concurrency) { - QueueConcurrency mc; - mc.SetUp(); - mc.Run(); - mc.DisplayResults(); - mc.Close(); -} - -TEST(rocrtst, Feature_Cu_Masking) { - CuMasking cm; - cm.SetUp(); - cm.Run(); - cm.Close(); -} - -TEST(rocrtst, Perf_Flush_Latency) { - FlushLatency fl; - fl.SetUp(); - fl.Run(); - fl.DisplayResults(); - fl.Close(); -} - -// This test apparently has some sort of memory bounds overwrite -// issue with the out_data_ buffer. 
Commenting out the free of -// out_data_ avoids the problem. Left uncommented, a crash will -// occur immediately or some time after. -TEST(rocrtst, DISABLED_Perf_Device_Memory_Store_Bandwidth) { - DeviceStoreBandwidth slb; - slb.SetUp(); - slb.Run(); - slb.DisplayResults(); - slb.Close(); -} - -// This test apparently has some sort of memory bounds overwrite -// issue with the out_data_ buffer. Commenting out the free of -// out_data_ avoids the problem. Left uncommented, a crash will -// occur immediately or some time after. -TEST(rocrtst, DISABLED_Perf_Device_Memory_Load_Bandwidth) { - DeviceLoadBandwidth slb; - slb.SetUp(); - slb.Run(); - slb.DisplayResults(); - slb.Close(); -} TEST(rocrtst, Perf_Dispatch_Time_Single_SpinWait) { - DispatchTime dt; - dt.set_num_iteration(100); - dt.UseDefaultSignal(true); - dt.LaunchSingleKernel(true); - dt.SetUp(); - dt.Run(); - dt.DisplayResults(); - dt.Close(); + DispatchTime dt(true, true); + RunTest(&dt); } TEST(rocrtst, Perf_Dispatch_Time_Single_Interrupt) { - DispatchTime dt; - dt.UseDefaultSignal(false); - dt.LaunchSingleKernel(true); - dt.SetUp(); - dt.Run(); - dt.DisplayResults(); - dt.Close(); + DispatchTime dt(false, true); + RunTest(&dt); } TEST(rocrtst, Perf_Dispatch_Time_Multi_SpinWait) { - DispatchTime dt; - dt.UseDefaultSignal(true); - dt.LaunchSingleKernel(false); - dt.SetUp(); - dt.Run(); - dt.DisplayResults(); - dt.Close(); + DispatchTime dt(true, false); + RunTest(&dt); } TEST(rocrtst, Perf_Dispatch_Time_Multi_Interrupt) { - DispatchTime dt; - dt.UseDefaultSignal(false); - dt.LaunchSingleKernel(false); - dt.SetUp(); - dt.Run(); - dt.DisplayResults(); - dt.Close(); + DispatchTime dt(false, false); + RunTest(&dt); } -TEST(rocrtst, DISABLED_Perf_CpProcessTime) { - CpProcessTime cpt; - cpt.set_num_iteration(10); - cpt.SetUp(); - cpt.Run(); - cpt.DisplayResults(); - cpt.Close(); -} - -TEST(rocrtst, Perf_Memory_Allocation) { - MemoryAllocation ma(10); - ma.SetUp(); - ma.Run(); - ma.DisplayResults(); - ma.Close(); 
-} - -#if MEM_POOL_FILL_BUG -TEST(rocrtst, Perf_Queue_Latency) { - QueueLatency ql; - ql.set_num_iteration(10); - ql.SetUp(); - ql.Run(); - ql.DisplayResults(); - ql.Close(); -} - -TEST(rocrtst, Perf_System_Memory_Load_Bandwidth) { - SystemLoadBandwidth slb; - slb.SetUp(); - slb.Run(); - slb.DisplayResults(); - slb.Close(); -} - -TEST(rocrtst, Perf_System_Memory_Store_Bandwidth) { - SystemStoreBandwidth ssb; - ssb.SetUp(); - ssb.Run(); - ssb.DisplayResults(); - ssb.Close(); -} - -TEST(rocrtst, Perf_Memory_Copy) { - MemoryCopy mc; - mc.set_num_iteration(10); - mc.SetUp(); - mc.Run(); - mc.DisplayResults(); - mc.Close(); -} - -#endif - -#if 0 -// These tests were not complete. Needs research/work. -TEST(rocrtst, Feature_Vector_Copy) { - VectorCopy vc; - vc.SetUp(); - vc.Run(); - vc.Close(); -} - -TEST(rocrtst, Perf_Matrix_Transpose) { - MatrixTranspose mt; - mt.SetUp(); - mt.Run(); - mt.DisplayResults(); - mt.Close(); -} - -#endif - -//#if NEED_TO_MAKE_BATCH -TEST(rocrtst, Perf_Memory_Async_Copy) { - MemoryAsyncCopy mac; - mac.set_num_iteration(10); - mac.SetUp(); - mac.Run(); - mac.DisplayResults(); - mac.Close(); -} -//#endif int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + + RocrtstOptions opts(&sRocrTstOptVerbosity, &sRocrTestOptIterations); + + if (ProcessCmdline(&opts, argc, argv)) { + return 1; + } + return RUN_ALL_TESTS(); } diff --git a/rocrtst/suites/performance/matrix_transpose.cc b/rocrtst/suites/performance/matrix_transpose.cc deleted file mode 100755 index 714b534ae6..0000000000 --- a/rocrtst/suites/performance/matrix_transpose.cc +++ /dev/null @@ -1,289 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. 
- * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "matrix_transpose.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/helper_funcs.h" -#include "common/hsatimer.h" -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" -#include "hsa/hsa_ext_finalize.h" -#include "gtest/gtest.h" -#include -#include - -static const unsigned int NUM_BLOCK_SIZES = 2; -static const unsigned int blockSizes[NUM_BLOCK_SIZES] = {8, 16}; -static const unsigned int NUM_MATRIX_DIMS = 2; -static const unsigned int matrixDims[NUM_MATRIX_DIMS] = {1024, 64}; - -MatrixTranspose::MatrixTranspose(void) : - BaseRocR() { - in_buffer_sys_ = NULL; - out_buffer_sys_ = NULL; - in_buffer_ = NULL; - out_buffer_ = NULL; - width_ = 0; - height_ = 0; - buf_size_ = 0; - block_size_ = 0; - time_mean_ = 0.0; -} - -MatrixTranspose::~MatrixTranspose(void) { - -} - -void MatrixTranspose::SetUp(void) { - hsa_status_t err; - - InitializeData(); - - set_kernel_file_name("transpose_kernel.o"); - set_kernel_name("&__OpenCL_matrixTranspose_kernel"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - hsa_agent_t* cpu_dev = cpu_device(); - - err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool, - &cpu_pool()); - ASSERT_EQ(err, HSA_STATUS_INFO_BREAK); - - err = hsa_amd_memory_pool_allocate(cpu_pool(), buf_size_, 0, - (void**) &in_buffer_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_memory_pool_allocate(cpu_pool(), buf_size_, 0, - (void**) &out_buffer_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, in_buffer_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, out_buffer_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Create a queue - hsa_queue_t* q = nullptr; - rocrtst::CreateQueue(*gpu_dev, &q); - set_main_queue(q); - - rocrtst::LoadKernelFromObjFile(this); - - // Fill up aql packet - rocrtst::InitializeAQLPacket(this, 
&aql()); - aql().setup = 0; - aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - aql().workgroup_size_x = block_size_; - aql().workgroup_size_y = block_size_; - aql().grid_size_x = width_; - aql().grid_size_y = height_; - aql().group_segment_size = sizeof(uint) * block_size_ * block_size_; - - // Debug -#ifdef DEBUG - std::cout << "workgroup size: " << block_size_ << ", " << block_size_ - << ", " << 1 << std::endl; - std::cout << "grid size: " << aql().grid_size_x << ", " << - aql().grid_size_y << ", " << aql().grid_size_z << std::endl; - std::cout << "group segment size: " << aql().group_segment_size << std::endl; -#endif -} - -void MatrixTranspose::Run(void) { - hsa_status_t err; - hsa_agent_t* gpu_dev = gpu_device1(); - - if (!rocrtst::CheckProfile(this)) { - return; - } - - // Allocate kernel parameter - typedef struct args_t { - uint* offset_0; - uint* offset_1; - uint* offset_2; - uint* printf_buffer; - uint* vqueue_buffer; - uint* aqlwrap_pointer; - - uint* in_buf; - uint* out_buf; - uint* local_buf; - uint iblock_size; - uint iwidth; - uint iheight; - } args; - - args* kern_ptr = NULL; - err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0, - (void**) &kern_ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, kern_ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - kern_ptr->offset_0 = 0; - kern_ptr->offset_1 = 0; - kern_ptr->offset_2 = 0; - kern_ptr->printf_buffer = 0; - kern_ptr->vqueue_buffer = 0; - kern_ptr->aqlwrap_pointer = 0; - - kern_ptr->in_buf = in_buffer_sys_; - kern_ptr->out_buf = out_buffer_sys_; - kern_ptr->local_buf = 0; - kern_ptr->iblock_size = block_size_; - kern_ptr->iwidth = width_; - kern_ptr->iheight = height_; - - aql().kernarg_address = kern_ptr; - - //Obtain the current queue write index. 
- uint64_t idx = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - ((hsa_kernel_dispatch_packet_t*)(main_queue()->base_address))[idx] = aql(); - - rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - - ((hsa_kernel_dispatch_packet_t*)(main_queue()->base_address))[idx].header |= - HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - - hsa_signal_store_release(main_queue()->doorbell_signal, idx); - - //Wait on the dispatch signal until the kernel is finished. - hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE); - p_timer.StopTimer(id); - - hsa_amd_profiling_dispatch_time_t dispatch_time; - err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(), &dispatch_time); - - uint64_t stamp = dispatch_time.end - dispatch_time.start; - uint64_t freq; - - err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - std::cout << "Kernel time is: " << - (double) stamp / (double) freq * 1000.0 << std::endl; - hsa_signal_store_release(signal(), 1); - - - // Verify Results - VerifyResults (out_buffer_sys_); - - // Abandon the first result which is warm up - - time_mean_ = p_timer.ReadTimer(id); //rocrtst::CalcMean(timer); -} - -void MatrixTranspose::DisplayResults(void) const { - if (!rocrtst::CheckProfile(this)) { - return; - } - - std::cout << "============================================" << std::endl; - std::cout << "Matrix Transpose Mean Time: " << time_mean_ << std::endl; - - return; -} - -void MatrixTranspose::Close(void) { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} - -void MatrixTranspose::InitializeData(void) { - // int openTest = 1; - block_size_ = 16; //blockSizes[openTest % NUM_BLOCK_SIZES]; - width_ = 1920; //matrixDims[openTest / NUM_BLOCK_SIZES]; - height_ = width_; - - buf_size_ = width_ * height_ * sizeof(uint); - - in_buffer_sys_ = (uint*) 
aligned_alloc(256, buf_size_); - - SetData (in_buffer_sys_); - out_buffer_sys_ = (uint*) aligned_alloc(256, buf_size_); - - FillData(out_buffer_sys_, 0xdeadbeef); - - return; -} - -void MatrixTranspose::SetData(uint* buffer) { - for (unsigned int i = 0; i < height_; i++) { - for (unsigned int j = 0; j < width_; j++) { - *(buffer + i * width_ + j) = i * width_ + j; - } - } -} - -void MatrixTranspose::FillData(uint* buffer, unsigned int val) { - for (unsigned int i = 0; i < width_ * height_; i++) { - buffer[i] = val; - } -} - -void MatrixTranspose::VerifyResults(uint* buffer) { - bool err = false; - - for (unsigned int i = 0; (i < width_) && !err; i++) { - for (unsigned int j = 0; (j < height_) && !err; j++) { - ASSERT_EQ(*(buffer + i * height_ + j), j * width_ + i); - } - } - - std::cout << "PASSED!" << std::endl; -} diff --git a/rocrtst/suites/performance/matrix_transpose.h b/rocrtst/suites/performance/matrix_transpose.h deleted file mode 100755 index 8b90060c41..0000000000 --- a/rocrtst/suites/performance/matrix_transpose.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#ifndef __ROCRTST_SRC_MATRIX_TRANSPOSE_H__ -#define __ROCRTST_SRC_MATRIX_TRANSPOSE_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "hsa/hsa.h" - -class MatrixTranspose: public rocrtst::BaseRocR, public PerfBase { - - public: - //@Brief: Default Constructor - MatrixTranspose(); - - //@Brief: Destructor - ~MatrixTranspose(); - - //@Brief: Override SetUp function - virtual void SetUp(); - - //@Brief: Run the measurement - virtual void Run(); - - //@Brief: Clean up and Close - virtual void Close(); - - //@Brief: Display results - virtual void DisplayResults() const; - - private: - //@Brief: Set up data - virtual void SetData(uint* buffer); - - //@Brief: Fill Data - virtual void FillData(uint* buffer, unsigned int val); - - //@Brief: VerifyResults - virtual void VerifyResults(uint* buffer); - - //@Brief: Initialize the object attribute - virtual void InitializeData(); - - uint* in_buffer_; - uint* out_buffer_; - uint* in_buffer_sys_; - uint* out_buffer_sys_; - unsigned int width_; - unsigned int height_; - unsigned int buf_size_; - unsigned int block_size_; - double time_mean_; - - hsa_barrier_and_packet_t bpkt; -}; - -#endif //__ROCRTST_SRC_MATRIX_TRANSPOSE_H__ - diff --git a/rocrtst/suites/performance/memory_allocation.cc b/rocrtst/suites/performance/memory_allocation.cc deleted file mode 100755 index a2beb60000..0000000000 --- a/rocrtst/suites/performance/memory_allocation.cc +++ /dev/null @@ -1,198 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "memory_allocation.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "hsa/hsa.h" -#include "gtest/gtest.h" -#include - -MemoryAllocation::MemoryAllocation(uint32_t num_iters) : - BaseRocR(), allocation_time_ {0.0}, mem_pool_flag_(0) { - ptr = NULL; -} - -MemoryAllocation::~MemoryAllocation() { - -} - -const char* MemoryAllocation::Str[16] = {"64K", "128K", "256K", "512K", "1M", - "2M", "4M", "8M", "16M", "32M", - "64M", "128M", "256M", "512M", "1G", - "2G" - }; -const size_t MemoryAllocation::Size[16] = {64*1024, 128*1024, - 256*1024,512*1024, 1024*1024, - 2048*1024, 4096*1024, 8*1024*1024, - 16*1024*1024, 32*1024*1024, - 64*1024*1024, 128*1024*1024, - 256 * 1024*1024, 512*1024*1024, - 1024*1024*1024, - (size_t)2*1024*1024*1024 - }; - -void MemoryAllocation::SetUp() { - hsa_status_t err; - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - hsa_agent_t* cpu_dev = cpu_device(); - - err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool, - &cpu_pool()); - - EXPECT_EQ(err, HSA_STATUS_INFO_BREAK); - - if (err != HSA_STATUS_INFO_BREAK) { - std::cout << "Unable to find global pool. Test will not be run." - << std::endl; - return; - } - - //At this point, cpu_pool() should be in the global segment - err = hsa_amd_memory_pool_get_info(cpu_pool(), - (hsa_amd_memory_pool_info_t) HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, - &mem_pool_flag_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} - -void MemoryAllocation::Run() { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - if (cpu_pool().handle == 0) { - return; - } - - size_t iterations = RealIterationNum(); - hsa_status_t err; - - //Iterate over the different data size - for (int i = 0; i < 16; i++) { - std::vector time; - - for (uint32_t it = 0; it < iterations; it++) { -#if DEBUG - std::cout << "." 
<< std::flush; -#endif - - rocrtst::PerfTimer allocation_timer; - int index = allocation_timer.CreateTimer(); - - allocation_timer.StartTimer(index); - err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[i], 0, &ptr); - allocation_timer.StopTimer(index); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - //Free the memory which was allocated - err = hsa_amd_memory_pool_free(ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - ptr = NULL; - - // PUsh the results back to vector time - time.push_back(allocation_timer.ReadTimer(index)); - } - -#if DEBUG - std::cout << std::endl; -#endif - - //Get mean copy time and store to the array - allocation_time_[i] = GetMeanTime(time); - } -} - -size_t MemoryAllocation::RealIterationNum() { - return num_iteration() * 1.2 + 1; -} - -double MemoryAllocation::GetMeanTime(std::vector& vec) { - std::sort(vec.begin(), vec.end()); - - vec.erase(vec.begin()); - vec.erase(vec.begin(), vec.begin() + num_iteration() * 0.1); - vec.erase(vec.begin() + num_iteration(), vec.end()); - - double mean = 0.0; - int num = vec.size(); - - for (int it = 0; it < num; it++) { - mean += vec[it]; - } - - mean /= num; - return mean; -} - -void MemoryAllocation::DisplayResults() const { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - fprintf(stdout, "==============================================\n"); - fprintf(stdout, " Data Size Allocation_time BandWidth(GB/s)\n"); - - for (int i = 0; i < 16; i++) { - fprintf(stdout, " %9s %15.6f %15.6f\n", Str[i], allocation_time_[i], - 2 * Size[i] / allocation_time_[i] / 1024 / 1024 / 1024); - } - - fprintf(stdout, "==============================================\n"); - - return; -} - -void MemoryAllocation::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - return; -} diff --git a/rocrtst/suites/performance/memory_allocation.h b/rocrtst/suites/performance/memory_allocation.h deleted file mode 100755 index 1c39b1b2d3..0000000000 --- 
a/rocrtst/suites/performance/memory_allocation.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#ifndef __ROCRTST_SRC_MEMORY_MEM_ALLOCATION_H__ -#define __ROCRTST_SRC_MEMORY_MEM_ALLOCATION_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "common/hsatimer.h" -#include "hsa/hsa.h" -#include - -class MemoryAllocation: public rocrtst::BaseRocR, public PerfBase { - - public: - //@Brief: Constructor for test case of MemoryAllocation - MemoryAllocation(uint32_t num_iters = 100); - - //@Brief: Destructor for test case of MemoryAllocation - virtual ~MemoryAllocation(); - - //@Brief: Set up the environment for the test - virtual void SetUp(); - - //@Brief: Execute the test - virtual void Run(); - - //@Brief: Display results - virtual void DisplayResults() const; - - //@Brief: Clean up and close the environment - virtual void Close(); - - protected: - //@Brief: Pointer to the memory space which is allocated by HSA Memory - // allocation API - void* ptr; - - //@Brief: Array to store the timers results for each data size - double allocation_time_[16]; - - private: - //@Brief: Define allocated data size and corresponding string - static const size_t Size[16]; - static const char* Str[16]; - - uint32_t mem_pool_flag_; - - //@Brief: Get the actual iteration number - size_t RealIterationNum(); - - //@Brief: Get mean execution time - double GetMeanTime(std::vector& vec); - -}; -#endif diff --git a/rocrtst/suites/performance/memory_async_copy.cc b/rocrtst/suites/performance/memory_async_copy.cc old mode 100644 new mode 100755 index a6d2112ecf..3c55aad265 --- a/rocrtst/suites/performance/memory_async_copy.cc +++ b/rocrtst/suites/performance/memory_async_copy.cc @@ -43,153 +43,236 @@ * */ -#include "memory_async_copy.h" +#include +#include + +#include 
"common/base_rocr.h" +#include "suites/test_common/test_base.h" +#include "hsa/hsa.h" +#include "hsa/hsa_ext_amd.h" +#include "suites/performance/memory_async_copy.h" #include "common/base_rocr_utils.h" #include "gtest/gtest.h" -const char* Str[20] = {"1k", "2K", "4K", "8K", "16K", "32K", "64K", "128K", - "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", - "64M", "128M", "256M", "512M" - }; -const size_t Size[20] = {1024, 2 * 1024, 4 * 1024, 8 * 1024, 16 * 1024, 32 - * 1024, 64 * 1024, 128 * 1024, 256 * 1024, 512 * 1024, - 1024 * 1024, 2048 * 1024, 4096 * 1024, 8 * 1024 * 1024, - 16 * 1024 * 1024, 32 * 1024 * 1024, 64 * 1024 * 1024, - 128 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024 - }; +#define RET_IF_HSA_ERR(err) { \ + if ((err) != HSA_STATUS_SUCCESS) { \ + const char* msg = 0; \ + hsa_status_string(err, &msg); \ + std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \ + __FILE__ << ". Call returned " << err << std::endl; \ + std::cout << msg << std::endl; \ + return (err); \ + } \ +} + +static const int kNumGranularity = 20; +const char* Str[kNumGranularity] = {"1k", "2K", "4K", "8K", "16K", "32K", + "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", + "64M", "128M", "256M", "512M"}; + +const size_t Size[kNumGranularity] = { + 1024, 2*1024, 4*1024, 8*1024, 16*1024, 32*1024, 64*1024, 128*1024, + 256*1024, 512*1024, 1024*1024, 2048*1024, 4096*1024, 8*1024*1024, + 16*1024*1024, 32*1024*1024, 64*1024*1024, 128*1024*1024, 256*1024*1024, + 512*1024*1024}; + +static const int kMaxCopySize = Size[kNumGranularity - 1]; + +MemoryAsyncCopy::MemoryAsyncCopy(void) : + TestBase() { + static_assert(sizeof(Size)/sizeof(size_t) == kNumGranularity, + "kNumGranularity does not match size of arrays"); -MemoryAsyncCopy::MemoryAsyncCopy() : - BaseRocR() { -// argc_ = argc; -// argv_ = argv; - bench_mark_mode_ = false; - verification_ = false; agent_index_ = 0; - region_index_ = 0; + pool_index_ = 0; tran_.clear(); - 
agent_info_.clear(); - region_info_.clear(); - node_info_.clear(); + agent_info()->clear(); + pool_info()->clear(); + node_info()->clear(); verified_ = true; + src_pool_id_ = -1; + dst_pool_id_ = -1; + do_full_test_ = false; + set_num_iteration(10); // Default value + set_title("Asynchronous Memory Copy Bandwidth"); + set_description("This test measures bandwidth to/from Host from/to GPU " + "and Peer to Peer using hsa_amd_memory_async_copy() to copy buffers " + "of various length from memory pool to another."); } -MemoryAsyncCopy::~MemoryAsyncCopy() { - size_t size = tran_.size(); +MemoryAsyncCopy::~MemoryAsyncCopy(void) { + for (PoolInfo *p : pool_info_) { + delete p; + } - if (size != 0) { - for (size_t i = 0; i < size; i++) { - if (tran_.at(i).dep_signal != nullptr) - ; - - delete[] tran_.at(i).dep_signal; - } + for (AgentInfo *a : agent_info_) { + delete a; } } -void MemoryAsyncCopy::SetUp() { +void MemoryAsyncCopy::SetUp(void) { + TestBase::SetUp(); - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } FindTopology(); - ParseArgument(); + if (verbosity() >= VERBOSE_STANDARD) { + PrintTopology(); + } + ConstructTransactionList(); return; } -void MemoryAsyncCopy::Run() { - if (bench_mark_mode_) - if (verification_) { - RunBenchmarkWithVerification(); - } - else { - RunBenchmark(); - } - else { - RunNormal(); +void MemoryAsyncCopy::Run(void) { + TestBase::Run(); + + for (Transaction t : tran_) { + RunBenchmarkWithVerification(&t); } } -void MemoryAsyncCopy::FindSystemRegion() { +void MemoryAsyncCopy::FindSystemPool(void) { hsa_status_t err; err = hsa_iterate_agents(rocrtst::FindCPUDevice, &cpu_agent_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + ASSERT_EQ(HSA_STATUS_INFO_BREAK, err); err = hsa_amd_agent_iterate_memory_pools(cpu_agent_, rocrtst::FindGlobalPool, - &sys_region_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + &sys_pool_); + ASSERT_EQ(HSA_STATUS_INFO_BREAK, err); } -void MemoryAsyncCopy::AcquireAccess(hsa_agent_t agent, +static 
hsa_status_t AcquireAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool, void* ptr) { hsa_status_t err; hsa_amd_memory_pool_access_t access; err = hsa_amd_agent_memory_pool_get_info(agent, pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); - ASSERT_NE(HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, access); + RET_IF_HSA_ERR(err); - if (HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT == access) { + if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + return HSA_STATUS_ERROR; + } + + if (access == HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT) { err = hsa_amd_agents_allow_access(1, &agent, NULL, ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + RET_IF_HSA_ERR(err); + } + + return err; +} + +static hsa_agent_t * +AcquireAsyncCopyAccess( + void *dst_ptr, hsa_amd_memory_pool_t dst_pool, hsa_agent_t *dst_ag, + void *src_ptr, hsa_amd_memory_pool_t src_pool, hsa_agent_t *src_ag) { + if (AcquireAccess(*src_ag, dst_pool, dst_ptr) != HSA_STATUS_SUCCESS) { + if (AcquireAccess(*dst_ag, src_pool, src_ptr) == HSA_STATUS_SUCCESS) { + return dst_ag; + } else { + return nullptr; + } + } else { + return src_ag; } } -void MemoryAsyncCopy::RunBenchmarkWithVerification() { +void MemoryAsyncCopy::RunBenchmarkWithVerification(Transaction *t) { hsa_status_t err; void* ptr_src; void* ptr_dst; - transaction& t = tran_.at(0); - size_t size = t.size * 1024; + size_t size = t->max_size * 1024; - FindSystemRegion(); + hsa_amd_memory_pool_t src_pool = pool_info_[t->src]->pool_; + hsa_agent_t dst_agent = pool_info_[t->dst]->owner_agent_info()->agent(); + hsa_amd_memory_pool_t dst_pool = pool_info_[t->dst]->pool_; - err = hsa_amd_memory_pool_allocate(region_info_[t.src].region_, size, 0, - &ptr_src); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + hsa_agent_t src_agent = pool_info_[t->src]->owner_agent_info()->agent(); - err = hsa_amd_memory_pool_allocate(region_info_[t.dst].region_, size, 0, + if (verbosity() 
>= VERBOSE_STANDARD) { + printf("Executing Copy Path: From Pool %d To Pool %d ", t->src, t->dst); + switch (t->type) { + case H2D: + printf("(Host-To-Device)\n"); + break; + + case D2H: + printf("(Device-To-Host)\n"); + break; + + case P2P: + printf("(Peer-To-Peer)\n"); + break; + + default: + printf("**Unexpected path**\n"); + return; + } + } + + err = hsa_amd_memory_pool_allocate(src_pool, size, 0, &ptr_src); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + + err = hsa_amd_memory_pool_allocate(dst_pool, size, 0, &ptr_dst); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); // rocrtst::CommonCleanUp data void* host_ptr_src = NULL; void* host_ptr_dst = NULL; - err = hsa_amd_memory_pool_allocate(sys_region_, size, 0, - (void**) &host_ptr_src); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - err = hsa_amd_memory_pool_allocate(sys_region_, size, 0, - (void**) &host_ptr_dst); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + err = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, + reinterpret_cast(&host_ptr_src)); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + err = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, + reinterpret_cast(&host_ptr_dst)); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); - memset(host_ptr_src, 1, size); - memset(host_ptr_dst, 0, size); + err = hsa_amd_memory_fill(host_ptr_src, 1, size/sizeof(uint32_t)); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + + err = hsa_amd_memory_fill(host_ptr_dst, 0, size/sizeof(uint32_t)); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); hsa_signal_t s; err = hsa_signal_create(1, 0, NULL, &s); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); - AcquireAccess(region_info_[t.src].owner_agent_, sys_region_, host_ptr_src); - AcquireAccess(cpu_agent_, region_info_[t.src].region_, ptr_src); + // **** First copy from the system buffer source to the test source pool + // Acquire the appropriate access; prefer GPU agent over CPU where there + // is a choice. 
+ hsa_agent_t *cpy_ag = nullptr; + cpy_ag = AcquireAsyncCopyAccess(ptr_src, src_pool, &src_agent, host_ptr_src, + sys_pool_, &cpu_agent_); + if (cpy_ag == nullptr) { + std::cout << "Agents " << t->src << " and " << t->dst << + "cannot access each other's pool." << std::endl; + } + ASSERT_NE(cpy_ag, nullptr); - err = hsa_amd_memory_async_copy(ptr_src, region_info_[t.src].owner_agent_, - host_ptr_src, cpu_agent_, size, 0, NULL, s); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + err = hsa_amd_memory_async_copy(ptr_src, *cpy_ag, host_ptr_src, *cpy_ag, + size, 0, NULL, s); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), HSA_WAIT_STATE_ACTIVE)) - ; + {} int iterations = RealIterationNum(); - AcquireAccess(region_info_[t.dst].owner_agent_, region_info_[t.src].region_, - ptr_src); + // **** Next, copy from the test source pool to the test destination pool + // Prefer a gpu agent to a cpu agent - for (int i = 0; i < 20; i++) { + cpy_ag = AcquireAsyncCopyAccess(ptr_dst, dst_pool, &dst_agent, ptr_src, + src_pool, &src_agent); + if (cpy_ag == nullptr) { + std::cout << "Owner agents for pools" << t->src << " and " << + t->dst << " cannot access each other's pool." 
<< std::endl; + } + ASSERT_NE(cpy_ag, nullptr); + + for (int i = 0; i < kNumGranularity; i++) { if (Size[i] > size) { break; } @@ -197,500 +280,179 @@ void MemoryAsyncCopy::RunBenchmarkWithVerification() { std::vector time; for (int it = 0; it < iterations; it++) { -#if DEBUG - std::cout << "."; - std::cout.flush(); -#endif - // Check access to memory pool region - AcquireAccess(region_info_[t.src].owner_agent_, - region_info_[t.dst].region_, ptr_dst); + if (verbosity() >= VERBOSE_PROGRESS) { + std::cout << "."; + std::cout.flush(); + } - hsa_signal_store_relaxed(t.signal, 1); + hsa_signal_store_relaxed(t->signal, 1); rocrtst::PerfTimer copy_timer; int index = copy_timer.CreateTimer(); copy_timer.StartTimer(index); - err = hsa_amd_memory_async_copy(ptr_dst, region_info_[t.dst].owner_agent_, - ptr_src, region_info_[t.src].owner_agent_, - Size[i], 0, NULL, t.signal); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + err = hsa_amd_memory_async_copy(ptr_dst, *cpy_ag, ptr_src, *cpy_ag, + Size[i], 0, NULL, t->signal); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); - while (hsa_signal_wait_scacquire(t.signal, HSA_SIGNAL_CONDITION_LT, 1, - uint64_t(-1), HSA_WAIT_STATE_ACTIVE)) - ; + while (hsa_signal_wait_scacquire(t->signal, HSA_SIGNAL_CONDITION_LT, 1, + uint64_t(-1), HSA_WAIT_STATE_ACTIVE)) + {} copy_timer.StopTimer(index); hsa_signal_store_relaxed(s, 1); - AcquireAccess(region_info_[t.dst].owner_agent_, sys_region_, + err = AcquireAccess(dst_agent, sys_pool_, host_ptr_dst); - AcquireAccess(cpu_agent_, region_info_[t.dst].region_, ptr_dst); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + err = hsa_amd_memory_async_copy(host_ptr_dst, cpu_agent_, ptr_dst, - region_info_[t.dst].owner_agent_, size, 0, NULL, s); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + dst_agent, size, 0, NULL, s); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), HSA_WAIT_STATE_ACTIVE)) - ; + {} - err = hsa_memory_copy(host_ptr_dst, ptr_dst, size); - ASSERT_EQ(err, 
HSA_STATUS_SUCCESS); + err = AcquireAccess(cpu_agent_, sys_pool_, host_ptr_dst); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); if (memcmp(host_ptr_src, host_ptr_dst, Size[i])) { verified_ = false; } - // Push the result back to vector time + time.push_back(copy_timer.ReadTimer(index)); } -#if DEBUG - std::cout << std::endl; -#endif + if (verbosity() >= VERBOSE_PROGRESS) { + std::cout << std::endl; + } // Get Min copy time - min_time_.push_back(GetMinTime(time)); + t->min_time->push_back(*std::min_element(time.begin(), time.end())); // Get mean copy time and store to the array - benchmark_copy_time_.push_back(GetMeanTime(time)); + t->benchmark_copy_time->push_back(GetMeanTime(&time)); } - DisplayBenchmark(); + err = hsa_signal_destroy(s); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); } -void MemoryAsyncCopy::RunBenchmark() { - hsa_status_t err; - void* ptr_src; - void* ptr_dst; - - transaction& t = tran_.at(0); - size_t size = t.size * 1024; - - FindSystemRegion(); - - err = hsa_amd_memory_pool_allocate(region_info_[t.src].region_, size, 0, - &ptr_src); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_memory_pool_allocate(region_info_[t.dst].region_, size, 0, - &ptr_dst); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Check access to memory pool region - AcquireAccess(region_info_[t.src].owner_agent_, region_info_[t.dst].region_, - ptr_dst); - AcquireAccess(region_info_[t.dst].owner_agent_, region_info_[t.src].region_, - ptr_src); - - int iterations = RealIterationNum(); - - for (int i = 0; i < 20; i++) { - if (Size[i] > size) { - break; - } - - std::vector time; - - for (int it = 0; it < iterations; it++) { -#if DEBUG - std::cout << "."; - std::cout.flush(); -#endif - - hsa_signal_store_relaxed(t.signal, 1); - - rocrtst::PerfTimer copy_timer; - int index = copy_timer.CreateTimer(); - - copy_timer.StartTimer(index); - err = hsa_amd_memory_async_copy(ptr_dst, region_info_[t.dst].owner_agent_, - ptr_src, region_info_[t.src].owner_agent_, - Size[i], 0, NULL, t.signal); - 
ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - while (hsa_signal_wait_scacquire(t.signal, HSA_SIGNAL_CONDITION_LT, 1, - uint64_t(-1), HSA_WAIT_STATE_ACTIVE)) - ; - - copy_timer.StopTimer(index); - - // Push the result back to vector time - time.push_back(copy_timer.ReadTimer(index)); - } - -#if DEBUG - std::cout << std::endl; -#endif - - // Get Min copy time - min_time_.push_back(GetMinTime(time)); - // Get mean copy time and store to the array - benchmark_copy_time_.push_back(GetMeanTime(time)); - } - - DisplayBenchmark(); -} - -void MemoryAsyncCopy::RunNormal() { - int num_transaction = tran_.size(); - hsa_status_t err; - std::vector ptr_src; - std::vector ptr_dst; - - for (int i = 0; i < num_transaction; i++) { - void* ptr_src_temp; - void* ptr_dst_temp; - transaction& t = tran_[i]; - hsa_amd_memory_pool_t region_src = region_info_[t.src].region_; - hsa_amd_memory_pool_t region_dst = region_info_[t.dst].region_; - size_t size = t.size * 1024; - - // Allocate memory - err = hsa_amd_memory_pool_allocate(region_src, size, 0, - (void**) &ptr_src_temp); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - err = hsa_amd_memory_pool_allocate(region_dst, size, 0, - (void**) &ptr_dst_temp); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - AcquireAccess(region_info_[t.dst].owner_agent_, region_src, ptr_src_temp); - AcquireAccess(region_info_[t.src].owner_agent_, region_dst, ptr_dst_temp); - - ptr_src.push_back(ptr_src_temp); - ptr_dst.push_back(ptr_dst_temp); - } - - int iterations = RealIterationNum(); - std::vector time; - - for (int i = 0; i < iterations; i++) { - for (int j = 0; j < num_transaction; j++) { - transaction& t = tran_[j]; - hsa_signal_store_relaxed(t.signal, 1); - } - - rocrtst::PerfTimer copy_timer; - int index = copy_timer.CreateTimer(); - copy_timer.StartTimer(index); - - for (int j = 0; j < num_transaction; j++) { - transaction& t = tran_[j]; - err = hsa_amd_memory_async_copy(ptr_dst[j], - region_info_[t.dst].owner_agent_, ptr_src[j], - region_info_[t.src].owner_agent_, t.size 
* 1024, t.num_dep_signal, - t.dep_signal, t.signal); - } - - // Wait on the last transaction to finish - while (hsa_signal_wait_scacquire(tran_[num_transaction - 1].signal, - HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), HSA_WAIT_STATE_ACTIVE)) - ; - - copy_timer.StopTimer(index); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - time.push_back(copy_timer.ReadTimer(index)); - } - - user_copy_time_ = GetMeanTime(time); - DisplayResults(); -} - -size_t MemoryAsyncCopy::RealIterationNum() { +size_t MemoryAsyncCopy::RealIterationNum(void) { return num_iteration() * 1.2 + 1; } -double MemoryAsyncCopy::GetMinTime(std::vector& vec) { - std::sort(vec.begin(), vec.end()); - return vec.at(0); -} -double MemoryAsyncCopy::GetMeanTime(std::vector& vec) { - std::sort(vec.begin(), vec.end()); +double MemoryAsyncCopy::GetMeanTime(std::vector *vec) { + std::sort(vec->begin(), vec->end()); - vec.erase(vec.begin()); - vec.erase(vec.begin(), vec.begin() + num_iteration() * 0.1); - vec.erase(vec.begin() + num_iteration(), vec.end()); + vec->erase(vec->begin()); + vec->erase(vec->begin(), vec->begin() + num_iteration() * 0.1); + vec->erase(vec->begin() + num_iteration(), vec->end()); double mean = 0.0; - int num = vec.size(); + int num = vec->size(); for (int it = 0; it < num; it++) { - mean += vec[it]; + mean += (*vec)[it]; } mean /= num; return mean; } -void MemoryAsyncCopy::DisplayResults() const { - +void MemoryAsyncCopy::DisplayResults(void) const { if (!rocrtst::CheckProfile(this)) { return; } - printf("================ User-Defined Mode Result " - "===================================\n"); - double band_width = (double) tran_.back().size / user_copy_time_ / 1024 - / 1024; - printf(" %zuKB %lf\n", tran_.back().size, - band_width); + TestBase::DisplayResults(); + + for (Transaction t : tran_) { + DisplayBenchmark(&t); + delete t.benchmark_copy_time; + delete t.min_time; + } + return; } -void MemoryAsyncCopy::DisplayBenchmark() { - transaction& t = tran_.at(0); - size_t size = t.size * 1024; - 
printf("================ Benchmark Mode Result " - "===================================\n"); +void MemoryAsyncCopy::DisplayBenchmark(Transaction *t) const { + size_t size = t->max_size * 1024; + printf("=========================== PATH: From Pool %d To Pool %d (", + t->src, t->dst); + + switch (t->type) { + case H2D: + printf("Host-To-Device) ===========================\n"); + break; + + case D2H: + printf("Device-To-Host) ===========================\n"); + break; + + case P2P: + printf("Peer-To-Peer) =============================\n"); + break; + + default: + ASSERT_EQ(t->type == H2D || t->type == D2H || t->type == P2P, true); + } + if (verified_) { + std::cout << "Verification: Pass" << std::endl; + } else { + std::cout << "Verification: Fail" << std::endl; + } + + if (verbosity() < VERBOSE_STANDARD) { + return; + } printf("Data Size Avg Time(us) Avg BW(GB/s)" - " Min Time(us) Peak BW(GB/s)\n"); + " Min Time(us) Peak BW(GB/s)\n"); for (int i = 0; i < 20; i++) { if (Size[i] > size) { break; } - double band_width = (double) Size[i] / benchmark_copy_time_[i] / 1024 / 1024 - / 1024; - double peak_band_width = (double) Size[i] / min_time_[i] / 1024 / 1024 - / 1024; - printf(" %4s %14lf %14lf %14lf %14lf\n", - Str[i], benchmark_copy_time_[i] * 1e6, band_width, min_time_[i] * 1e6, - peak_band_width); + double band_width = + static_cast(Size[i]/(*(t->benchmark_copy_time))[i]/1024/1024/1024); + double peak_band_width = + static_cast(Size[i] / (*(t->min_time))[i]/ 1024 / 1024 / 1024); + printf( + " %4s %14lf %14lf %14lf %14lf\n", + Str[i], (*(t->benchmark_copy_time))[i] * 1e6, band_width, + (*(t->min_time))[i] * 1e6, peak_band_width); } - if (verification_) { - if (verified_) { - std::cout << "Verification: Pass" << std::endl; - } - else { - std::cout << "Verification: Fail" << std::endl; - } - } return; } void MemoryAsyncCopy::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); + TestBase::Close(); } -void 
MemoryAsyncCopy::FindTopology() { - hsa_status_t err; - err = hsa_iterate_agents(AgentInfo, this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} - -void MemoryAsyncCopy::ParseArgument() { - bool print_help_info = false; - hsa_status_t err; - - opterr = 0; - int c; - int src_region = 0; - int dst_region = 0; - size_t data_size = 512 * 1024; - size_t opt_num = 0; - char rec = 'n'; - - while ((c = getopt(argc_, argv_, "hbvs:f:t:i:r:")) != -1) { - switch (c) { - case 'h': - print_help_info = true; - break; - - case 'f': - src_region = std::stoi(optarg); - opt_num++; - break; - - case 't': - dst_region = std::stoi(optarg); - opt_num++; - break; - - case 's': - data_size = std::stoi(optarg); - break; - - case 'i': - set_num_iteration(std::stoi(optarg)); - break; - - case 'r': - rec = tolower(*optarg); - break; - - case 'b': - bench_mark_mode_ = true; - break; - - case 'v': - verification_ = true; - break; - - case '?': - if (optopt == 'f' || optopt == 't' || optopt == 's' || optopt == 'i' - || optopt == 'r') { - std::cout << "Error: Option -f -t -s -i and -r ALL requires argument" - << std::endl; - std::cout << help_info << std::endl; - } - - ASSERT_NE("Error: Option -f -t -s -i and -r ALL requires argument", ""); - break; - - default: - std::cout << "Error: Please set option argument properly!" 
<< std::endl; - std::cout << help_info << std::endl; - ASSERT_NE("Error: Please set option argument properly!", ""); - } - } - - //-h option has the highest priority - if (print_help_info) { - std::cout << help_info << std::endl; - PrintTopology(); - ASSERT_NE("Exit on -h", ""); - } - - if (opt_num != 2) { - std::cout << "You must specify all of -f -t" << std::endl; - std::cout << help_info << std::endl; - PrintTopology(); - ASSERT_NE("You must specify all of -f -t", ""); - } - - // Set transaction - transaction trans; - trans.src = src_region; - trans.dst = dst_region; - trans.size = data_size; - trans.num_dep_signal = 0; - trans.dep_signal = nullptr; - err = hsa_signal_create(1, 0, NULL, &trans.signal); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - tran_.push_back(trans); - - if (!bench_mark_mode_) { - while (rec != 'n') { - int dep = 0; - ; - std::cout - << "You will add another copy transaction, which will depends on " - "previous ones." << std::endl; - std::cout << "There are " << tran_.size() << - " copy transactions already, how many transactions" - " you want the new transaction depends on?" - << std::endl; - std::cin >> dep; - std::cout - << "Please specify which one you want to depend on, separate with " - "whitespace, index from 0:" << std::endl; - int* dep_ptr = new int[dep]; - - for (int i = 0; i < dep; i++) { - std::cin >> dep_ptr[i]; - } - - std::cout << "Please specify the dst memory pool:" << std::endl; - std::cin >> dst_region; - std::cout << "Please specify the src memory pool:" << std::endl; - std::cin >> src_region; - std::cout << "Please specify the data size:" << std::endl; - std::cin >> data_size; - std::cout << "Do you want to add more copy transaction: \"y\" or \"n\"?" 
- << std::endl; - char temp; - std::cin >> temp; - rec = tolower(temp); - - transaction t; - t.dst = dst_region; - t.src = src_region; - t.size = data_size; - err = hsa_signal_create(1, 0, NULL, &t.signal); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - t.num_dep_signal = dep; - hsa_signal_t* signal_ptr = nullptr; - - if (dep != 0) { - signal_ptr = new hsa_signal_t[dep]; - } - - for (int i = 0; i < dep; i++) { - signal_ptr[i] = tran_.at(dep_ptr[i]).signal; - } - - t.dep_signal = signal_ptr; - tran_.push_back(t); - - delete[] dep_ptr; - } - } -} - -void MemoryAsyncCopy::PrintTopology() { - size_t node_num = node_info_.size(); - - for (uint32_t i = 0; i < node_num; i++) { - node_info node = node_info_.at(i); - // Print agent info - std::cout << std::endl; - std::cout << "Agent #" << node.agent.index_ << ":" << std::endl; - - if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_) - std::cout << "Agent Device Type: CPU" - << std::endl; - else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_) - std::cout << "Agent Device Type: GPU" - << std::endl; - - // Print region info - size_t region_num = node.region.size(); - - for (uint32_t j = 0; j < region_num; j++) { - std::cout << " Memory Pool#" << node.region.at(j).index_ << ":" - << std::endl; - std::cout << " max allocable size in KB: " - << node.region.at(j).allocable_size_ / 1024 << std::endl; - std::cout << " is fine-grained: " - << node.region.at(j).is_fine_grained_ << std::endl; - } - } -} - -#define RET_IF_MEM_ASYNC_ERR(err) { \ - if ((err) != HSA_STATUS_SUCCESS) { \ - std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \ - __FILE__ << ". 
Call returned " << err << std::endl; \ - return (err); \ - } \ -} - -hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data) { +static hsa_status_t GetPoolInfo(hsa_amd_memory_pool_t pool, void* data) { hsa_status_t err; MemoryAsyncCopy* ptr = reinterpret_cast(data); - // Query region segment, only report global one + // Query pool segment, only report global one hsa_amd_segment_t region_segment; - err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, ®ion_segment); - RET_IF_MEM_ASYNC_ERR(err); + RET_IF_HSA_ERR(err); - if (HSA_AMD_SEGMENT_GLOBAL != region_segment) { + if (region_segment != HSA_AMD_SEGMENT_GLOBAL) { return HSA_STATUS_SUCCESS; } - // Check if the region is alloc allowed, if not, discard this region + // Check if the pool is alloc allowed, if not, discard this pool bool alloc_allowed = false; - err = hsa_amd_memory_pool_get_info(region, + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &alloc_allowed); - RET_IF_MEM_ASYNC_ERR(err); + RET_IF_HSA_ERR(err); if (alloc_allowed != true) { return HSA_STATUS_SUCCESS; @@ -698,56 +460,166 @@ hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data) { // Query the max allocable size size_t alloc_max_size = 0; - err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SIZE, + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &alloc_max_size); - RET_IF_MEM_ASYNC_ERR(err); + RET_IF_HSA_ERR(err); - // Check if the region is fine-grained or coarse-grained + // Check if the pool is fine-grained or coarse-grained uint32_t global_flag = 0; - err = hsa_amd_memory_pool_get_info(region, + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); - RET_IF_MEM_ASYNC_ERR(err); + RET_IF_HSA_ERR(err); bool is_fine_grained = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag; - // 
ptr->region_info_.push_back(region_info(region, ptr->region_index_, - // region_segment, is_fine_grained, host_accessible, alloc_max_size)); - ptr->region_info_.push_back( - region_info(region, ptr->region_index_, region_segment, is_fine_grained, - alloc_max_size, ptr->agent_info_.back().agent_)); + int pool_i = ptr->pool_index(); + int ag_ind = ptr->agent_index(); + ptr->pool_info()->push_back( + new PoolInfo(pool, pool_i, region_segment, is_fine_grained, + alloc_max_size, ptr->agent_info()->back())); - // Construct node_info and push back to node_info_ - ptr->node_info_[ptr->agent_index_].region.push_back(ptr->region_info_.back()); - ptr->region_index_++; + // Construct node_info and push back to agent_info_ + (*ptr->node_info())[ag_ind].pool.push_back(*ptr->pool_info()->back()); + ptr->set_pool_index(pool_i + 1); return HSA_STATUS_SUCCESS; } -hsa_status_t AgentInfo(hsa_agent_t agent, void* data) { +static hsa_status_t GetAgentInfo(hsa_agent_t agent, void* data) { MemoryAsyncCopy* ptr = reinterpret_cast(data); hsa_status_t err; char name[64]; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); - RET_IF_MEM_ASYNC_ERR(err); + RET_IF_HSA_ERR(err); // Get device type hsa_device_type_t device_type; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); - RET_IF_MEM_ASYNC_ERR(err); + RET_IF_HSA_ERR(err); - ptr->agent_info_.push_back(agent_info(agent, ptr->agent_index_, device_type)); + ptr->agent_info()->push_back( + new AgentInfo(agent, ptr->agent_index(), device_type)); - // Contruct an new node_info structure and push back to node_info_ - node_info node; - node.agent = ptr->agent_info_.back(); - ptr->node_info_.push_back(node); - - err = hsa_amd_agent_iterate_memory_pools(agent, RegionInfo, ptr); - ptr->agent_index_++; + // Contruct a new NodeInfo structure and push back to agent_info_ + NodeInfo node; + node.agent = *ptr->agent_info()->back(); + ptr->node_info()->push_back(node); + err = hsa_amd_agent_iterate_memory_pools(agent, GetPoolInfo, 
ptr); + ptr->set_agent_index(ptr->agent_index() + 1); return HSA_STATUS_SUCCESS; } -#undef RET_IF_MEM_ASYNC_ERR +void MemoryAsyncCopy::FindTopology() { + hsa_status_t err; + + err = hsa_iterate_agents(GetAgentInfo, this); + FindSystemPool(); + + ASSERT_EQ(HSA_STATUS_SUCCESS, err); +} + +void MemoryAsyncCopy::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void MemoryAsyncCopy::ConstructTransactionList(void) { + hsa_status_t err; + + tran_.clear(); + + int cpu_pool_indx = -1; + int gpu1_pool_indx = -1; + int gpu2_pool_indx = -1; + + auto push_trans = [&](int from_indx, int to_indx, TransType type) { + Transaction t; + t.src = from_indx; + t.dst = to_indx; + t.max_size = kMaxCopySize/1024; + t.type = type; + t.benchmark_copy_time = new std::vector; + t.min_time = new std::vector; + err = hsa_signal_create(1, 0, NULL, &t.signal); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + + tran_.push_back(t); + }; + + // Find the CPU Node and pool + for (NodeInfo n : *node_info()) { + if (cpu_pool_indx == -1 && n.agent.device_type() == HSA_DEVICE_TYPE_CPU) { + cpu_pool_indx = n.pool[0].index_; + continue; + } + if (gpu1_pool_indx == -1 && n.agent.device_type() == HSA_DEVICE_TYPE_GPU) { + gpu1_pool_indx = n.pool[0].index_; + continue; + } + if (gpu2_pool_indx == -1 && n.agent.device_type() == HSA_DEVICE_TYPE_GPU) { + gpu2_pool_indx = n.pool[0].index_; + break; + } + } + + ASSERT_NE(cpu_pool_indx, -1); + ASSERT_NE(gpu1_pool_indx, -1); + + push_trans(cpu_pool_indx, gpu1_pool_indx, H2D); + push_trans(gpu1_pool_indx, cpu_pool_indx, D2H); + + if (do_full_test_) { + for (NodeInfo n : *node_info()) { + if (n.agent.device_type() == HSA_DEVICE_TYPE_CPU) { + continue; + } + + for (PoolInfo p : n.pool) { + if (p.index_ == gpu1_pool_indx) { + continue; + } + push_trans(gpu1_pool_indx, p.index_, P2P); + push_trans(p.index_, gpu1_pool_indx, P2P); + } + } + } else { + if (gpu2_pool_indx != -1) { + push_trans(gpu1_pool_indx, gpu2_pool_indx, P2P); + push_trans(gpu2_pool_indx, gpu1_pool_indx, 
P2P); + } + } +} + +void MemoryAsyncCopy::PrintTopology(void) { + size_t node_num = node_info()->size(); + + for (uint32_t i = 0; i < node_num; i++) { + NodeInfo node = node_info()->at(i); + // Print agent info + std::cout << std::endl; + std::cout << "Agent #" << node.agent.index_ << ":" << std::endl; + + if (HSA_DEVICE_TYPE_CPU == node.agent.device_type()) + std::cout << "Agent Device Type: CPU" + << std::endl; + else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type()) + std::cout << "Agent Device Type: GPU" + << std::endl; + + // Print pool info + size_t pool_num = node.pool.size(); + + for (uint32_t j = 0; j < pool_num; j++) { + std::cout << " Memory Pool#" << node.pool.at(j).index_ << ":" + << std::endl; + std::cout << " max allocable size in KB: \t\t" + << node.pool.at(j).allocable_size_ / 1024 << std::endl; + std::cout << " is fine-grained: \t\t\t" + << node.pool.at(j).is_fine_grained_ << std::endl; + } + } +} + +#undef RET_IF_HSA_ERR diff --git a/rocrtst/suites/performance/memory_async_copy.h b/rocrtst/suites/performance/memory_async_copy.h index 001884a499..6cbf9d7913 100755 --- a/rocrtst/suites/performance/memory_async_copy.h +++ b/rocrtst/suites/performance/memory_async_copy.h @@ -43,199 +43,182 @@ * */ -#ifndef __ROCRTST_SRC_MEMORY_ASYNC_COPY_H__ -#define __ROCRTST_SRC_MEMORY_ASYNC_COPY_H__ +#ifndef ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_ +#define ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_ + +#include +#include -#include "perf_common/perf_base.h" #include "common/base_rocr.h" -#include "common/common.h" -#include "common/hsatimer.h" #include "hsa/hsa.h" #include "hsa/hsa_ext_amd.h" -#include -#include -#include -#include +#include "suites/test_common/test_base.h" -extern int mac_argc; -extern char** mac_argv; +typedef enum TransType {H2D = 0, D2H, P2P} TransType; -typedef struct transaction { +typedef struct Transaction { int src; int dst; hsa_signal_t signal; - size_t size; - size_t num_dep_signal; - hsa_signal_t* dep_signal; -} 
transaction; + size_t max_size; // Max. amount of kBytes to copy + TransType type; + // BenchMark copy time + std::vector *benchmark_copy_time; + // Min time + std::vector *min_time; +} Transaction; -typedef struct agent_info { - agent_info(hsa_agent_t agent, int index, hsa_device_type_t device_type) { - agent_ = agent; - index_ = index; - device_type_ = device_type; - } - agent_info() { - } - hsa_agent_t agent_; - int index_; - hsa_device_type_t device_type_; -} agent_info; +class AgentInfo { + public: + AgentInfo(hsa_agent_t agent, int index, hsa_device_type_t device_type) { + agent_ = agent; + index_ = index; + device_type_ = device_type; + } + AgentInfo() {} + + ~AgentInfo() {} + hsa_agent_t agent(void) const {return agent_;} + hsa_device_type_t device_type(void) const {return device_type_;} + + hsa_agent_t agent_; + int index_; + + private: + hsa_device_type_t device_type_; +}; + +class PoolInfo { + public: + PoolInfo(hsa_amd_memory_pool_t pool, int index, + hsa_amd_segment_t segment, bool is_fine_graind, size_t size, + AgentInfo *agent_info) { + pool_ = pool; + index_ = index; + segment_ = segment; + is_fine_grained_ = is_fine_graind; + allocable_size_ = size; + owner_agent_info_ = agent_info; + } + PoolInfo() {} + ~PoolInfo() {} + AgentInfo* owner_agent_info(void) const {return owner_agent_info_;} + hsa_amd_memory_pool_t pool_; + int index_; + hsa_amd_segment_t segment_; + bool is_fine_grained_; + size_t allocable_size_; + private: + AgentInfo *owner_agent_info_; +}; -typedef struct region_info { - region_info(hsa_amd_memory_pool_t region, int index, - hsa_amd_segment_t segment, bool is_fine_graind, size_t size, - hsa_agent_t agent) { - region_ = region; - index_ = index; - segment_ = segment; - is_fine_grained_ = is_fine_graind; - allocable_size_ = size; - owner_agent_ = agent; - } - region_info() { - } - hsa_amd_memory_pool_t region_; - int index_; - hsa_amd_segment_t segment_; - bool is_fine_grained_; - size_t allocable_size_; - hsa_agent_t owner_agent_; 
-} region_info; // Used to print out topology info -typedef struct node_info { - node_info() { - } - agent_info agent; - std::vector region; -} node_info; +typedef struct NodeInfo { + AgentInfo agent; + std::vector pool; +} NodeInfo; -hsa_status_t AgentInfo(hsa_agent_t agent, void* data); -hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data); -class MemoryAsyncCopy: public rocrtst::BaseRocR, public PerfBase { +class MemoryAsyncCopy : public TestBase { public: MemoryAsyncCopy(); - //@Brief: Destructor for test case of MemoryAsyncCopy + // @Brief: Destructor for test case of MemoryAsyncCopy virtual ~MemoryAsyncCopy(); - //@Brief: Setup the environment for measurement + // @Brief: Setup the environment for measurement virtual void SetUp(); - //@Brief: Core measurement execution + // @Brief: Core measurement execution virtual void Run(); - //@Brief: Clean up and retrive the resource + // @Brief: Clean up and retrive the resource virtual void Close(); - //@Brief: Display results + // @Brief: Display results virtual void DisplayResults() const; + // There are 3 levels of testing, from quickest/very specific to + // longest/most complete: + // 1. to and from a specified source to a specified target + // 2. to and from the cpu to 1 gpu, and to/from a gpu to another gpu + // (if available) + // 3. to and from the cpu to 1 gpu and, to/from every gpu to every + // other gpu + // The default is #2 above. If *both* a source and dest. 
are set for #1 + // above, then that overides both #2 and #3 + void set_src_pool(int pool_id) {src_pool_id_ = pool_id;} + void set_dst_pool(int pool_id) {dst_pool_id_ = pool_id;} + void set_full_test(bool full_test) {do_full_test_ = full_test;} + int pool_index(void) const {return pool_index_;} + void set_pool_index(int i) {pool_index_ = i;} + int agent_index(void) const {return agent_index_;} + void set_agent_index(int i) {agent_index_ = i;} + std::vector *pool_info(void) {return &pool_info_;} + std::vector *agent_info(void) {return &agent_info_;} + std::vector *node_info(void) {return &node_info_;} + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); + private: - //@Brief: Get real iteration number - virtual size_t RealIterationNum(); + // @Brief: Get real iteration number + virtual size_t RealIterationNum(void); - //@Brief: Get the mean copy time - virtual double GetMeanTime(std::vector& vec); + // @Brief: Get the mean copy time + double GetMeanTime(std::vector* vec); - //@Brief: Get the min copy time - virtual double GetMinTime(std::vector& vec); + // @Brief: Find and print out the needed topology info + void FindTopology(void); - //@Brief: Find and print out the needed topology info - void FindTopology(); + // @Brief: Run for Benchmark mode with verification + void RunBenchmarkWithVerification(Transaction *t); - //@Brief: Parse the argument and interact with the user - // to fill the vectors. 
- void ParseArgument(); + // @Brief: Dispaly Benchmark result + void DisplayBenchmark(Transaction *t) const; - //@Brief: Run for Benchmark mode - void RunBenchmark(); + // @Brief: Print topology info + void PrintTopology(void); - //@Brief: Run for Benchmark mode with verification - void RunBenchmarkWithVerification(); + void ConstructTransactionList(void); - //@Brief: Dispaly Benchmark result - void DisplayBenchmark(); + // @Brief: Find system region + void FindSystemPool(void); - //@Brief: Run user defined - void RunNormal(); - - //@Brief: Print topology info - void PrintTopology(); - - //@Brief: Find system region - void FindSystemRegion(); - - //@Brief: Check if agent and access memory pool, if so, set - //access to the agent, if not, exit - void AcquireAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool, void* ptr); - - friend hsa_status_t AgentInfo(hsa_agent_t agent, void* data); - friend hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data); - - protected: // More variables declared for testing - std::vector tran_; + std::vector tran_; // Variable used to store agent info, indexed by agent_index_ - std::vector agent_info_; + std::vector agent_info_; - // Variable used to store region info, indexed by region_index_ - std::vector region_info_; + // Variable used to store region info, indexed by pool_index_ + std::vector pool_info_; - // Variable to store argument number - int argc_; - - // Pointer to store address of argument text - char** argv_; + // To store node info + std::vector node_info_; // Variable to help count agent index int agent_index_; // Variable to help count region index - int region_index_; - - // BenchMark mode by default - bool bench_mark_mode_; - - // BenchMark copy time - std::vector benchmark_copy_time_; - - // Min time - std::vector min_time_; - - // User define copy time - double user_copy_time_; + int pool_index_; // Verification result bool verified_; - // If it needs verification - bool verification_; - - // To store 
node info - std::vector node_info_; + // Store the testing level + int src_pool_id_; + int dst_pool_id_; + bool do_full_test_; // System region - hsa_amd_memory_pool_t sys_region_; + hsa_amd_memory_pool_t sys_pool_; // CPU agent used for verification hsa_agent_t cpu_agent_; - constexpr const static char* help_info = - MULTILINE(. / memory_async_copy - f source_region - t dst_region - s data_size_in_KB - r[y | n] - i iteration_number - b\n\ - \n\ - -h Help info \n\ - -f Memory Pool where data copy from \n\ - -t Memory Pool where data copy to \n\ - - -s Size of copy data, 256MB by default \n\ - -r If wants to add more copy \n\ - -i Iteration number for each copy \n\ - -b Enable benchmark mode \n\ - Note : -f - t must be specified\n); + rocrtst::PerfTimer copy_timer_; }; -#endif +#endif // ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_ diff --git a/rocrtst/suites/performance/memory_copy.cc b/rocrtst/suites/performance/memory_copy.cc deleted file mode 100755 index a08306fe4c..0000000000 --- a/rocrtst/suites/performance/memory_copy.cc +++ /dev/null @@ -1,411 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "memory_copy.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "hsa/hsa.h" -#include "gtest/gtest.h" -#include - -MemoryCopy::MemoryCopy(size_t num) : - BaseRocR() { - ptr_src_ = NULL; - ptr_dst_ = NULL; - ptr_dev_src_ = NULL; - ptr_dev_dst_ = NULL; - device_region_.handle = 0; - set_requires_profile (HSA_PROFILE_BASE); -} - -MemoryCopy::~MemoryCopy() { -} - -const char* MemoryCopy::Str[16] = {"64K", "128K", "256K", "512K", "1M", "2M", - "4M", "8M", "16M", "32M", "64M", "128M", - "256M", "512M", "1G", "2G" - }; -const size_t MemoryCopy::Size[16] = {64*1024, 128*1024, 256*1024, 512*1024, - 1024*1024, 2048*1024, 4096*1024, - 8*1024*1024, 16*1024* 1024, 32*1024*1024, - 64*1024*1024, 128*1024*1024, 256*1024*1024, - 512*1024*1024, 1024*1024*1024, - (size_t)2*1024*1024* 1024 - }; - - -void MemoryCopy::SetUp() { - hsa_status_t err; - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - hsa_agent_t* cpu_dev = cpu_device(); - - // Find system memory pool for kernarg allocation. - // hsa_amd_memory_pool_t sys_coarse_grained_pool; - err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool, - &cpu_pool()); - ASSERT_EQ(err, HSA_STATUS_INFO_BREAK); - - ASSERT_NE(cpu_pool().handle, 0); - - // Get local memory pool of the first GPU. 
- // hsa_amd_memory_pool_t gpu_pool_; - err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool, - &device_pool()); - ASSERT_EQ(err, HSA_STATUS_INFO_BREAK); - ASSERT_NE(device_pool().handle, 0); - - //Allocate buffers whose size is 2GB - err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[12], 0, &ptr_src_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[12], 0, &ptr_dst_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_memory_pool_allocate(device_pool(), Size[11], 0, &ptr_dev_src_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_memory_pool_allocate(device_pool(), Size[11], 0, &ptr_dev_dst_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - //Assign the region ownership to GPU - err = hsa_memory_assign_agent(ptr_dev_src_, *gpu_dev, - HSA_ACCESS_PERMISSION_RW); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_memory_assign_agent(ptr_dev_dst_, *gpu_dev, - HSA_ACCESS_PERMISSION_RW); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - //rocrtst::CommonCleanUp the two buffer, src to 1 each byte and dst to 0 - err = hsa_amd_memory_fill(ptr_src_, 1, Size[12]); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - //Check if the initialization is correct -#if DEBUG - std::cout << "Value after setting source buffer is: " - << (int)((uint8_t*)ptr_src_)[0] << std::endl; -#endif - - return; -} - -void MemoryCopy::Run() { - hsa_status_t err; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - uint32_t iterations = RealIterationNum(); - - //Iteration over the different data size on system memory - for (int i = 0; i < 13; i++) { - std::vector time; - - for (uint32_t it = 0; it < iterations; it++) { -#if DEBUG - std::cout << "."; - fflush(stdout); -#endif - - rocrtst::PerfTimer copy_timer; - int index = copy_timer.CreateTimer(); - - copy_timer.StartTimer(index); - err = hsa_memory_copy(ptr_dst_, ptr_src_, Size[i]); - copy_timer.StopTimer(index); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Push the result back to 
vector time - time.push_back(copy_timer.ReadTimer(index)); - -#if DEBUG - //Check if the data copied is correct - uint8_t* temp_ptr = (uint8_t*)ptr_dst_; - - for (uint32_t j = 0; j < Size[i]; j++) { - ASSERT_EQ(temp_ptr[j], 1); - } - -#endif - } - -#if DEBUG - std::cout << std::endl; -#endif - - //Get mean copy time and store to the array - sys2sys_copy_time_.push_back(GetMeanTime(time)); - } - - //Copy from system memory to device memory - for (int i = 0; i < 12; i++) { - std::vector time; - - for (uint32_t it = 0; it < iterations; it++) { -#if DEBUG - std::cout << "."; - fflush(stdout); -#endif - - rocrtst::PerfTimer copy_timer; - int index = copy_timer.CreateTimer(); - - copy_timer.StartTimer(index); - err = hsa_memory_copy(ptr_dev_src_, ptr_src_, Size[i]); - copy_timer.StopTimer(index); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Push the result back to vector time - time.push_back(copy_timer.ReadTimer(index)); - -#if DEBUG - //Check if the data copied is correct - uint8_t* temp_ptr = (uint8_t*)ptr_dst_; - - for (uint32_t j = 0; j < Size[i]; j++) { - ASSERT_EQ(temp_ptr[j], 1); - } - -#endif - } - -#if DEBUG - std::cout << std::endl; -#endif - - //Get mean copy time and store to the array - sys2dev_copy_time_.push_back(GetMeanTime(time)); - } - - //Copy from device memory to device memory - for (int i = 0; i < 12; i++) { - std::vector time; - - for (uint32_t it = 0; it < iterations; it++) { -#if DEBUG - std::cout << "."; - fflush(stdout); -#endif - - rocrtst::PerfTimer copy_timer; - int index = copy_timer.CreateTimer(); - - copy_timer.StartTimer(index); - err = hsa_memory_copy(ptr_dev_dst_, ptr_dev_src_, Size[i]); - copy_timer.StopTimer(index); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Push the result back to vector time - time.push_back(copy_timer.ReadTimer(index)); - -#if DEBUG - //Check if the data copied is correct - uint8_t* temp_ptr = (uint8_t*)ptr_dst_; - - for (uint32_t j = 0; j < Size[i]; j++) { - ASSERT_EQ(temp_ptr[j], 1); - } - -#endif - } - -#if 
DEBUG - std::cout << std::endl; -#endif - - //Get mean copy time and store to the array - dev2dev_copy_time_.push_back(GetMeanTime(time)); - } - - //Copy from device memory to system memory - for (int i = 0; i < 12; i++) { - std::vector time; - - for (uint32_t it = 0; it < iterations; it++) { -#if DEBUG - std::cout << "."; - fflush(stdout); -#endif - - rocrtst::PerfTimer copy_timer; - int index = copy_timer.CreateTimer(); - - copy_timer.StartTimer(index); - err = hsa_memory_copy(ptr_dst_, ptr_dev_src_, Size[i]); - copy_timer.StopTimer(index); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Push the result back to vector time - time.push_back(copy_timer.ReadTimer(index)); - -#if DEBUG - //Check if the data copied is correct - uint8_t* temp_ptr = (uint8_t*)ptr_dst_; - - for (uint32_t j = 0; j < Size[i]; j++) { - if (temp_ptr[j] != 1) { - ASSERT_EQ(temp_ptr[j], 1); - } - } - -#endif - } - -#if DEBUG - std::cout << std::endl; -#endif - - //Get mean copy time and store to the array - dev2sys_copy_time_.push_back(GetMeanTime(time)); - } -} - -size_t MemoryCopy::RealIterationNum() { - return num_iteration() * 1.2 + 1; -} - -double MemoryCopy::GetMeanTime(std::vector& vec) { - std::sort(vec.begin(), vec.end()); - - vec.erase(vec.begin()); - vec.erase(vec.begin(), vec.begin() + num_iteration() * 0.1); - vec.erase(vec.begin() + num_iteration(), vec.end()); - - double mean = 0.0; - int num = vec.size(); - - for (int it = 0; it < num; it++) { - // printf("%f\n", vec[it]); - mean += vec[it]; - } - - mean /= num; - return mean; -} - -void MemoryCopy::DisplayResults() const { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - printf( - "================ System to System ==================================\n"); - printf(" Data Size BandWidth(GB/s)\n"); - - //Output the BW of system memory to system memory - for (int i = 0; i < 13; i++) { - double band_width = (double) Size[i] / sys2sys_copy_time_[i] / 1024 / 1024 - / 1024 * 2; -#ifdef DEBUG - printf("size: %zu time: %f\n", 
Size[i], sys2sys_copy_time_[i]); -#endif - printf(" %s %lf\n", Str[i], band_width); - } - - printf( - "================ System to Device ===================================\n"); - - for (int i = 0; i < 12; i++) { - double band_width = (double) Size[i] / sys2dev_copy_time_[i] / 1024 / 1024 - / 1024 * 2; -#ifdef DEBUG - printf("size: %zu time: %f\n", Size[i], sys2dev_copy_time_[i]); -#endif - printf(" %s %lf\n", Str[i], band_width); - } - - printf( - "================ Device to Device ===================================\n"); - - for (int i = 0; i < 12; i++) { - double band_width = (double) Size[i] / dev2dev_copy_time_[i] / 1024 / 1024 - / 1024 * 2; -#ifdef DEBUG - printf("size: %zu time: %f\n", Size[i], dev2dev_copy_time_[i]); -#endif - printf(" %s %lf\n", Str[i], band_width); - } - - printf( - "================ Device to System ===================================\n"); - - for (int i = 0; i < 12; i++) { - double band_width = (double) Size[i] / dev2sys_copy_time_[i] / 1024 / 1024 - / 1024 * 2; -#ifdef DEBUG - printf("size: %zu time: %f\n", Size[i], dev2sys_copy_time_[i]); -#endif - printf(" %s %lf\n", Str[i], band_width); - } - - printf("===================================================\n"); - return; -} - -void MemoryCopy::Close() { - hsa_status_t err; - - //Free the memory allocated - err = hsa_memory_free(ptr_src_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_memory_free(ptr_dst_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - ptr_src_ = NULL; - ptr_dst_ = NULL; - - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - return; -} diff --git a/rocrtst/suites/performance/memory_copy.h b/rocrtst/suites/performance/memory_copy.h deleted file mode 100644 index f6f2deb1eb..0000000000 --- a/rocrtst/suites/performance/memory_copy.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * 
============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#ifndef __ROCRTST_SRC_MEMORY_MEM_COPY_H__ -#define __ROCRTST_SRC_MEMORY_MEM_COPY_H__ - -#include "common/base_rocr.h" -#include "perf_common/perf_base.h" -#include "hsa/hsa.h" -#include "common/hsatimer.h" -#include - -class MemoryCopy: public rocrtst::BaseRocR, public PerfBase { - - public: - //@Brief: Constructor for test case of MemoryCopy - MemoryCopy(size_t num = 100); - - //@Brief: Destructor for test case of MemoryCopy - virtual ~MemoryCopy(); - - //@Brief: Setup the environment for measurement - virtual void SetUp(); - - //@Brief: Core measurement execution - virtual void Run(); - - //@Brief: Clean up and retrive the resource - virtual void Close(); - - //@Brief: Display results - virtual void DisplayResults() const; - - private: - //@Brief: Define copy data size and corresponding string - static const size_t Size[16]; - static const char* Str[16]; - - //@Brief: Get real iteration number - virtual size_t RealIterationNum(); - - //@Brief: Get the mean copy time - virtual double GetMeanTime(std::vector& vec); - - protected: - //@Brief: More variables declared for testing - //@Brief: Source pointer from which data copy - void* ptr_src_; - - //@Brief: Destination pointer to which data copy - void* ptr_dst_; - - //@Brief: Pointer to device memory - void* ptr_dev_src_; - void* ptr_dev_dst_; - - //@Brief: Array to store the timer results for each data size - std::vector sys2sys_copy_time_; - std::vector sys2dev_copy_time_; - std::vector dev2sys_copy_time_; - std::vector dev2dev_copy_time_; - - //@Brief: Device memory region - hsa_region_t device_region_; -}; - -#endif diff --git a/rocrtst/suites/performance/queue_concurrency.cc b/rocrtst/suites/performance/queue_concurrency.cc deleted file mode 100755 index b127e35749..0000000000 --- a/rocrtst/suites/performance/queue_concurrency.cc +++ /dev/null @@ -1,284 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * 
============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ -#include "queue_concurrency.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/helper_funcs.h" -#include "common/hsatimer.h" -#include "common/os.h" -#include "hsa/hsa_ext_finalize.h" -#include "gtest/gtest.h" - -#include -#include - -QueueConcurrency::QueueConcurrency() : - BaseRocR(), execution_time_(8) { - queue_num_ = 0; - std_time_ = 0.0; - - set_enable_interrupt(true); - set_requires_profile (HSA_PROFILE_FULL); -} - -QueueConcurrency::~QueueConcurrency() { -} - -void QueueConcurrency::SetUp() { - hsa_status_t err; - - set_kernel_file_name("test_kernel.o"); - set_kernel_name("&__OpenCL_vec_assign_kernel"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - rocrtst::LoadKernelFromObjFile(this); - - hsa_agent_t* gpu_dev = gpu_device1(); - - // Fill up part of aql pakcet which are the same cross the threads - rocrtst::InitializeAQLPacket(this, &aql()); - - // Create a queue - hsa_queue_t* q = main_queue(); - rocrtst::CreateQueue(*gpu_dev, &q); - - for (int i = 0; i < 2; i++) { - // Output of kernel - int output = 0; - - // Iteration number - int iterations = 1024 * 1024; // * 1024; - - struct ALIGNED_(16) - args_t { - void* arg0; - int arg1; - } local_args; - - local_args.arg0 = (void*) &output; - local_args.arg1 = iterations; - - err = hsa_memory_register(&local_args, sizeof(local_args)); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - //Obtain the current queue write index. - uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - //Write the aql packet at the calculated queue index address. 
- - const uint32_t queue_mask = main_queue()->size - 1; - hsa_kernel_dispatch_packet_t* pkt_addr = - (hsa_kernel_dispatch_packet_t*) (main_queue()->base_address); - - (pkt_addr)[index & queue_mask] = aql(); - (pkt_addr)[index & queue_mask].completion_signal = signal(); - (pkt_addr)[index & queue_mask].kernarg_address = &local_args; - - //Get timing stamp and ring the doorbell to dispatch the kernel. - rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - - //.type = HSA_PACKET_TYPE_DISPATCH; - (pkt_addr)[index & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH - << HSA_PACKET_HEADER_TYPE; - hsa_signal_store_screlease(main_queue()->doorbell_signal, index); - - //Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - p_timer.StopTimer(id); - hsa_signal_store_screlease(signal(), 1); - - if (1 == i) { - std_time_ = p_timer.ReadTimer(id); - } - } - - //Destroy the queue - err = hsa_queue_destroy(main_queue()); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} - -void QueueConcurrency::Run() { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - // Launch 8 child threads - std::vector < std::thread > threads; - - for (int i = 0; i < 8; i++) { - threads.push_back(std::thread(&QueueConcurrency::ThreadFunc, this, i)); - } - - // Wait for join - for (int i = 0; i < 8; i++) { - threads[i].join(); - } - - CalculateQueueNum(); -} - -void QueueConcurrency::CalculateQueueNum() { - for (int i = 0; i < 8; i++) { - double expected_time = execution_time_[0] / (1 << i); - double deviation = sqrt( - (expected_time - execution_time_[i]) - * (expected_time - execution_time_[i])); - - if (deviation < 0.1 * expected_time) { - queue_num_++; - } - } -} - -void QueueConcurrency::DisplayResults() const { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - for (int i = 0; i < 8; i++) { - std::cout << execution_time_[i] << 
std::endl; - } - - std::cout << "Number of Concurrent Queue is: " << queue_num_ << std::endl; - - ASSERT_EQ(queue_num_, 3); - - return; -} - -void QueueConcurrency::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} - -void QueueConcurrency::ThreadFunc(int threadID) { - // Define local queue and signal - hsa_queue_t* queue; - hsa_signal_t signal; - hsa_status_t err; - hsa_agent_t* gpu_dev = gpu_device1(); - - // Create a signal - err = hsa_signal_create(1, 0, NULL, &signal); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - rocrtst::CreateQueue(*gpu_dev, &queue); - - std::vector time; - - for (uint32_t i = 0; i < num_iteration(); i++) { - // Output of kernel - int output = 0; - - // Iteration number - int iterations = 1024 * 1024 / (1 << threadID); - - struct ALIGNED_(16) - args_t { - void* arg0; - int arg1; - } local_args; - - local_args.arg0 = (void*) &output; - local_args.arg1 = iterations; - - err = hsa_memory_register(&local_args, sizeof(local_args)); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - //Obtain the current queue write index. - uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1); - - //Write the aql packet at the calculated queue index address. - - const uint32_t queue_mask = queue->size - 1; - hsa_kernel_dispatch_packet_t* pkt_addr = - (hsa_kernel_dispatch_packet_t*) (queue->base_address); - (pkt_addr)[index & queue_mask] = aql(); - (pkt_addr)[index & queue_mask].completion_signal = signal; - (pkt_addr)[index & queue_mask].kernarg_address = &local_args; - - //Get timing stamp and ring the doorbell to dispatch the kernel. - rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - - //.type = HSA_PACKET_TYPE_DISPATCH; - (pkt_addr)[index & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH - << HSA_PACKET_HEADER_TYPE; - hsa_signal_store_screlease(queue->doorbell_signal, index); - - //Wait on the dispatch signal until the kernel is finished. 
- while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - p_timer.StopTimer(id); - hsa_signal_store_screlease(signal, 1); - - time.push_back(p_timer.ReadTimer(id)); - - EXPECT_EQ(output, iterations); - - if (1 == i) { - execution_time_[threadID] = p_timer.ReadTimer(id); - } - } - - time.erase(time.begin()); - execution_time_[threadID] = rocrtst::CalcMean(time); - return; -} - diff --git a/rocrtst/suites/performance/queue_create_destroy_latency.cc b/rocrtst/suites/performance/queue_create_destroy_latency.cc deleted file mode 100755 index 28c4f9a160..0000000000 --- a/rocrtst/suites/performance/queue_create_destroy_latency.cc +++ /dev/null @@ -1,271 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. 
- * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#include "queue_create_destroy_latency.h" -#include "common/hsatimer.h" -#include "common/common.h" -#include "common/base_rocr_utils.h" -#include "common/helper_funcs.h" -#include "hsa/hsa_ext_amd.h" -#include "hsa/hsa_ext_finalize.h" -#include "gtest/gtest.h" -#include - -static const int kGridDimension = 1024; - -// Construct the test case class -QueueLatency::QueueLatency() : - BaseRocR() { - max_queue_ = 0; - in_ = NULL; - out_ = NULL; -} - -// Destruct the test case claa -QueueLatency::~QueueLatency() { - -} - -void QueueLatency::Close() { - hsa_memory_free (in_); - hsa_memory_free (out_); - - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - return; -} - -// Set up the environment -void QueueLatency::SetUp() { - hsa_status_t err; - - // We get hangs with vector_copy - set_kernel_file_name("vector_copy.o"); - set_kernel_name("&__vector_copy_kernel"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - hsa_agent_t* cpu_dev = cpu_device(); - - // 
Get the max queue which can be active for GPU device - err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_QUEUES_MAX, &max_queue_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Find system coarse grained region - err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool, - &cpu_pool()); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - size_t pool_size; - err = hsa_amd_memory_pool_get_info(cpu_pool(), HSA_AMD_MEMORY_POOL_INFO_SIZE, - &pool_size); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_memory_pool_allocate(cpu_pool(), - kGridDimension * kGridDimension * 4, 0, - (void**) &in_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_memory_pool_allocate(cpu_pool(), - kGridDimension * kGridDimension * 4, 0, - (void**) &out_); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - //rocrtst::LoadKernelFromObjFile(gpu_dev, "./"+ kernel_file_name() + ".o"); - rocrtst::LoadKernelFromObjFile(this); - - // Fill up the aql packet - rocrtst::InitializeAQLPacket(this, &aql()); - aql().grid_size_x = kGridDimension * kGridDimension; - - // rocrtst::CommonCleanUp vector memory and register them - //memset(in_, 1, kGridDimension*kGridDimension * 4); - - err = hsa_amd_memory_fill(in_, 1, kGridDimension * kGridDimension * 4); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - return; -} - -void QueueLatency::Run() { - hsa_agent_t* gpu_dev = gpu_device1(); - hsa_status_t err; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - // The outer for loop iterator represents the predefined queue number - // After creating a queue, launch a kernel to train the queue, then destroy - // TODO:Hardcode max_queue_ to 100 - max_queue_ = 20; - - for (uint32_t pre_defined_num = 0; pre_defined_num < max_queue_; - pre_defined_num++) { -#ifdef DEBUG - std::cout << "Existing queue number: " << pre_defined_num << std::endl; -#endif - // vector to store the creation and destruction time - std::vector creation; - std::vector destruction; - // Create pre_defined_num queues first - hsa_queue_t* q; - 
- for (uint32_t i = 0; i < pre_defined_num; i++) { - q = main_queue(); - rocrtst::CreateQueue(*gpu_dev, &q); - - queues_.push_back(q); - } - - for (uint32_t i = 0; i < num_iteration(); i++) { - rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - - uint32_t size = 0; - err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &size); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - p_timer.StartTimer(id); - hsa_queue_t* q = main_queue(); - - err = hsa_queue_create(*gpu_dev, size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, - UINT32_MAX, UINT32_MAX, &q); - p_timer.StopTimer(id); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - creation.push_back(p_timer.ReadTimer(id)); - - p_timer.ResetTimer(id); - - // Launch a kernel to the currently created queue - // Allocate kernel parameter - typedef struct args_t { - void* in_buf; - void* out_buf; - } args; - - args* kern_ptr = NULL; - err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0, - (void**) &kern_ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - kern_ptr->in_buf = in_; - kern_ptr->out_buf = out_; - - aql().kernarg_address = kern_ptr; - - // Obtain the current queue write index. - uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - // Write the aql packet at the calculated queue index address. - const uint32_t queue_mask = main_queue()->size - 1; - ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index - & queue_mask] = aql(); - - ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index - & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH - << HSA_PACKET_HEADER_TYPE; - hsa_signal_store_screlease(main_queue()->doorbell_signal, index); - - // Wait on the dispatch signal until the kernel is finished. 
- while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - hsa_signal_store_screlease(signal(), 1); - - // Destroy the queue and record the timer - p_timer.StartTimer(id); - err = hsa_queue_destroy(main_queue()); - p_timer.StopTimer(id); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - destruction.push_back(p_timer.ReadTimer(id)); - - } - -#ifdef DEBUG - std::cout << std::endl; -#endif - - // Destroy the predefined queue - for (uint32_t i = 0; i < pre_defined_num; i++) { - - ASSERT_EQ(queues_.size(), pre_defined_num); - - err = hsa_queue_destroy(queues_[i]); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - } - - // Clear the queue vector - queues_.clear(); - - // Get the mean creation and detruction time and push back - double creation_mean = rocrtst::CalcMean(creation); - double destruction_mean = rocrtst::CalcMean(destruction); - construction_mean_.push_back(creation_mean); - destruction_mean_.push_back(destruction_mean); - } -} - -void QueueLatency::DisplayResults() const { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - printf("======================================================\n"); - printf(" Existing queue# Creation Destroy\n"); - - for (uint32_t i = 0; i < max_queue_; i++) { - printf(" %d, %fms %fms\n", i, - construction_mean_[i] * 1e3, destruction_mean_[i] * 1e3); - } -} diff --git a/rocrtst/suites/performance/queue_create_destroy_latency.h b/rocrtst/suites/performance/queue_create_destroy_latency.h deleted file mode 100755 index fba92f87e0..0000000000 --- a/rocrtst/suites/performance/queue_create_destroy_latency.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. 
- * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#ifndef __ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__ -#define __ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "hsa/hsa.h" -#include - -class QueueLatency: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - QueueLatency(); - - //@Brief: Destructor - ~QueueLatency(); - - //@Brief: Set up the teset environment - virtual void SetUp(); - - //@Brief: Run the test - virtual void Run(); - - //@Brief: Clean up and close the test - virtual void Close(); - - //@Brief: Display results - virtual void DisplayResults() const; - - private: - //@Brief: A vector to store the pointers to multiple queues - std::vector queues_; - - //@Brief: Variable to store the mean time for both queue construction - // and destruction - std::vector construction_mean_; - std::vector destruction_mean_; - - //@Brief: Variable to store the max number of queue which are active for - // device_ - uint32_t max_queue_; - - //@Brief: Pointer which points to original and destination vector memory - // space - uint8_t* in_; - uint8_t* out_; - -}; - -#endif //__ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__ - diff --git a/rocrtst/suites/performance/system_load_bandwidth.cc b/rocrtst/suites/performance/system_load_bandwidth.cc deleted file mode 100755 index b0e1d1ed42..0000000000 --- a/rocrtst/suites/performance/system_load_bandwidth.cc +++ /dev/null @@ -1,281 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#include "system_load_bandwidth.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/helper_funcs.h" -#include "common/hsatimer.h" -#include "common/os.h" -#include "gtest/gtest.h" -#include - -#if 0 -static void initGlobalReadBuffer(uint32_t* in_data, uint32_t num_thrds, - uint32_t num_ops, uint32_t num_loops) { - - // Populate input buffer with thread Id left shifted by 2. 
- uint32_t value = 0; - uint32_t val_idx = 0; - - for (int idx1 = 0; idx1 < num_loops; idx1++) { - for (int idx2 = 0; idx2 < num_ops; idx2++) { - // Write the value to be read by each thread - for (int idx3 = 0; idx3 < num_thrds; idx3++) { - value = idx3 << 2; - in_data[val_idx++] = value; - } - } - } - - return; -} - -static bool verifyGlobalLoadKernel(uint32_t* data, uint32_t num_thrds, - uint32_t scale, const char* kernel_name, bool print_debug) { - - // Verify kernel operation i.e. validate the data in the output buffer. - bool valid = true; - uint32_t valid_value = 0; - - for (int idx = 0; idx < num_thrds; idx++) { - - valid_value = (idx << 2) * scale; - - if (print_debug) { - std::cout << "Value expected = " << valid_value << std::endl; - std::cout << "Value of data = " << data[idx] << std::endl; - } - - if (data[idx] != valid_value) { - std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: " << idx - << std::endl; - std::cout << kernel_name << ": VALUE @ Bad index: " << data[idx] - << std::endl; - std::cout << std::endl; - break; - } - } - -#ifdef DEBUG - std::cout << kernel_name << ": Passed validation" << std::endl; - std::cout << std::endl; -#endif - - return true; -} -#endif - -// Constructor -SystemLoadBandwidth::SystemLoadBandwidth() : - BaseRocR() { - set_group_size(0); - num_group_ = 0; - num_cus_ = 0; - - kernel_loop_count_ = 0; - mean_ = 0.0; - data_size_ = 0; - set_enable_interrupt(0); -} - -// Destructor -SystemLoadBandwidth::~SystemLoadBandwidth() { -} - -// Set up the test environment -void SystemLoadBandwidth::SetUp() { - set_kernel_file_name("sysMemRead.o"); - set_kernel_name("&__SysMemLoad"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - hsa_agent_t* gpu_dev = gpu_device1(); - SetWorkItemNum(); - - //Create a queue with max number size - hsa_queue_t* q = main_queue(); - rocrtst::CreateQueue(*gpu_dev, &q); - - rocrtst::LoadKernelFromObjFile(this); - - uint32_t total_work_items = num_cus_ * 
num_group_ * group_size(); - - //Fill up part of aql - rocrtst::InitializeAQLPacket(this, &aql()); - aql().workgroup_size_x = group_size(); - aql().grid_size_x = total_work_items; - - return; -} - -// Run the test -void SystemLoadBandwidth::Run() { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - uint32_t total_workitems = num_cus_ * num_group_ * group_size(); - hsa_agent_t* gpu_dev = gpu_device1(); - hsa_status_t err; - - uint32_t ops_thrd = 32; - uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t); - uint64_t total_ops = (uint64_t) total_workitems * ops_thrd; - uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t); - //uint32_t *in_data = (uint32_t *)malloc(in_data_size); - err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool, - &device_pool()); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - int32_t* in_data = NULL; - err = hsa_amd_memory_pool_allocate(device_pool(), in_data_size, 0, - (void**) &in_data); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - memset(in_data, 0, in_data_size); - uint32_t out_data_size = total_workitems * sizeof(uint32_t); - //uint32_t *out_data = (uint32_t *)malloc(out_data_size); - uint32_t* out_data; - err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0, - (void**) &out_data); - memset(out_data, 0, out_data_size); - - data_size_ = in_data_size; - - // initGlobalReadBuffer (in_data, total_workitems, ops_thrd, - // kernel_loop_count_); - - typedef struct local_args_t { - void* arg0; - void* arg1; - uint64_t arg2; - void* arg3; - } args; - - args* kern_ptr = NULL; - err = hsa_amd_memory_pool_allocate(device_pool(), sizeof(args), 0, - (void**) &kern_ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // in_data is 32 bit ptr, so adding total_ops - kern_ptr->arg0 = in_data; - kern_ptr->arg1 = in_data + total_ops; - kern_ptr->arg2 = addr_step; - kern_ptr->arg3 = out_data; - - aql().kernarg_address = kern_ptr; - - std::vector time; - - int it = num_iteration() * 1.2 + 1; - - void 
*q_base_addr = main_queue()->base_address; - - for (int i = 0; i < it; i++) { - // Obtain the current queue write index - uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - // Write the aql packet at the calculated queue index address. - const uint32_t queue_mask = main_queue()->size - 1; - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql(); - - rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - - ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |= - HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - hsa_signal_store_screlease(main_queue()->doorbell_signal, index); - - // Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - p_timer.StopTimer(id); - -#if DEBUG - std::cout << "."; - std::cout.flush(); -#endif - - // Verify the results - // uint32_t scale = kernel_loop_count_ * ops_thrd; - //verifyGlobalLoadKernel(out_data, total_workitems, scale, - // kernel_name_.c_str(), false); - - time.push_back(p_timer.ReadTimer(id)); - - hsa_signal_store_screlease(signal(), 1); - } - - time.erase(time.begin()); - std::sort(time.begin(), time.end()); - time.erase(time.begin() + num_iteration(), time.end()); - mean_ = rocrtst::CalcMean(time); - - return; - -} - -void SystemLoadBandwidth::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); -} - -void SystemLoadBandwidth::DisplayResults() const { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - std::cout << "=======================================" << std::endl; - std::cout << "System Load Bandwidth: %f(GB/S)" << - data_size_ / mean_ / 1024 / 1024 / 1024 << std::endl; -} diff --git a/rocrtst/suites/performance/system_load_bandwidth.h b/rocrtst/suites/performance/system_load_bandwidth.h deleted file mode 100755 index 
69d90be217..0000000000 --- a/rocrtst/suites/performance/system_load_bandwidth.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#ifndef __ROCRTST_SRC_INC_SYSTEM_LOAD_BANDWIDTH_H__ -#define __ROCRTST_SRC_INC_SYSTEM_LOAD_BANDWIDTH_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "hsa/hsa.h" -#include - -class SystemLoadBandwidth: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - SystemLoadBandwidth(); - - //@Brief: Destructor - ~SystemLoadBandwidth(); - - //@Brief: Set up the testing environment - virtual void SetUp(); - - //@Brief: Run the test case - virtual void Run(); - - //@Brief: Close and clean up the test enrionment - virtual void Close(); - - //@Brief: Display load bandwidth - virtual void DisplayResults() const; - - //@Brief: Set work-item configuration - void SetWorkItemNum() { -#ifdef INTERACTIVE - uint32_t tmp; - printf("Please input the number of CUs you want to try:\n"); - scanf("%d", &num_cus_); - - printf("Please input the number of groups you want to try:\n"); - scanf("%d", &num_group_); - - printf("Please input the size of each group:\n"); - uint32_t sz = 0; - scanf("%d", &tmp); - set_group_size(tmp); - - printf("Please input the number of kernel loop you want to try:\n"); - scanf("%d", &kernel_loop_count_); -#else - num_cus_ = 32; - num_group_ = 128; - set_group_size(256); - kernel_loop_count_ = 16; -#endif - return; - } - - private: - - //@Brief: number of group - uint32_t num_group_; - - //@Brief: number of CUs - uint32_t num_cus_; - - //@Brief: number of kernel loop - uint32_t kernel_loop_count_; - - //@Brief: Mean execution time - double mean_; - - //@Brief: data size for test - uint64_t data_size_; -}; - -#endif - diff --git a/rocrtst/suites/performance/system_store_bandwidth.cc 
b/rocrtst/suites/performance/system_store_bandwidth.cc deleted file mode 100755 index d2e1cc5082..0000000000 --- a/rocrtst/suites/performance/system_store_bandwidth.cc +++ /dev/null @@ -1,243 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#include "system_store_bandwidth.h" -#include "common/base_rocr_utils.h" -#include "common/common.h" -#include "common/helper_funcs.h" -#include "common/hsatimer.h" -#include "gtest/gtest.h" - -static bool verifyGlobalStoreKernel(uint32_t* data, uint32_t num_thrds, - uint32_t loop_cnt, uint32_t ops_loop, - const char* kernel_name, - bool print_debug) { - - // Verify kernel operation i.e. validate the data in the output buffer. - for (uint32_t idx1 = 0; idx1 < loop_cnt; idx1++) { - for (uint32_t idx2 = 0; idx2 < ops_loop; idx2++) { - for (uint32_t idx3 = 0; idx3 < num_thrds; idx3++) { - if (data[idx3] != (idx3 << 2)) { - std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: " - << idx3 << std::endl; - std::cout << kernel_name << ": VALUE @ Bad index: " << data[idx3] - << std::endl; - break; - } - } - } - } - -#ifdef DEBUG - std::cout << kernel_name << ": Passed validation" << std::endl; - std::cout << std::endl; -#endif - - return true; -} - -// Constructor -SystemStoreBandwidth::SystemStoreBandwidth() : - BaseRocR() { - - set_group_size(0); - num_group_ = 0; - num_cus_ = 0; - - kernel_loop_count_ = 0; - mean_ = 0.0; - data_size_ = 0; -} - -// Destructor -SystemStoreBandwidth::~SystemStoreBandwidth() { -} - -// Set up the test environment -void SystemStoreBandwidth::SetUp() { - - set_kernel_file_name("sysMemWrite.o"); - set_kernel_name("&__SysMemStore"); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - hsa_agent_t* gpu_dev = gpu_device1(); - - SetWorkItemNum(); - - //Create a queue with max number size - hsa_queue_t* q = nullptr; - rocrtst::CreateQueue(*gpu_dev, &q); - set_main_queue(q); - - rocrtst::LoadKernelFromObjFile(this); - - 
uint32_t total_work_items = num_cus_ * num_group_ * group_size(); - - //Fill up part of aql - rocrtst::InitializeAQLPacket(this, &aql()); - aql().workgroup_size_x = group_size(); - aql().grid_size_x = total_work_items; - - return; -} - -// Run the test -void SystemStoreBandwidth::Run() { - hsa_status_t err; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - uint32_t total_workitems = num_cus_ * num_group_ * group_size(); - hsa_agent_t* gpu_dev = gpu_device1(); - - uint32_t ops_thrd = 16; - uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t); - uint64_t total_ops = (uint64_t) total_workitems * kernel_loop_count_ - * ops_thrd; - uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t); - err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, - rocrtst::FindStandardPool, &device_pool()); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - uint32_t* in_data = NULL; - err = hsa_amd_memory_pool_allocate(device_pool(), in_data_size, 0, - (void**) &in_data); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - //memset(in_data, 0, in_data_size); - err = hsa_amd_memory_fill(in_data, 0, in_data_size); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - uint32_t out_data_size = total_workitems * sizeof(uint32_t); - uint32_t* out_data = NULL; - err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0, - (void**) &out_data); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - //memset(out_data, 0, out_data_size); - err = hsa_amd_memory_fill(out_data, 0, out_data_size); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - data_size_ = in_data_size; - - typedef struct local_args_t { - void* arg0; - void* arg1; - uint64_t arg2; - void* arg3; - } args; - - // in_data is 32 bit ptr, so adding total_ops - args* kern_ptr = NULL; - err = hsa_amd_memory_pool_allocate(device_pool(), sizeof(args), 0, - (void**) &kern_ptr); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - kern_ptr->arg0 = in_data; - kern_ptr->arg1 = in_data + total_ops; - kern_ptr->arg2 = addr_step; - kern_ptr->arg3 = out_data; - - 
aql().kernarg_address = kern_ptr; - - std::vector time; - void *q_base_addr = main_queue()->base_address; - for (uint32_t i = 0; i < num_iteration(); i++) { - // Obtain the current queue write index - uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); - - // Write the aql packet at the calculated queue index address. - const uint32_t queue_mask = main_queue()->size - 1; - ((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask] = aql(); - - rocrtst::PerfTimer p_timer; - int id = p_timer.CreateTimer(); - p_timer.StartTimer(id); - - ((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask].header |= - HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - hsa_signal_store_screlease(main_queue()->doorbell_signal, index); - - // Wait on the dispatch signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) - ; - - p_timer.StopTimer(id); - - // Verify the results - verifyGlobalStoreKernel(in_data, total_workitems, kernel_loop_count_, - ops_thrd, kernel_name().c_str(), false); - - time.push_back(p_timer.ReadTimer(id)); - - hsa_signal_store_screlease(signal(), 1); - } - - time.erase(time.begin()); - mean_ = rocrtst::CalcMean(time); - - return; -} - -void SystemStoreBandwidth::Close() { - hsa_status_t err; - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - return; -} - -void SystemStoreBandwidth::DisplayResults() const { - - if (!rocrtst::CheckProfile(this)) { - return; - } - - std::cout << "=======================================" << std::endl; - std::cout << "System Load Bandwidth: %f(GB/S)" - << data_size_ / mean_ / 1024 / 1024 / 1024 << std::endl; -} diff --git a/rocrtst/suites/performance/system_store_bandwidth.h b/rocrtst/suites/performance/system_store_bandwidth.h deleted file mode 100755 index 7327a0d5f1..0000000000 --- a/rocrtst/suites/performance/system_store_bandwidth.h +++ /dev/null @@ -1,121 +0,0 @@ 
-/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#ifndef __ROCRTST_SRC_INC_SYSTEM_STORE_BANDWIDTH_H__ -#define __ROCRTST_SRC_INC_SYSTEM_STORE_BANDWIDTH_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "hsa/hsa.h" -#include - -class SystemStoreBandwidth: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - SystemStoreBandwidth(); - - //@Brief: Destructor - ~SystemStoreBandwidth(); - - //@Brief: Set up the testing environment - virtual void SetUp(); - - //@Brief: Run the test case - virtual void Run(); - - //@Brief: Close and clean up the test enrionment - virtual void Close(); - - //@Brief: Display load bandwidth - virtual void DisplayResults() const; - - //@Brief: Set work-item configuration - void SetWorkItemNum() { -#ifdef INTERACTIVE - uint32_t tmp; - - printf("Please input the number of CUs you want to try:\n"); - scanf("%d", &num_cus_); - - printf("Please input the number of groups you want to try:\n"); - scanf("%d", &num_group_); - - printf("Please input the size of each group:\n"); - scanf("%d", &tmp); - set_group_size(tmp); - - printf("Please input the number of kernel loop you want to try:\n"); - scanf("%d", &kernel_loop_count_); -#else - num_cus_ = 32; - num_group_ = 128; - group_size_ = 256; - kernel_loop_count_ = 16; -#endif - return; - } - - private: - //@Brief: number of work item in one group - uint32_t group_size_; - - //@Brief: number of group - uint32_t num_group_; - - //@Brief: number of CUs - uint32_t num_cus_; - - //@Brief: number of kernel loop - uint32_t kernel_loop_count_; - - //@Brief: Mean execution time - double mean_; - - //@Brief: data size for test - uint64_t data_size_; -}; - -#endif - diff --git 
a/rocrtst/suites/performance/test_case_template.cc b/rocrtst/suites/performance/test_case_template.cc new file mode 100755 index 0000000000..65f24ae7b0 --- /dev/null +++ b/rocrtst/suites/performance/test_case_template.cc @@ -0,0 +1,395 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +// The purpose of this test is to provide an example of the use of the +// common RocrTest classes and utilities that are used in many examples. +// It can be used as a template to start off with when writing new tests. +// In many cases, the existing boilerplate code will be sufficient as is. +// Otherwise, the boilerplate code can be either supplemented or replaced +// by your own code in your example, as necessary. +// +// The comments provided are focused more on the use of the common rocrtst +// utilities and boilerplate code, rather than the example app. itself. +// +// The boilerplate code includes code for: +// * hsa initialization and clean up +// * code to load pre-built kernels +// * creating queues +// * populating AQL packets +// * checking for required profiles +// * finding cpu and gpu agents (callbacks for common use cases) +// * finding pools (having common requirements) +// * allocating and setting kernel arguments +// * somewhat standardized output +// * handling additional command line arguments, beyond google-test arguments +// * support for various level of verbosity, controlled from command line arg +// * support for building OpenCL kernels +// * timer support +// +// Overview of RocrTst code organization: +// Classes: +// * class BaseRocR (base_rocr.h) -- base class for all rocrtst examples and +// tests. Most of the rocrtst common utilities act on BaseRocR objects +// +// * TestBase (test_base.h) -- derives from BaseRocR and is the base class +// for all tests under /suites. 
The implementation in TestBase
+//   methods are typically actions that are required for most/all tests and
+//   should therefore be called from the derived implementations of the methods.
+//
+// Utilities:
+// * common/base_rocr_utils.{cc,h} contain a set of utilities
+//   that act on BaseRocR objects.
+//
+// * common/common.{cc,h} contain other non-BaseRocR utilities
+//
+// Special Files:
+// * main.cc -- The main google test file from which the tests are invoked.
+//   There should be an entry for each test to be run there.
+//
+// * kernels -- OpenCL kernel source files should go in the kernels directory
+//
+// * CMakeLists.txt -- Host code (*.cc and *.h files) should build without
+//   modifying the CMakeLists.txt file, if the files are placed in the
+//   "performance" directory. OpenCL kernels, however, need an entry: for
+//   each kernel to be built, the bitcode libraries must be indicated before
+//   the call to "build_kernel()" is made. See existing code for examples.
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "suites/performance/test_case_template.h"
+#include "common/base_rocr_utils.h"
+#include "common/common.h"
+#include "common/helper_funcs.h"
+#include "common/hsatimer.h"
+#include "gtest/gtest.h"
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_finalize.h"
+
+static const uint32_t kNumBufferElements = 256;
+
+#define RET_IF_HSA_ERR(err) { \
+  if ((err) != HSA_STATUS_SUCCESS) { \
+    const char* msg = 0; \
+    hsa_status_string(err, &msg); \
+    std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
+                          __FILE__ << ". Call returned " << err << std::endl; \
+    std::cout << msg << std::endl; \
+    return (err); \
+  } \
+}
+
+// Many test cases want to perform an operation on memory sizes of various
+// granularities.
+#if 0
+static const int kNumGranularity = 20;
+const char* Str[kNumGranularity] = {"1k", "2K", "4K", "8K", "16K", "32K",
+    "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M",
+    "64M", "128M", "256M", "512M"};
+
+const size_t Size[kNumGranularity] = {
+    1024, 2*1024, 4*1024, 8*1024, 16*1024, 32*1024, 64*1024, 128*1024,
+    256*1024, 512*1024, 1024*1024, 2048*1024, 4096*1024, 8*1024*1024,
+    16*1024*1024, 32*1024*1024, 64*1024*1024, 128*1024*1024, 256*1024*1024,
+    512*1024*1024};
+
+static const int kMaxCopySize = Size[kNumGranularity - 1];
+#endif
+TestExample::TestExample(void) :
+    TestBase(), time_mean_(0.0), src_buffer_(nullptr), dst_buffer_(nullptr) {
+  set_num_iteration(10);  // Number of iterations to execute of the main test;
+                          // This is a default value which can be overridden
+                          // on the command line.
+  set_title("Test Case Example");
+  set_description("Put a description of the test case here. Line breaks "
+                  "will be taken care of on output, not here.");
+
+  set_kernel_file_name("test_case_template_kernels.hsaco");
+  set_kernel_name("square");  // kernel function name
+
+#if 0
+  // Set required profile to HSA_PROFILE_FULL or HSA_PROFILE_BASE if it
+  // matters for this test. If either profile is fine, then leave with
+  // default
+  set_requires_profile();
+#endif
+}
+
+TestExample::~TestExample(void) {
+}
+
+// Any 1-time setup involving member variables used in the rest of the test
+// should be done here.
+void TestExample::SetUp(void) {
+  hsa_status_t err;
+
+  // TestBase::SetUp() will set HSA_ENABLE_INTERRUPT if enable_interrupt() is
+  // true, and call hsa_init(). It also prints the SetUp header.
+  TestBase::SetUp();
+
+  // SetDefaultAgents(this) will assign the first CPU and GPU found on
+  // iterating through the agents and assign them to cpu_device_ and
+  // gpu_device1_, respectively (cpu_device() and gpu_device1()). These
+  // BaseRocR member variables are used in some utilities. Additionally,
+  // SetDefaultAgents() checks the profile of the gpu and compares this
+  // to any required profile.
+  //
+  // If SetDefaultAgents() is not used, if the profile of the target GPU
+  // matters for this test, it should be set with set_profile() and
+  // CheckProfileAndInform() should be called to check if it is the
+  // required profile
+  err = rocrtst::SetDefaultAgents(this);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  hsa_agent_t* gpu_dev = gpu_device1();
+
+  // Find and assign HSA_AMD_SEGMENT_GLOBAL pools for cpu, gpu and a kern_arg
+  // pool
+  err = rocrtst::SetPoolsTypical(this);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Create a queue
+  hsa_queue_t* q = nullptr;
+  rocrtst::CreateQueue(*gpu_dev, &q);
+  ASSERT_NE(q, nullptr);
+  set_main_queue(q);
+
+  err = rocrtst::LoadKernelFromObjFile(this);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Fill up the kernel packet (except header) with some values we've
+  // collected so far, and some reasonable default values; this should be after
+  // LoadKernelFromObjFile(). AllocAndSetKernArgs() will fill in the kern_args
+  err = rocrtst::InitializeAQLPacket(this, &aql());
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
+
+  // Allocate a few buffers for our example
+  err = hsa_amd_memory_pool_allocate(cpu_pool(),
+                                     kNumBufferElements*sizeof(uint32_t),
+                                     0, reinterpret_cast<void**>(&src_buffer_));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  err = hsa_amd_agents_allow_access(2, ag_list, nullptr, src_buffer_);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Initialize the source buffer
+  for (uint32_t i = 0; i < kNumBufferElements; ++i) {
+    reinterpret_cast<uint32_t*>(src_buffer_)[i] = i;
+  }
+
+  err = hsa_amd_memory_pool_allocate(cpu_pool(),
+                                     kNumBufferElements*sizeof(uint32_t),
+                                     0, reinterpret_cast<void**>(&dst_buffer_));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  err = hsa_amd_agents_allow_access(2, ag_list, nullptr, dst_buffer_);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Set up Kernel arguments
+  // See the meta-data for the compiled OpenCL kernel code to ascertain
+  // the sizes, padding and alignment
required for kernel arguments.
+  // This can be seen by executing
+  // $ amdgcn-amd-amdhsa-readelf -aw ./test_case_template_kernels.hsaco
+  // The kernel code will expect the following arguments aligned as shown.
+// typedef uint32_t uint4[4];
+  struct __attribute__((aligned(16))) local_args_t {
+    uint32_t* dstArray;
+    uint32_t* srcArray;
+    uint32_t size;
+    uint32_t pad;
+    uint64_t global_offset_x;
+    uint64_t global_offset_y;
+    uint64_t global_offset_z;
+  } local_args;
+
+  local_args.dstArray = reinterpret_cast<uint32_t*>(dst_buffer_);
+  local_args.srcArray = reinterpret_cast<uint32_t*>(src_buffer_);
+  local_args.size = kNumBufferElements;
+  local_args.global_offset_x = 0;
+  local_args.global_offset_y = 0;
+  local_args.global_offset_z = 0;
+
+  err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  return;
+}
+
+// This wrapper atomically writes the provided header and setup to the
+// provided AQL packet. The provided AQL packet address should be in the
+// queue memory space.
+static inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
+                                  hsa_kernel_dispatch_packet_t* queue_packet) {
+  __atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
+                   header | (setup << 16), __ATOMIC_RELEASE);
+}
+
+// Do a few extra iterations as we toss out some of the initial and final
+// iterations when calculating statistics
+uint32_t TestExample::RealIterationNum(void) {
+  return num_iteration() * 1.2 + 1;
+}
+
+static bool VerifyResult(uint32_t *ar, size_t sz) {  // true iff ar[i] == i*i
+  for (size_t i = 0; i < sz; ++i) {  // start at 0 so every element is checked
+    if (i*i != ar[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+void TestExample::Run(void) {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  TestBase::Run();
+
+  // Override whatever we need to...
+  aql().workgroup_size_x = kNumBufferElements;
+  aql().grid_size_x = kNumBufferElements;
+
+  std::vector<double> timer;
+
+  int it = RealIterationNum();
+  hsa_kernel_dispatch_packet_t *queue_aql_packet;
+
+  rocrtst::PerfTimer p_timer;
+  uint64_t index;
+
+  for (int i = 0; i < it; i++) {
+    // This function simply copies the data we've collected so far into our
+    // local AQL packet, except the setup and header fields.
+    queue_aql_packet = WriteAQLToQueue(this, &index);
+    ASSERT_EQ(queue_aql_packet,
+              reinterpret_cast<hsa_kernel_dispatch_packet_t*>
+                                      (main_queue()->base_address) + index);
+    uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
+
+    aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
+                  HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+    aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
+                  HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+
+    // Create and start a timer for this iteration
+    int id = p_timer.CreateTimer();
+    p_timer.StartTimer(id);
+
+    AtomicSetPacketHeader(aql_header, aql().setup, queue_aql_packet);
+
+    hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
+
+    // Wait on the dispatch signal until the kernel is finished.
+    while (hsa_signal_wait_scacquire(aql().completion_signal,
+         HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
+    }
+
+    // Stop the timer
+    p_timer.StopTimer(id);
+
+    // Store time for later analysis
+    timer.push_back(p_timer.ReadTimer(id));
+    hsa_signal_store_screlease(aql().completion_signal, 1);
+
+    ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t*>(dst_buffer_),
+                             kNumBufferElements));
+
+    // Pay attention to verbosity level for things like progress output
+    if (verbosity() >= VERBOSE_PROGRESS) {
+      std::cout << ".";
+      fflush(stdout);
+    }
+  }
+
+  if (verbosity() >= VERBOSE_PROGRESS) {
+    std::cout << std::endl;
+  }
+
+  // Drop the first sample, sort, and keep the num_iteration() smallest times
+  timer.erase(timer.begin());
+  std::sort(timer.begin(), timer.end());
+  timer.erase(timer.begin() + num_iteration(), timer.end());
+
+  time_mean_ = rocrtst::CalcMean(timer);
+}
+
+void TestExample::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();
+}
+
+void TestExample::DisplayResults(void) const {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  TestBase::DisplayResults();
+  std::cout << "The average time was: " << time_mean_ * 1e6 <<
+                                                      " uS" << std::endl;
+  return;
+}
+
+void TestExample::Close() {
+  hsa_status_t err;
+
+  err = hsa_amd_memory_pool_free(src_buffer_);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  err = hsa_amd_memory_pool_free(dst_buffer_);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  // This will close handles opened within rocrtst utility calls and call
+  // hsa_shut_down(), so it should be done after other hsa cleanup
+  TestBase::Close();
+}
+
+
+#undef RET_IF_HSA_ERR
diff --git a/rocrtst/suites/performance/image_load_bandwidth.h b/rocrtst/suites/performance/test_case_template.h
similarity index 76%
rename from rocrtst/suites/performance/image_load_bandwidth.h
rename to rocrtst/suites/performance/test_case_template.h
index 9239853064..e20ed27d6b 100755
--- a/rocrtst/suites/performance/image_load_bandwidth.h +++ b/rocrtst/suites/performance/test_case_template.h @@ -43,40 +43,41 @@ * */ -#ifndef __ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__ -#define __ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__ +#ifndef ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_ +#define ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_ #include "common/base_rocr.h" #include "hsa/hsa.h" -#include "perf_common/perf_base.h" +#include "suites/test_common/test_base.h" -class ImageLoadBandwidth: public rocrtst::BaseRocR, public PerfBase { +class TestExample : public TestBase { public: - //@Brief: Constructor - ImageLoadBandwidth(); + TestExample(); - //@Brief: Destructor - ~ImageLoadBandwidth(); + // @Brief: Destructor for test case of TestExample + virtual ~TestExample(); - //@Brief: Set up the test environment + // @Brief: Setup the environment for measurement virtual void SetUp(); - //@Brief: Run the actual testing + // @Brief: Core measurement execution virtual void Run(); - //@Brief: Clean up the test environment + // @Brief: Clean up and retrive the resource virtual void Close(); - //@Brief: Display results + // @Brief: Display results virtual void DisplayResults() const; - private: - //@Brief: Image Load Bandwidth - double load_bandwidth_; + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); - //@Brief: Image size - size_t image_size_; + private: + uint32_t RealIterationNum(void); + + double time_mean_; + void *src_buffer_; + void *dst_buffer_; }; -#endif //__ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__ - +#endif // ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_ diff --git a/rocrtst/suites/performance/vector_copy.cc b/rocrtst/suites/performance/vector_copy.cc deleted file mode 100644 index f772a48351..0000000000 --- a/rocrtst/suites/performance/vector_copy.cc +++ /dev/null @@ -1,279 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance 
Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. 
- * - */ - -#include "vector_copy.h" -#include "common/base_rocr_utils.h" -#include "gtest/gtest.h" - -// Copy vector buffer size. -static const size_t BUFFER_SIZE = 1024 * 1024 * 4; -static char* gCPUOutput = nullptr; -static uint64_t gQueueIndex = 0; - -//Constructor -VectorCopy::VectorCopy() : - BaseRocR() { - set_kernel_name("&__vector_copy_kernel"); - kernarg_address = NULL; -} - -//Destructor -VectorCopy::~VectorCopy() { -} - -// Find coarse grained system memory. -static hsa_status_t get_sys_coarse_grained_memory_pool( - hsa_amd_memory_pool_t pool, void* data) { - hsa_amd_segment_t segment; - hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, - &segment); - - if (HSA_AMD_SEGMENT_GLOBAL != segment) { - return HSA_STATUS_SUCCESS; - } - - hsa_amd_memory_pool_global_flag_t flags; - hsa_status_t err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags); - - if (HSA_STATUS_SUCCESS == err - && (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) { - hsa_amd_memory_pool_t* ret = (hsa_amd_memory_pool_t*) data; - *ret = pool; - return HSA_STATUS_INFO_BREAK; - } - - return err; -} - -// Find out dGPU's local memory pool. -static hsa_status_t get_local_memory_pool(hsa_amd_memory_pool_t pool, - void* data) { - // With memory pool API, each agent will only report it is own memory pools. - // So, a coarse grained memory pool in global segment is what we want. 
- hsa_amd_segment_t segment; - - hsa_status_t err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); - - if (HSA_STATUS_SUCCESS != err) { - return err; - } - - if (HSA_AMD_SEGMENT_GLOBAL != segment) { - return HSA_STATUS_SUCCESS; - } - - hsa_amd_memory_pool_global_flag_t flags; - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags); - - if (HSA_STATUS_SUCCESS == err - && (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) { - hsa_amd_memory_pool_t* ret = (hsa_amd_memory_pool_t*) data; - *ret = pool; - return HSA_STATUS_INFO_BREAK; - } - - return err; -} - -void VectorCopy::SetUp() { - hsa_status_t err; - hsa_agent_t* gpu_dev = gpu_device1(); - - if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) { - return; - } - - //Create a queue with max number size - hsa_queue_t* q; - rocrtst::CreateQueue(*gpu_dev, &q); - set_main_queue(q); - - rocrtst::LoadKernelFromObjFile(this); - - // Obtain the current queue write index. - gQueueIndex = hsa_queue_load_write_index_scacquire(main_queue()); - - rocrtst::InitializeAQLPacket(this, &aql()); - uint16_t header = 0; - header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - - aql().grid_size_x = (uint32_t)(1024 * 1024); - aql().kernarg_address = (void*) kernarg_address; - - // Find system memory pool for kernarg allocation. - // hsa_amd_memory_pool_t sys_coarse_grained_pool; - err = hsa_amd_agent_iterate_memory_pools(cpus[0], - get_sys_coarse_grained_memory_pool, &sys_coarse_grained_pool_); - ASSERT_EQ(err, HSA_STATUS_INFO_BREAK); - - // Get local memory pool of the first GPU. 
- // hsa_amd_memory_pool_t gpu_pool_; - err = hsa_amd_agent_iterate_memory_pools(gpus[0], get_local_memory_pool, - &gpu_pool_); - ASSERT_EQ(err, HSA_STATUS_INFO_BREAK); - - return; -} - -void VectorCopy::Run() { - hsa_status_t err; - void* in; - void* out; - - if (!rocrtst::CheckProfile(this)) { - return; - } - - // Allocate vector on the first GPU local memory as input. - err = hsa_amd_memory_pool_allocate(gpu_pool_, BUFFER_SIZE, 0, &in); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - std::cout << "Allocating " << BUFFER_SIZE << - " Bytes of local memory on the first GPU, address = " << - in << std::endl; - - // rocrtst::CommonCleanUp input buffer on the first GPU to 1 for each byte. - err = hsa_amd_memory_fill(in, 0x01010101, BUFFER_SIZE / 4); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Allocate vector on the first GPU local memory as output - err = hsa_amd_memory_pool_allocate(gpu_pool_, BUFFER_SIZE, 0, &out); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - std::cout << "Allocating " << BUFFER_SIZE << - " Bytes of local memory on the second GPU, address = " << - out << std::endl; - - // rocrtst::CommonCleanUp output buffer on the first GPU to 0. - err = hsa_amd_memory_fill(out, 0x00000000, BUFFER_SIZE / 4); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - typedef struct args_t { - void* in; - void* out; - } args; - - args* kargs; - - kargs->in = in; - kargs->out = out; - - // Allocate the kernel argument buffer from the system memory pool. - err = hsa_amd_memory_pool_allocate(sys_coarse_grained_pool_, kernarg_size(), - 0, &kernarg_address); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - memcpy(kernarg_address, &kargs, sizeof(args)); - - // Map kernarg space to the first GPU - err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, kernarg_address); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - /* - * Increment the write index and ring the doorbell to dispatch the kernel. 
- */ - hsa_queue_store_write_index_screlease(main_queue(), gQueueIndex + 1); - hsa_signal_store_relaxed(main_queue()->doorbell_signal, gQueueIndex); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Wait on the dispatch completion signal until the kernel is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0, - UINT64_MAX, HSA_WAIT_STATE_BLOCKED)) - ; - - // Reset signal value for future usage to copy output. - hsa_signal_store_screlease(signal(), 1); - - // Allocate vector on the system memory pool. - err = hsa_amd_memory_pool_allocate(sys_coarse_grained_pool_, BUFFER_SIZE, 0, - (void**) &gCPUOutput); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Allow the first GPU to access the output - err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, gCPUOutput); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - //Copy the output from GPU to the CPU buffer for validation - err = hsa_amd_memory_async_copy(gCPUOutput, cpus[0], out, gpus[0], - BUFFER_SIZE, 0, NULL, signal()); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - // Wait on the completion signal until the async copy is finished. - while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0, - UINT64_MAX, HSA_WAIT_STATE_BLOCKED)) - ; - - for (uint32_t i = 0; i < BUFFER_SIZE; i++) { - ASSERT_EQ(gCPUOutput[i], 1); - } - - return; -} - -void VectorCopy::Close() { - hsa_status_t err; - // Cleanup all allocated resources. 
- err = hsa_amd_memory_pool_free(kernarg_address); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_signal_destroy(signal()); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_queue_destroy(main_queue()); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = hsa_amd_memory_pool_free(gCPUOutput); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - - err = rocrtst::CommonCleanUp(this); - ASSERT_EQ(err, HSA_STATUS_SUCCESS); - return; -} - -void VectorCopy::DisplayResults() const { - if (!rocrtst::CheckProfile(this)) { - return; - } -} diff --git a/rocrtst/suites/performance/vector_copy.h b/rocrtst/suites/performance/vector_copy.h deleted file mode 100755 index 5946b04023..0000000000 --- a/rocrtst/suites/performance/vector_copy.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. 
- * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#ifndef __ROCRTST_SRC_VECTOR_COPY_H__ -#define __ROCRTST_SRC_VECTOR_COPY_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "common/common.h" -#include "common/hsatimer.h" -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" -#include "hsa/hsa_ext_finalize.h" -#include -#include - -//@Brief: This class is defined to measure the mean latency of launching -//an empty kernel - -class VectorCopy: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - VectorCopy(); - - //@Brief: Destructor - virtual ~VectorCopy(); - - //@Brief: Set up the environment for the test - virtual void SetUp(); - - //@Brief: Run the test case - virtual void Run(); - - virtual void DisplayResults() const; - - //@Brief: Clean up and close the runtime - virtual void Close(); - - private: - - //@Brief: Store the size of queue - uint32_t queue_size_; - - //@Brief: kernarg_address; - void* kernarg_address; - - //@Brief: The mean time of CP Processing - double mean_; - - //@Brief: The group memory region - hsa_region_t group_region_; - - 
hsa_amd_memory_pool_t gpu_pool_; - hsa_amd_memory_pool_t sys_coarse_grained_pool_; - - std::vector cpus; - std::vector gpus; - - //@Brief: Pointer to cu_id array - uint32_t* cu_; - - uint32_t manual_input; - uint32_t group_input; -}; - -#endif - diff --git a/rocrtst/suites/performance/vector_copy_peer_to_peer.h b/rocrtst/suites/performance/vector_copy_peer_to_peer.h deleted file mode 100755 index 0f05674cb5..0000000000 --- a/rocrtst/suites/performance/vector_copy_peer_to_peer.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. 
- * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ - -#ifndef __ROCRTST_SRC_VECTOR_COPY_P2P_H__ -#define __ROCRTST_SRC_VECTOR_COPY_P2P_H__ - -#include "perf_common/perf_base.h" -#include "common/base_rocr.h" -#include "common/common.h" -#include "common/hsatimer.h" -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" -#include "hsa/hsa_ext_finalize.h" -#include -#include - -//@Brief: This class is defined to measure the mean latency of launching -//an empty kernel - -class VectorCopyP2P: public rocrtst::BaseRocR, public PerfBase { - public: - //@Brief: Constructor - VectorCopyP2P(); - - //@Brief: Destructor - virtual ~VectorCopyP2P(); - - //@Brief: Set up the environment for the test - virtual void SetUp(); - - //@Brief: Run the test case - virtual void Run(); - - //@Brief: Display results we got - virtual void DisplayResults() const; - - //@Brief: Clean up and close the runtime - virtual void Close(); - - private: - //@Brief: Get actual iteration number - virtual size_t RealIterationNum(); - - //@Brief: Create Queue - virtual void CreateQueue(); - - //@Brief: Store the size of queue - uint32_t queue_size_; - - //@Brief: The mean time of CP Processing - double mean_; - - //@Brief: The group memory region - hsa_region_t group_region_; - - //@Brief: Pointer to cu_id array - uint32_t* cu_; - - uint32_t manual_input; - uint32_t 
group_input; -}; - -#endif - diff --git a/rocrtst/suites/test_common/test_base.cc b/rocrtst/suites/test_common/test_base.cc new file mode 100755 index 0000000000..d7fa7883bf --- /dev/null +++ b/rocrtst/suites/test_common/test_base.cc @@ -0,0 +1,141 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#include + +#include "suites/test_common/test_base.h" +#include "common/base_rocr_utils.h" +#include "gtest/gtest.h" + +static const int kOutputLineLength = 80; +static const char kLabelDelimiter[] = "####"; +static const char kDescriptionLabel[] = "TEST DESCRIPTION"; +static const char kTitleLabel[] = "TEST NAME"; +static const char kSetupLabel[] = "TEST SETUP"; +static const char kRunLabel[] = "TEST EXECUTION"; +static const char kCloseLabel[] = "TEST CLEAN UP"; +static const char kResultsLabel[] = "TEST RESULTS"; + + +TestBase::TestBase() { + set_description(""); +} +TestBase::~TestBase() { +} + +static void MakeHeaderStr(const char *inStr, std::string *outStr) { + assert(outStr != nullptr); + assert(inStr != nullptr); + + outStr->clear(); + *outStr = kLabelDelimiter; + *outStr += " "; + *outStr += inStr; + *outStr += " "; + *outStr += kLabelDelimiter; +} + +void TestBase::SetUp(void) { + hsa_status_t err; + std::string label; + MakeHeaderStr(kSetupLabel, &label); + printf("\n\t%s\n", label.c_str()); + + err = rocrtst::InitAndSetupHSA(this); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + + return; +} + +void TestBase::Run(void) { + std::string label; + MakeHeaderStr(kRunLabel, &label); + printf("\n\t%s\n", label.c_str()); +} + +void TestBase::Close(void) { + hsa_status_t err; + std::string label; + MakeHeaderStr(kCloseLabel, &label); + printf("\n\t%s\n", label.c_str()); + + err = rocrtst::CommonCleanUp(this); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); +} + + +void TestBase::DisplayResults(void) const { + std::string label; + MakeHeaderStr(kResultsLabel, &label); + printf("\n\t%s\n", label.c_str()); +} + +void TestBase::DisplayTestInfo(void) { + 
printf("#########################################" + "######################################\n"); + + std::string label; + MakeHeaderStr(kTitleLabel, &label); + printf("\n\t%s\n%s\n", label.c_str(), title().c_str()); + + if (verbosity() >= VERBOSE_STANDARD) { + MakeHeaderStr(kDescriptionLabel, &label); + printf("\n\t%s\n%s\n", label.c_str(), description().c_str()); + } +} + +void TestBase::set_description(std::string d) { + int le = kOutputLineLength - 4; + + description_ = d; + size_t endlptr; + + for (size_t i = le; i < description_.size(); i += le) { + endlptr = description_.find_last_of(" ", i); + description_.replace(endlptr, 1, "\n"); + i = endlptr; + } +} + diff --git a/rocrtst/suites/performance/queue_concurrency.h b/rocrtst/suites/test_common/test_base.h similarity index 70% rename from rocrtst/suites/performance/queue_concurrency.h rename to rocrtst/suites/test_common/test_base.h index 326514bfe8..9141fbf66a 100755 --- a/rocrtst/suites/performance/queue_concurrency.h +++ b/rocrtst/suites/test_common/test_base.h @@ -42,52 +42,43 @@ * DEALINGS WITH THE SOFTWARE. * */ +#ifndef ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_ +#define ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_ -#ifndef __ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__ -#define __ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__ - -#include "perf_common/perf_base.h" +#include #include "common/base_rocr.h" -#include "hsa/hsa.h" -#include -class QueueConcurrency: public rocrtst::BaseRocR, public PerfBase { +class TestBase : public rocrtst::BaseRocR { public: - //@Brief: Constructor - QueueConcurrency(); - //@Brief: Destructor - ~QueueConcurrency(); + TestBase(void); - //@Brief: Set up the test environmnet - void SetUp(); + virtual ~TestBase(void); - //@Brief: Run the test - void Run(); + enum VerboseLevel {VERBOSE_MIN = 0, VERBOSE_STANDARD, VERBOSE_PROGRESS}; - //@Brief: Clean up and close - void Close(); + // @Brief: Before run the core measure codes, do something to set up + // i.e. init runtime, prepare packet... 
// Bundle of pointers to the caller-owned settings that ProcessCmdline()
// overwrites. The object does not own or free what it points to.
class RocrtstOptions {
 public:
  // Both pointers must be non-null (checked with assert).
  RocrtstOptions(uint32_t *verb, uint32_t *iter);

  ~RocrtstOptions(void);

  uint32_t *verbosity_;
  uint32_t *iterations_;
};

// Parse the rocrtst command-line options in |arg_list| into |test|.
// Returns 0 on success, or 1 if help was requested or an option or its
// value was invalid (the help text is printed in both cases).
uint32_t ProcessCmdline(RocrtstOptions* test, int arg_cnt, char** arg_list);

RocrtstOptions::RocrtstOptions(uint32_t *verb, uint32_t *iter)
    : verbosity_(verb), iterations_(iter) {
  assert(verb != nullptr);
  assert(iter != nullptr);
}

RocrtstOptions::~RocrtstOptions() {
}

// Long options. Each entry must agree with short_options below and with the
// text printed by PrintHelp(): 'i' and 'v' take a value, 'r' does not.
// "verbose" is kept as an alias of "verbosity". It was previously declared
// no_argument, which left optarg null for the 'v' handler.
static const struct option long_options[] = {
  {"iterations", required_argument, nullptr, 'i'},
  {"verbosity", required_argument, nullptr, 'v'},
  {"verbose", required_argument, nullptr, 'v'},
  {"rocrtst_help", no_argument, nullptr, 'r'},

  {nullptr, 0, nullptr, 0}
};
static const char* short_options = "i:v:r";

// Print the usage summary for the options handled by ProcessCmdline().
static void PrintHelp(void) {
  std::cout <<
     "Optional RocRTst Arguments:\n"
     "--iterations, -i NUM; override default, "
         "which varies for each test\n"
     "--rocrtst_help, -r print this help message\n"
     "--verbosity, -v LEVEL\n"
     "  Verbosity levels:\n"
     "   0    -- minimal; just summary information\n"
     "   1    -- intermediate; show intermediate values such as intermediate "
                  "perf. data\n"
     "   2    -- progress; show progress displays\n"
     "   >= 3 -- more debug output\n";
}

// Convert |text| to a uint32_t. Returns false, leaving *out untouched, when
// |text| is null, is not a plain decimal number, has trailing characters, is
// negative, or does not fit in 32 bits.
static bool ParseUint32(const char* text, uint32_t* out) {
  if (text == nullptr) {
    return false;
  }

  try {
    const std::string digits(text);
    size_t used = 0;
    const long long value = std::stoll(digits, &used);

    if (used != digits.size() || value < 0 || value > 0xFFFFFFFFLL) {
      return false;
    }

    *out = static_cast<uint32_t>(value);
    return true;
  } catch (const std::exception&) {
    // std::stoll reports "not a number" and "out of range" by throwing.
    return false;
  }
}

uint32_t ProcessCmdline(RocrtstOptions* test, int arg_cnt, char** arg_list) {
  assert(test != nullptr);

  int long_index = -1;

  while (true) {
    const int opt = getopt_long(arg_cnt, arg_list, short_options,
                                long_options, &long_index);

    if (opt == -1) {
      break;  // All options consumed.
    }

    switch (opt) {
      case 'i':
        if (!ParseUint32(optarg, test->iterations_)) {
          std::cerr << "Invalid value for --iterations\n";
          PrintHelp();
          return 1;
        }
        break;

      case 'v':
        if (!ParseUint32(optarg, test->verbosity_)) {
          std::cerr << "Invalid value for --verbosity\n";
          PrintHelp();
          return 1;
        }
        break;

      case 'r':  // Explicit help request.
      default:   // Unknown option or missing argument ('?').
        PrintHelp();
        return 1;
    }
  }
  return 0;
}