Added dispatch time, async copy and test template rocrtst tests
Change-Id: I57a844ee65c36bd61616ee6d60d358303f51db56
[ROCm/ROCR-Runtime commit: a12c5628ea]
Этот коммит содержится в:
@@ -50,11 +50,9 @@
|
||||
namespace rocrtst {
|
||||
|
||||
BaseRocR::BaseRocR(void) {
|
||||
num_iteration_ = 100;
|
||||
signal_.handle = 0;
|
||||
num_iteration_ = 1;
|
||||
cpu_device_.handle = -1;
|
||||
gpu_device1_.handle = -1;
|
||||
region_.handle = 0;
|
||||
device_pool_.handle = 0;
|
||||
kern_arg_pool_.handle = 0;
|
||||
main_queue_ = nullptr;
|
||||
@@ -66,6 +64,7 @@ BaseRocR::BaseRocR(void) {
|
||||
orig_hsa_enable_interrupt_ = GetEnv("HSA_ENABLE_INTERRUPT");
|
||||
set_kernel_file_name("");
|
||||
set_verbosity(0);
|
||||
set_title("unset_title");
|
||||
}
|
||||
|
||||
BaseRocR::~BaseRocR() {
|
||||
|
||||
Обычный файл → Исполняемый файл
-17
@@ -105,13 +105,6 @@ class BaseRocR {
|
||||
return kernel_object_;
|
||||
}
|
||||
|
||||
void set_signal(hsa_signal_t sig) {
|
||||
signal_.handle = sig.handle;
|
||||
}
|
||||
const hsa_signal_t& signal(void) const {
|
||||
return signal_;
|
||||
}
|
||||
|
||||
void set_profile(hsa_profile_t in_prof) {
|
||||
profile_ = in_prof;
|
||||
}
|
||||
@@ -151,10 +144,6 @@ class BaseRocR {
|
||||
return aql_;
|
||||
}
|
||||
|
||||
hsa_region_t& region(void) {
|
||||
return region_;
|
||||
}
|
||||
|
||||
void set_num_iteration(int num) {
|
||||
num_iteration_ = num;
|
||||
}
|
||||
@@ -237,16 +226,12 @@ class BaseRocR {
|
||||
private:
|
||||
uint64_t num_iteration_; ///< Number of times to execute test
|
||||
|
||||
hsa_signal_t signal_; ///< Completion signal used for kernel execution
|
||||
|
||||
hsa_queue_t* main_queue_; ///< AQL queue used for packets
|
||||
|
||||
hsa_agent_t gpu_device1_; ///< Handle to first GPU found
|
||||
|
||||
hsa_agent_t cpu_device_; ///< Handle to CPU
|
||||
|
||||
hsa_region_t region_; ///< TODO(cfreehil): delete this
|
||||
|
||||
hsa_amd_memory_pool_t device_pool_; ///< Memory pool on gpu pool list
|
||||
|
||||
hsa_amd_memory_pool_t cpu_pool_; ///< Memory pool on cpu pool list
|
||||
@@ -255,8 +240,6 @@ class BaseRocR {
|
||||
|
||||
uint64_t kernel_object_; ///< Handle to kernel code
|
||||
|
||||
std::string brig_file_; // TODO(cfreehil): delete this
|
||||
|
||||
std::string kernel_file_name_; ///< Code object file name
|
||||
|
||||
std::string kernel_name_; ///< Kernel name
|
||||
|
||||
@@ -70,6 +70,8 @@ namespace rocrtst {
|
||||
} \
|
||||
}
|
||||
|
||||
// Clean up some of the common handles and memory used by BaseRocR code, then
|
||||
// shut down hsa. Restore HSA_ENABLE_INTERRUPT to original value, if necessary
|
||||
hsa_status_t CommonCleanUp(BaseRocR* test) {
|
||||
hsa_status_t err;
|
||||
|
||||
@@ -87,13 +89,9 @@ hsa_status_t CommonCleanUp(BaseRocR* test) {
|
||||
test->set_main_queue(nullptr);
|
||||
}
|
||||
|
||||
if (0 != test->signal().handle) {
|
||||
hsa_signal_t sig;
|
||||
sig.handle = 0;
|
||||
|
||||
err = hsa_signal_destroy(test->signal());
|
||||
if (test->aql().completion_signal.handle != 0) {
|
||||
err = hsa_signal_destroy(test->aql().completion_signal);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
test->set_signal(sig);
|
||||
}
|
||||
|
||||
err = hsa_shut_down();
|
||||
@@ -122,7 +120,7 @@ static const char* PROFILE_STR[] = {"HSA_PROFILE_BASE", "HSA_PROFILE_FULL", };
|
||||
/// \returns bool
|
||||
/// - true Machine meets test requirements
|
||||
/// - false Machine does not meet test requirements
|
||||
static bool CheckProfileAndInform(BaseRocR* test) {
|
||||
bool CheckProfileAndInform(BaseRocR* test) {
|
||||
if (test->verbosity() > 0) {
|
||||
std::cout << "Target HW Profile is "
|
||||
<< PROFILE_STR[test->profile()] << std::endl;
|
||||
@@ -162,6 +160,10 @@ static hsa_status_t ProcessIterateError(hsa_status_t err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
// Find pools for cpu, gpu and for kernel arguments. These pools have
|
||||
// common basic requirements, but are not suitable for all cases. In
|
||||
// that case, set cpu_pool(), device_pool() and/or kern_arg_pool()
|
||||
// yourself instead of using this function.
|
||||
hsa_status_t SetPoolsTypical(BaseRocR* test) {
|
||||
hsa_status_t err;
|
||||
|
||||
@@ -180,11 +182,9 @@ hsa_status_t SetPoolsTypical(BaseRocR* test) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Enable interrupts if necessary, and call hsa_init()
|
||||
hsa_status_t InitAndSetupHSA(BaseRocR* test) {
|
||||
hsa_agent_t gpu_device1;
|
||||
hsa_agent_t cpu_device;
|
||||
hsa_status_t err;
|
||||
hsa_signal_t sig;
|
||||
|
||||
if (test->enable_interrupt()) {
|
||||
SetEnv("HSA_ENABLE_INTERRUPT", "1");
|
||||
@@ -193,6 +193,15 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) {
|
||||
err = hsa_init();
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Attempt to find and set test->cpu_device and test->gpu_device1
|
||||
hsa_status_t SetDefaultAgents(BaseRocR* test) {
|
||||
hsa_agent_t gpu_device1;
|
||||
hsa_agent_t cpu_device;
|
||||
hsa_status_t err;
|
||||
|
||||
gpu_device1.handle = 0;
|
||||
err = hsa_iterate_agents(FindGPUDevice, &gpu_device1);
|
||||
RET_IF_HSA_UTILS_ERR(rocrtst::ProcessIterateError(err));
|
||||
@@ -217,7 +226,7 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) {
|
||||
char name[64] = {0};
|
||||
err = hsa_agent_get_info(gpu_device1, HSA_AGENT_INFO_NAME, name);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
std::cout << "The device name is " << name << std::endl;
|
||||
std::cout << "The gpu device name is " << name << std::endl;
|
||||
}
|
||||
|
||||
hsa_profile_t profile;
|
||||
@@ -228,14 +237,11 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) {
|
||||
if (!CheckProfileAndInform(test)) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
err = hsa_signal_create(1, 0, NULL, &sig);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
test->set_signal(sig);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// See if the profile of the target matches any required profile by the
|
||||
// test program.
|
||||
bool CheckProfile(BaseRocR const* test) {
|
||||
if (test->requires_profile() == -1) {
|
||||
return true;
|
||||
@@ -243,6 +249,19 @@ bool CheckProfile(BaseRocR const* test) {
|
||||
return (test->requires_profile() == test->profile());
|
||||
}
|
||||
}
|
||||
// Load the specified kernel code from the specified file, inspect and fill
|
||||
// in BaseRocR member variables related to the kernel and executable.
|
||||
// Required Input BaseRocR member variables:
|
||||
// - gpu_device1()
|
||||
// - kernel_file_name()
|
||||
// - kernel_name()
|
||||
//
|
||||
// Written BaseRocR member variables:
|
||||
// -kernel_object()
|
||||
// -private_segment_size()
|
||||
// -group_segment_size()
|
||||
// -kernarg_size()
|
||||
// -kernarg_align()
|
||||
hsa_status_t LoadKernelFromObjFile(BaseRocR* test) {
|
||||
hsa_status_t err;
|
||||
hsa_code_object_reader_t code_obj_rdr = {0};
|
||||
@@ -334,13 +353,16 @@ hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue,
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void InitializeAQLPacket(const BaseRocR* test,
|
||||
// Initialize the provided aql packet with standard default values, and
|
||||
// values from provided BaseRocR object.
|
||||
hsa_status_t InitializeAQLPacket(const BaseRocR* test,
|
||||
hsa_kernel_dispatch_packet_t* aql) {
|
||||
hsa_status_t err;
|
||||
|
||||
assert(aql != nullptr);
|
||||
|
||||
if (aql == nullptr) {
|
||||
return;
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
aql->header = 0; // Set this right before doorbell ring
|
||||
@@ -361,19 +383,25 @@ void InitializeAQLPacket(const BaseRocR* test,
|
||||
// Pin kernel code and the kernel argument buffer to the aql packet.
|
||||
aql->kernel_object = test->kernel_object();
|
||||
|
||||
aql->kernarg_address = NULL;
|
||||
aql->completion_signal.handle = test->signal().handle;
|
||||
// aql->kernarg_address may be filled in by AllocAndSetKernArgs() if it is
|
||||
// called before this function, so we don't want to overwrite it, therefore
|
||||
// we ignore it in this function.
|
||||
|
||||
return;
|
||||
err = hsa_signal_create(1, 0, NULL, &aql->completion_signal);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void WriteAQLToQueue(BaseRocR* test) {
|
||||
// Copy BaseRocR aql object values to the next free slot of the BaseRocR
|
||||
// object's queue; the reserved write index is returned through ind.
|
||||
hsa_kernel_dispatch_packet_t * WriteAQLToQueue(BaseRocR* test, uint64_t *ind) {
|
||||
assert(test);
|
||||
assert(test->main_queue());
|
||||
|
||||
void *queue_base = test->main_queue()->base_address;
|
||||
const uint32_t queue_mask = test->main_queue()->size - 1;
|
||||
uint64_t que_idx = hsa_queue_add_write_index_relaxed(test->main_queue(), 1);
|
||||
*ind = que_idx;
|
||||
|
||||
hsa_kernel_dispatch_packet_t* staging_aql_packet = &test->aql();
|
||||
hsa_kernel_dispatch_packet_t* queue_aql_packet;
|
||||
@@ -395,8 +423,12 @@ void WriteAQLToQueue(BaseRocR* test) {
|
||||
queue_aql_packet->kernel_object = staging_aql_packet->kernel_object;
|
||||
queue_aql_packet->kernarg_address = staging_aql_packet->kernarg_address;
|
||||
queue_aql_packet->completion_signal = staging_aql_packet->completion_signal;
|
||||
|
||||
return queue_aql_packet;
|
||||
}
|
||||
|
||||
// Allocate a buffer in the kern_arg_pool for the kernel arguments and write
|
||||
// the arguments to buffer
|
||||
hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args, size_t arg_size) {
|
||||
void* kern_arg_buf = nullptr;
|
||||
hsa_status_t err;
|
||||
@@ -421,56 +453,18 @@ hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args, size_t arg_size) {
|
||||
assert(((uintptr_t)adj_kern_arg_buf + arg_size) <
|
||||
((uintptr_t)kern_arg_buf + buf_size));
|
||||
|
||||
err = hsa_memory_copy_workaround_cpu(adj_kern_arg_buf, args, arg_size);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
|
||||
hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()};
|
||||
err = hsa_amd_agents_allow_access(2, ag_list, NULL, kern_arg_buf);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
|
||||
err = hsa_memory_copy(adj_kern_arg_buf, args, arg_size);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
|
||||
test->aql().kernarg_address = adj_kern_arg_buf;
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t AllocAndAllowAccess(BaseRocR* test, size_t len,
|
||||
hsa_amd_memory_pool_t pool, void**buffer) {
|
||||
hsa_status_t err;
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(pool, len, 0, buffer);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
|
||||
hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()};
|
||||
err = hsa_amd_agents_allow_access(2, ag_list, NULL, *buffer);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
hsa_status_t hsa_memory_fill_workaround_gen(void* ptr, uint32_t value,
|
||||
size_t count, hsa_agent_t dst_ag, hsa_agent_t src_ag, BaseRocR* test) {
|
||||
|
||||
hsa_status_t err;
|
||||
|
||||
void *tmp_mem;
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(test->cpu_pool(), count, 0, &tmp_mem);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
|
||||
hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()};
|
||||
err = hsa_amd_agents_allow_access(2, ag_list, NULL, tmp_mem);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
|
||||
(void)memset(tmp_mem, value, count);
|
||||
|
||||
err = hsa_memory_copy_workaround_gen(ptr, tmp_mem, count, dst_ag, src_ag);
|
||||
RET_IF_HSA_UTILS_ERR(err);
|
||||
|
||||
hsa_amd_memory_pool_free(tmp_mem);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#undef RET_IF_HSA_UTILS_ERR
|
||||
|
||||
} // namespace rocrtst
|
||||
|
||||
@@ -60,14 +60,16 @@ namespace rocrtst {
|
||||
/// \param[in] test Test for which the kernel will be loaded.
|
||||
/// \returns HSA_STATUS_SUCCESS if no errors
|
||||
hsa_status_t LoadKernelFromObjFile(BaseRocR* test);
|
||||
/// Do initialization tasks for HSA test program. This includes calling
|
||||
/// hsa_init(), finding and setting the cpu and gpu agent member variables,
|
||||
/// creating the signal needed for queueing AQL packets and checking
|
||||
/// HW requirements.
|
||||
|
||||
/// Do initialization tasks for HSA test program.
|
||||
/// \param[in] test Test to initialize
|
||||
/// \returns HSA_STATUS_SUCCESS if no errors
|
||||
hsa_status_t InitAndSetupHSA(BaseRocR* test);
|
||||
|
||||
/// Find and set the cpu and gpu agent member variables. Also checks that
|
||||
/// gpu agent meets test requirements (e.g., FULL profile vs. BASE profile).
|
||||
hsa_status_t SetDefaultAgents(BaseRocR* test);
|
||||
|
||||
/// For the provided device agent, create an AQL queue
|
||||
/// \param[in] device Device for which a queue is to be created
|
||||
/// \param[out] queue Address to which created queue pointer will be written
|
||||
@@ -84,16 +86,16 @@ hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue,
|
||||
/// be drawn.
|
||||
/// \param[inout] aql Caller provided pointer to aql packet that will be
|
||||
/// populated
|
||||
/// \returns void
|
||||
void InitializeAQLPacket(const BaseRocR* test,
|
||||
/// \returns Appropriate hsa_status_t
|
||||
hsa_status_t InitializeAQLPacket(const BaseRocR* test,
|
||||
hsa_kernel_dispatch_packet_t* aql);
|
||||
|
||||
/// This function writes all of the aql packet fields to the queue besides
|
||||
/// "setup" and "header". This assumes all the aql fields have been set
|
||||
/// appropriately.
|
||||
/// \param[in] test Test containing the queue and aql packet to be written.
|
||||
/// \returns void
|
||||
void WriteAQLToQueue(BaseRocR* test);
|
||||
/// \returns Pointer to dispatch packet in queue that was written to
|
||||
hsa_kernel_dispatch_packet_t* WriteAQLToQueue(BaseRocR* test, uint64_t *ind);
|
||||
|
||||
/// This function writes the first 32 bits of an aql packet to the provided
|
||||
/// aql packet. This function is meant to be called immediately before
|
||||
@@ -139,6 +141,15 @@ bool CheckProfile(BaseRocR const* test);
|
||||
hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args,
|
||||
size_t arg_size);
|
||||
|
||||
/// Verify that the machine running the test has the required profile.
|
||||
/// This function will verify that the execution machine meets any specific
|
||||
/// test requirement for a profile (HSA_PROFILE_BASE or HSA_PROFILE_FULL).
|
||||
/// \param[in] test Test that provides profile requirements.
|
||||
/// \returns bool
|
||||
/// - true Machine meets test requirements
|
||||
/// - false Machine does not meet test requirements
|
||||
bool CheckProfileAndInform(BaseRocR* test);
|
||||
|
||||
/// This function will set the cpu and gpu memory pools to the type used in
|
||||
/// many applications.
|
||||
/// \param[in] test Test that provides profile requirements.
|
||||
@@ -146,17 +157,6 @@ hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args,
|
||||
/// error code otherwise.
|
||||
hsa_status_t SetPoolsTypical(BaseRocR* test);
|
||||
|
||||
/// Allocate memory from a specified pool and grant both standard BaseRocR
|
||||
/// agents access
|
||||
/// \param[in] test Test having the agents to which access is granted
|
||||
/// \param[in] len Size of the memory buffer to allocate
|
||||
/// \param[in] pool Pool from which to allocate memory
|
||||
/// \param[out] buffer Address of pointer which will point to newly allocated memory
|
||||
/// upon return
|
||||
/// \returns HSA_STATUS_SUCCESS if no errors
|
||||
hsa_status_t AllocAndAllowAccess(BaseRocR* test, size_t len,
|
||||
hsa_amd_memory_pool_t pool, void**buffer);
|
||||
|
||||
/// Work-around for hsa_amd_memory_fill, which is currently broken.
|
||||
/// \param[in] ptr Pointer to start of memory location to be filled
|
||||
/// \param[in] value Value to write to each byte of input buffer
|
||||
|
||||
@@ -341,45 +341,6 @@ hsa_status_t DumpPointerInfo(void* ptr) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t hsa_memory_fill_workaround_cpu(void* ptr, uint32_t value,
|
||||
size_t count) {
|
||||
(void)memset(ptr, value, count);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t hsa_memory_copy_workaround_cpu(void* dst, const void *src,
|
||||
size_t size) {
|
||||
(void)memcpy(dst, src, size);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t hsa_memory_copy_workaround_gen(void* dst, const void *src,
|
||||
size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) {
|
||||
hsa_signal_t s;
|
||||
hsa_status_t err;
|
||||
|
||||
err = hsa_signal_create(1, 0, NULL, &s);
|
||||
RET_IF_HSA_COMMON_ERR(err);
|
||||
|
||||
err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s);
|
||||
RET_IF_HSA_COMMON_ERR(err);
|
||||
|
||||
if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1,
|
||||
UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
|
||||
err = HSA_STATUS_ERROR;
|
||||
std::cout << "Async copy signal error" << std::endl;
|
||||
|
||||
RET_IF_HSA_COMMON_ERR(err);
|
||||
}
|
||||
|
||||
err = hsa_signal_destroy(s);
|
||||
|
||||
RET_IF_HSA_COMMON_ERR(err);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*! \brief Writes to the buffer and increments the write pointer to the
|
||||
* buffer. Also, ensures that the argument is written to an
|
||||
|
||||
@@ -140,35 +140,5 @@ hsa_status_t DumpMemoryPoolInfo(const hsa_amd_memory_pool_t pool,
|
||||
/// \returns HSA_STATUS_SUCCESS if there are no errors
|
||||
hsa_status_t DumpPointerInfo(void* ptr);
|
||||
|
||||
/// This is a work-around for filling cpu-memory to be used until
|
||||
/// hsa_amd_memory_fill is fixed. Should only be used for cpu memory.
|
||||
/// \param[in] ptr Start address of memory to be filled.
|
||||
/// \param[in] value Value to fill buffer with
|
||||
/// \param[in] count Size of buffer to fill
|
||||
/// \returns HSA_STATUS_SUCCESS if there are no errors
|
||||
hsa_status_t hsa_memory_fill_workaround_cpu(void* ptr, uint32_t value,
|
||||
size_t count);
|
||||
|
||||
/// This is a work-around for copying cpu-memory to be used until
|
||||
/// hsa_amd_memory_copy is fixed. Should only be used for cpu memory.
|
||||
/// \param[in] dst Destination address of memory to be copied
|
||||
/// \param[in] src Source address of memory to be copied
|
||||
/// \param[in] size Size of buffer to copy
|
||||
/// \returns HSA_STATUS_SUCCESS if there are no errors
|
||||
hsa_status_t hsa_memory_copy_workaround_cpu(void* dst, const void *src,
|
||||
size_t size);
|
||||
|
||||
/// This is a work-around for copying memory to be used until
|
||||
/// hsa_amd_memory_copy is fixed. Should be used when gpu local memory is
|
||||
/// involved.
|
||||
/// \param[in] dst Destination address of memory to be copied
|
||||
/// \param[in] src Source address of memory to be copied
|
||||
/// \param[in] size Size of buffer to copy
|
||||
/// \param[in] dst_ag Destination agent handle
|
||||
/// \param[in] src_ag Source agent handle
|
||||
/// \returns HSA_STATUS_SUCCESS if there are no errors
|
||||
hsa_status_t hsa_memory_copy_workaround_gen(void* dst, const void *src,
|
||||
size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag);
|
||||
|
||||
} // namespace rocrtst
|
||||
#endif // ROCRTST_COMMON_COMMON_H_
|
||||
|
||||
@@ -52,10 +52,10 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
|
||||
namespace rocrtst {
|
||||
|
||||
|
||||
template<typename T>
|
||||
void PrintArray(const std::string header, const T* data, const int width,
|
||||
const int height) {
|
||||
@@ -191,7 +191,7 @@ AlignUp(void* value, size_t alignment) {
|
||||
alignment));
|
||||
}
|
||||
|
||||
double CalcMedian(std::vector<double> scores) {
|
||||
double CalcMedian(const std::vector<double> &scores) {
|
||||
double median;
|
||||
size_t size = scores.size();
|
||||
|
||||
@@ -204,15 +204,11 @@ double CalcMedian(std::vector<double> scores) {
|
||||
return median;
|
||||
}
|
||||
|
||||
double CalcMean(std::vector<double> scores) {
|
||||
double mean = 0;
|
||||
size_t size = scores.size();
|
||||
double CalcMean(const std::vector<double> &scores) {
|
||||
double mean;
|
||||
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
mean += scores[i];
|
||||
}
|
||||
|
||||
return mean / size;
|
||||
mean = std::accumulate(scores.begin(), scores.end(), 0.0);
|
||||
return mean/scores.size();
|
||||
}
|
||||
|
||||
double CalcMean(const std::vector<double>& v1, const std::vector<double>& v2) {
|
||||
|
||||
@@ -60,7 +60,7 @@ bool Compare(const double* refData, const double* data,
|
||||
const int length, const double epsilon = 1e-6);
|
||||
|
||||
/// Calculate the mean of the values in the vector
|
||||
double CalcMean(std::vector<double> scores);
|
||||
double CalcMean(const std::vector<double> &scores);
|
||||
|
||||
/// Calculate the mean time of difference of the two vectors
|
||||
double CalcMean(const std::vector<double>& v1, const std::vector<double>& v2);
|
||||
@@ -68,7 +68,7 @@ double CalcMean(const std::vector<double>& v1, const std::vector<double>& v2);
|
||||
/// Return the median value of a vector of doubles
|
||||
/// \param[in] scores Vector of doubles
|
||||
/// \returns double Median value of provided vector
|
||||
double CalcMedian(std::vector<double> scores);
|
||||
double CalcMedian(const std::vector<double> &scores);
|
||||
|
||||
/// Calculate the standard deviation of the vector
|
||||
double CalcStdDeviation(std::vector<double> scores, int score_mean);
|
||||
|
||||
@@ -70,6 +70,7 @@ PreDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
|
||||
dispParam->aql_translation_handle, true);
|
||||
assert((status == HSA_STATUS_SUCCESS) &&
|
||||
"Error in beginning Perf Cntr Session");
|
||||
(void)status; // Avoid warning
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -82,6 +83,7 @@ PostDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
|
||||
dispParam->aql_translation_handle);
|
||||
assert((status == HSA_STATUS_SUCCESS) &&
|
||||
"Error in endning Perf Cntr Session");
|
||||
(void)status; // Avoid warning
|
||||
}
|
||||
|
||||
/// Constructor of the class
|
||||
@@ -192,6 +194,8 @@ void RocrPerfCntrApp::RegisterCallbacks(hsa_queue_t* queue) {
|
||||
status = hsa_ext_tools_set_callback_arguments(queue, &perfMgr_, &perfMgr_);
|
||||
assert((status == HSA_STATUS_SUCCESS) &&
|
||||
"Error in registering Pre & Post Dispatch Callback Params");
|
||||
|
||||
(void)status; // Avoid warning
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -176,8 +176,7 @@ uint64_t PerfTimer::MeasureTSCFreqHz() {
|
||||
|
||||
do {
|
||||
tscTicksEnd = __rdtscp(&unused);
|
||||
}
|
||||
while (tscTicksEnd - tscTicksBegin < 1000000000);
|
||||
} while (tscTicksEnd - tscTicksBegin < 1000000000);
|
||||
|
||||
uint64_t coarseEndUs = CoarseTimestampUs();
|
||||
|
||||
|
||||
@@ -91,6 +91,7 @@ class PerfTimer {
|
||||
void ResetTimer(int index);
|
||||
|
||||
/// Read the time value of the timer associated with the provided index.
|
||||
/// Units are seconds
|
||||
/// \param[in] index Index of the timer to read
|
||||
/// \returns double Value of the timer
|
||||
double ReadTimer(int index);
|
||||
|
||||
@@ -254,6 +254,11 @@ set(BITCODE_LIBS "${BITCODE_LIBS} ${BITCODE_PREF}/ocml.amdgcn.bc")
|
||||
set(CL_FILE_LIST "${PROJECT_SOURCE_DIR}/binary_search/binary_search_kernels.cl")
|
||||
process_sample("binary_search")
|
||||
|
||||
# P2P Memory Access
|
||||
set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
|
||||
set(CL_FILE_LIST "${PROJECT_SOURCE_DIR}/p2p_mem_access/p2p_mem_access_kernels.cl")
|
||||
process_sample("p2p_mem_access")
|
||||
|
||||
# RocR Info
|
||||
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/rocrinfo ROCR_INFO_SOURCES)
|
||||
add_executable(rocrinfo ${ROCR_INFO_SOURCES})
|
||||
|
||||
@@ -25,10 +25,6 @@ cmake_minimum_required(VERSION 2.8.0)
|
||||
# 4) Set env. variable TARGET_DEVICE to indicate gpu type (e.g., gfx803,
|
||||
# gfx900, ...)
|
||||
#
|
||||
# 5) Set env. variables AMDHSAFIN_DIR and AMDHSAFIN_TARGET to the
|
||||
# directory containing the amd finalizer executable and version
|
||||
# (e.g, 8:0:3) respectively.
|
||||
#
|
||||
# Building rocrtst Suite
|
||||
#
|
||||
# 1) Create build folder e.g. "rocrtst/build" - any name will do
|
||||
@@ -91,6 +87,32 @@ else()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (DEFINED ENV{OPENCL_DIR})
|
||||
set(CLANG $ENV{OPENCL_DIR}/bin/x86_64/clang)
|
||||
set(OPENCL_DIR $ENV{OPENCL_DIR})
|
||||
if (NOT EXISTS ${CLANG})
|
||||
message("ERROR: path to clang (${CLANG}) is not valid. Is env. variable OPENCL_DIR correct?")
|
||||
return()
|
||||
endif()
|
||||
|
||||
if (DEFINED ENV{OPENCL_VER})
|
||||
set(OPENCL_VER $ENV{OPENCL_VER})
|
||||
else()
|
||||
message("OPENCL_VER environment variable is not set. Using default")
|
||||
set(OPENCL_VER "2.0")
|
||||
endif()
|
||||
else()
|
||||
message("WARNING: OPENCL_DIR environment variable is not set. Kernels will not be built.")
|
||||
endif()
|
||||
|
||||
if (DEFINED ENV{TARGET_DEVICE})
|
||||
set(TARGET_DEVICE $ENV{TARGET_DEVICE})
|
||||
else()
|
||||
message("ERROR: TARGET_DEVICE environment variable is not defined.")
|
||||
message("Please define a valid clang target (e.g., gfx803, gfx900,...).")
|
||||
return()
|
||||
endif()
|
||||
|
||||
#
|
||||
# Set Name for rocrtst Suite Project
|
||||
#
|
||||
@@ -105,17 +127,22 @@ project (${ROCRTST_SUITE_NAME})
|
||||
# Build Type: Debug Vs Release, 32 Vs 64
|
||||
# Compiler Version, etc
|
||||
#
|
||||
MESSAGE("")
|
||||
MESSAGE("-------------IS64BIT: " ${IS64BIT})
|
||||
MESSAGE("-----------BuildType: " ${BUILD_TYPE})
|
||||
MESSAGE("------------Compiler: " ${CMAKE_CXX_COMPILER})
|
||||
MESSAGE("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
|
||||
MESSAGE("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
|
||||
MESSAGE("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
|
||||
MESSAGE("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
|
||||
MESSAGE("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
|
||||
MESSAGE("")
|
||||
message("")
|
||||
message("Build Configuration:")
|
||||
message("-------------IS64BIT: " ${IS64BIT})
|
||||
message("-----------BuildType: " ${BUILD_TYPE})
|
||||
message("------------Compiler: " ${CMAKE_CXX_COMPILER})
|
||||
message("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
|
||||
message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
|
||||
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
|
||||
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
|
||||
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
|
||||
message("-------Target Device: " ${TARGET_DEVICE})
|
||||
message("----------Clang path: " ${CLANG})
|
||||
message("-------OpenCL version " ${OPENCL_VER})
|
||||
message("")
|
||||
|
||||
set(KERNELS_DIR ${PROJECT_SOURCE_DIR}/kernels)
|
||||
#
|
||||
# Set the build type based on user input
|
||||
#
|
||||
@@ -148,7 +175,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic")
|
||||
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic")
|
||||
|
||||
|
||||
#
|
||||
@@ -164,7 +191,7 @@ endif()
|
||||
# Add compiler flags to include symbol information for debug builds
|
||||
#
|
||||
if(ISDEBUG)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0")
|
||||
endif()
|
||||
MESSAGE("ISDEBUG STEP:Done")
|
||||
|
||||
@@ -201,10 +228,11 @@ MESSAGE(${ROCRTST_LIBS})
|
||||
set(ROCRTST "rocrtst${ONLY64STR}")
|
||||
|
||||
#
|
||||
# Sorce files for building rocrtst
|
||||
# Source files for building rocrtst
|
||||
#
|
||||
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} performanceSources)
|
||||
|
||||
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/test_common testCommonSources)
|
||||
aux_source_directory(${ROCRTST_ROOT}/suites/test_common testCommonSources)
|
||||
|
||||
# Header file include path
|
||||
|
||||
@@ -212,11 +240,51 @@ include_directories(${ROCR_INC_DIR})
|
||||
include_directories(${ROCRTST_ROOT})
|
||||
include_directories(${ROCRTST_ROOT}/gtest/include)
|
||||
|
||||
# Build rules
|
||||
# Use this function to build any samples that have kernels to be built
|
||||
function(build_kernel S_NAME)
|
||||
set(SNAME_KERNEL "${S_NAME}_kernels.hsaco")
|
||||
set(TARG_NAME "${S_NAME}_hsaco")
|
||||
set(HSACO_TARG_LIST ${HSACO_TARG_LIST} ${TARG_NAME} PARENT_SCOPE)
|
||||
separate_arguments(CLANG_ARG_LIST UNIX_COMMAND "-target amdgcn-amdh-amdhsa -mcpu=${TARGET_DEVICE} -include ${OPENCL_DIR}/include/opencl-c.h ${BITCODE_LIBS} -cl-std=CL${OPENCL_VER} ${CL_FILE_LIST} -o ${PROJECT_BINARY_DIR}/${SNAME_KERNEL}")
|
||||
add_custom_target(${TARG_NAME} ${CLANG} ${CLANG_ARG_LIST}
|
||||
COMMENT "BUILDING KERNEL..."
|
||||
VERBATIM)
|
||||
endfunction(build_kernel)
|
||||
|
||||
add_executable(${ROCRTST} ${performanceSources} ${common_srcs})
|
||||
######################
|
||||
# Kernel Build Section
|
||||
######################
|
||||
set(KERN_SUFFIX "kernels.hsaco")
|
||||
set(BITCODE_PREF "-Xclang -mlink-bitcode-file -Xclang")
|
||||
set(BITCODE_PREF "${BITCODE_PREF} ${OPENCL_DIR}/lib/x86_64/bitcode")
|
||||
|
||||
set(COMMON_BITCODE_LIBS "${BITCODE_PREF}/opencl.amdgcn.bc")
|
||||
set(COMMON_BITCODE_LIBS "${COMMON_BITCODE_LIBS} ${BITCODE_PREF}/ockl.amdgcn.bc")
|
||||
|
||||
# To build kernels, repeat the pattern used below for the P2P kernel; this
|
||||
# pattern sets the bitcode libraries required by the kernel which will be
|
||||
# used in the build_kernel() call, which builds the kernel.
|
||||
|
||||
# Test Case Template example
|
||||
set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
|
||||
set(CL_FILE_LIST "${KERNELS_DIR}/test_case_template_kernels.cl")
|
||||
build_kernel("test_case_template")
|
||||
|
||||
# P2P Memory Access
|
||||
#set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
|
||||
#set(CL_FILE_LIST "${KERNELS_DIR}/p2p_mem_access_kernels.cl")
|
||||
#build_kernel("p2p_mem_access")
|
||||
|
||||
# Dispatch Time
|
||||
set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
|
||||
set(CL_FILE_LIST "${KERNELS_DIR}/dispatch_time_kernels.cl")
|
||||
build_kernel("dispatch_time")
|
||||
|
||||
# Build rules
|
||||
add_executable(${ROCRTST} ${performanceSources} ${common_srcs} ${testCommonSources})
|
||||
target_link_libraries(${ROCRTST} ${ROCRTST_LIBS} c stdc++ dl pthread rt)
|
||||
|
||||
add_custom_target(rocrtst_kernels DEPENDS ${HSACO_TARG_LIST})
|
||||
INSTALL(TARGETS ${ROCRTST}
|
||||
ARCHIVE DESTINATION ${PROJECT_BINARY_DIR}/lib
|
||||
LIBRARY DESTINATION ${PROJECT_BINARY_DIR}/lib
|
||||
|
||||
@@ -1,258 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "cp_process_time.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "common/os.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include "hsa/hsa_ext_finalize.h"
|
||||
#include <algorithm>
|
||||
|
||||
namespace {

// Value handed to the kernel through its "iteration" argument; Run() expects
// the kernel to report the same value back through "result".
constexpr uint64_t kKernelIterations = 10000;

// Sentinel stored in the result slot before the first dispatch, so a kernel
// that never writes its result fails the comparison in Run().
constexpr uint64_t kTestBadValue = 1234567891234567891;

// Expected bounds (in microseconds) for the measured dispatch overhead.
// The original note recorded, for gfx803, a max of 18.208uS and a min of
// 7.82uS.
constexpr double kGfx803MinOverhead = 7.78;
constexpr double kGfx803MaxOverhead = 21.064;

// Fractional slack applied to both bounds before a measurement is reported
// as out of range.
constexpr double kOverheadToleranceFactor = 0.25;

}  // namespace
|
||||
|
||||
// Builds the test on top of the common BaseRocR state and clears the stored
// mean, which DisplayResults() treats as "nothing measured yet" while it is
// still zero.
CpProcessTime::CpProcessTime()
    : BaseRocR(), mean_(0.0) {
  // kernel_name_ = "&__simple_kernel";
}
|
||||
|
||||
// Nothing to release here; runtime resources are torn down in Close().
CpProcessTime::~CpProcessTime() = default;
|
||||
|
||||
void CpProcessTime::SetUp() {
|
||||
hsa_status_t err;
|
||||
set_kernel_file_name("simple_kernel.o");
|
||||
set_kernel_name("&__simple_kernel");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
// Create a queue
|
||||
hsa_queue_t* q = nullptr;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
ASSERT_NE(q, nullptr);
|
||||
set_main_queue(q);
|
||||
|
||||
// Set profiling
|
||||
err = hsa_amd_profiling_set_profiler_enabled(q, 1);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Load and finalize the kernel
|
||||
err = rocrtst::LoadKernelFromObjFile(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().workgroup_size_x = 1;
|
||||
aql().grid_size_x = 1;
|
||||
}
|
||||
|
||||
size_t CpProcessTime::RealIterationNum() {
|
||||
return num_iteration() * 1.2 + 1;
|
||||
}
|
||||
|
||||
void CpProcessTime::Run() {
|
||||
hsa_status_t err;
|
||||
std::vector<double> timer;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
hsa_agent_t* cpu_dev = cpu_device();
|
||||
|
||||
ASSERT_NE(gpu_dev, nullptr);
|
||||
ASSERT_NE(cpu_dev, nullptr);
|
||||
uint32_t it = RealIterationNum();
|
||||
|
||||
typedef struct args_t {
|
||||
uint64_t* iteration;
|
||||
uint64_t* result;
|
||||
} args;
|
||||
|
||||
err = rocrtst::SetPoolsTypical(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
uint64_t* iter = NULL;
|
||||
uint64_t* result = NULL;
|
||||
err = rocrtst::AllocAndAllowAccess(this, sizeof(uint64_t), cpu_pool(),
|
||||
(void**)&iter);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = rocrtst::AllocAndAllowAccess(this, sizeof(uint64_t), cpu_pool(),
|
||||
(void**)&result);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
*iter = kKernelIterations;
|
||||
*result = kTestBadValue;
|
||||
|
||||
args k_args;
|
||||
|
||||
k_args.iteration = (uint64_t*)iter;
|
||||
k_args.result = (uint64_t*)result;
|
||||
|
||||
err = rocrtst::AllocAndSetKernArgs(this, &k_args, sizeof(args));
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
rocrtst::WriteAQLToQueue(this);
|
||||
|
||||
void * q_base_addr = main_queue()->base_address;
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
// aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
|
||||
// HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
||||
// aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
|
||||
// HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
||||
|
||||
for (uint32_t i = 0; i < it; i++) {
|
||||
// uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue());
|
||||
uint64_t que_idx = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
//Get timing stamp an ring the doorbell to dispatch the kernel.
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
rocrtst::AtomicSetPacketHeader(aql_header, aql().setup,
|
||||
&((hsa_kernel_dispatch_packet_t*)(q_base_addr))[que_idx & queue_mask]);
|
||||
|
||||
hsa_queue_store_write_index_relaxed(main_queue(), (que_idx + 1));
|
||||
hsa_signal_store_relaxed(main_queue()->doorbell_signal, que_idx);
|
||||
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
// hsa_signal_value_t value = hsa_signal_wait_scacquire(signal(),
|
||||
// HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
// value should be 0, or we timed-out
|
||||
//ASSERT_EQ(value, 0);
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
hsa_amd_profiling_dispatch_time_t dispatch_time;
|
||||
err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(),
|
||||
&dispatch_time);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
uint64_t ticks = dispatch_time.end - dispatch_time.start;
|
||||
uint64_t freq;
|
||||
|
||||
err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
|
||||
double execution_time = (double) ticks / freq * 1e6; //convert to us
|
||||
double temp = p_timer.ReadTimer(id) * 1e6;
|
||||
double cp_time = temp - execution_time;
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << "Total:" << temp << "uS ";
|
||||
std::cout << "Execution:" << execution_time << "uS ";
|
||||
std::cout << "Overhead:" << cp_time << "uS ";
|
||||
std::cout << "Overhead %:" << cp_time / execution_time * 100 << std::endl;
|
||||
#endif
|
||||
|
||||
EXPECT_EQ(kKernelIterations, *result);
|
||||
timer.push_back(cp_time);
|
||||
|
||||
//Assume overhead will not deviate too much from previously recorded
|
||||
// values. If this does happen and there is not a performance bug,
|
||||
// modify these constants
|
||||
|
||||
//This may need to be made specific to the gpu being used
|
||||
EXPECT_GT(cp_time, kGfx803MinOverhead * (1 - kOverheadToleranceFactor));
|
||||
EXPECT_LT(cp_time, kGfx803MaxOverhead * (1 + kOverheadToleranceFactor));
|
||||
|
||||
*result = 0;
|
||||
}
|
||||
|
||||
//Abandon the first result and after sort, delete the last 2% value
|
||||
timer.erase(timer.begin());
|
||||
std::sort(timer.begin(), timer.end());
|
||||
|
||||
timer.erase(timer.begin() + num_iteration(), timer.end());
|
||||
mean_ = rocrtst::CalcMean(timer);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void CpProcessTime::DisplayResults() const {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (mean_ == 0.0) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "===================================================="
|
||||
<< std::endl;
|
||||
std::cout << "The average Command Processor processing time is: " << mean_
|
||||
<< "us" << std::endl;
|
||||
std::cout << "===================================================="
|
||||
<< std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
void CpProcessTime::Close() {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
@@ -1,91 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_CP_PROCESS_TIME_H__
|
||||
#define __ROCRTST_SRC_CP_PROCESS_TIME_H__
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "common/common.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include <vector>
|
||||
|
||||
//@Brief: This class is defined to measure the mean latency of launching
|
||||
//an empty kernel
|
||||
|
||||
class CpProcessTime: public rocrtst::BaseRocR, public PerfBase {
|
||||
public:
|
||||
//@Brief: Constructor
|
||||
CpProcessTime();
|
||||
|
||||
//@Brief: Destructor
|
||||
virtual ~CpProcessTime();
|
||||
|
||||
//@Brief: Set up the environment for the test
|
||||
virtual void SetUp();
|
||||
|
||||
//@Brief: Run the test case
|
||||
virtual void Run();
|
||||
|
||||
//@Brief: Display results we got
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
//@Brief: Clean up and close the runtime
|
||||
virtual void Close();
|
||||
|
||||
private:
|
||||
//@Brief: Get actual iteration number
|
||||
virtual size_t RealIterationNum();
|
||||
|
||||
//@Brief: Store the size of queue
|
||||
uint32_t queue_size_;
|
||||
|
||||
//@Brief: The mean time of CP Processing
|
||||
double mean_;
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,220 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "cu_masking.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
// Builds the test on top of the common BaseRocR state, zeroes the staging
// AQL packet and puts the members this class owns into a known empty state.
CuMasking::CuMasking()
    : BaseRocR(), mean_(0.0), cu_(nullptr) {
  memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t));
  group_region_.handle = 0;
}
|
||||
|
||||
// Nothing to release here; runtime resources are torn down in Close().
CuMasking::~CuMasking() = default;
|
||||
|
||||
void CuMasking::SetUp() {
|
||||
hsa_status_t err;
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
hsa_agent_t* cpu_dev = cpu_device();
|
||||
|
||||
set_kernel_file_name("cu_masking.o");
|
||||
set_kernel_name("&main");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Create a queue
|
||||
hsa_queue_t* q = nullptr;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
set_main_queue(q);
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
// Fill up the kernel packet except header
|
||||
// aql().completion_signal=signal();
|
||||
// TODO: Will delete manual_input later
|
||||
uint32_t cu_count = 0;
|
||||
err = hsa_agent_get_info(*gpu_dev,
|
||||
(hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
std::cout << "CU# is: " << cu_count << std::endl;
|
||||
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().workgroup_size_x = 1024;
|
||||
|
||||
//manual_input * group_input; // workgroup_max_size;
|
||||
aql().grid_size_x = (long long) 1024 * 640 * 640;
|
||||
|
||||
// TODO:Manully set the max cu number to 8, the api return 10
|
||||
std::cout << "Grid size is: " << aql().grid_size_x << std::endl;
|
||||
|
||||
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev,
|
||||
rocrtst::FindGlobalPool, &cpu_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
}
|
||||
|
||||
size_t CuMasking::RealIterationNum() {
|
||||
return num_iteration() * 1.2 + 1;
|
||||
}
|
||||
|
||||
void CuMasking::Run() {
|
||||
hsa_status_t err;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<double> timer;
|
||||
|
||||
typedef struct args_t {
|
||||
uint32_t* iteration;
|
||||
uint32_t* result;
|
||||
} local_args;
|
||||
|
||||
uint32_t* iter = NULL;
|
||||
uint32_t* result = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(uint32_t), 0,
|
||||
(void**) &iter);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(uint32_t), 0,
|
||||
(void**) &result);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
*iter = 0xff;
|
||||
*result = 0;
|
||||
|
||||
err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, iter);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, result);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
local_args* kernarg = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), kernarg_size(), 0,
|
||||
(void**) &kernarg);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, kernarg);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
kernarg->iteration = iter;
|
||||
kernarg->result = result;
|
||||
|
||||
aql().kernarg_address = kernarg;
|
||||
|
||||
// Obtain the current queue write inex.
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
// Write the aql packet at the calculate queue index address.
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
|
||||
// Set CU mask
|
||||
uint32_t cu_mask = 0;
|
||||
#if 0
|
||||
std::cout << "Enter cu mask value:" << std::endl;
|
||||
ASSERT_NE(scanf("%d", &cu_mask), EOF);
|
||||
#else
|
||||
cu_mask = 0xAAAAAAAA;
|
||||
#endif
|
||||
|
||||
std::cout << "Value of bit array is: 0x" << std::hex << cu_mask << std::endl;
|
||||
err = hsa_amd_queue_cu_set_mask(main_queue(), 32, &cu_mask);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
void *q_base_addr = main_queue()->base_address;
|
||||
// Write the aql packet at the calculate queue index address.
|
||||
aql().completion_signal = signal();
|
||||
((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask] = aql();
|
||||
|
||||
// Get timing stamp an ring the doorbell to dispatch the kernel.
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
|
||||
double t1 = p_timer.ReadTimer(id) * 1e6;
|
||||
std::cout << "Execution time after setting cu masking: " << t1 << std::endl;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void CuMasking::DisplayResults() const {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "===================================================="
|
||||
<< std::endl;
|
||||
|
||||
std::cout << "====================================================="
|
||||
<< std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
void CuMasking::Close() {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
@@ -1,103 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_CU_MASKING_TIME_H__
|
||||
#define __ROCRTST_SRC_CU_MASKING_TIME_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "common/common.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include "hsa/hsa_ext_finalize.h"
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
//@Brief: This class is defined to measure the mean latency of launching
|
||||
//an empty kernel
|
||||
|
||||
class CuMasking: public rocrtst::BaseRocR, public PerfBase {
|
||||
public:
|
||||
//@Brief: Constructor
|
||||
CuMasking();
|
||||
|
||||
//@Brief: Destructor
|
||||
virtual ~CuMasking();
|
||||
|
||||
//@Brief: Set up the environment for the test
|
||||
virtual void SetUp();
|
||||
|
||||
//@Brief: Run the test case
|
||||
virtual void Run();
|
||||
|
||||
//@Brief: Display results we got
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
//@Brief: Clean up and close the runtime
|
||||
virtual void Close();
|
||||
|
||||
private:
|
||||
//@Brief: Get actual iteration number
|
||||
virtual size_t RealIterationNum();
|
||||
|
||||
//@Brief: Store the size of queue
|
||||
uint32_t queue_size_;
|
||||
|
||||
//@Brief: The mean time of CP Processing
|
||||
double mean_;
|
||||
|
||||
//@Brief: The group memory region
|
||||
hsa_region_t group_region_;
|
||||
|
||||
//@Brief: Pointer to cu_id array
|
||||
uint32_t* cu_;
|
||||
|
||||
uint32_t manual_input;
|
||||
uint32_t group_input;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,293 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "device_load_bandwidth.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "common/os.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <algorithm>
|
||||
|
||||
// TODO: The validation code has problems to debug
|
||||
#if 0
|
||||
// Fills in_data with num_ops consecutive runs of num_thrds values, where the
// value written for thread t is t << 2. The fill is repeated num_loops times
// over the same region, so nothing is written when num_loops is zero.
static void initGlobalReadBuffer(uint32_t* in_data, uint32_t num_thrds,
                                 uint32_t num_ops, uint32_t num_loops) {
  for (uint32_t pass = 0; pass < num_loops; ++pass) {
    // Every pass starts again at the beginning of the buffer.
    uint32_t slot = 0;

    for (uint32_t op = 0; op < num_ops; ++op) {
      // Write the value to be read by each thread.
      for (uint32_t tid = 0; tid < num_thrds; ++tid) {
        in_data[slot++] = tid << 2;
      }
    }
  }
}
|
||||
|
||||
// Checks that data[t] equals (t << 2) * scale for every t below num_thrds.
// On the first mismatch the expected and actual values are printed together
// with kernel_name and the offending index, and false is returned. Returns
// true when every element matches.
static bool verifyGlobalLoadKernel(uint32_t* data, uint32_t num_thrds,
                                   uint32_t scale, const char* kernel_name) {
  for (uint32_t tid = 0; tid < num_thrds; ++tid) {
    const uint32_t expected = (tid << 2) * scale;

    if (data[tid] == expected) {
      continue;
    }

    std::cout << "Value expected = " << expected << std::endl;
    std::cout << "Value of data = " << data[tid] << std::endl;

    std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: " << tid
              << std::endl;
    std::cout << kernel_name << ": VALUE @ Bad index: " << data[tid]
              << std::endl;
    std::cout << std::endl;
    return false;
  }

#ifdef DEBUG
  std::cout << kernel_name << ": Passed validation" << std::endl;
  std::cout << std::endl;
#endif

  return true;
}
|
||||
#endif
|
||||
|
||||
// Constructor
|
||||
// Builds the test on top of the common BaseRocR state and puts every member
// this class owns into a known empty state, including the two device buffer
// pointers that Close() hands to hsa_amd_memory_pool_free().
DeviceLoadBandwidth::DeviceLoadBandwidth() :
  BaseRocR() {

  set_group_size(0);
  set_enable_interrupt(false);

  num_group_ = 0;
  num_cus_ = 0;

  kernel_loop_count_ = 0;
  mean_ = 0.0;
  data_size_ = 0;

  // Run() may return before allocating these (for example when the profile
  // check fails), so they must not be left indeterminate.
  in_data_ = nullptr;
  out_data_ = nullptr;

  set_requires_profile(HSA_PROFILE_BASE);
}
|
||||
|
||||
// Destructor
|
||||
// Nothing to release here; buffers and runtime state are freed in Close().
DeviceLoadBandwidth::~DeviceLoadBandwidth() = default;
|
||||
|
||||
// Set up the test environment
|
||||
void DeviceLoadBandwidth::SetUp() {
|
||||
SetWorkItemNum();
|
||||
|
||||
set_kernel_file_name("sysMemRead.o");
|
||||
set_kernel_name("&__SysMemLoad");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
//Create a queue with max number size
|
||||
hsa_queue_t* q = nullptr;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
ASSERT_NE(q, nullptr);
|
||||
set_main_queue(q);
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
uint32_t total_work_items = num_cus_ * num_group_ * group_size();
|
||||
|
||||
//Fill up part of aql
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().workgroup_size_x = group_size();
|
||||
aql().grid_size_x = total_work_items;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Run the test
|
||||
void DeviceLoadBandwidth::Run() {
|
||||
hsa_status_t err;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t total_workitems = num_cus_ * num_group_ * group_size();
|
||||
|
||||
uint32_t ops_thrd = 32;
|
||||
uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint64_t);
|
||||
uint64_t total_ops = (uint64_t) total_workitems * ops_thrd;
|
||||
uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint64_t);
|
||||
|
||||
data_size_ = in_data_size;
|
||||
|
||||
err = rocrtst::SetPoolsTypical(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = rocrtst::AllocAndAllowAccess(this, in_data_size, device_pool(),
|
||||
(void**)&in_data_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
//uint32_t out_data_size = total_workitems * sizeof(uint64_t);
|
||||
uint32_t out_data_size = in_data_size;
|
||||
|
||||
err = rocrtst::AllocAndAllowAccess(this, out_data_size, device_pool(),
|
||||
(void**)&out_data_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
#if 0
|
||||
initGlobalReadBuffer(in_data_, total_workitems, ops_thrd, kernel_loop_count_);
|
||||
#endif
|
||||
|
||||
struct local_args_t {
|
||||
void* arg0;
|
||||
void* arg1;
|
||||
uint64_t arg2;
|
||||
void* arg3;
|
||||
} local_args;
|
||||
|
||||
local_args.arg0 = in_data_;
|
||||
local_args.arg1 = in_data_ + total_ops;
|
||||
local_args.arg2 = addr_step;
|
||||
local_args.arg3 = out_data_;
|
||||
|
||||
// Copy the kernel args structure into a registered memory block
|
||||
err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
std::vector<double> time;
|
||||
|
||||
rocrtst::WriteAQLToQueue(this);
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
void * q_base = main_queue()->base_address;
|
||||
|
||||
for (uint32_t i = 0; i < num_iteration(); i++) {
|
||||
uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue());
|
||||
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
rocrtst::AtomicSetPacketHeader(aql_header, aql().setup,
|
||||
&((hsa_kernel_dispatch_packet_t*)(q_base))[que_idx & queue_mask]);
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, que_idx);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << "." << std::flush;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// Verify the results
|
||||
uint32_t scale = kernel_loop_count_ * ops_thrd;
|
||||
verifyGlobalLoadKernel(out_data_, total_workitems, scale,
|
||||
kernel_name().c_str());
|
||||
#endif
|
||||
time.push_back(p_timer.ReadTimer(id));
|
||||
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
|
||||
time.erase(time.begin());
|
||||
std::sort(time.begin(), time.end());
|
||||
time.erase(time.begin() + num_iteration(), time.end());
|
||||
mean_ = rocrtst::CalcMean(time);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Free the input and output buffers, then run the common rocrtst cleanup.
void DeviceLoadBandwidth::Close() {
  // EXPECT (non-fatal) is used for the frees so that a failure to release
  // one buffer does not prevent the remaining cleanup from running.
  hsa_status_t err = hsa_amd_memory_pool_free(in_data_);
  EXPECT_EQ(err, HSA_STATUS_SUCCESS);

  err = hsa_amd_memory_pool_free(out_data_);
  EXPECT_EQ(err, HSA_STATUS_SUCCESS);

  // Common teardown is the last step; its failure is reported as fatal.
  err = rocrtst::CommonCleanUp(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
|
||||
|
||||
void DeviceLoadBandwidth::DisplayResults() const {
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "=======================================" << std::endl;
|
||||
std::cout << "Device Load Bandwidth: ";
|
||||
std::cout << data_size_ / mean_ / 1024 / 1024 / 1024 << "(GB/S)" << std::endl;
|
||||
std::cout << "=======================================" << std::endl;
|
||||
|
||||
return;
|
||||
}
|
||||
@@ -1,219 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "device_store_bandwidth.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
// Constructor
|
||||
// Start with every test parameter zeroed and both buffers unallocated;
// SetUp() and Run() fill in the real values.
DeviceStoreBandwidth::DeviceStoreBandwidth()
    : BaseRocR(),
      num_group_(0),
      num_cus_(0),
      kernel_loop_count_(0),
      mean_(0.0),
      data_size_(0),
      in_data_(nullptr),
      out_data_(nullptr) {
  // Settings that live in the BaseRocR part of the object.
  set_group_size(0);
  set_requires_profile(HSA_PROFILE_BASE);
}
|
||||
|
||||
// Destructor
|
||||
// Nothing to release here: buffers are freed explicitly in Close().
DeviceStoreBandwidth::~DeviceStoreBandwidth() = default;
|
||||
|
||||
// Set up the test environment
|
||||
void DeviceStoreBandwidth::SetUp() {
|
||||
SetWorkItemNum();
|
||||
|
||||
set_kernel_file_name("sysMemWrite.o");
|
||||
set_kernel_name("&__SysMemStore");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
//Create a queue with max number size
|
||||
hsa_queue_t* q = nullptr;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
ASSERT_NE(q, nullptr);
|
||||
set_main_queue(q);
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
uint32_t total_work_items = num_cus_ * num_group_ * group_size();
|
||||
|
||||
//Fill up part of aql
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().workgroup_size_x = group_size();
|
||||
aql().grid_size_x = total_work_items;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Run the test
|
||||
void DeviceStoreBandwidth::Run() {
|
||||
hsa_status_t err;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t total_workitems = num_cus_ * num_group_ * group_size();
|
||||
|
||||
uint32_t ops_thrd = 16;
|
||||
uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t);
|
||||
uint64_t total_ops = (uint64_t) total_workitems * kernel_loop_count_
|
||||
* ops_thrd;
|
||||
uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t);
|
||||
|
||||
data_size_ = in_data_size;
|
||||
|
||||
err = rocrtst::SetPoolsTypical(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = rocrtst::AllocAndAllowAccess(this, in_data_size, device_pool(),
|
||||
(void**)&in_data_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
uint32_t out_data_size = total_workitems * sizeof(uint32_t);
|
||||
|
||||
err = rocrtst::AllocAndAllowAccess(this, out_data_size, device_pool(),
|
||||
(void**)&out_data_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
struct local_args_t {
|
||||
void* arg0;
|
||||
void* arg1;
|
||||
uint64_t arg2;
|
||||
void* arg3;
|
||||
} local_args;
|
||||
|
||||
local_args.arg0 = in_data_;
|
||||
local_args.arg1 = in_data_ + total_ops;
|
||||
local_args.arg2 = addr_step;
|
||||
local_args.arg3 = out_data_;
|
||||
|
||||
// Copy the kernel args structure into a registered memory block
|
||||
err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
std::vector<double> time;
|
||||
|
||||
rocrtst::WriteAQLToQueue(this);
|
||||
|
||||
for (uint32_t i = 0; i < num_iteration(); i++) {
|
||||
uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue());
|
||||
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
void * q_base = main_queue()->base_address;
|
||||
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
rocrtst::AtomicSetPacketHeader(aql_header, aql().setup,
|
||||
&((hsa_kernel_dispatch_packet_t*)(q_base))[que_idx & queue_mask]);
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, que_idx);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << "." << std::flush;
|
||||
#endif
|
||||
|
||||
time.push_back(p_timer.ReadTimer(id));
|
||||
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
|
||||
time.erase(time.begin());
|
||||
mean_ = rocrtst::CalcMean(time);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Free the two buffers allocated in Run(), then run the common rocrtst
// cleanup.
void DeviceStoreBandwidth::Close() {
  // EXPECT (non-fatal) is used for the frees so that a failure to release
  // one buffer does not prevent the remaining cleanup from running.
  hsa_status_t err = hsa_amd_memory_pool_free(in_data_);
  EXPECT_EQ(err, HSA_STATUS_SUCCESS);

  err = hsa_amd_memory_pool_free(out_data_);
  EXPECT_EQ(err, HSA_STATUS_SUCCESS);

  // Common teardown is the last step; its failure is reported as fatal.
  err = rocrtst::CommonCleanUp(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
|
||||
|
||||
void DeviceStoreBandwidth::DisplayResults() const {
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
std::cout << "=======================================" << std::endl;
|
||||
std::cout << "Device Store Bandwidth: ";
|
||||
std::cout << data_size_ / mean_ / 1024 / 1024 / 1024 << "(GB/S)" << std::endl;
|
||||
std::cout << "=======================================" << std::endl;
|
||||
return;
|
||||
}
|
||||
@@ -1,119 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_INC_DEVICE_STORE_BANDWIDTH_H__
|
||||
#define __ROCRTST_SRC_INC_DEVICE_STORE_BANDWIDTH_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include <stdio.h>
|
||||
|
||||
//@Brief: Performance test that reports device store bandwidth. The work-item
// configuration comes from SetWorkItemNum(); the measurement itself is done
// in Run() and printed by DisplayResults().
class DeviceStoreBandwidth: public rocrtst::BaseRocR, public PerfBase {
 public:
  //@Brief: Constructor
  DeviceStoreBandwidth();

  //@Brief: Destructor
  ~DeviceStoreBandwidth();

  //@Brief: Set up the testing environment
  virtual void SetUp();

  //@Brief: Run the test case
  virtual void Run();

  //@Brief: Close and clean up the test environment
  virtual void Close();

  //@Brief: Display store bandwidth
  virtual void DisplayResults() const;

  //@Brief: Set work-item configuration. When INTERACTIVE is defined the
  // values are read from stdin; otherwise fixed defaults are used.
  void SetWorkItemNum() {
#ifdef INTERACTIVE
    // Initialised so that a failed scanf() cannot hand an indeterminate
    // value to set_group_size().
    uint32_t tmp = 0;

    // All targets are uint32_t, so the conversions use %u; the signed %d
    // specifier does not match an unsigned destination.
    printf("Please input the number of CUs you want to try:\n");
    scanf("%u", &num_cus_);

    printf("Please input the number of groups you want to try:\n");
    scanf("%u", &num_group_);

    printf("Please input the size of each group:\n");
    scanf("%u", &tmp);
    set_group_size(tmp);

    printf("Please input the number of kernel loop you want to try:\n");
    scanf("%u", &kernel_loop_count_);
#else
    num_cus_ = 32;
    num_group_ = 128;
    set_group_size(64);
    kernel_loop_count_ = 16;
#endif
    return;
  }

 private:
  //@Brief: number of group
  uint32_t num_group_;

  //@Brief: number of CUs
  uint32_t num_cus_;

  //@Brief: number of kernel loop
  uint32_t kernel_loop_count_;

  //@Brief: Mean execution time
  double mean_;

  //@Brief: data size for test
  uint64_t data_size_;

  //@Brief: First buffer allocated by Run(); passed to the kernel as arg0
  uint32_t* in_data_;

  //@Brief: Second buffer allocated by Run(); passed to the kernel as arg3
  uint32_t* out_data_;
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -43,7 +43,10 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "dispatch_time.h"
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
#include "suites/performance/dispatch_time.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/os.h"
|
||||
@@ -52,40 +55,68 @@
|
||||
#include "gtest/gtest.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_finalize.h"
|
||||
#include <algorithm>
|
||||
|
||||
DispatchTime::DispatchTime() :
|
||||
BaseRocR() {
|
||||
use_default_ = false;
|
||||
launch_single_ = false;
|
||||
DispatchTime::
|
||||
DispatchTime(bool defaultInterrupt, bool launchSingleKernel) : TestBase(),
|
||||
use_default_interupt_(defaultInterrupt),
|
||||
launch_single_(launchSingleKernel) {
|
||||
queue_size_ = 0;
|
||||
num_batch_ = 100000;
|
||||
memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t));
|
||||
single_default_mean_ = 0.0;
|
||||
single_interrupt_mean_ = 0.0;
|
||||
multi_default_mean_ = 0.0;
|
||||
multi_interrupt_mean_ = 0.0;
|
||||
dispatch_time_mean_ = 0.0;
|
||||
set_num_iteration(100);
|
||||
|
||||
set_kernel_file_name("dispatch_time_kernels.hsaco");
|
||||
set_kernel_name("empty_kernel");
|
||||
|
||||
std::string name;
|
||||
std::string desc;
|
||||
|
||||
name = "Average Dispatch Time";
|
||||
desc = "This test measures the time to handle AQL packets that "
|
||||
"do no work. Time is measured from when the packet is made available to"
|
||||
" the Command Processor to when the target agent notifies the host that "
|
||||
"the packet has been executed. ";
|
||||
|
||||
if (defaultInterrupt) {
|
||||
name += ", Default Interrupts";
|
||||
desc += "Interrupts are controlled by HSA_ENABLE_INTERRUPT environment "
|
||||
"variable. ";
|
||||
} else {
|
||||
name += ", Interrupts Enabled";
|
||||
desc += "Interrupts are enabled. ";
|
||||
}
|
||||
|
||||
if (launchSingleKernel) {
|
||||
name += ", Single Kernel";
|
||||
desc += " One kernel at a time is and executed.";
|
||||
} else {
|
||||
name += ", Multiple Kernels";
|
||||
desc += " Enough kernels to fill the queue are dispatched at one time";
|
||||
}
|
||||
|
||||
set_title(name);
|
||||
set_description(desc);
|
||||
}
|
||||
|
||||
DispatchTime::~DispatchTime() {
|
||||
|
||||
}
|
||||
|
||||
void DispatchTime::SetUp() {
|
||||
// If it indicates to use default signal, set env var properly
|
||||
if (use_default_) {
|
||||
hsa_status_t err;
|
||||
|
||||
// This need to happen before TestBase::SetUp()
|
||||
if (use_default_interupt_) {
|
||||
set_enable_interrupt(false);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
set_enable_interrupt(true);
|
||||
}
|
||||
|
||||
set_kernel_file_name("empty_kernel.o");
|
||||
set_kernel_name("&__Empty_kernel");
|
||||
TestBase::SetUp();
|
||||
// If it indicates to use default signal, set env var properly
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
err = SetDefaultAgents(this);
|
||||
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
@@ -105,24 +136,26 @@ void DispatchTime::SetUp() {
|
||||
num_batch_ = num_batch_ > size ? size : num_batch_;
|
||||
}
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
err = rocrtst::LoadKernelFromObjFile(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Fill up the kernel packet except header
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
err = rocrtst::InitializeAQLPacket(this, &aql());
|
||||
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
|
||||
|
||||
aql().workgroup_size_x = 1;
|
||||
aql().grid_size_x = 1;
|
||||
}
|
||||
|
||||
void DispatchTime::Run() {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
TestBase::Run();
|
||||
if (launch_single_) {
|
||||
RunSingle();
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
RunMulti();
|
||||
}
|
||||
}
|
||||
@@ -137,59 +170,59 @@ void DispatchTime::RunSingle() {
|
||||
int it = RealIterationNum();
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
|
||||
//queue should be empty
|
||||
// queue should be empty
|
||||
ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()),
|
||||
hsa_queue_load_write_index_scacquire(main_queue()));
|
||||
|
||||
void *q_base_addr = main_queue()->base_address;
|
||||
for (int i = 0; i < it; i++) {
|
||||
//Obtain the current queue write index.
|
||||
// Obtain the current queue write index.
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
ASSERT_LT(index, main_queue()->size + index);
|
||||
|
||||
//Write the aql packet at the calculated queue index address.
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
|
||||
|
||||
//Get timing stamp and ring the doorbell to dispatch the kernel.
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t *>(
|
||||
q_base_addr)[index & queue_mask] = aql();
|
||||
// Get timing stamp and ring the doorbell to dispatch the kernel.
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t *>(
|
||||
q_base_addr)[index & queue_mask].header |=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
//Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(aql().completion_signal,
|
||||
HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
|
||||
}
|
||||
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
timer.push_back(p_timer.ReadTimer(id));
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
hsa_signal_store_screlease(aql().completion_signal, 1);
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
#endif
|
||||
if (verbosity() >= VERBOSE_PROGRESS) {
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
if (verbosity() >= VERBOSE_PROGRESS) {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
//Abandon the first result and after sort, delete the last 2% value
|
||||
// Abandon the first result and after sort, delete the last 2% value
|
||||
timer.erase(timer.begin());
|
||||
std::sort(timer.begin(), timer.end());
|
||||
|
||||
timer.erase(timer.begin() + num_iteration(), timer.end());
|
||||
|
||||
if (use_default_) {
|
||||
single_default_mean_ = rocrtst::CalcMean(timer);
|
||||
}
|
||||
else {
|
||||
single_interrupt_mean_ = rocrtst::CalcMean(timer);
|
||||
}
|
||||
dispatch_time_mean_ = rocrtst::CalcMean(timer);
|
||||
|
||||
return;
|
||||
}
|
||||
@@ -199,72 +232,69 @@ void DispatchTime::RunMulti() {
|
||||
int it = RealIterationNum();
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
|
||||
//queue should be empty
|
||||
// queue should be empty
|
||||
ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()),
|
||||
hsa_queue_load_write_index_scacquire(main_queue()));
|
||||
|
||||
for (int i = 0; i < it; i++) {
|
||||
uint64_t* index = (uint64_t*) malloc(sizeof(uint64_t) * num_batch_);
|
||||
rocrtst::PerfTimer p_timer;
|
||||
|
||||
hsa_signal_store_screlease(signal(), num_batch_);
|
||||
for (int i = 0; i < it; i++) {
|
||||
uint64_t* index =
|
||||
reinterpret_cast<uint64_t*>(malloc(sizeof(uint64_t) * num_batch_));
|
||||
|
||||
hsa_signal_store_screlease(aql().completion_signal, num_batch_);
|
||||
|
||||
for (uint32_t j = 0; j < num_batch_; j++) {
|
||||
//index[j] = hsa_queue_add_write_index_scacq_screl(main_queue(), 1);
|
||||
// index[j] = hsa_queue_add_write_index_scacq_screl(main_queue(), 1);
|
||||
index[j] = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
//Write the aql packet at the calculated queue index address.
|
||||
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
|
||||
& queue_mask] = aql();
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
(reinterpret_cast<hsa_kernel_dispatch_packet_t*>((
|
||||
main_queue()->base_address)))[index[j] & queue_mask] = aql();
|
||||
|
||||
if (j == num_batch_ - 1) {
|
||||
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
|
||||
& queue_mask].header |= 1 << HSA_PACKET_HEADER_BARRIER;
|
||||
|
||||
//TODO: verify if the below is needed. I don't think it is. It should
|
||||
// already be initialized to signal().
|
||||
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
|
||||
& queue_mask].completion_signal = signal();
|
||||
(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
|
||||
main_queue()->base_address))[index[j] & queue_mask].header |=
|
||||
1 << HSA_PACKET_HEADER_BARRIER;
|
||||
}
|
||||
}
|
||||
|
||||
// Set packet header reversly; set all headers except the very first
|
||||
// one, for now.
|
||||
for (uint32_t j = num_batch_ - 1; j > 0; j--) {
|
||||
|
||||
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
|
||||
& queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
|
||||
(main_queue()->base_address))[index[j] & queue_mask].header |=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
}
|
||||
|
||||
//Get timing stamp and ring the doorbell to dispatch the kernel.
|
||||
rocrtst::PerfTimer p_timer;
|
||||
// Get timing stamp and ring the doorbell to dispatch the kernel.
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
//Set the very first header...
|
||||
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[0]
|
||||
& queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
// Set the very first header...
|
||||
(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
|
||||
main_queue()->base_address))[index[0] & queue_mask].header |=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
|
||||
for (uint32_t j = 0; j < num_batch_; j++) {
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index[j]);
|
||||
}
|
||||
|
||||
//Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0,
|
||||
UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
|
||||
;
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(aql().completion_signal,
|
||||
HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) {
|
||||
}
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
timer.push_back(p_timer.ReadTimer(id));
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
hsa_signal_store_screlease(aql().completion_signal, 1);
|
||||
|
||||
free(index);
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
#endif
|
||||
if (verbosity() >= VERBOSE_PROGRESS) {
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
@@ -275,57 +305,34 @@ void DispatchTime::RunMulti() {
|
||||
|
||||
timer.erase(timer.begin() + num_iteration(), timer.end());
|
||||
|
||||
if (use_default_) {
|
||||
multi_default_mean_ = rocrtst::CalcMean(timer);
|
||||
}
|
||||
else {
|
||||
multi_interrupt_mean_ = rocrtst::CalcMean(timer);
|
||||
}
|
||||
dispatch_time_mean_ = rocrtst::CalcMean(timer);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void DispatchTime::DisplayResults() const {
|
||||
void DispatchTime::DisplayTestInfo(void) {
|
||||
TestBase::DisplayTestInfo();
|
||||
}
|
||||
|
||||
void DispatchTime::DisplayResults(void) const {
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "===================================================="
|
||||
<< std::endl;
|
||||
TestBase::DisplayResults();
|
||||
|
||||
if (use_default_) {
|
||||
if (launch_single_) {
|
||||
std::cout << "Single_Default: " << single_default_mean_ * 1e6
|
||||
<< std::endl;
|
||||
}
|
||||
else {
|
||||
std::cout << "Multi_Default: "
|
||||
<< multi_default_mean_ * 1e6 / num_batch_ << std::endl;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (launch_single_) {
|
||||
std::cout << "Single_Interrupt: " << single_interrupt_mean_ * 1e6
|
||||
<< std::endl;
|
||||
}
|
||||
else {
|
||||
std::cout << "Multi_Interrupt: "
|
||||
<< multi_interrupt_mean_ * 1e6 / num_batch_ << std::endl;
|
||||
}
|
||||
std::cout << "Average Time to Completion: ";
|
||||
if (launch_single_) {
|
||||
std::cout << dispatch_time_mean_ * 1e6;
|
||||
} else {
|
||||
std::cout << dispatch_time_mean_ * 1e6 / num_batch_;
|
||||
}
|
||||
|
||||
std::cout << "====================================================="
|
||||
<< std::endl;
|
||||
|
||||
std::cout << " uS" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
void DispatchTime::Close() {
|
||||
hsa_status_t err;
|
||||
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
TestBase::Close();
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -43,83 +43,68 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_DISPATCH_TIME_H__
|
||||
#define __ROCRTST_SRC_DISPATCH_TIME_H__
|
||||
#include "perf_common/perf_base.h"
|
||||
#ifndef ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_
|
||||
#define ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_
|
||||
#include <vector>
|
||||
|
||||
#include "suites/test_common/test_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "common/common.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include <vector>
|
||||
|
||||
//@Brief: This class is defined to measure the mean latency of launching
|
||||
//an empty kernel
|
||||
// @Brief: This class is defined to measure the mean latency of launching
|
||||
// an empty kernel
|
||||
|
||||
class DispatchTime: public rocrtst::BaseRocR, public PerfBase {
|
||||
class DispatchTime : public TestBase {
|
||||
public:
|
||||
//@Brief: Constructor
|
||||
DispatchTime();
|
||||
// @Brief: Constructor
|
||||
DispatchTime(bool defaultInterrupt, bool launchSingleKernel);
|
||||
|
||||
//@Brief: Destructor
|
||||
virtual ~DispatchTime();
|
||||
// @Brief: Destructor
|
||||
virtual ~DispatchTime(void);
|
||||
|
||||
//@Brief: Set up the environment for the test
|
||||
virtual void SetUp();
|
||||
// @Brief: Set up the environment for the test
|
||||
virtual void SetUp(void);
|
||||
|
||||
//@Brief: Run the test case
|
||||
virtual void Run();
|
||||
// @Brief: Run the test case
|
||||
virtual void Run(void);
|
||||
|
||||
//@Brief: Display results we got
|
||||
virtual void DisplayResults() const;
|
||||
// @Brief: Display results we got
|
||||
virtual void DisplayResults(void) const;
|
||||
|
||||
//@Brief: Clean up and close the runtime
|
||||
virtual void Close();
|
||||
// @Brief: Display information about what this test does
|
||||
virtual void DisplayTestInfo(void);
|
||||
|
||||
//@Brief: Choose if use default signal or not
|
||||
void UseDefaultSignal(bool use_default = true) {
|
||||
use_default_ = use_default;
|
||||
}
|
||||
|
||||
//@Brief; Choose to launch a single kernels or not
|
||||
void LaunchSingleKernel(bool launch_single = true) {
|
||||
launch_single_ = launch_single;
|
||||
}
|
||||
// @Brief: Clean up and close the runtime
|
||||
virtual void Close(void);
|
||||
|
||||
private:
|
||||
//@Brief: Get actual iteration number
|
||||
virtual size_t RealIterationNum();
|
||||
// @Brief: Get actual iteration number
|
||||
virtual size_t RealIterationNum(void);
|
||||
|
||||
//@Brief: Launch single packet each time
|
||||
virtual void RunSingle();
|
||||
// @Brief: Launch single packet each time
|
||||
virtual void RunSingle(void);
|
||||
|
||||
//@Brief: Launch multiple packets each time
|
||||
virtual void RunMulti();
|
||||
// @Brief: Launch multiple packets each time
|
||||
virtual void RunMulti(void);
|
||||
|
||||
//@Brief: Indicate if use default signal or not
|
||||
bool use_default_;
|
||||
// @Brief: Indicate if use default signal or not
|
||||
bool use_default_interupt_;
|
||||
|
||||
//@Brief: Indicate if launch single kernel or not
|
||||
// @Brief: Indicate if launch single kernel or not
|
||||
bool launch_single_;
|
||||
|
||||
//@Brief: Store the size of queue
|
||||
// @Brief: Store the size of queue
|
||||
uint32_t queue_size_;
|
||||
|
||||
//@Brief: Number of packets in a batch
|
||||
// @Brief: Number of packets in a batch
|
||||
uint32_t num_batch_;
|
||||
|
||||
//@Brief: Time of single default signal dispatch time
|
||||
double single_default_mean_;
|
||||
|
||||
//@Brief: Time of single interrupt signal dispatch time
|
||||
double single_interrupt_mean_;
|
||||
|
||||
//@Brief: Time of multi default signal dispatch time
|
||||
double multi_default_mean_;
|
||||
|
||||
//@Brief: Time of multi interrupt signal dispatch time
|
||||
double multi_interrupt_mean_;
|
||||
// @Brief: Ave. dispatch time
|
||||
double dispatch_time_mean_;
|
||||
|
||||
char* orig_iterrupt_env_;
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif // ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_
|
||||
|
||||
|
||||
@@ -1,351 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "flush_latency.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "common/os.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <algorithm>
|
||||
|
||||
// Base work-item count; FlushLatency::SetUp() uses 30% of this value as the
// dispatch grid size.
// NOTE(review): 1204 looks like a transposition of 1024 - confirm the intended value.
static const int kWorkItem = 1024 * 1204;
|
||||
// Constructor
|
||||
// Zero-initialises every test parameter and declares that the test needs the
// HSA base profile. The real work-item configuration is filled in later by
// SetWorkItemNum(), which SetUp() calls.
FlushLatency::FlushLatency() :
  BaseRocR(),
  group_size_(0),  // this class's own member; it is separate from the value
                   // stored through BaseRocR::set_group_size() below
  num_group_(0),
  num_cus_(0),
  kernel_loop_count_(0),
  mean_(0.0),
  data_size_(0) {
  set_group_size(0);
  set_requires_profile(HSA_PROFILE_BASE);
}
|
||||
|
||||
// Destructor
|
||||
// Nothing to release here; runtime resources are torn down in Close().
FlushLatency::~FlushLatency() = default;
|
||||
|
||||
// Set up the test environment
|
||||
void FlushLatency::SetUp() {
|
||||
hsa_status_t err;
|
||||
|
||||
SetWorkItemNum();
|
||||
|
||||
set_kernel_file_name("flush_latency.o");
|
||||
set_kernel_name("&main");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
//Create a queue with max number size
|
||||
hsa_queue_t* q;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
set_main_queue(q);
|
||||
|
||||
//Enable profiling
|
||||
err = hsa_amd_profiling_set_profiler_enabled(main_queue(), 1);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
uint32_t total_work_items = kWorkItem * 0.3;
|
||||
|
||||
//Fill up part of aql
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().workgroup_size_x = group_size();
|
||||
aql().grid_size_x = total_work_items;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Run the test
|
||||
void FlushLatency::Run() {
|
||||
hsa_status_t err;
|
||||
hsa_amd_memory_pool_t cpu_pool;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
hsa_agent_t* cpu_dev = cpu_device();
|
||||
|
||||
err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool,
|
||||
&device_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
|
||||
ASSERT_NE(device_pool().handle, 0);
|
||||
|
||||
cpu_pool.handle = 0;
|
||||
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
|
||||
&cpu_pool);
|
||||
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
|
||||
ASSERT_NE(cpu_pool.handle, 0);
|
||||
|
||||
#if DEBUG
|
||||
std::cout << "Device Pool Properties:" << std::endl;
|
||||
err = rocrtst::DumpMemoryPoolInfo(device_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
std::cout << "Global Pool Properties:" << std::endl;
|
||||
err = rocrtst::DumpMemoryPoolInfo(cpu_pool);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
#endif
|
||||
uint32_t out_data_size = 1024 * 1024 * sizeof(uint32_t);
|
||||
|
||||
std::vector<double> time_none;
|
||||
std::vector<double> time_release;
|
||||
|
||||
std::vector < uint64_t > time_none_stamp;
|
||||
std::vector < uint64_t > time_release_stamp;
|
||||
|
||||
//Query system timestamp frequency
|
||||
uint64_t freq;
|
||||
err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
void* out = NULL;
|
||||
uint32_t* out_data;
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
typedef struct local_args_t {
|
||||
void* arg0;
|
||||
} args;
|
||||
|
||||
// Warm up
|
||||
uint16_t header = 0;
|
||||
header |= HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
||||
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
||||
header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
aql().header = header;
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
|
||||
(void**) &out_data);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
args* kern_ptr = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0,
|
||||
(void**) &kern_ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
kern_ptr->arg0 = out_data;
|
||||
|
||||
aql().kernarg_address = kern_ptr;
|
||||
|
||||
// Obtain the current queue write index
|
||||
int64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
void *q_base_addr = main_queue()->base_address;
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
|
||||
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
|
||||
(void**) &out_data);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
args* kern_ptr = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0,
|
||||
(void**) &kern_ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
kern_ptr->arg0 = out_data;
|
||||
|
||||
aql().kernarg_address = kern_ptr;
|
||||
|
||||
// Obtain the current queue write index
|
||||
int64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
|
||||
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
hsa_amd_profiling_dispatch_time_t dispatch_time;
|
||||
err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(),
|
||||
&dispatch_time);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
uint64_t sys_start = 0;
|
||||
uint64_t sys_end = 0;
|
||||
err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
|
||||
dispatch_time.start, &sys_start);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
|
||||
dispatch_time.end, &sys_end);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
uint64_t stamp = dispatch_time.end - dispatch_time.start;
|
||||
double execution_time = (double) stamp / freq * 1e6; // convert to us.
|
||||
|
||||
time_none.push_back(execution_time);
|
||||
time_none_stamp.push_back(stamp);
|
||||
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
|
||||
if (out != NULL) {
|
||||
err = hsa_memory_free(out);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
out = out_data;
|
||||
out_data = NULL;
|
||||
}
|
||||
|
||||
header = 0;
|
||||
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
||||
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
||||
header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
aql().header = header;
|
||||
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
|
||||
(void**) &out_data);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
args* kern_ptr = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0,
|
||||
(void**) &kern_ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
kern_ptr->arg0 = out_data;
|
||||
|
||||
aql().kernarg_address = kern_ptr;
|
||||
|
||||
// Obtain the current queue write index
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
|
||||
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
|
||||
hsa_amd_profiling_dispatch_time_t dispatch_time;
|
||||
err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(),
|
||||
&dispatch_time);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
uint64_t sys_start = 0;
|
||||
uint64_t sys_end = 0;
|
||||
err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
|
||||
dispatch_time.start, &sys_start);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
|
||||
dispatch_time.end, &sys_end);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
uint64_t stamp = dispatch_time.end - dispatch_time.start;
|
||||
double execution_time = (double) stamp / freq * 1e6; // convert to us.
|
||||
time_release.push_back(execution_time);
|
||||
time_release_stamp.push_back(stamp);
|
||||
|
||||
if (out != NULL) {
|
||||
err = hsa_memory_free(out);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
out = out_data;
|
||||
out_data = NULL;
|
||||
}
|
||||
|
||||
std::sort(time_none.begin(), time_none.end());
|
||||
std::sort(time_release.begin(), time_release.end());
|
||||
|
||||
time_none.erase(time_none.begin(), time_none.begin() + 50);
|
||||
time_none.erase(time_none.end() - 50, time_none.end());
|
||||
time_release.erase(time_release.begin(), time_release.begin() + 50);
|
||||
time_release.erase(time_release.end() - 50, time_release.end());
|
||||
|
||||
mean_ = rocrtst::CalcMean(time_none, time_release);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void FlushLatency::Close() {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void FlushLatency::DisplayResults() const {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << std::endl << "======================================="
|
||||
<< std::endl;
|
||||
std::cout << "Average cache flush overhead: " << mean_ << "uS"
|
||||
<< std::endl;
|
||||
std::cout << "=======================================" << std::endl;
|
||||
return;
|
||||
}
|
||||
@@ -1,122 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_INC_FLUSH_LATENCY_H__
|
||||
#define __ROCRTST_SRC_INC_FLUSH_LATENCY_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include <stdio.h>
|
||||
|
||||
// Test that measures the overhead of a system-scope release fence (cache
// flush) on kernel dispatches.
class FlushLatency: public rocrtst::BaseRocR, public PerfBase {
 public:
  //@Brief: Constructor
  FlushLatency();

  //@Brief: Destructor
  ~FlushLatency();

  //@Brief: Set up the testing environment
  virtual void SetUp();

  //@Brief: Run the test case
  virtual void Run();

  //@Brief: Close and clean up the test environment
  virtual void Close();

  //@Brief: Display the average cache flush overhead computed by Run()
  virtual void DisplayResults() const;

  //@Brief: Set work-item configuration
  // With INTERACTIVE defined the values are read from stdin, otherwise fixed
  // defaults are used. In both cases the group size is stored in this class
  // and also passed to set_group_size().
  void SetWorkItemNum() {
#ifdef INTERACTIVE
    // Zero-initialised so that a failed read does not pass an indeterminate
    // value on to set_group_size().
    uint32_t tmp = 0;
    int i;

    // "%u" matches the uint32_t targets.
    printf("Please input the number of CUs you want to try:\n");
    i = scanf("%u", &num_cus_);

    printf("Please input the number of groups you want to try:\n");
    i = scanf("%u", &num_group_);

    printf("Please input the size of each group:\n");
    i = scanf("%u", &tmp);
    group_size_ = tmp;
    set_group_size(tmp);

    printf("Please input the number of kernel loop you want to try:\n");
    i = scanf("%u", &kernel_loop_count_);
    (void)i;  // scanf results are not acted upon
#else
    num_cus_ = 32;
    num_group_ = 128;
    group_size_ = 256;
    // group_size_ above is this class's own member. Code that asks the base
    // class through group_size() only sees the value passed to
    // set_group_size(), so publish it there as well.
    set_group_size(group_size_);
    kernel_loop_count_ = 16;
#endif
    return;
  }

 private:
  //@Brief: number of work item in one group
  uint32_t group_size_;

  //@Brief: number of group
  uint32_t num_group_;

  //@Brief: number of CUs
  uint32_t num_cus_;

  //@Brief: number of kernel loop
  uint32_t kernel_loop_count_;

  //@Brief: Mean execution time
  double mean_;

  //@Brief: data size for test
  uint64_t data_size_;
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,502 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "hsa_info.h"
|
||||
|
||||
// Callback passed to hsa_iterate_agents(): queries and prints one agent.
static hsa_status_t get_agent_info(hsa_agent_t, void*);
|
||||
|
||||
// Callback passed to hsa_amd_agent_iterate_memory_pools(): prints one pool.
static hsa_status_t get_pool_info(hsa_amd_memory_pool_t, void*);
|
||||
|
||||
// Number of agents visited so far; incremented by get_agent_info() and used
// to label its output.
static int agent_number = 0;
|
||||
// Set in HsaInfo::SetUp() when HSA_VENDOR_AMD starts with '1'; enables the
// "Agent Peak GFLOPS" line in get_agent_info().
static bool output_amd = false;
|
||||
|
||||
//@Brief: Map to store the peak FLOPS for different agent
// The value is printed as "Agent Peak GFLOPS". Keys must match the entries
// of agent_names character for character, because get_agent_info() looks
// the reported agent name up verbatim.
std::map<std::string, double> flops_table = {
  {"Kaveri CPU", 118.4},
  {"Spectre", 737.0},
  {"Carrizo CPU", 67.2},
  {"Carrizo GPU", 819.2}
};
|
||||
|
||||
//@Brief: Vector to store the agent_names
// get_agent_info() prints a peak-GFLOPS line only for agents whose name
// equals one of these entries, and then uses that name as the key into
// flops_table; the spellings therefore have to match the map keys.
std::vector<std::string> agent_names = {
  "Kaveri CPU",
  "Spectre",
  "Carrizo CPU",
  "Carrizo GPU"
};
|
||||
|
||||
// No state of its own to initialise; the base class is default-constructed.
HsaInfo::HsaInfo() = default;
|
||||
|
||||
// Nothing to release here; runtime clean-up happens in Close().
HsaInfo::~HsaInfo() = default;
|
||||
|
||||
void HsaInfo::SetUp() {
|
||||
// Get Env Var to determine if output AMD specific info
|
||||
char* EnvVar = rocrtst::GetEnv("HSA_VENDOR_AMD");
|
||||
|
||||
if (NULL != EnvVar) {
|
||||
output_amd = ('1' == *EnvVar);
|
||||
}
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void HsaInfo::Run() {
|
||||
hsa_status_t err;
|
||||
// Get the system info first
|
||||
// Get version info
|
||||
uint16_t major, minor;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &major);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &minor);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Get timestamp frequency
|
||||
uint64_t timestamp_frequency = 0;
|
||||
err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY,
|
||||
×tamp_frequency);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Get maximum duration of a signal wait operation
|
||||
uint64_t max_wait = 0;
|
||||
err = hsa_system_get_info(HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT, &max_wait);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Get Endianness of the system
|
||||
hsa_endianness_t endianness;
|
||||
err = hsa_system_get_info(HSA_SYSTEM_INFO_ENDIANNESS, &endianness);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Get machine model info
|
||||
hsa_machine_model_t machine_model;
|
||||
err = hsa_system_get_info(HSA_SYSTEM_INFO_MACHINE_MODEL, &machine_model);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Print out the results
|
||||
std::cout << "HSA System Info:" << std::endl;
|
||||
std::cout << "Runtime Version: " << major <<
|
||||
"." << minor << std::endl;
|
||||
std::cout << "System Timestamp Frequency: " <<
|
||||
timestamp_frequency / 1e6 << "MHz" << std::endl;
|
||||
|
||||
std::cout << "Signal Max Wait Duration: " << max_wait
|
||||
<< "(number of timestamp)" << std::endl;
|
||||
std::cout << "Machine Model: ";
|
||||
|
||||
if (HSA_MACHINE_MODEL_SMALL == machine_model) {
|
||||
std::cout << "SMALL" << std::endl;
|
||||
}
|
||||
else if (HSA_MACHINE_MODEL_LARGE == machine_model) {
|
||||
std::cout << "LARGE" << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "System Endianness: ";
|
||||
|
||||
if (HSA_ENDIANNESS_LITTLE == endianness) {
|
||||
std::cout << "LITTLE" << std::endl;
|
||||
}
|
||||
else if (HSA_ENDIANNESS_BIG == endianness) {
|
||||
std::cout << "BIG" << std::endl;
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
|
||||
// Iterate every agent and get their info
|
||||
err = hsa_iterate_agents(get_agent_info, NULL);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
// Reports a failed HSA call on std::cout (line and file of the expansion
// site) and returns the status from the enclosing function. The macro expands
// to a braced block, so the call sites in this file use it without a trailing
// semicolon. Note that `err` is evaluated twice: once in the comparison and
// once in the return statement.
#define RET_IF_HSA_INFO_ERR(err) { \
|
||||
if ((err) != HSA_STATUS_SUCCESS) { \
|
||||
std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
|
||||
__FILE__ << std::endl; \
|
||||
return (err); \
|
||||
} \
|
||||
}
|
||||
|
||||
// Agent iteration callback: queries the properties of one agent and prints
// them to std::cout, followed by the agent's memory pools.
//
// agent - agent to describe
// data  - not used
//
// Returns HSA_STATUS_SUCCESS, or the status of the first failing query (the
// failure is reported by RET_IF_HSA_INFO_ERR). All general properties are
// queried before the first line is printed.
static hsa_status_t get_agent_info(hsa_agent_t agent, void* data) {
  int pool_number = 0;
  hsa_status_t err;

  // Increase the number of agent
  agent_number++;

  // ---- General properties: query phase ----

  // Agent name and vendor
  char name[64];
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name);
  RET_IF_HSA_INFO_ERR(err)

  char vendor_name[64];
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_VENDOR_NAME, &vendor_name);
  RET_IF_HSA_INFO_ERR(err)

  // Agent feature
  hsa_agent_feature_t agent_feature;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FEATURE, &agent_feature);
  RET_IF_HSA_INFO_ERR(err)

  // Profile supported by the agent
  hsa_profile_t agent_profile;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
  RET_IF_HSA_INFO_ERR(err)

  // Floating-point rounding mode
  hsa_default_float_rounding_mode_t float_rounding_mode;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE,
                           &float_rounding_mode);
  RET_IF_HSA_INFO_ERR(err)

  // Max number of queues
  uint32_t max_queue = 0;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUES_MAX, &max_queue);
  RET_IF_HSA_INFO_ERR(err)

  // Queue min size
  uint32_t queue_min_size = 0;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE,
                           &queue_min_size);
  RET_IF_HSA_INFO_ERR(err)

  // Queue max size
  uint32_t queue_max_size = 0;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
                           &queue_max_size);
  RET_IF_HSA_INFO_ERR(err)

  // Queue type
  hsa_queue_type_t queue_type;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_TYPE, &queue_type);
  RET_IF_HSA_INFO_ERR(err)

  // Agent node
  uint32_t node;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NODE, &node);
  RET_IF_HSA_INFO_ERR(err)

  // Device type
  hsa_device_type_t device_type;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
  RET_IF_HSA_INFO_ERR(err)

  // Cache sizes, one entry per level
  uint32_t cache_size[4];
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_CACHE_SIZE, cache_size);
  RET_IF_HSA_INFO_ERR(err)

  // Chip id
  uint32_t chip_id = 0;
  err = hsa_agent_get_info(agent,
                           (hsa_agent_info_t) HSA_AMD_AGENT_INFO_CHIP_ID,
                           &chip_id);
  RET_IF_HSA_INFO_ERR(err)

  // Cacheline size
  uint32_t cacheline_size = 0;
  err = hsa_agent_get_info(agent,
                           (hsa_agent_info_t) HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
                           &cacheline_size);
  RET_IF_HSA_INFO_ERR(err)

  // Max clock frequency
  uint32_t max_clock_freq = 0;
  err = hsa_agent_get_info(agent,
                           (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
                           &max_clock_freq);
  RET_IF_HSA_INFO_ERR(err)

  // Agent BDFID
  uint16_t bdf_id = 1;
  err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_BDFID,
                           &bdf_id);
  RET_IF_HSA_INFO_ERR(err)

  // Number of compute units
  uint32_t compute_unit = 0;
  err = hsa_agent_get_info(agent,
                           (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
                           &compute_unit);
  RET_IF_HSA_INFO_ERR(err)

  // ---- General properties: print phase ----

  std::cout << std::endl;
  std::cout << "Agent #" << agent_number << ":" << std::endl;
  std::cout << "Agent Name: " << name << std::endl;
  std::cout << "Agent Vendor Name: " << vendor_name << std::endl;

  const bool kernel_dispatch =
    (agent_feature & HSA_AGENT_FEATURE_KERNEL_DISPATCH) != 0;
  const bool agent_dispatch =
    (agent_feature & HSA_AGENT_FEATURE_AGENT_DISPATCH) != 0;

  const char* feature_line = "Agent Feature: Not Supported";
  if (kernel_dispatch && agent_dispatch) {
    feature_line = "Agent Feature: KERNEL_DISPATCH & AGENT_DISPATCH";
  } else if (kernel_dispatch) {
    feature_line = "Agent Feature: KERNEL_DISPATCH";
  } else if (agent_dispatch) {
    feature_line = "Agent Feature: AGENT_DISPATCH";
  }
  std::cout << feature_line << std::endl;

  switch (agent_profile) {
    case HSA_PROFILE_BASE:
      std::cout << "Agent Profile: BASE_PROFILE" << std::endl;
      break;
    case HSA_PROFILE_FULL:
      std::cout << "Agent Profile: FULL_PROFILE" << std::endl;
      break;
    default:
      std::cout << "Agent Profile: Not Supported" << std::endl;
      break;
  }

  switch (float_rounding_mode) {
    case HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO:
      std::cout << "Agent Floating Rounding Mode: ZERO" << std::endl;
      break;
    case HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR:
      std::cout << "Agent Floating Rounding Mode: NEAR" << std::endl;
      break;
    default:
      std::cout << "Agent Floating Rounding Mode: Not Supported" << std::endl;
      break;
  }

  std::cout << "Agent Max Queue Number: " << max_queue << std::endl;
  std::cout << "Agent Queue Min Size: " << queue_min_size << std::endl;
  std::cout << "Agent Queue Max Size: " << queue_max_size << std::endl;

  switch (queue_type) {
    case HSA_QUEUE_TYPE_MULTI:
      std::cout << "Agent Queue Type: MULTI" << std::endl;
      break;
    case HSA_QUEUE_TYPE_SINGLE:
      std::cout << "Agent Queue Type: SINGLE" << std::endl;
      break;
    default:
      std::cout << "Agent Queue Type: Not Supported" << std::endl;
      break;
  }

  std::cout << "Agent Node: " << node << std::endl;

  switch (device_type) {
    case HSA_DEVICE_TYPE_CPU:
      std::cout << "Agent Device Type: CPU" << std::endl;
      break;
    case HSA_DEVICE_TYPE_GPU: {
      std::cout << "Agent Device Type: GPU" << std::endl;
      // The ISA is queried for GPUs, but only the status of the call is
      // used; the handle itself is not printed.
      hsa_isa_t agent_isa;
      err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &agent_isa);
      RET_IF_HSA_INFO_ERR(err)
      break;
    }
    default:
      // Every other device type is reported as DSP.
      std::cout << "Agent Device Type: DSP" << std::endl;
      break;
  }

  std::cout << "Agent Cache Info:" << std::endl;
  for (int level = 0; level < 4; ++level) {
    // Levels reported with size 0 are left out.
    if (cache_size[level] == 0) {
      continue;
    }
    std::cout << " $L" << level + 1 << ": " << cache_size[level] / 1024
              << "KB" << std::endl;
  }

  std::cout << "Agent Chip ID: " << chip_id << std::endl;
  std::cout << "Agent Cacheline Size: " << cacheline_size << std::endl;
  std::cout << "Agent Max Clock Frequency: " << max_clock_freq << "MHz"
            << std::endl;
  std::cout << "Agent BDFID: " << bdf_id << std::endl;
  std::cout << "Agent Compute Unit: " << compute_unit << std::endl;

  // Output Peak FLOPS and Peak Bandwidth if Env var is set
  // TODO: Fan, need to add BW
  if (output_amd) {
    const std::string agent_name = name;

    for (const std::string& known_name : agent_names) {
      if (known_name == agent_name) {
        std::cout << "Agent Peak GFLOPS: " << flops_table[agent_name]
                  << std::endl;
      }
    }
  }

  // ---- Properties that are only queried for kernel agents ----
  if (kernel_dispatch) {
    // Flag of fast_f16 operation
    bool fast_f16;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FAST_F16_OPERATION,
                             &fast_f16);
    RET_IF_HSA_INFO_ERR(err)

    // Wavefront size
    uint32_t wavefront_size = 0;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
                             &wavefront_size);
    RET_IF_HSA_INFO_ERR(err)

    // Max total number of work-items in a workgroup
    uint32_t workgroup_max_size = 0;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
                             &workgroup_max_size);
    RET_IF_HSA_INFO_ERR(err)

    // Max number of work-items of each dimension of a work-group
    uint16_t workgroup_max_dim[3];
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                             &workgroup_max_dim);
    RET_IF_HSA_INFO_ERR(err)

    // Max number of a grid per dimension
    hsa_dim3_t grid_max_dim;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM,
                             &grid_max_dim);
    RET_IF_HSA_INFO_ERR(err)

    // Max total number of work-items in a grid
    uint32_t grid_max_size = 0;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
                             &grid_max_size);
    RET_IF_HSA_INFO_ERR(err)

    // Max number of fbarriers per work group
    uint32_t fbarrier_max_size = 0;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE,
                             &fbarrier_max_size);
    RET_IF_HSA_INFO_ERR(err)

    // Print info for kernel agent; the F16 line appears only when the flag
    // is set.
    if (fast_f16) {
      std::cout << "Agent Fast F16 Operation: TRUE" << std::endl;
    }

    std::cout << "Agent Wavefront Size: " << wavefront_size << std::endl;
    std::cout << "Agent Workgroup Max Size: " << workgroup_max_size
              << std::endl;
    std::cout << "Agent Workgroup Max Size Per Dimension: " << std::endl;

    for (int dim = 0; dim < 3; ++dim) {
      std::cout << " Dim[" << dim << "]: " << workgroup_max_dim[dim]
                << std::endl;
    }

    std::cout << "Agent Grid Max Size: " << grid_max_size << std::endl;

    // Stop using the above kmt functions as per SWDEV-97044
    //
    uint32_t waves_per_cu = 0;
    err = hsa_agent_get_info(agent,
                             (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
                             &waves_per_cu);
    RET_IF_HSA_INFO_ERR(err)

    std::cout << "Agent Waves Per CU: " << waves_per_cu << std::endl;
    std::cout << "Agent Max Work-item Per CU: "
              << wavefront_size* waves_per_cu << std::endl;

    std::cout << "Agent Grid Max Size per Dimension:" << std::endl;

    const uint32_t grid_dims[3] = {grid_max_dim.x, grid_max_dim.y,
                                   grid_max_dim.z};
    for (int dim = 0; dim < 3; ++dim) {
      std::cout << " Dim[" << dim << "] " << grid_dims[dim] << std::endl;
    }

    std::cout << "Agent Max number Of fbarriers Per Workgroup: "
              << fbarrier_max_size << std::endl;
  }

  // Get pool info; get_pool_info() numbers the pools through pool_number.
  std::cout << "Agent Pool Info:" << std::endl;
  err = hsa_amd_agent_iterate_memory_pools(agent, get_pool_info, &pool_number);
  RET_IF_HSA_INFO_ERR(err)

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Implement region iteration function
|
||||
hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) {
|
||||
hsa_status_t err;
|
||||
int* p_int = reinterpret_cast<int*>(data);
|
||||
(*p_int)++;
|
||||
|
||||
std::cout << " Pool #" << *p_int << ":" << std::endl;
|
||||
|
||||
err = rocrtst::DumpMemoryPoolInfo(pool, 4);
|
||||
RET_IF_HSA_INFO_ERR(err)
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
#undef RET_IF_HSA_INFO_ERR
|
||||
|
||||
void HsaInfo::DisplayResults() const {
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void HsaInfo::Close() {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1,328 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "image_bandwidth.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_image.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <algorithm>
|
||||
|
||||
// Builds the image bandwidth test.
//
// num - number of timed samples to keep per image size. Run() performs
//       RealIterationNum() iterations, which is derived from this value.
//
// The bandwidth tables start out zeroed, the image is configured as a 2D
// RGBA image with unsigned 8-bit channels, and the test is flagged as
// requiring the full HSA profile.
ImageBandwidth::ImageBandwidth(size_t num) :
  BaseRocR(), import_bandwidth_ {0.0}, export_bandwidth_ {0.0},
  copy_bandwidth_ {0.0} {
  // The iteration count passed by the caller used to be dropped on the floor,
  // leaving whatever default the base class picked. Forward it explicitly.
  set_num_iteration(static_cast<int>(num));

  format_.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
  format_.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
  geometry_ = HSA_EXT_IMAGE_GEOMETRY_2D;

  set_requires_profile(HSA_PROFILE_FULL);
}
|
||||
|
||||
// Nothing to release here; resources are torn down in Close().
ImageBandwidth::~ImageBandwidth() = default;
|
||||
|
||||
// Edge length, in texels, of each square test image.
const size_t ImageBandwidth::Size[10] = {
  32,
  64,
  128,
  256,
  512,
  1024,
  2048,
  4096,
  8192,
  16384
};

// Label printed for each entry of Size[]: the byte size of a Size x Size
// image at 4 bytes per texel.
const char* const ImageBandwidth::Str[10] = {
  "4K",
  "16K",
  "64K",
  "256K",
  "1M",
  "4M",
  "16M",
  "64M",
  "256M",
  "1G"
};
|
||||
|
||||
void ImageBandwidth::SetUp() {
|
||||
hsa_status_t err;
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
// Find the global region
|
||||
err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindGlobalPool,
|
||||
&cpu_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void ImageBandwidth::Run() {
|
||||
hsa_status_t err;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
// Create timer for import, export and copy tests
|
||||
rocrtst::PerfTimer import_timer;
|
||||
rocrtst::PerfTimer export_timer;
|
||||
rocrtst::PerfTimer copy_timer;
|
||||
std::vector<double> import_image;
|
||||
std::vector<double> export_image;
|
||||
std::vector<double> copy_image;
|
||||
// Allocate image buffer in host memory
|
||||
uint32_t* image_buffer = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(),
|
||||
Size[i] * Size[i] * sizeof(uint32_t),
|
||||
0, (void**) &image_buffer);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// rocrtst::CommonCleanUp the image buffer
|
||||
for (uint32_t j = 0; j < Size[i] * Size[i]; j++) {
|
||||
image_buffer[j] = 0x10101010;
|
||||
}
|
||||
|
||||
// Prepare for 2D image creation
|
||||
hsa_ext_image_t image_handle;
|
||||
|
||||
hsa_ext_image_descriptor_t image_descriptor;
|
||||
image_descriptor.geometry = geometry_;
|
||||
image_descriptor.width = Size[i];
|
||||
image_descriptor.height = Size[i];
|
||||
image_descriptor.depth = 1;
|
||||
image_descriptor.array_size = 0;
|
||||
image_descriptor.format = format_;
|
||||
|
||||
// Check if device_ supports at least read and write operation on
|
||||
// image format
|
||||
uint32_t capability_mask;
|
||||
err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D,
|
||||
&format_, &capability_mask);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_WRITE)) {
|
||||
std::cout <<
|
||||
"Device does not support read and write operation on this kind of image!"
|
||||
<< std::endl;
|
||||
ASSERT_NE(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_WRITE, 0);
|
||||
}
|
||||
|
||||
// Get image info
|
||||
hsa_ext_image_data_info_t image_info;
|
||||
err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor,
|
||||
HSA_ACCESS_PERMISSION_RW, &image_info);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Allocate memory for image
|
||||
uintptr_t ptr_temp = 0;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(),
|
||||
image_info.size + image_info.alignment, 0, (void**) &ptr_temp);
|
||||
|
||||
// Align the image address
|
||||
uintptr_t mul = ptr_temp / image_info.alignment;
|
||||
void* ptr_image = (void*) ((mul + 1) * image_info.alignment);
|
||||
|
||||
// rocrtst::CommonCleanUp the image to 0
|
||||
hsa_amd_memory_fill(ptr_image, 0, image_info.size);
|
||||
|
||||
// Create image handle
|
||||
err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image,
|
||||
HSA_ACCESS_PERMISSION_RW, &image_handle);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Set import image region
|
||||
hsa_dim3_t range = {(uint32_t) Size[i], (uint32_t) Size[i], 1};
|
||||
|
||||
hsa_ext_image_region_t image_region;
|
||||
hsa_dim3_t image_offset = {0, 0, 0};
|
||||
image_region.offset = image_offset;
|
||||
image_region.range = range;
|
||||
|
||||
size_t iterations = RealIterationNum();
|
||||
|
||||
for (uint32_t it = 0; it < iterations; it++) {
|
||||
// Create a timer
|
||||
int index = import_timer.CreateTimer();
|
||||
|
||||
// Stamp at the beginning
|
||||
import_timer.StartTimer(index);
|
||||
|
||||
// Import image from host
|
||||
err = hsa_ext_image_import(*gpu_dev, image_buffer, 0, 0, image_handle,
|
||||
&image_region);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Stamp in the end
|
||||
import_timer.StopTimer(index);
|
||||
import_image.push_back(import_timer.ReadTimer(index));
|
||||
}
|
||||
|
||||
// Reset image_buffer
|
||||
hsa_amd_memory_fill(image_buffer, 0, Size[i] * Size[i] * sizeof(uint32_t));
|
||||
|
||||
for (uint32_t it = 0; it < iterations; it++) {
|
||||
// Export image
|
||||
// Stamp at the beginning
|
||||
int index = export_timer.CreateTimer();
|
||||
export_timer.StartTimer(index);
|
||||
|
||||
err = hsa_ext_image_export(*gpu_dev, image_handle, image_buffer, 0, 0,
|
||||
&image_region);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
export_timer.StopTimer(index);
|
||||
export_image.push_back(export_timer.ReadTimer(index));
|
||||
|
||||
// Check if the value is correct
|
||||
for (uint32_t j = 0; j < Size[i] * Size[i]; j++) {
|
||||
ASSERT_EQ(image_buffer[j], 0x10101010);
|
||||
}
|
||||
}
|
||||
|
||||
// Create another image for copy
|
||||
// Allocate memory for image
|
||||
uintptr_t ptr_temp2 = 0;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(),
|
||||
image_info.size + image_info.alignment, 0, (void**) &ptr_temp2);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Align the image address
|
||||
mul = ptr_temp2 / image_info.alignment;
|
||||
void* ptr_image2 = (void*) ((mul + 1) * image_info.alignment);
|
||||
|
||||
// rocrtst::CommonCleanUp the image to 0
|
||||
hsa_amd_memory_fill(ptr_image2, 0, image_info.size);
|
||||
|
||||
// Create image handle
|
||||
hsa_ext_image_t image_handle_copy;
|
||||
err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image2,
|
||||
HSA_ACCESS_PERMISSION_RW, &image_handle_copy);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
for (uint32_t it = 0; it < iterations; it++) {
|
||||
// Stamp at the beginning
|
||||
int index = copy_timer.CreateTimer();
|
||||
copy_timer.StartTimer(index);
|
||||
|
||||
err = hsa_ext_image_copy(*gpu_dev, image_handle, &image_offset,
|
||||
image_handle_copy, &image_offset, &range);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Stamp in the end
|
||||
copy_timer.StopTimer(index);
|
||||
copy_image.push_back(copy_timer.ReadTimer(index));
|
||||
|
||||
// Check if image data is correct
|
||||
hsa_amd_memory_fill(image_buffer, 0,
|
||||
Size[i] * Size[i] * sizeof(uint32_t));
|
||||
|
||||
// Export image
|
||||
err = hsa_ext_image_export(*gpu_dev, image_handle_copy, image_buffer,
|
||||
0, 0, &image_region);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Check if the value is correct
|
||||
for (uint32_t j = 0; j < Size[i] * Size[i]; j++) {
|
||||
ASSERT_EQ(image_buffer[j], 0x10101010);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Calculate Bandwidth
|
||||
import_bandwidth_[i] = CalculateBandwidth(import_image, Size[i]);
|
||||
export_bandwidth_[i] = CalculateBandwidth(export_image, Size[i]);
|
||||
copy_bandwidth_[i] = CalculateBandwidth(copy_image, Size[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Turns a list of per-iteration timings into a bandwidth figure.
//
// vec  - timing samples for one operation on one image size, in the order
//        they were taken. Modified in place: the first sample is dropped,
//        the rest are sorted and the slowest ones beyond num_iteration() are
//        discarded.
// size - edge length of the square image in texels (4 bytes per texel).
//
// Returns (size * size * 4) divided by the mean of the retained samples and
// scaled by 1024^3, or 0.0 when there are not enough samples to average.
double ImageBandwidth::CalculateBandwidth(std::vector<double>& vec,
                                          size_t size) {
  // One sample is consumed as warm-up, so at least two are needed. Without
  // this guard an empty vector hit undefined behavior in erase().
  if (vec.size() < 2) {
    return 0.0;
  }

  // Delete the first timer result, which is the warm-up run
  vec.erase(vec.begin());

  // Sort the results so the slowest samples end up at the back
  std::sort(vec.begin(), vec.end());

  // Keep at most num_iteration() of the fastest samples. Clamping avoids
  // forming an iterator past end() when fewer samples than that exist.
  const size_t keep = std::min<size_t>(vec.size(), num_iteration());
  vec.erase(vec.begin() + keep, vec.end());

  if (vec.empty()) {
    return 0.0;
  }

  double total = 0.0;

  for (double sample : vec) {
    total += sample;
  }

  const double mean = total / vec.size();

  return static_cast<double>(size) * size * 4 / mean / 1024 / 1024 / 1024;
}
|
||||
|
||||
void ImageBandwidth::DisplayResults() const {
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stdout, "==================================================="
|
||||
"=========================\n");
|
||||
|
||||
fprintf(stdout,
|
||||
" Size Import Export Copy\n");
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
fprintf(stdout,
|
||||
" %s %f(GB/s) %f(GB/s) %f(GB/s)\n",
|
||||
Str[i], import_bandwidth_[i], export_bandwidth_[i],
|
||||
copy_bandwidth_[i]);
|
||||
fprintf(stdout, "================================================="
|
||||
"===========================\n");
|
||||
}
|
||||
}
|
||||
|
||||
void ImageBandwidth::Close() {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
size_t ImageBandwidth::RealIterationNum() {
|
||||
return num_iteration() * 1.2 + 1;
|
||||
}
|
||||
@@ -1,99 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_IMAGE_BANDWIDTH_H__
|
||||
#define __ROCRTST_SRC_IMAGE_BANDWIDTH_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_image.h"
|
||||
#include <vector>
|
||||
|
||||
//@Brief: Performance test that records import, export and copy bandwidth
// of HSA images for ten image sizes
class ImageBandwidth: public rocrtst::BaseRocR, public PerfBase {
 public:
  //@Brief: Construct the ImageBandwidth test case
  // num is the requested iteration count (100 unless specified).
  // NOTE(review): verify that the definition forwards num to the base class.
  ImageBandwidth(size_t num = 100);

  //@Brief: Destroy the test case
  virtual ~ImageBandwidth();

  //@Brief: Prepare the environment for the measurement
  virtual void SetUp();

  //@Brief: Execute the measurement
  virtual void Run();

  //@Brief: Clean up and release the resources
  virtual void Close();

  //@Brief: Print the measured results
  virtual void DisplayResults() const;

 protected:
  //@Brief: Measured bandwidth, one slot per image size
  double import_bandwidth_[10];
  double export_bandwidth_[10];
  double copy_bandwidth_[10];

  //@Brief: Channel order and channel type of the test images
  hsa_ext_image_format_t format_;

  //@Brief: Geometry of the test images
  hsa_ext_image_geometry_t geometry_;

 private:
  //@Brief: Image sizes under test and the label printed for each of them
  static const size_t Size[10];
  static const char* const Str[10];

  //@Brief: Number of iterations that are really executed per measurement
  size_t RealIterationNum();

  //@Brief: Convert a list of timings for an image of the given size into
  // a bandwidth figure
  double CalculateBandwidth(std::vector<double>& vec, size_t size);
};
|
||||
|
||||
#endif
|
||||
@@ -1,270 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "image_load_bandwidth.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "hsa/hsa_ext_image.h"
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
|
||||
// Constructor of the class
|
||||
ImageLoadBandwidth::ImageLoadBandwidth() :
|
||||
BaseRocR() {
|
||||
load_bandwidth_ = 0.0;
|
||||
image_size_ = 0;
|
||||
|
||||
set_requires_profile (HSA_PROFILE_FULL);
|
||||
}
|
||||
|
||||
// Destructor of the class
|
||||
ImageLoadBandwidth::~ImageLoadBandwidth() {
|
||||
|
||||
}
|
||||
|
||||
// Set up the environment
|
||||
void ImageLoadBandwidth::SetUp() {
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
set_kernel_file_name("load_2d_image.o");
|
||||
set_kernel_name("&__OpenCL_load_2d_image_kernel");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
//Create a queue with max number size
|
||||
hsa_queue_t* q = main_queue();
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
//Fill up part of aql
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().setup = 0;
|
||||
aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Run the test
|
||||
void ImageLoadBandwidth::Run() {
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
hsa_agent_t* cpu_dev = cpu_device();
|
||||
|
||||
hsa_status_t err;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_ext_image_descriptor_t image_descriptor;
|
||||
image_descriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
|
||||
image_descriptor.width = 256;
|
||||
image_descriptor.height = 256;
|
||||
image_descriptor.depth = 1;
|
||||
image_descriptor.array_size = 0;
|
||||
image_descriptor.format.channel_type =
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
|
||||
image_descriptor.format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
|
||||
|
||||
hsa_ext_image_format_t image_format;
|
||||
image_format.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
|
||||
image_format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
|
||||
|
||||
// Check if device_ supports at least read only operation on image format
|
||||
uint32_t capability_mask;
|
||||
err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D,
|
||||
&image_format, &capability_mask);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY)) {
|
||||
ASSERT_FALSE(
|
||||
"Device does not support read and write operation on this kind of image!");
|
||||
}
|
||||
|
||||
// Get image info
|
||||
hsa_ext_image_data_info_t image_info;
|
||||
err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor,
|
||||
HSA_ACCESS_PERMISSION_RO, &image_info);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
image_size_ = image_info.size;
|
||||
|
||||
std::vector<double> time;
|
||||
|
||||
for (uint32_t i = 0; i < num_iteration(); i++) {
|
||||
#ifdef DEBUG
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
#endif
|
||||
// Allocate memory space for image
|
||||
// Find the global region
|
||||
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
|
||||
&cpu_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
uintptr_t ptr_temp = 0;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(),
|
||||
image_info.size + image_info.alignment,
|
||||
0, (void**) &ptr_temp);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, (void*) ptr_temp);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Align the image address
|
||||
uintptr_t mul = ptr_temp / image_info.alignment;
|
||||
void* ptr_image = (void*) ((mul + 1) * image_info.alignment);
|
||||
|
||||
// rocrtst::CommonCleanUp the image memory to 1
|
||||
err = hsa_amd_memory_fill(ptr_image, 1, image_info.size);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Create image handle
|
||||
hsa_ext_image_t image_handle;
|
||||
err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image,
|
||||
HSA_ACCESS_PERMISSION_RO, &image_handle);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Allocate and initialize the kernel argument
|
||||
typedef struct args_t {
|
||||
uint64_t arg0;
|
||||
int* arg1;
|
||||
int istart;
|
||||
int iend;
|
||||
int istep;
|
||||
} args;
|
||||
|
||||
int local_out = 5;
|
||||
int istart = 0;
|
||||
int iend = 64;
|
||||
int istep = 1;
|
||||
|
||||
args* kern_ptr = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
|
||||
(void**) &kern_ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
kern_ptr->arg0 = image_handle.handle;
|
||||
kern_ptr->arg1 = &local_out;
|
||||
kern_ptr->istart = istart;
|
||||
kern_ptr->iend = iend;
|
||||
kern_ptr->istep = istep;
|
||||
|
||||
aql().kernarg_address = kern_ptr;
|
||||
|
||||
// Obtain the current queue write index
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
void *q_base_addr = main_queue()->base_address;
|
||||
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
|
||||
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
hsa_signal_store_release(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
time.push_back(p_timer.ReadTimer(id));
|
||||
|
||||
hsa_signal_store_release(signal(), 1);
|
||||
|
||||
err = hsa_ext_image_destroy(*gpu_dev, image_handle);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_memory_deregister(ptr_image, image_info.size);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
hsa_amd_memory_pool_free((void*) ptr_temp);
|
||||
}
|
||||
|
||||
// Calculte the mean load time
|
||||
time.erase(time.begin());
|
||||
#ifdef DEBUG
|
||||
|
||||
for (uint32_t i = 0; i < time.size(); i++) {
|
||||
std::cout << time[i] << std::endl;
|
||||
}
|
||||
|
||||
#endif
|
||||
double mean_time = rocrtst::CalcMean(time);
|
||||
load_bandwidth_ = image_size_ / mean_time / 1024 / 1024 / 1024;
|
||||
|
||||
}
|
||||
|
||||
void ImageLoadBandwidth::Close() {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void ImageLoadBandwidth::DisplayResults() const {
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "======================================"
|
||||
"======================================" << std::endl;
|
||||
std::cout << " Image Size(bytes): LoadBandwidth(GB/S): "
|
||||
<< std::endl;
|
||||
std::cout << " " << image_size_ << " "
|
||||
<< load_bandwidth_ << std::endl;
|
||||
}
|
||||
|
||||
@@ -1,271 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "image_store_bandwidth.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "hsa/hsa_ext_image.h"
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
|
||||
// Constructor of the class
|
||||
ImageStoreBandwidth::ImageStoreBandwidth() :
|
||||
BaseRocR() {
|
||||
store_bandwidth_ = 0.0;
|
||||
store_bandwidth_ = 0.0;
|
||||
image_size_ = 0;
|
||||
|
||||
set_requires_profile (HSA_PROFILE_FULL);
|
||||
}
|
||||
|
||||
// Destructor of the class
|
||||
ImageStoreBandwidth::~ImageStoreBandwidth() {
|
||||
|
||||
}
|
||||
|
||||
// Set up the environment
|
||||
void ImageStoreBandwidth::SetUp() {
|
||||
|
||||
set_kernel_file_name("store_2d_image.o");
|
||||
set_kernel_name("&__OpenCL_store_2d_image_kernel");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
//Create a queue with max number size
|
||||
hsa_queue_t* q = nullptr;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
set_main_queue(q);
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
//Fill up part of aql
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().setup = 0;
|
||||
aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Run the test
|
||||
void ImageStoreBandwidth::Run() {
|
||||
hsa_status_t err;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
hsa_agent_t* cpu_dev = cpu_device();
|
||||
|
||||
hsa_ext_image_descriptor_t image_descriptor;
|
||||
image_descriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
|
||||
image_descriptor.width = 256;
|
||||
image_descriptor.height = 256;
|
||||
image_descriptor.depth = 1;
|
||||
image_descriptor.array_size = 0;
|
||||
image_descriptor.format.channel_type =
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
|
||||
image_descriptor.format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
|
||||
|
||||
hsa_ext_image_format_t image_format;
|
||||
image_format.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
|
||||
image_format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
|
||||
|
||||
// Check if device_ supports at least read only operation on image format
|
||||
uint32_t capability_mask;
|
||||
err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D,
|
||||
&image_format, &capability_mask);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY)) {
|
||||
std::cout <<
|
||||
"Device does not support read and write operation on this kind of image!"
|
||||
<< std::endl;
|
||||
ASSERT_NE(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY, 0);
|
||||
}
|
||||
|
||||
// Get image info
|
||||
hsa_ext_image_data_info_t image_info;
|
||||
err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor,
|
||||
HSA_ACCESS_PERMISSION_RW, &image_info);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
image_size_ = image_info.size;
|
||||
|
||||
std::vector<double> time;
|
||||
|
||||
for (uint32_t i = 0; i < num_iteration(); i++) {
|
||||
#ifdef DEBUG
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
#endif
|
||||
// Allocate memory space for image
|
||||
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
|
||||
&cpu_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
|
||||
uintptr_t ptr_temp = 0;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(),
|
||||
image_info.size + image_info.alignment,
|
||||
0, (void**) &ptr_temp);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Align the image address
|
||||
uintptr_t mul = ptr_temp / image_info.alignment;
|
||||
void* ptr_image = (void*) ((mul + 1) * image_info.alignment);
|
||||
|
||||
// rocrtst::CommonCleanUp the image memory to 0
|
||||
err = hsa_amd_memory_fill(ptr_image, 0, image_info.size);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Create image handle
|
||||
hsa_ext_image_t image_handle;
|
||||
err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image,
|
||||
HSA_ACCESS_PERMISSION_RO, &image_handle);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Allocate and initialize the kernel argument
|
||||
typedef struct args_t {
|
||||
uint64_t arg0;
|
||||
int istart;
|
||||
int iend;
|
||||
int istep;
|
||||
} args;
|
||||
|
||||
//int local_out = 5;
|
||||
int istart = 0;
|
||||
int iend = 64;
|
||||
int istep = 1;
|
||||
|
||||
args* kern_ptr = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
|
||||
(void**) &kern_ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
kern_ptr->arg0 = image_handle.handle;
|
||||
kern_ptr->istart = istart;
|
||||
kern_ptr->iend = iend;
|
||||
kern_ptr->istep = istep;
|
||||
|
||||
aql().kernarg_address = kern_ptr;
|
||||
|
||||
// Obtain the current queue write index
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
void *q_base_addr = main_queue()->base_address;
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
|
||||
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
hsa_signal_store_release(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
time.push_back(p_timer.ReadTimer(id));
|
||||
|
||||
hsa_signal_store_release(signal(), 1);
|
||||
|
||||
err = hsa_ext_image_destroy(*gpu_dev, image_handle);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_memory_deregister(ptr_image, image_info.size);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
hsa_amd_memory_pool_free(reinterpret_cast<void*>(ptr_temp));
|
||||
}
|
||||
|
||||
// Calculte the mean load time
|
||||
time.erase(time.begin());
|
||||
#ifdef DEBUG
|
||||
|
||||
for (size_t i = 0; i < time.size(); i++) {
|
||||
std::cout << time[i] << std::endl;
|
||||
}
|
||||
|
||||
#endif
|
||||
double mean_time = rocrtst::CalcMean(time);
|
||||
std::cout << "mean time: " << mean_time << std::endl;
|
||||
|
||||
store_bandwidth_ = image_size_ / mean_time / 1024 / 1024 / 1024;
|
||||
}
|
||||
|
||||
void ImageStoreBandwidth::Close() {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void ImageStoreBandwidth::DisplayResults() const {
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "============================================="
|
||||
"===============================" << std::endl;
|
||||
|
||||
std::cout << " Image Size(bytes): StoreBandwidth(GB/S): "
|
||||
<< std::cout;
|
||||
std::cout << " " << image_size_ << " "
|
||||
<< store_bandwidth_ << std::endl;
|
||||
}
|
||||
|
||||
Двоичные данные
Двоичный файл не отображается.
+5
-40
@@ -43,43 +43,8 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_HSA_INFO_H__
|
||||
#define __ROCRTST_SRC_HSA_INFO_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "common/common.h"
|
||||
#include "common/os.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
//@Brief: This is trying to replicate clinfo
|
||||
|
||||
class HsaInfo: public rocrtst::BaseRocR, public PerfBase {
|
||||
public:
|
||||
//@Brief: Constructor
|
||||
HsaInfo();
|
||||
|
||||
//@Brief: Destructor
|
||||
virtual ~HsaInfo();
|
||||
|
||||
//@Brief: Set up the environment for the test
|
||||
virtual void SetUp();
|
||||
|
||||
//@Brief: Run the test case
|
||||
virtual void Run();
|
||||
|
||||
//@Brief: Display results we got
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
//@Brief: Clean up and close the runtime
|
||||
virtual void Close();
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
// Kernel with an empty body: it performs no work and returns immediately.
__kernel void empty_kernel(void) {
}
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__Empty_kernel()
|
||||
{
|
||||
|
||||
ret;
|
||||
};
|
||||
|
||||
@@ -1,88 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
|
||||
/* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
|
||||
*
|
||||
* HSAF is granting you permission to use this software and documentation (if
|
||||
* any) (collectively, the "Materials") pursuant to the terms and conditions
|
||||
* of the Software License Agreement included with the Materials. If you do
|
||||
* not have a copy of the Software License Agreement, contact the HSA Foundation for a copy.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
|
||||
*/
|
||||
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
|
||||
/**
 * @brief Hsail kernel in which every work-item performs a single WRITE to a
 * user buffer.
 * Each work-item takes its flat absolute id, shifts it left by two (i.e.
 * multiplies it by sizeof(uint32_t)) to obtain a byte offset, and stores the
 * 32-bit value 1 at that offset from the base address of the buffer.
 *
 * @NOTE: The buffer must therefore hold at least one uint32_t for every
 * work-item in the global work grid.
 *
 * @param outAddr beginning byte address of the user buffer that is written
 * by the kernel threads
 *
 */
|
||||
prog kernel &main(kernarg_u64 %outAddr) {
|
||||
|
||||
pragma "AMD RTI", "ARGSTART:__SysMemLoad";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "uavid:8";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "ARGEND:__SysMemLoad";
|
||||
|
||||
ld_kernarg_u64 $d0, [%outAddr];
|
||||
|
||||
// Compute the absolute id of current thread
|
||||
// and shift it by two to get index into user
|
||||
// buffer to access for Read operation
|
||||
workitemflatabsid_u32 $s0;
|
||||
shl_u32 $s0, $s0, 2;
|
||||
cvt_u64_u32 $d4, $s0;
|
||||
|
||||
// Add index to base address of user buffer to obtain
|
||||
// effective address for access
|
||||
add_u64 $d0, $d0, $d4;
|
||||
|
||||
mov_u32 $s2, 1;
|
||||
|
||||
st_global_u32 $s2, [$d0];
|
||||
|
||||
};
|
||||
|
||||
-88
@@ -1,88 +0,0 @@
|
||||
module &m:1:0:$base:$large:$default;
|
||||
|
||||
/* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
|
||||
*
|
||||
* HSAF is granting you permission to use this software and documentation (if
|
||||
* any) (collectively, the "Materials") pursuant to the terms and conditions
|
||||
* of the Software License Agreement included with the Materials. If you do
|
||||
* not have a copy of the Software License Agreement, contact the HSA Foundation for a copy.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
|
||||
*/
|
||||
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
|
||||
/**
 * @brief Hsail kernel in which every work-item performs a single WRITE to a
 * user buffer.
 * Each work-item takes its flat absolute id, shifts it left by two (i.e.
 * multiplies it by sizeof(uint32_t)) to obtain a byte offset, and stores the
 * 32-bit value 1 at that offset from the base address of the buffer.
 *
 * @NOTE: The buffer must therefore hold at least one uint32_t for every
 * work-item in the global work grid.
 *
 * @param outAddr beginning byte address of the user buffer that is written
 * by the kernel threads
 *
 */
|
||||
prog kernel &main(kernarg_u64 %outAddr) {
|
||||
|
||||
pragma "AMD RTI", "ARGSTART:__SysMemLoad";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "uavid:8";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "ARGEND:__SysMemLoad";
|
||||
|
||||
ld_kernarg_u64 $d0, [%outAddr];
|
||||
|
||||
// Compute the absolute id of current thread
|
||||
// and shift it by two to get index into user
|
||||
// buffer to access for Read operation
|
||||
workitemflatabsid_u32 $s0;
|
||||
shl_u32 $s0, $s0, 2;
|
||||
cvt_u64_u32 $d4, $s0;
|
||||
|
||||
// Add index to base address of user buffer to obtain
|
||||
// effective address for access
|
||||
add_u64 $d0, $d0, $d4;
|
||||
|
||||
mov_u32 $s2, 1;
|
||||
|
||||
st_global_u32 $s2, [$d0];
|
||||
|
||||
};
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__OpenCL_load_2d_image_kernel(
|
||||
kernarg_rwimg %input,
|
||||
kernarg_u64 %result,
|
||||
kernarg_u32 %istart,
|
||||
kernarg_u32 %iend,
|
||||
kernarg_u32 %istep)
|
||||
{
|
||||
pragma "AMD RTI", "ARGSTART:__OpenCL_load_2d_image_kernel";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "uavid:8";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "ARGEND:__OpenCL_load_2d_image_kernel";
|
||||
|
||||
@__OpenCL_load_2d_image_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 1;
|
||||
workitemabsid_u32 $s1, 0;
|
||||
ld_kernarg_rwimg $d5, [%input];
|
||||
ld_kernarg_u32 $s2, [%istart];
|
||||
ld_kernarg_u32 $s3, [%iend];
|
||||
ld_kernarg_u32 $s4, [%istep];
|
||||
|
||||
add_u32 $s9, 0, 0; // reset s9 to zero
|
||||
@loop:
|
||||
add_u32 $s2, $s2, $s4;
|
||||
|
||||
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0); //(coordWidth, coordHeight)
|
||||
add_u32 $s9, $s9, $s5;
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
|
||||
add_u32 $s9, $s9, $s6;
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
|
||||
add_u32 $s9, $s9, $s7;
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
|
||||
add_u32 $s9, $s9, $s8;
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
|
||||
add_u32 $s9, $s9, $s5;
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
|
||||
add_u32 $s9, $s9, $s6;
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
|
||||
add_u32 $s9, $s9, $s7;
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
ldimage_v4_2d_u32_rwimg_u32 ($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
|
||||
ld_kernarg_align(8)_width(all)_u64 $d4, [%result];
|
||||
add_u32 $s9, $s9, $s8;
|
||||
|
||||
st_u32 $s9, [$d4];
|
||||
|
||||
//loop until we hit condition
|
||||
cmp_lt_b1_u32 $c0, $s2, $s3;
|
||||
cbr_b1 $c0, @loop;
|
||||
};
|
||||
@@ -1,37 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
|
||||
/* This kernel takes two memory locations: %iteration holds the address of
   the number of iterations to execute, and %results holds the address at
   which the result is stored.
   The loop increments a counter until it is no longer less than the
   iteration count and then stores the counter at the %results address.
   A successful run (for a non-zero iteration count) is when the value stored
   at %iteration is the same as the value stored at %results.
   NOTE: in this $full profile variant the first instruction of the kernel
   body is "ret", so the kernel returns before any of the code described
   above executes.
*/
|
||||
|
||||
prog kernel &__simple_kernel(
|
||||
kernarg_u64 %iteration,
|
||||
kernarg_u64 %results)
|
||||
{
|
||||
ret;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%iteration];
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%results];
|
||||
|
||||
ld_global_u32 $s1, [$d1];
|
||||
mov_u32 $s2, 0;
|
||||
|
||||
|
||||
@loop:
|
||||
add_u32 $s2, $s2, 1;
|
||||
cmp_lt_b1_u32 $c0, $s2, $s1;
|
||||
cbr_b1 $c0, @loop;
|
||||
|
||||
st_global_u32 $s2, [$d2];
|
||||
|
||||
ret;
|
||||
};
|
||||
|
||||
-28
@@ -1,28 +0,0 @@
|
||||
module &m:1:0:$base:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__simple_kernel(
|
||||
kernarg_u64 %iteration,
|
||||
kernarg_u64 %results)
|
||||
{
|
||||
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%iteration];
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%results];
|
||||
|
||||
ld_global_u32 $s1, [$d1];
|
||||
mov_u32 $s2, 0;
|
||||
|
||||
|
||||
@loop:
|
||||
add_u32 $s2, $s2, 1;
|
||||
cmp_lt_b1_u32 $c0, $s2, $s1;
|
||||
cbr_b1 $c0, @loop;
|
||||
|
||||
st_global_u32 $s2, [$d2];
|
||||
|
||||
ret;
|
||||
};
|
||||
|
||||
@@ -1,105 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__OpenCL_store_2d_image_kernel(
|
||||
kernarg_rwimg %output,
|
||||
kernarg_u32 %istart,
|
||||
kernarg_u32 %iend,
|
||||
kernarg_u32 %istep)
|
||||
{
|
||||
pragma "AMD RTI", "ARGSTART:__OpenCL_store_2d_image_kernel";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "uavid:8";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "ARGEND:__OpenCL_store_2d_image_kernel";
|
||||
|
||||
@__OpenCL_store_2d_image_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 1;
|
||||
workitemabsid_u32 $s1, 0;
|
||||
ld_kernarg_rwimg $d5, [%output];
|
||||
ld_kernarg_u32 $s2, [%istart];
|
||||
ld_kernarg_u32 $s3, [%iend];
|
||||
ld_kernarg_u32 $s4, [%istep];
|
||||
|
||||
mov_b32 $s5, 0;
|
||||
@loop:
|
||||
add_u32 $s2, $s2, $s4;
|
||||
add_u32 $s5, $s5, 1;
|
||||
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
add_u32 $s5, $s5, $s2;
|
||||
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
add_u32 $s5, $s5, $s2;
|
||||
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
add_u32 $s5, $s5, $s2;
|
||||
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
add_u32 $s5, $s5, $s2;
|
||||
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
add_u32 $s5, $s5, $s2;
|
||||
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
add_u32 $s5, $s5, $s2;
|
||||
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
|
||||
|
||||
//force to retrieve different image elements
|
||||
add_u32 $s1, $s1, 64;
|
||||
and_b32 $s1, $s1, 255;
|
||||
add_u32 $s0, $s0, 64;
|
||||
and_b32 $s0, $s0, 255;
|
||||
|
||||
add_u32 $s5, $s5, $s2;
|
||||
stimage_v4_2d_u32_rwimg_u32 ($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
|
||||
|
||||
//loop until we hit condition
|
||||
cmp_lt_b1_u32 $c0, $s2, $s3;
|
||||
cbr_b1 $c0, @loop;
|
||||
ret;
|
||||
};
|
||||
@@ -1,237 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
|
||||
/* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
|
||||
*
|
||||
* HSAF is granting you permission to use this software and documentation (if
|
||||
* any) (collectively, the "Materials") pursuant to the terms and conditions
|
||||
* of the Software License Agreement included with the Materials. If you do
|
||||
* not have a copy of the Software License Agreement, contact the HSA Foundation for a copy.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
|
||||
*/
|
||||
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
|
||||
/**
|
||||
* @brief Hsail kernel to benchmark READ accesses to system memory.
|
||||
* The kernel is given an input buffer from which each thread will
|
||||
* read. The thread will read from multiple locations of the input buffer.
|
||||
* The locations to read from is determined by the work-item Id, the function
|
||||
* being work-item Id modulo total number of work-items in the global work grid.
|
||||
* So given a global work grid of 16 work-items the reads by a thread with absolute
|
||||
* id 4 would be 4, 20, 36, 52, etc.
|
||||
*
|
||||
* @NOTE: A constraint imposed by the kernel is that the buffer size be large
* enough to support 32 reads by each thread (the kernel body contains 32
* unrolled loads). So a dispatch of 8 work-items should allocate enough
* buffer for 8 * 32 * sizeof(uint32_t).
|
||||
*
|
||||
* @param bufStart beginning byte address of user buffer in system memory
|
||||
* from which kernel threads could read
|
||||
*
|
||||
* @param bufEnd byte address that follows the end of user buffer. Accessing
|
||||
* memory at bufEnd is illegal
|
||||
*
|
||||
* @param addrStep size by which to increment byte address following each read
|
||||
* operation. The value represents total number of work-items * sizeof(uint32_t)
|
||||
*
|
||||
* @param outAddr argument that is passed by the user to be updated with values
|
||||
* read by the kernel threads. This is to ensure the compiler and finalizer do not
* eliminate code because the values being read are not used in any meaningful way.
|
||||
*
|
||||
*/
|
||||
prog kernel &__SysMemLoad(kernarg_u64 %bufStart,
|
||||
kernarg_u64 %bufEnd,
|
||||
kernarg_u64 %addrStep,
|
||||
kernarg_u64 %outAddr) {
|
||||
|
||||
pragma "AMD RTI", "ARGSTART:__SysMemLoad";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "uavid:8";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "ARGEND:__SysMemLoad";
|
||||
|
||||
// Retrieve the values of input arguments
|
||||
// bufStart refers to the starting byte address
|
||||
// bufEnd refers to the end of byte address
|
||||
// addrStep refers to the product of total number
|
||||
// of work-items in the grid * sizeof(uint32_t)
|
||||
ld_kernarg_u64 $d0, [%bufStart];
|
||||
ld_kernarg_u64 $d1, [%bufEnd];
|
||||
ld_kernarg_u64 $d2, [%addrStep];
|
||||
ld_kernarg_u64 $d3, [%outAddr];
|
||||
|
||||
// Compute the absolute id of current thread
|
||||
// and shift it by two to get index into user
|
||||
// buffer to access for Read operation
|
||||
workitemflatabsid_u32 $s0;
|
||||
shl_u32 $s0, $s0, 2;
|
||||
cvt_u64_u32 $d4, $s0;
|
||||
|
||||
// Add index to base address of user buffer to obtain
|
||||
// effective address for access
|
||||
add_u64 $d0, $d0, $d4;
|
||||
add_u64 $d3, $d3, $d4;
|
||||
|
||||
// Initialize thread's read accumulator to zero
|
||||
mov_u32 $s2, 0;
|
||||
|
||||
@loop:
|
||||
|
||||
// Read thirty-two values (fully unrolled below) with a stride that is
|
||||
// determined by the total number of work-items
|
||||
// in the global grid
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
// Update output buffer with values read
|
||||
// from input buffer
|
||||
st_global_u32 $s2, [$d3];
|
||||
|
||||
};
|
||||
|
||||
-237
@@ -1,237 +0,0 @@
|
||||
module &m:1:0:$base:$large:$default;
|
||||
|
||||
/* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
|
||||
*
|
||||
* HSAF is granting you permission to use this software and documentation (if
|
||||
* any) (collectively, the "Materials") pursuant to the terms and conditions
|
||||
* of the Software License Agreement included with the Materials. If you do
|
||||
* not have a copy of the Software License Agreement, contact the HSA Foundation for a copy.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
|
||||
*/
|
||||
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
|
||||
/**
|
||||
* @brief Hsail kernel to benchmark READ accesses to system memory.
|
||||
* The kernel is given an input buffer from which each thread will
|
||||
* read. The thread will read from multiple locations of the input buffer.
|
||||
* The locations to read from are determined by the work-item Id, the function
|
||||
* being work-item Id modulo total number of work-items in the global work grid.
|
||||
* So given a global work grid of 16 work-items the reads by a thread with absolute
|
||||
* id 4 would be 4, 20, 36, 52, etc.
|
||||
*
|
||||
* @NOTE: A constraint imposed by the kernel is that the buffer size be large
|
||||
* enough to support 32 reads by each thread. So a dispatch of 8 work-items
|
||||
* should allocate enough buffer for 8 * 32 * sizeof(uint32_t).
|
||||
*
|
||||
* @param bufStart beginning byte address of user buffer in system memory
|
||||
* from which kernel threads could read
|
||||
*
|
||||
* @param bufEnd byte address that follows the end of user buffer. Accessing
|
||||
* memory at bufEnd is illegal
|
||||
*
|
||||
* @param addrStep size by which to increment byte address following each read
|
||||
* operation. The value represents total number of work-items * sizeof(uint32_t)
|
||||
*
|
||||
* @param outAddr argument that is passed by the user to be updated with values
|
||||
* read by the kernel threads. This is to ensure the compiler and finalizer do not eliminate
|
||||
* code because the values being read are not used in any meaningful way.
|
||||
*
|
||||
*/
|
||||
prog kernel &__SysMemLoad(kernarg_u64 %bufStart,
|
||||
kernarg_u64 %bufEnd,
|
||||
kernarg_u64 %addrStep,
|
||||
kernarg_u64 %outAddr) {
|
||||
|
||||
pragma "AMD RTI", "ARGSTART:__SysMemLoad";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "uavid:8";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "ARGEND:__SysMemLoad";
|
||||
|
||||
// Retrieve the values of input arguments
|
||||
// bufStart refers to the starting byte address
|
||||
// bufEnd refers to the end of byte address
|
||||
// addrStep refers to the product of total number
|
||||
// of work-items in the grid * sizeof(uint32_t)
|
||||
ld_kernarg_u64 $d0, [%bufStart];
|
||||
ld_kernarg_u64 $d1, [%bufEnd];
|
||||
ld_kernarg_u64 $d2, [%addrStep];
|
||||
ld_kernarg_u64 $d3, [%outAddr];
|
||||
|
||||
// Compute the absolute id of current thread
|
||||
// and shift it by two to get index into user
|
||||
// buffer to access for Read operation
|
||||
workitemflatabsid_u32 $s0;
|
||||
shl_u32 $s0, $s0, 2;
|
||||
cvt_u64_u32 $d4, $s0;
|
||||
|
||||
// Add index to base address of user buffer to obtain
|
||||
// effective address for access
|
||||
add_u64 $d0, $d0, $d4;
|
||||
add_u64 $d3, $d3, $d4;
|
||||
|
||||
// Initialize thread's read accumulator to zero
|
||||
mov_u32 $s2, 0;
|
||||
|
||||
@loop:
|
||||
|
||||
// Read thirty-two values with a stride that is
|
||||
// determined by the total number of work-items
|
||||
// in the global grid
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s2, $s1, $s2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
// Update output buffer with values read
|
||||
// from input buffer
|
||||
st_global_u32 $s2, [$d3];
|
||||
|
||||
};
|
||||
|
||||
@@ -1,105 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__SysMemStore(kernarg_u64 %bufStart,
|
||||
kernarg_u64 %bufEnd,
|
||||
kernarg_u64 %addrStep,
|
||||
kernarg_u64 %deadArg) {
|
||||
|
||||
// Directives for Compiler
|
||||
pragma "AMD RTI", "ARGSTART:__SysMemStore";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "uavid:8";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "ARGEND:__SysMemStore";
|
||||
|
||||
// Retrieve the values of input arguments
|
||||
// bufStart refers to the starting byte address
|
||||
// bufEnd refers to the end of byte address
|
||||
// addrStep refers to the product of total number
|
||||
// of work-items in the grid * sizeof(uint32_t)
|
||||
ld_kernarg_u64 $d0, [%bufStart];
|
||||
ld_kernarg_u64 $d1, [%bufEnd];
|
||||
ld_kernarg_u64 $d2, [%addrStep];
|
||||
ld_kernarg_u64 $d3, [%deadArg];
|
||||
|
||||
// Compute the absolute id of current thread
|
||||
// and shift it by two to get index into user
|
||||
// buffer to access for Write operation
|
||||
workitemflatabsid_u32 $s0;
|
||||
shl_u32 $s0, $s0, 2;
|
||||
|
||||
// Convert the thread id into a 64-bit number
|
||||
// and add it to the starting address of user
|
||||
// buffer to obtain effective address for access
|
||||
cvt_u64_u32 $d4, $s0;
|
||||
add_u64 $d0, $d0, $d4;
|
||||
|
||||
|
||||
@loop:
|
||||
|
||||
// Write sixteen values with a stride that is
|
||||
// determined by the total number of work-items
|
||||
// in the global grid
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
// Loop until we hit end of buffer [%bufEnd]
|
||||
cmp_lt_b1_u64 $c0, $d0, $d1;
|
||||
cbr_b1 $c0, @loop;
|
||||
|
||||
};
|
||||
|
||||
-105
@@ -1,105 +0,0 @@
|
||||
module &m:1:0:$base:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__SysMemStore(kernarg_u64 %bufStart,
|
||||
kernarg_u64 %bufEnd,
|
||||
kernarg_u64 %addrStep,
|
||||
kernarg_u64 %deadArg) {
|
||||
|
||||
// Directives for Compiler
|
||||
pragma "AMD RTI", "ARGSTART:__SysMemStore";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "uavid:8";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "ARGEND:__SysMemStore";
|
||||
|
||||
// Retrieve the values of input arguments
|
||||
// bufStart refers to the starting byte address
|
||||
// bufEnd refers to the end of byte address
|
||||
// addrStep refers to the product of total number
|
||||
// of work-items in the grid * sizeof(uint32_t)
|
||||
ld_kernarg_u64 $d0, [%bufStart];
|
||||
ld_kernarg_u64 $d1, [%bufEnd];
|
||||
ld_kernarg_u64 $d2, [%addrStep];
|
||||
ld_kernarg_u64 $d3, [%deadArg];
|
||||
|
||||
// Compute the absolute id of current thread
|
||||
// and shift it by two to get index into user
|
||||
// buffer to access for Write operation
|
||||
workitemflatabsid_u32 $s0;
|
||||
shl_u32 $s0, $s0, 2;
|
||||
|
||||
// Convert the thread id into a 64-bit number
|
||||
// and add it to the starting address of user
|
||||
// buffer to obtain effective address for access
|
||||
cvt_u64_u32 $d4, $s0;
|
||||
add_u64 $d0, $d0, $d4;
|
||||
|
||||
|
||||
@loop:
|
||||
|
||||
// Write sixteen values with a stride that is
|
||||
// determined by the total number of work-items
|
||||
// in the global grid
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
st_global_u32 $s0, [$d0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
|
||||
// Loop until we hit end of buffer [%bufEnd]
|
||||
cmp_lt_b1_u64 $c0, $d0, $d1;
|
||||
cbr_b1 $c0, @loop;
|
||||
|
||||
};
|
||||
|
||||
+9
-37
@@ -43,40 +43,12 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__
|
||||
#define __ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
|
||||
class ImageStoreBandwidth: public rocrtst::BaseRocR, public PerfBase {
|
||||
public:
|
||||
//@Brief: Constructor
|
||||
ImageStoreBandwidth();
|
||||
|
||||
//@Brief: Destructor
|
||||
~ImageStoreBandwidth();
|
||||
|
||||
//@Brief: Set up the test environment
|
||||
virtual void SetUp();
|
||||
|
||||
//@Brief: Run the actual testing
|
||||
virtual void Run();
|
||||
|
||||
//@Brief: Clean up the test environment
|
||||
virtual void Close();
|
||||
|
||||
//@Brief: Display results
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
private:
|
||||
//@Brief: Image Store Bandwidth
|
||||
double store_bandwidth_;
|
||||
|
||||
//@Brief: Image size
|
||||
size_t image_size_;
|
||||
};
|
||||
|
||||
#endif //__ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__
|
||||
|
||||
__kernel void
|
||||
square(__global int *dstArray, __global const int *srcArray, const int sz) {
|
||||
unsigned int id = get_global_id(0);
|
||||
if (id < sz) {
|
||||
dstArray[id] = srcArray[id] * srcArray[id];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__OpenCL_vec_assign_kernel(
|
||||
kernarg_u64 %buf,
|
||||
kernarg_u32 %num)
|
||||
{
|
||||
pragma "AMD RTI", "ARGSTART:__OpenCL_vec_assign_kernel";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "uavid:8";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "ARGEND:__OpenCL_vec_assign_kernel";
|
||||
|
||||
@__OpenCL_vec_assign_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
ld_kernarg_align(8)_width(all)_u64 $d0, [%buf];
|
||||
ld_global_u32 $s1, [$d0];
|
||||
ld_kernarg_align(4)_width(all)_u32 $s0, [%num];
|
||||
cmp_ge_b1_s32 $c0, $s1, $s0;
|
||||
cbr_b1 $c0, @BB0_4;
|
||||
// BB#1: // %while.body.lr.ph
|
||||
workitemabsid_u32 $s1, 0;
|
||||
cmp_eq_b1_s32 $c0, $s1, 0;
|
||||
cbr_b1 $c0, @BB0_2;
|
||||
|
||||
@BB0_3:
|
||||
// %while.cond.backedge
|
||||
ld_global_u32 $s1, [$d0];
|
||||
cmp_lt_b1_s32 $c0, $s1, $s0;
|
||||
cbr_b1 $c0, @BB0_3;
|
||||
br @BB0_4;
|
||||
|
||||
@BB0_2:
|
||||
// %while.cond.backedge.us
|
||||
ld_global_u32 $s1, [$d0];
|
||||
add_u32 $s1, $s1, 1;
|
||||
st_global_u32 $s1, [$d0];
|
||||
ld_global_u32 $s1, [$d0];
|
||||
cmp_lt_b1_s32 $c0, $s1, $s0;
|
||||
cbr_b1 $c0, @BB0_2;
|
||||
|
||||
@BB0_4:
|
||||
// %while.end
|
||||
ret;
|
||||
};
|
||||
|
||||
-108
@@ -1,108 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__OpenCL_matrixTranspose_kernel(
|
||||
kernarg_u64 %__global_offset_0,
|
||||
kernarg_u64 %__global_offset_1,
|
||||
kernarg_u64 %__global_offset_2,
|
||||
kernarg_u64 %__printf_buffer,
|
||||
kernarg_u64 %__vqueue_pointer,
|
||||
kernarg_u64 %__aqlwrap_pointer,
|
||||
kernarg_u64 %inBuf,
|
||||
kernarg_u64 %outBuf,
|
||||
kernarg_u64 %localBuf,
|
||||
kernarg_u32 %blockSize,
|
||||
kernarg_u32 %width,
|
||||
kernarg_u32 %height)
|
||||
{
|
||||
pragma "AMD RTI", "ARGSTART:__OpenCL_matrixTranspose_kernel";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "memory:private:0";
|
||||
pragma "AMD RTI", "memory:region:0";
|
||||
pragma "AMD RTI", "memory:local:0";
|
||||
pragma "AMD RTI", "value:__global_offset_0:u64:1:1:0";
|
||||
pragma "AMD RTI", "value:__global_offset_1:u64:1:1:16";
|
||||
pragma "AMD RTI", "value:__global_offset_2:u64:1:1:32";
|
||||
pragma "AMD RTI", "pointer:__printf_buffer:u8:1:1:48:uav:7:1:RW:0:0:0";
|
||||
pragma "AMD RTI", "value:__vqueue_pointer:u64:1:1:64";
|
||||
pragma "AMD RTI", "value:__aqlwrap_pointer:u64:1:1:80";
|
||||
pragma "AMD RTI", "pointer:inBuf:u32:1:1:96:uav:7:4:RW:0:1:0";
|
||||
pragma "AMD RTI", "pointer:outBuf:u32:1:1:112:uav:7:4:RW:0:1:0";
|
||||
pragma "AMD RTI", "pointer:localBuf:u32:1:1:128:l:7:4:RW:0:0:0";
|
||||
pragma "AMD RTI", "value:blockSize:u32:1:1:144";
|
||||
pragma "AMD RTI", "value:width:u32:1:1:160";
|
||||
pragma "AMD RTI", "value:height:u32:1:1:176";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "enqueue_kernel:0";
|
||||
pragma "AMD RTI", "kernel_index:0";
|
||||
pragma "AMD RTI", "reflection:0:size_t";
|
||||
pragma "AMD RTI", "reflection:1:size_t";
|
||||
pragma "AMD RTI", "reflection:2:size_t";
|
||||
pragma "AMD RTI", "reflection:3:size_t";
|
||||
pragma "AMD RTI", "reflection:4:size_t";
|
||||
pragma "AMD RTI", "reflection:5:size_t";
|
||||
pragma "AMD RTI", "reflection:6:uint*";
|
||||
pragma "AMD RTI", "reflection:7:uint*";
|
||||
pragma "AMD RTI", "reflection:8:uint*";
|
||||
pragma "AMD RTI", "reflection:9:uint";
|
||||
pragma "AMD RTI", "reflection:10:uint";
|
||||
pragma "AMD RTI", "reflection:11:uint";
|
||||
pragma "AMD RTI", "ARGEND:__OpenCL_matrixTranspose_kernel";
|
||||
|
||||
@__OpenCL_matrixTranspose_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemid_u32 $s0, 1;
|
||||
ld_kernarg_align(4)_width(all)_u32 $s1, [%blockSize];
|
||||
workitemid_u32 $s2, 0;
|
||||
mad_u32 $s3, $s2, $s1, $s0;
|
||||
cvt_u64_u32 $d1, $s3;
|
||||
workitemabsid_u32 $s3, 0;
|
||||
cvt_u64_u32 $d0, $s3;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%__global_offset_0];
|
||||
add_u64 $d0, $d0, $d2;
|
||||
workitemabsid_u32 $s5, 1;
|
||||
workgroupid_u32 $s4, 0;
|
||||
workgroupid_u32 $s3, 1;
|
||||
shl_u64 $d1, $d1, 2;
|
||||
mad_u32 $s3, $s3, $s1, $s2;
|
||||
mad_u32 $s4, $s4, $s1, $s0;
|
||||
cvt_u64_u32 $d2, $s5;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d3, [%__global_offset_1];
|
||||
cvt_u32_u64 $s5, $d0;
|
||||
add_u64 $d0, $d2, $d3;
|
||||
cvt_u32_u64 $s6, $d0;
|
||||
ld_kernarg_align(4)_width(all)_u32 $s7, [%width];
|
||||
ld_kernarg_align(8)_width(all)_u64 $d0, [%localBuf];
|
||||
ld_kernarg_align(4)_width(all)_u32 $s8, [%height];
|
||||
mad_u32 $s3, $s4, $s8, $s3;
|
||||
add_u64 $d1, $d0, $d1;
|
||||
cvt_u32_u64 $s4, $d1;
|
||||
mad_u32 $s5, $s6, $s7, $s5;
|
||||
cvt_u64_u32 $d1, $s5;
|
||||
shl_u64 $d2, $d1, 2;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%outBuf];
|
||||
ld_kernarg_align(8)_width(all)_u64 $d3, [%inBuf];
|
||||
add_u64 $d2, $d3, $d2;
|
||||
ld_global_align(4)_u32 $s5, [$d2];
|
||||
st_group_align(4)_u32 $s5, [$s4];
|
||||
cvt_u64_u32 $d2, $s3;
|
||||
shl_u64 $d2, $d2, 2;
|
||||
add_u64 $d1, $d1, $d2;
|
||||
mad_u32 $s0, $s0, $s1, $s2;
|
||||
cvt_u64_u32 $d2, $s0;
|
||||
shl_u64 $d2, $d2, 2;
|
||||
add_u64 $d0, $d0, $d2;
|
||||
cvt_u32_u64 $s0, $d0;
|
||||
barrier;
|
||||
ld_group_align(4)_u32 $s0, [$s0];
|
||||
st_global_align(4)_u32 $s0, [$d1];
|
||||
ret;
|
||||
};
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__vector_copy_kernel(
|
||||
kernarg_u64 %a,
|
||||
kernarg_u64 %b)
|
||||
{
|
||||
pragma "AMD RTI", "ARGSTART:__vector_copy_kernel";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "uavid:8";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "ARGEND:__vector_copy_kernel";
|
||||
|
||||
@__vector_copy_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 0;
|
||||
cvt_s64_s32 $d0, $s0;
|
||||
shl_u64 $d0, $d0, 2;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%b];
|
||||
add_u64 $d1, $d1, $d0;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%a];
|
||||
add_u64 $d0, $d2, $d0;
|
||||
ld_global_u32 $s0, [$d0];
|
||||
st_global_u32 $s0, [$d1];
|
||||
ret;
|
||||
};
|
||||
|
||||
-64
@@ -1,64 +0,0 @@
|
||||
module &m:1:0:$base:$large:$default;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__vector_copy_kernel(
|
||||
kernarg_u64 %in,
|
||||
kernarg_u64 %out)
|
||||
{
|
||||
@__vector_copy_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 0;
|
||||
cvt_s64_s32 $d0, $s0;
|
||||
shl_u64 $d0, $d0, 2;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%out];
|
||||
add_u64 $d1, $d1, $d0;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%in];
|
||||
add_u64 $d0, $d2, $d0;
|
||||
ld_global_u32 $s0, [$d0];
|
||||
st_global_u32 $s0, [$d1];
|
||||
ret;
|
||||
};
|
||||
-64
@@ -1,64 +0,0 @@
|
||||
module &m:1:0:$full:$large:$default;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
prog kernel &__vector_copy_kernel(
|
||||
kernarg_u64 %in,
|
||||
kernarg_u64 %out)
|
||||
{
|
||||
@__vector_copy_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 0;
|
||||
cvt_s64_s32 $d0, $s0;
|
||||
shl_u64 $d0, $d0, 2;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%out];
|
||||
add_u64 $d1, $d1, $d0;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%in];
|
||||
add_u64 $d0, $d2, $d0;
|
||||
ld_global_u32 $s0, [$d0];
|
||||
st_global_u32 $s0, [$d1];
|
||||
ret;
|
||||
};
|
||||
Обычный файл → Исполняемый файл
+57
-210
@@ -43,238 +43,85 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "cp_process_time.h"
|
||||
#include "cu_masking.h"
|
||||
#include "device_load_bandwidth.h"
|
||||
#include "device_store_bandwidth.h"
|
||||
#include "dispatch_time.h"
|
||||
#include "flush_latency.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "hsa_info.h"
|
||||
#include "image_bandwidth.h"
|
||||
#include "image_load_bandwidth.h"
|
||||
#include "image_store_bandwidth.h"
|
||||
#include "matrix_transpose.h"
|
||||
#include "memory_copy.h"
|
||||
#include "memory_allocation.h"
|
||||
#include "memory_async_copy.h"
|
||||
#include "queue_concurrency.h"
|
||||
#include "queue_create_destroy_latency.h"
|
||||
#include "system_load_bandwidth.h"
|
||||
#include "system_store_bandwidth.h"
|
||||
#include "vector_copy.h"
|
||||
#include "suites/performance/dispatch_time.h"
|
||||
#include "suites/performance/memory_async_copy.h"
|
||||
#include "suites/performance/test_case_template.h"
|
||||
#include "suites/performance/main.h"
|
||||
#include "suites/test_common/test_common.h"
|
||||
|
||||
/**
|
||||
* Try to order tests from fastest running to slowest running.
|
||||
*/
|
||||
static uint32_t sRocrTstOptVerbosity = 1;
|
||||
static uint32_t sRocrTestOptIterations = 0;
|
||||
|
||||
// Display HSA system information first.
|
||||
TEST(rocrtst, Feature_Hsa_Info) {
|
||||
HsaInfo hi;
|
||||
hi.SetUp();
|
||||
hi.Run();
|
||||
hi.Close();
|
||||
static void RunTest(TestBase *test) {
|
||||
test->set_verbosity(sRocrTstOptVerbosity);
|
||||
|
||||
if (sRocrTestOptIterations) {
|
||||
test->set_num_iteration(sRocrTestOptIterations);
|
||||
}
|
||||
test->DisplayTestInfo();
|
||||
test->SetUp();
|
||||
test->Run();
|
||||
test->DisplayResults();
|
||||
test->Close();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Requires HSA_PROFILE_FULL
|
||||
TEST(rocrtst, Perf_Image_Store_Bandwidth) {
|
||||
ImageStoreBandwidth isb;
|
||||
isb.SetUp();
|
||||
isb.Run();
|
||||
isb.DisplayResults();
|
||||
isb.Close();
|
||||
// TEST ENTRY TEMPLATE:
|
||||
// TEST(rocrtst, Perf_<test name>) {
|
||||
// <Test Implementation class> <test_obj>;
|
||||
//
|
||||
// // Copy and modify implementation of RunTest() if you need to deviate
|
||||
// // from the standard pattern implemented there.
|
||||
// RunTest(&<test_obj>);
|
||||
// }
|
||||
|
||||
TEST(rocrtst, Test_Example) {
|
||||
TestExample tst;
|
||||
RunTest(&tst);
|
||||
}
|
||||
|
||||
// Requires HSA_PROFILE_FULL
|
||||
TEST(rocrtst, Perf_Image_Load_Bandwidth) {
|
||||
ImageLoadBandwidth ilb;
|
||||
ilb.SetUp();
|
||||
ilb.Run();
|
||||
ilb.DisplayResults();
|
||||
ilb.Close();
|
||||
TEST(rocrtst, Perf_Memory_Async_Copy) {
|
||||
MemoryAsyncCopy mac;
|
||||
// To do full test, uncomment this:
|
||||
// mac.set_full_test(true);
|
||||
// To test only 1 path, add lines like this:
|
||||
// mac.set_src_pool(<src pool id>);
|
||||
// mac.set_dst_pool(<dst pool id>);
|
||||
// The default is to and from the cpu to 1 gpu, and to/from a gpu to
|
||||
// another gpu
|
||||
RunTest(&mac);
|
||||
}
|
||||
|
||||
// Requires HSA_PROFILE_FULL
|
||||
TEST(rocrtst, Perf_Image_Bandwidth) {
|
||||
ImageBandwidth ib;
|
||||
ib.SetUp();
|
||||
ib.Run();
|
||||
ib.DisplayResults();
|
||||
ib.Close();
|
||||
}
|
||||
|
||||
// Requires HSA_PROFILE_FULL
|
||||
TEST(rocrtst, Perf_Queue_Concurrency) {
|
||||
QueueConcurrency mc;
|
||||
mc.SetUp();
|
||||
mc.Run();
|
||||
mc.DisplayResults();
|
||||
mc.Close();
|
||||
}
|
||||
|
||||
TEST(rocrtst, Feature_Cu_Masking) {
|
||||
CuMasking cm;
|
||||
cm.SetUp();
|
||||
cm.Run();
|
||||
cm.Close();
|
||||
}
|
||||
|
||||
TEST(rocrtst, Perf_Flush_Latency) {
|
||||
FlushLatency fl;
|
||||
fl.SetUp();
|
||||
fl.Run();
|
||||
fl.DisplayResults();
|
||||
fl.Close();
|
||||
}
|
||||
|
||||
// This test apparently has some sort of memory bounds overwrite
|
||||
// issue with the out_data_ buffer. Commenting out the free of
|
||||
// out_data_ avoids the problem. Left uncommented, a crash will
|
||||
// occur immediately or some time after.
|
||||
TEST(rocrtst, DISABLED_Perf_Device_Memory_Store_Bandwidth) {
|
||||
DeviceStoreBandwidth slb;
|
||||
slb.SetUp();
|
||||
slb.Run();
|
||||
slb.DisplayResults();
|
||||
slb.Close();
|
||||
}
|
||||
|
||||
// This test apparently has some sort of memory bounds overwrite
|
||||
// issue with the out_data_ buffer. Commenting out the free of
|
||||
// out_data_ avoids the problem. Left uncommented, a crash will
|
||||
// occur immediately or some time after.
|
||||
TEST(rocrtst, DISABLED_Perf_Device_Memory_Load_Bandwidth) {
|
||||
DeviceLoadBandwidth slb;
|
||||
slb.SetUp();
|
||||
slb.Run();
|
||||
slb.DisplayResults();
|
||||
slb.Close();
|
||||
}
|
||||
TEST(rocrtst, Perf_Dispatch_Time_Single_SpinWait) {
|
||||
DispatchTime dt;
|
||||
dt.set_num_iteration(100);
|
||||
dt.UseDefaultSignal(true);
|
||||
dt.LaunchSingleKernel(true);
|
||||
dt.SetUp();
|
||||
dt.Run();
|
||||
dt.DisplayResults();
|
||||
dt.Close();
|
||||
DispatchTime dt(true, true);
|
||||
RunTest(&dt);
|
||||
}
|
||||
|
||||
TEST(rocrtst, Perf_Dispatch_Time_Single_Interrupt) {
|
||||
DispatchTime dt;
|
||||
dt.UseDefaultSignal(false);
|
||||
dt.LaunchSingleKernel(true);
|
||||
dt.SetUp();
|
||||
dt.Run();
|
||||
dt.DisplayResults();
|
||||
dt.Close();
|
||||
DispatchTime dt(false, true);
|
||||
RunTest(&dt);
|
||||
}
|
||||
|
||||
TEST(rocrtst, Perf_Dispatch_Time_Multi_SpinWait) {
|
||||
DispatchTime dt;
|
||||
dt.UseDefaultSignal(true);
|
||||
dt.LaunchSingleKernel(false);
|
||||
dt.SetUp();
|
||||
dt.Run();
|
||||
dt.DisplayResults();
|
||||
dt.Close();
|
||||
DispatchTime dt(true, false);
|
||||
RunTest(&dt);
|
||||
}
|
||||
|
||||
TEST(rocrtst, Perf_Dispatch_Time_Multi_Interrupt) {
|
||||
DispatchTime dt;
|
||||
dt.UseDefaultSignal(false);
|
||||
dt.LaunchSingleKernel(false);
|
||||
dt.SetUp();
|
||||
dt.Run();
|
||||
dt.DisplayResults();
|
||||
dt.Close();
|
||||
DispatchTime dt(false, false);
|
||||
RunTest(&dt);
|
||||
}
|
||||
TEST(rocrtst, DISABLED_Perf_CpProcessTime) {
|
||||
CpProcessTime cpt;
|
||||
cpt.set_num_iteration(10);
|
||||
cpt.SetUp();
|
||||
cpt.Run();
|
||||
cpt.DisplayResults();
|
||||
cpt.Close();
|
||||
}
|
||||
|
||||
TEST(rocrtst, Perf_Memory_Allocation) {
|
||||
MemoryAllocation ma(10);
|
||||
ma.SetUp();
|
||||
ma.Run();
|
||||
ma.DisplayResults();
|
||||
ma.Close();
|
||||
}
|
||||
|
||||
#if MEM_POOL_FILL_BUG
|
||||
TEST(rocrtst, Perf_Queue_Latency) {
|
||||
QueueLatency ql;
|
||||
ql.set_num_iteration(10);
|
||||
ql.SetUp();
|
||||
ql.Run();
|
||||
ql.DisplayResults();
|
||||
ql.Close();
|
||||
}
|
||||
|
||||
TEST(rocrtst, Perf_System_Memory_Load_Bandwidth) {
|
||||
SystemLoadBandwidth slb;
|
||||
slb.SetUp();
|
||||
slb.Run();
|
||||
slb.DisplayResults();
|
||||
slb.Close();
|
||||
}
|
||||
|
||||
TEST(rocrtst, Perf_System_Memory_Store_Bandwidth) {
|
||||
SystemStoreBandwidth ssb;
|
||||
ssb.SetUp();
|
||||
ssb.Run();
|
||||
ssb.DisplayResults();
|
||||
ssb.Close();
|
||||
}
|
||||
|
||||
TEST(rocrtst, Perf_Memory_Copy) {
|
||||
MemoryCopy mc;
|
||||
mc.set_num_iteration(10);
|
||||
mc.SetUp();
|
||||
mc.Run();
|
||||
mc.DisplayResults();
|
||||
mc.Close();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// These tests were not complete. Needs research/work.
|
||||
TEST(rocrtst, Feature_Vector_Copy) {
|
||||
VectorCopy vc;
|
||||
vc.SetUp();
|
||||
vc.Run();
|
||||
vc.Close();
|
||||
}
|
||||
|
||||
TEST(rocrtst, Perf_Matrix_Transpose) {
|
||||
MatrixTranspose mt;
|
||||
mt.SetUp();
|
||||
mt.Run();
|
||||
mt.DisplayResults();
|
||||
mt.Close();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//#if NEED_TO_MAKE_BATCH
|
||||
TEST(rocrtst, Perf_Memory_Async_Copy) {
|
||||
MemoryAsyncCopy mac;
|
||||
mac.set_num_iteration(10);
|
||||
mac.SetUp();
|
||||
mac.Run();
|
||||
mac.DisplayResults();
|
||||
mac.Close();
|
||||
}
|
||||
//#endif
|
||||
|
||||
// Program entry point: initializes GoogleTest, processes the rocrtst
// command-line options, then runs every registered test.
// Returns 1 if ProcessCmdline() reports a non-zero result, otherwise the
// value of RUN_ALL_TESTS().
int main(int argc, char** argv) {
|
||||
// GoogleTest gets the command line first (argc is passed by address).
// NOTE(review): presumably it strips the flags it recognizes before the
// rocrtst options are parsed below -- confirm against the gtest docs.
::testing::InitGoogleTest(&argc, argv);
|
||||
|
||||
// The options object is given the addresses of the file-level verbosity and
// iteration variables that RunTest() later reads.
RocrtstOptions opts(&sRocrTstOptVerbosity, &sRocrTestOptIterations);
|
||||
|
||||
// A non-zero result from option processing aborts before any test runs.
if (ProcessCmdline(&opts, argc, argv)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
|
||||
@@ -1,289 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "matrix_transpose.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include "hsa/hsa_ext_finalize.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <stdlib.h>
|
||||
#include <algorithm>
|
||||
|
||||
static const unsigned int NUM_BLOCK_SIZES = 2;
|
||||
static const unsigned int blockSizes[NUM_BLOCK_SIZES] = {8, 16};
|
||||
static const unsigned int NUM_MATRIX_DIMS = 2;
|
||||
static const unsigned int matrixDims[NUM_MATRIX_DIMS] = {1024, 64};
|
||||
|
||||
MatrixTranspose::MatrixTranspose(void) :
|
||||
BaseRocR() {
|
||||
in_buffer_sys_ = NULL;
|
||||
out_buffer_sys_ = NULL;
|
||||
in_buffer_ = NULL;
|
||||
out_buffer_ = NULL;
|
||||
width_ = 0;
|
||||
height_ = 0;
|
||||
buf_size_ = 0;
|
||||
block_size_ = 0;
|
||||
time_mean_ = 0.0;
|
||||
}
|
||||
|
||||
MatrixTranspose::~MatrixTranspose(void) {
|
||||
|
||||
}
|
||||
|
||||
void MatrixTranspose::SetUp(void) {
|
||||
hsa_status_t err;
|
||||
|
||||
InitializeData();
|
||||
|
||||
set_kernel_file_name("transpose_kernel.o");
|
||||
set_kernel_name("&__OpenCL_matrixTranspose_kernel");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
hsa_agent_t* cpu_dev = cpu_device();
|
||||
|
||||
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
|
||||
&cpu_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), buf_size_, 0,
|
||||
(void**) &in_buffer_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), buf_size_, 0,
|
||||
(void**) &out_buffer_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, in_buffer_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, out_buffer_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Create a queue
|
||||
hsa_queue_t* q = nullptr;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
set_main_queue(q);
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
// Fill up aql packet
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().setup = 0;
|
||||
aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
aql().workgroup_size_x = block_size_;
|
||||
aql().workgroup_size_y = block_size_;
|
||||
aql().grid_size_x = width_;
|
||||
aql().grid_size_y = height_;
|
||||
aql().group_segment_size = sizeof(uint) * block_size_ * block_size_;
|
||||
|
||||
// Debug
|
||||
#ifdef DEBUG
|
||||
std::cout << "workgroup size: " << block_size_ << ", " << block_size_
|
||||
<< ", " << 1 << std::endl;
|
||||
std::cout << "grid size: " << aql().grid_size_x << ", " <<
|
||||
aql().grid_size_y << ", " << aql().grid_size_z << std::endl;
|
||||
std::cout << "group segment size: " << aql().group_segment_size << std::endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
void MatrixTranspose::Run(void) {
|
||||
hsa_status_t err;
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocate kernel parameter
|
||||
typedef struct args_t {
|
||||
uint* offset_0;
|
||||
uint* offset_1;
|
||||
uint* offset_2;
|
||||
uint* printf_buffer;
|
||||
uint* vqueue_buffer;
|
||||
uint* aqlwrap_pointer;
|
||||
|
||||
uint* in_buf;
|
||||
uint* out_buf;
|
||||
uint* local_buf;
|
||||
uint iblock_size;
|
||||
uint iwidth;
|
||||
uint iheight;
|
||||
} args;
|
||||
|
||||
args* kern_ptr = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
|
||||
(void**) &kern_ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, kern_ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
kern_ptr->offset_0 = 0;
|
||||
kern_ptr->offset_1 = 0;
|
||||
kern_ptr->offset_2 = 0;
|
||||
kern_ptr->printf_buffer = 0;
|
||||
kern_ptr->vqueue_buffer = 0;
|
||||
kern_ptr->aqlwrap_pointer = 0;
|
||||
|
||||
kern_ptr->in_buf = in_buffer_sys_;
|
||||
kern_ptr->out_buf = out_buffer_sys_;
|
||||
kern_ptr->local_buf = 0;
|
||||
kern_ptr->iblock_size = block_size_;
|
||||
kern_ptr->iwidth = width_;
|
||||
kern_ptr->iheight = height_;
|
||||
|
||||
aql().kernarg_address = kern_ptr;
|
||||
|
||||
//Obtain the current queue write index.
|
||||
uint64_t idx = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
((hsa_kernel_dispatch_packet_t*)(main_queue()->base_address))[idx] = aql();
|
||||
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
((hsa_kernel_dispatch_packet_t*)(main_queue()->base_address))[idx].header |=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
|
||||
hsa_signal_store_release(main_queue()->doorbell_signal, idx);
|
||||
|
||||
//Wait on the dispatch signal until the kernel is finished.
|
||||
hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE);
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
hsa_amd_profiling_dispatch_time_t dispatch_time;
|
||||
err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(), &dispatch_time);
|
||||
|
||||
uint64_t stamp = dispatch_time.end - dispatch_time.start;
|
||||
uint64_t freq;
|
||||
|
||||
err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
std::cout << "Kernel time is: " <<
|
||||
(double) stamp / (double) freq * 1000.0 << std::endl;
|
||||
hsa_signal_store_release(signal(), 1);
|
||||
|
||||
|
||||
// Verify Results
|
||||
VerifyResults (out_buffer_sys_);
|
||||
|
||||
// Abandon the first result which is warm up
|
||||
|
||||
time_mean_ = p_timer.ReadTimer(id); //rocrtst::CalcMean(timer);
|
||||
}
|
||||
|
||||
void MatrixTranspose::DisplayResults(void) const {
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "============================================" << std::endl;
|
||||
std::cout << "Matrix Transpose Mean Time: " << time_mean_ << std::endl;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void MatrixTranspose::Close(void) {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void MatrixTranspose::InitializeData(void) {
|
||||
// int openTest = 1;
|
||||
block_size_ = 16; //blockSizes[openTest % NUM_BLOCK_SIZES];
|
||||
width_ = 1920; //matrixDims[openTest / NUM_BLOCK_SIZES];
|
||||
height_ = width_;
|
||||
|
||||
buf_size_ = width_ * height_ * sizeof(uint);
|
||||
|
||||
in_buffer_sys_ = (uint*) aligned_alloc(256, buf_size_);
|
||||
|
||||
SetData (in_buffer_sys_);
|
||||
out_buffer_sys_ = (uint*) aligned_alloc(256, buf_size_);
|
||||
|
||||
FillData(out_buffer_sys_, 0xdeadbeef);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void MatrixTranspose::SetData(uint* buffer) {
|
||||
for (unsigned int i = 0; i < height_; i++) {
|
||||
for (unsigned int j = 0; j < width_; j++) {
|
||||
*(buffer + i * width_ + j) = i * width_ + j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MatrixTranspose::FillData(uint* buffer, unsigned int val) {
|
||||
for (unsigned int i = 0; i < width_ * height_; i++) {
|
||||
buffer[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
void MatrixTranspose::VerifyResults(uint* buffer) {
|
||||
bool err = false;
|
||||
|
||||
for (unsigned int i = 0; (i < width_) && !err; i++) {
|
||||
for (unsigned int j = 0; (j < height_) && !err; j++) {
|
||||
ASSERT_EQ(*(buffer + i * height_ + j), j * width_ + i);
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
}
|
||||
@@ -1,101 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_MATRIX_TRANSPOSE_H__
|
||||
#define __ROCRTST_SRC_MATRIX_TRANSPOSE_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
|
||||
class MatrixTranspose: public rocrtst::BaseRocR, public PerfBase {
|
||||
|
||||
public:
|
||||
//@Brief: Default Constructor
|
||||
MatrixTranspose();
|
||||
|
||||
//@Brief: Destructor
|
||||
~MatrixTranspose();
|
||||
|
||||
//@Brief: Override SetUp function
|
||||
virtual void SetUp();
|
||||
|
||||
//@Brief: Run the measurement
|
||||
virtual void Run();
|
||||
|
||||
//@Brief: Clean up and Close
|
||||
virtual void Close();
|
||||
|
||||
//@Brief: Display results
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
private:
|
||||
//@Brief: Set up data
|
||||
virtual void SetData(uint* buffer);
|
||||
|
||||
//@Brief: Fill Data
|
||||
virtual void FillData(uint* buffer, unsigned int val);
|
||||
|
||||
//@Brief: VerifyResults
|
||||
virtual void VerifyResults(uint* buffer);
|
||||
|
||||
//@Brief: Initialize the object attribute
|
||||
virtual void InitializeData();
|
||||
|
||||
uint* in_buffer_;
|
||||
uint* out_buffer_;
|
||||
uint* in_buffer_sys_;
|
||||
uint* out_buffer_sys_;
|
||||
unsigned int width_;
|
||||
unsigned int height_;
|
||||
unsigned int buf_size_;
|
||||
unsigned int block_size_;
|
||||
double time_mean_;
|
||||
|
||||
hsa_barrier_and_packet_t bpkt;
|
||||
};
|
||||
|
||||
#endif //__ROCRTST_SRC_MATRIX_TRANSPOSE_H__
|
||||
|
||||
@@ -1,198 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "memory_allocation.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <algorithm>
|
||||
|
||||
MemoryAllocation::MemoryAllocation(uint32_t num_iters) :
|
||||
BaseRocR(), allocation_time_ {0.0}, mem_pool_flag_(0) {
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
MemoryAllocation::~MemoryAllocation() {
|
||||
|
||||
}
|
||||
|
||||
const char* MemoryAllocation::Str[16] = {"64K", "128K", "256K", "512K", "1M",
|
||||
"2M", "4M", "8M", "16M", "32M",
|
||||
"64M", "128M", "256M", "512M", "1G",
|
||||
"2G"
|
||||
};
|
||||
const size_t MemoryAllocation::Size[16] = {64*1024, 128*1024,
|
||||
256*1024,512*1024, 1024*1024,
|
||||
2048*1024, 4096*1024, 8*1024*1024,
|
||||
16*1024*1024, 32*1024*1024,
|
||||
64*1024*1024, 128*1024*1024,
|
||||
256 * 1024*1024, 512*1024*1024,
|
||||
1024*1024*1024,
|
||||
(size_t)2*1024*1024*1024
|
||||
};
|
||||
|
||||
void MemoryAllocation::SetUp() {
|
||||
hsa_status_t err;
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* cpu_dev = cpu_device();
|
||||
|
||||
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
|
||||
&cpu_pool());
|
||||
|
||||
EXPECT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
|
||||
if (err != HSA_STATUS_INFO_BREAK) {
|
||||
std::cout << "Unable to find global pool. Test will not be run."
|
||||
<< std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
//At this point, cpu_pool() should be in the global segment
|
||||
err = hsa_amd_memory_pool_get_info(cpu_pool(),
|
||||
(hsa_amd_memory_pool_info_t) HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
|
||||
&mem_pool_flag_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void MemoryAllocation::Run() {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (cpu_pool().handle == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
size_t iterations = RealIterationNum();
|
||||
hsa_status_t err;
|
||||
|
||||
//Iterate over the different data size
|
||||
for (int i = 0; i < 16; i++) {
|
||||
std::vector<double> time;
|
||||
|
||||
for (uint32_t it = 0; it < iterations; it++) {
|
||||
#if DEBUG
|
||||
std::cout << "." << std::flush;
|
||||
#endif
|
||||
|
||||
rocrtst::PerfTimer allocation_timer;
|
||||
int index = allocation_timer.CreateTimer();
|
||||
|
||||
allocation_timer.StartTimer(index);
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[i], 0, &ptr);
|
||||
allocation_timer.StopTimer(index);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
//Free the memory which was allocated
|
||||
err = hsa_amd_memory_pool_free(ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
ptr = NULL;
|
||||
|
||||
// PUsh the results back to vector time
|
||||
time.push_back(allocation_timer.ReadTimer(index));
|
||||
}
|
||||
|
||||
#if DEBUG
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
|
||||
//Get mean copy time and store to the array
|
||||
allocation_time_[i] = GetMeanTime(time);
|
||||
}
|
||||
}
|
||||
|
||||
size_t MemoryAllocation::RealIterationNum() {
|
||||
return num_iteration() * 1.2 + 1;
|
||||
}
|
||||
|
||||
double MemoryAllocation::GetMeanTime(std::vector<double>& vec) {
|
||||
std::sort(vec.begin(), vec.end());
|
||||
|
||||
vec.erase(vec.begin());
|
||||
vec.erase(vec.begin(), vec.begin() + num_iteration() * 0.1);
|
||||
vec.erase(vec.begin() + num_iteration(), vec.end());
|
||||
|
||||
double mean = 0.0;
|
||||
int num = vec.size();
|
||||
|
||||
for (int it = 0; it < num; it++) {
|
||||
mean += vec[it];
|
||||
}
|
||||
|
||||
mean /= num;
|
||||
return mean;
|
||||
}
|
||||
|
||||
void MemoryAllocation::DisplayResults() const {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stdout, "==============================================\n");
|
||||
fprintf(stdout, " Data Size Allocation_time BandWidth(GB/s)\n");
|
||||
|
||||
for (int i = 0; i < 16; i++) {
|
||||
fprintf(stdout, " %9s %15.6f %15.6f\n", Str[i], allocation_time_[i],
|
||||
2 * Size[i] / allocation_time_[i] / 1024 / 1024 / 1024);
|
||||
}
|
||||
|
||||
fprintf(stdout, "==============================================\n");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void MemoryAllocation::Close() {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
return;
|
||||
}
|
||||
@@ -1,98 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_MEMORY_MEM_ALLOCATION_H__
|
||||
#define __ROCRTST_SRC_MEMORY_MEM_ALLOCATION_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include <vector>
|
||||
|
||||
class MemoryAllocation: public rocrtst::BaseRocR, public PerfBase {
|
||||
|
||||
public:
|
||||
//@Brief: Constructor for test case of MemoryAllocation
|
||||
MemoryAllocation(uint32_t num_iters = 100);
|
||||
|
||||
//@Brief: Destructor for test case of MemoryAllocation
|
||||
virtual ~MemoryAllocation();
|
||||
|
||||
//@Brief: Set up the environment for the test
|
||||
virtual void SetUp();
|
||||
|
||||
//@Brief: Execute the test
|
||||
virtual void Run();
|
||||
|
||||
//@Brief: Display results
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
//@Brief: Clean up and close the environment
|
||||
virtual void Close();
|
||||
|
||||
protected:
|
||||
//@Brief: Pointer to the memory space which is allocated by HSA Memory
|
||||
// allocation API
|
||||
void* ptr;
|
||||
|
||||
//@Brief: Array to store the timers results for each data size
|
||||
double allocation_time_[16];
|
||||
|
||||
private:
|
||||
//@Brief: Define allocated data size and corresponding string
|
||||
static const size_t Size[16];
|
||||
static const char* Str[16];
|
||||
|
||||
uint32_t mem_pool_flag_;
|
||||
|
||||
//@Brief: Get the actual iteration number
|
||||
size_t RealIterationNum();
|
||||
|
||||
//@Brief: Get mean execution time
|
||||
double GetMeanTime(std::vector<double>& vec);
|
||||
|
||||
};
|
||||
#endif
|
||||
Обычный файл → Исполняемый файл
+392
-520
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -43,199 +43,182 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_MEMORY_ASYNC_COPY_H__
|
||||
#define __ROCRTST_SRC_MEMORY_ASYNC_COPY_H__
|
||||
#ifndef ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_
|
||||
#define ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "common/common.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include <unistd.h>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <cctype>
|
||||
#include "suites/test_common/test_base.h"
|
||||
|
||||
extern int mac_argc;
|
||||
extern char** mac_argv;
|
||||
typedef enum TransType {H2D = 0, D2H, P2P} TransType;
|
||||
|
||||
typedef struct transaction {
|
||||
typedef struct Transaction {
|
||||
int src;
|
||||
int dst;
|
||||
hsa_signal_t signal;
|
||||
size_t size;
|
||||
size_t num_dep_signal;
|
||||
hsa_signal_t* dep_signal;
|
||||
} transaction;
|
||||
size_t max_size; // Max. amount of kBytes to copy
|
||||
TransType type;
|
||||
// BenchMark copy time
|
||||
std::vector<double> *benchmark_copy_time;
|
||||
// Min time
|
||||
std::vector<double> *min_time;
|
||||
} Transaction;
|
||||
|
||||
typedef struct agent_info {
|
||||
agent_info(hsa_agent_t agent, int index, hsa_device_type_t device_type) {
|
||||
agent_ = agent;
|
||||
index_ = index;
|
||||
device_type_ = device_type;
|
||||
}
|
||||
agent_info() {
|
||||
}
|
||||
hsa_agent_t agent_;
|
||||
int index_;
|
||||
hsa_device_type_t device_type_;
|
||||
} agent_info;
|
||||
// Record describing one HSA agent: its handle, an integer index and its
// device type.
class AgentInfo {
|
||||
public:
|
||||
// Stores the given agent handle, index and device type.
AgentInfo(hsa_agent_t agent, int index, hsa_device_type_t device_type) {
|
||||
agent_ = agent;
|
||||
index_ = index;
|
||||
device_type_ = device_type;
|
||||
}
|
||||
// NOTE: the default constructor assigns no members; callers must fill in
// the fields before reading them.
AgentInfo() {}
|
||||
|
||||
~AgentInfo() {}
|
||||
// Returns a copy of the stored agent handle.
hsa_agent_t agent(void) const {return agent_;}
|
||||
// Returns the stored device type.
hsa_device_type_t device_type(void) const {return device_type_;}
|
||||
|
||||
// agent_ and index_ are public data members and may be accessed directly.
hsa_agent_t agent_;
|
||||
int index_;
|
||||
|
||||
private:
|
||||
// Only reachable through device_type().
hsa_device_type_t device_type_;
|
||||
};
|
||||
|
||||
class PoolInfo {
|
||||
public:
|
||||
PoolInfo(hsa_amd_memory_pool_t pool, int index,
|
||||
hsa_amd_segment_t segment, bool is_fine_graind, size_t size,
|
||||
AgentInfo *agent_info) {
|
||||
pool_ = pool;
|
||||
index_ = index;
|
||||
segment_ = segment;
|
||||
is_fine_grained_ = is_fine_graind;
|
||||
allocable_size_ = size;
|
||||
owner_agent_info_ = agent_info;
|
||||
}
|
||||
PoolInfo() {}
|
||||
~PoolInfo() {}
|
||||
AgentInfo* owner_agent_info(void) const {return owner_agent_info_;}
|
||||
hsa_amd_memory_pool_t pool_;
|
||||
int index_;
|
||||
hsa_amd_segment_t segment_;
|
||||
bool is_fine_grained_;
|
||||
size_t allocable_size_;
|
||||
private:
|
||||
AgentInfo *owner_agent_info_;
|
||||
};
|
||||
|
||||
typedef struct region_info {
|
||||
region_info(hsa_amd_memory_pool_t region, int index,
|
||||
hsa_amd_segment_t segment, bool is_fine_graind, size_t size,
|
||||
hsa_agent_t agent) {
|
||||
region_ = region;
|
||||
index_ = index;
|
||||
segment_ = segment;
|
||||
is_fine_grained_ = is_fine_graind;
|
||||
allocable_size_ = size;
|
||||
owner_agent_ = agent;
|
||||
}
|
||||
region_info() {
|
||||
}
|
||||
hsa_amd_memory_pool_t region_;
|
||||
int index_;
|
||||
hsa_amd_segment_t segment_;
|
||||
bool is_fine_grained_;
|
||||
size_t allocable_size_;
|
||||
hsa_agent_t owner_agent_;
|
||||
} region_info;
|
||||
|
||||
// Used to print out topology info
|
||||
typedef struct node_info {
|
||||
node_info() {
|
||||
}
|
||||
agent_info agent;
|
||||
std::vector<region_info> region;
|
||||
} node_info;
|
||||
typedef struct NodeInfo {
|
||||
AgentInfo agent;
|
||||
std::vector<PoolInfo> pool;
|
||||
} NodeInfo;
|
||||
|
||||
hsa_status_t AgentInfo(hsa_agent_t agent, void* data);
|
||||
hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data);
|
||||
|
||||
class MemoryAsyncCopy: public rocrtst::BaseRocR, public PerfBase {
|
||||
class MemoryAsyncCopy : public TestBase {
|
||||
public:
|
||||
MemoryAsyncCopy();
|
||||
|
||||
//@Brief: Destructor for test case of MemoryAsyncCopy
|
||||
// @Brief: Destructor for test case of MemoryAsyncCopy
|
||||
virtual ~MemoryAsyncCopy();
|
||||
|
||||
//@Brief: Setup the environment for measurement
|
||||
// @Brief: Setup the environment for measurement
|
||||
virtual void SetUp();
|
||||
|
||||
//@Brief: Core measurement execution
|
||||
// @Brief: Core measurement execution
|
||||
virtual void Run();
|
||||
|
||||
//@Brief: Clean up and retrive the resource
|
||||
// @Brief: Clean up and release the resources
|
||||
virtual void Close();
|
||||
|
||||
//@Brief: Display results
|
||||
// @Brief: Display results
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
// There are 3 levels of testing, from quickest/very specific to
|
||||
// longest/most complete:
|
||||
// 1. to and from a specified source to a specified target
|
||||
// 2. to and from the cpu to 1 gpu, and to/from a gpu to another gpu
|
||||
// (if available)
|
||||
// 3. to and from the cpu to 1 gpu and, to/from every gpu to every
|
||||
// other gpu
|
||||
// The default is #2 above. If *both* a source and dest. are set for #1
|
||||
// above, then that overrides both #2 and #3
|
||||
void set_src_pool(int pool_id) {src_pool_id_ = pool_id;}
|
||||
void set_dst_pool(int pool_id) {dst_pool_id_ = pool_id;}
|
||||
void set_full_test(bool full_test) {do_full_test_ = full_test;}
|
||||
int pool_index(void) const {return pool_index_;}
|
||||
void set_pool_index(int i) {pool_index_ = i;}
|
||||
int agent_index(void) const {return agent_index_;}
|
||||
void set_agent_index(int i) {agent_index_ = i;}
|
||||
std::vector<PoolInfo *> *pool_info(void) {return &pool_info_;}
|
||||
std::vector<AgentInfo *> *agent_info(void) {return &agent_info_;}
|
||||
std::vector<NodeInfo> *node_info(void) {return &node_info_;}
|
||||
|
||||
// @Brief: Display information about what this test does
|
||||
virtual void DisplayTestInfo(void);
|
||||
|
||||
private:
|
||||
//@Brief: Get real iteration number
|
||||
virtual size_t RealIterationNum();
|
||||
// @Brief: Get real iteration number
|
||||
virtual size_t RealIterationNum(void);
|
||||
|
||||
//@Brief: Get the mean copy time
|
||||
virtual double GetMeanTime(std::vector<double>& vec);
|
||||
// @Brief: Get the mean copy time
|
||||
double GetMeanTime(std::vector<double>* vec);
|
||||
|
||||
//@Brief: Get the min copy time
|
||||
virtual double GetMinTime(std::vector<double>& vec);
|
||||
// @Brief: Find and print out the needed topology info
|
||||
void FindTopology(void);
|
||||
|
||||
//@Brief: Find and print out the needed topology info
|
||||
void FindTopology();
|
||||
// @Brief: Run for Benchmark mode with verification
|
||||
void RunBenchmarkWithVerification(Transaction *t);
|
||||
|
||||
//@Brief: Parse the argument and interact with the user
|
||||
// to fill the vectors.
|
||||
void ParseArgument();
|
||||
// @Brief: Display benchmark result
|
||||
void DisplayBenchmark(Transaction *t) const;
|
||||
|
||||
//@Brief: Run for Benchmark mode
|
||||
void RunBenchmark();
|
||||
// @Brief: Print topology info
|
||||
void PrintTopology(void);
|
||||
|
||||
//@Brief: Run for Benchmark mode with verification
|
||||
void RunBenchmarkWithVerification();
|
||||
void ConstructTransactionList(void);
|
||||
|
||||
//@Brief: Dispaly Benchmark result
|
||||
void DisplayBenchmark();
|
||||
// @Brief: Find system region
|
||||
void FindSystemPool(void);
|
||||
|
||||
//@Brief: Run user defined
|
||||
void RunNormal();
|
||||
|
||||
//@Brief: Print topology info
|
||||
void PrintTopology();
|
||||
|
||||
//@Brief: Find system region
|
||||
void FindSystemRegion();
|
||||
|
||||
//@Brief: Check if agent and access memory pool, if so, set
|
||||
//access to the agent, if not, exit
|
||||
void AcquireAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool, void* ptr);
|
||||
|
||||
friend hsa_status_t AgentInfo(hsa_agent_t agent, void* data);
|
||||
friend hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data);
|
||||
|
||||
protected:
|
||||
// More variables declared for testing
|
||||
std::vector<transaction> tran_;
|
||||
std::vector<Transaction> tran_;
|
||||
|
||||
// Variable used to store agent info, indexed by agent_index_
|
||||
std::vector<agent_info> agent_info_;
|
||||
std::vector<AgentInfo *> agent_info_;
|
||||
|
||||
// Variable used to store region info, indexed by region_index_
|
||||
std::vector<region_info> region_info_;
|
||||
// Variable used to store region info, indexed by pool_index_
|
||||
std::vector<PoolInfo *> pool_info_;
|
||||
|
||||
// Variable to store argument number
|
||||
int argc_;
|
||||
|
||||
// Pointer to store address of argument text
|
||||
char** argv_;
|
||||
// To store node info
|
||||
std::vector<NodeInfo> node_info_;
|
||||
|
||||
// Variable to help count agent index
|
||||
int agent_index_;
|
||||
|
||||
// Variable to help count region index
|
||||
int region_index_;
|
||||
|
||||
// BenchMark mode by default
|
||||
bool bench_mark_mode_;
|
||||
|
||||
// BenchMark copy time
|
||||
std::vector<double> benchmark_copy_time_;
|
||||
|
||||
// Min time
|
||||
std::vector<double> min_time_;
|
||||
|
||||
// User define copy time
|
||||
double user_copy_time_;
|
||||
int pool_index_;
|
||||
|
||||
// Verification result
|
||||
bool verified_;
|
||||
|
||||
// If it needs verification
|
||||
bool verification_;
|
||||
|
||||
// To store node info
|
||||
std::vector<node_info> node_info_;
|
||||
// Store the testing level
|
||||
int src_pool_id_;
|
||||
int dst_pool_id_;
|
||||
bool do_full_test_;
|
||||
|
||||
// System region
|
||||
hsa_amd_memory_pool_t sys_region_;
|
||||
hsa_amd_memory_pool_t sys_pool_;
|
||||
|
||||
// CPU agent used for verification
|
||||
hsa_agent_t cpu_agent_;
|
||||
|
||||
constexpr const static char* help_info =
|
||||
MULTILINE(. / memory_async_copy - f source_region - t dst_region - s data_size_in_KB - r[y | n] - i iteration_number - b\n\
|
||||
\n\
|
||||
-h Help info \n\
|
||||
-f Memory Pool where data copy from \n\
|
||||
-t Memory Pool where data copy to \n\
|
||||
|
||||
-s Size of copy data, 256MB by default \n\
|
||||
-r If wants to add more copy \n\
|
||||
-i Iteration number for each copy \n\
|
||||
-b Enable benchmark mode \n\
|
||||
Note : -f - t must be specified\n);
|
||||
rocrtst::PerfTimer copy_timer_;
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif // ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_
|
||||
|
||||
@@ -1,411 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "memory_copy.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <algorithm>
|
||||
|
||||
// Constructs the test with every buffer pointer cleared and the device region
// handle zeroed, then requests the HSA base profile.
// NOTE(review): `num` is accepted but not used anywhere in this constructor.
MemoryCopy::MemoryCopy(size_t num)
    : BaseRocR(),
      ptr_src_(nullptr),
      ptr_dst_(nullptr),
      ptr_dev_src_(nullptr),
      ptr_dev_dst_(nullptr) {
  device_region_.handle = 0;
  set_requires_profile (HSA_PROFILE_BASE);
}
|
||||
|
||||
// Nothing to release here: buffers are handled in Close().
MemoryCopy::~MemoryCopy() = default;
|
||||
|
||||
// Human-readable label for each entry of MemoryCopy::Size (same index).
const char* MemoryCopy::Str[16] = {
  "64K",  "128K", "256K", "512K",
  "1M",   "2M",   "4M",   "8M",
  "16M",  "32M",  "64M",  "128M",
  "256M", "512M", "1G",   "2G"
};

// Copy sizes in bytes, doubling from 64 KiB (index 0) to 2 GiB (index 15).
const size_t MemoryCopy::Size[16] = {
  size_t(64) << 10,  size_t(128) << 10, size_t(256) << 10, size_t(512) << 10,
  size_t(1) << 20,   size_t(2) << 20,   size_t(4) << 20,   size_t(8) << 20,
  size_t(16) << 20,  size_t(32) << 20,  size_t(64) << 20,  size_t(128) << 20,
  size_t(256) << 20, size_t(512) << 20, size_t(1) << 30,   size_t(2) << 30
};
|
||||
|
||||
|
||||
void MemoryCopy::SetUp() {
|
||||
hsa_status_t err;
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
hsa_agent_t* cpu_dev = cpu_device();
|
||||
|
||||
// Find system memory pool for kernarg allocation.
|
||||
// hsa_amd_memory_pool_t sys_coarse_grained_pool;
|
||||
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
|
||||
&cpu_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
|
||||
ASSERT_NE(cpu_pool().handle, 0);
|
||||
|
||||
// Get local memory pool of the first GPU.
|
||||
// hsa_amd_memory_pool_t gpu_pool_;
|
||||
err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool,
|
||||
&device_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
ASSERT_NE(device_pool().handle, 0);
|
||||
|
||||
//Allocate buffers whose size is 2GB
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[12], 0, &ptr_src_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[12], 0, &ptr_dst_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), Size[11], 0, &ptr_dev_src_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), Size[11], 0, &ptr_dev_dst_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
//Assign the region ownership to GPU
|
||||
err = hsa_memory_assign_agent(ptr_dev_src_, *gpu_dev,
|
||||
HSA_ACCESS_PERMISSION_RW);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_memory_assign_agent(ptr_dev_dst_, *gpu_dev,
|
||||
HSA_ACCESS_PERMISSION_RW);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
//rocrtst::CommonCleanUp the two buffer, src to 1 each byte and dst to 0
|
||||
err = hsa_amd_memory_fill(ptr_src_, 1, Size[12]);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
//Check if the initialization is correct
|
||||
#if DEBUG
|
||||
std::cout << "Value after setting source buffer is: "
|
||||
<< (int)((uint8_t*)ptr_src_)[0] << std::endl;
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void MemoryCopy::Run() {
|
||||
hsa_status_t err;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t iterations = RealIterationNum();
|
||||
|
||||
//Iteration over the different data size on system memory
|
||||
for (int i = 0; i < 13; i++) {
|
||||
std::vector<double> time;
|
||||
|
||||
for (uint32_t it = 0; it < iterations; it++) {
|
||||
#if DEBUG
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
rocrtst::PerfTimer copy_timer;
|
||||
int index = copy_timer.CreateTimer();
|
||||
|
||||
copy_timer.StartTimer(index);
|
||||
err = hsa_memory_copy(ptr_dst_, ptr_src_, Size[i]);
|
||||
copy_timer.StopTimer(index);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Push the result back to vector time
|
||||
time.push_back(copy_timer.ReadTimer(index));
|
||||
|
||||
#if DEBUG
|
||||
//Check if the data copied is correct
|
||||
uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
|
||||
|
||||
for (uint32_t j = 0; j < Size[i]; j++) {
|
||||
ASSERT_EQ(temp_ptr[j], 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#if DEBUG
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
|
||||
//Get mean copy time and store to the array
|
||||
sys2sys_copy_time_.push_back(GetMeanTime(time));
|
||||
}
|
||||
|
||||
//Copy from system memory to device memory
|
||||
for (int i = 0; i < 12; i++) {
|
||||
std::vector<double> time;
|
||||
|
||||
for (uint32_t it = 0; it < iterations; it++) {
|
||||
#if DEBUG
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
rocrtst::PerfTimer copy_timer;
|
||||
int index = copy_timer.CreateTimer();
|
||||
|
||||
copy_timer.StartTimer(index);
|
||||
err = hsa_memory_copy(ptr_dev_src_, ptr_src_, Size[i]);
|
||||
copy_timer.StopTimer(index);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Push the result back to vector time
|
||||
time.push_back(copy_timer.ReadTimer(index));
|
||||
|
||||
#if DEBUG
|
||||
//Check if the data copied is correct
|
||||
uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
|
||||
|
||||
for (uint32_t j = 0; j < Size[i]; j++) {
|
||||
ASSERT_EQ(temp_ptr[j], 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#if DEBUG
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
|
||||
//Get mean copy time and store to the array
|
||||
sys2dev_copy_time_.push_back(GetMeanTime(time));
|
||||
}
|
||||
|
||||
//Copy from device memory to device memory
|
||||
for (int i = 0; i < 12; i++) {
|
||||
std::vector<double> time;
|
||||
|
||||
for (uint32_t it = 0; it < iterations; it++) {
|
||||
#if DEBUG
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
rocrtst::PerfTimer copy_timer;
|
||||
int index = copy_timer.CreateTimer();
|
||||
|
||||
copy_timer.StartTimer(index);
|
||||
err = hsa_memory_copy(ptr_dev_dst_, ptr_dev_src_, Size[i]);
|
||||
copy_timer.StopTimer(index);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Push the result back to vector time
|
||||
time.push_back(copy_timer.ReadTimer(index));
|
||||
|
||||
#if DEBUG
|
||||
//Check if the data copied is correct
|
||||
uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
|
||||
|
||||
for (uint32_t j = 0; j < Size[i]; j++) {
|
||||
ASSERT_EQ(temp_ptr[j], 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#if DEBUG
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
|
||||
//Get mean copy time and store to the array
|
||||
dev2dev_copy_time_.push_back(GetMeanTime(time));
|
||||
}
|
||||
|
||||
//Copy from device memory to system memory
|
||||
for (int i = 0; i < 12; i++) {
|
||||
std::vector<double> time;
|
||||
|
||||
for (uint32_t it = 0; it < iterations; it++) {
|
||||
#if DEBUG
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
rocrtst::PerfTimer copy_timer;
|
||||
int index = copy_timer.CreateTimer();
|
||||
|
||||
copy_timer.StartTimer(index);
|
||||
err = hsa_memory_copy(ptr_dst_, ptr_dev_src_, Size[i]);
|
||||
copy_timer.StopTimer(index);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Push the result back to vector time
|
||||
time.push_back(copy_timer.ReadTimer(index));
|
||||
|
||||
#if DEBUG
|
||||
//Check if the data copied is correct
|
||||
uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
|
||||
|
||||
for (uint32_t j = 0; j < Size[i]; j++) {
|
||||
if (temp_ptr[j] != 1) {
|
||||
ASSERT_EQ(temp_ptr[j], 1);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#if DEBUG
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
|
||||
//Get mean copy time and store to the array
|
||||
dev2sys_copy_time_.push_back(GetMeanTime(time));
|
||||
}
|
||||
}
|
||||
|
||||
size_t MemoryCopy::RealIterationNum() {
|
||||
return num_iteration() * 1.2 + 1;
|
||||
}
|
||||
|
||||
// Computes a trimmed mean of the timing samples in `vec`.
//
// The samples are sorted ascending; the smallest sample plus a further
// floor(num_iteration() * 0.1) samples are discarded, then at most
// num_iteration() of the remaining smallest samples are averaged.
//
// `vec` is modified in place (sorted and trimmed).
// Returns the mean of the retained samples, or 0.0 when nothing is left to
// average (empty input, or fewer samples than the low-end trim removes).
double MemoryCopy::GetMeanTime(std::vector<double>& vec) {
  if (vec.empty()) {
    return 0.0;
  }

  std::sort(vec.begin(), vec.end());

  // Low-end trim, clamped so we never erase past the end of a short vector.
  const size_t wanted_trim =
      static_cast<size_t>(1) + static_cast<size_t>(num_iteration() * 0.1);
  const size_t low_trim = std::min(vec.size(), wanted_trim);
  vec.erase(vec.begin(), vec.begin() + low_trim);

  // Keep at most num_iteration() samples; drop the slowest ones beyond that.
  const size_t keep = num_iteration();
  if (vec.size() > keep) {
    vec.erase(vec.begin() + keep, vec.end());
  }

  if (vec.empty()) {
    return 0.0;
  }

  double mean = 0.0;
  for (const double sample : vec) {
    mean += sample;
  }

  mean /= vec.size();
  return mean;
}
|
||||
|
||||
void MemoryCopy::DisplayResults() const {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
printf(
|
||||
"================ System to System ==================================\n");
|
||||
printf(" Data Size BandWidth(GB/s)\n");
|
||||
|
||||
//Output the BW of system memory to system memory
|
||||
for (int i = 0; i < 13; i++) {
|
||||
double band_width = (double) Size[i] / sys2sys_copy_time_[i] / 1024 / 1024
|
||||
/ 1024 * 2;
|
||||
#ifdef DEBUG
|
||||
printf("size: %zu time: %f\n", Size[i], sys2sys_copy_time_[i]);
|
||||
#endif
|
||||
printf(" %s %lf\n", Str[i], band_width);
|
||||
}
|
||||
|
||||
printf(
|
||||
"================ System to Device ===================================\n");
|
||||
|
||||
for (int i = 0; i < 12; i++) {
|
||||
double band_width = (double) Size[i] / sys2dev_copy_time_[i] / 1024 / 1024
|
||||
/ 1024 * 2;
|
||||
#ifdef DEBUG
|
||||
printf("size: %zu time: %f\n", Size[i], sys2dev_copy_time_[i]);
|
||||
#endif
|
||||
printf(" %s %lf\n", Str[i], band_width);
|
||||
}
|
||||
|
||||
printf(
|
||||
"================ Device to Device ===================================\n");
|
||||
|
||||
for (int i = 0; i < 12; i++) {
|
||||
double band_width = (double) Size[i] / dev2dev_copy_time_[i] / 1024 / 1024
|
||||
/ 1024 * 2;
|
||||
#ifdef DEBUG
|
||||
printf("size: %zu time: %f\n", Size[i], dev2dev_copy_time_[i]);
|
||||
#endif
|
||||
printf(" %s %lf\n", Str[i], band_width);
|
||||
}
|
||||
|
||||
printf(
|
||||
"================ Device to System ===================================\n");
|
||||
|
||||
for (int i = 0; i < 12; i++) {
|
||||
double band_width = (double) Size[i] / dev2sys_copy_time_[i] / 1024 / 1024
|
||||
/ 1024 * 2;
|
||||
#ifdef DEBUG
|
||||
printf("size: %zu time: %f\n", Size[i], dev2sys_copy_time_[i]);
|
||||
#endif
|
||||
printf(" %s %lf\n", Str[i], band_width);
|
||||
}
|
||||
|
||||
printf("===================================================\n");
|
||||
return;
|
||||
}
|
||||
|
||||
void MemoryCopy::Close() {
|
||||
hsa_status_t err;
|
||||
|
||||
//Free the memory allocated
|
||||
err = hsa_memory_free(ptr_src_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_memory_free(ptr_dst_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
ptr_src_ = NULL;
|
||||
ptr_dst_ = NULL;
|
||||
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
return;
|
||||
}
|
||||
@@ -1,109 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_MEMORY_MEM_COPY_H__
|
||||
#define __ROCRTST_SRC_MEMORY_MEM_COPY_H__
|
||||
|
||||
#include "common/base_rocr.h"
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include <vector>
|
||||
|
||||
// Performance test case that records copy times per data size for four
// directions (system/device as source and destination) and reports them.
class MemoryCopy: public rocrtst::BaseRocR, public PerfBase {

 public:
  // Constructor; `num` defaults to 100.
  MemoryCopy(size_t num = 100);

  // Destructor.
  virtual ~MemoryCopy();

  // Sets up the environment for the measurement.
  virtual void SetUp();

  // Executes the core measurement.
  virtual void Run();

  // Cleans up and releases the resources.
  virtual void Close();

  // Displays the results.
  virtual void DisplayResults() const;

 private:
  // Copy data sizes and the corresponding label strings (same index).
  static const size_t Size[16];
  static const char* Str[16];

  // Returns the real iteration number.
  virtual size_t RealIterationNum();

  // Returns the mean copy time for the samples in `vec`.
  virtual double GetMeanTime(std::vector<double>& vec);

 protected:
  // Source pointer from which data is copied.
  void* ptr_src_;

  // Destination pointer to which data is copied.
  void* ptr_dst_;

  // Pointers to device memory.
  void* ptr_dev_src_;
  void* ptr_dev_dst_;

  // Timer results for each data size, one vector per copy direction.
  std::vector<double> sys2sys_copy_time_;
  std::vector<double> sys2dev_copy_time_;
  std::vector<double> dev2sys_copy_time_;
  std::vector<double> dev2dev_copy_time_;

  // Device memory region.
  hsa_region_t device_region_;
};
|
||||
|
||||
#endif
|
||||
@@ -1,284 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
#include "queue_concurrency.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "common/os.h"
|
||||
#include "hsa/hsa_ext_finalize.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <thread>
|
||||
|
||||
// Constructs the test with eight execution-time slots, zeroed counters,
// interrupts enabled and the HSA full profile requested.
QueueConcurrency::QueueConcurrency()
    : BaseRocR(),
      execution_time_(8) {
  std_time_ = 0.0;
  queue_num_ = 0;

  set_enable_interrupt(true);
  set_requires_profile (HSA_PROFILE_FULL);
}
|
||||
|
||||
// No resources are released by the destructor itself.
QueueConcurrency::~QueueConcurrency() = default;
|
||||
|
||||
void QueueConcurrency::SetUp() {
|
||||
hsa_status_t err;
|
||||
|
||||
set_kernel_file_name("test_kernel.o");
|
||||
set_kernel_name("&__OpenCL_vec_assign_kernel");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
// Fill up part of aql pakcet which are the same cross the threads
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
|
||||
// Create a queue
|
||||
hsa_queue_t* q = main_queue();
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
// Output of kernel
|
||||
int output = 0;
|
||||
|
||||
// Iteration number
|
||||
int iterations = 1024 * 1024; // * 1024;
|
||||
|
||||
struct ALIGNED_(16)
|
||||
args_t {
|
||||
void* arg0;
|
||||
int arg1;
|
||||
} local_args;
|
||||
|
||||
local_args.arg0 = (void*) &output;
|
||||
local_args.arg1 = iterations;
|
||||
|
||||
err = hsa_memory_register(&local_args, sizeof(local_args));
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
//Obtain the current queue write index.
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
//Write the aql packet at the calculated queue index address.
|
||||
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
hsa_kernel_dispatch_packet_t* pkt_addr =
|
||||
(hsa_kernel_dispatch_packet_t*) (main_queue()->base_address);
|
||||
|
||||
(pkt_addr)[index & queue_mask] = aql();
|
||||
(pkt_addr)[index & queue_mask].completion_signal = signal();
|
||||
(pkt_addr)[index & queue_mask].kernarg_address = &local_args;
|
||||
|
||||
//Get timing stamp and ring the doorbell to dispatch the kernel.
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
//.type = HSA_PACKET_TYPE_DISPATCH;
|
||||
(pkt_addr)[index & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
//Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
|
||||
if (1 == i) {
|
||||
std_time_ = p_timer.ReadTimer(id);
|
||||
}
|
||||
}
|
||||
|
||||
//Destroy the queue
|
||||
err = hsa_queue_destroy(main_queue());
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void QueueConcurrency::Run() {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Launch 8 child threads
|
||||
std::vector < std::thread > threads;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
threads.push_back(std::thread(&QueueConcurrency::ThreadFunc, this, i));
|
||||
}
|
||||
|
||||
// Wait for join
|
||||
for (int i = 0; i < 8; i++) {
|
||||
threads[i].join();
|
||||
}
|
||||
|
||||
CalculateQueueNum();
|
||||
}
|
||||
|
||||
void QueueConcurrency::CalculateQueueNum() {
|
||||
for (int i = 0; i < 8; i++) {
|
||||
double expected_time = execution_time_[0] / (1 << i);
|
||||
double deviation = sqrt(
|
||||
(expected_time - execution_time_[i])
|
||||
* (expected_time - execution_time_[i]));
|
||||
|
||||
if (deviation < 0.1 * expected_time) {
|
||||
queue_num_++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void QueueConcurrency::DisplayResults() const {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
std::cout << execution_time_[i] << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "Number of Concurrent Queue is: " << queue_num_ << std::endl;
|
||||
|
||||
ASSERT_EQ(queue_num_, 3);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void QueueConcurrency::Close() {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void QueueConcurrency::ThreadFunc(int threadID) {
|
||||
// Define local queue and signal
|
||||
hsa_queue_t* queue;
|
||||
hsa_signal_t signal;
|
||||
hsa_status_t err;
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
// Create a signal
|
||||
err = hsa_signal_create(1, 0, NULL, &signal);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
rocrtst::CreateQueue(*gpu_dev, &queue);
|
||||
|
||||
std::vector<double> time;
|
||||
|
||||
for (uint32_t i = 0; i < num_iteration(); i++) {
|
||||
// Output of kernel
|
||||
int output = 0;
|
||||
|
||||
// Iteration number
|
||||
int iterations = 1024 * 1024 / (1 << threadID);
|
||||
|
||||
struct ALIGNED_(16)
|
||||
args_t {
|
||||
void* arg0;
|
||||
int arg1;
|
||||
} local_args;
|
||||
|
||||
local_args.arg0 = (void*) &output;
|
||||
local_args.arg1 = iterations;
|
||||
|
||||
err = hsa_memory_register(&local_args, sizeof(local_args));
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
//Obtain the current queue write index.
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
|
||||
|
||||
//Write the aql packet at the calculated queue index address.
|
||||
|
||||
const uint32_t queue_mask = queue->size - 1;
|
||||
hsa_kernel_dispatch_packet_t* pkt_addr =
|
||||
(hsa_kernel_dispatch_packet_t*) (queue->base_address);
|
||||
(pkt_addr)[index & queue_mask] = aql();
|
||||
(pkt_addr)[index & queue_mask].completion_signal = signal;
|
||||
(pkt_addr)[index & queue_mask].kernarg_address = &local_args;
|
||||
|
||||
//Get timing stamp and ring the doorbell to dispatch the kernel.
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
//.type = HSA_PACKET_TYPE_DISPATCH;
|
||||
(pkt_addr)[index & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
hsa_signal_store_screlease(queue->doorbell_signal, index);
|
||||
|
||||
//Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
hsa_signal_store_screlease(signal, 1);
|
||||
|
||||
time.push_back(p_timer.ReadTimer(id));
|
||||
|
||||
EXPECT_EQ(output, iterations);
|
||||
|
||||
if (1 == i) {
|
||||
execution_time_[threadID] = p_timer.ReadTimer(id);
|
||||
}
|
||||
}
|
||||
|
||||
time.erase(time.begin());
|
||||
execution_time_[threadID] = rocrtst::CalcMean(time);
|
||||
return;
|
||||
}
|
||||
|
||||
-271
@@ -1,271 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "queue_create_destroy_latency.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "common/common.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include "hsa/hsa_ext_finalize.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <stdio.h>
|
||||
|
||||
static const int kGridDimension = 1024;
|
||||
|
||||
// Construct the test case class
|
||||
QueueLatency::QueueLatency() :
|
||||
BaseRocR() {
|
||||
max_queue_ = 0;
|
||||
in_ = NULL;
|
||||
out_ = NULL;
|
||||
}
|
||||
|
||||
// Destruct the test case claa
|
||||
QueueLatency::~QueueLatency() {
|
||||
|
||||
}
|
||||
|
||||
void QueueLatency::Close() {
|
||||
hsa_memory_free (in_);
|
||||
hsa_memory_free (out_);
|
||||
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
return;
|
||||
}
|
||||
|
||||
// Set up the environment
|
||||
void QueueLatency::SetUp() {
|
||||
hsa_status_t err;
|
||||
|
||||
// We get hangs with vector_copy
|
||||
set_kernel_file_name("vector_copy.o");
|
||||
set_kernel_name("&__vector_copy_kernel");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
hsa_agent_t* cpu_dev = cpu_device();
|
||||
|
||||
// Get the max queue which can be active for GPU device
|
||||
err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_QUEUES_MAX, &max_queue_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Find system coarse grained region
|
||||
err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
|
||||
&cpu_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
size_t pool_size;
|
||||
err = hsa_amd_memory_pool_get_info(cpu_pool(), HSA_AMD_MEMORY_POOL_INFO_SIZE,
|
||||
&pool_size);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(),
|
||||
kGridDimension * kGridDimension * 4, 0,
|
||||
(void**) &in_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(),
|
||||
kGridDimension * kGridDimension * 4, 0,
|
||||
(void**) &out_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
//rocrtst::LoadKernelFromObjFile(gpu_dev, "./"+ kernel_file_name() + ".o");
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
// Fill up the aql packet
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().grid_size_x = kGridDimension * kGridDimension;
|
||||
|
||||
// rocrtst::CommonCleanUp vector memory and register them
|
||||
//memset(in_, 1, kGridDimension*kGridDimension * 4);
|
||||
|
||||
err = hsa_amd_memory_fill(in_, 1, kGridDimension * kGridDimension * 4);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void QueueLatency::Run() {
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
hsa_status_t err;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// The outer for loop iterator represents the predefined queue number
|
||||
// After creating a queue, launch a kernel to train the queue, then destroy
|
||||
// TODO:Hardcode max_queue_ to 100
|
||||
max_queue_ = 20;
|
||||
|
||||
for (uint32_t pre_defined_num = 0; pre_defined_num < max_queue_;
|
||||
pre_defined_num++) {
|
||||
#ifdef DEBUG
|
||||
std::cout << "Existing queue number: " << pre_defined_num << std::endl;
|
||||
#endif
|
||||
// vector to store the creation and destruction time
|
||||
std::vector<double> creation;
|
||||
std::vector<double> destruction;
|
||||
// Create pre_defined_num queues first
|
||||
hsa_queue_t* q;
|
||||
|
||||
for (uint32_t i = 0; i < pre_defined_num; i++) {
|
||||
q = main_queue();
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
|
||||
queues_.push_back(q);
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < num_iteration(); i++) {
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
|
||||
uint32_t size = 0;
|
||||
err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &size);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
p_timer.StartTimer(id);
|
||||
hsa_queue_t* q = main_queue();
|
||||
|
||||
err = hsa_queue_create(*gpu_dev, size, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
|
||||
UINT32_MAX, UINT32_MAX, &q);
|
||||
p_timer.StopTimer(id);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
creation.push_back(p_timer.ReadTimer(id));
|
||||
|
||||
p_timer.ResetTimer(id);
|
||||
|
||||
// Launch a kernel to the currently created queue
|
||||
// Allocate kernel parameter
|
||||
typedef struct args_t {
|
||||
void* in_buf;
|
||||
void* out_buf;
|
||||
} args;
|
||||
|
||||
args* kern_ptr = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
|
||||
(void**) &kern_ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
kern_ptr->in_buf = in_;
|
||||
kern_ptr->out_buf = out_;
|
||||
|
||||
aql().kernarg_address = kern_ptr;
|
||||
|
||||
// Obtain the current queue write index.
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index
|
||||
& queue_mask] = aql();
|
||||
|
||||
((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index
|
||||
& queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
|
||||
// Destroy the queue and record the timer
|
||||
p_timer.StartTimer(id);
|
||||
err = hsa_queue_destroy(main_queue());
|
||||
p_timer.StopTimer(id);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
destruction.push_back(p_timer.ReadTimer(id));
|
||||
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
|
||||
// Destroy the predefined queue
|
||||
for (uint32_t i = 0; i < pre_defined_num; i++) {
|
||||
|
||||
ASSERT_EQ(queues_.size(), pre_defined_num);
|
||||
|
||||
err = hsa_queue_destroy(queues_[i]);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
// Clear the queue vector
|
||||
queues_.clear();
|
||||
|
||||
// Get the mean creation and detruction time and push back
|
||||
double creation_mean = rocrtst::CalcMean(creation);
|
||||
double destruction_mean = rocrtst::CalcMean(destruction);
|
||||
construction_mean_.push_back(creation_mean);
|
||||
destruction_mean_.push_back(destruction_mean);
|
||||
}
|
||||
}
|
||||
|
||||
void QueueLatency::DisplayResults() const {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
printf("======================================================\n");
|
||||
printf(" Existing queue# Creation Destroy\n");
|
||||
|
||||
for (uint32_t i = 0; i < max_queue_; i++) {
|
||||
printf(" %d, %fms %fms\n", i,
|
||||
construction_mean_[i] * 1e3, destruction_mean_[i] * 1e3);
|
||||
}
|
||||
}
|
||||
-95
@@ -1,95 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__
|
||||
#define __ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include <vector>
|
||||
|
||||
//@Brief: Performance test that records queue creation and destruction
// latency for a varying number of already existing queues
class QueueLatency: public rocrtst::BaseRocR, public PerfBase {
 public:
  //@Brief: Constructor
  QueueLatency();

  //@Brief: Destructor
  ~QueueLatency();

  //@Brief: Set up the test environment
  virtual void SetUp();

  //@Brief: Run the test
  virtual void Run();

  //@Brief: Clean up and close the test
  virtual void Close();

  //@Brief: Display results
  virtual void DisplayResults() const;

 private:
  //@Brief: Pointers to the queues that exist while a timed queue is
  // created and destroyed
  std::vector<hsa_queue_t*> queues_;

  //@Brief: Mean queue construction time, one entry per existing-queue count
  std::vector<double> construction_mean_;

  //@Brief: Mean queue destruction time, one entry per existing-queue count
  std::vector<double> destruction_mean_;

  //@Brief: Max number of queues which can be active for the device
  uint32_t max_queue_;

  //@Brief: Source vector memory read by the kernel
  uint8_t* in_;

  //@Brief: Destination vector memory written by the kernel
  uint8_t* out_;
};
|
||||
|
||||
#endif //__ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__
|
||||
|
||||
@@ -1,281 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "system_load_bandwidth.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "common/os.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <algorithm>
|
||||
|
||||
#if 0
|
||||
// Fills in_data with the values each thread is expected to read.
//
// The buffer is treated as num_loops * num_ops consecutive rows of num_thrds
// entries; entry t of every row is set to (t << 2), i.e. the thread id left
// shifted by 2. in_data must have room for num_loops * num_ops * num_thrds
// elements. Nothing is written when any of the counts is zero.
static void initGlobalReadBuffer(uint32_t* in_data, uint32_t num_thrds,
                                 uint32_t num_ops, uint32_t num_loops) {
  uint32_t next = 0;

  // Counters are unsigned to match the unsigned bounds they are compared to.
  for (uint32_t loop = 0; loop < num_loops; ++loop) {
    for (uint32_t op = 0; op < num_ops; ++op) {
      // Write the value to be read by each thread
      for (uint32_t thrd = 0; thrd < num_thrds; ++thrd) {
        in_data[next++] = thrd << 2;
      }
    }
  }
}
|
||||
|
||||
// Validates the kernel output buffer.
//
// Entry idx of data is expected to equal (idx << 2) * scale for every idx in
// [0, num_thrds). On the first mismatch the offending index and value are
// printed, prefixed with kernel_name, and checking stops.
//
// Returns true when all num_thrds entries match (including num_thrds == 0),
// false otherwise. When print_debug is set, the expected and actual value of
// every inspected entry is printed.
static bool verifyGlobalLoadKernel(uint32_t* data, uint32_t num_thrds,
                                   uint32_t scale, const char* kernel_name,
                                   bool print_debug) {
  for (uint32_t idx = 0; idx < num_thrds; idx++) {
    const uint32_t valid_value = (idx << 2) * scale;

    if (print_debug) {
      std::cout << "Value expected = " << valid_value << std::endl;
      std::cout << "Value of data = " << data[idx] << std::endl;
    }

    if (data[idx] != valid_value) {
      std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: " << idx
                << std::endl;
      std::cout << kernel_name << ": VALUE @ Bad index: " << data[idx]
                << std::endl;
      std::cout << std::endl;
      // Report the failure to the caller instead of claiming success.
      return false;
    }
  }

#ifdef DEBUG
  std::cout << kernel_name << ": Passed validation" << std::endl;
  std::cout << std::endl;
#endif

  return true;
}
|
||||
#endif
|
||||
|
||||
// Constructor
|
||||
SystemLoadBandwidth::SystemLoadBandwidth() :
|
||||
BaseRocR() {
|
||||
set_group_size(0);
|
||||
num_group_ = 0;
|
||||
num_cus_ = 0;
|
||||
|
||||
kernel_loop_count_ = 0;
|
||||
mean_ = 0.0;
|
||||
data_size_ = 0;
|
||||
set_enable_interrupt(0);
|
||||
}
|
||||
|
||||
// Destructor
|
||||
SystemLoadBandwidth::~SystemLoadBandwidth() {
|
||||
}
|
||||
|
||||
// Set up the test environment
|
||||
void SystemLoadBandwidth::SetUp() {
|
||||
set_kernel_file_name("sysMemRead.o");
|
||||
set_kernel_name("&__SysMemLoad");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
SetWorkItemNum();
|
||||
|
||||
//Create a queue with max number size
|
||||
hsa_queue_t* q = main_queue();
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
uint32_t total_work_items = num_cus_ * num_group_ * group_size();
|
||||
|
||||
//Fill up part of aql
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().workgroup_size_x = group_size();
|
||||
aql().grid_size_x = total_work_items;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Run the test
|
||||
void SystemLoadBandwidth::Run() {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t total_workitems = num_cus_ * num_group_ * group_size();
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
hsa_status_t err;
|
||||
|
||||
uint32_t ops_thrd = 32;
|
||||
uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t);
|
||||
uint64_t total_ops = (uint64_t) total_workitems * ops_thrd;
|
||||
uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t);
|
||||
//uint32_t *in_data = (uint32_t *)malloc(in_data_size);
|
||||
err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool,
|
||||
&device_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
int32_t* in_data = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), in_data_size, 0,
|
||||
(void**) &in_data);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
memset(in_data, 0, in_data_size);
|
||||
uint32_t out_data_size = total_workitems * sizeof(uint32_t);
|
||||
//uint32_t *out_data = (uint32_t *)malloc(out_data_size);
|
||||
uint32_t* out_data;
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
|
||||
(void**) &out_data);
|
||||
memset(out_data, 0, out_data_size);
|
||||
|
||||
data_size_ = in_data_size;
|
||||
|
||||
// initGlobalReadBuffer (in_data, total_workitems, ops_thrd,
|
||||
// kernel_loop_count_);
|
||||
|
||||
typedef struct local_args_t {
|
||||
void* arg0;
|
||||
void* arg1;
|
||||
uint64_t arg2;
|
||||
void* arg3;
|
||||
} args;
|
||||
|
||||
args* kern_ptr = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), sizeof(args), 0,
|
||||
(void**) &kern_ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// in_data is 32 bit ptr, so adding total_ops
|
||||
kern_ptr->arg0 = in_data;
|
||||
kern_ptr->arg1 = in_data + total_ops;
|
||||
kern_ptr->arg2 = addr_step;
|
||||
kern_ptr->arg3 = out_data;
|
||||
|
||||
aql().kernarg_address = kern_ptr;
|
||||
|
||||
std::vector<double> time;
|
||||
|
||||
int it = num_iteration() * 1.2 + 1;
|
||||
|
||||
void *q_base_addr = main_queue()->base_address;
|
||||
|
||||
for (int i = 0; i < it; i++) {
|
||||
// Obtain the current queue write index
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
|
||||
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
#if DEBUG
|
||||
std::cout << ".";
|
||||
std::cout.flush();
|
||||
#endif
|
||||
|
||||
// Verify the results
|
||||
// uint32_t scale = kernel_loop_count_ * ops_thrd;
|
||||
//verifyGlobalLoadKernel(out_data, total_workitems, scale,
|
||||
// kernel_name_.c_str(), false);
|
||||
|
||||
time.push_back(p_timer.ReadTimer(id));
|
||||
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
}
|
||||
|
||||
time.erase(time.begin());
|
||||
std::sort(time.begin(), time.end());
|
||||
time.erase(time.begin() + num_iteration(), time.end());
|
||||
mean_ = rocrtst::CalcMean(time);
|
||||
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
void SystemLoadBandwidth::Close() {
|
||||
hsa_status_t err;
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void SystemLoadBandwidth::DisplayResults() const {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "=======================================" << std::endl;
|
||||
std::cout << "System Load Bandwidth: %f(GB/S)" <<
|
||||
data_size_ / mean_ / 1024 / 1024 / 1024 << std::endl;
|
||||
}
|
||||
@@ -1,119 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_INC_SYSTEM_LOAD_BANDWIDTH_H__
|
||||
#define __ROCRTST_SRC_INC_SYSTEM_LOAD_BANDWIDTH_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include <stdio.h>
|
||||
|
||||
//@Brief: Performance test that measures the bandwidth of kernel loads and
// reports it in GB/S
class SystemLoadBandwidth: public rocrtst::BaseRocR, public PerfBase {
 public:
  //@Brief: Constructor
  SystemLoadBandwidth();

  //@Brief: Destructor
  ~SystemLoadBandwidth();

  //@Brief: Set up the testing environment
  virtual void SetUp();

  //@Brief: Run the test case
  virtual void Run();

  //@Brief: Close and clean up the test environment
  virtual void Close();

  //@Brief: Display load bandwidth
  virtual void DisplayResults() const;

  //@Brief: Set work-item configuration. When INTERACTIVE is defined the
  // values are read from stdin; otherwise fixed defaults are used
  // (32 CUs, 128 groups, group size 256, 16 kernel loops).
  void SetWorkItemNum() {
#ifdef INTERACTIVE
    // All targets are unsigned 32-bit values, so they are scanned with %u.
    uint32_t tmp = 0;
    printf("Please input the number of CUs you want to try:\n");
    scanf("%u", &num_cus_);

    printf("Please input the number of groups you want to try:\n");
    scanf("%u", &num_group_);

    printf("Please input the size of each group:\n");
    scanf("%u", &tmp);
    set_group_size(tmp);

    printf("Please input the number of kernel loop you want to try:\n");
    scanf("%u", &kernel_loop_count_);
#else
    num_cus_ = 32;
    num_group_ = 128;
    set_group_size(256);
    kernel_loop_count_ = 16;
#endif
    return;
  }

 private:
  //@Brief: number of group
  uint32_t num_group_;

  //@Brief: number of CUs
  uint32_t num_cus_;

  //@Brief: number of kernel loop
  uint32_t kernel_loop_count_;

  //@Brief: Mean execution time
  double mean_;

  //@Brief: data size for test
  uint64_t data_size_;
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,243 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "system_store_bandwidth.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
// Validate the buffer written by the global store kernel.
//
// Element data[i], for every i in [0, num_thrds), is expected to hold
// (i << 2). The first mismatch is reported on stdout and ends the check.
//
// @param data        buffer to validate; must hold at least num_thrds entries
// @param num_thrds   number of elements checked on each pass
// @param loop_cnt    number of outer passes
// @param ops_loop    number of inner passes per outer pass
// @param kernel_name prefix used in the diagnostic output
// @param print_debug currently unused; kept so existing callers still compile
// @return true when no mismatch was found (including when loop_cnt or
//         ops_loop is zero and nothing is checked), false otherwise
static bool verifyGlobalStoreKernel(uint32_t* data, uint32_t num_thrds,
                                    uint32_t loop_cnt, uint32_t ops_loop,
                                    const char* kernel_name,
                                    bool print_debug) {
  (void) print_debug;

  // NOTE(review): the element index does not depend on idx1/idx2, so every
  // pass re-checks the same num_thrds elements. The pass structure is kept
  // because the intended buffer layout is not visible here -- confirm
  // against the kernel source.
  for (uint32_t idx1 = 0; idx1 < loop_cnt; idx1++) {
    for (uint32_t idx2 = 0; idx2 < ops_loop; idx2++) {
      for (uint32_t idx3 = 0; idx3 < num_thrds; idx3++) {
        if (data[idx3] != (idx3 << 2)) {
          std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: "
                    << idx3 << std::endl;
          std::cout << kernel_name << ": VALUE @ Bad index: " << data[idx3]
                    << std::endl;
          // Report the failure to the caller instead of carrying on and
          // claiming success.
          return false;
        }
      }
    }
  }

#ifdef DEBUG
  std::cout << kernel_name << ": Passed validation" << std::endl;
  std::cout << std::endl;
#endif

  return true;
}
|
||||
|
||||
// Constructor
|
||||
SystemStoreBandwidth::SystemStoreBandwidth() :
|
||||
BaseRocR() {
|
||||
|
||||
set_group_size(0);
|
||||
num_group_ = 0;
|
||||
num_cus_ = 0;
|
||||
|
||||
kernel_loop_count_ = 0;
|
||||
mean_ = 0.0;
|
||||
data_size_ = 0;
|
||||
}
|
||||
|
||||
// Destructor
|
||||
SystemStoreBandwidth::~SystemStoreBandwidth() {
|
||||
}
|
||||
|
||||
// Set up the test environment
|
||||
void SystemStoreBandwidth::SetUp() {
|
||||
|
||||
set_kernel_file_name("sysMemWrite.o");
|
||||
set_kernel_name("&__SysMemStore");
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
SetWorkItemNum();
|
||||
|
||||
//Create a queue with max number size
|
||||
hsa_queue_t* q = nullptr;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
set_main_queue(q);
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
uint32_t total_work_items = num_cus_ * num_group_ * group_size();
|
||||
|
||||
//Fill up part of aql
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
aql().workgroup_size_x = group_size();
|
||||
aql().grid_size_x = total_work_items;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Run the test
|
||||
void SystemStoreBandwidth::Run() {
|
||||
hsa_status_t err;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t total_workitems = num_cus_ * num_group_ * group_size();
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
uint32_t ops_thrd = 16;
|
||||
uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t);
|
||||
uint64_t total_ops = (uint64_t) total_workitems * kernel_loop_count_
|
||||
* ops_thrd;
|
||||
uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t);
|
||||
err = hsa_amd_agent_iterate_memory_pools(*gpu_dev,
|
||||
rocrtst::FindStandardPool, &device_pool());
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
uint32_t* in_data = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), in_data_size, 0,
|
||||
(void**) &in_data);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
//memset(in_data, 0, in_data_size);
|
||||
err = hsa_amd_memory_fill(in_data, 0, in_data_size);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
uint32_t out_data_size = total_workitems * sizeof(uint32_t);
|
||||
uint32_t* out_data = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
|
||||
(void**) &out_data);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
//memset(out_data, 0, out_data_size);
|
||||
err = hsa_amd_memory_fill(out_data, 0, out_data_size);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
data_size_ = in_data_size;
|
||||
|
||||
typedef struct local_args_t {
|
||||
void* arg0;
|
||||
void* arg1;
|
||||
uint64_t arg2;
|
||||
void* arg3;
|
||||
} args;
|
||||
|
||||
// in_data is 32 bit ptr, so adding total_ops
|
||||
args* kern_ptr = NULL;
|
||||
err = hsa_amd_memory_pool_allocate(device_pool(), sizeof(args), 0,
|
||||
(void**) &kern_ptr);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
kern_ptr->arg0 = in_data;
|
||||
kern_ptr->arg1 = in_data + total_ops;
|
||||
kern_ptr->arg2 = addr_step;
|
||||
kern_ptr->arg3 = out_data;
|
||||
|
||||
aql().kernarg_address = kern_ptr;
|
||||
|
||||
std::vector<double> time;
|
||||
void *q_base_addr = main_queue()->base_address;
|
||||
for (uint32_t i = 0; i < num_iteration(); i++) {
|
||||
// Obtain the current queue write index
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
|
||||
|
||||
// Write the aql packet at the calculated queue index address.
|
||||
const uint32_t queue_mask = main_queue()->size - 1;
|
||||
((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask] = aql();
|
||||
|
||||
rocrtst::PerfTimer p_timer;
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask].header |=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
|
||||
;
|
||||
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
// Verify the results
|
||||
verifyGlobalStoreKernel(in_data, total_workitems, kernel_loop_count_,
|
||||
ops_thrd, kernel_name().c_str(), false);
|
||||
|
||||
time.push_back(p_timer.ReadTimer(id));
|
||||
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
}
|
||||
|
||||
time.erase(time.begin());
|
||||
mean_ = rocrtst::CalcMean(time);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Hand cleanup over to rocrtst::CommonCleanUp() and fail the test if it
// reports anything other than success.
void SystemStoreBandwidth::Close() {
  hsa_status_t err = rocrtst::CommonCleanUp(this);
  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
}
|
||||
|
||||
void SystemStoreBandwidth::DisplayResults() const {
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "=======================================" << std::endl;
|
||||
std::cout << "System Load Bandwidth: %f(GB/S)"
|
||||
<< data_size_ / mean_ / 1024 / 1024 / 1024 << std::endl;
|
||||
}
|
||||
@@ -1,121 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_INC_SYSTEM_STORE_BANDWIDTH_H__
|
||||
#define __ROCRTST_SRC_INC_SYSTEM_STORE_BANDWIDTH_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include <stdio.h>
|
||||
|
||||
//@Brief: Performance test case reporting system memory store bandwidth
class SystemStoreBandwidth: public rocrtst::BaseRocR, public PerfBase {
 public:
  //@Brief: Constructor
  SystemStoreBandwidth();

  //@Brief: Destructor
  ~SystemStoreBandwidth();

  //@Brief: Set up the testing environment
  virtual void SetUp();

  //@Brief: Run the test case
  virtual void Run();

  //@Brief: Close and clean up the test environment
  virtual void Close();

  //@Brief: Display store bandwidth
  virtual void DisplayResults() const;

  //@Brief: Set work-item configuration
  // When INTERACTIVE is defined the values are read from stdin; an entry
  // that cannot be parsed leaves the corresponding setting unchanged.
  // Otherwise a fixed default configuration is applied.
  void SetWorkItemNum() {
#ifdef INTERACTIVE
    // "%u" requires an unsigned int target, so every value is read into
    // this scratch variable and only assigned when the read succeeded.
    unsigned int value = 0;

    printf("Please input the number of CUs you want to try:\n");
    if (scanf("%u", &value) == 1) {
      num_cus_ = value;
    }

    printf("Please input the number of groups you want to try:\n");
    if (scanf("%u", &value) == 1) {
      num_group_ = value;
    }

    printf("Please input the size of each group:\n");
    if (scanf("%u", &value) == 1) {
      set_group_size(value);
    }

    printf("Please input the number of kernel loop you want to try:\n");
    if (scanf("%u", &value) == 1) {
      kernel_loop_count_ = value;
    }
#else
    num_cus_ = 32;
    num_group_ = 128;
    // The group size is stored through set_group_size() so that
    // group_size() returns it, exactly as in the interactive branch.
    set_group_size(256);
    kernel_loop_count_ = 16;
#endif
    return;
  }

 private:
  //@Brief: number of group
  uint32_t num_group_;

  //@Brief: number of CUs
  uint32_t num_cus_;

  //@Brief: number of kernel loop
  uint32_t kernel_loop_count_;

  //@Brief: Mean execution time
  double mean_;

  //@Brief: data size for test
  uint64_t data_size_;
};
|
||||
|
||||
#endif
|
||||
|
||||
+395
@@ -0,0 +1,395 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
// The purpose of this test is to provide an example of the use of the
|
||||
// common RocrTest classes and utilities that are used in many examples.
|
||||
// It can be used as a template to start off with when writing new tests.
|
||||
// In many cases, the existing boilerplate code will be sufficient as is.
|
||||
// Otherwise, the boilerplate code can be either supplemented or replaced
|
||||
// by your own code in your example, as necessary.
|
||||
//
|
||||
// The comments provided are focused more on the use of the common rocrtst
|
||||
// utilities and boilerplate code, rather than the example app. itself.
|
||||
//
|
||||
// The boilerplate code includes code for:
|
||||
// * hsa initialization and clean up
|
||||
// * code to load pre-built kernels
|
||||
// * creating queues
|
||||
// * populating AQL packets
|
||||
// * checking for required profiles
|
||||
// * finding cpu and gpu agents (callbacks for common use cases)
|
||||
// * finding pools (having common requirements)
|
||||
// * allocating and setting kernel arguments
|
||||
// * somewhat standardized output
|
||||
// * handling additional command line arguments, beyond google-test arguments
|
||||
// * support for various level of verbosity, controlled from command line arg
|
||||
// * support for building OpenCL kernels
|
||||
// * timer support
|
||||
//
|
||||
// Overview of RocrTst code organization:
|
||||
// Classes:
|
||||
// * class BaseRocR (base_rocr.h) -- base class for all rocrtst examples and
|
||||
// tests. Most of the rocrtst common utilities act on BaseRocR objects
|
||||
//
|
||||
// * TestBase (test_base.h) -- derives from BaseRocR and is the base class
// for all tests under <rocrtst root>/suites. The implementations of the
// TestBase methods are typically actions that are required for most/all
// tests and should therefore be called from the derived implementations
// of the methods.
|
||||
//
|
||||
// Utilities:
|
||||
// * <rocrtst root>/common/base_rocr_utils.<cc/h> contains a set of utilities
|
||||
// that act on BaseRocR objects.
|
||||
//
|
||||
// * <rocrtst root>/common/common.<cc/h> contain other non-BaseRocR utilities
|
||||
//
|
||||
// Special Files:
|
||||
// * main.cc -- The main google test file from which the tests are invoked.
|
||||
// There should be an entry for each test to be run there.
|
||||
//
|
||||
// * kernels -- OpenCL kernel source files should go in the kernels directory
|
||||
//
|
||||
// * CMakeLists.txt -- Host code (*.cc and *.h files) should build without
// modifying the CMakeLists.txt file, if the files are placed in the
// "performance" directory. However, an entry must be added for OpenCL
// kernels. For each kernel to be built, the bitcode libraries must be
// indicated before the call to "build_kernel()" is made. See existing
// code for examples.
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "suites/performance/test_case_template.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_finalize.h"
|
||||
|
||||
static const uint32_t kNumBufferElements = 256;
|
||||
|
||||
// Return from the enclosing function with the given status when an HSA
// call did not succeed, after printing the call site and the status text.
//
// The argument is evaluated exactly once. The do/while(0) wrapper makes the
// macro behave like a single statement, so it must be followed by a
// semicolon and can be used safely as the body of an if/else.
#define RET_IF_HSA_ERR(err) do { \
  const auto ret_if_hsa_err_status_ = (err); \
  if (ret_if_hsa_err_status_ != HSA_STATUS_SUCCESS) { \
    const char* msg = 0; \
    hsa_status_string(ret_if_hsa_err_status_, &msg); \
    std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
                          __FILE__ << ". Call returned " << \
                          ret_if_hsa_err_status_ << std::endl; \
    /* msg stays null if the status could not be translated */ \
    std::cout << (msg != 0 ? msg : "(no status string available)") << \
                                                               std::endl; \
    return ret_if_hsa_err_status_; \
  } \
} while (0)
|
||||
|
||||
// Many test cases want to perform an operation on memory sizes of various
|
||||
// granularities.
|
||||
#if 0
|
||||
static const int kNumGranularity = 20;
|
||||
const char* Str[kNumGranularity] = {"1k", "2K", "4K", "8K", "16K", "32K",
|
||||
"64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M",
|
||||
"64M", "128M", "256M", "512M"};
|
||||
|
||||
const size_t Size[kNumGranularity] = {
|
||||
1024, 2*1024, 4*1024, 8*1024, 16*1024, 32*1024, 64*1024, 128*1024,
|
||||
256*1024, 512*1024, 1024*1024, 2048*1024, 4096*1024, 8*1024*1024,
|
||||
16*1024*1024, 32*1024*1024, 64*1024*1024, 128*1024*1024, 256*1024*1024,
|
||||
512*1024*1024};
|
||||
|
||||
static const int kMaxCopySize = Size[kNumGranularity - 1];
|
||||
#endif
|
||||
// Constructor: sets the default iteration count, the title and description
// shown for this test, and the kernel file and kernel name to load.
TestExample::TestExample(void) :
    TestBase() {
  // Start from a known state: Close() frees both buffers, so they must not
  // hold indeterminate values if SetUp() stops before allocating them.
  time_mean_ = 0.0;
  src_buffer_ = nullptr;
  dst_buffer_ = nullptr;

  set_num_iteration(10);  // Number of iterations to execute of the main test;
                          // This is a default value which can be overridden
                          // on the command line.
  set_title("Test Case Example");
  set_description("Put a description of the test case here. Line breaks "
                  "will be taken care of on output, not here.");

  set_kernel_file_name("test_case_template_kernels.hsaco");
  set_kernel_name("square");  // kernel function name

#if 0
  // Set required profile to HSA_PROFILE_FULL or HSA_PROFILE_BASE if it
  // matters for this test. If either profile is fine, then leave with
  // default
  set_requires_profile(<value>);
#endif
}
|
||||
|
||||
// Destructor: the buffers are released in Close(), nothing is done here.
TestExample::~TestExample(void) = default;
|
||||
|
||||
// Any 1-time setup involving member variables used in the rest of the test
|
||||
// should be done here.
|
||||
void TestExample::SetUp(void) {
|
||||
hsa_status_t err;
|
||||
|
||||
// TestBase::SetUp() will set HSA_ENABLE_INTERRUPT if enable_interrupt() is
|
||||
// true, and call hsa_init(). It also prints the SetUp header.
|
||||
TestBase::SetUp();
|
||||
|
||||
// SetDefaultAgents(this) will assign the first CPU and GPU found on
|
||||
// iterating through the agents and assign them to cpu_device_ and
|
||||
// gpu_device1_, respectively (cpu_device() and gpu_device1()). These
|
||||
// BaseRocR member variables are used in some utilities. Additionally,
|
||||
// SetDefaultAgents() checks the profile of the gpu and compares this
|
||||
// to any required profile.
|
||||
//
|
||||
// If SetDefaultAgents() is not used, if the profile of the target GPU
|
||||
// matters for this test, it should be set with set_profile() and
|
||||
// CheckProfileAndInform() should be called to check if it is the
|
||||
// required profile
|
||||
err = rocrtst::SetDefaultAgents(this);
|
||||
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
// Find and assign HSA_AMD_SEGMENT_GLOBAL pools for cpu, gpu and a kern_arg
|
||||
// pool
|
||||
err = rocrtst::SetPoolsTypical(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Create a queue
|
||||
hsa_queue_t* q = nullptr;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
ASSERT_NE(q, nullptr);
|
||||
set_main_queue(q);
|
||||
|
||||
err = rocrtst::LoadKernelFromObjFile(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Fill up the kernel packet (except header) with some values we've
|
||||
// collected so far, and some reasonable default values; this should be after
|
||||
// LoadKernelFromObjFile(). AllocAndSetKernArgs() will fill in the kern_args
|
||||
err = rocrtst::InitializeAQLPacket(this, &aql());
|
||||
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
|
||||
|
||||
hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
|
||||
|
||||
// Allocate a few buffers for our example
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(),
|
||||
kNumBufferElements*sizeof(uint32_t),
|
||||
0, reinterpret_cast<void**>(&src_buffer_));
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_agents_allow_access(2, ag_list, NULL, src_buffer_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Initialize the source buffer
|
||||
for (uint32_t i = 0; i < kNumBufferElements; ++i) {
|
||||
reinterpret_cast<uint32_t *>(src_buffer_)[i] = i;
|
||||
}
|
||||
|
||||
err = hsa_amd_memory_pool_allocate(cpu_pool(),
|
||||
kNumBufferElements*sizeof(uint32_t),
|
||||
0, reinterpret_cast<void**>(&dst_buffer_));
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_agents_allow_access(2, ag_list, NULL, dst_buffer_);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Set up Kernel arguments
|
||||
// See the meta-data for the compiled OpenCL kernel code to ascertain
|
||||
// the sizes, padding and alignment required for kernel arguments.
|
||||
// This can be seen by executing
|
||||
// $ amdgcn-amd-amdhsa-readelf -aw ./binary_search_kernels.hsaco
|
||||
// The kernel code will expect the following arguments aligned as shown.
|
||||
// typedef uint32_t uint4[4];
|
||||
struct __attribute__((aligned(16))) local_args_t {
|
||||
uint32_t* dstArray;
|
||||
uint32_t* srcArray;
|
||||
uint32_t size;
|
||||
uint32_t pad;
|
||||
uint64_t global_offset_x;
|
||||
uint64_t global_offset_y;
|
||||
uint64_t global_offset_z;
|
||||
} local_args;
|
||||
|
||||
local_args.dstArray = reinterpret_cast<uint32_t *>(dst_buffer_);
|
||||
local_args.srcArray = reinterpret_cast<uint32_t *>(src_buffer_);
|
||||
local_args.size = kNumBufferElements;
|
||||
local_args.global_offset_x = 0;
|
||||
local_args.global_offset_y = 0;
|
||||
local_args.global_offset_z = 0;
|
||||
|
||||
err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// This wrapper atomically writes the provided header and setup to the
|
||||
// provided AQL packet. The provided AQL packet address should be in the
|
||||
// queue memory space.
|
||||
static inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
|
||||
hsa_kernel_dispatch_packet_t* queue_packet) {
|
||||
__atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
|
||||
header | (setup << 16), __ATOMIC_RELEASE);
|
||||
}
|
||||
|
||||
// Do a few extra iterations as we toss out some of the inital and final
|
||||
// iterations when calculating statistics
|
||||
uint32_t TestExample::RealIterationNum(void) {
|
||||
return num_iteration() * 1.2 + 1;
|
||||
}
|
||||
|
||||
// Check that every element of the result array holds the square of its
// own index.
//
// @param ar array to check; must hold at least sz elements
// @param sz number of elements to check
// @return true if ar[i] == i*i for every i in [0, sz), false otherwise
static bool VerifyResult(uint32_t *ar, size_t sz) {
  // Start at 0: starting at sz would skip the loop entirely and report
  // success without looking at a single element.
  for (size_t i = 0; i < sz; ++i) {
    if (i*i != ar[i]) {
      return false;
    }
  }
  return true;
}
|
||||
void TestExample::Run(void) {
|
||||
// Compare required profile for this test case with what we're actually
|
||||
// running on
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
TestBase::Run();
|
||||
|
||||
// Override whatever we need to...
|
||||
aql().workgroup_size_x = kNumBufferElements;
|
||||
aql().grid_size_x = kNumBufferElements;
|
||||
|
||||
std::vector<double> timer;
|
||||
|
||||
int it = RealIterationNum();
|
||||
hsa_kernel_dispatch_packet_t *queue_aql_packet;
|
||||
|
||||
rocrtst::PerfTimer p_timer;
|
||||
uint64_t index;
|
||||
|
||||
for (int i = 0; i < it; i++) {
|
||||
// This function simply copies the data we've collected so far into our
|
||||
// local AQL packet, except the the setup and header fields.
|
||||
queue_aql_packet = WriteAQLToQueue(this, &index);
|
||||
ASSERT_EQ(queue_aql_packet,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t *>
|
||||
(main_queue()->base_address) + index);
|
||||
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
|
||||
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
|
||||
HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
||||
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
|
||||
HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
||||
|
||||
// Create and start a timer for this iteration
|
||||
int id = p_timer.CreateTimer();
|
||||
p_timer.StartTimer(id);
|
||||
|
||||
AtomicSetPacketHeader(aql_header, aql().setup, queue_aql_packet);
|
||||
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(aql().completion_signal,
|
||||
HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
|
||||
}
|
||||
|
||||
// Stop the timer
|
||||
p_timer.StopTimer(id);
|
||||
|
||||
// Store time for later analysis
|
||||
timer.push_back(p_timer.ReadTimer(id));
|
||||
hsa_signal_store_screlease(aql().completion_signal, 1);
|
||||
|
||||
ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t *>(dst_buffer_),
|
||||
kNumBufferElements));
|
||||
|
||||
// Pay attention to verbosity level for things like progress output
|
||||
if (verbosity() >= VERBOSE_PROGRESS) {
|
||||
std::cout << ".";
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
if (verbosity() >= VERBOSE_PROGRESS) {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Abandon the first result and after sort, delete the last 2% value
|
||||
timer.erase(timer.begin());
|
||||
std::sort(timer.begin(), timer.end());
|
||||
timer.erase(timer.begin() + num_iteration(), timer.end());
|
||||
|
||||
time_mean_ = rocrtst::CalcMean(timer);
|
||||
}
|
||||
|
||||
void TestExample::DisplayTestInfo(void) {
|
||||
TestBase::DisplayTestInfo();
|
||||
}
|
||||
|
||||
void TestExample::DisplayResults(void) const {
|
||||
// Compare required profile for this test case with what we're actually
|
||||
// running on
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
TestBase::DisplayResults();
|
||||
std::cout << "The average time was: " << time_mean_ * 1e6 <<
|
||||
" uS" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// Free the source and destination buffers, then let TestBase finish the
// cleanup. A failed free stops the sequence at that point.
void TestExample::Close() {
  hsa_status_t err = hsa_amd_memory_pool_free(src_buffer_);
  ASSERT_EQ(HSA_STATUS_SUCCESS, err);

  err = hsa_amd_memory_pool_free(dst_buffer_);
  ASSERT_EQ(HSA_STATUS_SUCCESS, err);

  // This will close handles opened within rocrtst utility calls and call
  // hsa_shut_down(), so it should be done after other hsa cleanup
  TestBase::Close();
}
|
||||
|
||||
|
||||
#undef RET_IF_HSA_ERR
|
||||
+20
-19
@@ -43,40 +43,41 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__
|
||||
#define __ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__
|
||||
#ifndef ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_
|
||||
#define ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_
|
||||
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "suites/test_common/test_base.h"
|
||||
|
||||
class ImageLoadBandwidth: public rocrtst::BaseRocR, public PerfBase {
|
||||
class TestExample : public TestBase {
|
||||
public:
|
||||
//@Brief: Constructor
|
||||
ImageLoadBandwidth();
|
||||
TestExample();
|
||||
|
||||
//@Brief: Destructor
|
||||
~ImageLoadBandwidth();
|
||||
// @Brief: Destructor for test case of TestExample
|
||||
virtual ~TestExample();
|
||||
|
||||
//@Brief: Set up the test environment
|
||||
// @Brief: Setup the environment for measurement
|
||||
virtual void SetUp();
|
||||
|
||||
//@Brief: Run the actual testing
|
||||
// @Brief: Core measurement execution
|
||||
virtual void Run();
|
||||
|
||||
//@Brief: Clean up the test environment
|
||||
// @Brief: Clean up and retrieve the resource
|
||||
virtual void Close();
|
||||
|
||||
//@Brief: Display results
|
||||
// @Brief: Display results
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
private:
|
||||
//@Brief: Image Load Bandwidth
|
||||
double load_bandwidth_;
|
||||
// @Brief: Display information about what this test does
|
||||
virtual void DisplayTestInfo(void);
|
||||
|
||||
//@Brief: Image size
|
||||
size_t image_size_;
|
||||
private:
|
||||
uint32_t RealIterationNum(void);
|
||||
|
||||
double time_mean_;
|
||||
void *src_buffer_;
|
||||
void *dst_buffer_;
|
||||
};
|
||||
|
||||
#endif //__ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__
|
||||
|
||||
#endif // ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_
|
||||
@@ -1,279 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "vector_copy.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
// Size, in bytes, of each vector buffer handled by the copy test.
static constexpr size_t BUFFER_SIZE = 4 * 1024 * 1024;

// Host buffer that receives the kernel output; allocated in Run() and
// released in Close().
static char* gCPUOutput = nullptr;

// Write index of the main queue, sampled in SetUp() and used in Run() when
// ringing the doorbell.
static uint64_t gQueueIndex = 0;
|
||||
|
||||
//Constructor
|
||||
VectorCopy::VectorCopy() :
|
||||
BaseRocR() {
|
||||
set_kernel_name("&__vector_copy_kernel");
|
||||
kernarg_address = NULL;
|
||||
}
|
||||
|
||||
//Destructor
|
||||
VectorCopy::~VectorCopy() {
|
||||
}
|
||||
|
||||
// Find coarse grained system memory.
|
||||
static hsa_status_t get_sys_coarse_grained_memory_pool(
|
||||
hsa_amd_memory_pool_t pool, void* data) {
|
||||
hsa_amd_segment_t segment;
|
||||
hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
|
||||
&segment);
|
||||
|
||||
if (HSA_AMD_SEGMENT_GLOBAL != segment) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_amd_memory_pool_global_flag_t flags;
|
||||
hsa_status_t err = hsa_amd_memory_pool_get_info(pool,
|
||||
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
|
||||
|
||||
if (HSA_STATUS_SUCCESS == err
|
||||
&& (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) {
|
||||
hsa_amd_memory_pool_t* ret = (hsa_amd_memory_pool_t*) data;
|
||||
*ret = pool;
|
||||
return HSA_STATUS_INFO_BREAK;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
// Find out dGPU's local memory pool.
|
||||
static hsa_status_t get_local_memory_pool(hsa_amd_memory_pool_t pool,
|
||||
void* data) {
|
||||
// With memory pool API, each agent will only report it is own memory pools.
|
||||
// So, a coarse grained memory pool in global segment is what we want.
|
||||
hsa_amd_segment_t segment;
|
||||
|
||||
hsa_status_t err = hsa_amd_memory_pool_get_info(pool,
|
||||
HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
|
||||
|
||||
if (HSA_STATUS_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
if (HSA_AMD_SEGMENT_GLOBAL != segment) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_amd_memory_pool_global_flag_t flags;
|
||||
err = hsa_amd_memory_pool_get_info(pool,
|
||||
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
|
||||
|
||||
if (HSA_STATUS_SUCCESS == err
|
||||
&& (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) {
|
||||
hsa_amd_memory_pool_t* ret = (hsa_amd_memory_pool_t*) data;
|
||||
*ret = pool;
|
||||
return HSA_STATUS_INFO_BREAK;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void VectorCopy::SetUp() {
|
||||
hsa_status_t err;
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
//Create a queue with max number size
|
||||
hsa_queue_t* q;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
set_main_queue(q);
|
||||
|
||||
rocrtst::LoadKernelFromObjFile(this);
|
||||
|
||||
// Obtain the current queue write index.
|
||||
gQueueIndex = hsa_queue_load_write_index_scacquire(main_queue());
|
||||
|
||||
rocrtst::InitializeAQLPacket(this, &aql());
|
||||
uint16_t header = 0;
|
||||
header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
||||
|
||||
aql().grid_size_x = (uint32_t)(1024 * 1024);
|
||||
aql().kernarg_address = (void*) kernarg_address;
|
||||
|
||||
// Find system memory pool for kernarg allocation.
|
||||
// hsa_amd_memory_pool_t sys_coarse_grained_pool;
|
||||
err = hsa_amd_agent_iterate_memory_pools(cpus[0],
|
||||
get_sys_coarse_grained_memory_pool, &sys_coarse_grained_pool_);
|
||||
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
|
||||
// Get local memory pool of the first GPU.
|
||||
// hsa_amd_memory_pool_t gpu_pool_;
|
||||
err = hsa_amd_agent_iterate_memory_pools(gpus[0], get_local_memory_pool,
|
||||
&gpu_pool_);
|
||||
ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void VectorCopy::Run() {
|
||||
hsa_status_t err;
|
||||
void* in;
|
||||
void* out;
|
||||
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocate vector on the first GPU local memory as input.
|
||||
err = hsa_amd_memory_pool_allocate(gpu_pool_, BUFFER_SIZE, 0, &in);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
std::cout << "Allocating " << BUFFER_SIZE <<
|
||||
" Bytes of local memory on the first GPU, address = " <<
|
||||
in << std::endl;
|
||||
|
||||
// rocrtst::CommonCleanUp input buffer on the first GPU to 1 for each byte.
|
||||
err = hsa_amd_memory_fill(in, 0x01010101, BUFFER_SIZE / 4);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Allocate vector on the first GPU local memory as output
|
||||
err = hsa_amd_memory_pool_allocate(gpu_pool_, BUFFER_SIZE, 0, &out);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
std::cout << "Allocating " << BUFFER_SIZE <<
|
||||
" Bytes of local memory on the second GPU, address = " <<
|
||||
out << std::endl;
|
||||
|
||||
// rocrtst::CommonCleanUp output buffer on the first GPU to 0.
|
||||
err = hsa_amd_memory_fill(out, 0x00000000, BUFFER_SIZE / 4);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
typedef struct args_t {
|
||||
void* in;
|
||||
void* out;
|
||||
} args;
|
||||
|
||||
args* kargs;
|
||||
|
||||
kargs->in = in;
|
||||
kargs->out = out;
|
||||
|
||||
// Allocate the kernel argument buffer from the system memory pool.
|
||||
err = hsa_amd_memory_pool_allocate(sys_coarse_grained_pool_, kernarg_size(),
|
||||
0, &kernarg_address);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
memcpy(kernarg_address, &kargs, sizeof(args));
|
||||
|
||||
// Map kernarg space to the first GPU
|
||||
err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, kernarg_address);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
/*
|
||||
* Increment the write index and ring the doorbell to dispatch the kernel.
|
||||
*/
|
||||
hsa_queue_store_write_index_screlease(main_queue(), gQueueIndex + 1);
|
||||
hsa_signal_store_relaxed(main_queue()->doorbell_signal, gQueueIndex);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Wait on the dispatch completion signal until the kernel is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0,
|
||||
UINT64_MAX, HSA_WAIT_STATE_BLOCKED))
|
||||
;
|
||||
|
||||
// Reset signal value for future usage to copy output.
|
||||
hsa_signal_store_screlease(signal(), 1);
|
||||
|
||||
// Allocate vector on the system memory pool.
|
||||
err = hsa_amd_memory_pool_allocate(sys_coarse_grained_pool_, BUFFER_SIZE, 0,
|
||||
(void**) &gCPUOutput);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Allow the first GPU to access the output
|
||||
err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, gCPUOutput);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
//Copy the output from GPU to the CPU buffer for validation
|
||||
err = hsa_amd_memory_async_copy(gCPUOutput, cpus[0], out, gpus[0],
|
||||
BUFFER_SIZE, 0, NULL, signal());
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Wait on the completion signal until the async copy is finished.
|
||||
while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0,
|
||||
UINT64_MAX, HSA_WAIT_STATE_BLOCKED))
|
||||
;
|
||||
|
||||
for (uint32_t i = 0; i < BUFFER_SIZE; i++) {
|
||||
ASSERT_EQ(gCPUOutput[i], 1);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void VectorCopy::Close() {
|
||||
hsa_status_t err;
|
||||
// Cleanup all allocated resources.
|
||||
err = hsa_amd_memory_pool_free(kernarg_address);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_signal_destroy(signal());
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_queue_destroy(main_queue());
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = hsa_amd_memory_pool_free(gCPUOutput);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
return;
|
||||
}
|
||||
|
||||
void VectorCopy::DisplayResults() const {
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -1,109 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_VECTOR_COPY_H__
|
||||
#define __ROCRTST_SRC_VECTOR_COPY_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "common/common.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include "hsa/hsa_ext_finalize.h"
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
//@Brief: This class is defined to measure the mean latency of launching
|
||||
//an empty kernel
|
||||
|
||||
class VectorCopy: public rocrtst::BaseRocR, public PerfBase {
|
||||
public:
|
||||
//@Brief: Constructor
|
||||
VectorCopy();
|
||||
|
||||
//@Brief: Destructor
|
||||
virtual ~VectorCopy();
|
||||
|
||||
//@Brief: Set up the environment for the test
|
||||
virtual void SetUp();
|
||||
|
||||
//@Brief: Run the test case
|
||||
virtual void Run();
|
||||
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
//@Brief: Clean up and close the runtime
|
||||
virtual void Close();
|
||||
|
||||
private:
|
||||
|
||||
//@Brief: Store the size of queue
|
||||
uint32_t queue_size_;
|
||||
|
||||
//@Brief: kernarg_address;
|
||||
void* kernarg_address;
|
||||
|
||||
//@Brief: The mean time of CP Processing
|
||||
double mean_;
|
||||
|
||||
//@Brief: The group memory region
|
||||
hsa_region_t group_region_;
|
||||
|
||||
hsa_amd_memory_pool_t gpu_pool_;
|
||||
hsa_amd_memory_pool_t sys_coarse_grained_pool_;
|
||||
|
||||
std::vector<hsa_agent_t> cpus;
|
||||
std::vector<hsa_agent_t> gpus;
|
||||
|
||||
//@Brief: Pointer to cu_id array
|
||||
uint32_t* cu_;
|
||||
|
||||
uint32_t manual_input;
|
||||
uint32_t group_input;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,106 +0,0 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_VECTOR_COPY_P2P_H__
|
||||
#define __ROCRTST_SRC_VECTOR_COPY_P2P_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "common/common.h"
|
||||
#include "common/hsatimer.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include "hsa/hsa_ext_finalize.h"
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
//@Brief: This class is defined to measure the mean latency of launching
|
||||
//an empty kernel
|
||||
|
||||
class VectorCopyP2P: public rocrtst::BaseRocR, public PerfBase {
|
||||
public:
|
||||
//@Brief: Constructor
|
||||
VectorCopyP2P();
|
||||
|
||||
//@Brief: Destructor
|
||||
virtual ~VectorCopyP2P();
|
||||
|
||||
//@Brief: Set up the environment for the test
|
||||
virtual void SetUp();
|
||||
|
||||
//@Brief: Run the test case
|
||||
virtual void Run();
|
||||
|
||||
//@Brief: Display results we got
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
//@Brief: Clean up and close the runtime
|
||||
virtual void Close();
|
||||
|
||||
private:
|
||||
//@Brief: Get actual iteration number
|
||||
virtual size_t RealIterationNum();
|
||||
|
||||
//@Brief: Create Queue
|
||||
virtual void CreateQueue();
|
||||
|
||||
//@Brief: Store the size of queue
|
||||
uint32_t queue_size_;
|
||||
|
||||
//@Brief: The mean time of CP Processing
|
||||
double mean_;
|
||||
|
||||
//@Brief: The group memory region
|
||||
hsa_region_t group_region_;
|
||||
|
||||
//@Brief: Pointer to cu_id array
|
||||
uint32_t* cu_;
|
||||
|
||||
uint32_t manual_input;
|
||||
uint32_t group_input;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
Исполняемый файл
+141
@@ -0,0 +1,141 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "suites/test_common/test_base.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
// Width, in characters, that set_description() wraps text against.
static constexpr int kOutputLineLength = 80;

// Marker printed on both sides of every section label.
static constexpr char kLabelDelimiter[] = "####";

// Section labels printed by the TestBase display and phase methods.
static constexpr char kDescriptionLabel[] = "TEST DESCRIPTION";
static constexpr char kTitleLabel[] = "TEST NAME";
static constexpr char kSetupLabel[] = "TEST SETUP";
static constexpr char kRunLabel[] = "TEST EXECUTION";
static constexpr char kCloseLabel[] = "TEST CLEAN UP";
static constexpr char kResultsLabel[] = "TEST RESULTS";
|
||||
|
||||
|
||||
// Constructor: starts every test with an empty description.
TestBase::TestBase() {
  set_description(std::string());
}
|
||||
// Destructor: performs no work of its own.
TestBase::~TestBase() = default;
|
||||
|
||||
// Builds a section header of the form "<delimiter> <inStr> <delimiter>".
//
// @param inStr   Label text; must not be null.
// @param outStr  Receives the header, replacing any previous content; must
//                not be null.
static void MakeHeaderStr(const char *inStr, std::string *outStr) {
  assert(outStr != nullptr);
  assert(inStr != nullptr);

  outStr->assign(kLabelDelimiter);
  outStr->append(" ").append(inStr).append(" ").append(kLabelDelimiter);
}
|
||||
|
||||
void TestBase::SetUp(void) {
|
||||
hsa_status_t err;
|
||||
std::string label;
|
||||
MakeHeaderStr(kSetupLabel, &label);
|
||||
printf("\n\t%s\n", label.c_str());
|
||||
|
||||
err = rocrtst::InitAndSetupHSA(this);
|
||||
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Prints the "TEST EXECUTION" banner; the base class does no other work.
void TestBase::Run(void) {
  std::string banner;
  MakeHeaderStr(kRunLabel, &banner);
  printf("\n\t%s\n", banner.c_str());
}
|
||||
|
||||
void TestBase::Close(void) {
|
||||
hsa_status_t err;
|
||||
std::string label;
|
||||
MakeHeaderStr(kCloseLabel, &label);
|
||||
printf("\n\t%s\n", label.c_str());
|
||||
|
||||
err = rocrtst::CommonCleanUp(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
// Prints the "TEST RESULTS" banner; the base class prints nothing else.
void TestBase::DisplayResults(void) const {
  std::string banner;
  MakeHeaderStr(kResultsLabel, &banner);
  printf("\n\t%s\n", banner.c_str());
}
|
||||
|
||||
void TestBase::DisplayTestInfo(void) {
|
||||
printf("#########################################"
|
||||
"######################################\n");
|
||||
|
||||
std::string label;
|
||||
MakeHeaderStr(kTitleLabel, &label);
|
||||
printf("\n\t%s\n%s\n", label.c_str(), title().c_str());
|
||||
|
||||
if (verbosity() >= VERBOSE_STANDARD) {
|
||||
MakeHeaderStr(kDescriptionLabel, &label);
|
||||
printf("\n\t%s\n%s\n", label.c_str(), description().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void TestBase::set_description(std::string d) {
|
||||
int le = kOutputLineLength - 4;
|
||||
|
||||
description_ = d;
|
||||
size_t endlptr;
|
||||
|
||||
for (size_t i = le; i < description_.size(); i += le) {
|
||||
endlptr = description_.find_last_of(" ", i);
|
||||
description_.replace(endlptr, 1, "\n");
|
||||
i = endlptr;
|
||||
}
|
||||
}
|
||||
|
||||
+27
-36
@@ -42,52 +42,43 @@
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
#ifndef ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_
|
||||
#define ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_
|
||||
|
||||
#ifndef __ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__
|
||||
#define __ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include <string>
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include <vector>
|
||||
|
||||
class QueueConcurrency: public rocrtst::BaseRocR, public PerfBase {
|
||||
class TestBase : public rocrtst::BaseRocR {
|
||||
public:
|
||||
//@Brief: Constructor
|
||||
QueueConcurrency();
|
||||
|
||||
//@Brief: Destructor
|
||||
~QueueConcurrency();
|
||||
TestBase(void);
|
||||
|
||||
//@Brief: Set up the test environment
|
||||
void SetUp();
|
||||
virtual ~TestBase(void);
|
||||
|
||||
//@Brief: Run the test
|
||||
void Run();
|
||||
enum VerboseLevel {VERBOSE_MIN = 0, VERBOSE_STANDARD, VERBOSE_PROGRESS};
|
||||
|
||||
//@Brief: Clean up and close
|
||||
void Close();
|
||||
// @Brief: Before run the core measure codes, do something to set up
|
||||
// i.e. init runtime, prepare packet...
|
||||
virtual void SetUp(void);
|
||||
|
||||
void DisplayResults() const;
|
||||
// @Brief: Core measurement codes executing here
|
||||
virtual void Run(void);
|
||||
|
||||
// @Brief: Do something clean up
|
||||
virtual void Close(void);
|
||||
|
||||
// @Brief: Display the results
|
||||
virtual void DisplayResults(void) const;
|
||||
|
||||
// @Brief: Display information about the test
|
||||
virtual void DisplayTestInfo(void);
|
||||
|
||||
const std::string & description(void) const {return description_;}
|
||||
|
||||
void set_description(std::string d);
|
||||
|
||||
private:
|
||||
|
||||
//@Brief: Thread function
|
||||
void ThreadFunc(int i);
|
||||
|
||||
//@Brief: Calculate the concurrent queue number
|
||||
void CalculateQueueNum();
|
||||
|
||||
//@Brief: Vector to store execution time
|
||||
std::vector<double> execution_time_;
|
||||
|
||||
//@Brief: Number of concurrent queues
|
||||
size_t queue_num_;
|
||||
|
||||
//@Brief: Store the standard execution time
|
||||
double std_time_;
|
||||
|
||||
std::string description_;
|
||||
};
|
||||
|
||||
#endif //__ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__
|
||||
|
||||
#endif // ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_
|
||||
+67
-65
@@ -43,77 +43,79 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __ROCRTST_SRC_INC_DEVICE_LOAD_BANDWIDTH_H__
|
||||
#define __ROCRTST_SRC_INC_DEVICE_LOAD_BANDWIDTH_H__
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <iostream>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "perf_common/perf_base.h"
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include <stdio.h>
|
||||
#include "suites/test_common/test_common.h"
|
||||
|
||||
class DeviceLoadBandwidth: public rocrtst::BaseRocR, public PerfBase {
|
||||
public:
|
||||
//@Brief: Constructor
|
||||
DeviceLoadBandwidth();
|
||||
RocrtstOptions::RocrtstOptions(uint32_t *verb, uint32_t *iter) {
|
||||
assert(verb != nullptr);
|
||||
assert(iter != nullptr);
|
||||
|
||||
//@Brief: Destructor
|
||||
~DeviceLoadBandwidth();
|
||||
verbosity_ = verb;
|
||||
iterations_ = iter;
|
||||
}
|
||||
|
||||
//@Brief: Set up the testing environment
|
||||
virtual void SetUp();
|
||||
RocrtstOptions::~RocrtstOptions() {
|
||||
}
|
||||
|
||||
//@Brief: Run the test case
|
||||
virtual void Run();
|
||||
static const struct option long_options[] = {
|
||||
{"iterations", required_argument, nullptr, 'i'},
|
||||
{"verbose", no_argument, nullptr, 'v'},
|
||||
|
||||
//@Brief: Close and clean up the test environment
|
||||
virtual void Close();
|
||||
|
||||
//@Brief: Display load bandwidth
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
//@Brief: Set work-item configuration
|
||||
void SetWorkItemNum() {
|
||||
#ifdef INTERACTIVE
|
||||
uint32_t tmp;
|
||||
printf("Please input the number of CUs you want to try:\n");
|
||||
scanf("%d", &num_cus_);
|
||||
|
||||
printf("Please input the number of groups you want to try:\n");
|
||||
scanf("%d", &num_group_);
|
||||
|
||||
printf("Please input the size of each group:\n");
|
||||
scanf("%d", &tmp);
|
||||
set_group_size(tmp);
|
||||
|
||||
printf("Please input the number of kernel loop you want to try:\n");
|
||||
scanf("%d", &kernel_loop_count_);
|
||||
#else
|
||||
num_cus_ = 16;
|
||||
num_group_ = 128;
|
||||
set_group_size(64);
|
||||
kernel_loop_count_ = 16;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
private:
|
||||
//@Brief: number of group
|
||||
uint32_t num_group_;
|
||||
|
||||
//@Brief: number of CUs
|
||||
uint32_t num_cus_;
|
||||
|
||||
//@Brief: number of kernel loop
|
||||
uint32_t kernel_loop_count_;
|
||||
|
||||
//@Brief: Mean execution time
|
||||
double mean_;
|
||||
|
||||
//@Brief: data size for test
|
||||
uint64_t data_size_;
|
||||
uint32_t* in_data_;
|
||||
uint32_t* out_data_;
|
||||
{nullptr, 0, nullptr, 0}
|
||||
};
|
||||
static const char* short_options = "i:v:r";
|
||||
|
||||
#endif
|
||||
// Writes the rocrtst command-line usage text to standard output.
//
// NOTE(review): the text advertises "--verbosity" and "--rocrtst_help", while
// the long option table in this file lists "verbose" and has no entry for
// the help option - confirm which spelling is intended.
static void PrintHelp(void) {
  static const char kUsage[] =
      // "Required Arguments:\n"
      // "--kernel, -k <path to kernel obj. file>\n"
      "Optional RocRTst Arguments:\n"
      "--iterations, -i <number of iterations to execute>; override default, "
      "which varies for each test\n"
      "--rocrtst_help, -r print this help message\n"
      "--verbosity, -v <verbosity level>\n"
      " Verbosity levels:\n"
      " 0 -- minimal; just summary information\n"
      " 1 -- intermediate; show intermediate values such as intermediate "
      "perf. data\n"
      " 2 -- progress; show progress displays\n"
      " >= 3 -- more debug output\n";

  std::cout << kUsage;
}
|
||||
|
||||
// Parses a non-negative decimal integer that fits in 32 bits.
// Returns false, leaving *value untouched, when `text` is null, empty,
// contains a non-digit character or is too large.
static bool ParseUint32Arg(const char* text, uint32_t* value) {
  if (text == nullptr || *text == '\0') {
    return false;
  }

  uint64_t acc = 0;
  for (const char* p = text; *p != '\0'; ++p) {
    if (*p < '0' || *p > '9') {
      return false;
    }
    acc = acc * 10 + static_cast<uint64_t>(*p - '0');
    if (acc > 0xFFFFFFFFull) {
      return false;
    }
  }

  *value = static_cast<uint32_t>(acc);
  return true;
}

// Parses the rocrtst command-line options and stores them through the
// pointers held by `test`.
//
// @param test      Receives the parsed values; must not be null.
// @param arg_cnt   Number of entries in arg_list.
// @param arg_list  Argument vector, as passed to main().
// @return 0 on success; 1 when help was requested or an option was unknown
//         or malformed (the help text is printed in those cases).
uint32_t ProcessCmdline(RocrtstOptions* test, int arg_cnt, char** arg_list) {
  int a;
  int ind = -1;

  assert(test != nullptr);

  while (true) {
    a = getopt_long(arg_cnt, arg_list, short_options, long_options, &ind);

    if (a == -1) {
      break;
    }

    switch (a) {
      case 'i':
        if (!ParseUint32Arg(optarg, test->iterations_)) {
          PrintHelp();
          return 1;
        }
        break;

      case 'v':
        // optarg is null when the option arrives as the long form declared
        // with no_argument; the previous std::stoi(optarg) call was
        // undefined behavior in that case.
        if (!ParseUint32Arg(optarg, test->verbosity_)) {
          PrintHelp();
          return 1;
        }
        break;

      case 'r':
      default:
        PrintHelp();
        return 1;
    }
  }
  return 0;
}
|
||||
+10
-15
@@ -43,24 +43,19 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef ROCRTST_SUITES_TEST_COMMON_TEST_COMMON_H_
|
||||
#define ROCRTST_SUITES_TEST_COMMON_TEST_COMMON_H_
|
||||
|
||||
#ifndef ROCRTST_SUITES_PERFORMANCE_PERF_COMMON_PERF_BASE_H_
|
||||
#define ROCRTST_SUITES_PERFORMANCE_PERF_COMMON_PERF_BASE_H_
|
||||
|
||||
class PerfBase {
|
||||
class RocrtstOptions {
|
||||
public:
|
||||
// @Brief: Before run the core measure codes, do something to set up
|
||||
// i.e. init runtime, prepare packet...
|
||||
virtual void SetUp(void) = 0;
|
||||
RocrtstOptions(uint32_t *verb, uint32_t *iter);
|
||||
|
||||
// @Brief: Core measurement codes executing here
|
||||
virtual void Run(void) = 0;
|
||||
~RocrtstOptions(void);
|
||||
|
||||
// @Brief: Do something clean up
|
||||
virtual void Close(void) = 0;
|
||||
|
||||
// @Brief: Display the results
|
||||
virtual void DisplayResults(void) const = 0;
|
||||
uint32_t *verbosity_;
|
||||
uint32_t *iterations_;
|
||||
};
|
||||
|
||||
#endif // ROCRTST_SUITES_PERFORMANCE_PERF_COMMON_PERF_BASE_H_
|
||||
uint32_t ProcessCmdline(RocrtstOptions* test, int arg_cnt, char** arg_list);
|
||||
|
||||
#endif // ROCRTST_SUITES_TEST_COMMON_TEST_COMMON_H_
|
||||
Ссылка в новой задаче
Block a user