From e761eda5fa919ffff613d36c15480d7a5a432510 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 27 Apr 2018 20:00:20 -0500 Subject: [PATCH] memory allocation refactoring Change-Id: Ic63b4f5ea44f2dc5e009e3e58652a661e957b7d6 [ROCm/rocprofiler commit: c9c0ecc976688cfb76450cc78d489579816410dc] --- projects/rocprofiler/inc/rocprofiler.h | 2 +- projects/rocprofiler/src/core/context.h | 30 +++---- projects/rocprofiler/src/core/profile.h | 4 +- .../rocprofiler/src/util/hsa_rsrc_factory.cpp | 6 +- .../rocprofiler/src/util/hsa_rsrc_factory.h | 3 +- projects/rocprofiler/test/ctrl/run_kernel.h | 17 ++-- projects/rocprofiler/test/ctrl/test_hsa.cpp | 62 ++++++++++---- projects/rocprofiler/test/ctrl/test_kernel.h | 83 ++++++++++++------- projects/rocprofiler/test/ctrl/tool.cpp | 16 ++-- .../simple_convolution/simple_convolution.cl | 2 +- .../simple_convolution/simple_convolution.cpp | 37 +++++---- .../simple_convolution/simple_convolution.h | 5 +- .../test/util/hsa_rsrc_factory.cpp | 54 ++++-------- .../rocprofiler/test/util/hsa_rsrc_factory.h | 48 +++-------- 14 files changed, 194 insertions(+), 175 deletions(-) diff --git a/projects/rocprofiler/inc/rocprofiler.h b/projects/rocprofiler/inc/rocprofiler.h index 07dc842d90..f9f0f371d0 100644 --- a/projects/rocprofiler/inc/rocprofiler.h +++ b/projects/rocprofiler/inc/rocprofiler.h @@ -166,9 +166,9 @@ typedef void rocprofiler_t; // Profiling group object typedef struct { unsigned index; // group index + rocprofiler_t* context; // context object rocprofiler_feature_t** features; // profiling info array uint32_t feature_count; // profiling info count - rocprofiler_t* context; // context object } rocprofiler_group_t; // Profiling mode mask diff --git a/projects/rocprofiler/src/core/context.h b/projects/rocprofiler/src/core/context.h index 60ebed61b1..63833435bc 100644 --- a/projects/rocprofiler/src/core/context.h +++ b/projects/rocprofiler/src/core/context.h @@ -121,9 +121,6 @@ class Group { Context* GetContext() { return context_; } uint32_t GetIndex() const { return index_; } - rocprofiler_group_t GetGroup() { - return rocprofiler_group_t{index_, &info_vector_[0], (uint32_t)info_vector_.size(), context_}; - } void ResetRefs() { refs_ = n_profiles_; } uint32_t DecrRefs() { return (refs_ > 0) ? --refs_ : 0; @@ -279,15 +276,18 @@ class Context { uint32_t GetGroupCount() const { return set_.size(); } - rocprofiler_group_t GetGroupInfo(const uint32_t& index) { - rocprofiler::info_vector_t& info_vector = set_[index].GetInfoVector(); + rocprofiler_group_t GetGroupInfo(Group* g) { + rocprofiler::info_vector_t& info_vector = g->GetInfoVector(); rocprofiler_group_t group = {}; - group.feature_count = info_vector.size(); - group.features = &info_vector[0]; + group.index = g->GetIndex(); group.context = reinterpret_cast(this); - group.index = index; + group.features = &info_vector[0]; + group.feature_count = info_vector.size(); return group; } + rocprofiler_group_t GetGroupInfo(const uint32_t& index) { + return GetGroupInfo(&set_[index]); + } const pkt_vector_t& StartPackets(const uint32_t& group_index) const { return set_[group_index].GetStartVector(); @@ -391,11 +391,12 @@ class Context { static bool Handler(hsa_signal_value_t value, void* arg) { Group* group = reinterpret_cast(arg); - group->GetContext()->mutex_.lock(); + Context* context = group->GetContext(); + context->mutex_.lock(); uint32_t r = group->DecrRefs(); - group->GetContext()->mutex_.unlock(); + context->mutex_.unlock(); if (r == 0) { - return group->GetContext()->handler_(group->GetGroup(), group->GetContext()->handler_arg_); + return context->handler_(context->GetGroupInfo(group), context->handler_arg_); } return false; } @@ -427,8 +428,9 @@ class Context { if (rinfo->data.result_bytes.copy) { if (sample_id == 0) { const uint32_t output_buffer_size = profile->output_buffer.size; - const uint32_t output_buffer_size64 = output_buffer_size / sizeof(uint64_t); - void* ptr = calloc(output_buffer_size64, sizeof(uint64_t)); + util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance(); + const util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(profile->agent); + void* ptr = hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size); rinfo->data.result_bytes.size = output_buffer_size; rinfo->data.result_bytes.ptr = ptr; callback_data->ptr = reinterpret_cast(ptr); @@ -446,7 +448,7 @@ class Context { else EXC_RAISING(HSA_STATUS_ERROR, "SQTT data out of output buffer"); } - const bool suc = util::HsaRsrcFactory::CopyToHost(dest, src, size); + bool suc = util::HsaRsrcFactory::Memcpy(profile->agent, dest, src, size); if (suc) { *header = size; callback_data->ptr = dest + align_size(size, sizeof(uint32_t)); diff --git a/projects/rocprofiler/src/core/profile.h b/projects/rocprofiler/src/core/profile.h index 79c68e6ef6..01a89ad4d4 100644 --- a/projects/rocprofiler/src/core/profile.h +++ b/projects/rocprofiler/src/core/profile.h @@ -88,8 +88,8 @@ class Profile { } virtual ~Profile() { info_vector_.clear(); - if (profile_.command_buffer.ptr) util::HsaRsrcFactory::MemoryFree(profile_.command_buffer.ptr); - if (profile_.output_buffer.ptr) util::HsaRsrcFactory::MemoryFree(profile_.output_buffer.ptr); + if (profile_.command_buffer.ptr) util::HsaRsrcFactory::FreeMemory(profile_.command_buffer.ptr); + if (profile_.output_buffer.ptr) util::HsaRsrcFactory::FreeMemory(profile_.output_buffer.ptr); if (profile_.events) free(const_cast(profile_.events)); if (profile_.parameters) free(const_cast(profile_.parameters)); if (completion_signal_.handle) { diff --git a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp index e77580f65c..fd3b30e300 100644 --- a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp +++ b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp @@ -359,9 +359,13 @@ bool HsaRsrcFactory::CopyToHost(void* dest_buff, const void* src_buff, uint32_t CHECK_STATUS("hsa_memory_copy", status); return (status == HSA_STATUS_SUCCESS); } +bool HsaRsrcFactory::Memcpy(hsa_agent_t agent, void* dest_buff, const void* src_buff, uint32_t length) { + (void)agent; + return CopyToHost(dest_buff, src_buff, length); +} // Free method -bool HsaRsrcFactory::MemoryFree(void* ptr) { +bool HsaRsrcFactory::FreeMemory(void* ptr) { const hsa_status_t status = hsa_memory_free(ptr); CHECK_STATUS("hsa_memory_free", status); return (status == HSA_STATUS_SUCCESS); diff --git a/projects/rocprofiler/src/util/hsa_rsrc_factory.h b/projects/rocprofiler/src/util/hsa_rsrc_factory.h index 40f8c165bf..c9ab5f3f09 100644 --- a/projects/rocprofiler/src/util/hsa_rsrc_factory.h +++ b/projects/rocprofiler/src/util/hsa_rsrc_factory.h @@ -217,9 +217,10 @@ class HsaRsrcFactory { // Memcopy method static bool CopyToHost(void* dest_buff, const void* src_buff, uint32_t length); + static bool Memcpy(hsa_agent_t agent, void* dest_buff, const void* src_buff, uint32_t length); // Free method - static bool MemoryFree(void* ptr); + static bool FreeMemory(void* ptr); // Loads an Assembled Brig file and Finalizes it into Device Isa // diff --git a/projects/rocprofiler/test/ctrl/run_kernel.h b/projects/rocprofiler/test/ctrl/run_kernel.h index a731dbe17a..bd6dfa5c62 100644 --- a/projects/rocprofiler/test/ctrl/run_kernel.h +++ b/projects/rocprofiler/test/ctrl/run_kernel.h @@ -57,22 +57,23 @@ template bool RunKernel(int argc, char* argv[], int c return false; } - // Run test kernel + // Kernel dspatch iterations for (int i = 0; i < count; ++i) { + // Run test kernel ret_val = test_aql->Run(); if (ret_val == false) { std::cerr << "Error in running the test kernel" << std::endl; TEST_ASSERT(ret_val); return false; } - } - // Verify the results of the execution - ret_val = test_aql->VerifyResults(); - if (ret_val) { - std::clog << "Test : Passed" << std::endl; - } else { - std::clog << "Test : Failed" << std::endl; + // Verify the results of the execution + ret_val = test_aql->VerifyResults(); + if (ret_val) { + std::clog << "Test : Passed" << std::endl; + } else { + std::clog << "Test : Failed" << std::endl; + } } // Print time taken by sample diff --git a/projects/rocprofiler/test/ctrl/test_hsa.cpp b/projects/rocprofiler/test/ctrl/test_hsa.cpp index edcfd570b2..076fe66ca7 100644 --- a/projects/rocprofiler/test/ctrl/test_hsa.cpp +++ b/projects/rocprofiler/test/ctrl/test_hsa.cpp @@ -115,11 +115,26 @@ bool TestHsa::Setup() { mem_map_t& mem_map = test_->GetMemMap(); for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) { mem_descr_t& des = it->second; - void* ptr = (des.local) ? hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size) - : hsa_rsrc_->AllocateSysMemory(agent_info_, des.size); - des.ptr = ptr; - TEST_ASSERT(ptr != NULL); - if (ptr == NULL) return false; + switch (des.id) { + case TestKernel::LOCAL_DES_ID: + des.ptr = hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size); + break; + case TestKernel::KERNARG_DES_ID: + des.ptr = hsa_rsrc_->AllocateKernArgMemory(agent_info_, des.size); + if (des.ptr) memset(des.ptr, 0, des.size); + break; + case TestKernel::SYS_DES_ID: + des.ptr = hsa_rsrc_->AllocateSysMemory(agent_info_, des.size); + if (des.ptr) memset(des.ptr, 0, des.size); + break; + case TestKernel::NULL_DES_ID: + des.ptr = NULL; + break; + default: + break; + }; + TEST_ASSERT(des.ptr != NULL); + if (des.ptr == NULL) return false; } test_->Init(); @@ -208,24 +223,41 @@ bool TestHsa::Run() { hsa_signal_wait_acquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, HSA_WAIT_STATE_BLOCKED); + std::clog << "> DONE, que_idx=" << que_idx << std::endl; + // Stop the timer object hsa_timer_.StopTimer(dispatch_timer_idx_); dispatch_time_taken_ = hsa_timer_.ReadTimer(dispatch_timer_idx_); total_time_taken_ += dispatch_time_taken_; - // Copy kernel buffers from local memory into system memory - const bool suc = hsa_rsrc_->CopyToHost(test_->GetOutputPtr(), test_->GetLocalPtr(), test_->GetOutputSize()); - if (suc) test_->PrintOutput(); - - return suc; + return true; } bool TestHsa::VerifyResults() { - // Compare the results and see if they match - const void* const refout_ptr = test_->GetRefoutPtr(); - const int32_t cmp_val = - (refout_ptr != NULL) ? memcmp(test_->GetOutputPtr(), refout_ptr, test_->GetOutputSize()) : 0; - return (cmp_val == 0); + bool cmp = false; + void* output = NULL; + const uint32_t size = test_->GetOutputSize(); + bool suc = false; + + // Copy local kernel output buffers from local memory into host memory + if (test_->IsOutputLocal()) { + output = hsa_rsrc_->AllocateSysMemory(agent_info_, size); + suc = hsa_rsrc_->Memcpy(agent_info_, output, test_->GetOutputPtr(), size); + } else { + output = test_->GetOutputPtr();; + suc = true; + } + + if ((output != NULL) && suc) { + // Print the test output + test_->PrintOutput(output); + // Compare the results and see if they match + cmp = (memcmp(output, test_->GetRefOut(), size) == 0); + } + + if (test_->IsOutputLocal() && (output != NULL)) hsa_rsrc_->FreeMemory(output); + + return cmp; } void TestHsa::PrintTime() { diff --git a/projects/rocprofiler/test/ctrl/test_kernel.h b/projects/rocprofiler/test/ctrl/test_kernel.h index 8bdbe1a525..01b06c3433 100644 --- a/projects/rocprofiler/test/ctrl/test_kernel.h +++ b/projects/rocprofiler/test/ctrl/test_kernel.h @@ -28,22 +28,23 @@ OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef TEST_CTRL_TEST_KERNEL_H_ #define TEST_CTRL_TEST_KERNEL_H_ +#include #include #include -#include "util/hsa_rsrc_factory.h" - // Class implements kernel test class TestKernel { public: + // Exported buffers IDs + enum buf_id_t { KERNARG_EXP_ID, OUTPUT_EXP_ID, REFOUT_EXP_ID }; // Memory descriptors IDs - enum { INPUT_DES_ID, OUTPUT_DES_ID, LOCAL_DES_ID, MASK_DES_ID, KERNARG_DES_ID, REFOUT_DES_ID }; + enum des_id_t { NULL_DES_ID, LOCAL_DES_ID, KERNARG_DES_ID, SYS_DES_ID, REFOUT_DES_ID }; // Memory descriptors vector declaration struct mem_descr_t { + des_id_t id; void* ptr; uint32_t size; - bool local; }; // Memory map declaration @@ -51,12 +52,7 @@ class TestKernel { typedef mem_map_t::iterator mem_it_t; typedef mem_map_t::const_iterator mem_const_it_t; - virtual ~TestKernel() { - for (auto& entry : mem_map_) { - void* ptr = entry.second.ptr; - if (ptr != NULL) HsaRsrcFactory::MemoryFree(ptr); - } - } + virtual ~TestKernel() {} // Initialize method virtual void Init() = 0; @@ -65,50 +61,79 @@ class TestKernel { mem_map_t& GetMemMap() { return mem_map_; } // Return NULL descriptor - static mem_descr_t NullDescriptor() { return {NULL, 0, 0}; } + static mem_descr_t NullDescriptor() { return {NULL_DES_ID, NULL, 0}; } + + // Check if decripter is local + bool IsLocal(const mem_descr_t& descr) const { return (descr.id == LOCAL_DES_ID); } // Methods to get the kernel attributes - void* GetKernargPtr() const { return GetDescr(KERNARG_DES_ID).ptr; } - uint32_t GetKernargSize() const { return GetDescr(KERNARG_DES_ID).size; } - void* GetOutputPtr() const { return GetDescr(OUTPUT_DES_ID).ptr; } - uint32_t GetOutputSize() const { return GetDescr(OUTPUT_DES_ID).size; } - void* GetLocalPtr() const { return GetDescr(LOCAL_DES_ID).ptr; } - void* GetRefoutPtr() const { return GetDescr(REFOUT_DES_ID).ptr; } + const mem_descr_t& GetKernargDescr() { return *test_map_[KERNARG_EXP_ID]; }; + const mem_descr_t& GetOutputDescr() { return *test_map_[OUTPUT_EXP_ID]; }; + void* GetKernargPtr() { return GetKernargDescr().ptr; } + uint32_t GetKernargSize() { return GetKernargDescr().size; } + void* GetOutputPtr() { return GetOutputDescr().ptr; } + uint32_t GetOutputSize() { return GetOutputDescr().size; } + bool IsOutputLocal() { return IsLocal(GetOutputDescr()); } virtual uint32_t GetGridSize() const = 0; + // Return reference output + void* GetRefOut() { return test_map_[REFOUT_EXP_ID]->ptr; }; + // Print output - virtual void PrintOutput() const = 0; + virtual void PrintOutput(const void* ptr) const = 0; // Return name virtual std::string Name() const = 0; protected: - // Set system memory descriptor - bool SetSysDescr(const uint32_t& id, const uint32_t& size) { - return SetMemDescr(id, size, false); + // Set buffer descriptor + bool SetInDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + bool suc = SetMemDescr(buf_id, des_id, size); + if (des_id == KERNARG_DES_ID) { + test_map_[KERNARG_EXP_ID] = &mem_map_[buf_id]; + } + return suc; } - // Set local memory descriptor - bool SetLocalDescr(const uint32_t& id, const uint32_t& size) { - return SetMemDescr(id, size, true); + // Set results descriptor + bool SetOutDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + bool suc = SetMemDescr(buf_id, des_id, size); + test_map_[OUTPUT_EXP_ID] = &mem_map_[buf_id]; + return suc; + } + + // Set host descriptor + bool SetHostDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + bool suc = SetMemDescr(buf_id, des_id, size); + if (suc) { + mem_descr_t& descr = mem_map_[buf_id]; + descr.ptr = malloc(size); + if (des_id == REFOUT_DES_ID) { + test_map_[REFOUT_EXP_ID] = &descr; + } + if (descr.ptr == NULL) suc = false; + } + return suc; } // Get memory descriptor - mem_descr_t GetDescr(const uint32_t& id) const { - mem_const_it_t it = mem_map_.find(id); + mem_descr_t GetDescr(const uint32_t& buf_id) const { + mem_const_it_t it = mem_map_.find(buf_id); return (it != mem_map_.end()) ? it->second : NullDescriptor(); } private: // Set memory descriptor - bool SetMemDescr(const uint32_t& id, const uint32_t& size, const bool& local) { - const mem_descr_t des = {NULL, size, local}; - auto ret = mem_map_.insert(mem_map_t::value_type(id, des)); + bool SetMemDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + const mem_descr_t des = {des_id, NULL, size}; + auto ret = mem_map_.insert(mem_map_t::value_type(buf_id, des)); return ret.second; } // Kernel memory map object mem_map_t mem_map_; + // Test memory map object + std::map test_map_; }; #endif // TEST_CTRL_TEST_KERNEL_H_ diff --git a/projects/rocprofiler/test/ctrl/tool.cpp b/projects/rocprofiler/test/ctrl/tool.cpp index b56301243e..a83cb04918 100644 --- a/projects/rocprofiler/test/ctrl/tool.cpp +++ b/projects/rocprofiler/test/ctrl/tool.cpp @@ -266,16 +266,14 @@ hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, const uint32_t data_size = info_data->sqtt_data.size; const void* data_ptr = info_data->sqtt_data.ptr; fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); -#if 1 - dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); -#else - void* buffer = malloc(data_size); - memset(buffer, 0, data_size); - const bool suc = HsaRsrcFactory::Instance().CopyToHost(arg->agent, buffer, data_ptr, data_size); + + HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); + const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); + void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, data_size); + const bool suc = HsaRsrcFactory::Memcpy(arg->agent, buffer, data_ptr, data_size); if (suc) dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); else fatal("SQTT data memcopy to host failed"); - free(buffer); -#endif + HsaRsrcFactory::FreeMemory(buffer); } else status = HSA_STATUS_ERROR; return status; @@ -320,7 +318,7 @@ void output_results(const context_entry_t* entry, const char* label) { size += chunk_size; } fprintf(file, "size(%lu)\n", size); - free(p->data.result_bytes.ptr); + HsaRsrcFactory::FreeMemory(p->data.result_bytes.ptr); const_cast(p)->data.result_bytes.size = 0; } else { fprintf(file, "(\n"); diff --git a/projects/rocprofiler/test/simple_convolution/simple_convolution.cl b/projects/rocprofiler/test/simple_convolution/simple_convolution.cl index 9cf58d2008..11f3bc1bf0 100644 --- a/projects/rocprofiler/test/simple_convolution/simple_convolution.cl +++ b/projects/rocprofiler/test/simple_convolution/simple_convolution.cl @@ -67,7 +67,7 @@ __kernel void SimpleConvolution(__global uint * output, float sumFX = 0; for(uint i = left; i <= right; ++i) { - for(uint j = top ; j <= bottom; ++j) { + for(uint j = top; j <= bottom; ++j) { // performing wighted sum within the mask boundaries uint maskIndex = (j - (y - hstep)) * maskWidth + (i - (x - vstep)); uint index = j * width + i; diff --git a/projects/rocprofiler/test/simple_convolution/simple_convolution.cpp b/projects/rocprofiler/test/simple_convolution/simple_convolution.cpp index e5676d6d44..8052f2b971 100644 --- a/projects/rocprofiler/test/simple_convolution/simple_convolution.cpp +++ b/projects/rocprofiler/test/simple_convolution/simple_convolution.cpp @@ -27,6 +27,7 @@ OF THE POSSIBILITY OF SUCH DAMAGE. #include "simple_convolution/simple_convolution.h" +#include #include #include @@ -282,12 +283,11 @@ SimpleConvolution::SimpleConvolution() { const uint32_t input_size_bytes = width_ * height_ * sizeof(uint32_t); const uint32_t mask_size_bytes = mask_width_ * mask_height_ * sizeof(float); - SetSysDescr(KERNARG_DES_ID, sizeof(kernel_args_t)); - SetSysDescr(INPUT_DES_ID, input_size_bytes); - SetSysDescr(OUTPUT_DES_ID, input_size_bytes); - SetLocalDescr(LOCAL_DES_ID, input_size_bytes); - SetSysDescr(MASK_DES_ID, mask_size_bytes); - SetSysDescr(REFOUT_DES_ID, input_size_bytes); + SetInDescr(KERNARG_BUF_ID, KERNARG_DES_ID, sizeof(kernel_args_t)); + SetInDescr(INPUT_BUF_ID, SYS_DES_ID, input_size_bytes); + SetInDescr(MASK_BUF_ID, SYS_DES_ID, mask_size_bytes); + SetOutDescr(LOCAL_BUF_ID, LOCAL_DES_ID, input_size_bytes); + SetHostDescr(REFOUT_BUF_ID, REFOUT_DES_ID, input_size_bytes); if (!randomize_seed_) TEST_ASSERT(sizeof(input_data_) <= input_size_bytes); } @@ -295,14 +295,18 @@ SimpleConvolution::SimpleConvolution() { void SimpleConvolution::Init() { std::clog << "SimpleConvolution::init :" << std::endl; - mem_descr_t input_des = GetDescr(INPUT_DES_ID); - mem_descr_t local_des = GetDescr(LOCAL_DES_ID); - mem_descr_t mask_des = GetDescr(MASK_DES_ID); - mem_descr_t refout_des = GetDescr(REFOUT_DES_ID); - mem_descr_t kernarg_des = GetDescr(KERNARG_DES_ID); - + mem_descr_t kernarg_des = GetDescr(KERNARG_BUF_ID); + mem_descr_t input_des = GetDescr(INPUT_BUF_ID); + mem_descr_t mask_des = GetDescr(MASK_BUF_ID); + mem_descr_t output_des = GetDescr(LOCAL_BUF_ID); +#if 0 + printf("kernarg_des %p 0x%x\n", kernarg_des.ptr, kernarg_des.size); + printf("input_des %p 0x%x\n", input_des.ptr, input_des.size); + printf("mask_des %p 0x%x\n", mask_des.ptr, mask_des.size); + printf("output_des %p 0x%x\n", output_des.ptr, output_des.size); +#endif uint32_t* input = (uint32_t*)input_des.ptr; - uint32_t* output_local = (uint32_t*)local_des.ptr; + uint32_t* output_local = (uint32_t*)output_des.ptr; float* mask = (float*)mask_des.ptr; kernel_args_t* kernel_args = (kernel_args_t*)kernarg_des.ptr; @@ -343,13 +347,12 @@ void SimpleConvolution::Init() { kernel_args->arg51 = mask_height_; // Calculate the reference output - memset(refout_des.ptr, 0, refout_des.size); - ReferenceImplementation(reinterpret_cast(refout_des.ptr), input, mask, width_, height_, + ReferenceImplementation(reinterpret_cast(GetRefOut()), input, mask, width_, height_, mask_width_, mask_height_); } -void SimpleConvolution::PrintOutput() const { - PrintArray("> Output[0]", reinterpret_cast(GetOutputPtr()), width_, 1); +void SimpleConvolution::PrintOutput(const void* ptr) const { + PrintArray("> Output[0]", reinterpret_cast(ptr), width_, 1); } bool SimpleConvolution::ReferenceImplementation(uint32_t* output, const uint32_t* input, diff --git a/projects/rocprofiler/test/simple_convolution/simple_convolution.h b/projects/rocprofiler/test/simple_convolution/simple_convolution.h index a5b75a6c30..38424baaed 100644 --- a/projects/rocprofiler/test/simple_convolution/simple_convolution.h +++ b/projects/rocprofiler/test/simple_convolution/simple_convolution.h @@ -36,6 +36,9 @@ OF THE POSSIBILITY OF SUCH DAMAGE. // Class implements SimpleConvolution kernel parameters class SimpleConvolution : public TestKernel { public: + // Kernel buffers IDs + enum { INPUT_BUF_ID, LOCAL_BUF_ID, MASK_BUF_ID, KERNARG_BUF_ID, REFOUT_BUF_ID }; + // Constructor SimpleConvolution(); @@ -46,7 +49,7 @@ class SimpleConvolution : public TestKernel { uint32_t GetGridSize() const { return width_ * height_; } // Print output - void PrintOutput() const; + void PrintOutput(const void* ptr) const; // Return name std::string Name() const { return std::string("SimpleConvolution"); } diff --git a/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp b/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp index 7c257e5e93..1ba88faccd 100644 --- a/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp +++ b/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp @@ -217,25 +217,17 @@ const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) { } // Get the count of Hsa Gpu Agents available on the platform -// // @return uint32_t Number of Gpu agents on platform -// uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); } // Get the count of Hsa Cpu Agents available on the platform -// // @return uint32_t Number of Cpu agents on platform -// uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); } // Get the AgentInfo handle of a Gpu device -// // @param idx Gpu Agent at specified index -// // @param agent_info Output parameter updated with AgentInfo -// // @return bool true if successful, false otherwise -// bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { // Determine if request is valid uint32_t size = uint32_t(gpu_list_.size()); @@ -250,13 +242,9 @@ bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) } // Get the AgentInfo handle of a Cpu device -// // @param idx Cpu Agent at specified index -// // @param agent_info Output parameter updated with AgentInfo -// // @return bool true if successful, false otherwise -// bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { // Determine if request is valid uint32_t size = uint32_t(cpu_list_.size()); @@ -271,15 +259,10 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) // Create a Queue object and return its handle. The queue object is expected // to support user requested number of Aql dispatch packets. -// // @param agent_info Gpu Agent on which to create a queue object -// // @param num_Pkts Number of packets to be held by queue -// // @param queue Output parameter updated with handle of queue object -// // @return bool true if successful, false otherwise -// bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { hsa_status_t status; @@ -289,13 +272,9 @@ bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, } // Create a Signal object and return its handle. -// // @param value Initial value of signal object -// // @param signal Output parameter updated with handle of signal object -// // @return bool true if successful, false otherwise -// bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { hsa_status_t status; status = hsa_signal_create(value, 0, NULL, signal); @@ -305,13 +284,9 @@ bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { // Allocate memory for use by a kernel of specified size in specified // agent's memory region. Currently supports Global segment whose Kernarg // flag set. -// // @param agent_info Agent from whose memory region to allocate -// // @param size Size of memory in terms of bytes -// // @return uint8_t* Pointer to buffer, null if allocation fails. -// uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status; uint8_t* buffer = NULL; @@ -332,14 +307,10 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; } -// Allocate memory tp pass kernel parameters. -// +// Allocate host memory. // @param agent_info Agent from whose memory region to allocate -// // @param size Size of memory in terms of bytes -// // @return uint8_t* Pointer to buffer, null if allocation fails. -// uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; @@ -350,33 +321,40 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; } +// Allocate memory tp pass kernel parameters. +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) { + return AllocateSysMemory(agent_info, size); +} + // Memcopy method -bool HsaRsrcFactory::CopyToHost(void* dest_buff, const void* src_buff, uint32_t length) { +bool HsaRsrcFactory::Memcpy(hsa_agent_t agent, void* dest_buff, const void* src_buff, uint32_t length) { + (void)agent; const hsa_status_t status = hsa_memory_copy(dest_buff, src_buff, length); CHECK_STATUS("hsa_memory_copy", status); return (status == HSA_STATUS_SUCCESS); } +bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dest_buff, const void* src_buff, uint32_t length) { + (void)agent_info; + return Memcpy(agent_info->dev_id, dest_buff, src_buff, length); +} // Free method -bool HsaRsrcFactory::MemoryFree(void* ptr) { +bool HsaRsrcFactory::FreeMemory(void* ptr) { const hsa_status_t status = hsa_memory_free(ptr); CHECK_STATUS("hsa_memory_free", status); return (status == HSA_STATUS_SUCCESS); } // Loads an Assembled Brig file and Finalizes it into Device Isa -// // @param agent_info Gpu device for which to finalize -// // @param brig_path File path of the Assembled Brig file -// // @param kernel_name Name of the kernel to finalize -// // @param code_desc Handle of finalized Code Descriptor that could // be used to submit for execution -// // @return bool true if successful, false otherwise -// bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) { hsa_status_t status = HSA_STATUS_ERROR; diff --git a/projects/rocprofiler/test/util/hsa_rsrc_factory.h b/projects/rocprofiler/test/util/hsa_rsrc_factory.h index f5bd947dbc..021f02858b 100644 --- a/projects/rocprofiler/test/util/hsa_rsrc_factory.h +++ b/projects/rocprofiler/test/util/hsa_rsrc_factory.h @@ -137,101 +137,73 @@ class HsaRsrcFactory { const AgentInfo* GetAgentInfo(const hsa_agent_t agent); // Get the count of Hsa Gpu Agents available on the platform - // // @return uint32_t Number of Gpu agents on platform - // uint32_t GetCountOfGpuAgents(); // Get the count of Hsa Cpu Agents available on the platform - // // @return uint32_t Number of Cpu agents on platform - // uint32_t GetCountOfCpuAgents(); // Get the AgentInfo handle of a Gpu device - // // @param idx Gpu Agent at specified index - // // @param agent_info Output parameter updated with AgentInfo - // // @return bool true if successful, false otherwise - // bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); // Get the AgentInfo handle of a Cpu device - // // @param idx Cpu Agent at specified index - // // @param agent_info Output parameter updated with AgentInfo - // // @return bool true if successful, false otherwise - // bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); // Create a Queue object and return its handle. The queue object is expected // to support user requested number of Aql dispatch packets. - // // @param agent_info Gpu Agent on which to create a queue object - // // @param num_Pkts Number of packets to be held by queue - // // @param queue Output parameter updated with handle of queue object - // // @return bool true if successful, false otherwise - // bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); // Create a Signal object and return its handle. - // // @param value Initial value of signal object - // // @param signal Output parameter updated with handle of signal object - // // @return bool true if successful, false otherwise - // bool CreateSignal(uint32_t value, hsa_signal_t* signal); // Allocate memory for use by a kernel of specified size in specified // agent's memory region. Currently supports Global segment whose Kernarg // flag set. - // // @param agent_info Agent from whose memory region to allocate - // // @param size Size of memory in terms of bytes - // // @return uint8_t* Pointer to buffer, null if allocation fails. - // uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); - // Allocate memory tp pass kernel parameters. - // + // Allocate system memory. // @param agent_info Agent from whose memory region to allocate - // // @param size Size of memory in terms of bytes - // // @return uint8_t* Pointer to buffer, null if allocation fails. - // uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); + // Allocate memory tp pass kernel parameters. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size); + // Memcopy method - static bool CopyToHost(void* dest_buff, const void* src_buff, uint32_t length); + static bool Memcpy(const AgentInfo* agent_info, void* dest_buff, const void* src_buff, uint32_t length); + static bool Memcpy(hsa_agent_t agent, void* dest_buff, const void* src_buff, uint32_t length); // Free method - static bool MemoryFree(void* ptr); + static bool FreeMemory(void* ptr); // Loads an Assembled Brig file and Finalizes it into Device Isa - // // @param agent_info Gpu device for which to finalize - // // @param brig_path File path of the Assembled Brig file - // // @param kernel_name Name of the kernel to finalize - // // @param code_desc Handle of finalized Code Descriptor that could // be used to submit for execution - // // @return true if successful, false otherwise - // bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);