From daad2bc3d18d7fffa35bc303985e45d660f187eb Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 20 Dec 2017 21:18:01 -0600 Subject: [PATCH] fixing simple proxy queue implementation, write index supporting for checking the queue overflow --- inc/rocprofiler.h | 5 --- src/core/rocprofiler.cpp | 29 +++++++++++++++ src/core/simple_proxy_queue.cpp | 11 ++++-- src/core/simple_proxy_queue.h | 62 ++++++++++++++++++++++++++++----- test/run.sh | 4 +-- 5 files changed, 93 insertions(+), 18 deletions(-) diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index e738ed0db2..a649fb5e0b 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -253,11 +253,6 @@ hsa_status_t rocprofiler_iterate_trace_data( hsa_status_t rocprofiler_error_string( const char** str); // [out] the API error string pointer returning -//////////////////////////////////////////////////////////////////////////////// -// HSA-runtime tool on-load method -bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, - const char* const* failed_tool_names); - #ifdef __cplusplus } // extern "C" block #endif // __cplusplus diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 58707ce135..4afbf0747e 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -32,9 +32,18 @@ namespace rocprofiler { decltype(hsa_queue_create)* hsa_queue_create_fn; decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; +decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; +decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn; + +decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn; +decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn; 
+decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn; + #ifdef ROCP_HSA_PROXY decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; @@ -46,9 +55,18 @@ void SaveHsaApi(::HsaApiTable* table) { kHsaApiTable = table; hsa_queue_create_fn = table->core_->hsa_queue_create_fn; hsa_queue_destroy_fn = table->core_->hsa_queue_destroy_fn; + hsa_signal_store_relaxed_fn = table->core_->hsa_signal_store_relaxed_fn; + hsa_signal_store_screlease_fn = table->core_->hsa_signal_store_screlease_fn; + hsa_queue_load_write_index_relaxed_fn = table->core_->hsa_queue_load_write_index_relaxed_fn; hsa_queue_store_write_index_relaxed_fn = table->core_->hsa_queue_store_write_index_relaxed_fn; + hsa_queue_load_read_index_relaxed_fn = table->core_->hsa_queue_load_read_index_relaxed_fn; + + hsa_queue_load_write_index_scacquire_fn = table->core_->hsa_queue_load_write_index_scacquire_fn; + hsa_queue_store_write_index_screlease_fn = table->core_->hsa_queue_store_write_index_screlease_fn; + hsa_queue_load_read_index_scacquire_fn = table->core_->hsa_queue_load_read_index_scacquire_fn; + #ifdef ROCP_HSA_PROXY hsa_amd_queue_intercept_create_fn = table->amd_ext_->hsa_amd_queue_intercept_create_fn; hsa_amd_queue_intercept_register_fn = table->amd_ext_->hsa_amd_queue_intercept_register_fn; @@ -59,9 +77,18 @@ void RestoreHsaApi() { ::HsaApiTable* table = kHsaApiTable; table->core_->hsa_queue_create_fn = hsa_queue_create_fn; table->core_->hsa_queue_destroy_fn = hsa_queue_destroy_fn; + table->core_->hsa_signal_store_relaxed_fn = hsa_signal_store_relaxed_fn; + table->core_->hsa_signal_store_screlease_fn = hsa_signal_store_screlease_fn; + table->core_->hsa_queue_load_write_index_relaxed_fn = hsa_queue_load_write_index_relaxed_fn; table->core_->hsa_queue_store_write_index_relaxed_fn = hsa_queue_store_write_index_relaxed_fn; + table->core_->hsa_queue_load_read_index_relaxed_fn 
= hsa_queue_load_read_index_relaxed_fn; + + table->core_->hsa_queue_load_write_index_scacquire_fn = hsa_queue_load_write_index_scacquire_fn; + table->core_->hsa_queue_store_write_index_screlease_fn = hsa_queue_store_write_index_screlease_fn; + table->core_->hsa_queue_load_read_index_scacquire_fn = hsa_queue_load_read_index_scacquire_fn; + #ifdef ROCP_HSA_PROXY table->amd_ext_->hsa_amd_queue_intercept_create_fn = hsa_amd_queue_intercept_create_fn; table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; @@ -240,6 +267,7 @@ PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data( API_METHOD_SUFFIX } +// HSA-runtime tool on-load method PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) { rocprofiler::SaveHsaApi(table); @@ -253,6 +281,7 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa return true; } +// HSA-runtime tool on-unload method PUBLIC_API void OnUnload() { rocprofiler::RestoreHsaApi(); } } // extern "C" diff --git a/src/core/simple_proxy_queue.cpp b/src/core/simple_proxy_queue.cpp index cfb25f2c4e..589acaa9d8 100644 --- a/src/core/simple_proxy_queue.cpp +++ b/src/core/simple_proxy_queue.cpp @@ -3,8 +3,15 @@ namespace rocprofiler { void SimpleProxyQueue::HsaIntercept(HsaApiTable* table) { table->core_->hsa_signal_store_relaxed_fn = rocprofiler::SimpleProxyQueue::SignalStore; - table->core_->hsa_queue_load_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::LoadIndex; - table->core_->hsa_queue_store_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::StoreIndex; + table->core_->hsa_signal_store_screlease_fn = rocprofiler::SimpleProxyQueue::SignalStore; + + table->core_->hsa_queue_load_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::GetQueueIndex; + table->core_->hsa_queue_store_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::SetQueueIndex; + 
table->core_->hsa_queue_load_read_index_relaxed_fn = rocprofiler::SimpleProxyQueue::GetSubmitIndex; + + table->core_->hsa_queue_load_write_index_scacquire_fn = rocprofiler::SimpleProxyQueue::GetQueueIndex; + table->core_->hsa_queue_store_write_index_screlease_fn = rocprofiler::SimpleProxyQueue::SetQueueIndex; + table->core_->hsa_queue_load_read_index_scacquire_fn = rocprofiler::SimpleProxyQueue::GetSubmitIndex; } SimpleProxyQueue::queue_map_t* SimpleProxyQueue::queue_map_ = NULL; diff --git a/src/core/simple_proxy_queue.h b/src/core/simple_proxy_queue.h index 9739c5e5d1..0695fb4596 100644 --- a/src/core/simple_proxy_queue.h +++ b/src/core/simple_proxy_queue.h @@ -10,12 +10,25 @@ #include "core/types.h" #include "util/hsa_rsrc_factory.h" +#ifndef ROCP_PROXY_LOCK +# define ROCP_PROXY_LOCK 1 +#endif + namespace rocprofiler { extern decltype(hsa_queue_create)* hsa_queue_create_fn; extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; +extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn; + extern decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; extern decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; +extern decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn; + +extern decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn; +extern decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn; +extern decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn; + typedef decltype(hsa_signal_t::handle) signal_handle_t; @@ -27,9 +40,11 @@ class SimpleProxyQueue : public ProxyQueue { auto it = queue_map_->find(signal.handle); if (it != queue_map_->end()) { SimpleProxyQueue* instance = it->second; + instance->mutex_lock(); const uint64_t begin = instance->submit_index_; const uint64_t end 
= que_idx + 1; instance->submit_index_ = end; + instance->mutex_unlock(); for (uint64_t j = begin; j < end; ++j) { // Submited packet const uint32_t idx = j & instance->queue_mask_; @@ -44,12 +59,24 @@ class SimpleProxyQueue : public ProxyQueue { } } - static uint64_t LoadIndex(const hsa_queue_t* queue) { + static uint64_t GetSubmitIndex(const hsa_queue_t* queue) { uint64_t index = 0; auto it = queue_map_->find(queue->doorbell_signal.handle); if (it != queue_map_->end()) { SimpleProxyQueue* instance = it->second; - instance->mutex_.lock(); + index = instance->submit_index_; + } else { + index = hsa_queue_load_read_index_relaxed_fn(queue); + } + return index; + } + + static uint64_t GetQueueIndex(const hsa_queue_t* queue) { + uint64_t index = 0; + auto it = queue_map_->find(queue->doorbell_signal.handle); + if (it != queue_map_->end()) { + SimpleProxyQueue* instance = it->second; + instance->mutex_lock(); index = instance->queue_index_; } else { index = hsa_queue_load_write_index_relaxed_fn(queue); @@ -57,12 +84,12 @@ class SimpleProxyQueue : public ProxyQueue { return index; } - static void StoreIndex(const hsa_queue_t* queue, uint64_t value) { + static void SetQueueIndex(const hsa_queue_t* queue, uint64_t value) { auto it = queue_map_->find(queue->doorbell_signal.handle); if (it != queue_map_->end()) { SimpleProxyQueue* instance = it->second; instance->queue_index_ = value; - instance->mutex_.unlock(); + instance->mutex_unlock(); } else { hsa_queue_store_write_index_relaxed_fn(queue, value); } @@ -75,9 +102,13 @@ class SimpleProxyQueue : public ProxyQueue { } void Submit(const packet_t* packet) { - // Compute the write index of queue and copy Aql packet into it + // Compute the write index of queue const uint64_t que_idx = hsa_queue_load_write_index_relaxed_fn(queue_); - // Increment the write index and ring the doorbell to submit the packet. 
+ + // Waiting until there is a free space in the queue + while (que_idx >= (hsa_queue_load_read_index_relaxed_fn(queue_) + size_)); + + // Increment the write index hsa_queue_store_write_index_relaxed_fn(queue_, que_idx + 1); const uint32_t mask = queue_->size - 1; @@ -92,12 +123,12 @@ class SimpleProxyQueue : public ProxyQueue { // To maintain global order to ensure the prior copy of the packet contents is made visible // before the header is updated. - // With in-order CP it will wait until the first packet in the blob will be valid + // With in-order CP it will wait until the first packet in the blob will be valid. std::atomic* header_atomic_ptr = reinterpret_cast*>(&dst[0]); header_atomic_ptr->store(src[0], std::memory_order_release); - // Doorbell signaling + // Doorbell signaling to submit the packet hsa_signal_store_relaxed_fn(doorbell_signal_, que_idx); } @@ -121,6 +152,7 @@ class SimpleProxyQueue : public ProxyQueue { void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data, uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue) { + size_ = size; auto status = Init(agent, size); *queue = queue_; return status; @@ -131,7 +163,6 @@ class SimpleProxyQueue : public ProxyQueue { agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); if (agent_info_ != NULL) { if (agent_info_->dev_type == HSA_DEVICE_TYPE_GPU) { - printf("queue_create size 0x%x(%d)\n", size, (int)size); status = hsa_queue_create_fn(agent, size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue_); if (status == HSA_STATUS_SUCCESS) { @@ -163,6 +194,19 @@ class SimpleProxyQueue : public ProxyQueue { return status; } + void mutex_lock() { +#if ROCP_PROXY_LOCK + mutex_.lock(); +#endif + } + + void mutex_unlock() { +#if ROCP_PROXY_LOCK + mutex_.unlock(); +#endif + } + + uint32_t size_; static queue_map_t* queue_map_; const util::AgentInfo* agent_info_; hsa_queue_t* queue_; diff --git a/test/run.sh b/test/run.sh index
2459030c11..970e8cae41 100755 --- a/test/run.sh +++ b/test/run.sh @@ -4,11 +4,11 @@ test_bin_dflt=./test/ctrl #export HSA_LIB=/home/evgeny/pkg/compute-psdb-16453/lib export HSA_LIB=/home/evgeny/git/compute/out/ubuntu-16.04/16.04/lib -export OCL_LIB=/home/evgeny/pkg/opencl_modified/opencl_x86_64/lib +#export OCL_LIB=/home/evgeny/pkg/opencl_modified/opencl_x86_64/lib #export OCL_LIB=/home/evgeny/Perforce/eshcherb_opencl/drivers/opencl/dist/linux/debug/lib/x86_64 # paths to ROC profiler and oher libraries -export LD_LIBRARY_PATH=$PWD:$HSA_LIB:$OCL_LIB +export LD_LIBRARY_PATH=$PWD:$HSA_LIB # enable error messages logging to '/tmp/rocprofiler_log.txt' export ROCPROFILER_LOG=1