Fix the simple proxy queue implementation; add write-index support for checking queue overflow

Bu işleme şunda yer alıyor:
Evgeny
2017-12-20 21:18:01 -06:00
ebeveyn d47ce7f51d
işleme daad2bc3d1
5 değiştirilmiş dosya ile 93 ekleme ve 18 silme
-5
Dosyayı Görüntüle
@@ -253,11 +253,6 @@ hsa_status_t rocprofiler_iterate_trace_data(
hsa_status_t rocprofiler_error_string(
const char** str); // [out] the API error string pointer returning
////////////////////////////////////////////////////////////////////////////////
// HSA-runtime tool on-load method
bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
const char* const* failed_tool_names);
#ifdef __cplusplus
} // extern "C" block
#endif // __cplusplus
+29
Dosyayı Görüntüle
@@ -32,9 +32,18 @@
namespace rocprofiler {
// Saved pointers to the HSA runtime entry points.
// SaveHsaApi() fills them from the HSA API table and RestoreHsaApi() writes them back.
decltype(hsa_queue_create)* hsa_queue_create_fn;
decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
// Signal store entry points, relaxed and screlease variants.
decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn;
// NOTE(review): declared with the type of the relaxed variant; presumably both variants
// share the same signature - confirm against the HSA headers.
decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn;
// Queue write/read index accessors, relaxed variants.
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
// Queue write/read index accessors, scacquire/screlease variants.
decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn;
#ifdef ROCP_HSA_PROXY
decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn;
decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn;
@@ -46,9 +55,18 @@ void SaveHsaApi(::HsaApiTable* table) {
kHsaApiTable = table;
hsa_queue_create_fn = table->core_->hsa_queue_create_fn;
hsa_queue_destroy_fn = table->core_->hsa_queue_destroy_fn;
hsa_signal_store_relaxed_fn = table->core_->hsa_signal_store_relaxed_fn;
hsa_signal_store_screlease_fn = table->core_->hsa_signal_store_screlease_fn;
hsa_queue_load_write_index_relaxed_fn = table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_queue_store_write_index_relaxed_fn = table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_queue_load_read_index_relaxed_fn = table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_queue_load_write_index_scacquire_fn = table->core_->hsa_queue_load_write_index_scacquire_fn;
hsa_queue_store_write_index_screlease_fn = table->core_->hsa_queue_store_write_index_screlease_fn;
hsa_queue_load_read_index_scacquire_fn = table->core_->hsa_queue_load_read_index_scacquire_fn;
#ifdef ROCP_HSA_PROXY
hsa_amd_queue_intercept_create_fn = table->amd_ext_->hsa_amd_queue_intercept_create_fn;
hsa_amd_queue_intercept_register_fn = table->amd_ext_->hsa_amd_queue_intercept_register_fn;
@@ -59,9 +77,18 @@ void RestoreHsaApi() {
::HsaApiTable* table = kHsaApiTable;
table->core_->hsa_queue_create_fn = hsa_queue_create_fn;
table->core_->hsa_queue_destroy_fn = hsa_queue_destroy_fn;
table->core_->hsa_signal_store_relaxed_fn = hsa_signal_store_relaxed_fn;
table->core_->hsa_signal_store_screlease_fn = hsa_signal_store_screlease_fn;
table->core_->hsa_queue_load_write_index_relaxed_fn = hsa_queue_load_write_index_relaxed_fn;
table->core_->hsa_queue_store_write_index_relaxed_fn = hsa_queue_store_write_index_relaxed_fn;
table->core_->hsa_queue_load_read_index_relaxed_fn = hsa_queue_load_read_index_relaxed_fn;
table->core_->hsa_queue_load_write_index_scacquire_fn = hsa_queue_load_write_index_scacquire_fn;
table->core_->hsa_queue_store_write_index_screlease_fn = hsa_queue_store_write_index_screlease_fn;
table->core_->hsa_queue_load_read_index_scacquire_fn = hsa_queue_load_read_index_scacquire_fn;
#ifdef ROCP_HSA_PROXY
table->amd_ext_->hsa_amd_queue_intercept_create_fn = hsa_amd_queue_intercept_create_fn;
table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn;
@@ -240,6 +267,7 @@ PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data(
API_METHOD_SUFFIX
}
// HSA-runtime tool on-load method
PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
const char* const* failed_tool_names) {
rocprofiler::SaveHsaApi(table);
@@ -253,6 +281,7 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa
return true;
}
// HSA-runtime tool on-unload method.
// Calls RestoreHsaApi(), which writes the saved function pointers back into the HSA API table.
PUBLIC_API void OnUnload() { rocprofiler::RestoreHsaApi(); }
} // extern "C"
+9 -2
Dosyayı Görüntüle
@@ -3,8 +3,15 @@
namespace rocprofiler {
// Redirects entries of the HSA core API table to the SimpleProxyQueue static handlers:
// signal stores, queue write-index loads/stores and queue read-index loads, in both the
// relaxed and the scacquire/screlease variants.
//   table - HSA API table whose core_ entries are overwritten.
void SimpleProxyQueue::HsaIntercept(HsaApiTable* table) {
table->core_->hsa_signal_store_relaxed_fn = rocprofiler::SimpleProxyQueue::SignalStore;
// NOTE(review): the next two assignments are overwritten further down by
// GetQueueIndex/SetQueueIndex; they look like leftovers of the previous
// LoadIndex/StoreIndex naming (possibly removed lines of the diff) - confirm.
table->core_->hsa_queue_load_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::LoadIndex;
table->core_->hsa_queue_store_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::StoreIndex;
table->core_->hsa_signal_store_screlease_fn = rocprofiler::SimpleProxyQueue::SignalStore;
// Relaxed index accessors: write index -> queue index handlers, read index -> submit index handler.
table->core_->hsa_queue_load_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::GetQueueIndex;
table->core_->hsa_queue_store_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::SetQueueIndex;
table->core_->hsa_queue_load_read_index_relaxed_fn = rocprofiler::SimpleProxyQueue::GetSubmitIndex;
// The scacquire/screlease variants are routed to the same handlers as the relaxed ones.
table->core_->hsa_queue_load_write_index_scacquire_fn = rocprofiler::SimpleProxyQueue::GetQueueIndex;
table->core_->hsa_queue_store_write_index_screlease_fn = rocprofiler::SimpleProxyQueue::SetQueueIndex;
table->core_->hsa_queue_load_read_index_scacquire_fn = rocprofiler::SimpleProxyQueue::GetSubmitIndex;
}
SimpleProxyQueue::queue_map_t* SimpleProxyQueue::queue_map_ = NULL;
+53 -9
Dosyayı Görüntüle
@@ -10,12 +10,25 @@
#include "core/types.h"
#include "util/hsa_rsrc_factory.h"
#ifndef ROCP_PROXY_LOCK
# define ROCP_PROXY_LOCK 1
#endif
namespace rocprofiler {
// Declarations of the saved HSA runtime entry points.
// The handlers in this header call them to forward requests for queues that are not
// found in the proxy queue map.
extern decltype(hsa_queue_create)* hsa_queue_create_fn;
extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
// Signal store entry points, relaxed and screlease variants.
extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn;
// NOTE(review): declared with the type of the relaxed variant; presumably both variants
// share the same signature - confirm against the HSA headers.
extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn;
// Queue write/read index accessors, relaxed variants.
extern decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
extern decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
extern decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
// Queue write/read index accessors, scacquire/screlease variants.
extern decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
extern decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
extern decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn;
typedef decltype(hsa_signal_t::handle) signal_handle_t;
@@ -27,9 +40,11 @@ class SimpleProxyQueue : public ProxyQueue {
auto it = queue_map_->find(signal.handle);
if (it != queue_map_->end()) {
SimpleProxyQueue* instance = it->second;
instance->mutex_lock();
const uint64_t begin = instance->submit_index_;
const uint64_t end = que_idx + 1;
instance->submit_index_ = end;
instance->mutex_unlock();
for (uint64_t j = begin; j < end; ++j) {
// Submitted packet
const uint32_t idx = j & instance->queue_mask_;
@@ -44,12 +59,24 @@ class SimpleProxyQueue : public ProxyQueue {
}
}
static uint64_t LoadIndex(const hsa_queue_t* queue) {
// Handler installed by HsaIntercept for the HSA queue load-read-index entry points.
// Looks the queue up in queue_map_ by its doorbell signal handle:
//   - found:     returns the proxy instance's submit_index_;
//   - not found: forwards to the saved hsa_queue_load_read_index_relaxed_fn.
//   queue   - queue whose index is requested.
//   returns - the index value selected as described above.
static uint64_t GetSubmitIndex(const hsa_queue_t* queue) {
uint64_t index = 0;
auto it = queue_map_->find(queue->doorbell_signal.handle);
if (it != queue_map_->end()) {
SimpleProxyQueue* instance = it->second;
// NOTE(review): mutex_ is locked here directly (bypassing the mutex_lock() wrapper) and
// is not released anywhere in this function; this may be a removed line of the
// diff - confirm.
instance->mutex_.lock();
index = instance->submit_index_;
} else {
index = hsa_queue_load_read_index_relaxed_fn(queue);
}
return index;
}
static uint64_t GetQueueIndex(const hsa_queue_t* queue) {
uint64_t index = 0;
auto it = queue_map_->find(queue->doorbell_signal.handle);
if (it != queue_map_->end()) {
SimpleProxyQueue* instance = it->second;
instance->mutex_lock();
index = instance->queue_index_;
} else {
index = hsa_queue_load_write_index_relaxed_fn(queue);
@@ -57,12 +84,12 @@ class SimpleProxyQueue : public ProxyQueue {
return index;
}
static void StoreIndex(const hsa_queue_t* queue, uint64_t value) {
static void SetQueueIndex(const hsa_queue_t* queue, uint64_t value) {
auto it = queue_map_->find(queue->doorbell_signal.handle);
if (it != queue_map_->end()) {
SimpleProxyQueue* instance = it->second;
instance->queue_index_ = value;
instance->mutex_.unlock();
instance->mutex_unlock();
} else {
hsa_queue_store_write_index_relaxed_fn(queue, value);
}
@@ -75,9 +102,13 @@ class SimpleProxyQueue : public ProxyQueue {
}
void Submit(const packet_t* packet) {
// Compute the write index of queue and copy Aql packet into it
// Compute the write index of queue
const uint64_t que_idx = hsa_queue_load_write_index_relaxed_fn(queue_);
// Increment the write index and ring the doorbell to submit the packet.
// Wait until there is free space in the queue
while (que_idx >= (hsa_queue_load_read_index_relaxed_fn(queue_) + size_));
// Increment the write index
hsa_queue_store_write_index_relaxed_fn(queue_, que_idx + 1);
const uint32_t mask = queue_->size - 1;
@@ -92,12 +123,12 @@ class SimpleProxyQueue : public ProxyQueue {
// To maintain global order to ensure the prior copy of the packet contents is made visible
// before the header is updated.
// With in-order CP it will wait until the first packet in the blob will be valid
// With in-order CP it will wait until the first packet in the blob will be valid.
std::atomic<packet_word_t>* header_atomic_ptr =
reinterpret_cast<std::atomic<packet_word_t>*>(&dst[0]);
header_atomic_ptr->store(src[0], std::memory_order_release);
// Doorbell signaling
// Doorbell signaling to submit the packet
hsa_signal_store_relaxed_fn(doorbell_signal_, que_idx);
}
@@ -121,6 +152,7 @@ class SimpleProxyQueue : public ProxyQueue {
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
void* data, uint32_t private_segment_size, uint32_t group_segment_size,
hsa_queue_t** queue) {
size_ = size;
auto status = Init(agent, size);
*queue = queue_;
return status;
@@ -131,7 +163,6 @@ class SimpleProxyQueue : public ProxyQueue {
agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent);
if (agent_info_ != NULL) {
if (agent_info_->dev_type == HSA_DEVICE_TYPE_GPU) {
printf("queue_create size 0x%x(%d)\n", size, (int)size);
status = hsa_queue_create_fn(agent, size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX,
UINT32_MAX, &queue_);
if (status == HSA_STATUS_SUCCESS) {
@@ -163,6 +194,19 @@ class SimpleProxyQueue : public ProxyQueue {
return status;
}
// Locks mutex_ when ROCP_PROXY_LOCK is non-zero; otherwise the body is compiled out
// and the call does nothing.
void mutex_lock() {
#if ROCP_PROXY_LOCK
mutex_.lock();
#endif
}
// Unlocks mutex_ when ROCP_PROXY_LOCK is non-zero; otherwise the body is compiled out
// and the call does nothing.
void mutex_unlock() {
#if ROCP_PROXY_LOCK
mutex_.unlock();
#endif
}
uint32_t size_;
static queue_map_t* queue_map_;
const util::AgentInfo* agent_info_;
hsa_queue_t* queue_;
+2 -2
Dosyayı Görüntüle
@@ -4,11 +4,11 @@ test_bin_dflt=./test/ctrl
#export HSA_LIB=/home/evgeny/pkg/compute-psdb-16453/lib
export HSA_LIB=/home/evgeny/git/compute/out/ubuntu-16.04/16.04/lib
export OCL_LIB=/home/evgeny/pkg/opencl_modified/opencl_x86_64/lib
#export OCL_LIB=/home/evgeny/pkg/opencl_modified/opencl_x86_64/lib
#export OCL_LIB=/home/evgeny/Perforce/eshcherb_opencl/drivers/opencl/dist/linux/debug/lib/x86_64
# paths to ROC profiler and other libraries
export LD_LIBRARY_PATH=$PWD:$HSA_LIB:$OCL_LIB
export LD_LIBRARY_PATH=$PWD:$HSA_LIB
# enable error messages logging to '/tmp/rocprofiler_log.txt'
export ROCPROFILER_LOG=1