Blit SDMA support for gfx70x

Change-Id: Ie6f215890553ef41c3f36b349fc9cc39c2d38747


[ROCm/ROCR-Runtime commit: 103cd04236]
This commit is contained in:
Besar Wicaksono
2016-05-31 13:38:45 -05:00
committed by Kent Russell
parent f937d533f6
commit 7b50eacba5
5 changed files with 54 additions and 14 deletions
@@ -227,6 +227,9 @@ class BlitSdma : public core::Blit {
/// Max total fill count supported by the queue.
size_t max_total_fill_size_;
/// True if platform atomic is supported.
bool platform_atomic_support_;
};
} // namespace amd
@@ -79,6 +79,11 @@ class GpuAgentInt : public core::Agent {
// @retval HSA_STATUS_SUCCESS DMA queue initialization is successful.
virtual void InitDma() = 0;
// @brief Initialize blit kernel object based on AQL queue.
//
// @retval HSA_STATUS_SUCCESS blit kernel object initialization is successful.
virtual hsa_status_t InitBlitKernel() = 0;
// @brief Invoke the user provided callback for each region accessible by
// this agent.
//
@@ -178,10 +183,8 @@ class GpuAgent : public GpuAgentInt {
// @brief Override from core::Agent.
void InitDma() override;
// @brief Initialize blit kernel object based on AQL queue.
//
// @retval HSA_STATUS_SUCCESS blit kernel object initialization is successful.
hsa_status_t InitBlitKernel();
// @brief Override from amd::GpuAgentInt.
hsa_status_t InitBlitKernel() override;
uint16_t GetMicrocodeVersion() const;
@@ -365,7 +365,8 @@ BlitSdma::BlitSdma()
fence_pool_size_(0),
fence_pool_counter_(0),
cached_reserve_offset_(0),
cached_commit_offset_(0) {
cached_commit_offset_(0),
platform_atomic_support_(true) {
std::memset(&queue_resource_, 0, sizeof(queue_resource_));
}
@@ -418,6 +419,10 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
return HSA_STATUS_ERROR;
}
if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) {
platform_atomic_support_ = false;
}
// Allocate queue buffer.
queue_size_ = kQueueSize;
@@ -568,7 +573,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
const uint32_t total_copy_command_size =
num_copy_command * linear_copy_command_size_;
// In case the user disable or enable the profiling in the middle of the call.
// Load the profiling state early in case the user disables or enables
// profiling in the middle of the call.
const bool profiling_enabled = agent_->profiling_enabled();
uint64_t* end_ts_addr = NULL;
@@ -589,8 +595,20 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
(2 * timestamp_command_size_) + linear_copy_command_size_;
}
// On agents that do not support platform atomics, the atomic decrement is
// replaced with one or two fence packet(s) that update the signal value. A
// fence is used rather than a write packet because the SDMA engine may
// overlap serial copy/write packets.
const uint64_t completion_signal_value =
static_cast<uint64_t>(out_signal.LoadRelaxed() - 1);
const size_t sync_command_size = (platform_atomic_support_)
? atomic_command_size_
: (completion_signal_value > UINT32_MAX)
? 2 * fence_command_size_
: fence_command_size_;
const uint32_t total_command_size =
total_poll_command_size + total_copy_command_size + atomic_command_size_ +
total_poll_command_size + total_copy_command_size + sync_command_size +
total_timestamp_command_size;
char* command_addr = AcquireWriteAddress(total_command_size);
@@ -635,7 +653,20 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
}
// After transfer is completed, decrement the signal.
BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
if (platform_atomic_support_) {
BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
} else {
uint32_t* signal_value_location =
reinterpret_cast<uint32_t*>(out_signal.ValueLocation());
if (completion_signal_value > UINT32_MAX) {
BuildFenceCommand(command_addr, signal_value_location + 1,
static_cast<uint32_t>(completion_signal_value >> 32));
command_addr += fence_command_size_;
}
BuildFenceCommand(command_addr, signal_value_location,
static_cast<uint32_t>(completion_signal_value));
}
ReleaseWriteAddress(command_addr_temp, total_command_size);
@@ -78,12 +78,6 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
GpuAgent* gpu = new GpuAgent(node_id, node_prop);
core::Runtime::runtime_singleton_->RegisterAgent(gpu);
if (HSA_STATUS_SUCCESS != gpu->InitBlitKernel()) {
assert(false && "Fail init blit");
delete gpu;
gpu = NULL;
}
return gpu;
}
@@ -430,6 +430,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
[](void* dst, const void* src, size_t size,
std::vector<core::Signal*> dep_signals,
core::Signal* completion_signal, bool profiling_enabled) {
for (core::Signal* dep : dep_signals) {
dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
HSA_WAIT_STATE_BLOCKED);
@@ -820,6 +821,14 @@ void Runtime::Load() {
// Load tools libraries
LoadTools();
// Initialize the blit kernel objects after the tools are initialized so that
// tools can overload the blit kernel.
for (core::Agent* agent : gpu_agents_) {
const hsa_status_t stat =
reinterpret_cast<amd::GpuAgentInt*>(agent)->InitBlitKernel();
assert(HSA_STATUS_SUCCESS == stat);
}
}
void Runtime::Unload() {