Blit SDMA support for gfx70x
Change-Id: Ie6f215890553ef41c3f36b349fc9cc39c2d38747
This commit is contained in:
committed by
Kent Russell
parent
f49ddad0a1
commit
103cd04236
@@ -227,6 +227,9 @@ class BlitSdma : public core::Blit {
|
||||
|
||||
/// Max total fill count supported by the queue.
|
||||
size_t max_total_fill_size_;
|
||||
|
||||
/// True if platform atomics are supported.
|
||||
bool platform_atomic_support_;
|
||||
};
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -79,6 +79,11 @@ class GpuAgentInt : public core::Agent {
|
||||
// @retval HSA_STATUS_SUCCESS DMA queue initialization is successful.
|
||||
virtual void InitDma() = 0;
|
||||
|
||||
// @brief Initialize blit kernel object based on AQL queue.
|
||||
//
|
||||
// @retval HSA_STATUS_SUCCESS blit kernel object initialization is successful.
|
||||
virtual hsa_status_t InitBlitKernel() = 0;
|
||||
|
||||
// @brief Invoke the user provided callback for each region accessible by
|
||||
// this agent.
|
||||
//
|
||||
@@ -178,10 +183,8 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @brief Override from core::Agent.
|
||||
void InitDma() override;
|
||||
|
||||
// @brief Initialize blit kernel object based on AQL queue.
|
||||
//
|
||||
// @retval HSA_STATUS_SUCCESS blit kernel object initialization is successful.
|
||||
hsa_status_t InitBlitKernel();
|
||||
// @brief Override from amd::GpuAgentInt.
|
||||
hsa_status_t InitBlitKernel() override;
|
||||
|
||||
uint16_t GetMicrocodeVersion() const;
|
||||
|
||||
|
||||
@@ -365,7 +365,8 @@ BlitSdma::BlitSdma()
|
||||
fence_pool_size_(0),
|
||||
fence_pool_counter_(0),
|
||||
cached_reserve_offset_(0),
|
||||
cached_commit_offset_(0) {
|
||||
cached_commit_offset_(0),
|
||||
platform_atomic_support_(true) {
|
||||
std::memset(&queue_resource_, 0, sizeof(queue_resource_));
|
||||
}
|
||||
|
||||
@@ -418,6 +419,10 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) {
|
||||
platform_atomic_support_ = false;
|
||||
}
|
||||
|
||||
// Allocate queue buffer.
|
||||
queue_size_ = kQueueSize;
|
||||
|
||||
@@ -568,7 +573,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
|
||||
const uint32_t total_copy_command_size =
|
||||
num_copy_command * linear_copy_command_size_;
|
||||
|
||||
// In case the user disable or enable the profiling in the middle of the call.
|
||||
// Load the profiling state early in case the user disables or enables
|
||||
// profiling in the middle of the call.
|
||||
const bool profiling_enabled = agent_->profiling_enabled();
|
||||
|
||||
uint64_t* end_ts_addr = NULL;
|
||||
@@ -589,8 +595,20 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
|
||||
(2 * timestamp_command_size_) + linear_copy_command_size_;
|
||||
}
|
||||
|
||||
// On agents that do not support platform atomics, we replace the atomic
|
||||
// decrement with one or two fence packets that update the signal value. A
|
||||
// fence packet is used rather than a write packet because the SDMA engine
|
||||
// may overlap serial copy/write packets.
|
||||
const uint64_t completion_signal_value =
|
||||
static_cast<uint64_t>(out_signal.LoadRelaxed() - 1);
|
||||
const size_t sync_command_size = (platform_atomic_support_)
|
||||
? atomic_command_size_
|
||||
: (completion_signal_value > UINT32_MAX)
|
||||
? 2 * fence_command_size_
|
||||
: fence_command_size_;
|
||||
|
||||
const uint32_t total_command_size =
|
||||
total_poll_command_size + total_copy_command_size + atomic_command_size_ +
|
||||
total_poll_command_size + total_copy_command_size + sync_command_size +
|
||||
total_timestamp_command_size;
|
||||
|
||||
char* command_addr = AcquireWriteAddress(total_command_size);
|
||||
@@ -635,7 +653,20 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
|
||||
}
|
||||
|
||||
// After transfer is completed, decrement the signal.
|
||||
BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
|
||||
if (platform_atomic_support_) {
|
||||
BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
|
||||
} else {
|
||||
uint32_t* signal_value_location =
|
||||
reinterpret_cast<uint32_t*>(out_signal.ValueLocation());
|
||||
if (completion_signal_value > UINT32_MAX) {
|
||||
BuildFenceCommand(command_addr, signal_value_location + 1,
|
||||
static_cast<uint32_t>(completion_signal_value >> 32));
|
||||
command_addr += fence_command_size_;
|
||||
}
|
||||
|
||||
BuildFenceCommand(command_addr, signal_value_location,
|
||||
static_cast<uint32_t>(completion_signal_value));
|
||||
}
|
||||
|
||||
ReleaseWriteAddress(command_addr_temp, total_command_size);
|
||||
|
||||
|
||||
@@ -78,12 +78,6 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
|
||||
GpuAgent* gpu = new GpuAgent(node_id, node_prop);
|
||||
core::Runtime::runtime_singleton_->RegisterAgent(gpu);
|
||||
|
||||
if (HSA_STATUS_SUCCESS != gpu->InitBlitKernel()) {
|
||||
assert(false && "Fail init blit");
|
||||
delete gpu;
|
||||
gpu = NULL;
|
||||
}
|
||||
|
||||
return gpu;
|
||||
}
|
||||
|
||||
|
||||
@@ -430,6 +430,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
|
||||
[](void* dst, const void* src, size_t size,
|
||||
std::vector<core::Signal*> dep_signals,
|
||||
core::Signal* completion_signal, bool profiling_enabled) {
|
||||
|
||||
for (core::Signal* dep : dep_signals) {
|
||||
dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
|
||||
HSA_WAIT_STATE_BLOCKED);
|
||||
@@ -820,6 +821,14 @@ void Runtime::Load() {
|
||||
|
||||
// Load tools libraries
|
||||
LoadTools();
|
||||
|
||||
// Initialize the blit kernel objects after the tools are initialized, to
|
||||
// allow tools to overload the blit kernel.
|
||||
for (core::Agent* agent : gpu_agents_) {
|
||||
const hsa_status_t stat =
|
||||
reinterpret_cast<amd::GpuAgentInt*>(agent)->InitBlitKernel();
|
||||
assert(HSA_STATUS_SUCCESS == stat);
|
||||
}
|
||||
}
|
||||
|
||||
void Runtime::Unload() {
|
||||
|
||||
Reference in New Issue
Block a user