diff --git a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h index 756ff98270..4c677360c0 100644 --- a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h +++ b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -76,7 +76,7 @@ class BlitSdmaBase : public core::Blit { // RingIndexTy: 32/64-bit monotonic ring index, counting in bytes. // HwIndexMonotonic: true if SDMA HW index is monotonic, false if it wraps at end of ring. // SizeToCountOffset: value added to size (in bytes) to form SDMA command count field. -template +template class BlitSdma : public BlitSdmaBase { public: BlitSdma(); @@ -209,6 +209,8 @@ class BlitSdma : public BlitSdmaBase { void BuildTrapCommand(char* cmd_addr); + void BuildGCRCommand(char* cmd_addr, bool invalidate); + hsa_status_t SubmitCommand(const void* cmds, size_t cmd_size, const std::vector& dep_signals, core::Signal& out_signal); @@ -250,6 +252,8 @@ class BlitSdma : public BlitSdmaBase { static const uint32_t trap_command_size_; + static const uint32_t gcr_command_size_; + // Max copy size of a single linear copy command packet. size_t max_single_linear_copy_size_; @@ -272,12 +276,18 @@ class BlitSdma : public BlitSdmaBase { // Ring indices are 32-bit. // HW ring indices are not monotonic (wrap at end of ring). // Count fields of SDMA commands are 0-based. -typedef BlitSdma BlitSdmaV2V3; +typedef BlitSdma BlitSdmaV2V3; // Ring indices are 64-bit. // HW ring indices are monotonic (do not wrap at end of ring). // Count fields of SDMA commands are 1-based. -typedef BlitSdma BlitSdmaV4; +typedef BlitSdma BlitSdmaV4; + +// Ring indices are 64-bit. +// HW ring indices are monotonic (do not wrap at end of ring). +// Count fields of SDMA commands are 1-based. +// SDMA is connected to gL2. +typedef BlitSdma BlitSdmaV5; } // namespace amd diff --git a/runtime/hsa-runtime/core/inc/sdma_registers.h b/runtime/hsa-runtime/core/inc/sdma_registers.h index cf91cf0022..1b895597fa 100644 --- a/runtime/hsa-runtime/core/inc/sdma_registers.h +++ b/runtime/hsa-runtime/core/inc/sdma_registers.h @@ -58,9 +58,11 @@ const unsigned int SDMA_OP_POLL_REGMEM = 8; const unsigned int SDMA_OP_ATOMIC = 10; const unsigned int SDMA_OP_CONST_FILL = 11; const unsigned int SDMA_OP_TIMESTAMP = 13; +const unsigned int SDMA_OP_GCR = 17; const unsigned int SDMA_SUBOP_COPY_LINEAR = 0; const unsigned int SDMA_SUBOP_COPY_LINEAR_RECT = 4; const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2; +const unsigned int SDMA_SUBOP_USER_GCR = 1; const unsigned int SDMA_ATOMIC_ADD64 = 47; typedef struct SDMA_PKT_COPY_LINEAR_TAG { @@ -503,6 +505,65 @@ typedef struct SDMA_PKT_HDP_FLUSH_TAG { } SDMA_PKT_HDP_FLUSH; static const SDMA_PKT_HDP_FLUSH hdp_flush_cmd = {0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0}; +typedef struct SDMA_PKT_GCR_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int : 16; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int : 7; + unsigned int BaseVA_LO : 25; + }; + unsigned int DW_1_DATA; + } WORD1_UNION; + + union { + struct { + unsigned int BaseVA_HI : 16; + unsigned int GCR_CONTROL_GLI_INV : 2; + unsigned int GCR_CONTROL_GL1_RANGE : 2; + unsigned int GCR_CONTROL_GLM_WB : 1; + unsigned int GCR_CONTROL_GLM_INV : 1; + unsigned int GCR_CONTROL_GLK_WB : 1; + unsigned int GCR_CONTROL_GLK_INV : 1; + unsigned int GCR_CONTROL_GLV_INV : 1; + unsigned int GCR_CONTROL_GL1_INV : 1; + unsigned int GCR_CONTROL_GL2_US : 1; + unsigned int GCR_CONTROL_GL2_RANGE : 2; + unsigned int GCR_CONTROL_GL2_DISCARD : 1; + unsigned int GCR_CONTROL_GL2_INV : 1; + unsigned int GCR_CONTROL_GL2_WB : 1; + }; + unsigned int DW_2_DATA; + } WORD2_UNION; + + union { + struct { + unsigned int GCR_CONTROL_RANGE_IS_PA : 1; + unsigned int GCR_CONTROL_SEQ : 2; + unsigned int : 4; + unsigned int LimitVA_LO : 25; + }; + unsigned int DW_3_DATA; + } WORD3_UNION; + + union { + struct { + unsigned int LimitVA_HI : 16; + unsigned int : 8; + unsigned int VMID : 4; + unsigned int : 4; + }; + unsigned int DW_4_DATA; + } WORD4_UNION; +} SDMA_PKT_GCR; + } // namespace amd #endif // HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_ diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index 188345e2d3..9d58bcf7b4 100644 --- a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -75,32 +75,44 @@ const size_t BlitSdmaBase::kMaxSingleCopySize = SDMA_PKT_COPY_LINEAR::kMaxSize_; const size_t BlitSdmaBase::kMaxSingleFillSize = SDMA_PKT_CONSTANT_FILL::kMaxSize_; // Initialize size of various sDMA commands use by this module -template -const uint32_t BlitSdma::linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR); +template +const uint32_t BlitSdma::linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR); -template -const uint32_t BlitSdma::fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL); +template +const uint32_t BlitSdma::fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL); -template -const uint32_t BlitSdma::fence_command_size_ = sizeof(SDMA_PKT_FENCE); +template +const uint32_t BlitSdma::fence_command_size_ = sizeof(SDMA_PKT_FENCE); -template -const uint32_t BlitSdma::poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); +template +const uint32_t BlitSdma::poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); -template -const uint32_t BlitSdma::flush_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); +template +const uint32_t BlitSdma::flush_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); -template -const uint32_t BlitSdma::atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC); +template +const uint32_t BlitSdma::atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC); -template -const uint32_t BlitSdma::timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP); +template +const uint32_t BlitSdma::timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP); -template -const uint32_t BlitSdma::trap_command_size_ = sizeof(SDMA_PKT_TRAP); +template +const uint32_t BlitSdma::trap_command_size_ = sizeof(SDMA_PKT_TRAP); -template -BlitSdma::BlitSdma() +template +const uint32_t BlitSdma::gcr_command_size_ = sizeof(SDMA_PKT_GCR); + +template +BlitSdma::BlitSdma() : agent_(NULL), queue_start_addr_(NULL), parity_(false), @@ -111,11 +123,11 @@ BlitSdma::BlitSdma() std::memset(&queue_resource_, 0, sizeof(queue_resource_)); } -template -BlitSdma::~BlitSdma() {} +template +BlitSdma::~BlitSdma() {} -template -hsa_status_t BlitSdma::Initialize( +template +hsa_status_t BlitSdma::Initialize( const core::Agent& agent, bool use_xgmi) { if (queue_start_addr_ != NULL) { // Already initialized. @@ -179,8 +191,8 @@ hsa_status_t BlitSdma::Initial return HSA_STATUS_SUCCESS; } -template -hsa_status_t BlitSdma::Destroy( +template +hsa_status_t BlitSdma::Destroy( const core::Agent& agent) { // Release all allocated resources and reset them to zero. @@ -206,9 +218,9 @@ hsa_status_t BlitSdma::Destroy return HSA_STATUS_SUCCESS; } -template -hsa_status_t BlitSdma::SubmitBlockingCommand( - const void* cmd, size_t cmd_size) { +template +hsa_status_t BlitSdma::SubmitBlockingCommand(const void* cmd, size_t cmd_size) { ScopedAcquire lock(&lock_); // Alternate between completion signals @@ -234,8 +246,8 @@ hsa_status_t BlitSdma::SubmitB return ret; } -template -hsa_status_t BlitSdma::SubmitCommand( +template +hsa_status_t BlitSdma::SubmitCommand( const void* cmd, size_t cmd_size, const std::vector& dep_signals, core::Signal& out_signal) { // The signal is 64 bit value, and poll checks for 32 bit value. So we @@ -285,6 +297,9 @@ hsa_status_t BlitSdma::SubmitC } } + // Add space for cache flush. + if (useGCR) flush_cmd_size += gcr_command_size_ * 2; + const uint32_t total_command_size = total_poll_command_size + cmd_size + sync_command_size + total_timestamp_command_size + interrupt_command_size + flush_cmd_size; @@ -319,10 +334,22 @@ hsa_status_t BlitSdma::SubmitC } } + // Issue cache invalidate + if (useGCR) { + BuildGCRCommand(command_addr, true); + command_addr += gcr_command_size_; + } + // Do the command after all polls are satisfied. memcpy(command_addr, cmd, cmd_size); command_addr += cmd_size; + // Issue cache writeback + if (useGCR) { + BuildGCRCommand(command_addr, false); + command_addr += gcr_command_size_; + } + if (profiling_enabled) { assert(IsMultipleOf(end_ts_addr, 32)); BuildGetGlobalTimestampCommand(command_addr, @@ -364,9 +391,9 @@ hsa_status_t BlitSdma::SubmitC return HSA_STATUS_SUCCESS; } -template -hsa_status_t BlitSdma::SubmitLinearCopyCommand( - void* dst, const void* src, size_t size) { +template +hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src, size_t size) { // Break the copy into multiple copy operation incase the copy size exceeds // the SDMA linear copy limit. const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize; @@ -377,10 +404,11 @@ hsa_status_t BlitSdma::SubmitL return SubmitBlockingCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_COPY_LINEAR)); } -template -hsa_status_t BlitSdma::SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, std::vector& dep_signals, - core::Signal& out_signal) { +template +hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src, size_t size, + std::vector& dep_signals, + core::Signal& out_signal) { // Break the copy into multiple copy operations when the copy size exceeds // the SDMA linear copy limit. const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize; @@ -393,8 +421,9 @@ hsa_status_t BlitSdma::SubmitL out_signal); } -template -hsa_status_t BlitSdma::SubmitCopyRectCommand( +template +hsa_status_t +BlitSdma::SubmitCopyRectCommand( const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset, const hsa_dim3_t* range, std::vector& dep_signals, core::Signal& out_signal) { @@ -456,9 +485,9 @@ hsa_status_t BlitSdma::SubmitC out_signal); } -template -hsa_status_t BlitSdma::SubmitLinearFillCommand( - void* ptr, uint32_t value, size_t count) { +template +hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value, size_t count) { const size_t size = count * sizeof(uint32_t); const uint32_t num_fill_command = (size + kMaxSingleFillSize - 1) / kMaxSingleFillSize; @@ -469,14 +498,14 @@ hsa_status_t BlitSdma::SubmitL return SubmitBlockingCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_CONSTANT_FILL)); } -template -hsa_status_t BlitSdma::EnableProfiling( +template +hsa_status_t BlitSdma::EnableProfiling( bool enable) { return HSA_STATUS_SUCCESS; } -template -char* BlitSdma::AcquireWriteAddress( +template +char* BlitSdma::AcquireWriteAddress( uint32_t cmd_size, RingIndexTy& curr_index) { // Ring is full when all but one byte is written. if (cmd_size >= kQueueSize) { @@ -516,9 +545,10 @@ char* BlitSdma::AcquireWriteAd return NULL; } -template -void BlitSdma::UpdateWriteAndDoorbellRegister( - RingIndexTy curr_index, RingIndexTy new_index) { +template +void BlitSdma::UpdateWriteAndDoorbellRegister(RingIndexTy curr_index, + RingIndexTy new_index) { while (true) { // Make sure that the address before ::curr_index is already released. // Otherwise the CP may read invalid packets. @@ -552,8 +582,8 @@ void BlitSdma::UpdateWriteAndD } } -template -void BlitSdma::ReleaseWriteAddress( +template +void BlitSdma::ReleaseWriteAddress( RingIndexTy curr_index, uint32_t cmd_size) { if (cmd_size > kQueueSize) { assert(false && "cmd_addr is outside the queue buffer range"); @@ -563,8 +593,8 @@ void BlitSdma::ReleaseWriteAdd UpdateWriteAndDoorbellRegister(curr_index, curr_index + cmd_size); } -template -void BlitSdma::PadRingToEnd( +template +void BlitSdma::PadRingToEnd( RingIndexTy curr_index) { // Reserve region from here to the end of the ring. RingIndexTy new_index = curr_index + (kQueueSize - WrapIntoRing(curr_index)); @@ -585,14 +615,14 @@ void BlitSdma::PadRingToEnd( } } -template -uint32_t BlitSdma::WrapIntoRing( +template +uint32_t BlitSdma::WrapIntoRing( RingIndexTy index) { return index & (kQueueSize - 1); } -template -bool BlitSdma::CanWriteUpto( +template +bool BlitSdma::CanWriteUpto( RingIndexTy upto_index) { // Get/calculate the monotonic read index. RingIndexTy hw_read_index = *reinterpret_cast(queue_resource_.Queue_read_ptr); @@ -613,8 +643,8 @@ bool BlitSdma::CanWriteUpto( return (upto_index - read_index) < kQueueSize; } -template -void BlitSdma::BuildFenceCommand( +template +void BlitSdma::BuildFenceCommand( char* fence_command_addr, uint32_t* fence, uint32_t fence_value) { assert(fence_command_addr != NULL); SDMA_PKT_FENCE* packet_addr = @@ -635,8 +665,8 @@ void BlitSdma::BuildFenceComma packet_addr->DATA_UNION.data = fence_value; } -template -void BlitSdma::BuildCopyCommand( +template +void BlitSdma::BuildCopyCommand( char* cmd_addr, uint32_t num_copy_command, void* dst, const void* src, size_t size) { size_t cur_size = 0; for (uint32_t i = 0; i < num_copy_command; ++i) { @@ -675,8 +705,8 @@ Elements are coded by the log2 of the element size in bytes (ie. element 0=1 byt This routine breaks a large rect into tiles that can be handled by hardware. Pitches and offsets must be representable in terms of elements in all tiles of the copy. */ -template -void BlitSdma::BuildCopyRectCommand( +template +void BlitSdma::BuildCopyRectCommand( const std::function& append, const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset, const hsa_dim3_t* range) { @@ -794,8 +824,8 @@ void BlitSdma::BuildCopyRectCo } } -template -void BlitSdma::BuildFillCommand( +template +void BlitSdma::BuildFillCommand( char* cmd_addr, uint32_t num_fill_command, void* ptr, uint32_t value, size_t count) { char* cur_ptr = reinterpret_cast(ptr); const uint32_t maxDwordCount = kMaxSingleFillSize / sizeof(uint32_t); @@ -824,8 +854,8 @@ void BlitSdma::BuildFillComman assert(count == 0 && "SDMA fill command count error."); } -template -void BlitSdma::BuildPollCommand( +template +void BlitSdma::BuildPollCommand( char* cmd_addr, void* addr, uint32_t reference) { SDMA_PKT_POLL_REGMEM* packet_addr = reinterpret_cast(cmd_addr); @@ -846,9 +876,9 @@ void BlitSdma::BuildPollComman packet_addr->DW5_UNION.retry_count = 0xfff; // Retry forever. } -template -void BlitSdma::BuildAtomicDecrementCommand( - char* cmd_addr, void* addr) { +template +void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) { SDMA_PKT_ATOMIC* packet_addr = reinterpret_cast(cmd_addr); memset(packet_addr, 0, sizeof(SDMA_PKT_ATOMIC)); @@ -863,9 +893,9 @@ void BlitSdma::BuildAtomicDecr packet_addr->SRC_DATA_HI_UNION.src_data_63_32 = 0xffffffff; } -template -void BlitSdma::BuildGetGlobalTimestampCommand( - char* cmd_addr, void* write_address) { +template +void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr, void* write_address) { SDMA_PKT_TIMESTAMP* packet_addr = reinterpret_cast(cmd_addr); @@ -878,8 +908,9 @@ void BlitSdma::BuildGetGlobalT packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(write_address); } -template -void BlitSdma::BuildTrapCommand(char* cmd_addr) { +template +void BlitSdma::BuildTrapCommand( + char* cmd_addr) { SDMA_PKT_TRAP* packet_addr = reinterpret_cast(cmd_addr); @@ -888,15 +919,37 @@ void BlitSdma::BuildTrapComman packet_addr->HEADER_UNION.op = SDMA_OP_TRAP; } -template -void BlitSdma::BuildHdpFlushCommand( +template +void BlitSdma::BuildHdpFlushCommand( char* cmd_addr) { assert(cmd_addr != NULL); SDMA_PKT_POLL_REGMEM* addr = reinterpret_cast(cmd_addr); memcpy(addr, &hdp_flush_cmd, flush_command_size_); } -template class BlitSdma; -template class BlitSdma; +template +void BlitSdma::BuildGCRCommand( + char* cmd_addr, bool invalidate) { + assert(cmd_addr != NULL); + assert(useGCR && "Unsupported SDMA command - GCR."); + SDMA_PKT_GCR* addr = reinterpret_cast(cmd_addr); + memset(addr, 0, sizeof(SDMA_PKT_GCR)); + addr->HEADER_UNION.op = SDMA_OP_GCR; + addr->HEADER_UNION.sub_op = SDMA_SUBOP_USER_GCR; + addr->WORD2_UNION.GCR_CONTROL_GL2_WB = 1; + addr->WORD2_UNION.GCR_CONTROL_GLK_WB = 1; + if (invalidate) { + addr->WORD2_UNION.GCR_CONTROL_GL2_INV = 1; + addr->WORD2_UNION.GCR_CONTROL_GL1_INV = 1; + addr->WORD2_UNION.GCR_CONTROL_GLV_INV = 1; + addr->WORD2_UNION.GCR_CONTROL_GLK_INV = 1; + } + // Discarding all lines for now. + addr->WORD2_UNION.GCR_CONTROL_GL2_RANGE = 0; +} + +template class BlitSdma; +template class BlitSdma; +template class BlitSdma; } // namespace amd diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 33fbf67371..d73ad56b38 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -488,16 +488,26 @@ core::Queue* GpuAgent::CreateInterceptibleQueue() { core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) { amd::BlitSdmaBase* sdma; - if (isa_->GetMajorVersion() <= 8) { - sdma = new BlitSdmaV2V3(); - } else { - sdma = new BlitSdmaV4(); + switch (isa_->GetMajorVersion()) { + case 7: + case 8: + sdma = new BlitSdmaV2V3(); + break; + case 9: + sdma = new BlitSdmaV4(); + break; + case 10: + sdma = new BlitSdmaV5(); + break; + default: + assert(false && "Unexpected device major version."); + return nullptr; } if (sdma->Initialize(*this, use_xgmi) != HSA_STATUS_SUCCESS) { sdma->Destroy(*this); delete sdma; - sdma = NULL; + sdma = nullptr; } return sdma; @@ -533,7 +543,8 @@ void GpuAgent::InitDma() { auto blit_lambda = [this](bool use_xgmi, lazy_ptr& queue) { const std::string& sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma(); - bool use_sdma = ((isa_->GetMajorVersion() != 8) && (isa_->GetMajorVersion() != 10)); + // User SDMA queues are unstable on gfx8. + bool use_sdma = ((isa_->GetMajorVersion() != 8)); if (sdma_override.size() != 0) use_sdma = (sdma_override == "1"); if (use_sdma && (HSA_PROFILE_BASE == profile_)) {