Add USER GCR packet

Allows cache flush from SDMA.

Change-Id: Iecae0b49519b2d4da7b1c053b03f721544439e71
This commit is contained in:
Sean Keely
2019-12-20 16:42:52 -06:00
parent 2ffc9ecbb3
commit ddebda6433
4 changed files with 222 additions and 87 deletions
+13 -3
View File
@@ -76,7 +76,7 @@ class BlitSdmaBase : public core::Blit {
// RingIndexTy: 32/64-bit monotonic ring index, counting in bytes.
// HwIndexMonotonic: true if SDMA HW index is monotonic, false if it wraps at end of ring.
// SizeToCountOffset: value added to size (in bytes) to form SDMA command count field.
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
class BlitSdma : public BlitSdmaBase {
public:
BlitSdma();
@@ -209,6 +209,8 @@ class BlitSdma : public BlitSdmaBase {
void BuildTrapCommand(char* cmd_addr);
void BuildGCRCommand(char* cmd_addr, bool invalidate);
hsa_status_t SubmitCommand(const void* cmds, size_t cmd_size,
const std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal);
@@ -250,6 +252,8 @@ class BlitSdma : public BlitSdmaBase {
static const uint32_t trap_command_size_;
static const uint32_t gcr_command_size_;
// Max copy size of a single linear copy command packet.
size_t max_single_linear_copy_size_;
@@ -272,12 +276,18 @@ class BlitSdma : public BlitSdmaBase {
// Ring indices are 32-bit.
// HW ring indices are not monotonic (wrap at end of ring).
// Count fields of SDMA commands are 0-based.
typedef BlitSdma<uint32_t, false, 0> BlitSdmaV2V3;
typedef BlitSdma<uint32_t, false, 0, false> BlitSdmaV2V3;
// Ring indices are 64-bit.
// HW ring indices are monotonic (do not wrap at end of ring).
// Count fields of SDMA commands are 1-based.
typedef BlitSdma<uint64_t, true, -1> BlitSdmaV4;
typedef BlitSdma<uint64_t, true, -1, false> BlitSdmaV4;
// Ring indices are 64-bit.
// HW ring indices are monotonic (do not wrap at end of ring).
// Count fields of SDMA commands are 1-based.
// SDMA is connected to gL2.
typedef BlitSdma<uint64_t, true, -1, true> BlitSdmaV5;
} // namespace amd
@@ -58,9 +58,11 @@ const unsigned int SDMA_OP_POLL_REGMEM = 8;
const unsigned int SDMA_OP_ATOMIC = 10;
const unsigned int SDMA_OP_CONST_FILL = 11;
const unsigned int SDMA_OP_TIMESTAMP = 13;
const unsigned int SDMA_OP_GCR = 17;
const unsigned int SDMA_SUBOP_COPY_LINEAR = 0;
const unsigned int SDMA_SUBOP_COPY_LINEAR_RECT = 4;
const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;
const unsigned int SDMA_SUBOP_USER_GCR = 1;
const unsigned int SDMA_ATOMIC_ADD64 = 47;
typedef struct SDMA_PKT_COPY_LINEAR_TAG {
@@ -503,6 +505,65 @@ typedef struct SDMA_PKT_HDP_FLUSH_TAG {
} SDMA_PKT_HDP_FLUSH;
static const SDMA_PKT_HDP_FLUSH hdp_flush_cmd = {0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0};
// SDMA GCR packet, built with op SDMA_OP_GCR and sub_op SDMA_SUBOP_USER_GCR.
//
// The packet is five 32-bit dwords. Each dword is a union of a bitfield view
// and a raw DW_n_DATA view. Fields are listed in declaration order; unnamed
// bitfields are padding/reserved bits that the packet builder leaves at zero.
//
// NOTE(review): the per-field meanings below are inferred from the field names
// (WB = writeback, INV = invalidate); confirm against the SDMA packet spec.
typedef struct SDMA_PKT_GCR_TAG {
  // DW0: packet header.
  union {
    struct {
      unsigned int op : 8;      // Opcode.
      unsigned int sub_op : 8;  // Sub-opcode.
      unsigned int : 16;
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;

  // DW1: 25-bit BaseVA_LO field, preceded by 7 unnamed bits.
  union {
    struct {
      unsigned int : 7;
      unsigned int BaseVA_LO : 25;
    };
    unsigned int DW_1_DATA;
  } WORD1_UNION;

  // DW2: 16-bit BaseVA_HI field followed by 16 bits of GCR_CONTROL flags.
  union {
    struct {
      unsigned int BaseVA_HI : 16;
      unsigned int GCR_CONTROL_GLI_INV : 2;
      unsigned int GCR_CONTROL_GL1_RANGE : 2;
      unsigned int GCR_CONTROL_GLM_WB : 1;
      unsigned int GCR_CONTROL_GLM_INV : 1;
      unsigned int GCR_CONTROL_GLK_WB : 1;
      unsigned int GCR_CONTROL_GLK_INV : 1;
      unsigned int GCR_CONTROL_GLV_INV : 1;
      unsigned int GCR_CONTROL_GL1_INV : 1;
      unsigned int GCR_CONTROL_GL2_US : 1;
      unsigned int GCR_CONTROL_GL2_RANGE : 2;
      unsigned int GCR_CONTROL_GL2_DISCARD : 1;
      unsigned int GCR_CONTROL_GL2_INV : 1;
      unsigned int GCR_CONTROL_GL2_WB : 1;
    };
    unsigned int DW_2_DATA;
  } WORD2_UNION;

  // DW3: remaining GCR_CONTROL bits, 4 unnamed bits, then 25-bit LimitVA_LO.
  union {
    struct {
      unsigned int GCR_CONTROL_RANGE_IS_PA : 1;
      unsigned int GCR_CONTROL_SEQ : 2;
      unsigned int : 4;
      unsigned int LimitVA_LO : 25;
    };
    unsigned int DW_3_DATA;
  } WORD3_UNION;

  // DW4: 16-bit LimitVA_HI, 8 unnamed bits, 4-bit VMID, 4 unnamed bits.
  union {
    struct {
      unsigned int LimitVA_HI : 16;
      unsigned int : 8;
      unsigned int VMID : 4;
      unsigned int : 4;
    };
    unsigned int DW_4_DATA;
  } WORD4_UNION;
} SDMA_PKT_GCR;

// sizeof(SDMA_PKT_GCR) is used as the ring-space size of the command, so the
// structure must be exactly five dwords with no padding.
static_assert(sizeof(SDMA_PKT_GCR) == 5 * sizeof(unsigned int),
              "SDMA_PKT_GCR must be exactly five dwords");
} // namespace amd
#endif // HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
@@ -75,32 +75,44 @@ const size_t BlitSdmaBase::kMaxSingleCopySize = SDMA_PKT_COPY_LINEAR::kMaxSize_;
const size_t BlitSdmaBase::kMaxSingleFillSize = SDMA_PKT_CONSTANT_FILL::kMaxSize_;
// Initialize size of various sDMA commands used by this module
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::fence_command_size_ = sizeof(SDMA_PKT_FENCE);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::fence_command_size_ = sizeof(SDMA_PKT_FENCE);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::flush_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::flush_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::trap_command_size_ = sizeof(SDMA_PKT_TRAP);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::trap_command_size_ = sizeof(SDMA_PKT_TRAP);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BlitSdma()
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::gcr_command_size_ = sizeof(SDMA_PKT_GCR);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BlitSdma()
: agent_(NULL),
queue_start_addr_(NULL),
parity_(false),
@@ -111,11 +123,11 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BlitSdma()
std::memset(&queue_resource_, 0, sizeof(queue_resource_));
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::~BlitSdma() {}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::~BlitSdma() {}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initialize(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Initialize(
const core::Agent& agent, bool use_xgmi) {
if (queue_start_addr_ != NULL) {
// Already initialized.
@@ -179,8 +191,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
return HSA_STATUS_SUCCESS;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Destroy(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Destroy(
const core::Agent& agent) {
// Release all allocated resources and reset them to zero.
@@ -206,9 +218,9 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Destroy
return HSA_STATUS_SUCCESS;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitBlockingCommand(
const void* cmd, size_t cmd_size) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd_size) {
ScopedAcquire<KernelMutex> lock(&lock_);
// Alternate between completion signals
@@ -234,8 +246,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitB
return ret;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitCommand(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::SubmitCommand(
const void* cmd, size_t cmd_size, const std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
// The signal is 64 bit value, and poll checks for 32 bit value. So we
@@ -285,6 +297,9 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
}
}
// Add space for cache flush.
if (useGCR) flush_cmd_size += gcr_command_size_ * 2;
const uint32_t total_command_size = total_poll_command_size + cmd_size + sync_command_size +
total_timestamp_command_size + interrupt_command_size + flush_cmd_size;
@@ -319,10 +334,22 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
}
}
// Issue cache invalidate
if (useGCR) {
BuildGCRCommand(command_addr, true);
command_addr += gcr_command_size_;
}
// Do the command after all polls are satisfied.
memcpy(command_addr, cmd, cmd_size);
command_addr += cmd_size;
// Issue cache writeback
if (useGCR) {
BuildGCRCommand(command_addr, false);
command_addr += gcr_command_size_;
}
if (profiling_enabled) {
assert(IsMultipleOf(end_ts_addr, 32));
BuildGetGlobalTimestampCommand(command_addr,
@@ -364,9 +391,9 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
return HSA_STATUS_SUCCESS;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
void* dst, const void* src, size_t size) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::SubmitLinearCopyCommand(void* dst, const void* src, size_t size) {
// Break the copy into multiple copy operations in case the copy size exceeds
// the SDMA linear copy limit.
const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
@@ -377,10 +404,11 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitL
return SubmitBlockingCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_COPY_LINEAR));
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
void* dst, const void* src, size_t size, std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::SubmitLinearCopyCommand(void* dst, const void* src, size_t size,
std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
// Break the copy into multiple copy operations when the copy size exceeds
// the SDMA linear copy limit.
const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
@@ -393,8 +421,9 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitL
out_signal);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitCopyRectCommand(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::SubmitCopyRectCommand(
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
const hsa_dim3_t* src_offset, const hsa_dim3_t* range, std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
@@ -456,9 +485,9 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
out_signal);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearFillCommand(
void* ptr, uint32_t value, size_t count) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::SubmitLinearFillCommand(void* ptr, uint32_t value, size_t count) {
const size_t size = count * sizeof(uint32_t);
const uint32_t num_fill_command = (size + kMaxSingleFillSize - 1) / kMaxSingleFillSize;
@@ -469,14 +498,14 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitL
return SubmitBlockingCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_CONSTANT_FILL));
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::EnableProfiling(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::EnableProfiling(
bool enable) {
return HSA_STATUS_SUCCESS;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
char* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::AcquireWriteAddress(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
char* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::AcquireWriteAddress(
uint32_t cmd_size, RingIndexTy& curr_index) {
// Ring is full when all but one byte is written.
if (cmd_size >= kQueueSize) {
@@ -516,9 +545,10 @@ char* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::AcquireWriteAd
return NULL;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::UpdateWriteAndDoorbellRegister(
RingIndexTy curr_index, RingIndexTy new_index) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::UpdateWriteAndDoorbellRegister(RingIndexTy curr_index,
RingIndexTy new_index) {
while (true) {
// Make sure that the address before ::curr_index is already released.
// Otherwise the CP may read invalid packets.
@@ -552,8 +582,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::UpdateWriteAndD
}
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::ReleaseWriteAddress(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::ReleaseWriteAddress(
RingIndexTy curr_index, uint32_t cmd_size) {
if (cmd_size > kQueueSize) {
assert(false && "cmd_addr is outside the queue buffer range");
@@ -563,8 +593,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::ReleaseWriteAdd
UpdateWriteAndDoorbellRegister(curr_index, curr_index + cmd_size);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::PadRingToEnd(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::PadRingToEnd(
RingIndexTy curr_index) {
// Reserve region from here to the end of the ring.
RingIndexTy new_index = curr_index + (kQueueSize - WrapIntoRing(curr_index));
@@ -585,14 +615,14 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::PadRingToEnd(
}
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::WrapIntoRing(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::WrapIntoRing(
RingIndexTy index) {
return index & (kQueueSize - 1);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
bool BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::CanWriteUpto(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
bool BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::CanWriteUpto(
RingIndexTy upto_index) {
// Get/calculate the monotonic read index.
RingIndexTy hw_read_index = *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_read_ptr);
@@ -613,8 +643,8 @@ bool BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::CanWriteUpto(
return (upto_index - read_index) < kQueueSize;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildFenceCommand(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildFenceCommand(
char* fence_command_addr, uint32_t* fence, uint32_t fence_value) {
assert(fence_command_addr != NULL);
SDMA_PKT_FENCE* packet_addr =
@@ -635,8 +665,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildFenceComma
packet_addr->DATA_UNION.data = fence_value;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyCommand(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildCopyCommand(
char* cmd_addr, uint32_t num_copy_command, void* dst, const void* src, size_t size) {
size_t cur_size = 0;
for (uint32_t i = 0; i < num_copy_command; ++i) {
@@ -675,8 +705,8 @@ Elements are coded by the log2 of the element size in bytes (ie. element 0=1 byt
This routine breaks a large rect into tiles that can be handled by hardware. Pitches and offsets
must be representable in terms of elements in all tiles of the copy.
*/
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyRectCommand(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildCopyRectCommand(
const std::function<void*(size_t)>& append, const hsa_pitched_ptr_t* dst,
const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
const hsa_dim3_t* range) {
@@ -794,8 +824,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyRectCo
}
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildFillCommand(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildFillCommand(
char* cmd_addr, uint32_t num_fill_command, void* ptr, uint32_t value, size_t count) {
char* cur_ptr = reinterpret_cast<char*>(ptr);
const uint32_t maxDwordCount = kMaxSingleFillSize / sizeof(uint32_t);
@@ -824,8 +854,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildFillComman
assert(count == 0 && "SDMA fill command count error.");
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildPollCommand(
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildPollCommand(
char* cmd_addr, void* addr, uint32_t reference) {
SDMA_PKT_POLL_REGMEM* packet_addr =
reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(cmd_addr);
@@ -846,9 +876,9 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildPollComman
packet_addr->DW5_UNION.retry_count = 0xfff; // Retry forever.
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildAtomicDecrementCommand(
char* cmd_addr, void* addr) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) {
SDMA_PKT_ATOMIC* packet_addr = reinterpret_cast<SDMA_PKT_ATOMIC*>(cmd_addr);
memset(packet_addr, 0, sizeof(SDMA_PKT_ATOMIC));
@@ -863,9 +893,9 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildAtomicDecr
packet_addr->SRC_DATA_HI_UNION.src_data_63_32 = 0xffffffff;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildGetGlobalTimestampCommand(
char* cmd_addr, void* write_address) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::BuildGetGlobalTimestampCommand(char* cmd_addr, void* write_address) {
SDMA_PKT_TIMESTAMP* packet_addr =
reinterpret_cast<SDMA_PKT_TIMESTAMP*>(cmd_addr);
@@ -878,8 +908,9 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildGetGlobalT
packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(write_address);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildTrapCommand(char* cmd_addr) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildTrapCommand(
char* cmd_addr) {
SDMA_PKT_TRAP* packet_addr =
reinterpret_cast<SDMA_PKT_TRAP*>(cmd_addr);
@@ -888,15 +919,37 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildTrapComman
packet_addr->HEADER_UNION.op = SDMA_OP_TRAP;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildHdpFlushCommand(
// Writes the HDP flush command at cmd_addr by copying flush_command_size_
// bytes from the prebuilt hdp_flush_cmd packet.
//
// cmd_addr: destination in the command buffer; must not be NULL.
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildHdpFlushCommand(
    char* cmd_addr) {
  assert(cmd_addr != NULL);
  // The packet is a fixed template, so it is copied verbatim.
  memcpy(cmd_addr, &hdp_flush_cmd, flush_command_size_);
}
template class BlitSdma<uint32_t, false, 0>;
template class BlitSdma<uint64_t, true, -1>;
// Builds a USER GCR packet in place at cmd_addr.
//
// cmd_addr:   destination in the command buffer; must not be NULL.
// invalidate: when true, the GL2/GL1/GLV/GLK invalidate bits are set in
//             addition to the GL2/GLK writeback bits, which are always set.
//
// Only valid for instantiations with useGCR == true (asserted).
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildGCRCommand(
    char* cmd_addr, bool invalidate) {
  assert(cmd_addr != NULL);
  assert(useGCR && "Unsupported SDMA command - GCR.");

  // Start from an all-zero packet so every field not set below stays 0.
  SDMA_PKT_GCR* packet = reinterpret_cast<SDMA_PKT_GCR*>(cmd_addr);
  memset(packet, 0, sizeof(SDMA_PKT_GCR));

  packet->HEADER_UNION.op = SDMA_OP_GCR;
  packet->HEADER_UNION.sub_op = SDMA_SUBOP_USER_GCR;

  // Writeback is requested unconditionally.
  packet->WORD2_UNION.GCR_CONTROL_GL2_WB = 1;
  packet->WORD2_UNION.GCR_CONTROL_GLK_WB = 1;

  // Original note: "Discarding all lines for now."
  // NOTE(review): presumably range 0 selects the whole cache rather than the
  // BaseVA/LimitVA window - confirm against the SDMA packet spec.
  packet->WORD2_UNION.GCR_CONTROL_GL2_RANGE = 0;

  if (!invalidate) return;

  packet->WORD2_UNION.GCR_CONTROL_GL2_INV = 1;
  packet->WORD2_UNION.GCR_CONTROL_GL1_INV = 1;
  packet->WORD2_UNION.GCR_CONTROL_GLV_INV = 1;
  packet->WORD2_UNION.GCR_CONTROL_GLK_INV = 1;
}
template class BlitSdma<uint32_t, false, 0, false>;
template class BlitSdma<uint64_t, true, -1, false>;
template class BlitSdma<uint64_t, true, -1, true>;
} // namespace amd
@@ -488,16 +488,26 @@ core::Queue* GpuAgent::CreateInterceptibleQueue() {
core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
amd::BlitSdmaBase* sdma;
if (isa_->GetMajorVersion() <= 8) {
sdma = new BlitSdmaV2V3();
} else {
sdma = new BlitSdmaV4();
switch (isa_->GetMajorVersion()) {
case 7:
case 8:
sdma = new BlitSdmaV2V3();
break;
case 9:
sdma = new BlitSdmaV4();
break;
case 10:
sdma = new BlitSdmaV5();
break;
default:
assert(false && "Unexpected device major version.");
return nullptr;
}
if (sdma->Initialize(*this, use_xgmi) != HSA_STATUS_SUCCESS) {
sdma->Destroy(*this);
delete sdma;
sdma = NULL;
sdma = nullptr;
}
return sdma;
@@ -533,7 +543,8 @@ void GpuAgent::InitDma() {
auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue) {
const std::string& sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();
bool use_sdma = ((isa_->GetMajorVersion() != 8) && (isa_->GetMajorVersion() != 10));
// User SDMA queues are unstable on gfx8.
bool use_sdma = ((isa_->GetMajorVersion() != 8));
if (sdma_override.size() != 0) use_sdma = (sdma_override == "1");
if (use_sdma && (HSA_PROFILE_BASE == profile_)) {