Initial support for xgmi sdma queues

Change-Id: I1aee379c7b9eede5f4b913cf2f9af3abb32e5baa


[ROCm/ROCR-Runtime commit: 8864c188b4]
Этот коммит содержится в:
Ramesh Errabolu
2019-07-24 19:28:24 -05:00
родитель 5e5b7fac71
Коммит 08e994db50
9 изменённых файлов: 170 добавлений и 69 удалений
+1 -1
Просмотреть файл
@@ -60,7 +60,7 @@ class BlitKernel : public core::Blit {
/// @param agent Pointer to the agent that will execute the AQL packets.
///
/// @return hsa_status_t
virtual hsa_status_t Initialize(const core::Agent& agent) override;
hsa_status_t Initialize(const core::Agent& agent);
/// @brief Marks the blit kernel object as invalid and uncouples its link with
/// the underlying AQL kernel queue. Use of the blit object
+3 -6
Просмотреть файл
@@ -64,6 +64,7 @@ class BlitSdmaBase : public core::Blit {
static const size_t kMaxSingleCopySize;
static const size_t kMaxSingleFillSize;
virtual bool isSDMA() const override { return true; }
virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi) = 0;
virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
const hsa_dim3_t* dst_offset,
const hsa_pitched_ptr_t* src,
@@ -78,7 +79,7 @@ class BlitSdmaBase : public core::Blit {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
class BlitSdma : public BlitSdmaBase {
public:
explicit BlitSdma(bool copy_direction);
BlitSdma();
virtual ~BlitSdma() override;
@@ -88,7 +89,7 @@ class BlitSdma : public BlitSdmaBase {
/// @param agent Pointer to the agent that will execute the PM4 commands.
///
/// @return hsa_status_t
virtual hsa_status_t Initialize(const core::Agent& agent) override;
virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi) override;
/// @brief Marks the queue object as invalid and uncouples its link with
/// the underlying compute device's control block. Use of queue object
@@ -249,10 +250,6 @@ class BlitSdma : public BlitSdmaBase {
static const uint32_t trap_command_size_;
// Flag to indicate if sDMA queue is used for H2D copy operations
// true if used for H2D operations, false otherwise
const bool sdma_h2d_;
// Max copy size of a single linear copy command packet.
size_t max_single_linear_copy_size_;
+17 -3
Просмотреть файл
@@ -351,7 +351,7 @@ class GpuAgent : public GpuAgentInt {
// @brief Create SDMA blit object.
//
// @retval NULL if SDMA blit creation and initialization failed.
core::Blit* CreateBlitSdma(bool h2d);
core::Blit* CreateBlitSdma(bool use_xgmi);
// @brief Create Kernel blit object using provided compute queue.
//
@@ -405,9 +405,13 @@ class GpuAgent : public GpuAgentInt {
size_t scratch_per_thread_;
// @brief Blit interfaces for each data path.
enum BlitEnum { BlitHostToDev, BlitDevToHost, BlitDevToDev, BlitCount };
enum BlitEnum { BlitDevToDev, BlitHostToDev, BlitDevToHost, DefaultBlitCount };
lazy_ptr<core::Blit> blits_[BlitCount];
// Blit objects managed by an instance of GpuAgent
std::vector<lazy_ptr<core::Blit>> blits_;
// List of agents connected via xGMI
std::vector<const core::Agent*> xgmi_peer_list_;
// @brief AQL queues for cache management and blit compute usage.
enum QueueEnum {
@@ -490,6 +494,16 @@ class GpuAgent : public GpuAgentInt {
// @retval True if the memory pool for end timestamp object is initialized.
bool InitEndTsPool();
// Bind the Blit object used for copies to a peer device connected via xGMI links
lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
// Bind the Blit object that will drive the copy operation
// across PCIe links (H2D or D2H) or is within same device D2D
lazy_ptr<core::Blit>& GetPcieBlit(const core::Agent& dst_agent, const core::Agent& src_agent);
// Bind the Blit object that will drive the copy operation
lazy_ptr<core::Blit>& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent);
// @brief Alternative aperture base address. Only on KV.
uintptr_t ape1_base_;
-7
Просмотреть файл
@@ -53,13 +53,6 @@ class Blit {
explicit Blit() {}
virtual ~Blit() {}
/// @brief Initialize a blit object.
///
/// @param agent Pointer to the agent that will execute the blit commands.
///
/// @return hsa_status_t
virtual hsa_status_t Initialize(const core::Agent& agent) = 0;
/// @brief Marks the blit object as invalid and uncouples its link with
/// the underlying compute device's control block. Use of blit object
/// once it has been release is illegal and any behavior is indeterminate
+8 -15
Просмотреть файл
@@ -100,13 +100,12 @@ template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::trap_command_size_ = sizeof(SDMA_PKT_TRAP);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BlitSdma(bool copy_direction)
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BlitSdma()
: agent_(NULL),
queue_start_addr_(NULL),
parity_(false),
cached_reserve_index_(0),
cached_commit_index_(0),
sdma_h2d_(copy_direction),
platform_atomic_support_(true),
hdp_flush_support_(false) {
std::memset(&queue_resource_, 0, sizeof(queue_resource_));
@@ -117,7 +116,7 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::~BlitSdma() {}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initialize(
const core::Agent& agent) {
const core::Agent& agent, bool use_xgmi) {
if (queue_start_addr_ != NULL) {
// Already initialized.
return HSA_STATUS_SUCCESS;
@@ -159,8 +158,10 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
// Access kernel driver to initialize the queue control block
// This call binds user mode queue object to underlying compute
// device.
const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA;
// device. ROCr creates queues that are of two kinds: PCIe optimized
// and xGMI optimized. Which queue to create is indicated via input
// boolean flag
const HSA_QUEUE_TYPE kQueueType_ = use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA;
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100,
HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
kQueueSize, NULL, &queue_resource_)) {
@@ -319,9 +320,9 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
command_addr += timestamp_command_size_;
}
// Determine if a Hdp flush cmd is required at the top of cmd stream
// Issue a Hdp flush cmd
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false)) {
if ((HwIndexMonotonic) && (hdp_flush_support_)) {
BuildHdpFlushCommand(command_addr);
command_addr += flush_command_size_;
}
@@ -331,14 +332,6 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
memcpy(command_addr, cmd, cmd_size);
command_addr += cmd_size;
// Determine if a Hdp flush cmd is required at the end of cmd stream
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_)) {
BuildHdpFlushCommand(command_addr);
command_addr += flush_command_size_;
}
}
if (profiling_enabled) {
assert(IsMultipleOf(end_ts_addr, 32));
BuildGetGlobalTimestampCommand(command_addr,
+124 -32
Просмотреть файл
@@ -77,7 +77,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
: GpuAgentInt(node),
properties_(node_props),
current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
blits_(),
queues_(),
local_region_(NULL),
is_kv_device_(false),
@@ -138,9 +137,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
}
GpuAgent::~GpuAgent() {
for (int i = 0; i < BlitCount; ++i) {
if (blits_[i] != nullptr) {
hsa_status_t status = blits_[i]->Destroy(*this);
for (auto& blit : blits_) {
if (blit.created()) {
hsa_status_t status = blit->Destroy(*this);
assert(status == HSA_STATUS_SUCCESS);
}
}
@@ -537,16 +536,16 @@ core::Queue* GpuAgent::CreateInterceptibleQueue() {
return queue;
}
core::Blit* GpuAgent::CreateBlitSdma(bool h2d) {
core::Blit* sdma;
core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
amd::BlitSdmaBase* sdma;
if (isa_->GetMajorVersion() <= 8) {
sdma = new BlitSdmaV2V3(h2d);
sdma = new BlitSdmaV2V3();
} else {
sdma = new BlitSdmaV4(h2d);
sdma = new BlitSdmaV4();
}
if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) {
if (sdma->Initialize(*this, use_xgmi) != HSA_STATUS_SUCCESS) {
sdma->Destroy(*this);
delete sdma;
sdma = NULL;
@@ -582,14 +581,14 @@ void GpuAgent::InitDma() {
queues_[QueueUtility].reset(queue_lambda);
// Decide which engine to use for blits.
auto blit_lambda = [this](bool h2d, lazy_ptr<core::Queue>& queue) {
auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue) {
const std::string& sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();
bool use_sdma = (isa_->GetMajorVersion() != 8);
if (sdma_override.size() != 0) use_sdma = (sdma_override == "1");
if (use_sdma && (HSA_PROFILE_BASE == profile_)) {
auto ret = CreateBlitSdma(h2d);
auto ret = CreateBlitSdma(use_xgmi);
if (ret != nullptr) return ret;
}
@@ -599,20 +598,45 @@ void GpuAgent::InitDma() {
return ret;
};
blits_[BlitHostToDev].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueBlitOnly]); });
blits_[BlitDevToHost].reset([blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility]); });
// Determine and instantiate the number of blit objects to
// engage. The total number is sum of three plus number of
// sdma-xgmi engines
uint32_t blit_cnt_ = DefaultBlitCount + properties_.NumSdmaXgmiEngines;
blits_.resize(blit_cnt_);
// Initialize blit objects used for D2D, H2D, D2H, and
// P2P copy operations.
// -- Blit at index BlitDevToDev(0) deals with copies within
// local framebuffer and always engages a Blit Kernel
// -- Blit at index BlitHostToDev(1) deals with copies from
// Host to Device (H2D) and could engage either a Blit
// Kernel or sDMA
// -- Blit at index BlitDevToHost(2) deals with copies from
// Device to Host (D2H) and Peer to Peer (P2P) over PCIe.
// It could engage either a Blit Kernel or sDMA
// -- Blits at index DefaultBlitCount(3) and beyond deal
// exclusively with P2P copies over xGMI links
blits_[BlitDevToDev].reset([this]() {
auto ret = CreateBlitKernel((*queues_[QueueUtility]).get());
if (ret == nullptr)
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
return ret;
});
blits_[BlitHostToDev].reset(
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly]); });
blits_[BlitDevToHost].reset(
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility]); });
// XGMI engines.
for (uint32_t idx = DefaultBlitCount; idx < blit_cnt_; idx++) {
blits_[idx].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility]); });
}
}
void GpuAgent::PreloadBlits() {
blits_[BlitHostToDev].touch();
blits_[BlitDevToHost].touch();
blits_[BlitDevToDev].touch();
for (auto& blit : blits_) {
blit.touch();
}
}
hsa_status_t GpuAgent::PostToolsInit() {
@@ -633,15 +657,8 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
size_t size,
std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
lazy_ptr<core::Blit>& blit =
(src_agent.device_type() == core::Agent::kAmdCpuDevice &&
dst_agent.device_type() == core::Agent::kAmdGpuDevice)
? blits_[BlitHostToDev]
: (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
dst_agent.device_type() == core::Agent::kAmdCpuDevice)
? blits_[BlitDevToHost]
: (src_agent.node_id() == dst_agent.node_id())
? blits_[BlitDevToDev] : blits_[BlitDevToHost];
// Bind the Blit object that will drive this copy operation
lazy_ptr<core::Blit>& blit = GetBlitObject(dst_agent, src_agent);
if (profiling_enabled()) {
// Track the agent so we could translate the resulting timestamp to system
@@ -688,9 +705,9 @@ hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
for (int i = 0; i < BlitCount; ++i) {
if (blits_[i].created()) {
const hsa_status_t stat = blits_[i]->EnableProfiling(enable);
for (auto& blit : blits_) {
if (blit.created()) {
const hsa_status_t stat = blit->EnableProfiling(enable);
if (stat != HSA_STATUS_SUCCESS) {
return stat;
}
@@ -701,12 +718,10 @@ hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
}
hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
// agent, and vendor name size limit
const size_t attribute_u = static_cast<size_t>(attribute);
switch (attribute_u) {
// Build agent name by concatenating the Major, Minor and Stepping Ids
// of devices compute capability with a prefix of "gfx"
case HSA_AGENT_INFO_NAME: {
@@ -878,7 +893,7 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
case HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY:
*((uint32_t*)value) = memory_max_frequency_;
break;
// The code copies HsaNodeProperties.MarketingName a Unicode string
// which is encoded in UTF-16 as a 7-bit ASCII string
case HSA_AMD_AGENT_INFO_PRODUCT_NAME: {
@@ -1252,4 +1267,81 @@ void GpuAgent::InvalidateCodeCaches() {
queues_[QueueUtility]->ExecutePM4(cache_inv, sizeof(cache_inv));
}
// Selects the xGMI-facing Blit object used for copies to the given agent.
//
// Agents are remembered in xgmi_peer_list_ in first-seen order. The position
// of an agent in that list, taken modulo the number of SDMA xGMI engines,
// picks one of the blits_ entries located at DefaultBlitCount and beyond.
// An agent that is not yet in the list is appended before its entry is
// computed.
//
// NOTE(review): xgmi_peer_list_ is modified here and no locking is visible in
// this function -- confirm that callers serialize access.
lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
  const uint32_t engine_count = properties_.NumSdmaXgmiEngines;
  assert((engine_count > 0) && ("Illegal condition, should not happen"));

  // Search the known peers for the destination agent
  const uint64_t wanted_handle = dst_agent.public_handle().handle;
  uint32_t peer_pos = 0;
  while (peer_pos < xgmi_peer_list_.size()) {
    if (xgmi_peer_list_[peer_pos]->public_handle().handle == wanted_handle) {
      break;
    }
    ++peer_pos;
  }

  // First copy to this agent: record it at the tail of the peer list,
  // which is exactly the position the search stopped at
  if (peer_pos == xgmi_peer_list_.size()) {
    xgmi_peer_list_.push_back(&dst_agent);
  }

  return blits_[(peer_pos % engine_count) + DefaultBlitCount];
}
// Selects the Blit object for a copy that does not go through an xGMI blit.
//
// A copy whose source is a CPU agent and whose destination is a GPU agent
// uses blits_[BlitHostToDev]. Every other combination of source and
// destination uses blits_[BlitDevToHost].
lazy_ptr<core::Blit>& GpuAgent::GetPcieBlit(const core::Agent& dst_agent,
                                            const core::Agent& src_agent) {
  const bool host_to_device = (src_agent.device_type() == core::Agent::kAmdCpuDevice) &&
                              (dst_agent.device_type() == core::Agent::kAmdGpuDevice);
  if (host_to_device) {
    return blits_[BlitHostToDev];
  }
  return blits_[BlitDevToHost];
}
// Picks the Blit object that will drive a copy between two agents.
//
// Decision order:
//   1. Source and destination are the same agent -> blits_[BlitDevToDev].
//   2. Both agents report the same non-zero Hive Id and this agent has at
//      least one SDMA xGMI engine -> the blit chosen by GetXgmiBlit.
//   3. Anything else -> the blit chosen by GetPcieBlit.
lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
                                              const core::Agent& src_agent) {
  // Callers are expected to pass at least one GPU agent
  assert(((src_agent.device_type() == core::Agent::kAmdGpuDevice) ||
          (dst_agent.device_type() == core::Agent::kAmdGpuDevice)) &&
         ("Both devices are CPU agents which is not expected"));

  // Copy within a single device
  const bool same_device =
      (src_agent.public_handle().handle) == (dst_agent.public_handle().handle);
  if (same_device) {
    return blits_[BlitDevToDev];
  }

  // A Hive Id of zero means the device does not claim membership in a
  // Hive. The two devices share a Hive only when both report the same
  // non-zero id; the following pairs therefore do NOT share one:
  //
  //    srcId = 0      <-> dstId = 0       (neither is in a Hive)
  //    srcId = 0x1926 <-> dstId = 0       (only Src is in a Hive)
  //    srcId = 0      <-> dstId = 0x1123  (only Dst is in a Hive)
  //    srcId = 0x1926 <-> dstId = 0x1123  (different Hives)
  const uint64_t src_hive = src_agent.HiveId();
  const uint64_t dst_hive = dst_agent.HiveId();
  const bool share_hive = (dst_hive == src_hive) && (dst_hive != 0);

  // Platforms may expose xGMI links without any sdmaXgmiEngines
  // (e.g. Vega 20); those fall back to the PCIe facing Blit objects
  const bool has_xgmi_engines = (properties_.NumSdmaXgmiEngines != 0);

  if (share_hive && has_xgmi_engines) {
    return GetXgmiBlit(dst_agent);
  }
  return GetPcieBlit(dst_agent, src_agent);
}
} // namespace
+4 -2
Просмотреть файл
@@ -255,10 +255,12 @@ hsa_status_t hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent_handle,
core::Signal* out_signal_obj = core::Signal::Convert(completion_signal);
IS_VALID(out_signal_obj);
bool rev_copy_dir = core::Runtime::runtime_singleton_->flag().rev_copy_dir();
if (size > 0) {
return core::Runtime::runtime_singleton_->CopyMemory(
dst, *dst_agent, src, *src_agent, size, dep_signal_list,
*out_signal_obj);
dst, (rev_copy_dir ? *src_agent : *dst_agent),
src, (rev_copy_dir ? *dst_agent : *src_agent),
size, dep_signal_list, *out_signal_obj);
}
return HSA_STATUS_SUCCESS;
-2
Просмотреть файл
@@ -464,8 +464,6 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
(src_agent.device_type() == core::Agent::DeviceType::kAmdGpuDevice);
if (dst_gpu || src_gpu) {
core::Agent* copy_agent = (src_gpu) ? &src_agent : &dst_agent;
if (flag_.rev_copy_dir() && dst_gpu && src_gpu)
copy_agent = (copy_agent == &src_agent) ? &dst_agent : &src_agent;
return copy_agent->DmaCopy(dst, dst_agent, src, src_agent, size, dep_signals,
completion_signal);
}
+13 -1
Просмотреть файл
@@ -58,6 +58,19 @@ template <typename T> class lazy_ptr {
explicit lazy_ptr(std::function<T*()> Constructor) { Init(Constructor); }
// Move constructor: takes over both the held object and the deferred
// constructor function of rhs.
lazy_ptr(lazy_ptr&& rhs) : obj(std::move(rhs.obj)), func(std::move(rhs.func)) {}
// Move assignment: takes over both the held object and the deferred
// constructor function of rhs.
//
// @param rhs Instance to move from.
//
// @return Reference to this instance, so that assignments can be chained.
lazy_ptr& operator=(lazy_ptr&& rhs) {
  // Self-assignment must leave the instance untouched
  if (this != &rhs) {
    obj = std::move(rhs.obj);
    func = std::move(rhs.func);
  }
  // The function is declared to return lazy_ptr&; flowing off the end
  // without a return statement is undefined behaviour
  return *this;
}
lazy_ptr(lazy_ptr&) = delete;
lazy_ptr& operator=(lazy_ptr&) = delete;
void reset(std::function<T*()> Constructor = nullptr) {
obj.reset();
func = Constructor;
@@ -122,7 +135,6 @@ template <typename T> class lazy_ptr {
}
}
DISALLOW_COPY_AND_ASSIGN(lazy_ptr);
};
#endif // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_