Implement SDMA copy rect for gfx9.

Fix pitch overflow due to small element detection.
Add wide pitch 2D copy handling.
Clean up code duplication.

Change-Id: I93b1584aba8e5964957eb7ab3544df806ca3e2f9
Этот коммит содержится в:
Sean Keely
2018-06-07 12:14:01 -05:00
родитель aca00b7238
Коммит e0839ab27e
13 изменённых файлов: 1015 добавлений и 473 удалений
+11
Просмотреть файл
@@ -963,6 +963,17 @@ hsa_status_t HSA_API
num_dep_signals, dep_signals, completion_signal);
}
// Mirrors Amd Extension Apis: forwards the rectangular async copy request,
// argument for argument, to the entry installed in the AMD extension table.
hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
    hsa_signal_t completion_signal) {
  // Fetch the table entry first, then invoke it with the unchanged arguments.
  const auto copy_rect_fn = amdExtTable->hsa_amd_memory_async_copy_rect_fn;
  return copy_rect_fn(dst, dst_offset, src, src_offset, range, copy_agent, dir, num_dep_signals,
                      dep_signals, completion_signal);
}
// Mirrors Amd Extension Apis
hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
+23
Просмотреть файл
@@ -45,6 +45,7 @@
#include <mutex>
#include <stdint.h>
#include <vector>
#include "hsakmt.h"
@@ -55,6 +56,7 @@
#include "core/util/utils.h"
namespace amd {
class BlitSdmaBase : public core::Blit {
public:
static const size_t kQueueSize;
@@ -62,6 +64,12 @@ class BlitSdmaBase : public core::Blit {
static const size_t kMaxSingleCopySize;
static const size_t kMaxSingleFillSize;
virtual bool isSDMA() const override { return true; }
/// @brief Submit a rectangular (pitched) copy command. Pure virtual: every
/// concrete SDMA blit object has to provide it.
///
/// @param dst Pitched pointer describing the destination.
/// @param dst_offset Offset of the copied block inside the destination.
/// @param src Pitched pointer describing the source.
/// @param src_offset Offset of the copied block inside the source.
/// @param range Extent of the block to copy.
/// @param dep_signals Signals the copy depends on.
/// @param out_signal Signal associated with completion of the copy.
///
/// @return hsa_status_t describing the outcome of the submission.
///
/// NOTE(review): the units of the offsets/range (bytes vs. elements) are not
/// visible from this declaration - confirm against the implementation.
virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
const hsa_dim3_t* dst_offset,
const hsa_pitched_ptr_t* src,
const hsa_dim3_t* src_offset, const hsa_dim3_t* range,
std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) = 0;
};
// RingIndexTy: 32/64-bit monotonic ring index, counting in bytes.
@@ -116,6 +124,13 @@ class BlitSdma : public BlitSdmaBase {
std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) override;
/// @brief Submit a rectangular (pitched) copy command to the queue buffer.
///
/// The implementation validates its arguments before building packets and
/// throws AMD::hsa_exception carrying HSA_STATUS_ERROR_INVALID_ARGUMENT when
/// a base address, pitch or slice is not a multiple of 4, when the copied
/// block exceeds the pitch/slice bounds, or when range->z > 1 while a slice
/// value is 0.
///
/// @param dst Pitched pointer describing the destination.
/// @param dst_offset Offset of the copied block inside the destination.
/// @param src Pitched pointer describing the source.
/// @param src_offset Offset of the copied block inside the source.
/// @param range Extent of the block to copy.
/// @param dep_signals Signals the copy depends on.
/// @param out_signal Signal associated with completion of the copy.
///
/// @return hsa_status_t describing the outcome of the submission.
virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
const hsa_dim3_t* dst_offset,
const hsa_pitched_ptr_t* src,
const hsa_dim3_t* src_offset, const hsa_dim3_t* range,
std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) override;
/// @brief Submit a linear fill command to the queue buffer
///
/// @param ptr Memory address of the fill destination.
@@ -181,6 +196,11 @@ class BlitSdma : public BlitSdmaBase {
void BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command, void* dst,
const void* src, size_t size);
/// @brief Build the packet(s) describing a rectangular copy.
///
/// @param append Callback invoked with a byte size; it returns a pointer to
/// storage for one new packet. The caller visible in this change hands out
/// SDMA_PKT_COPY_LINEAR_RECT-sized slots from a std::vector.
/// NOTE(review): because that vector may reallocate, pointers returned by
/// earlier append() calls should presumably not be kept - confirm.
/// @param dst Pitched pointer describing the destination.
/// @param dst_offset Offset of the copied block inside the destination.
/// @param src Pitched pointer describing the source.
/// @param src_offset Offset of the copied block inside the source.
/// @param range Extent of the block to copy.
void BuildCopyRectCommand(const std::function<void*(size_t)>& append,
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
const hsa_dim3_t* range);
void BuildPollCommand(char* cmd_addr, void* addr, uint32_t reference);
void BuildAtomicDecrementCommand(char* cmd_addr, void* addr);
@@ -189,6 +209,9 @@ class BlitSdma : public BlitSdmaBase {
void BuildTrapCommand(char* cmd_addr);
/// @brief Copy an already assembled block of SDMA packets into the queue,
/// surrounded by the synchronization packets shared by all submissions
/// (dependency polls, optional timestamps and HDP flush, completion signal
/// update and, for interrupt signals, a trap).
///
/// @param cmds Pointer to the assembled packets.
/// @param cmd_size Size of the assembled packets in bytes.
/// @param dep_signals Signals that are polled for 0 before the packets run.
/// @param out_signal Signal updated after the packets have been emitted.
///
/// @return HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR_OUT_OF_RESOURCES when no
/// end-timestamp object or queue space could be obtained.
hsa_status_t SubmitCommand(const void* cmds, size_t cmd_size,
std::vector<core::Signal*>& dep_signals, core::Signal& out_signal);
// Agent object owning the SDMA engine.
GpuAgent* agent_;
+6
Просмотреть файл
@@ -240,6 +240,12 @@ class GpuAgent : public GpuAgentInt {
std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) override;
// @brief Rectangular (pitched) DMA copy between two pitched pointers.
//
// @param dst Pitched pointer describing the destination.
// @param dst_offset Offset of the copied block inside the destination.
// @param src Pitched pointer describing the source.
// @param src_offset Offset of the copied block inside the source.
// @param range Extent of the block to copy.
// @param dir Copy direction requested by the caller.
// @param dep_signals Signals the copy depends on.
// @param out_signal Signal associated with completion of the copy.
//
// NOTE(review): the previous comment called this an override from
// core::Agent, but unlike the neighbouring declarations it carries no
// `override` specifier - confirm whether core::Agent declares DmaCopyRect.
hsa_status_t DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
const hsa_dim3_t* range, hsa_amd_copy_direction_t dir,
std::vector<core::Signal*>& dep_signals, core::Signal& out_signal);
// @brief Override from core::Agent.
hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override;
+7
Просмотреть файл
@@ -140,6 +140,13 @@ hsa_status_t HSA_API
const hsa_signal_t* dep_signals,
hsa_signal_t completion_signal);
// Mirrors Amd Extension Apis
// Asynchronous rectangular (pitched) copy entry point. Takes the destination
// and source pitched pointers with their offsets, the copy extent, the agent
// performing the copy, the copy direction, an array of num_dep_signals
// dependency signals and the completion signal.
hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
hsa_signal_t completion_signal);
// Mirrors Amd Extension Apis
hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
+499
Просмотреть файл
@@ -0,0 +1,499 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
#define HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_

#include <stddef.h>
#include <stdint.h>
namespace amd {
// SDMA packet definitions for VI devices.
// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt

// Packet opcodes.
constexpr unsigned int SDMA_OP_COPY = 1;
constexpr unsigned int SDMA_OP_FENCE = 5;
constexpr unsigned int SDMA_OP_TRAP = 6;
constexpr unsigned int SDMA_OP_POLL_REGMEM = 8;
constexpr unsigned int SDMA_OP_ATOMIC = 10;
constexpr unsigned int SDMA_OP_CONST_FILL = 11;
constexpr unsigned int SDMA_OP_TIMESTAMP = 13;

// Sub-opcodes refining the copy and timestamp packets.
constexpr unsigned int SDMA_SUBOP_COPY_LINEAR = 0;
constexpr unsigned int SDMA_SUBOP_COPY_LINEAR_RECT = 4;
constexpr unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;

// Operation selector for the atomic packet.
constexpr unsigned int SDMA_ATOMIC_ADD64 = 47;
// Linear copy packet: seven DWORDs (header, count, swap parameters, then the
// source and destination addresses, each split into low/high 32-bit halves).
// Every DWORD is a union, so it can be filled in field by field or accessed
// as one raw word through its DW_n_DATA member.
typedef struct SDMA_PKT_COPY_LINEAR_TAG {
// DW0: opcode, sub-opcode and extra info.
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int extra_info : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
// DW1: 22-bit count field.
union {
struct {
unsigned int count : 22;
unsigned int reserved_0 : 10;
};
unsigned int DW_1_DATA;
} COUNT_UNION;
// DW2: destination / source swap selectors.
union {
struct {
unsigned int reserved_0 : 16;
unsigned int dst_swap : 2;
unsigned int reserved_1 : 6;
unsigned int src_swap : 2;
unsigned int reserved_2 : 6;
};
unsigned int DW_2_DATA;
} PARAMETER_UNION;
// DW3-DW4: source address, low then high half.
union {
struct {
unsigned int src_addr_31_0 : 32;
};
unsigned int DW_3_DATA;
} SRC_ADDR_LO_UNION;
union {
struct {
unsigned int src_addr_63_32 : 32;
};
unsigned int DW_4_DATA;
} SRC_ADDR_HI_UNION;
// DW5-DW6: destination address, low then high half.
union {
struct {
unsigned int dst_addr_31_0 : 32;
};
unsigned int DW_5_DATA;
} DST_ADDR_LO_UNION;
union {
struct {
unsigned int dst_addr_63_32 : 32;
};
unsigned int DW_6_DATA;
} DST_ADDR_HI_UNION;
// Largest size handled by a single packet; BlitSdmaBase::kMaxSingleCopySize
// is initialised from this value.
static const size_t kMaxSize_ = 0x3fffe0;
} SDMA_PKT_COPY_LINEAR;
// Linear sub-window (rectangular) copy packet.
//
// Thirteen DWORDs: header, source address and source window parameters,
// destination address and destination window parameters, and the extent of
// the rectangle. Every DWORD is a union, so it can be filled in field by field
// or accessed as one raw word through its DW_n_DATA member.
typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG {
  // Bit widths of the size-limited fields. They are named constants so that
  // code building packets can derive the largest encodable value (for example
  // `1 << pitch_bits`) instead of repeating the numbers.
  static const unsigned int pitch_bits = 19;
  static const unsigned int slice_bits = 28;
  static const unsigned int rect_xy_bits = 14;
  static const unsigned int rect_z_bits = 11;

  // DW0: opcode, sub-opcode and element selector.
  union {
    struct {
      unsigned int op : 8;
      unsigned int sub_op : 8;
      unsigned int reserved : 13;
      unsigned int element : 3;
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;

  // DW1-DW2: source address, low then high half.
  union {
    struct {
      unsigned int src_addr_31_0 : 32;
    };
    unsigned int DW_1_DATA;
  } SRC_ADDR_LO_UNION;

  union {
    struct {
      unsigned int src_addr_63_32 : 32;
    };
    unsigned int DW_2_DATA;
  } SRC_ADDR_HI_UNION;

  // DW3: source x / y offsets.
  union {
    struct {
      unsigned int src_offset_x : 14;
      unsigned int reserved_1 : 2;
      unsigned int src_offset_y : 14;
      unsigned int reserved_2 : 2;
    };
    unsigned int DW_3_DATA;
  } SRC_PARAMETER_1_UNION;

  // DW4: source z offset and row pitch.
  union {
    struct {
      unsigned int src_offset_z : 11;
      unsigned int reserved_1 : 2;
      unsigned int src_pitch : pitch_bits;
    };
    unsigned int DW_4_DATA;
  } SRC_PARAMETER_2_UNION;

  // DW5: source slice pitch.
  union {
    struct {
      unsigned int src_slice_pitch : slice_bits;
      unsigned int reserved_1 : 4;
    };
    unsigned int DW_5_DATA;
  } SRC_PARAMETER_3_UNION;

  // DW6-DW7: destination address, low then high half.
  union {
    struct {
      unsigned int dst_addr_31_0 : 32;
    };
    unsigned int DW_6_DATA;
  } DST_ADDR_LO_UNION;

  union {
    struct {
      unsigned int dst_addr_63_32 : 32;
    };
    unsigned int DW_7_DATA;
  } DST_ADDR_HI_UNION;

  // DW8: destination x / y offsets.
  union {
    struct {
      unsigned int dst_offset_x : 14;
      unsigned int reserved_1 : 2;
      unsigned int dst_offset_y : 14;
      unsigned int reserved_2 : 2;
    };
    unsigned int DW_8_DATA;
  } DST_PARAMETER_1_UNION;

  // DW9: destination z offset and row pitch.
  union {
    struct {
      unsigned int dst_offset_z : 11;
      unsigned int reserved_1 : 2;
      unsigned int dst_pitch : pitch_bits;
    };
    unsigned int DW_9_DATA;
  } DST_PARAMETER_2_UNION;

  // DW10: destination slice pitch.
  union {
    struct {
      unsigned int dst_slice_pitch : slice_bits;
      unsigned int reserved_1 : 4;
    };
    unsigned int DW_10_DATA;
  } DST_PARAMETER_3_UNION;

  // DW11: rectangle extent in x / y.
  union {
    struct {
      unsigned int rect_x : rect_xy_bits;
      unsigned int reserved_1 : 2;
      unsigned int rect_y : rect_xy_bits;
      unsigned int reserved_2 : 2;
    };
    unsigned int DW_11_DATA;
  } RECT_PARAMETER_1_UNION;

  // DW12: rectangle extent in z plus destination / source swap selectors.
  union {
    struct {
      unsigned int rect_z : rect_z_bits;
      unsigned int reserved_1 : 5;
      unsigned int dst_swap : 2;
      unsigned int reserved_2 : 6;
      unsigned int src_swap : 2;
      unsigned int reserved_3 : 6;
    };
    unsigned int DW_12_DATA;
  } RECT_PARAMETER_2_UNION;
} SDMA_PKT_COPY_LINEAR_RECT;

// The width constants above parameterise several bit-fields. Verify at compile
// time that every affected DWORD still adds up to exactly 32 bits and that the
// packet keeps its 13-DWORD size, so that editing one constant cannot silently
// shift the fields that follow it.
static_assert(11 + 2 + SDMA_PKT_COPY_LINEAR_RECT::pitch_bits == 32,
              "offset_z/pitch DWORD must be 32 bits wide");
static_assert(SDMA_PKT_COPY_LINEAR_RECT::slice_bits + 4 == 32,
              "slice pitch DWORD must be 32 bits wide");
static_assert(2 * (SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits + 2) == 32,
              "rect x/y DWORD must be 32 bits wide");
static_assert(SDMA_PKT_COPY_LINEAR_RECT::rect_z_bits + 5 + 2 + 6 + 2 + 6 == 32,
              "rect z/swap DWORD must be 32 bits wide");
static_assert(sizeof(SDMA_PKT_COPY_LINEAR_RECT) == 13 * sizeof(unsigned int),
              "SDMA_PKT_COPY_LINEAR_RECT must be 13 DWORDs");
// Constant fill packet: five DWORDs (header, destination address low/high,
// fill data, count). Every DWORD is a union, so it can be filled in field by
// field or accessed as one raw word through its DW_n_DATA member.
typedef struct SDMA_PKT_CONSTANT_FILL_TAG {
// DW0: opcode, sub-opcode, swap selector and fill size selector.
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int sw : 2;
unsigned int reserved_0 : 12;
unsigned int fillsize : 2;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
// DW1-DW2: destination address, low then high half.
union {
struct {
unsigned int dst_addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} DST_ADDR_LO_UNION;
union {
struct {
unsigned int dst_addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} DST_ADDR_HI_UNION;
// DW3: 32-bit fill data.
union {
struct {
unsigned int src_data_31_0 : 32;
};
unsigned int DW_3_DATA;
} DATA_UNION;
// DW4: 22-bit count field.
union {
struct {
unsigned int count : 22;
unsigned int reserved_0 : 10;
};
unsigned int DW_4_DATA;
} COUNT_UNION;
// Largest size handled by a single packet; BlitSdmaBase::kMaxSingleFillSize
// is initialised from this value.
static const size_t kMaxSize_ = 0x3fffe0;
} SDMA_PKT_CONSTANT_FILL;
// Fence packet: four DWORDs (header, target address low/high, 32-bit data).
// Every DWORD is a union, so it can be filled in field by field or accessed as
// one raw word through its DW_n_DATA member.
typedef struct SDMA_PKT_FENCE_TAG {
// DW0: opcode and sub-opcode.
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
// DW1-DW2: target address, low then high half.
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
// DW3: 32-bit data word carried by the fence.
union {
struct {
unsigned int data : 32;
};
unsigned int DW_3_DATA;
} DATA_UNION;
} SDMA_PKT_FENCE;
// Poll register/memory packet: six DWORDs (header, address low/high, value,
// mask, interval/retry settings). Every DWORD is a union, so it can be filled
// in field by field or accessed as one raw word through its DW_n_DATA member.
typedef struct SDMA_PKT_POLL_REGMEM_TAG {
// DW0: opcode, sub-opcode and the poll control bits.
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 10;
unsigned int hdp_flush : 1;
unsigned int reserved_1 : 1;
unsigned int func : 3;
unsigned int mem_poll : 1;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
// DW1-DW2: polled address, low then high half.
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
// DW3: 32-bit value field.
union {
struct {
unsigned int value : 32;
};
unsigned int DW_3_DATA;
} VALUE_UNION;
// DW4: 32-bit mask field.
union {
struct {
unsigned int mask : 32;
};
unsigned int DW_4_DATA;
} MASK_UNION;
// DW5: poll interval and retry count.
union {
struct {
unsigned int interval : 16;
unsigned int retry_count : 12;
unsigned int reserved_0 : 4;
};
unsigned int DW_5_DATA;
} DW5_UNION;
} SDMA_PKT_POLL_REGMEM;
// Atomic packet: eight DWORDs (header, address low/high, 64-bit source data,
// 64-bit compare data, loop settings). Every DWORD is a union, so it can be
// filled in field by field or accessed as one raw word through DW_n_DATA.
typedef struct SDMA_PKT_ATOMIC_TAG {
// DW0: opcode, sub-opcode, loop flag and 7-bit operation selector.
// NOTE(review): SDMA_ATOMIC_ADD64 is presumably a value for `operation` -
// confirm against the packet builder.
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int l : 1;
unsigned int reserved_0 : 8;
unsigned int operation : 7;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
// DW1-DW2: target address, low then high half.
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
// DW3-DW4: source data, low then high half.
union {
struct {
unsigned int src_data_31_0 : 32;
};
unsigned int DW_3_DATA;
} SRC_DATA_LO_UNION;
union {
struct {
unsigned int src_data_63_32 : 32;
};
unsigned int DW_4_DATA;
} SRC_DATA_HI_UNION;
// DW5-DW6: compare data, low then high half.
union {
struct {
unsigned int cmp_data_31_0 : 32;
};
unsigned int DW_5_DATA;
} CMP_DATA_LO_UNION;
union {
struct {
unsigned int cmp_data_63_32 : 32;
};
unsigned int DW_6_DATA;
} CMP_DATA_HI_UNION;
// DW7: 13-bit loop interval.
union {
struct {
unsigned int loop_interval : 13;
unsigned int reserved_0 : 19;
};
unsigned int DW_7_DATA;
} LOOP_UNION;
} SDMA_PKT_ATOMIC;
// Timestamp packet: three DWORDs (header and a 64-bit address split into
// low/high halves). Every DWORD is a union, so it can be filled in field by
// field or accessed as one raw word through its DW_n_DATA member.
typedef struct SDMA_PKT_TIMESTAMP_TAG {
// DW0: opcode and sub-opcode.
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
// DW1-DW2: address, low then high half.
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
} SDMA_PKT_TIMESTAMP;
// Trap packet: two DWORDs (header and a 28-bit interrupt context). Every DWORD
// is a union, so it can be filled in field by field or accessed as one raw
// word through its DW_n_DATA member.
typedef struct SDMA_PKT_TRAP_TAG {
// DW0: opcode and sub-opcode.
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
// DW1: interrupt context value.
union {
struct {
unsigned int int_ctx : 28;
unsigned int reserved_1 : 4;
};
unsigned int DW_1_DATA;
} INT_CONTEXT_UNION;
} SDMA_PKT_TRAP;
// HDP flush packet, no parameters.
// Six raw DWORDs (the same DWORD count as SDMA_PKT_POLL_REGMEM); callers use
// the prebuilt constant hdp_flush_cmd below rather than filling in fields.
typedef struct SDMA_PKT_HDP_FLUSH_TAG {
unsigned int DW_0_DATA;
unsigned int DW_1_DATA;
unsigned int DW_2_DATA;
unsigned int DW_3_DATA;
unsigned int DW_4_DATA;
unsigned int DW_5_DATA;
// Version of gfx9 sDMA microcode introducing SDMA_PKT_HDP_FLUSH
// BlitSdma::Initialize enables HDP flush support only when the agent's
// reported SDMA microcode version is at least this value.
static const uint16_t kMinVersion_ = 0x1A5;
} SDMA_PKT_HDP_FLUSH;
// Prebuilt flush packet: DW0 is 0x8 (numerically equal to
// SDMA_OP_POLL_REGMEM), DW2 is 0x80000000, all other DWORDs are zero.
static const SDMA_PKT_HDP_FLUSH hdp_flush_cmd = {0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0};
} // namespace amd
#endif // HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
+347 -473
Просмотреть файл
@@ -51,328 +51,10 @@
#include "core/inc/amd_gpu_agent.h"
#include "core/inc/amd_memory_region.h"
#include "core/inc/runtime.h"
#include "core/inc/sdma_registers.h"
#include "core/inc/signal.h"
namespace amd {
// SDMA packet for VI device.
// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt
const unsigned int SDMA_OP_COPY = 1;
const unsigned int SDMA_OP_FENCE = 5;
const unsigned int SDMA_OP_TRAP = 6;
const unsigned int SDMA_OP_POLL_REGMEM = 8;
const unsigned int SDMA_OP_ATOMIC = 10;
const unsigned int SDMA_OP_CONST_FILL = 11;
const unsigned int SDMA_OP_TIMESTAMP = 13;
const unsigned int SDMA_SUBOP_COPY_LINEAR = 0;
const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;
const unsigned int SDMA_ATOMIC_ADD64 = 47;
typedef struct SDMA_PKT_COPY_LINEAR_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int extra_info : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int count : 22;
unsigned int reserved_0 : 10;
};
unsigned int DW_1_DATA;
} COUNT_UNION;
union {
struct {
unsigned int reserved_0 : 16;
unsigned int dst_swap : 2;
unsigned int reserved_1 : 6;
unsigned int src_swap : 2;
unsigned int reserved_2 : 6;
};
unsigned int DW_2_DATA;
} PARAMETER_UNION;
union {
struct {
unsigned int src_addr_31_0 : 32;
};
unsigned int DW_3_DATA;
} SRC_ADDR_LO_UNION;
union {
struct {
unsigned int src_addr_63_32 : 32;
};
unsigned int DW_4_DATA;
} SRC_ADDR_HI_UNION;
union {
struct {
unsigned int dst_addr_31_0 : 32;
};
unsigned int DW_5_DATA;
} DST_ADDR_LO_UNION;
union {
struct {
unsigned int dst_addr_63_32 : 32;
};
unsigned int DW_6_DATA;
} DST_ADDR_HI_UNION;
} SDMA_PKT_COPY_LINEAR;
typedef struct SDMA_PKT_CONSTANT_FILL_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int sw : 2;
unsigned int reserved_0 : 12;
unsigned int fillsize : 2;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int dst_addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} DST_ADDR_LO_UNION;
union {
struct {
unsigned int dst_addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} DST_ADDR_HI_UNION;
union {
struct {
unsigned int src_data_31_0 : 32;
};
unsigned int DW_3_DATA;
} DATA_UNION;
union {
struct {
unsigned int count : 22;
unsigned int reserved_0 : 10;
};
unsigned int DW_4_DATA;
} COUNT_UNION;
} SDMA_PKT_CONSTANT_FILL;
typedef struct SDMA_PKT_FENCE_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
union {
struct {
unsigned int data : 32;
};
unsigned int DW_3_DATA;
} DATA_UNION;
} SDMA_PKT_FENCE;
typedef struct SDMA_PKT_POLL_REGMEM_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 10;
unsigned int hdp_flush : 1;
unsigned int reserved_1 : 1;
unsigned int func : 3;
unsigned int mem_poll : 1;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
union {
struct {
unsigned int value : 32;
};
unsigned int DW_3_DATA;
} VALUE_UNION;
union {
struct {
unsigned int mask : 32;
};
unsigned int DW_4_DATA;
} MASK_UNION;
union {
struct {
unsigned int interval : 16;
unsigned int retry_count : 12;
unsigned int reserved_0 : 4;
};
unsigned int DW_5_DATA;
} DW5_UNION;
} SDMA_PKT_POLL_REGMEM;
typedef struct SDMA_PKT_ATOMIC_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int l : 1;
unsigned int reserved_0 : 8;
unsigned int operation : 7;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
union {
struct {
unsigned int src_data_31_0 : 32;
};
unsigned int DW_3_DATA;
} SRC_DATA_LO_UNION;
union {
struct {
unsigned int src_data_63_32 : 32;
};
unsigned int DW_4_DATA;
} SRC_DATA_HI_UNION;
union {
struct {
unsigned int cmp_data_31_0 : 32;
};
unsigned int DW_5_DATA;
} CMP_DATA_LO_UNION;
union {
struct {
unsigned int cmp_data_63_32 : 32;
};
unsigned int DW_6_DATA;
} CMP_DATA_HI_UNION;
union {
struct {
unsigned int loop_interval : 13;
unsigned int reserved_0 : 19;
};
unsigned int DW_7_DATA;
} LOOP_UNION;
} SDMA_PKT_ATOMIC;
typedef struct SDMA_PKT_TIMESTAMP_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
} SDMA_PKT_TIMESTAMP;
typedef struct SDMA_PKT_TRAP_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int int_ctx : 28;
unsigned int reserved_1 : 4;
};
unsigned int DW_1_DATA;
} INT_CONTEXT_UNION;
} SDMA_PKT_TRAP;
// Initialize Hdp flush packet for use on sDMA of devices
// from Gfx9 or new family
static const SDMA_PKT_POLL_REGMEM hdp_flush_cmd_ {
{ SDMA_OP_POLL_REGMEM },
{ 0x00 },
{ 0x80000000 },
{ 0x00 },
{ 0x00 },
{ 0x00 },
};
// Version of sDMA microcode supporting Hdp flush
static const uint16_t sdma_version_ = 0x01A5;
inline uint32_t ptrlow32(const void* p) {
return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p));
@@ -388,8 +70,8 @@ inline uint32_t ptrhigh32(const void* p) {
const size_t BlitSdmaBase::kQueueSize = 1024 * 1024;
const size_t BlitSdmaBase::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR);
const size_t BlitSdmaBase::kMaxSingleCopySize = 0x3fffe0; // From HW documentation
const size_t BlitSdmaBase::kMaxSingleFillSize = 0x3fffe0;
const size_t BlitSdmaBase::kMaxSingleCopySize = SDMA_PKT_COPY_LINEAR::kMaxSize_;
const size_t BlitSdmaBase::kMaxSingleFillSize = SDMA_PKT_CONSTANT_FILL::kMaxSize_;
// Initialize size of various sDMA commands use by this module
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
@@ -437,8 +119,6 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::~BlitSdma() {}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initialize(
const core::Agent& agent) {
agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent));
if (queue_start_addr_ != NULL) {
// Already initialized.
return HSA_STATUS_SUCCESS;
@@ -448,24 +128,23 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
return HSA_STATUS_ERROR;
}
const amd::GpuAgentInt& amd_gpu_agent =
static_cast<const amd::GpuAgentInt&>(agent);
agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent));
if (HSA_PROFILE_FULL == amd_gpu_agent.profile()) {
if (HSA_PROFILE_FULL == agent_->profile()) {
assert(false && "Only support SDMA for dgpu currently");
return HSA_STATUS_ERROR;
}
if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) {
if (agent_->isa()->version() == core::Isa::Version(7, 0, 1)) {
platform_atomic_support_ = false;
} else {
const core::Runtime::LinkInfo& link = core::Runtime::runtime_singleton_->GetLinkInfo(
amd_gpu_agent.node_id(), core::Runtime::runtime_singleton_->cpu_agents()[0]->node_id());
agent_->node_id(), core::Runtime::runtime_singleton_->cpu_agents()[0]->node_id());
platform_atomic_support_ = link.info.atomic_support_64bit;
}
// Determine if sDMA microcode supports HDP flush command
if (agent_->GetSdmaMicrocodeVersion() >= sdma_version_) {
if (agent_->GetSdmaMicrocodeVersion() >= SDMA_PKT_HDP_FLUSH::kMinVersion_) {
hdp_flush_support_ = true;
}
@@ -483,7 +162,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
// This call binds user mode queue object to underlying compute
// device.
const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA;
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100,
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100,
HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
kQueueSize, NULL, &queue_resource_)) {
Destroy(agent);
@@ -539,6 +218,159 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Destroy
return HSA_STATUS_SUCCESS;
}
// Writes an already assembled block of SDMA packets into the queue, wrapped in
// the synchronization packets common to all submissions.
//
// Emitted stream, in order:
//   1. two poll packets per dependency signal (upper word, then lower word),
//   2. start timestamp (profiling only),
//   3. HDP flush before the payload (when enabled and sdma_h2d_ is false),
//   4. the caller's packets (cmd / cmd_size bytes, copied verbatim),
//   5. HDP flush after the payload (when enabled and sdma_h2d_ is true),
//   6. end timestamp plus a copy into the signal (profiling only),
//   7. completion signal update (atomic decrement, or one/two fences),
//   8. mailbox fence and trap (only when the signal has an event mailbox).
//
// The space for all of this is reserved up front, so the size computed in the
// first half must match what the second half emits.
//
// Returns HSA_STATUS_ERROR_OUT_OF_RESOURCES when no end-timestamp object or
// queue space can be obtained, HSA_STATUS_SUCCESS otherwise.
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitCommand(
const void* cmd, size_t cmd_size, std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
// The signal is a 64 bit value, and a poll checks a 32 bit value. So we
// need to use two poll operations per dependent signal.
const uint32_t num_poll_command =
static_cast<uint32_t>(2 * dep_signals.size());
const uint32_t total_poll_command_size =
(num_poll_command * poll_command_size_);
// Load the profiling state early in case the user disables or enables
// profiling in the middle of the call.
const bool profiling_enabled = agent_->profiling_enabled();
uint64_t* end_ts_addr = NULL;
uint32_t total_timestamp_command_size = 0;
if (profiling_enabled) {
// SDMA timestamp packet requires 32 byte of aligned memory, but
// amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to
// read from a 32 byte aligned bounce buffer is required to avoid changing
// the amd_signal_t ABI.
end_ts_addr = agent_->ObtainEndTsObject();
if (end_ts_addr == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
// Two timestamp packets (start and end) plus the bounce-buffer copy.
total_timestamp_command_size =
(2 * timestamp_command_size_) + linear_copy_command_size_;
}
// On agent that does not support platform atomic, we replace it with
// one or two fence packet(s) to update the signal value. The reason fence
// is used and not write packet is because the SDMA engine may overlap a
// serial copy/write packets.
// NOTE(review): the upper word is only written when the decremented value
// exceeds UINT32_MAX. If the signal currently holds exactly 2^32, the
// decremented value fits in 32 bits and the upper word is left untouched -
// confirm whether that case can occur and needs handling.
const uint64_t completion_signal_value =
static_cast<uint64_t>(out_signal.LoadRelaxed() - 1);
const size_t sync_command_size = (platform_atomic_support_)
? atomic_command_size_
: (completion_signal_value > UINT32_MAX)
? 2 * fence_command_size_
: fence_command_size_;
// If the signal is an interrupt signal, we also need to make SDMA engine to
// send interrupt packet to IH.
const size_t interrupt_command_size =
(out_signal.signal_.event_mailbox_ptr != 0)
? (fence_command_size_ + trap_command_size_)
: 0;
// Add space for acquire or release Hdp flush command. Only one flush packet
// is reserved: it is emitted either before or after the payload below,
// never both.
uint32_t flush_cmd_size = 0;
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
if ((HwIndexMonotonic) && (hdp_flush_support_)) {
flush_cmd_size = flush_command_size_;
}
}
const uint32_t total_command_size = total_poll_command_size + cmd_size + sync_command_size +
total_timestamp_command_size + interrupt_command_size + flush_cmd_size;
RingIndexTy curr_index;
char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
// NOTE(review): on this failure path the end-timestamp object obtained above
// is not visibly handed back - confirm who owns it.
if (command_addr == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
for (size_t i = 0; i < dep_signals.size(); ++i) {
uint32_t* signal_addr =
reinterpret_cast<uint32_t*>(dep_signals[i]->ValueLocation());
// Wait for the upper 32 bits to become 0.
BuildPollCommand(command_addr, &signal_addr[1], 0);
command_addr += poll_command_size_;
// Then wait for the lower 32 bits to become 0.
BuildPollCommand(command_addr, &signal_addr[0], 0);
command_addr += poll_command_size_;
}
if (profiling_enabled) {
// The start timestamp is written straight into the signal.
BuildGetGlobalTimestampCommand(
command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts));
command_addr += timestamp_command_size_;
}
// Determine if a Hdp flush cmd is required at the top of cmd stream
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false)) {
BuildHdpFlushCommand(command_addr);
command_addr += flush_command_size_;
}
}
// Do the command after all polls are satisfied.
memcpy(command_addr, cmd, cmd_size);
command_addr += cmd_size;
// Determine if a Hdp flush cmd is required at the end of cmd stream
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_)) {
BuildHdpFlushCommand(command_addr);
command_addr += flush_command_size_;
}
}
if (profiling_enabled) {
// The end timestamp goes to the aligned bounce buffer first and is then
// copied into the signal by a one-packet linear copy.
assert(IsMultipleOf(end_ts_addr, 32));
BuildGetGlobalTimestampCommand(command_addr,
reinterpret_cast<void*>(end_ts_addr));
command_addr += timestamp_command_size_;
BuildCopyCommand(command_addr, 1,
reinterpret_cast<void*>(&out_signal.signal_.end_ts),
reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t));
command_addr += linear_copy_command_size_;
}
// After transfer is completed, decrement the signal value.
if (platform_atomic_support_) {
BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
command_addr += atomic_command_size_;
} else {
// No platform atomics: store the precomputed decremented value with fence
// packets, upper word first (only when it is needed), then the lower word.
uint32_t* signal_value_location = reinterpret_cast<uint32_t*>(out_signal.ValueLocation());
if (completion_signal_value > UINT32_MAX) {
BuildFenceCommand(command_addr, signal_value_location + 1,
static_cast<uint32_t>(completion_signal_value >> 32));
command_addr += fence_command_size_;
}
BuildFenceCommand(command_addr, signal_value_location,
static_cast<uint32_t>(completion_signal_value));
command_addr += fence_command_size_;
}
// Update mailbox event and send interrupt to IH.
if (out_signal.signal_.event_mailbox_ptr != 0) {
BuildFenceCommand(command_addr,
reinterpret_cast<uint32_t*>(out_signal.signal_.event_mailbox_ptr),
static_cast<uint32_t>(out_signal.signal_.event_id));
command_addr += fence_command_size_;
// The trap is the last packet, so command_addr is not advanced after it.
BuildTrapCommand(command_addr);
}
ReleaseWriteAddress(curr_index, total_command_size);
return HSA_STATUS_SUCCESS;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
void* dst, const void* src, size_t size) {
@@ -546,8 +378,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitL
// the SDMA linear copy limit.
const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
const uint32_t total_copy_command_size =
num_copy_command * linear_copy_command_size_;
const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_;
// Add space for acquire or release Hdp flush command
uint32_t flush_cmd_size = 0;
@@ -603,161 +434,79 @@ template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
void* dst, const void* src, size_t size, std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
// The signal is 64 bit value, and poll checks for 32 bit value. So we
// need to use two poll operations per dependent signal.
const uint32_t num_poll_command =
static_cast<uint32_t>(2 * dep_signals.size());
const uint32_t total_poll_command_size =
(num_poll_command * poll_command_size_);
// Break the copy into multiple copy operation incase the copy size exceeds
// Break the copy into multiple copy operations when the copy size exceeds
// the SDMA linear copy limit.
const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
const uint32_t total_copy_command_size =
num_copy_command * linear_copy_command_size_;
// Load the profiling state early in case the user disable or enable the
// profiling in the middle of the call.
const bool profiling_enabled = agent_->profiling_enabled();
// Assemble copy packets.
std::vector<SDMA_PKT_COPY_LINEAR> buff(num_copy_command);
BuildCopyCommand(reinterpret_cast<char*>(&buff[0]), num_copy_command, dst, src, size);
uint64_t* end_ts_addr = NULL;
uint32_t total_timestamp_command_size = 0;
return SubmitCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_COPY_LINEAR), dep_signals,
out_signal);
}
if (profiling_enabled) {
// SDMA timestamp packet requires 32 byte of aligned memory, but
// amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to
// read from a 32 byte aligned bounce buffer is required to avoid changing
// the amd_signal_t ABI.
// Validates a pitched rect copy request, encodes it as a sequence of
// SDMA_PKT_COPY_LINEAR_RECT packets and hands the packets to SubmitCommand.
//
// dst / src:               pitched descriptors (base pointer, pitch, slice).
// dst_offset / src_offset: start of the rect inside each descriptor.
// range:                   extent of the rect.
// dep_signals, out_signal: forwarded unchanged to SubmitCommand.
//
// Throws AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, ...) when the
// request violates an alignment or bounds check below. Otherwise returns the
// status reported by SubmitCommand.
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitCopyRectCommand(
    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, std::vector<core::Signal*>& dep_signals,
    core::Signal& out_signal) {
  // Hardware requires DWORD alignment for base address, pitches
  // Also confirm that we have a geometric rect (copied block does not wrap an edge).
  if (((uintptr_t)dst->base) % 4 != 0 || ((uintptr_t)src->base) % 4 != 0)
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                             "Copy rect base address not aligned.");
  if (((uintptr_t)dst->pitch) % 4 != 0 || ((uintptr_t)src->pitch) % 4 != 0)
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect pitch not aligned.");
  if (((uintptr_t)dst->slice) % 4 != 0 || ((uintptr_t)src->slice) % 4 != 0)
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice not aligned.");
  if (uint64_t(src_offset->x) + range->x > src->pitch ||
      uint64_t(dst_offset->x) + range->x > dst->pitch)
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect width out of range.");
  // A zero pitch only gets past the width check when the rect is empty along X,
  // so skipping the row-count check in that case avoids a division by zero
  // without admitting any copy that was previously rejected.
  if ((src->slice != 0) && (src->pitch != 0) &&
      (uint64_t(src_offset->y) + range->y) > src->slice / src->pitch)
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect height out of range.");
  if ((dst->slice != 0) && (dst->pitch != 0) &&
      (uint64_t(dst_offset->y) + range->y) > dst->slice / dst->pitch)
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect height out of range.");
  if (range->z > 1 && (src->slice == 0 || dst->slice == 0))
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice needed.");

  const size_t max_pitch = size_t(1) << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits;

  // Packets are staged here; BuildCopyRectCommand requests one slot per tile
  // through the append callback and fills it in before asking for the next.
  std::vector<SDMA_PKT_COPY_LINEAR_RECT> pkts;
  auto append = [&](size_t size) -> void* {
    assert(size == sizeof(SDMA_PKT_COPY_LINEAR_RECT) && "SDMA packet size missmatch");
    pkts.emplace_back();
    return &pkts.back();
  };

  // Do wide pitch 2D copies along X-Z
  if (range->z == 1 && (src->pitch > max_pitch || dst->pitch > max_pitch)) {
    hsa_pitched_ptr_t Src = *src;
    hsa_pitched_ptr_t Dst = *dst;
    hsa_dim3_t Soff = *src_offset;
    hsa_dim3_t Doff = *dst_offset;
    hsa_dim3_t Range = *range;

    // Fold the Y and Z offsets into the base address. The base is a void*, so
    // step through char* to keep the byte arithmetic standard C++.
    Src.base = static_cast<char*>(Src.base) + Soff.z * Src.slice + Soff.y * Src.pitch;
    Dst.base = static_cast<char*>(Dst.base) + Doff.z * Dst.slice + Doff.y * Dst.pitch;
    Soff.y = Soff.z = 0;
    Doff.y = Doff.z = 0;

    // Re-express rows as layers: the row pitch becomes the slice pitch and the
    // row count becomes the layer count.
    Src.slice = Src.pitch;
    Src.pitch = 0;
    Dst.slice = Dst.pitch;
    Dst.pitch = 0;
    Range.z = Range.y;
    Range.y = 1;

    BuildCopyRectCommand(append, &Dst, &Doff, &Src, &Soff, &Range);
  } else {
    BuildCopyRectCommand(append, dst, dst_offset, src, src_offset, range);
  }

  // data() stays well defined even if no packet was generated.
  return SubmitCommand(pkts.data(), pkts.size() * sizeof(SDMA_PKT_COPY_LINEAR_RECT), dep_signals,
                       out_signal);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
@@ -1057,6 +806,131 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyComman
assert(cur_size == size);
}
/*
Copies are done in terms of elements (1, 2, 4, 8, or 16 bytes) and have alignment restrictions.
Elements are coded by the log2 of the element size in bytes (ie. element 0=1 byte, 4=16 byte).
This routine breaks a large rect into tiles that can be handled by hardware. Pitches and offsets
must be representable in terms of elements in all tiles of the copy.

append:     callback returning storage for one packet of the requested size; it is invoked once
            per generated tile with sizeof(SDMA_PKT_COPY_LINEAR_RECT).
dst, src:   pitched descriptors of the destination and source.
dst_offset, src_offset, range: position and extent of the rect.

Throws AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, ...) when a pitch or slice cannot be
encoded with the smallest element any tile needs.
*/
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyRectCommand(
    const std::function<void*(size_t)>& append, const hsa_pitched_ptr_t* dst,
    const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
    const hsa_dim3_t* range) {
  // Returns the index of the first set bit (ie log2 of the largest power of 2 that evenly divides
  // width), the largest element that perfectly covers width.
  // width | 16 ensures that we don't return a higher element than is supported and avoids
  // issues with 0. Only the low five bits can influence the result, so narrowing to the
  // unsigned int that __builtin_ctz takes loses nothing.
  auto maxAlignedElement = [](size_t width) -> int {
    return __builtin_ctz(static_cast<unsigned int>(width) | 16u);
  };

  // Limits in terms of element count. Kept 64 bit wide so that every Min() call below compares
  // operands of one type (the Min template deduces a single T) and so that the tile loops cannot
  // wrap around.
  const uint64_t max_pitch = uint64_t(1) << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits;
  const uint64_t max_slice = uint64_t(1) << SDMA_PKT_COPY_LINEAR_RECT::slice_bits;
  const uint64_t max_x = uint64_t(1) << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits;
  const uint64_t max_y = uint64_t(1) << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits;
  const uint64_t max_z = uint64_t(1) << SDMA_PKT_COPY_LINEAR_RECT::rect_z_bits;

  // Find maximum element that describes the pitch and slice.
  // Pitch and slice must both be represented in units of elements. No element larger than this
  // may be used in any tile as the pitches would not be exactly represented.
  int max_ele = Min(maxAlignedElement(src->pitch), maxAlignedElement(dst->pitch));
  if (range->z != 1)  // Only need to consider slice if HW will copy along Z.
    max_ele = Min(max_ele, maxAlignedElement(src->slice), maxAlignedElement(dst->slice));

  /*
  Find the minimum element size that will be needed for any tile.

  No subdivision of a range admits a larger element size for the smallest element in any
  subdivision than the element size that covers the whole range, though some can be worse (this is
  easily model checked). Subdividing with any element larger than the covering element won't
  change the covering element of the remainder
  ( Range%Element = (Range-N*LargerElement)%Element since LargerElement%Element=0 ).
  Ex. range->x=71, assume max range is 16 elements: We can break at 64 giving tiles:
  [0,63], [64-70] (width 64 & 7). 64 is covered by element 4 (16B) and 7 is covered by element 0
  (1B). Exactly covering 71 requires using element 0.

  Base addresses in each tile must be DWORD aligned, if not then the offset from an aligned address
  must be represented in elements. This may reduce the size of the element, but since elements are
  integer multiples of each other this is harmless.

  src and dst base has already been checked for DWORD alignment so we only need to consider the
  offset here.
  */
  const int min_ele =
      Min(max_ele, maxAlignedElement(range->x), maxAlignedElement(src_offset->x % 4),
          maxAlignedElement(dst_offset->x % 4));

  // Check that pitch and slice can be represented in the tile with the smallest element
  if ((src->pitch >> min_ele) > max_pitch || (dst->pitch >> min_ele) > max_pitch)
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect pitch out of limits.\n");
  if (range->z != 1) {  // Only need to consider slice if HW will copy along Z.
    if ((src->slice >> min_ele) > max_slice || (dst->slice >> min_ele) > max_slice)
      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                               "Copy rect slice out of limits.\n");
  }

  // Break copy into tiles
  for (uint64_t z = 0; z < range->z; z += max_z) {
    for (uint64_t y = 0; y < range->y; y += max_y) {
      uint64_t x = 0;
      while (x < range->x) {
        const uint64_t width = range->x - x;

        // Get largest element which describes the start of this tile after its base address has
        // been aligned. Base addresses must be DWORD (4 byte) aligned.
        const int aligned_ele = Min(maxAlignedElement((src_offset->x + x) % 4),
                                    maxAlignedElement((dst_offset->x + x) % 4), max_ele);

        // Get largest permissible element which exactly covers width
        int element = Min(maxAlignedElement(width), aligned_ele);
        uint64_t xcount = width >> element;

        // If width is too large then width is at least max_x bytes (bigger than any element) so
        // drop the width restriction and clip element count to max_x.
        if (xcount > max_x) {
          element = aligned_ele;
          xcount = Min(width >> element, max_x);
        }

        // Get base addresses and offsets for this tile.
        uintptr_t sbase = (uintptr_t)src->base + src_offset->x + x +
            (src_offset->y + y) * src->pitch + (src_offset->z + z) * src->slice;
        uintptr_t dbase = (uintptr_t)dst->base + dst_offset->x + x +
            (dst_offset->y + y) * dst->pitch + (dst_offset->z + z) * dst->slice;
        // The misaligned low bits are carried as an element offset; the address itself is
        // rounded down to the DWORD boundary.
        const uint32_t soff = static_cast<uint32_t>((sbase % 4) >> element);
        const uint32_t doff = static_cast<uint32_t>((dbase % 4) >> element);
        sbase &= ~3ull;
        dbase &= ~3ull;

        x += xcount << element;

        SDMA_PKT_COPY_LINEAR_RECT* pkt =
            (SDMA_PKT_COPY_LINEAR_RECT*)append(sizeof(SDMA_PKT_COPY_LINEAR_RECT));
        *pkt = {};
        pkt->HEADER_UNION.op = SDMA_OP_COPY;
        pkt->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR_RECT;
        pkt->HEADER_UNION.element = element;

        // Pitches, slices and extents are written as (count - 1).
        pkt->SRC_ADDR_LO_UNION.src_addr_31_0 = sbase;
        pkt->SRC_ADDR_HI_UNION.src_addr_63_32 = sbase >> 32;
        pkt->SRC_PARAMETER_1_UNION.src_offset_x = soff;
        pkt->SRC_PARAMETER_2_UNION.src_pitch = (src->pitch >> element) - 1;
        pkt->SRC_PARAMETER_3_UNION.src_slice_pitch =
            (range->z == 1) ? 0 : (src->slice >> element) - 1;

        pkt->DST_ADDR_LO_UNION.dst_addr_31_0 = dbase;
        pkt->DST_ADDR_HI_UNION.dst_addr_63_32 = dbase >> 32;
        pkt->DST_PARAMETER_1_UNION.dst_offset_x = doff;
        pkt->DST_PARAMETER_2_UNION.dst_pitch = (dst->pitch >> element) - 1;
        pkt->DST_PARAMETER_3_UNION.dst_slice_pitch =
            (range->z == 1) ? 0 : (dst->slice >> element) - 1;

        pkt->RECT_PARAMETER_1_UNION.rect_x = xcount - 1;
        pkt->RECT_PARAMETER_1_UNION.rect_y = Min(range->y - y, max_y) - 1;
        pkt->RECT_PARAMETER_2_UNION.rect_z = Min(range->z - z, max_z) - 1;
      }
    }
  }
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildPollCommand(
char* cmd_addr, void* addr, uint32_t reference) {
@@ -1126,7 +1000,7 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildHdpFlushCo
char* cmd_addr) {
assert(cmd_addr != NULL);
SDMA_PKT_POLL_REGMEM* addr = reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(cmd_addr);
memcpy(addr, &hdp_flush_cmd_, flush_command_size_);
memcpy(addr, &hdp_flush_cmd, flush_command_size_);
}
template class BlitSdma<uint32_t, false, 0>;
+25
Просмотреть файл
@@ -634,6 +634,31 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
return stat;
}
// Routes a pitched rect copy to one of this agent's blit objects.
//
// Returns HSA_STATUS_ERROR_INVALID_AGENT when the ISA major version is below 9,
// HSA_STATUS_ERROR_OUT_OF_RESOURCES when the selected blit object does not
// report itself as SDMA, and otherwise whatever SubmitCopyRectCommand returns.
hsa_status_t GpuAgent::DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
                                   const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
                                   const hsa_dim3_t* range, hsa_amd_copy_direction_t dir,
                                   std::vector<core::Signal*>& dep_signals,
                                   core::Signal& out_signal) {
  if (isa_->GetMajorVersion() < 9) {
    return HSA_STATUS_ERROR_INVALID_AGENT;
  }

  // Host-to-device copies use the host-to-device blit; every other direction
  // uses the device-to-host blit.
  lazy_ptr<core::Blit>& engine =
      (dir != hsaHostToDevice) ? blits_[BlitDevToHost] : blits_[BlitHostToDev];

  if (!engine->isSDMA()) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  // Record the copying agent on the signal so that the resulting timestamp can
  // be translated to the system domain correctly.
  if (profiling_enabled()) {
    out_signal.async_copy_agent(core::Agent::Convert(this->public_handle()));
  }

  BlitSdmaBase* const sdma = static_cast<BlitSdmaBase*>((*engine).get());
  return sdma->SubmitCopyRectCommand(dst, dst_offset, src, src_offset, range, dep_signals,
                                     out_signal);
}
hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) {
return blits_[BlitDevToDev]->SubmitLinearFillCommand(ptr, value, count);
}
+1
Просмотреть файл
@@ -381,6 +381,7 @@ void HsaApiTable::UpdateAmdExts() {
amd_ext_api.hsa_amd_queue_intercept_create_fn = AMD::hsa_amd_queue_intercept_create;
amd_ext_api.hsa_amd_queue_intercept_register_fn = AMD::hsa_amd_queue_intercept_register;
amd_ext_api.hsa_amd_queue_set_priority_fn = AMD::hsa_amd_queue_set_priority;
amd_ext_api.hsa_amd_memory_async_copy_rect_fn = AMD::hsa_amd_memory_async_copy_rect;
}
class Init {
+47
Просмотреть файл
@@ -47,6 +47,7 @@
#include <utility>
#include <memory>
#include <map>
#include <vector>
#include "core/inc/runtime.h"
#include "core/inc/agent.h"
@@ -262,6 +263,52 @@ hsa_status_t hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent_handle,
CATCH;
}
// API entry point for pitched rect copies: validates the arguments, resolves
// the agent and signal handles and forwards the request to
// GpuAgent::DmaCopyRect. A range with any zero dimension returns
// HSA_STATUS_SUCCESS without calling DmaCopyRect.
hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
    hsa_signal_t completion_signal) {
  TRY;

  // Every descriptor is dereferenced further down, so none may be null.
  const bool descriptor_missing = (dst == nullptr) || (src == nullptr) ||
      (dst_offset == nullptr) || (src_offset == nullptr) || (range == nullptr);
  if (descriptor_missing) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // The dependency count and the dependency array must agree: both empty or
  // both present.
  const bool has_dep_count = (num_dep_signals > 0);
  const bool has_dep_array = (dep_signals != NULL);
  if (has_dep_count != has_dep_array) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  if (dir == hsaHostToHost) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  core::Agent* base_agent = core::Agent::Convert(copy_agent);
  IS_VALID(base_agent);
  if (base_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
    return HSA_STATUS_ERROR_INVALID_AGENT;
  }
  amd::GpuAgent* gpu = static_cast<amd::GpuAgent*>(base_agent);

  // Translate each dependency handle into its signal object.
  std::vector<core::Signal*> dep_signal_list(num_dep_signals);
  for (size_t idx = 0; idx < num_dep_signals; ++idx) {
    core::Signal* dependency = core::Signal::Convert(dep_signals[idx]);
    IS_VALID(dependency);
    dep_signal_list[idx] = dependency;
  }

  core::Signal* completion = core::Signal::Convert(completion_signal);
  IS_VALID(completion);

  const bool empty_rect = (range->x == 0) || (range->y == 0) || (range->z == 0);
  if (empty_rect) {
    return HSA_STATUS_SUCCESS;
  }

  return gpu->DmaCopyRect(dst, dst_offset, src, src_offset, range, dir, dep_signal_list,
                          *completion);

  CATCH;
}
hsa_status_t hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable) {
TRY;
IS_OPEN();
+10
Просмотреть файл
@@ -176,6 +176,11 @@ static __forceinline T Min(const T& a, const T& b) {
return (a > b) ? b : a;
}
/// @brief: Find out the min one of three or more inputs by reducing the tail
/// first and then comparing the result against the first input.
/// @param: a(Input), a reference to type T.
/// @param: b(Input), a reference to type T.
/// @param: args(Input), the remaining inputs.
/// @return: T.
template <class T, class... Arg>
static __forceinline T Min(const T& a, const T& b, Arg... args) {
  const T tail_min = Min(b, args...);
  return Min(a, tail_min);
}
/// @brief: Find out the max one of two inputs, input must support ">" operator.
/// @param: a(Input), a reference to type T.
/// @param: b(Input), a reference to type T.
@@ -185,6 +190,11 @@ static __forceinline T Max(const T& a, const T& b) {
return (b > a) ? b : a;
}
/// @brief: Find out the max one of three or more inputs by reducing the tail
/// first and then comparing the result against the first input.
/// @param: a(Input), a reference to type T.
/// @param: b(Input), a reference to type T.
/// @param: args(Input), the remaining inputs.
/// @return: T.
template <class T, class... Arg>
static __forceinline T Max(const T& a, const T& b, Arg... args) {
  const T tail_max = Max(b, args...);
  return Max(a, tail_max);
}
/// @brief: Free the memory space which is newed previously.
/// @param: ptr(Input), a pointer to memory space. Can't be NULL.
/// @return: void.
+1
Просмотреть файл
@@ -216,6 +216,7 @@ global:
hsa_amd_ipc_signal_attach;
hsa_amd_register_system_event_handler;
hsa_amd_queue_set_priority;
hsa_amd_memory_async_copy_rect;
local:
*;
+1
Просмотреть файл
@@ -172,6 +172,7 @@ struct AmdExtTable {
decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn;
decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn;
decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn;
decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn;
};
// Table to export HSA Core Runtime Apis
+37
Просмотреть файл
@@ -882,6 +882,43 @@ hsa_status_t HSA_API
const hsa_signal_t* dep_signals,
hsa_signal_t completion_signal);
/*
[Provisional API]
Pitched memory descriptor used by hsa_amd_memory_async_copy_rect.
All elements (base, pitch and slice) must be 4 byte aligned. Pitch and slice are in bytes.
*/
typedef struct hsa_pitched_ptr_s {
/* Address of the first byte of the described memory. */
void* base;
/* Row pitch, in bytes. */
size_t pitch;
/* Slice pitch, in bytes. */
size_t slice;
} hsa_pitched_ptr_t;
/*
[Provisional API]
Copy direction flag, passed as the dir argument of hsa_amd_memory_async_copy_rect.
*/
typedef enum {
hsaHostToHost = 0,
hsaHostToDevice = 1,
hsaDeviceToHost = 2,
hsaDeviceToDevice = 3
} hsa_amd_copy_direction_t;
/*
[Provisional API]
SDMA 3D memory copy API. The same requirements must be met by src and dst as in
hsa_amd_memory_async_copy.
Both src and dst must be directly accessible to the copy_agent during the copy, src and dst rects
must not overlap.
CPU agents are not supported. API requires SDMA and will return an error if SDMA is not available.
Offsets and range carry x in bytes, y and z in rows and layers.

dst, src:               pitched descriptors of the destination and source memory.
dst_offset, src_offset: position of the rect inside dst and src.
range:                  extent of the rect to copy.
copy_agent:             agent that performs the copy.
dir:                    copy direction.
num_dep_signals:        number of entries in dep_signals.
dep_signals:            dependency signal array; pass NULL when num_dep_signals is 0.
completion_signal:      signal associated with completion of the copy.
*/
hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
hsa_signal_t completion_signal);
/**
* @brief Type of accesses to a memory pool from a given agent.
*/