Implement SDMA copy rect for gfx9.
Fix pitch overflow due to small element detection. Add wide pitch 2D copy handling. Cleanup code duplication. Change-Id: I93b1584aba8e5964957eb7ab3544df806ca3e2f9
Этот коммит содержится в:
@@ -963,6 +963,17 @@ hsa_status_t HSA_API
|
||||
num_dep_signals, dep_signals, completion_signal);
|
||||
}
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
// Exported entry point that hands every argument, unchanged, to the
// hsa_amd_memory_async_copy_rect_fn slot of amdExtTable and returns the
// status produced by that call.
hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
    hsa_signal_t completion_signal) {
  const hsa_status_t status = amdExtTable->hsa_amd_memory_async_copy_rect_fn(
      dst, dst_offset, src, src_offset, range, copy_agent, dir, num_dep_signals, dep_signals,
      completion_signal);
  return status;
}
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
|
||||
hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
|
||||
|
||||
@@ -45,6 +45,7 @@
|
||||
|
||||
#include <mutex>
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
|
||||
#include "hsakmt.h"
|
||||
|
||||
@@ -55,6 +56,7 @@
|
||||
#include "core/util/utils.h"
|
||||
|
||||
namespace amd {
|
||||
|
||||
class BlitSdmaBase : public core::Blit {
|
||||
public:
|
||||
static const size_t kQueueSize;
|
||||
@@ -62,6 +64,12 @@ class BlitSdmaBase : public core::Blit {
|
||||
static const size_t kMaxSingleCopySize;
|
||||
static const size_t kMaxSingleFillSize;
|
||||
virtual bool isSDMA() const override { return true; }
|
||||
virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
|
||||
const hsa_dim3_t* dst_offset,
|
||||
const hsa_pitched_ptr_t* src,
|
||||
const hsa_dim3_t* src_offset, const hsa_dim3_t* range,
|
||||
std::vector<core::Signal*>& dep_signals,
|
||||
core::Signal& out_signal) = 0;
|
||||
};
|
||||
|
||||
// RingIndexTy: 32/64-bit monotonic ring index, counting in bytes.
|
||||
@@ -116,6 +124,13 @@ class BlitSdma : public BlitSdmaBase {
|
||||
std::vector<core::Signal*>& dep_signals,
|
||||
core::Signal& out_signal) override;
|
||||
|
||||
virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
|
||||
const hsa_dim3_t* dst_offset,
|
||||
const hsa_pitched_ptr_t* src,
|
||||
const hsa_dim3_t* src_offset, const hsa_dim3_t* range,
|
||||
std::vector<core::Signal*>& dep_signals,
|
||||
core::Signal& out_signal) override;
|
||||
|
||||
/// @brief Submit a linear fill command to the queue buffer
|
||||
///
|
||||
/// @param ptr Memory address of the fill destination.
|
||||
@@ -181,6 +196,11 @@ class BlitSdma : public BlitSdmaBase {
|
||||
void BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command, void* dst,
|
||||
const void* src, size_t size);
|
||||
|
||||
void BuildCopyRectCommand(const std::function<void*(size_t)>& append,
|
||||
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
|
||||
const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
|
||||
const hsa_dim3_t* range);
|
||||
|
||||
void BuildPollCommand(char* cmd_addr, void* addr, uint32_t reference);
|
||||
|
||||
void BuildAtomicDecrementCommand(char* cmd_addr, void* addr);
|
||||
@@ -189,6 +209,9 @@ class BlitSdma : public BlitSdmaBase {
|
||||
|
||||
void BuildTrapCommand(char* cmd_addr);
|
||||
|
||||
hsa_status_t SubmitCommand(const void* cmds, size_t cmd_size,
|
||||
std::vector<core::Signal*>& dep_signals, core::Signal& out_signal);
|
||||
|
||||
// Agent object owning the SDMA engine.
|
||||
GpuAgent* agent_;
|
||||
|
||||
|
||||
@@ -240,6 +240,12 @@ class GpuAgent : public GpuAgentInt {
|
||||
std::vector<core::Signal*>& dep_signals,
|
||||
core::Signal& out_signal) override;
|
||||
|
||||
// @brief Override from core::Agent.
|
||||
hsa_status_t DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
|
||||
const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
|
||||
const hsa_dim3_t* range, hsa_amd_copy_direction_t dir,
|
||||
std::vector<core::Signal*>& dep_signals, core::Signal& out_signal);
|
||||
|
||||
// @brief Override from core::Agent.
|
||||
hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override;
|
||||
|
||||
|
||||
@@ -140,6 +140,13 @@ hsa_status_t HSA_API
|
||||
const hsa_signal_t* dep_signals,
|
||||
hsa_signal_t completion_signal);
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
|
||||
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
|
||||
const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
|
||||
hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
|
||||
hsa_signal_t completion_signal);
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
|
||||
hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
|
||||
|
||||
@@ -0,0 +1,499 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
#define HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_

#include <stddef.h>
#include <stdint.h>
|
||||
|
||||
namespace amd {
|
||||
|
||||
// SDMA packet for VI device.
|
||||
// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt
|
||||
|
||||
// SDMA packet opcodes.
constexpr unsigned int SDMA_OP_COPY = 1;
constexpr unsigned int SDMA_OP_FENCE = 5;
constexpr unsigned int SDMA_OP_TRAP = 6;
constexpr unsigned int SDMA_OP_POLL_REGMEM = 8;
constexpr unsigned int SDMA_OP_ATOMIC = 10;
constexpr unsigned int SDMA_OP_CONST_FILL = 11;
constexpr unsigned int SDMA_OP_TIMESTAMP = 13;

// SDMA packet sub-opcodes.
constexpr unsigned int SDMA_SUBOP_COPY_LINEAR = 0;
constexpr unsigned int SDMA_SUBOP_COPY_LINEAR_RECT = 4;
constexpr unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;

// SDMA atomic operation code.
constexpr unsigned int SDMA_ATOMIC_ADD64 = 47;
|
||||
|
||||
// Linear copy packet: seven 32-bit words (DW0-DW6). Each word is a union that
// exposes the named bit-fields alongside the raw DW_n_DATA value.
typedef struct SDMA_PKT_COPY_LINEAR_TAG {
  // DW0: packet header (opcode, sub-opcode, extra info).
  union {
    struct {
      unsigned int op : 8;
      unsigned int sub_op : 8;
      unsigned int extra_info : 16;
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;

  // DW1: 22-bit transfer count.
  union {
    struct {
      unsigned int count : 22;
      unsigned int reserved_0 : 10;
    };
    unsigned int DW_1_DATA;
  } COUNT_UNION;

  // DW2: destination / source swap selectors.
  union {
    struct {
      unsigned int reserved_0 : 16;
      unsigned int dst_swap : 2;
      unsigned int reserved_1 : 6;
      unsigned int src_swap : 2;
      unsigned int reserved_2 : 6;
    };
    unsigned int DW_2_DATA;
  } PARAMETER_UNION;

  // DW3: source address, bits 31:0.
  union {
    struct {
      unsigned int src_addr_31_0 : 32;
    };
    unsigned int DW_3_DATA;
  } SRC_ADDR_LO_UNION;

  // DW4: source address, bits 63:32.
  union {
    struct {
      unsigned int src_addr_63_32 : 32;
    };
    unsigned int DW_4_DATA;
  } SRC_ADDR_HI_UNION;

  // DW5: destination address, bits 31:0.
  union {
    struct {
      unsigned int dst_addr_31_0 : 32;
    };
    unsigned int DW_5_DATA;
  } DST_ADDR_LO_UNION;

  // DW6: destination address, bits 63:32.
  union {
    struct {
      unsigned int dst_addr_63_32 : 32;
    };
    unsigned int DW_6_DATA;
  } DST_ADDR_HI_UNION;

  // Upper bound on the size handled by a single packet of this type.
  static const size_t kMaxSize_ = 0x3fffe0;
} SDMA_PKT_COPY_LINEAR;

// The packet is written into the SDMA ring as raw dwords, so its size must
// not drift (e.g. through an accidental non-bit-field member).
static_assert(sizeof(SDMA_PKT_COPY_LINEAR) == 7 * sizeof(unsigned int),
              "SDMA_PKT_COPY_LINEAR must be exactly 7 dwords");
|
||||
|
||||
// linear sub-window
|
||||
// Linear sub-window (rectangular) copy packet: thirteen 32-bit words
// (DW0-DW12). Each word is a union exposing the named bit-fields alongside
// the raw DW_n_DATA value.
typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG {
  // Widths, in bits, of the pitch / slice-pitch / rectangle-extent fields.
  // They are used as the bit-field widths below and are available to callers
  // that need to range-check values before packing them.
  static const unsigned int pitch_bits = 19;
  static const unsigned int slice_bits = 28;
  static const unsigned int rect_xy_bits = 14;
  static const unsigned int rect_z_bits = 11;

  // DW0: packet header; `element` occupies the top three bits.
  union {
    struct {
      unsigned int op : 8;
      unsigned int sub_op : 8;
      unsigned int reserved : 13;
      unsigned int element : 3;
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;

  // DW1: source address, bits 31:0.
  union {
    struct {
      unsigned int src_addr_31_0 : 32;
    };
    unsigned int DW_1_DATA;
  } SRC_ADDR_LO_UNION;

  // DW2: source address, bits 63:32.
  union {
    struct {
      unsigned int src_addr_63_32 : 32;
    };
    unsigned int DW_2_DATA;
  } SRC_ADDR_HI_UNION;

  // DW3: source x / y offsets.
  union {
    struct {
      unsigned int src_offset_x : 14;
      unsigned int reserved_1 : 2;
      unsigned int src_offset_y : 14;
      unsigned int reserved_2 : 2;
    };
    unsigned int DW_3_DATA;
  } SRC_PARAMETER_1_UNION;

  // DW4: source z offset and row pitch.
  union {
    struct {
      unsigned int src_offset_z : 11;
      unsigned int reserved_1 : 2;
      unsigned int src_pitch : pitch_bits;
    };
    unsigned int DW_4_DATA;
  } SRC_PARAMETER_2_UNION;

  // DW5: source slice pitch.
  union {
    struct {
      unsigned int src_slice_pitch : slice_bits;
      unsigned int reserved_1 : 4;
    };
    unsigned int DW_5_DATA;
  } SRC_PARAMETER_3_UNION;

  // DW6: destination address, bits 31:0.
  union {
    struct {
      unsigned int dst_addr_31_0 : 32;
    };
    unsigned int DW_6_DATA;
  } DST_ADDR_LO_UNION;

  // DW7: destination address, bits 63:32.
  union {
    struct {
      unsigned int dst_addr_63_32 : 32;
    };
    unsigned int DW_7_DATA;
  } DST_ADDR_HI_UNION;

  // DW8: destination x / y offsets.
  union {
    struct {
      unsigned int dst_offset_x : 14;
      unsigned int reserved_1 : 2;
      unsigned int dst_offset_y : 14;
      unsigned int reserved_2 : 2;
    };
    unsigned int DW_8_DATA;
  } DST_PARAMETER_1_UNION;

  // DW9: destination z offset and row pitch.
  union {
    struct {
      unsigned int dst_offset_z : 11;
      unsigned int reserved_1 : 2;
      unsigned int dst_pitch : pitch_bits;
    };
    unsigned int DW_9_DATA;
  } DST_PARAMETER_2_UNION;

  // DW10: destination slice pitch.
  union {
    struct {
      unsigned int dst_slice_pitch : slice_bits;
      unsigned int reserved_1 : 4;
    };
    unsigned int DW_10_DATA;
  } DST_PARAMETER_3_UNION;

  // DW11: rectangle extent in x / y.
  union {
    struct {
      unsigned int rect_x : rect_xy_bits;
      unsigned int reserved_1 : 2;
      unsigned int rect_y : rect_xy_bits;
      unsigned int reserved_2 : 2;
    };
    unsigned int DW_11_DATA;
  } RECT_PARAMETER_1_UNION;

  // DW12: rectangle extent in z plus destination / source swap selectors.
  union {
    struct {
      unsigned int rect_z : rect_z_bits;
      unsigned int reserved_1 : 5;
      unsigned int dst_swap : 2;
      unsigned int reserved_2 : 6;
      unsigned int src_swap : 2;
      unsigned int reserved_3 : 6;
    };
    unsigned int DW_12_DATA;
  } RECT_PARAMETER_2_UNION;
} SDMA_PKT_COPY_LINEAR_RECT;

// The packet is written into the SDMA ring as raw dwords, so its size must
// not drift.
static_assert(sizeof(SDMA_PKT_COPY_LINEAR_RECT) == 13 * sizeof(unsigned int),
              "SDMA_PKT_COPY_LINEAR_RECT must be exactly 13 dwords");
|
||||
|
||||
// Constant fill packet: five 32-bit words (DW0-DW4). Each word is a union
// exposing the named bit-fields alongside the raw DW_n_DATA value.
typedef struct SDMA_PKT_CONSTANT_FILL_TAG {
  // DW0: packet header; `fillsize` occupies the top two bits.
  union {
    struct {
      unsigned int op : 8;
      unsigned int sub_op : 8;
      unsigned int sw : 2;
      unsigned int reserved_0 : 12;
      unsigned int fillsize : 2;
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;

  // DW1: destination address, bits 31:0.
  union {
    struct {
      unsigned int dst_addr_31_0 : 32;
    };
    unsigned int DW_1_DATA;
  } DST_ADDR_LO_UNION;

  // DW2: destination address, bits 63:32.
  union {
    struct {
      unsigned int dst_addr_63_32 : 32;
    };
    unsigned int DW_2_DATA;
  } DST_ADDR_HI_UNION;

  // DW3: 32-bit fill pattern.
  union {
    struct {
      unsigned int src_data_31_0 : 32;
    };
    unsigned int DW_3_DATA;
  } DATA_UNION;

  // DW4: 22-bit fill count.
  union {
    struct {
      unsigned int count : 22;
      unsigned int reserved_0 : 10;
    };
    unsigned int DW_4_DATA;
  } COUNT_UNION;

  // Upper bound on the size handled by a single packet of this type.
  static const size_t kMaxSize_ = 0x3fffe0;
} SDMA_PKT_CONSTANT_FILL;

// The packet is written into the SDMA ring as raw dwords, so its size must
// not drift.
static_assert(sizeof(SDMA_PKT_CONSTANT_FILL) == 5 * sizeof(unsigned int),
              "SDMA_PKT_CONSTANT_FILL must be exactly 5 dwords");
|
||||
|
||||
// Fence packet: four 32-bit words (DW0-DW3) carrying a 64-bit address and a
// 32-bit data value. Each word is a union exposing the named bit-fields
// alongside the raw DW_n_DATA value.
typedef struct SDMA_PKT_FENCE_TAG {
  // DW0: packet header.
  union {
    struct {
      unsigned int op : 8;
      unsigned int sub_op : 8;
      unsigned int reserved_0 : 16;
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;

  // DW1: address, bits 31:0.
  union {
    struct {
      unsigned int addr_31_0 : 32;
    };
    unsigned int DW_1_DATA;
  } ADDR_LO_UNION;

  // DW2: address, bits 63:32.
  union {
    struct {
      unsigned int addr_63_32 : 32;
    };
    unsigned int DW_2_DATA;
  } ADDR_HI_UNION;

  // DW3: 32-bit data value.
  union {
    struct {
      unsigned int data : 32;
    };
    unsigned int DW_3_DATA;
  } DATA_UNION;
} SDMA_PKT_FENCE;

// The packet is written into the SDMA ring as raw dwords, so its size must
// not drift.
static_assert(sizeof(SDMA_PKT_FENCE) == 4 * sizeof(unsigned int),
              "SDMA_PKT_FENCE must be exactly 4 dwords");
|
||||
|
||||
// Poll register/memory packet: six 32-bit words (DW0-DW5). Each word is a
// union exposing the named bit-fields alongside the raw DW_n_DATA value.
typedef struct SDMA_PKT_POLL_REGMEM_TAG {
  // DW0: packet header; hdp_flush is bit 26, func bits 30:28, mem_poll bit 31.
  union {
    struct {
      unsigned int op : 8;
      unsigned int sub_op : 8;
      unsigned int reserved_0 : 10;
      unsigned int hdp_flush : 1;
      unsigned int reserved_1 : 1;
      unsigned int func : 3;
      unsigned int mem_poll : 1;
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;

  // DW1: address, bits 31:0.
  union {
    struct {
      unsigned int addr_31_0 : 32;
    };
    unsigned int DW_1_DATA;
  } ADDR_LO_UNION;

  // DW2: address, bits 63:32.
  union {
    struct {
      unsigned int addr_63_32 : 32;
    };
    unsigned int DW_2_DATA;
  } ADDR_HI_UNION;

  // DW3: 32-bit reference value.
  union {
    struct {
      unsigned int value : 32;
    };
    unsigned int DW_3_DATA;
  } VALUE_UNION;

  // DW4: 32-bit mask.
  union {
    struct {
      unsigned int mask : 32;
    };
    unsigned int DW_4_DATA;
  } MASK_UNION;

  // DW5: poll interval and retry count.
  union {
    struct {
      unsigned int interval : 16;
      unsigned int retry_count : 12;
      unsigned int reserved_0 : 4;
    };
    unsigned int DW_5_DATA;
  } DW5_UNION;
} SDMA_PKT_POLL_REGMEM;

// The packet is written into the SDMA ring as raw dwords, so its size must
// not drift.
static_assert(sizeof(SDMA_PKT_POLL_REGMEM) == 6 * sizeof(unsigned int),
              "SDMA_PKT_POLL_REGMEM must be exactly 6 dwords");
|
||||
|
||||
// Atomic packet: eight 32-bit words (DW0-DW7). Each word is a union exposing
// the named bit-fields alongside the raw DW_n_DATA value.
typedef struct SDMA_PKT_ATOMIC_TAG {
  // DW0: packet header; `operation` occupies the top seven bits.
  union {
    struct {
      unsigned int op : 8;
      unsigned int sub_op : 8;
      unsigned int l : 1;
      unsigned int reserved_0 : 8;
      unsigned int operation : 7;
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;

  // DW1: address, bits 31:0.
  union {
    struct {
      unsigned int addr_31_0 : 32;
    };
    unsigned int DW_1_DATA;
  } ADDR_LO_UNION;

  // DW2: address, bits 63:32.
  union {
    struct {
      unsigned int addr_63_32 : 32;
    };
    unsigned int DW_2_DATA;
  } ADDR_HI_UNION;

  // DW3: source data, bits 31:0.
  union {
    struct {
      unsigned int src_data_31_0 : 32;
    };
    unsigned int DW_3_DATA;
  } SRC_DATA_LO_UNION;

  // DW4: source data, bits 63:32.
  union {
    struct {
      unsigned int src_data_63_32 : 32;
    };
    unsigned int DW_4_DATA;
  } SRC_DATA_HI_UNION;

  // DW5: compare data, bits 31:0.
  union {
    struct {
      unsigned int cmp_data_31_0 : 32;
    };
    unsigned int DW_5_DATA;
  } CMP_DATA_LO_UNION;

  // DW6: compare data, bits 63:32.
  union {
    struct {
      unsigned int cmp_data_63_32 : 32;
    };
    unsigned int DW_6_DATA;
  } CMP_DATA_HI_UNION;

  // DW7: loop interval.
  union {
    struct {
      unsigned int loop_interval : 13;
      unsigned int reserved_0 : 19;
    };
    unsigned int DW_7_DATA;
  } LOOP_UNION;
} SDMA_PKT_ATOMIC;

// The packet is written into the SDMA ring as raw dwords, so its size must
// not drift.
static_assert(sizeof(SDMA_PKT_ATOMIC) == 8 * sizeof(unsigned int),
              "SDMA_PKT_ATOMIC must be exactly 8 dwords");
|
||||
|
||||
// Timestamp packet: three 32-bit words (DW0-DW2) carrying a header and a
// 64-bit address. Each word is a union exposing the named bit-fields
// alongside the raw DW_n_DATA value.
typedef struct SDMA_PKT_TIMESTAMP_TAG {
  // DW0: packet header.
  union {
    struct {
      unsigned int op : 8;
      unsigned int sub_op : 8;
      unsigned int reserved_0 : 16;
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;

  // DW1: address, bits 31:0.
  union {
    struct {
      unsigned int addr_31_0 : 32;
    };
    unsigned int DW_1_DATA;
  } ADDR_LO_UNION;

  // DW2: address, bits 63:32.
  union {
    struct {
      unsigned int addr_63_32 : 32;
    };
    unsigned int DW_2_DATA;
  } ADDR_HI_UNION;
} SDMA_PKT_TIMESTAMP;

// The packet is written into the SDMA ring as raw dwords, so its size must
// not drift.
static_assert(sizeof(SDMA_PKT_TIMESTAMP) == 3 * sizeof(unsigned int),
              "SDMA_PKT_TIMESTAMP must be exactly 3 dwords");
|
||||
|
||||
// Trap packet: two 32-bit words (DW0-DW1). Each word is a union exposing the
// named bit-fields alongside the raw DW_n_DATA value.
typedef struct SDMA_PKT_TRAP_TAG {
  // DW0: packet header.
  union {
    struct {
      unsigned int op : 8;
      unsigned int sub_op : 8;
      unsigned int reserved_0 : 16;
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;

  // DW1: 28-bit interrupt context.
  union {
    struct {
      unsigned int int_ctx : 28;
      unsigned int reserved_1 : 4;
    };
    unsigned int DW_1_DATA;
  } INT_CONTEXT_UNION;
} SDMA_PKT_TRAP;

// The packet is written into the SDMA ring as raw dwords, so its size must
// not drift.
static_assert(sizeof(SDMA_PKT_TRAP) == 2 * sizeof(unsigned int),
              "SDMA_PKT_TRAP must be exactly 2 dwords");
|
||||
|
||||
// HDP flush packet, no parameters.
|
||||
// Six raw dwords with no named bit-fields; the packet takes no parameters and
// is emitted from the prebuilt hdp_flush_cmd constant below.
typedef struct SDMA_PKT_HDP_FLUSH_TAG {
  unsigned int DW_0_DATA;
  unsigned int DW_1_DATA;
  unsigned int DW_2_DATA;
  unsigned int DW_3_DATA;
  unsigned int DW_4_DATA;
  unsigned int DW_5_DATA;

  // Version of gfx9 sDMA microcode introducing SDMA_PKT_HDP_FLUSH
  static const uint16_t kMinVersion_ = 0x1A5;
} SDMA_PKT_HDP_FLUSH;

// The packet is written into the SDMA ring as raw dwords, so its size must
// not drift.
static_assert(sizeof(SDMA_PKT_HDP_FLUSH) == 6 * sizeof(unsigned int),
              "SDMA_PKT_HDP_FLUSH must be exactly 6 dwords");

// Prebuilt HDP flush packet. DW0 = 0x8 is the value of SDMA_OP_POLL_REGMEM,
// DW2 = 0x80000000, and every other dword is zero.
static const SDMA_PKT_HDP_FLUSH hdp_flush_cmd = {0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0};
|
||||
|
||||
} // namespace amd
|
||||
|
||||
#endif // HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
|
||||
@@ -51,328 +51,10 @@
|
||||
#include "core/inc/amd_gpu_agent.h"
|
||||
#include "core/inc/amd_memory_region.h"
|
||||
#include "core/inc/runtime.h"
|
||||
#include "core/inc/sdma_registers.h"
|
||||
#include "core/inc/signal.h"
|
||||
|
||||
namespace amd {
|
||||
// SDMA packet for VI device.
|
||||
// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt
|
||||
|
||||
const unsigned int SDMA_OP_COPY = 1;
|
||||
const unsigned int SDMA_OP_FENCE = 5;
|
||||
const unsigned int SDMA_OP_TRAP = 6;
|
||||
const unsigned int SDMA_OP_POLL_REGMEM = 8;
|
||||
const unsigned int SDMA_OP_ATOMIC = 10;
|
||||
const unsigned int SDMA_OP_CONST_FILL = 11;
|
||||
const unsigned int SDMA_OP_TIMESTAMP = 13;
|
||||
const unsigned int SDMA_SUBOP_COPY_LINEAR = 0;
|
||||
const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;
|
||||
const unsigned int SDMA_ATOMIC_ADD64 = 47;
|
||||
|
||||
typedef struct SDMA_PKT_COPY_LINEAR_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int extra_info : 16;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int count : 22;
|
||||
unsigned int reserved_0 : 10;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} COUNT_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int reserved_0 : 16;
|
||||
unsigned int dst_swap : 2;
|
||||
unsigned int reserved_1 : 6;
|
||||
unsigned int src_swap : 2;
|
||||
unsigned int reserved_2 : 6;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} PARAMETER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} SRC_ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_4_DATA;
|
||||
} SRC_ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_5_DATA;
|
||||
} DST_ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_6_DATA;
|
||||
} DST_ADDR_HI_UNION;
|
||||
} SDMA_PKT_COPY_LINEAR;
|
||||
|
||||
typedef struct SDMA_PKT_CONSTANT_FILL_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int sw : 2;
|
||||
unsigned int reserved_0 : 12;
|
||||
unsigned int fillsize : 2;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} DST_ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} DST_ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_data_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} DATA_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int count : 22;
|
||||
unsigned int reserved_0 : 10;
|
||||
};
|
||||
unsigned int DW_4_DATA;
|
||||
} COUNT_UNION;
|
||||
} SDMA_PKT_CONSTANT_FILL;
|
||||
|
||||
typedef struct SDMA_PKT_FENCE_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int reserved_0 : 16;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int data : 32;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} DATA_UNION;
|
||||
} SDMA_PKT_FENCE;
|
||||
|
||||
typedef struct SDMA_PKT_POLL_REGMEM_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int reserved_0 : 10;
|
||||
unsigned int hdp_flush : 1;
|
||||
unsigned int reserved_1 : 1;
|
||||
unsigned int func : 3;
|
||||
unsigned int mem_poll : 1;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int value : 32;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} VALUE_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int mask : 32;
|
||||
};
|
||||
unsigned int DW_4_DATA;
|
||||
} MASK_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int interval : 16;
|
||||
unsigned int retry_count : 12;
|
||||
unsigned int reserved_0 : 4;
|
||||
};
|
||||
unsigned int DW_5_DATA;
|
||||
} DW5_UNION;
|
||||
} SDMA_PKT_POLL_REGMEM;
|
||||
|
||||
typedef struct SDMA_PKT_ATOMIC_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int l : 1;
|
||||
unsigned int reserved_0 : 8;
|
||||
unsigned int operation : 7;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_data_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} SRC_DATA_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_data_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_4_DATA;
|
||||
} SRC_DATA_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int cmp_data_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_5_DATA;
|
||||
} CMP_DATA_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int cmp_data_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_6_DATA;
|
||||
} CMP_DATA_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int loop_interval : 13;
|
||||
unsigned int reserved_0 : 19;
|
||||
};
|
||||
unsigned int DW_7_DATA;
|
||||
} LOOP_UNION;
|
||||
} SDMA_PKT_ATOMIC;
|
||||
|
||||
typedef struct SDMA_PKT_TIMESTAMP_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int reserved_0 : 16;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} ADDR_HI_UNION;
|
||||
|
||||
} SDMA_PKT_TIMESTAMP;
|
||||
|
||||
typedef struct SDMA_PKT_TRAP_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int reserved_0 : 16;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int int_ctx : 28;
|
||||
unsigned int reserved_1 : 4;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} INT_CONTEXT_UNION;
|
||||
} SDMA_PKT_TRAP;
|
||||
|
||||
// Initialize Hdp flush packet for use on sDMA of devices
|
||||
// from Gfx9 or new family
|
||||
static const SDMA_PKT_POLL_REGMEM hdp_flush_cmd_ {
|
||||
{ SDMA_OP_POLL_REGMEM },
|
||||
{ 0x00 },
|
||||
{ 0x80000000 },
|
||||
{ 0x00 },
|
||||
{ 0x00 },
|
||||
{ 0x00 },
|
||||
};
|
||||
|
||||
// Version of sDMA microcode supporting Hdp flush
|
||||
static const uint16_t sdma_version_ = 0x01A5;
|
||||
|
||||
inline uint32_t ptrlow32(const void* p) {
|
||||
return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p));
|
||||
@@ -388,8 +70,8 @@ inline uint32_t ptrhigh32(const void* p) {
|
||||
|
||||
const size_t BlitSdmaBase::kQueueSize = 1024 * 1024;
|
||||
const size_t BlitSdmaBase::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR);
|
||||
const size_t BlitSdmaBase::kMaxSingleCopySize = 0x3fffe0; // From HW documentation
|
||||
const size_t BlitSdmaBase::kMaxSingleFillSize = 0x3fffe0;
|
||||
const size_t BlitSdmaBase::kMaxSingleCopySize = SDMA_PKT_COPY_LINEAR::kMaxSize_;
|
||||
const size_t BlitSdmaBase::kMaxSingleFillSize = SDMA_PKT_CONSTANT_FILL::kMaxSize_;
|
||||
|
||||
// Initialize the sizes of the various sDMA commands used by this module
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
@@ -437,8 +119,6 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::~BlitSdma() {}
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initialize(
|
||||
const core::Agent& agent) {
|
||||
agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent));
|
||||
|
||||
if (queue_start_addr_ != NULL) {
|
||||
// Already initialized.
|
||||
return HSA_STATUS_SUCCESS;
|
||||
@@ -448,24 +128,23 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
const amd::GpuAgentInt& amd_gpu_agent =
|
||||
static_cast<const amd::GpuAgentInt&>(agent);
|
||||
agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent));
|
||||
|
||||
if (HSA_PROFILE_FULL == amd_gpu_agent.profile()) {
|
||||
if (HSA_PROFILE_FULL == agent_->profile()) {
|
||||
assert(false && "Only support SDMA for dgpu currently");
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) {
|
||||
if (agent_->isa()->version() == core::Isa::Version(7, 0, 1)) {
|
||||
platform_atomic_support_ = false;
|
||||
} else {
|
||||
const core::Runtime::LinkInfo& link = core::Runtime::runtime_singleton_->GetLinkInfo(
|
||||
amd_gpu_agent.node_id(), core::Runtime::runtime_singleton_->cpu_agents()[0]->node_id());
|
||||
agent_->node_id(), core::Runtime::runtime_singleton_->cpu_agents()[0]->node_id());
|
||||
platform_atomic_support_ = link.info.atomic_support_64bit;
|
||||
}
|
||||
|
||||
// Determine if sDMA microcode supports HDP flush command
|
||||
if (agent_->GetSdmaMicrocodeVersion() >= sdma_version_) {
|
||||
if (agent_->GetSdmaMicrocodeVersion() >= SDMA_PKT_HDP_FLUSH::kMinVersion_) {
|
||||
hdp_flush_support_ = true;
|
||||
}
|
||||
|
||||
@@ -483,7 +162,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
|
||||
// This call binds user mode queue object to underlying compute
|
||||
// device.
|
||||
const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA;
|
||||
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100,
|
||||
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100,
|
||||
HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
|
||||
kQueueSize, NULL, &queue_resource_)) {
|
||||
Destroy(agent);
|
||||
@@ -539,6 +218,159 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Destroy
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitCommand(
|
||||
const void* cmd, size_t cmd_size, std::vector<core::Signal*>& dep_signals,
|
||||
core::Signal& out_signal) {
|
||||
// The signal is 64 bit value, and poll checks for 32 bit value. So we
|
||||
// need to use two poll operations per dependent signal.
|
||||
const uint32_t num_poll_command =
|
||||
static_cast<uint32_t>(2 * dep_signals.size());
|
||||
const uint32_t total_poll_command_size =
|
||||
(num_poll_command * poll_command_size_);
|
||||
|
||||
// Load the profiling state early in case the user disables or enables the
|
||||
// profiling in the middle of the call.
|
||||
const bool profiling_enabled = agent_->profiling_enabled();
|
||||
|
||||
uint64_t* end_ts_addr = NULL;
|
||||
uint32_t total_timestamp_command_size = 0;
|
||||
|
||||
if (profiling_enabled) {
|
||||
// SDMA timestamp packet requires 32 byte of aligned memory, but
|
||||
// amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to
|
||||
// read from a 32 byte aligned bounce buffer is required to avoid changing
|
||||
// the amd_signal_t ABI.
|
||||
|
||||
end_ts_addr = agent_->ObtainEndTsObject();
|
||||
if (end_ts_addr == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
total_timestamp_command_size =
|
||||
(2 * timestamp_command_size_) + linear_copy_command_size_;
|
||||
}
|
||||
|
||||
// On agent that does not support platform atomic, we replace it with
|
||||
// one or two fence packet(s) to update the signal value. The reason fence
|
||||
// is used and not write packet is because the SDMA engine may overlap a
|
||||
// serial copy/write packets.
|
||||
const uint64_t completion_signal_value =
|
||||
static_cast<uint64_t>(out_signal.LoadRelaxed() - 1);
|
||||
const size_t sync_command_size = (platform_atomic_support_)
|
||||
? atomic_command_size_
|
||||
: (completion_signal_value > UINT32_MAX)
|
||||
? 2 * fence_command_size_
|
||||
: fence_command_size_;
|
||||
|
||||
// If the signal is an interrupt signal, we also need to make SDMA engine to
|
||||
// send interrupt packet to IH.
|
||||
const size_t interrupt_command_size =
|
||||
(out_signal.signal_.event_mailbox_ptr != 0)
|
||||
? (fence_command_size_ + trap_command_size_)
|
||||
: 0;
|
||||
|
||||
// Add space for acquire or release Hdp flush command
|
||||
uint32_t flush_cmd_size = 0;
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
|
||||
if ((HwIndexMonotonic) && (hdp_flush_support_)) {
|
||||
flush_cmd_size = flush_command_size_;
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t total_command_size = total_poll_command_size + cmd_size + sync_command_size +
|
||||
total_timestamp_command_size + interrupt_command_size + flush_cmd_size;
|
||||
|
||||
RingIndexTy curr_index;
|
||||
char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
|
||||
|
||||
if (command_addr == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < dep_signals.size(); ++i) {
|
||||
uint32_t* signal_addr =
|
||||
reinterpret_cast<uint32_t*>(dep_signals[i]->ValueLocation());
|
||||
// Wait for the upper 32 bits of the 64 bit signal value to become 0.
|
||||
BuildPollCommand(command_addr, &signal_addr[1], 0);
|
||||
command_addr += poll_command_size_;
|
||||
// Then wait for the lower 32 bits of the 64 bit signal value to become 0.
|
||||
BuildPollCommand(command_addr, &signal_addr[0], 0);
|
||||
command_addr += poll_command_size_;
|
||||
}
|
||||
|
||||
if (profiling_enabled) {
|
||||
BuildGetGlobalTimestampCommand(
|
||||
command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts));
|
||||
command_addr += timestamp_command_size_;
|
||||
}
|
||||
|
||||
// Determine if a Hdp flush cmd is required at the top of cmd stream
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
|
||||
if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false)) {
|
||||
BuildHdpFlushCommand(command_addr);
|
||||
command_addr += flush_command_size_;
|
||||
}
|
||||
}
|
||||
|
||||
// Do the command after all polls are satisfied.
|
||||
memcpy(command_addr, cmd, cmd_size);
|
||||
command_addr += cmd_size;
|
||||
|
||||
// Determine if a Hdp flush cmd is required at the end of cmd stream
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
|
||||
if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_)) {
|
||||
BuildHdpFlushCommand(command_addr);
|
||||
command_addr += flush_command_size_;
|
||||
}
|
||||
}
|
||||
|
||||
if (profiling_enabled) {
|
||||
assert(IsMultipleOf(end_ts_addr, 32));
|
||||
BuildGetGlobalTimestampCommand(command_addr,
|
||||
reinterpret_cast<void*>(end_ts_addr));
|
||||
command_addr += timestamp_command_size_;
|
||||
|
||||
BuildCopyCommand(command_addr, 1,
|
||||
reinterpret_cast<void*>(&out_signal.signal_.end_ts),
|
||||
reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t));
|
||||
command_addr += linear_copy_command_size_;
|
||||
}
|
||||
|
||||
// After transfer is completed, decrement the signal value.
|
||||
if (platform_atomic_support_) {
|
||||
BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
|
||||
command_addr += atomic_command_size_;
|
||||
|
||||
} else {
|
||||
uint32_t* signal_value_location = reinterpret_cast<uint32_t*>(out_signal.ValueLocation());
|
||||
if (completion_signal_value > UINT32_MAX) {
|
||||
BuildFenceCommand(command_addr, signal_value_location + 1,
|
||||
static_cast<uint32_t>(completion_signal_value >> 32));
|
||||
command_addr += fence_command_size_;
|
||||
}
|
||||
|
||||
BuildFenceCommand(command_addr, signal_value_location,
|
||||
static_cast<uint32_t>(completion_signal_value));
|
||||
|
||||
command_addr += fence_command_size_;
|
||||
}
|
||||
|
||||
// Update mailbox event and send interrupt to IH.
|
||||
if (out_signal.signal_.event_mailbox_ptr != 0) {
|
||||
BuildFenceCommand(command_addr,
|
||||
reinterpret_cast<uint32_t*>(out_signal.signal_.event_mailbox_ptr),
|
||||
static_cast<uint32_t>(out_signal.signal_.event_id));
|
||||
command_addr += fence_command_size_;
|
||||
|
||||
BuildTrapCommand(command_addr);
|
||||
}
|
||||
|
||||
ReleaseWriteAddress(curr_index, total_command_size);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
|
||||
void* dst, const void* src, size_t size) {
|
||||
@@ -546,8 +378,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitL
|
||||
// the SDMA linear copy limit.
|
||||
const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
|
||||
|
||||
const uint32_t total_copy_command_size =
|
||||
num_copy_command * linear_copy_command_size_;
|
||||
const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_;
|
||||
|
||||
// Add space for acquire or release Hdp flush command
|
||||
uint32_t flush_cmd_size = 0;
|
||||
@@ -603,161 +434,79 @@ template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
|
||||
void* dst, const void* src, size_t size, std::vector<core::Signal*>& dep_signals,
|
||||
core::Signal& out_signal) {
|
||||
// The signal is 64 bit value, and poll checks for 32 bit value. So we
|
||||
// need to use two poll operations per dependent signal.
|
||||
const uint32_t num_poll_command =
|
||||
static_cast<uint32_t>(2 * dep_signals.size());
|
||||
const uint32_t total_poll_command_size =
|
||||
(num_poll_command * poll_command_size_);
|
||||
|
||||
// Break the copy into multiple copy operation incase the copy size exceeds
|
||||
// Break the copy into multiple copy operations when the copy size exceeds
|
||||
// the SDMA linear copy limit.
|
||||
const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
|
||||
const uint32_t total_copy_command_size =
|
||||
num_copy_command * linear_copy_command_size_;
|
||||
|
||||
// Load the profiling state early in case the user disable or enable the
|
||||
// profiling in the middle of the call.
|
||||
const bool profiling_enabled = agent_->profiling_enabled();
|
||||
// Assemble copy packets.
|
||||
std::vector<SDMA_PKT_COPY_LINEAR> buff(num_copy_command);
|
||||
BuildCopyCommand(reinterpret_cast<char*>(&buff[0]), num_copy_command, dst, src, size);
|
||||
|
||||
uint64_t* end_ts_addr = NULL;
|
||||
uint32_t total_timestamp_command_size = 0;
|
||||
return SubmitCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_COPY_LINEAR), dep_signals,
|
||||
out_signal);
|
||||
}
|
||||
|
||||
if (profiling_enabled) {
|
||||
// SDMA timestamp packet requires 32 byte of aligned memory, but
|
||||
// amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to
|
||||
// read from a 32 byte aligned bounce buffer is required to avoid changing
|
||||
// the amd_signal_t ABI.
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitCopyRectCommand(
|
||||
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
|
||||
const hsa_dim3_t* src_offset, const hsa_dim3_t* range, std::vector<core::Signal*>& dep_signals,
|
||||
core::Signal& out_signal) {
|
||||
// Hardware requires DWORD alignment for base address, pitches
|
||||
// Also confirm that we have a geometric rect (copied block does not wrap an edge).
|
||||
if (((uintptr_t)dst->base) % 4 != 0 || ((uintptr_t)src->base) % 4 != 0)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
|
||||
"Copy rect base address not aligned.");
|
||||
if (((uintptr_t)dst->pitch) % 4 != 0 || ((uintptr_t)src->pitch) % 4 != 0)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect pitch not aligned.");
|
||||
if (((uintptr_t)dst->slice) % 4 != 0 || ((uintptr_t)src->slice) % 4 != 0)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice not aligned.");
|
||||
if (uint64_t(src_offset->x) + range->x > src->pitch ||
|
||||
uint64_t(dst_offset->x) + range->x > dst->pitch)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect width out of range.");
|
||||
if ((src->slice != 0) && (uint64_t(src_offset->y) + range->y) > src->slice / src->pitch)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect height out of range.");
|
||||
if ((dst->slice != 0) && (uint64_t(dst_offset->y) + range->y) > dst->slice / dst->pitch)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect height out of range.");
|
||||
if (range->z > 1 && (src->slice == 0 || dst->slice == 0))
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice needed.");
|
||||
|
||||
end_ts_addr = agent_->ObtainEndTsObject();
|
||||
if (end_ts_addr == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
const uint max_pitch = 1 << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits;
|
||||
|
||||
total_timestamp_command_size =
|
||||
(2 * timestamp_command_size_) + linear_copy_command_size_;
|
||||
}
|
||||
std::vector<SDMA_PKT_COPY_LINEAR_RECT> pkts;
|
||||
auto append = [&](size_t size) {
|
||||
assert(size == sizeof(SDMA_PKT_COPY_LINEAR_RECT) && "SDMA packet size missmatch");
|
||||
pkts.emplace_back(SDMA_PKT_COPY_LINEAR_RECT());
|
||||
return &pkts.back();
|
||||
};
|
||||
|
||||
// On agent that does not support platform atomic, we replace it with
|
||||
// one or two fence packet(s) to update the signal value. The reason fence
|
||||
// is used and not write packet is because the SDMA engine may overlap a
|
||||
// serial copy/write packets.
|
||||
const uint64_t completion_signal_value =
|
||||
static_cast<uint64_t>(out_signal.LoadRelaxed() - 1);
|
||||
const size_t sync_command_size = (platform_atomic_support_)
|
||||
? atomic_command_size_
|
||||
: (completion_signal_value > UINT32_MAX)
|
||||
? 2 * fence_command_size_
|
||||
: fence_command_size_;
|
||||
// Do wide pitch 2D copies along X-Z
|
||||
if (range->z == 1 && (src->pitch > max_pitch || dst->pitch > max_pitch)) {
|
||||
hsa_pitched_ptr_t Src = *src;
|
||||
hsa_pitched_ptr_t Dst = *dst;
|
||||
hsa_dim3_t Soff = *src_offset;
|
||||
hsa_dim3_t Doff = *dst_offset;
|
||||
hsa_dim3_t Range = *range;
|
||||
|
||||
// If the signal is an interrupt signal, we also need to make SDMA engine to
|
||||
// send interrupt packet to IH.
|
||||
const size_t interrupt_command_size =
|
||||
(out_signal.signal_.event_mailbox_ptr != 0)
|
||||
? (fence_command_size_ + trap_command_size_)
|
||||
: 0;
|
||||
Src.base += Soff.z * Src.slice + Soff.y * Src.pitch;
|
||||
Dst.base += Doff.z * Dst.slice + Doff.y * Dst.pitch;
|
||||
Soff.y = Soff.z = 0;
|
||||
Doff.y = Doff.z = 0;
|
||||
|
||||
// Add space for acquire or release Hdp flush command
|
||||
uint32_t flush_cmd_size = 0;
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
|
||||
if ((HwIndexMonotonic) && (hdp_flush_support_)) {
|
||||
flush_cmd_size = flush_command_size_;
|
||||
}
|
||||
}
|
||||
Src.slice = Src.pitch;
|
||||
Src.pitch = 0;
|
||||
Dst.slice = Dst.pitch;
|
||||
Dst.pitch = 0;
|
||||
|
||||
const uint32_t total_command_size =
|
||||
total_poll_command_size + total_copy_command_size + sync_command_size +
|
||||
total_timestamp_command_size + interrupt_command_size + flush_cmd_size;
|
||||
|
||||
RingIndexTy curr_index;
|
||||
char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
|
||||
|
||||
if (command_addr == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < dep_signals.size(); ++i) {
|
||||
uint32_t* signal_addr =
|
||||
reinterpret_cast<uint32_t*>(dep_signals[i]->ValueLocation());
|
||||
// Wait for the higher 64 bit to 0.
|
||||
BuildPollCommand(command_addr, &signal_addr[1], 0);
|
||||
command_addr += poll_command_size_;
|
||||
// Then wait for the lower 64 bit to 0.
|
||||
BuildPollCommand(command_addr, &signal_addr[0], 0);
|
||||
command_addr += poll_command_size_;
|
||||
}
|
||||
|
||||
if (profiling_enabled) {
|
||||
BuildGetGlobalTimestampCommand(
|
||||
command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts));
|
||||
command_addr += timestamp_command_size_;
|
||||
}
|
||||
|
||||
// Determine if a Hdp flush cmd is required at the top of cmd stream
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
|
||||
if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false)) {
|
||||
BuildHdpFlushCommand(command_addr);
|
||||
command_addr += flush_command_size_;
|
||||
}
|
||||
}
|
||||
|
||||
// Do the transfer after all polls are satisfied.
|
||||
BuildCopyCommand(command_addr, num_copy_command, dst, src, size);
|
||||
command_addr += total_copy_command_size;
|
||||
|
||||
// Determine if a Hdp flush cmd is required at the end of cmd stream
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
|
||||
if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_)) {
|
||||
BuildHdpFlushCommand(command_addr);
|
||||
command_addr += flush_command_size_;
|
||||
}
|
||||
}
|
||||
|
||||
if (profiling_enabled) {
|
||||
assert(IsMultipleOf(end_ts_addr, 32));
|
||||
BuildGetGlobalTimestampCommand(command_addr,
|
||||
reinterpret_cast<void*>(end_ts_addr));
|
||||
command_addr += timestamp_command_size_;
|
||||
|
||||
BuildCopyCommand(command_addr, 1,
|
||||
reinterpret_cast<void*>(&out_signal.signal_.end_ts),
|
||||
reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t));
|
||||
command_addr += linear_copy_command_size_;
|
||||
}
|
||||
|
||||
// After transfer is completed, decrement the signal value.
|
||||
if (platform_atomic_support_) {
|
||||
BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
|
||||
command_addr += atomic_command_size_;
|
||||
Range.z = Range.y;
|
||||
Range.y = 1;
|
||||
|
||||
BuildCopyRectCommand(append, &Dst, &Doff, &Src, &Soff, &Range);
|
||||
} else {
|
||||
uint32_t* signal_value_location =
|
||||
reinterpret_cast<uint32_t*>(out_signal.ValueLocation());
|
||||
if (completion_signal_value > UINT32_MAX) {
|
||||
BuildFenceCommand(command_addr, signal_value_location + 1,
|
||||
static_cast<uint32_t>(completion_signal_value >> 32));
|
||||
command_addr += fence_command_size_;
|
||||
}
|
||||
|
||||
BuildFenceCommand(command_addr, signal_value_location,
|
||||
static_cast<uint32_t>(completion_signal_value));
|
||||
|
||||
command_addr += fence_command_size_;
|
||||
BuildCopyRectCommand(append, dst, dst_offset, src, src_offset, range);
|
||||
}
|
||||
|
||||
// Update mailbox event and send interrupt to IH.
|
||||
if (out_signal.signal_.event_mailbox_ptr != 0) {
|
||||
BuildFenceCommand(command_addr, reinterpret_cast<uint32_t*>(
|
||||
out_signal.signal_.event_mailbox_ptr),
|
||||
static_cast<uint32_t>(out_signal.signal_.event_id));
|
||||
command_addr += fence_command_size_;
|
||||
|
||||
BuildTrapCommand(command_addr);
|
||||
}
|
||||
|
||||
ReleaseWriteAddress(curr_index, total_command_size);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
return SubmitCommand(&pkts[0], pkts.size() * sizeof(SDMA_PKT_COPY_LINEAR_RECT), dep_signals,
|
||||
out_signal);
|
||||
}
|
||||
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
@@ -1057,6 +806,131 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyComman
|
||||
assert(cur_size == size);
|
||||
}
|
||||
|
||||
/*
|
||||
Copies are done in terms of elements (1, 2, 4, 8, or 16 bytes) and have alignment restrictions.
|
||||
Elements are coded by the log2 of the element size in bytes (ie. element 0=1 byte, 4=16 byte).
|
||||
This routine breaks a large rect into tiles that can be handled by hardware. Pitches and offsets
|
||||
must be representable in terms of elements in all tiles of the copy.
|
||||
*/
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyRectCommand(
|
||||
const std::function<void*(size_t)>& append, const hsa_pitched_ptr_t* dst,
|
||||
const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
|
||||
const hsa_dim3_t* range) {
|
||||
// Returns the index of the first set bit (ie log2 of the largest power of 2 that evenly divides
|
||||
// width), the largest element that perfectly covers width.
|
||||
// width | 16 ensures that we don't return a higher element than is supported and avoids
|
||||
// issues with 0.
|
||||
auto maxAlignedElement = [](size_t width) {
|
||||
return __builtin_ctz(width | 16);
|
||||
};
|
||||
|
||||
// Limits in terms of element count
|
||||
const uint max_pitch = 1 << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits;
|
||||
const uint max_slice = 1 << SDMA_PKT_COPY_LINEAR_RECT::slice_bits;
|
||||
const uint max_x = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits;
|
||||
const uint max_y = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits;
|
||||
const uint max_z = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_z_bits;
|
||||
|
||||
// Find maximum element that describes the pitch and slice.
|
||||
// Pitch and slice must both be represented in units of elements. No element larger than this
|
||||
// may be used in any tile as the pitches would not be exactly represented.
|
||||
int max_ele = Min(maxAlignedElement(src->pitch), maxAlignedElement(dst->pitch));
|
||||
if (range->z != 1) // Only need to consider slice if HW will copy along Z.
|
||||
max_ele = Min(max_ele, maxAlignedElement(src->slice), maxAlignedElement(dst->slice));
|
||||
|
||||
/*
|
||||
Find the minimum element size that will be needed for any tile.
|
||||
|
||||
No subdivision of a range admits a larger element size for the smallest element in any subdivision
|
||||
than the element size that covers the whole range, though some can be worse (this is easily model
|
||||
checked). Subdividing with any element larger than the covering element won't change the covering
|
||||
element of the remainder
|
||||
( Range%Element = (Range-N*LargerElement)%Element since LargerElement%Element=0 ).
|
||||
Ex. range->x=71, assume max range is 16 elements: We can break at 64 giving tiles:
|
||||
[0,63], [64-70] (width 64 & 7). 64 is covered by element 4 (16B) and 7 is covered by element 0
|
||||
(1B). Exactly covering 71 requires using element 0.
|
||||
|
||||
Base addresses in each tile must be DWORD aligned, if not then the offset from an aligned address
|
||||
must be represented in elements. This may reduce the size of the element, but since elements are
|
||||
integer multiples of each other this is harmless.
|
||||
|
||||
src and dst base has already been checked for DWORD alignment so we only need to consider the
|
||||
offset here.
|
||||
*/
|
||||
int min_ele = Min(max_ele, maxAlignedElement(range->x), maxAlignedElement(src_offset->x % 4),
|
||||
maxAlignedElement(dst_offset->x % 4));
|
||||
|
||||
// Check that pitch and slice can be represented in the tile with the smallest element
|
||||
if ((src->pitch >> min_ele) > max_pitch || (dst->pitch >> min_ele) > max_pitch)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect pitch out of limits.\n");
|
||||
if (range->z != 1) { // Only need to consider slice if HW will copy along Z.
|
||||
if ((src->slice >> min_ele) > max_slice || (dst->slice >> min_ele) > max_slice)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
|
||||
"Copy rect slice out of limits.\n");
|
||||
}
|
||||
|
||||
// Break copy into tiles
|
||||
for (uint64_t z = 0; z < range->z; z += max_z) {
|
||||
for (uint64_t y = 0; y < range->y; y += max_y) {
|
||||
uint64_t x = 0;
|
||||
while (x < range->x) {
|
||||
uint64_t width = range->x - x;
|
||||
|
||||
// Get largest element which describes the start of this tile after its base address has
|
||||
// been aligned. Base addresses must be DWORD (4 byte) aligned.
|
||||
int aligned_ele = Min(maxAlignedElement((src_offset->x + x) % 4),
|
||||
maxAlignedElement((dst_offset->x + x) % 4), max_ele);
|
||||
|
||||
// Get largest permissible element which exactly covers width
|
||||
int element = Min(maxAlignedElement(width), aligned_ele);
|
||||
int xcount = width >> element;
|
||||
|
||||
// If width is too large then width is at least max_x bytes (bigger than any element) so
|
||||
// drop the width restriction and clip element count to max_x.
|
||||
if (xcount > max_x) {
|
||||
element = aligned_ele;
|
||||
xcount = Min(width >> element, max_x);
|
||||
}
|
||||
|
||||
// Get base addresses and offsets for this tile.
|
||||
uintptr_t sbase = (uintptr_t)src->base + src_offset->x + x +
|
||||
(src_offset->y + y) * src->pitch + (src_offset->z + z) * src->slice;
|
||||
uintptr_t dbase = (uintptr_t)dst->base + dst_offset->x + x +
|
||||
(dst_offset->y + y) * dst->pitch + (dst_offset->z + z) * dst->slice;
|
||||
uint soff = (sbase % 4) >> element;
|
||||
uint doff = (dbase % 4) >> element;
|
||||
sbase &= ~3ull;
|
||||
dbase &= ~3ull;
|
||||
|
||||
x += xcount << element;
|
||||
|
||||
SDMA_PKT_COPY_LINEAR_RECT* pkt =
|
||||
(SDMA_PKT_COPY_LINEAR_RECT*)append(sizeof(SDMA_PKT_COPY_LINEAR_RECT));
|
||||
*pkt = {};
|
||||
pkt->HEADER_UNION.op = SDMA_OP_COPY;
|
||||
pkt->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR_RECT;
|
||||
pkt->HEADER_UNION.element = element;
|
||||
pkt->SRC_ADDR_LO_UNION.src_addr_31_0 = sbase;
|
||||
pkt->SRC_ADDR_HI_UNION.src_addr_63_32 = sbase >> 32;
|
||||
pkt->SRC_PARAMETER_1_UNION.src_offset_x = soff;
|
||||
pkt->SRC_PARAMETER_2_UNION.src_pitch = (src->pitch >> element) - 1;
|
||||
pkt->SRC_PARAMETER_3_UNION.src_slice_pitch =
|
||||
(range->z == 1) ? 0 : (src->slice >> element) - 1;
|
||||
pkt->DST_ADDR_LO_UNION.dst_addr_31_0 = dbase;
|
||||
pkt->DST_ADDR_HI_UNION.dst_addr_63_32 = dbase >> 32;
|
||||
pkt->DST_PARAMETER_1_UNION.dst_offset_x = doff;
|
||||
pkt->DST_PARAMETER_2_UNION.dst_pitch = (dst->pitch >> element) - 1;
|
||||
pkt->DST_PARAMETER_3_UNION.dst_slice_pitch =
|
||||
(range->z == 1) ? 0 : (dst->slice >> element) - 1;
|
||||
pkt->RECT_PARAMETER_1_UNION.rect_x = xcount - 1;
|
||||
pkt->RECT_PARAMETER_1_UNION.rect_y = Min(range->y - y, max_y) - 1;
|
||||
pkt->RECT_PARAMETER_2_UNION.rect_z = Min(range->z - z, max_z) - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildPollCommand(
|
||||
char* cmd_addr, void* addr, uint32_t reference) {
|
||||
@@ -1126,7 +1000,7 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildHdpFlushCo
|
||||
char* cmd_addr) {
|
||||
assert(cmd_addr != NULL);
|
||||
SDMA_PKT_POLL_REGMEM* addr = reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(cmd_addr);
|
||||
memcpy(addr, &hdp_flush_cmd_, flush_command_size_);
|
||||
memcpy(addr, &hdp_flush_cmd, flush_command_size_);
|
||||
}
|
||||
|
||||
template class BlitSdma<uint32_t, false, 0>;
|
||||
|
||||
@@ -634,6 +634,31 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
|
||||
return stat;
|
||||
}
|
||||
|
||||
// Submits a pitched rect copy on one of this agent's SDMA blit objects.
// Returns HSA_STATUS_ERROR_INVALID_AGENT when the ISA major version is below 9,
// HSA_STATUS_ERROR_OUT_OF_RESOURCES when the selected blit object is not SDMA based, and
// otherwise whatever SubmitCopyRectCommand returns.
hsa_status_t GpuAgent::DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
                                   const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
                                   const hsa_dim3_t* range, hsa_amd_copy_direction_t dir,
                                   std::vector<core::Signal*>& dep_signals,
                                   core::Signal& out_signal) {
  if (isa_->GetMajorVersion() < 9) return HSA_STATUS_ERROR_INVALID_AGENT;

  // Host-to-device copies use the host-to-device blit; every other direction uses the
  // device-to-host one.
  lazy_ptr<core::Blit>& blit = blits_[(dir == hsaHostToDevice) ? BlitHostToDev : BlitDevToHost];

  if (!blit->isSDMA()) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

  if (profiling_enabled()) {
    // Record the copying agent on the signal so the resulting timestamps can be translated to
    // the system domain correctly.
    out_signal.async_copy_agent(core::Agent::Convert(this->public_handle()));
  }

  // isSDMA() was checked above, so the downcast to the SDMA base class is valid.
  BlitSdmaBase* sdma = static_cast<BlitSdmaBase*>((*blit).get());
  return sdma->SubmitCopyRectCommand(dst, dst_offset, src, src_offset, range, dep_signals,
                                     out_signal);
}
|
||||
|
||||
// Forwards a fill request to the BlitDevToDev blit object and returns the status of its
// SubmitLinearFillCommand call. The arguments are passed through unchanged.
// NOTE(review): the unit of `count` is not visible from here; confirm against
// SubmitLinearFillCommand.
hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) {
|
||||
return blits_[BlitDevToDev]->SubmitLinearFillCommand(ptr, value, count);
|
||||
}
|
||||
|
||||
@@ -381,6 +381,7 @@ void HsaApiTable::UpdateAmdExts() {
|
||||
amd_ext_api.hsa_amd_queue_intercept_create_fn = AMD::hsa_amd_queue_intercept_create;
|
||||
amd_ext_api.hsa_amd_queue_intercept_register_fn = AMD::hsa_amd_queue_intercept_register;
|
||||
amd_ext_api.hsa_amd_queue_set_priority_fn = AMD::hsa_amd_queue_set_priority;
|
||||
amd_ext_api.hsa_amd_memory_async_copy_rect_fn = AMD::hsa_amd_memory_async_copy_rect;
|
||||
}
|
||||
|
||||
class Init {
|
||||
|
||||
@@ -47,6 +47,7 @@
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
#include "core/inc/runtime.h"
|
||||
#include "core/inc/agent.h"
|
||||
@@ -262,6 +263,52 @@ hsa_status_t hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent_handle,
|
||||
CATCH;
|
||||
}
|
||||
|
||||
// Entry point for the asynchronous pitched rect copy. Validates the arguments, converts the
// handles to runtime objects and hands the copy to the GPU agent. A range with a zero dimension
// returns HSA_STATUS_SUCCESS without submitting any work.
hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
    hsa_signal_t completion_signal) {
  TRY;
  // All five descriptors are mandatory.
  const bool has_all_descriptors = dst != nullptr && src != nullptr && dst_offset != nullptr &&
      src_offset != nullptr && range != nullptr;
  if (!has_all_descriptors) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  // The dependency array must be present exactly when its count is non-zero.
  if ((num_dep_signals == 0) != (dep_signals == nullptr)) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  // Host-to-host copies are not accepted by this API.
  if (dir == hsaHostToHost) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  // Only GPU agents can perform the copy.
  core::Agent* base_agent = core::Agent::Convert(copy_agent);
  IS_VALID(base_agent);
  if (base_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice)
    return HSA_STATUS_ERROR_INVALID_AGENT;
  amd::GpuAgent* agent = static_cast<amd::GpuAgent*>(base_agent);

  // Translate and validate every dependency handle.
  std::vector<core::Signal*> dep_signal_list;
  dep_signal_list.reserve(num_dep_signals);
  for (uint32_t idx = 0; idx < num_dep_signals; ++idx) {
    core::Signal* dep_signal_obj = core::Signal::Convert(dep_signals[idx]);
    IS_VALID(dep_signal_obj);
    dep_signal_list.push_back(dep_signal_obj);
  }

  core::Signal* out_signal_obj = core::Signal::Convert(completion_signal);
  IS_VALID(out_signal_obj);

  // An empty rect needs no copy.
  if ((range->x == 0) || (range->y == 0) || (range->z == 0)) return HSA_STATUS_SUCCESS;

  return agent->DmaCopyRect(dst, dst_offset, src, src_offset, range, dir, dep_signal_list,
                            *out_signal_obj);
  CATCH;
}
|
||||
|
||||
|
||||
hsa_status_t hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable) {
|
||||
TRY;
|
||||
IS_OPEN();
|
||||
|
||||
@@ -176,6 +176,11 @@ static __forceinline T Min(const T& a, const T& b) {
|
||||
return (a > b) ? b : a;
|
||||
}
|
||||
|
||||
template <class T, class... Arg>
|
||||
static __forceinline T Min(const T& a, const T& b, Arg... args) {
|
||||
return Min(a, Min(b, args...));
|
||||
}
|
||||
|
||||
/// @brief: Find out the max one of two inputs, input must support ">" operator.
|
||||
/// @param: a(Input), a reference to type T.
|
||||
/// @param: b(Input), a reference to type T.
|
||||
@@ -185,6 +190,11 @@ static __forceinline T Max(const T& a, const T& b) {
|
||||
return (b > a) ? b : a;
|
||||
}
|
||||
|
||||
template <class T, class... Arg>
|
||||
static __forceinline T Max(const T& a, const T& b, Arg... args) {
|
||||
return Max(a, Max(b, args...));
|
||||
}
|
||||
|
||||
/// @brief: Free the memory space which is newed previously.
|
||||
/// @param: ptr(Input), a pointer to memory space. Can't be NULL.
|
||||
/// @return: void.
|
||||
|
||||
@@ -216,6 +216,7 @@ global:
|
||||
hsa_amd_ipc_signal_attach;
|
||||
hsa_amd_register_system_event_handler;
|
||||
hsa_amd_queue_set_priority;
|
||||
hsa_amd_memory_async_copy_rect;
|
||||
|
||||
local:
|
||||
*;
|
||||
|
||||
@@ -172,6 +172,7 @@ struct AmdExtTable {
|
||||
decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn;
|
||||
decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn;
|
||||
decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn;
|
||||
decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn;
|
||||
};
|
||||
|
||||
// Table to export HSA Core Runtime Apis
|
||||
|
||||
@@ -882,6 +882,43 @@ hsa_status_t HSA_API
|
||||
const hsa_signal_t* dep_signals,
|
||||
hsa_signal_t completion_signal);
|
||||
|
||||
/*
|
||||
[Provisional API]
|
||||
Pitched memory descriptor.
|
||||
All elements must be 4 byte aligned. Pitch and slice are in bytes.
|
||||
*/
|
||||
/* Pitched memory descriptor consumed by hsa_amd_memory_async_copy_rect. */
typedef struct hsa_pitched_ptr_s {
|
||||
/* Address of the first byte of the region. Must be 4 byte aligned. */
void* base;
|
||||
/* Distance in bytes between the starts of consecutive rows. Must be 4 byte aligned. */
size_t pitch;
|
||||
/* Distance in bytes between the starts of consecutive layers. Must be 4 byte aligned. */
size_t slice;
|
||||
} hsa_pitched_ptr_t;
|
||||
|
||||
/*
|
||||
[Provisional API]
|
||||
Copy direction flag.
|
||||
*/
|
||||
/* Direction of a copy, passed as the `dir` argument of hsa_amd_memory_async_copy_rect. */
typedef enum {
|
||||
/* Host memory to host memory. NOTE(review): appears to be rejected by the rect copy
   implementation with HSA_STATUS_ERROR_INVALID_ARGUMENT - confirm. */
hsaHostToHost = 0,
|
||||
/* Host memory to device memory. */
hsaHostToDevice = 1,
|
||||
/* Device memory to host memory. */
hsaDeviceToHost = 2,
|
||||
/* Device memory to device memory. */
hsaDeviceToDevice = 3
|
||||
} hsa_amd_copy_direction_t;
|
||||
|
||||
/*
|
||||
[Provisional API]
|
||||
SDMA 3D memory copy API. The same requirements must be met by src and dst as in
|
||||
hsa_amd_memory_async_copy.
|
||||
Both src and dst must be directly accessible to the copy_agent during the copy, src and dst rects
|
||||
must not overlap.
|
||||
CPU agents are not supported. API requires SDMA and will return an error if SDMA is not available.
|
||||
Offsets and range carry x in bytes, y and z in rows and layers.
|
||||
*/
|
||||
hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
|
||||
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
|
||||
const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
|
||||
hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
|
||||
hsa_signal_t completion_signal);
|
||||
|
||||
/**
|
||||
* @brief Type of accesses to a memory pool from a given agent.
|
||||
*/
|
||||
|
||||
Ссылка в новой задаче
Block a user