From 79a63cf292c8f674b9fbfed108b94c1d239ce62b Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Tue, 30 Apr 2024 09:01:09 -0500 Subject: [PATCH] wsl/hsakmt: initial commit Signed-off-by: lyndonli Signed-off-by: Horatio Zhang Signed-off-by: Shi.Leslie Signed-off-by: LonglongYao Signed-off-by: tiancyin Signed-off-by: Frank Min Signed-off-by: Aaron Liu Signed-off-by: Shane Xiao Signed-off-by: Lang Yu Signed-off-by: Feifei Xu Signed-off-by: Ruili Ji Signed-off-by: Qiang Yu Signed-off-by: Flora Cui --- debug.cpp | 118 +++ events.cpp | 129 +++ globals.cpp | 47 + inc/pm4_cmds.h | 1064 ++++++++++++++++++++++ inc/registers.h | 363 ++++++++ inc/rocr_proxy/rocr_proxy.h | 131 +++ inc/rocr_proxy/wddm_types.h | 155 ++++ inc/wddm/cmd_util.h | 77 ++ inc/wddm/device.h | 245 +++++ inc/wddm/gpu_memory.h | 218 +++++ inc/wddm/queue.h | 284 ++++++ inc/wddm/status.h | 59 ++ inc/wddm/thunks.h | 232 +++++ inc/wddm/types.h | 101 +++ inc/wddm/va_mgr.h | 86 ++ libdrm.cpp | 69 ++ libhsakmt.h | 158 ++++ libhsakmt.ver | 97 ++ memory.cpp | 554 ++++++++++++ openclose.cpp | 274 ++++++ pc_sampling.cpp | 73 ++ perfctr.cpp | 82 ++ queues.cpp | 174 ++++ rocr_proxy/librocr_proxy.a | Bin 0 -> 365408 bytes spm.cpp | 48 + svm.cpp | 52 ++ time.cpp | 52 ++ topology.cpp | 1698 +++++++++++++++++++++++++++++++++++ util/atomic_helpers.h | 519 +++++++++++ util/flag.cpp | 226 +++++ util/flag.h | 360 ++++++++ util/lazy_ptr.h | 155 ++++ util/lnx/os_linux.cpp | 771 ++++++++++++++++ util/locks.h | 290 ++++++ util/os.h | 327 +++++++ util/simple_heap.h | 363 ++++++++ util/small_heap.cpp | 185 ++++ util/small_heap.h | 131 +++ util/timer.cpp | 111 +++ util/timer.h | 173 ++++ util/utils.h | 424 +++++++++ util/win/os_win.cpp | 327 +++++++ version.cpp | 52 ++ wddm/cmd_util.cpp | 281 ++++++ wddm/device.cpp | 879 ++++++++++++++++++ wddm/gpu_memory.cpp | 467 ++++++++++ wddm/queue.cpp | 989 ++++++++++++++++++++ wddm/va_mgr.cpp | 163 ++++ 48 files changed, 13833 insertions(+) create mode 100644 debug.cpp create mode 100644 
events.cpp create mode 100644 globals.cpp create mode 100644 inc/pm4_cmds.h create mode 100644 inc/registers.h create mode 100644 inc/rocr_proxy/rocr_proxy.h create mode 100644 inc/rocr_proxy/wddm_types.h create mode 100644 inc/wddm/cmd_util.h create mode 100644 inc/wddm/device.h create mode 100644 inc/wddm/gpu_memory.h create mode 100644 inc/wddm/queue.h create mode 100644 inc/wddm/status.h create mode 100644 inc/wddm/thunks.h create mode 100644 inc/wddm/types.h create mode 100644 inc/wddm/va_mgr.h create mode 100644 libdrm.cpp create mode 100644 libhsakmt.h create mode 100644 libhsakmt.ver create mode 100644 memory.cpp create mode 100644 openclose.cpp create mode 100644 pc_sampling.cpp create mode 100644 perfctr.cpp create mode 100644 queues.cpp create mode 100644 rocr_proxy/librocr_proxy.a create mode 100644 spm.cpp create mode 100644 svm.cpp create mode 100644 time.cpp create mode 100644 topology.cpp create mode 100644 util/atomic_helpers.h create mode 100644 util/flag.cpp create mode 100644 util/flag.h create mode 100644 util/lazy_ptr.h create mode 100644 util/lnx/os_linux.cpp create mode 100644 util/locks.h create mode 100644 util/os.h create mode 100644 util/simple_heap.h create mode 100644 util/small_heap.cpp create mode 100644 util/small_heap.h create mode 100644 util/timer.cpp create mode 100644 util/timer.h create mode 100644 util/utils.h create mode 100644 util/win/os_win.cpp create mode 100644 version.cpp create mode 100644 wddm/cmd_util.cpp create mode 100644 wddm/device.cpp create mode 100644 wddm/gpu_memory.cpp create mode 100644 wddm/queue.cpp create mode 100644 wddm/va_mgr.cpp diff --git a/debug.cpp b/debug.cpp new file mode 100644 index 0000000000..5152aaf483 --- /dev/null +++ b/debug.cpp @@ -0,0 +1,118 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include + +#include "libhsakmt.h" + +static uint32_t runtime_capabilities_mask = 0; + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl( + HSAuint32 NodeId, HSA_DBG_WAVEOP Operand, HSA_DBG_WAVEMODE Mode, + HSAuint32 TrapId, HsaDbgWaveMessage *DbgWaveMsgRing) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch( + HSAuint32 NodeId, HSAuint32 NumWatchPoints, HSA_DBG_WATCH_MODE WatchMode[], + void *WatchAddress[], HSAuint64 WatchMask[], HsaEvent *WatchEvent[]) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, bool setupTtmp) { + HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport(); + + if (result) + return result; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) { + HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport(); + + if (result) + return result; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask) { + CHECK_DXG_OPEN(); + *caps_mask = runtime_capabilities_mask; + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info, + HSAuint32 *data_size) { + CHECK_DXG_OPEN(); + + return HSAKMT_STATUS_NOT_SUPPORTED; +} +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data, + HSAuint32 *n_entries, + HSAuint32 *entry_size) { + CHECK_DXG_OPEN(); + return 
HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data, HSAuint32 *n_entries, + HSAuint32 *entry_size, + bool suspend_queues) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args, HSA_QUEUEID *Queues, + HSAuint64 *DebugReturn) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git a/events.cpp b/events.cpp new file mode 100644 index 0000000000..e28491f31c --- /dev/null +++ b/events.cpp @@ -0,0 +1,129 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include "libhsakmt.h" + +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, + bool ManualReset, bool IsSignaled, + HsaEvent **Event) { + CHECK_DXG_OPEN(); + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event) { + CHECK_DXG_OPEN(); + + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event) { + CHECK_DXG_OPEN(); + + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event) { + CHECK_DXG_OPEN(); + + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event) { + CHECK_DXG_OPEN(); + + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event, + HSAuint32 Milliseconds) { + return hsaKmtWaitOnEvent_Ext(Event, Milliseconds, NULL); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event, + HSAuint32 Milliseconds, + uint64_t *event_age) { + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + return hsaKmtWaitOnMultipleEvents_Ext(&Event, 1, true, Milliseconds, + event_age); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[], + HSAuint32 NumEvents, + bool WaitOnAll, + HSAuint32 Milliseconds) { + return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, WaitOnAll, + Milliseconds, NULL); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], + HSAuint32 NumEvents, + bool WaitOnAll, + HSAuint32 Milliseconds, + uint64_t *event_age) { + CHECK_DXG_OPEN(); + + if (!Events) + return HSAKMT_STATUS_INVALID_HANDLE; + + if (NumEvents == 1 && Events[0] == nullptr) { 
+ std::this_thread::sleep_for(std::chrono::microseconds(20)); + return HSAKMT_STATUS_SUCCESS; + } + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) { + CHECK_DXG_OPEN(); + + pr_debug("[%s] node %d\n", __func__, NodeId); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} diff --git a/globals.cpp b/globals.cpp new file mode 100644 index 0000000000..bb635fef4b --- /dev/null +++ b/globals.cpp @@ -0,0 +1,47 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "libhsakmt.h" + +// HSAKMT global data + +int kfd_fd = -1; +unsigned long dxg_open_count; +unsigned long system_properties_count; +pthread_mutex_t hsakmt_mutex = PTHREAD_MUTEX_INITIALIZER; +bool is_dgpu; + +#ifndef PAGE_SIZE +int PAGE_SIZE; +#endif + +int PAGE_SHIFT; + +/* whether to check all dGPUs in the topology support SVM API */ +bool is_svm_api_supported; +/* zfb is mainly used during emulation */ +int zfb_support; +/* handle vendor specific packet */ +int vendor_packet_support; \ No newline at end of file diff --git a/inc/pm4_cmds.h b/inc/pm4_cmds.h new file mode 100644 index 0000000000..82edddc8dd --- /dev/null +++ b/inc/pm4_cmds.h @@ -0,0 +1,1064 @@ +#ifndef _ROCR_PM4_CMDS_H_ +#define _ROCR_PM4_CMDS_H_ + +#include + +#define mmCOMPUTE_NUM_THREAD_X 0x2E07 +#define mmCOMPUTE_PGM_LO 0x2E0C +#define mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO 0x2E10 +#define mmCOMPUTE_PGM_RSRC1 0x2E12 +#define mmCOMPUTE_PGM_RSRC3 0x2E28 +#define mmCOMPUTE_RESOURCE_LIMITS 0x2E15 +#define mmCOMPUTE_USER_DATA_0 0x2E40 + +#define PM4_TYPE_SHIFT 30 +#define PM4_COUNT_SHIFT 16 +#define PM4_OPCODE_SHIFT 8 +#define PM4_SHADER_TYPE_SHIFT 1 + +#define PM4_GFX_SHADER 0 +#define PM4_COMPUTE_SHADER 1 + +#define PM4_TYPE3_HDR(_opc_, _count_) \ + (uint32_t)((3) << PM4_TYPE_SHIFT | \ + ((_count_) - 2) << PM4_COUNT_SHIFT | \ + (_opc_) << PM4_OPCODE_SHIFT) | \ + (PM4_COMPUTE_SHADER << PM4_SHADER_TYPE_SHIFT) + +union PM4_MEC_TYPE_3_HEADER { + struct { + uint32_t reserved1 : 8; ///< reserved + uint32_t opcode : 8; ///< IT opcode + uint32_t count : 14;///< number of DWORDs - 1 in the information body. + uint32_t type : 2; ///< packet identifier. 
It should be 3 for type 3 packets + }; + uint32_t u32All; +}; + +#define IT_DISPATCH_DIRECT 0x15 +#define IT_ATOMIC_MEM 0x1E +#define IT_INDIRECT_BUFFER 0x3F +#define IT_COPY_DATA 0x40 +#define IT_EVENT_WRITE 0x46 +#define IT_RELEASE_MEM 0x49 +#define IT_ACQUIRE_MEM 0x58 +#define IT_SET_SH_REG 0x76 + +struct PM4_MEC_SET_SH_REG { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t reg_offset:16; + uint32_t reserved1:16; + } bitfields2; + uint32_t ordinal2; + }; +}; + +struct PM4_MEC_DISPATCH_DIRECT { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + uint32_t dim_x; + uint32_t dim_y; + uint32_t dim_z; + uint32_t dispatch_initiator; +}; + +// ------------------------------- MEC_EVENT_WRITE_event_index_enum ------------------------------- +enum MEC_EVENT_WRITE_event_index_enum { + event_index__mec_event_write__other = 0, + event_index__mec_event_write__sample_pipelinestat = 2, + event_index__mec_event_write__cs_partial_flush = 4, + event_index__mec_event_write__sample_streamoutstats__GFX11 = 8, + event_index__mec_event_write__sample_streamoutstats1__GFX11 = 9, + event_index__mec_event_write__sample_streamoutstats2__GFX11 = 10, + event_index__mec_event_write__sample_streamoutstats3__GFX11 = 11, +}; + +enum VGT_EVENT_TYPE { + Reserved_0x00 = 0x00000000, + SAMPLE_STREAMOUTSTATS1 = 0x00000001, + SAMPLE_STREAMOUTSTATS2 = 0x00000002, + SAMPLE_STREAMOUTSTATS3 = 0x00000003, + CACHE_FLUSH_TS = 0x00000004, + CONTEXT_DONE = 0x00000005, + CACHE_FLUSH = 0x00000006, + CS_PARTIAL_FLUSH = 0x00000007, + VGT_STREAMOUT_SYNC = 0x00000008, + VGT_STREAMOUT_RESET = 0x0000000a, + END_OF_PIPE_INCR_DE = 0x0000000b, + END_OF_PIPE_IB_END = 0x0000000c, + RST_PIX_CNT = 0x0000000d, + BREAK_BATCH = 0x0000000e, + VS_PARTIAL_FLUSH = 0x0000000f, + PS_PARTIAL_FLUSH = 0x00000010, + FLUSH_HS_OUTPUT = 0x00000011, + FLUSH_DFSM = 0x00000012, + RESET_TO_LOWEST_VGT = 0x00000013, + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014, + 
CACHE_FLUSH_AND_INV_EVENT = 0x00000016, + PERFCOUNTER_START = 0x00000017, + PERFCOUNTER_STOP = 0x00000018, + PIPELINESTAT_START = 0x00000019, + PIPELINESTAT_STOP = 0x0000001a, + PERFCOUNTER_SAMPLE = 0x0000001b, + SAMPLE_PIPELINESTAT = 0x0000001e, + SO_VGTSTREAMOUT_FLUSH = 0x0000001f, + SAMPLE_STREAMOUTSTATS = 0x00000020, + RESET_VTX_CNT = 0x00000021, + BLOCK_CONTEXT_DONE = 0x00000022, + CS_CONTEXT_DONE = 0x00000023, + VGT_FLUSH = 0x00000024, + TGID_ROLLOVER = 0x00000025, + SQ_NON_EVENT = 0x00000026, + SC_SEND_DB_VPZ = 0x00000027, + BOTTOM_OF_PIPE_TS = 0x00000028, + FLUSH_SX_TS = 0x00000029, + DB_CACHE_FLUSH_AND_INV = 0x0000002a, + FLUSH_AND_INV_DB_DATA_TS = 0x0000002b, + FLUSH_AND_INV_DB_META = 0x0000002c, + FLUSH_AND_INV_CB_DATA_TS = 0x0000002d, + FLUSH_AND_INV_CB_META = 0x0000002e, + CS_DONE = 0x0000002f, + PS_DONE = 0x00000030, + FLUSH_AND_INV_CB_PIXEL_DATA = 0x00000031, + SX_CB_RAT_ACK_REQUEST = 0x00000032, + THREAD_TRACE_START = 0x00000033, + THREAD_TRACE_STOP = 0x00000034, + THREAD_TRACE_MARKER = 0x00000035, + THREAD_TRACE_FINISH = 0x00000037, + PIXEL_PIPE_STAT_CONTROL = 0x00000038, + PIXEL_PIPE_STAT_DUMP = 0x00000039, + PIXEL_PIPE_STAT_RESET = 0x0000003a, + CONTEXT_SUSPEND = 0x0000003b, + OFFCHIP_HS_DEALLOC = 0x0000003c, + ENABLE_NGG_PIPELINE = 0x0000003d, + SET_FE_ID__GFX09 = 0x00000009, + Available_0x1c__GFX09 = 0x0000001c, + Available_0x1d__GFX09 = 0x0000001d, + THREAD_TRACE_FLUSH__GFX09 = 0x00000036, + Reserved_0x3f__GFX09 = 0x0000003f, + ZPASS_DONE__GFX09_10 = 0x00000015, + ENABLE_LEGACY_PIPELINE__GFX09_10 = 0x0000003e, + Reserved_0x09__GFX10PLUS = 0x00000009, + FLUSH_ES_OUTPUT__GFX10PLUS = 0x0000001c, + BIN_CONF_OVERRIDE_CHECK__GFX10PLUS = 0x0000001d, + THREAD_TRACE_DRAW__GFX10PLUS = 0x00000036, + DRAW_DONE__GFX10PLUS = 0x0000003f, + WAIT_SYNC__GFX11 = 0x00000015, + ENABLE_PIPELINE_NOT_USED__GFX11 = 0x0000003e, +}; + +struct PM4_MEC_EVENT_WRITE { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t 
event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t reserved2:19; + uint32_t offload_enable:1; + } bitfields2; + uint32_t ordinal2; + }; +}; + +struct PM4_MEC_ATOMIC_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t atomic:7; + uint32_t reserved1:1; + uint32_t command:4; + uint32_t reserved2:13; + uint32_t cache_policy:2; + uint32_t reserved3:5; + } bitfields2; + uint32_t ordinal2; + }; + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t src_data_lo; + uint32_t src_data_hi; + uint32_t cmp_data_lo; + uint32_t cmp_data_hi; + union { + struct { + uint32_t loop_interval:13; + uint32_t reserved4:19; + } bitfields9; + uint32_t ordinal9; + }; +}; + +struct PM4_MEC_WRITE_DATA { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t reserved1:8; + uint32_t dst_sel:4; + uint32_t reserved2:4; + uint32_t addr_incr:1; + uint32_t reserved3:2; + uint32_t resume_vf:1; + uint32_t wr_confirm:1; + uint32_t reserved4:4; + uint32_t cache_policy:2; + uint32_t reserved5:5; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t dst_mmreg_addr:18; + uint32_t reserved6:14; + } bitfields3a; + struct { + uint32_t dst_gds_addr:16; + uint32_t reserved7:16; + } bitfields3b; + struct { + uint32_t reserved8:2; + uint32_t dst_mem_addr_lo:30; + } bitfields3c; + uint32_t ordinal3; + }; + uint32_t dst_mem_addr_hi; +}; + +#define PERSISTENT_SPACE_START 0x00002c00 + +template +void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) { + pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_SH_REG, + sizeof(T) / sizeof(uint32_t)); + pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - PERSISTENT_SPACE_START; +} + +template +void GenerateCmdHeader(T* pm4, int op_code) { + pm4->header.u32All = PM4_TYPE3_HDR(op_code, sizeof(T) / sizeof(uint32_t)); +} + +/// @brief Defines the Gpu command to dispatch a kernel. 
It embeds +/// various Gpu hardware specific data structures for initialization +/// and configuration before a dispatch begins to run +struct DispatchTemplate { + + /// @brief Structure used to initialize the group dimensions + /// of a kernel dispatch and if performance counters are enabled + struct DispatchDimensionRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_num_thread_x; + uint32_t compute_num_thread_y; + uint32_t compute_num_thread_z; + } dimension_regs; + + struct DispatchProgramRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_lo; + uint32_t compute_pgm_hi; + } program_regs; + + struct DispatchProgramResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_rsrc1; + uint32_t compute_pgm_rsrc2; + } program_resource_regs; + + /// @brief Structure used to initialize parameters related to + /// thread management i.e. number of waves to issue and number + /// of Compute Units to use + struct DispatchResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_resource_limits; + uint32_t compute_static_thread_mgmt_se0; + uint32_t compute_static_thread_mgmt_se1; + uint32_t compute_tmpring_size; + uint32_t compute_static_thread_mgmt_se2; + uint32_t compute_static_thread_mgmt_se3; + } resource_regs; + + /// @brief Structure used to pass handles of the Aql dispatch + /// packet, Aql queue, Kernel argument address block, Scratch + /// buffer + struct DispatchComputeUserDataRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_user_data[16]; + } compute_user_data_regs; + + /// @brief Structure used to configure Cache flush policy + /// and dimensions of total work size + PM4_MEC_DISPATCH_DIRECT dispatch_direct; +}; + +struct DispatchProgramResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_rsrc3; +}; + + +/// @brief Structure used to issue a programming scratch command for gfx11+ +struct SetScratchTemplate { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t scratch_lo; + uint32_t scratch_hi; 
+}; + +/// @brief Structure used to issue a Gpu Barrier command +struct BarrierTemplate { + PM4_MEC_EVENT_WRITE event_write; +}; + +//--------------------MEC_ATOMIC_MEM-------------------- +enum MEC_ATOMIC_MEM_command_enum { + command__mec_atomic_mem__single_pass_atomic = 0, + command__mec_atomic_mem__loop_until_compare_satisfied = 1, + command__mec_atomic_mem__wait_for_write_confirmation = 2, + command__mec_atomic_mem__send_and_continue = 3, +}; + +enum MEC_ATOMIC_MEM_cache_policy_enum { + cache_policy__mec_atomic_mem__lru = 0, + cache_policy__mec_atomic_mem__stream = 1, + cache_policy__mec_atomic_mem__noa = 2, + cache_policy__mec_atomic_mem__bypass = 3, +}; + +enum TC_OP { + TC_OP_READ = 0x00000000, + TC_OP_ATOMIC_FCMPSWAP_RTN_32 = 0x00000001, + TC_OP_ATOMIC_FMIN_RTN_32 = 0x00000002, + TC_OP_ATOMIC_FMAX_RTN_32 = 0x00000003, + TC_OP_RESERVED_FOP_RTN_32_0 = 0x00000004, + TC_OP_RESERVED_FOP_RTN_32_2 = 0x00000006, + TC_OP_ATOMIC_SWAP_RTN_32 = 0x00000007, + TC_OP_ATOMIC_CMPSWAP_RTN_32 = 0x00000008, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_RTN_32 = 0x00000009, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_RTN_32 = 0x0000000a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_RTN_32 = 0x0000000b, + TC_OP_PROBE_FILTER = 0x0000000c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_2 = 0x0000000e, + TC_OP_ATOMIC_ADD_RTN_32 = 0x0000000f, + TC_OP_ATOMIC_SUB_RTN_32 = 0x00000010, + TC_OP_ATOMIC_SMIN_RTN_32 = 0x00000011, + TC_OP_ATOMIC_UMIN_RTN_32 = 0x00000012, + TC_OP_ATOMIC_SMAX_RTN_32 = 0x00000013, + TC_OP_ATOMIC_UMAX_RTN_32 = 0x00000014, + TC_OP_ATOMIC_AND_RTN_32 = 0x00000015, + TC_OP_ATOMIC_OR_RTN_32 = 0x00000016, + TC_OP_ATOMIC_XOR_RTN_32 = 0x00000017, + TC_OP_ATOMIC_INC_RTN_32 = 0x00000018, + TC_OP_ATOMIC_DEC_RTN_32 = 0x00000019, + TC_OP_WBINVL1_VOL = 0x0000001a, + TC_OP_WBINVL1_SD = 0x0000001b, + TC_OP_RESERVED_NON_FLOAT_RTN_32_0 = 0x0000001c, + TC_OP_RESERVED_NON_FLOAT_RTN_32_1 = 0x0000001d, + TC_OP_RESERVED_NON_FLOAT_RTN_32_2 = 0x0000001e, + TC_OP_RESERVED_NON_FLOAT_RTN_32_3 = 0x0000001f, + TC_OP_WRITE = 
0x00000020, + TC_OP_ATOMIC_FCMPSWAP_RTN_64 = 0x00000021, + TC_OP_ATOMIC_FMIN_RTN_64 = 0x00000022, + TC_OP_ATOMIC_FMAX_RTN_64 = 0x00000023, + TC_OP_RESERVED_FOP_RTN_64_0 = 0x00000024, + TC_OP_RESERVED_FOP_RTN_64_1 = 0x00000025, + TC_OP_RESERVED_FOP_RTN_64_2 = 0x00000026, + TC_OP_ATOMIC_SWAP_RTN_64 = 0x00000027, + TC_OP_ATOMIC_CMPSWAP_RTN_64 = 0x00000028, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_RTN_64 = 0x00000029, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_RTN_64 = 0x0000002a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_RTN_64 = 0x0000002b, + TC_OP_WBINVL2_SD = 0x0000002c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_64_0 = 0x0000002d, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_64_1 = 0x0000002e, + TC_OP_ATOMIC_ADD_RTN_64 = 0x0000002f, + TC_OP_ATOMIC_SUB_RTN_64 = 0x00000030, + TC_OP_ATOMIC_SMIN_RTN_64 = 0x00000031, + TC_OP_ATOMIC_UMIN_RTN_64 = 0x00000032, + TC_OP_ATOMIC_SMAX_RTN_64 = 0x00000033, + TC_OP_ATOMIC_UMAX_RTN_64 = 0x00000034, + TC_OP_ATOMIC_AND_RTN_64 = 0x00000035, + TC_OP_ATOMIC_OR_RTN_64 = 0x00000036, + TC_OP_ATOMIC_XOR_RTN_64 = 0x00000037, + TC_OP_ATOMIC_INC_RTN_64 = 0x00000038, + TC_OP_ATOMIC_DEC_RTN_64 = 0x00000039, + TC_OP_WBL2_NC = 0x0000003a, + TC_OP_WBL2_WC = 0x0000003b, + TC_OP_RESERVED_NON_FLOAT_RTN_64_1 = 0x0000003c, + TC_OP_RESERVED_NON_FLOAT_RTN_64_2 = 0x0000003d, + TC_OP_RESERVED_NON_FLOAT_RTN_64_3 = 0x0000003e, + TC_OP_RESERVED_NON_FLOAT_RTN_64_4 = 0x0000003f, + TC_OP_WBINVL1 = 0x00000040, + TC_OP_ATOMIC_FCMPSWAP_32 = 0x00000041, + TC_OP_ATOMIC_FMIN_32 = 0x00000042, + TC_OP_ATOMIC_FMAX_32 = 0x00000043, + TC_OP_RESERVED_FOP_32_0 = 0x00000044, + TC_OP_RESERVED_FOP_32_2 = 0x00000046, + TC_OP_ATOMIC_SWAP_32 = 0x00000047, + TC_OP_ATOMIC_CMPSWAP_32 = 0x00000048, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_32 = 0x00000049, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_32 = 0x0000004a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_32 = 0x0000004b, + TC_OP_INV_METADATA = 0x0000004c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_32_2 = 0x0000004e, + TC_OP_ATOMIC_ADD_32 = 0x0000004f, + TC_OP_ATOMIC_SUB_32 = 0x00000050, + 
TC_OP_ATOMIC_SMIN_32 = 0x00000051, + TC_OP_ATOMIC_UMIN_32 = 0x00000052, + TC_OP_ATOMIC_SMAX_32 = 0x00000053, + TC_OP_ATOMIC_UMAX_32 = 0x00000054, + TC_OP_ATOMIC_AND_32 = 0x00000055, + TC_OP_ATOMIC_OR_32 = 0x00000056, + TC_OP_ATOMIC_XOR_32 = 0x00000057, + TC_OP_ATOMIC_INC_32 = 0x00000058, + TC_OP_ATOMIC_DEC_32 = 0x00000059, + TC_OP_INVL2_NC = 0x0000005a, + TC_OP_NOP_RTN0 = 0x0000005b, + TC_OP_RESERVED_NON_FLOAT_32_1 = 0x0000005c, + TC_OP_RESERVED_NON_FLOAT_32_2 = 0x0000005d, + TC_OP_RESERVED_NON_FLOAT_32_3 = 0x0000005e, + TC_OP_RESERVED_NON_FLOAT_32_4 = 0x0000005f, + TC_OP_WBINVL2 = 0x00000060, + TC_OP_ATOMIC_FCMPSWAP_64 = 0x00000061, + TC_OP_ATOMIC_FMIN_64 = 0x00000062, + TC_OP_ATOMIC_FMAX_64 = 0x00000063, + TC_OP_RESERVED_FOP_64_0 = 0x00000064, + TC_OP_RESERVED_FOP_64_1 = 0x00000065, + TC_OP_RESERVED_FOP_64_2 = 0x00000066, + TC_OP_ATOMIC_SWAP_64 = 0x00000067, + TC_OP_ATOMIC_CMPSWAP_64 = 0x00000068, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_64 = 0x00000069, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_64 = 0x0000006a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_64 = 0x0000006b, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_0 = 0x0000006c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_1 = 0x0000006d, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_2 = 0x0000006e, + TC_OP_ATOMIC_ADD_64 = 0x0000006f, + TC_OP_ATOMIC_SUB_64 = 0x00000070, + TC_OP_ATOMIC_SMIN_64 = 0x00000071, + TC_OP_ATOMIC_UMIN_64 = 0x00000072, + TC_OP_ATOMIC_SMAX_64 = 0x00000073, + TC_OP_ATOMIC_UMAX_64 = 0x00000074, + TC_OP_ATOMIC_AND_64 = 0x00000075, + TC_OP_ATOMIC_OR_64 = 0x00000076, + TC_OP_ATOMIC_XOR_64 = 0x00000077, + TC_OP_ATOMIC_INC_64 = 0x00000078, + TC_OP_ATOMIC_DEC_64 = 0x00000079, + TC_OP_WBINVL2_NC = 0x0000007a, + TC_OP_NOP_ACK = 0x0000007b, + TC_OP_RESERVED_NON_FLOAT_64_1 = 0x0000007c, + TC_OP_RESERVED_NON_FLOAT_64_2 = 0x0000007d, + TC_OP_RESERVED_NON_FLOAT_64_3 = 0x0000007e, + TC_OP_RESERVED_NON_FLOAT_64_4 = 0x0000007f, + TC_OP_RESERVED_FOP_RTN_32_1__GFX09_10 = 0x00000005, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_1__GFX09_10 = 0x0000000d, + 
TC_OP_RESERVED_FOP_32_1__GFX09_10 = 0x00000045, + TC_OP_RESERVED_FOP_FLUSH_DENORM_32_1__GFX09_10 = 0x0000004d, + TC_OP_RESERVED_FADD_RTN_32__GFX11 = 0x00000005, + TC_OP_ATOMIC_FADD_FLUSH_DENORM_RTN_32__GFX11 = 0x0000000d, + TC_OP_RESERVED_FADD_32__GFX11 = 0x00000045, + TC_OP_ATOMIC_FADD_FLUSH_DENORM_32__GFX11 = 0x0000004d, +}; + +// Desc: Structure used to perform various atomic +// operations - add, subtract, increment, etc +struct AtomicTemplate { + PM4_MEC_ATOMIC_MEM atomic; +}; + +/// @brief PM4 command to write a 64-bit value into a memory +/// location accessible to Gpu +struct WriteDataTemplate { + PM4_MEC_WRITE_DATA write_data; + uint64_t write_data_value; +}; + +// ---------------------------------- MEC_COPY_DATA_src_sel_enum ---------------------------------- +enum MEC_COPY_DATA_src_sel_enum { + src_sel__mec_copy_data__mem_mapped_register = 0, + src_sel__mec_copy_data__tc_l2_obsolete = 1, + src_sel__mec_copy_data__tc_l2 = 2, + src_sel__mec_copy_data__gds = 3, + src_sel__mec_copy_data__perfcounters = 4, + src_sel__mec_copy_data__immediate_data = 5, + src_sel__mec_copy_data__atomic_return_data = 6, + src_sel__mec_copy_data__gds_atomic_return_data0 = 7, + src_sel__mec_copy_data__gds_atomic_return_data1 = 8, + src_sel__mec_copy_data__gpu_clock_count = 9, + src_sel__mec_copy_data__system_clock_count = 10, + src_sel__mec_copy_data__ext32perfcntr = 11, +}; + +// ---------------------------------- MEC_COPY_DATA_dst_sel_enum ---------------------------------- +enum MEC_COPY_DATA_dst_sel_enum { + dst_sel__mec_copy_data__mem_mapped_register = 0, + dst_sel__mec_copy_data__tc_l2 = 2, + dst_sel__mec_copy_data__gds = 3, + dst_sel__mec_copy_data__perfcounters = 4, + dst_sel__mec_copy_data__tc_l2_obsolete = 5, + dst_sel__mec_copy_data__mem_mapped_reg_dc = 6, + dst_sel__mec_copy_data__ext32perfcntr = 11, +}; + +// ------------------------------ MEC_COPY_DATA_src_cache_policy_enum ------------------------------ +enum MEC_COPY_DATA_src_cache_policy_enum { + 
src_cache_policy__mec_copy_data__lru = 0, + src_cache_policy__mec_copy_data__stream = 1, + src_cache_policy__mec_copy_data__noa = 2, + src_cache_policy__mec_copy_data__bypass = 3, +}; + +// --------------------------------- MEC_COPY_DATA_count_sel_enum --------------------------------- +enum MEC_COPY_DATA_count_sel_enum { + count_sel__mec_copy_data__32_bits_of_data = 0, + count_sel__mec_copy_data__64_bits_of_data = 1, +}; + +// --------------------------------- MEC_COPY_DATA_wr_confirm_enum --------------------------------- +enum MEC_COPY_DATA_wr_confirm_enum { + wr_confirm__mec_copy_data__do_not_wait_for_confirmation = 0, + wr_confirm__mec_copy_data__wait_for_confirmation = 1, +}; + +// ------------------------------ MEC_COPY_DATA_dst_cache_policy_enum ------------------------------ +enum MEC_COPY_DATA_dst_cache_policy_enum { + dst_cache_policy__mec_copy_data__lru = 0, + dst_cache_policy__mec_copy_data__stream = 1, + dst_cache_policy__mec_copy_data__noa = 2, + dst_cache_policy__mec_copy_data__bypass = 3, +}; + +// ------------------------------- MEC_COPY_DATA_pq_exe_status_enum ------------------------------- +enum MEC_COPY_DATA_pq_exe_status_enum { + pq_exe_status__mec_copy_data__default = 0, + pq_exe_status__mec_copy_data__phase_update = 1, +}; + +typedef struct PM4_MEC_COPY_DATA { + union { + PM4_MEC_TYPE_3_HEADER header; /// header + uint32_t ordinal1; + }; + union { + struct { + uint32_t src_sel : 4; + uint32_t reserved1 : 4; + uint32_t dst_sel : 4; + uint32_t reserved2 : 1; + uint32_t src_cache_policy : 2; + uint32_t reserved3 : 1; + uint32_t count_sel : 1; + uint32_t reserved4 : 3; + uint32_t wr_confirm : 1; + uint32_t reserved5 : 4; + uint32_t dst_cache_policy : 2; + uint32_t reserved6 : 2; + uint32_t pq_exe_status : 1; + uint32_t reserved7 : 2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t src_reg_offset : 18; + uint32_t reserved8 : 14; + } bitfields3a; + struct { + uint32_t reserved9 : 2; + uint32_t src_32b_addr_lo : 30; + } 
bitfields3b; + struct { + uint32_t reserved10 : 3; + uint32_t src_64b_addr_lo : 29; + } bitfields3c; + struct { + uint32_t src_gds_addr_lo : 16; + uint32_t reserved11 : 16; + } bitfields3d; + uint32_t imm_data; + uint32_t ordinal3; + }; + union { + uint32_t src_memtc_addr_hi; + uint32_t src_imm_data; + uint32_t ordinal4; + }; + union { + struct { + uint32_t dst_reg_offset : 18; + uint32_t reserved12 : 14; + } bitfields5a; + struct { + uint32_t reserved13 : 2; + uint32_t dst_32b_addr_lo : 30; + } bitfields5b; + struct { + uint32_t reserved14 : 3; + uint32_t dst_64b_addr_lo : 29; + } bitfields5c; + struct { + uint32_t dst_gds_addr_lo : 16; + uint32_t reserved15 : 16; + } bitfields5d; + uint32_t ordinal5; + }; + uint32_t dst_addr_hi; +} PM4MEC_COPY_DATA; +namespace gfx9 { + +struct PM4_MEC_ACQUIRE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t coher_cntl:31; + uint32_t reserved1:1; + } bitfields2; + uint32_t ordinal2; + }; + uint32_t coher_size; + union { + struct { + uint32_t coher_size_hi:8; + uint32_t reserved2:24; + } bitfields4; + uint32_t ordinal4; + }; + uint32_t coher_base_lo; + union { + struct { + uint32_t coher_base_hi:24; + uint32_t reserved3:8; + } bitfields6; + uint32_t ordinal6; + }; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved4:16; + } bitfields7; + uint32_t ordinal7; + }; +}; + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t tcl1_vol_action_ena:1; + uint32_t tc_vol_action_ena:1; + uint32_t reserved2:1; + uint32_t tc_wb_action_ena:1; + uint32_t tcl1_action_ena:1; + uint32_t tc_action_ena:1; + uint32_t reserved3:1; + uint32_t tc_nc_action_ena:1; + uint32_t tc_wc_action_ena:1; + uint32_t tc_md_action_ena:1; + uint32_t reserved4:3; + uint32_t cache_policy:2; + uint32_t reserved5:2; + uint32_t pq_exe_status:1; + 
uint32_t reserved6:2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved7:16; + uint32_t dst_sel:2; + uint32_t reserved8:6; + uint32_t int_sel:3; + uint32_t reserved9:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved10:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved11:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved12; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved13; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved14; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved15; + uint32_t reserved16; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct PM4_MEC_WAIT_REG_MEM64 { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t function:3; + uint32_t reserved1:1; + uint32_t mem_space:2; + uint32_t operation:2; + uint32_t reserved2:24; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved3:3; + uint32_t mem_poll_addr_lo:29; + } bitfields3a; + struct { + uint32_t reg_poll_addr:18; + uint32_t reserved4:14; + } bitfields3b; + struct { + uint32_t reg_write_addr1:18; + uint32_t reserved5:14; + } bitfields3c; + uint32_t ordinal3; + }; + union { + uint32_t mem_poll_addr_hi; + struct { + uint32_t reg_write_addr2:18; + uint32_t reserved6:14; + } bitfields4b; + uint32_t ordinal4; + }; + uint32_t reference; + uint32_t reference_hi; + uint32_t mask; + uint32_t mask_hi; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved7:16; + } bitfields9; + uint32_t ordinal9; + }; +}; + +/// @brief Structure used to configure the flushing of +/// various caches - instruction, constants, L1 and L2 +struct AcquireMemTemplate { + PM4_MEC_ACQUIRE_MEM 
acquire_mem; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +/// @brief PM4 command to wait for a certain event before proceeding +/// to process another command on the queue +struct WaitRegMem64Template { + PM4_MEC_WAIT_REG_MEM64 wait_reg_mem; +}; + +} // gfx9 namespace + +namespace gfx10 { + +struct PM4_MEC_ACQUIRE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + uint32_t reserved1; + uint32_t coher_size; + union { + struct { + uint32_t coher_size_hi:8; + uint32_t reserved2:24; + } bitfields4; + uint32_t ordinal4; + }; + uint32_t coher_base_lo; + union { + struct { + uint32_t coher_base_hi:24; + uint32_t reserved3:8; + } bitfields6; + uint32_t ordinal6; + }; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved4:16; + } bitfields7; + uint32_t ordinal7; + }; + union { + struct { + uint32_t gcr_cntl:19; + uint32_t reserved4:13; + } bitfields8; + uint32_t ordinal8; + }; +}; + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t gcr_cntl:12; + uint32_t reserved2:1; + uint32_t cache_policy:2; + uint32_t reserved3:2; + uint32_t pq_exe_status:1; + uint32_t reserved4:2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved7:16; + uint32_t dst_sel:2; + uint32_t reserved8:2; + uint32_t mes_intr_pipe:2; + uint32_t mes_action_id:2; + uint32_t int_sel:3; + uint32_t reserved9:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved10:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved11:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved12; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved13; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t 
dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved14; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved15; + uint32_t reserved16; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct PM4_MEC_WAIT_REG_MEM64 { + union { + PM4_MEC_TYPE_3_HEADER header; ///header + uint32_t ordinal1; + }; + union { + struct { + uint32_t function:3; + uint32_t reserved1:1; + uint32_t mem_space:2; + uint32_t operation:2; + uint32_t reserved2:14; + uint32_t mes_intr_pipe:2; + uint32_t mes_action:1; + uint32_t cache_policy:2; + uint32_t reserved3:5; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved4:3; + uint32_t mem_poll_addr_lo:29; + } bitfields3a; + struct { + uint32_t reg_poll_addr:18; + uint32_t reserved5:14; + } bitfields3b; + struct { + uint32_t reg_write_addr1:18; + uint32_t reserved6:14; + } bitfields3c; + uint32_t ordinal3; + }; + union { + uint32_t mem_poll_addr_hi; + struct { + uint32_t reg_write_addr2:18; + uint32_t reserved7:14; + } bitfields4b; + uint32_t ordinal4; + }; + uint32_t reference; + uint32_t reference_hi; + uint32_t mask; + uint32_t mask_hi; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved8:15; + uint32_t optimize_ace_offload_mode:1; + } bitfields9; + uint32_t ordinal9; + }; +}; + +/// @brief Structure used to configure the flushing of +/// various caches - instruction, constants, L1 and L2 +struct AcquireMemTemplate { + PM4_MEC_ACQUIRE_MEM acquire_mem; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +struct WaitRegMem64Template { + PM4_MEC_WAIT_REG_MEM64 wait_reg_mem; +}; + +} // gfx10 namespace + +namespace gfx11 { + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t gcr_cntl:13; + uint32_t cache_policy:2; + uint32_t reserved2:1; + uint32_t 
pq_exe_status:1; + uint32_t reserved3:1; + uint32_t glk_inv:1; + uint32_t reserved4:1; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved5:16; + uint32_t dst_sel:2; + uint32_t reserved6:2; + uint32_t mes_intr_pipe:2; + uint32_t mes_action_id:2; + uint32_t int_sel:3; + uint32_t reserved7:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved8:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved9:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved10; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved11; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved12; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved13; + uint32_t reserved14; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +} // gfx11 namespace + +#endif diff --git a/inc/registers.h b/inc/registers.h new file mode 100644 index 0000000000..067b808b90 --- /dev/null +++ b/inc/registers.h @@ -0,0 +1,363 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// This file is used only for open source cmake builds, if we hardcode the +// register values in amd_aql_queue.cpp then this file won't be required. For +// now we are using this file where register details are spelled out in the +// structs/unions below. 
+#ifndef HSA_RUNTME_CORE_INC_REGISTERS_H_ +#define HSA_RUNTME_CORE_INC_REGISTERS_H_ + +typedef enum SQ_RSRC_BUF_TYPE { +SQ_RSRC_BUF = 0x00000000, +SQ_RSRC_BUF_RSVD_1 = 0x00000001, +SQ_RSRC_BUF_RSVD_2 = 0x00000002, +SQ_RSRC_BUF_RSVD_3 = 0x00000003, +} SQ_RSRC_BUF_TYPE; + +typedef enum BUF_DATA_FORMAT { +BUF_DATA_FORMAT_INVALID = 0x00000000, +BUF_DATA_FORMAT_8 = 0x00000001, +BUF_DATA_FORMAT_16 = 0x00000002, +BUF_DATA_FORMAT_8_8 = 0x00000003, +BUF_DATA_FORMAT_32 = 0x00000004, +BUF_DATA_FORMAT_16_16 = 0x00000005, +BUF_DATA_FORMAT_10_11_11 = 0x00000006, +BUF_DATA_FORMAT_11_11_10 = 0x00000007, +BUF_DATA_FORMAT_10_10_10_2 = 0x00000008, +BUF_DATA_FORMAT_2_10_10_10 = 0x00000009, +BUF_DATA_FORMAT_8_8_8_8 = 0x0000000a, +BUF_DATA_FORMAT_32_32 = 0x0000000b, +BUF_DATA_FORMAT_16_16_16_16 = 0x0000000c, +BUF_DATA_FORMAT_32_32_32 = 0x0000000d, +BUF_DATA_FORMAT_32_32_32_32 = 0x0000000e, +BUF_DATA_FORMAT_RESERVED_15 = 0x0000000f, +} BUF_DATA_FORMAT; + +typedef enum BUF_NUM_FORMAT { +BUF_NUM_FORMAT_UNORM = 0x00000000, +BUF_NUM_FORMAT_SNORM = 0x00000001, +BUF_NUM_FORMAT_USCALED = 0x00000002, +BUF_NUM_FORMAT_SSCALED = 0x00000003, +BUF_NUM_FORMAT_UINT = 0x00000004, +BUF_NUM_FORMAT_SINT = 0x00000005, +BUF_NUM_FORMAT_SNORM_OGL__SI__CI = 0x00000006, +BUF_NUM_FORMAT_RESERVED_6__VI = 0x00000006, +BUF_NUM_FORMAT_FLOAT = 0x00000007, +} BUF_NUM_FORMAT; + +typedef enum BUF_FORMAT { +BUF_FORMAT_32_UINT = 0x00000014, +} BUF_FORMAT; + +typedef enum SQ_SEL_XYZW01 { +SQ_SEL_0 = 0x00000000, +SQ_SEL_1 = 0x00000001, +SQ_SEL_RESERVED_0 = 0x00000002, +SQ_SEL_RESERVED_1 = 0x00000003, +SQ_SEL_X = 0x00000004, +SQ_SEL_Y = 0x00000005, +SQ_SEL_Z = 0x00000006, +SQ_SEL_W = 0x00000007, +} SQ_SEL_XYZW01; + + union COMPUTE_TMPRING_SIZE { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 13; + unsigned int : 7; +#elif defined(BIGENDIAN_CPU) + unsigned int : 7; + unsigned int WAVESIZE : 13; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + 
signed int i32All; + float f32All; + }; + + union COMPUTE_TMPRING_SIZE_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 15; + unsigned int : 5; +#elif defined(BIGENDIAN_CPU) + unsigned int : 5; + unsigned int WAVESIZE : 15; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union COMPUTE_TMPRING_SIZE_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 18; + unsigned int : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int : 2; + unsigned int WAVESIZE : 18; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union SQ_BUF_RSRC_WORD0 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS : 32; +#elif defined(BIGENDIAN_CPU) + unsigned int BASE_ADDRESS : 32; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD1 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : 16; + unsigned int STRIDE : 14; + unsigned int CACHE_SWIZZLE : 1; + unsigned int SWIZZLE_ENABLE : 1; +#elif defined(BIGENDIAN_CPU) + unsigned int SWIZZLE_ENABLE : 1; + unsigned int CACHE_SWIZZLE : 1; + unsigned int STRIDE : 14; + unsigned int BASE_ADDRESS_HI : 16; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union SQ_BUF_RSRC_WORD1_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : 16; + unsigned int STRIDE : 14; + unsigned int SWIZZLE_ENABLE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int SWIZZLE_ENABLE : 2; + unsigned int STRIDE : 14; + unsigned int BASE_ADDRESS_HI : 16; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD2 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int NUM_RECORDS : 32; 
+#elif defined(BIGENDIAN_CPU) + unsigned int NUM_RECORDS : 32; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD3 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int NUM_FORMAT : 3; + unsigned int DATA_FORMAT : 4; + unsigned int ELEMENT_SIZE : 2; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int ATC__CI__VI : 1; + unsigned int HASH_ENABLE : 1; + unsigned int HEAP : 1; + unsigned int MTYPE__CI__VI : 3; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int MTYPE__CI__VI : 3; + unsigned int HEAP : 1; + unsigned int HASH_ENABLE : 1; + unsigned int ATC__CI__VI : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int ELEMENT_SIZE : 2; + unsigned int DATA_FORMAT : 4; + unsigned int NUM_FORMAT : 3; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union SQ_BUF_RSRC_WORD3_GFX10 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 7; + unsigned int RESERVED1 : 2; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int RESOURCE_LEVEL : 1; + unsigned int RESERVED2 : 3; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int RESERVED2 : 3; + unsigned int RESOURCE_LEVEL : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 2; + unsigned int FORMAT : 7; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; 
+ unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + // From V# Table + union SQ_BUF_RSRC_WORD3_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 6; + unsigned int RESERVED1 : 3; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int RESERVED2 : 4; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int RESERVED2 : 4; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 3; + unsigned int FORMAT : 6; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + // From V# Table + union SQ_BUF_RSRC_WORD3_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 6; + unsigned int RESERVED1 : 3; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int COMPRESSION_EN : 1; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int COMPRESSION_EN : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 3; + unsigned int FORMAT : 6; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int 
DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; +#endif // header guard diff --git a/inc/rocr_proxy/rocr_proxy.h b/inc/rocr_proxy/rocr_proxy.h new file mode 100644 index 0000000000..60358cdb90 --- /dev/null +++ b/inc/rocr_proxy/rocr_proxy.h @@ -0,0 +1,131 @@ +#ifndef _ROCR_PROXY_H_ +#define _ROCR_PROXY_H_ + +#include <vector> /* NOTE(review): header name was lost in extraction; <vector> is what DeviceInfo::sdma_schedid requires - confirm against upstream */ + +namespace rocr_proxy { +enum AllocDomain { + kSystem, + kLocal, + kUserMemory, + kUserQueue, + kDomainCount, +}; + +enum MemFlag { + kFineGrain = (1ULL << 0), + kKernarg = (1ULL << 1), +}; + +enum EngineFlag { + KCOMPUTE0 = (1ULL << 0), + KDRMDMA = (1ULL << 1), + KDRMDMA1 = (1ULL << 2), +}; + +enum SchedLevel { + kLow = 0, + kNormal = 1, + kHigh = 2, +}; + +enum AsicFamilyType { + kVega10, + kNavi10, + kSiennaCichlid, + kPlumBONITO, + kNavi44, + kNavi48 +}; + +struct HwsInfo { + union { + struct { + uint32_t gfxHwsEnabled : 1; + uint32_t computeHwsEnabled : 1; + uint32_t dmaHwsEnabled : 1; + uint32_t dma1HwsEnabled : 1; + uint32_t reserved : 28; + } hwsMask; + uint32_t osHwsEnableFlags; + }; + uint64_t engineOrdinalMask; // Indicates which engines (by ordinal) support MES HWS +}; + +typedef struct { + int major; + int minor; + int stepping; + bool is_dgpu; + char product_name[MAX_PATH]; + const char *uuid; + AsicFamilyType family; + uint32_t device_id; + uint32_t wavefront_size; + uint32_t compute_unit_count; + uint32_t max_engine_clock_mhz; + uint32_t watch_points_num; + uint32_t pci_bus_addr; + uint32_t memory_bus_width; + uint32_t max_memory_clock_mhz; + uint64_t gpu_counter_frequency; + uint32_t wave_per_cu; + uint32_t simd_per_cu; + uint32_t max_scratch_slots_per_cu; + uint32_t num_shader_engine; + uint32_t shader_array_per_shader_engine; + uint32_t domain; + uint32_t num_gws; + uint32_t asic_revision; + uint64_t local_visible_heap_size; + uint64_t local_invisible_heap_size; + uint64_t private_aperture_base; + uint64_t private_aperture_size; + uint64_t shared_aperture_base; + uint64_t 
shared_aperture_size; + uint32_t user_queue_size; + uint32_t lds_size; + uint32_t big_page_alignment_size; + uint32_t hw_big_page_min_alignment_size; + uint32_t hw_big_page_alignment_size; + bool enable_big_page_alignment; + uint32_t mec_fw_version; + uint32_t sdma_fw_version; + uint32_t l1_cache_size; + uint32_t l2_cache_size; + uint32_t l3_cache_size; + uint32_t gl2_cacheline_size; + uint32_t num_cp_queues; + HwsInfo hwsInfo; + std::vector<uint32_t> sdma_schedid; /* one entry per SDMA engine: WDDMDevice::NumSdmaEngine() reports size(). NOTE(review): element type restored as uint32_t to match WDDMDevice::GetSdmaEngine() - confirm against upstream */ + uint32_t compute_schedid; + bool state_shadowing_by_cpfw; + bool platform_atomic_support; + void *adapter_info; + void *adapter_ex_info; +} DeviceInfo; + +int EngineOrdinal(int engine, DeviceInfo *device_info); +bool GetHwsEnabled(int engine, DeviceInfo *device_info); +bool ShouldDisableGpuTimeout(int engine, DeviceInfo *device_info); +bool ParseAdapterInfo(D3DKMT_HANDLE adapter, DeviceInfo *device_info); +bool QueryAdapterSupported(D3DKMT_HANDLE adapter); + +uint32_t QueueEngine2EngineFlag(uint32_t queue_engine); +void SetAllocationInfo(void *data, uint64_t size, AllocDomain domain, + uint64_t addr, uint32_t mem_flags, uint32_t engine_flag, const DeviceInfo &device_info); +bool CreatePrivateAllocInfo(int num_handles, void **ppdrv_priv, void **ppalloc_priv, + int *pdrv_priv_data_size, int *palloc_priv_data_size); +void DestroyPrivateAllocInfo(void *drv_priv, void *alloc_priv); + +int CreateSubmitPrivData(void **priv_data, D3DKMT_HANDLE queue, uint64_t command_addr, + uint64_t command_size, bool is_hw_queue); +int CreateHwQueuePrivData(void **priv_data, D3DKMT_HANDLE context, + bool FwManagedGfxState, SchedLevel level = kNormal); +int CreateContextPrivData(void **priv_data, bool FwManagedGfxState); +int CreatePowerOptPrivData(void **priv_data, bool restore); +int CreateCalibratedTimestampsPrivData(void **priv_data); +void QueryCalibratedTimestamps(void* priv, uint64_t* gpu, uint64_t* cpu); +void DestroyPrivData(void *priv_data); +} +#endif diff --git a/inc/rocr_proxy/wddm_types.h b/inc/rocr_proxy/wddm_types.h new
file mode 100644 index 0000000000..f2e60d907d --- /dev/null +++ b/inc/rocr_proxy/wddm_types.h @@ -0,0 +1,155 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _ROCR_WDDM_TYPES_H_ +#define _ROCR_WDDM_TYPES_H_ + +#include + +#include + +typedef uint32_t UINT, *UINT_PTR; +typedef int32_t INT32; +typedef int32_t LONG; +typedef uint32_t ULONG, *ULONG_PTR; +typedef int64_t LONGLONG; +typedef int64_t LONG64; +typedef uint64_t ULONGLONG; +typedef uint64_t ULONG64, *ULONG64_PTR; +typedef uint8_t BYTE; +typedef uint16_t WORD; +typedef uint32_t DWORD; +typedef int32_t BOOL; +typedef int32_t NTSTATUS; +typedef uint16_t USHORT; +typedef uint16_t UINT16; +typedef uint32_t UINT32; +typedef uint64_t UINT64; +typedef int32_t INT; +typedef uint64_t SIZE_T; +typedef void VOID; +typedef float FLOAT; +typedef char CHAR; +typedef unsigned char UCHAR; +typedef UCHAR BOOLEAN; +typedef int16_t WCHAR; +typedef void *HANDLE; +typedef void *PVOID; +typedef void *LPVOID; +typedef const int16_t *PCWSTR; + +#define ULONG ULONG +#define ULONG_PTR ULONG_PTR +#define USHORT USHORT + +#define DECLARE_HANDLE(name) struct name##__{int unused;}; typedef struct name##__ *name +#define C_ASSERT(e) typedef char __C_ASSERT__[(e)?1:-1] + +DECLARE_HANDLE(HWND); +DECLARE_HANDLE(HDC); +DECLARE_HANDLE(PALETTEENTRY); + +typedef struct tagPOINT { + LONG x; + LONG y; +} POINT; + +typedef struct tagRECT { + LONG left; + LONG top; + LONG right; + LONG bottom; +} RECT; + +typedef struct tagRECTL { + LONG left; + LONG top; + LONG right; + LONG bottom; +} RECTL; + +typedef union _LARGE_INTEGER { + struct { + DWORD LowPart; + DWORD HighPart; + } u; + LONGLONG QuadPart; +} LARGE_INTEGER; + +typedef LARGE_INTEGER *PLARGE_INTEGER; + +typedef struct _LUID { + ULONG LowPart; + LONG HighPart; +} LUID, *PLUID; 
+ +typedef enum _DEVICE_POWER_STATE { + PowerDeviceUnspecified = 0, + PowerDeviceD0, + PowerDeviceD1, + PowerDeviceD2, + PowerDeviceD3, + PowerDeviceMaximum +} DEVICE_POWER_STATE, *PDEVICE_POWER_STATE; + +#define _Check_return_ +#define APIENTRY +#define CONST const +#define IN +#define OUT +#define FAR +#define MAX_PATH 260 +#define __stdcall + +#ifndef GUID_DEFINED +#define GUID_DEFINED +typedef struct _GUID { + uint32_t Data1; + uint16_t Data2; + uint16_t Data3; + uint8_t Data4[ 8 ]; +} GUID; +#endif + +#include + +#endif diff --git a/inc/wddm/cmd_util.h b/inc/wddm/cmd_util.h new file mode 100644 index 0000000000..423d9d6290 --- /dev/null +++ b/inc/wddm/cmd_util.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */ + +#ifndef _CMD_UTIL_H_ +#define _CMD_UTIL_H_ + +#include +#include "hsa-runtime/inc/hsa.h" +#include "hsa-runtime/inc/amd_hsa_queue.h" +#include "hsa-runtime/inc/amd_hsa_kernel_code.h" +#include "inc/pm4_cmds.h" +#include "util/utils.h" + +namespace rocr { +namespace core { + +struct DispatchInfo { + uint8_t major; + hsa_kernel_dispatch_packet_t *pPacket; + void *pEntry; + const amd_kernel_code_t *pKernelObject; + uint32_t ldsBlks; + amd_queue_t *pAmdQueue; + bool wave32; + uint32_t srd; + void *pScratchBase; + uint32_t scratchSizePerWave; + uint32_t scratchBaseOffset[2]; + uint32_t offsetCnt; +}; + +class CmdUtil { +public: + CmdUtil() {}; + ~CmdUtil() {}; + + size_t BuildCopyData( + uint64_t *pDstAddr, + void *pBuffer, + uint32_t dstSel = dst_sel__mec_copy_data__tc_l2, + uint32_t dstCachePolicy = dst_cache_policy__mec_copy_data__stream, + uint32_t srcSel = src_sel__mec_copy_data__gpu_clock_count, + uint32_t srcCachePolicy = src_cache_policy__mec_copy_data__lru, + uint32_t countSel = count_sel__mec_copy_data__64_bits_of_data, + uint32_t wrConfirm = wr_confirm__mec_copy_data__wait_for_confirmation); + + size_t BuildBarrier( + void *pBuffer, + uint32_t eventIndex = 
event_index__mec_event_write__cs_partial_flush, + uint32_t eventType = CS_PARTIAL_FLUSH); + + size_t BuildAcquireMem( + uint8_t major, + void *pBuffer); + + size_t BuildScratch( + void *pScratchBase, + void *pBuffer); + + size_t BuildComputeShaderParams( + void *pBuffer); + + size_t BuildDispatch( + struct DispatchInfo *pInfo, + void *pBuffer); + + size_t BuildAtomicMem( + uint64_t *pAddr, + uint32_t atomic, + void *pBuffer, + uint32_t cachePolicy = cache_policy__mec_atomic_mem__stream, + uint64_t srcData = 1); +}; + +} // namespace core +} // namespace rocr + +#endif \ No newline at end of file diff --git a/inc/wddm/device.h b/inc/wddm/device.h new file mode 100644 index 0000000000..7d912be039 --- /dev/null +++ b/inc/wddm/device.h @@ -0,0 +1,245 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _ROCR_WDDM_H_ +#define _ROCR_WDDM_H_ + +#include +#include + +#include +#include +#include + +#include "inc/wddm/types.h" +#include "inc/rocr_proxy/rocr_proxy.h" +#include "inc/wddm/va_mgr.h" +#include "inc/wddm/status.h" +#include "inc/wddm/types.h" +#include "inc/wddm/gpu_memory.h" +#include "inc/wddm/cmd_util.h" + +namespace rocr { +namespace core { + +//class Queue; +class WDDMQueue; + +// WSL2 hyperv GPADL protocol limitation +#define MAX_USERPTR_BLOCK_SIZE 0xf0000000 + +class WDDMDevice { +public: + static constexpr size_t GpuMemoryChunkSize = 2 * (1ULL << 30); // 2 GB + + WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid); + ~WDDMDevice(); + + int Major() { return device_info_.major; } + int Minor() { return device_info_.minor; } + int Stepping() { return device_info_.stepping; } + bool IsDgpu() { return device_info_.is_dgpu; } + const char *ProductName() { return device_info_.product_name; } + const char *Uuid() { return device_info_.uuid; } + rocr_proxy::AsicFamilyType GfxFamily() { return device_info_.family; } + uint32_t DeviceId() { return device_info_.device_id; } + uint32_t WavefrontSize() { return device_info_.wavefront_size; } + uint32_t 
ComputeUnitCount() { return device_info_.compute_unit_count; } + uint32_t MaxEngineClockMhz() { return device_info_.max_engine_clock_mhz; } + uint32_t WatchPointsNum() { return device_info_.watch_points_num; } + uint32_t PciBusAddr() { return device_info_.pci_bus_addr; } + + uint32_t MemoryBusWidth() { return device_info_.memory_bus_width; } + uint32_t MaxMemoryClockMhz() { return device_info_.max_memory_clock_mhz; } + uint32_t WavePerCu() { return device_info_.wave_per_cu; } + uint32_t SimdPerCu() { return device_info_.simd_per_cu; } + uint32_t MaxScratchSlotsPerCu() { return device_info_.max_scratch_slots_per_cu; } + uint32_t NumShaderEngine() { return device_info_.num_shader_engine; } + uint32_t ShaderArrayPerShaderEngine() { return device_info_.shader_array_per_shader_engine; } + uint32_t NumSdmaEngine() { return device_info_.sdma_schedid.size(); } + uint32_t Domain() { return device_info_.domain; } + uint32_t NumGws() { return device_info_.num_gws; } + uint32_t AsicRevision() { return device_info_.asic_revision; } + uint64_t LocalHeapSize() { return device_info_.local_visible_heap_size + device_info_.local_invisible_heap_size; } + uint64_t LocalVisibleHeapSize() { return device_info_.local_visible_heap_size; } + uint64_t LocalInvisibleHeapSize() { return device_info_.local_invisible_heap_size; } + uint64_t PrivateApertureBase() { return device_info_.private_aperture_base; } + uint64_t PrivateApertureSize() { return device_info_.private_aperture_size; } + uint64_t SharedApertureBase() { return device_info_.shared_aperture_base; } + uint64_t SharedApertureSize() { return device_info_.shared_aperture_size; } + uint32_t LdsSize() { return device_info_.lds_size; } + uint64_t GPUCounterFrequency() { return device_info_.gpu_counter_frequency; } + uint32_t GetSwsQueueSize(void) const { return device_info_.user_queue_size; } + uint32_t GetMecFwVersion() { return device_info_.mec_fw_version; } + uint32_t GetSdmaFwVersion() { return device_info_.sdma_fw_version; } + 
uint32_t GetL1CacheSize() { return device_info_.l1_cache_size; } + uint32_t GetL2CacheSize() { return device_info_.l2_cache_size; } + uint32_t GetL3CacheSize() { return device_info_.l3_cache_size; } + uint32_t Gl2CacheLineSize() { return device_info_.gl2_cacheline_size; } + bool SupportStateShadowingByCpFw(void) const { return device_info_.state_shadowing_by_cpfw; } + bool SupportPlatformAtomic(void) const { return device_info_.platform_atomic_support; } + uint32_t GetSdmaEngine(uint32_t idx) { + assert(idx < NumSdmaEngine()); + return device_info_.sdma_schedid[idx]; + } + uint32_t GetComputeEngine() { return device_info_.compute_schedid; } + + uint64_t VramAvail(); + + void GetClockCounters(uint64_t *gpu, uint64_t *cpu); + uint32_t GetNumCpQueues() { return device_info_.num_cp_queues; } + + bool CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr); + void DestroySyncobj(D3DKMT_HANDLE handle); + + bool CreateQueue(WDDMQueue *queue); + void DestroyQueue(WDDMQueue *queue); + bool CreateHwQueue(WDDMQueue *queue); + bool DestroyHwQueue(WDDMQueue *queue); + bool SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value); + bool SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value); + + bool WaitPagingFence(WDDMQueue *queue) { + uint64_t value = page_fence_value_; + + if (*page_fence_addr_ < value && + !GpuWait(queue, &page_syncobj_, &value, 1)) + return false; + + return true; + } + + bool GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs, + uint64_t *values, int count); + bool GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs, + uint64_t *value, int count); + bool CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value, + int count, bool wait_any); + bool WaitOnPagingFenceFromCpu(); + + uint32_t LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt); + uint32_t GetCmdbufSize(void) const { return cmdbuf_size_; } + uint32_t GetAqlFrameSize(void) const { return 
cmdbuf_aql_frame_size_; } + static uint32_t GetAqlFrameNum(void) { return cmdbuf_aql_frame_num_; } + + // Both legacy HWS and stage 1 HWS use KMD to alloc use queue memory, + // return false by default + bool AllocUserQueueMemFromUMD(void) const { return false; } + + bool IsHwsEnabled(int engine) { + return rocr_proxy::GetHwsEnabled(engine, &device_info_); + } + + void UpdatePageFence(uint64_t fence_value); + + D3DKMT_HANDLE PagingQueue() const { return page_queue_; } + D3DKMT_HANDLE PagingFence() const { return page_syncobj_; } + D3DKMT_HANDLE DeviceHandle() const { return device_; } + LUID GetLuid() const { return adapter_luid_; } + + const rocr_proxy::DeviceInfo& DeviceInfo() const { return device_info_; } + + ErrorCode ReserveGpuVirtualAddress(rocr_proxy::AllocDomain domain, + gpusize hit_base_addr, + gpusize size, + gpusize *out_gpu_virtual_addr, + gpusize alignment, + bool lock=false); + + ErrorCode FreeGpuVirtualAddress(rocr_proxy::AllocDomain domain, + gpusize base_addr, + gpusize size); + + ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem); +private: + bool ParseDeviceInfo(void); + void DestroyDeviceInfo(void); + bool CreateDevice(void); + bool DestroyDevice(void); + bool CreatePagingQueue(void); + bool DestroyPagingQueue(void); + void *Lock(D3DKMT_HANDLE handle); + bool Unlock(D3DKMT_HANDLE handle); + bool CreateContext(int engine, D3DKMT_HANDLE *handle); + bool DestroyContext(D3DKMT_HANDLE handle); + + void SetPowerOptimization(bool restore); + void InitCmdbufInfo(void); + bool ReserveSystemHeapSpace(void); + bool FreeSystemHeapSpace(void); + bool ReserveLocalHeapSpace(void); + bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock=false); + bool DecommitSystemHeapSpace(void* addr, int64_t size); + bool FreeLocalHeapSpace(void); + void InitVaMgr(); + + D3DKMT_HANDLE adapter_; + LUID adapter_luid_; + D3DKMT_HANDLE device_; + + D3DKMT_HANDLE page_queue_; + D3DKMT_HANDLE page_syncobj_; + uint64_t 
*page_fence_addr_; + std::atomic page_fence_value_; + + uint64_t local_heap_space_start_; + uint64_t local_heap_space_size_; + uint64_t system_heap_space_start_; + uint64_t system_heap_space_size_; + uint32_t cmdbuf_size_; + uint32_t cmdbuf_aql_frame_size_; + static const uint32_t cmdbuf_aql_frame_num_; + // device info + rocr_proxy::DeviceInfo device_info_; + + std::unique_ptr local_va_mgr_; + //CmdUtil cmd_util; +}; + +NTSTATUS WDDMGetAdapters(D3DKMT_ADAPTERINFO *&adapters, int &num_adapters); + +} // namespace core +} // namespace rocr + +#endif diff --git a/inc/wddm/gpu_memory.h b/inc/wddm/gpu_memory.h new file mode 100644 index 0000000000..b04a5d85d3 --- /dev/null +++ b/inc/wddm/gpu_memory.h @@ -0,0 +1,218 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _ROCR_GPU_MEMORY_H_ +#define _ROCR_GPU_MEMORY_H_ + +#include +#include +#include "util/utils.h" +#include "inc/wddm/types.h" +#include "inc/wddm/thunks.h" +#include "inc/rocr_proxy/rocr_proxy.h" + +namespace rocr { +namespace core { + +class WDDMDevice; + +union GpuMemoryCreateFlags { + struct { + uint64_t virtual_alloc : 1; + uint64_t physical_only : 1; + uint64_t interprocess : 1; + uint64_t locked : 1; + uint64_t unused : 60; + }; + uint64_t reserved; +}; + +struct GpuMemoryCreateInfo { + GpuMemoryCreateInfo() { + flags.reserved = 0; + domain = rocr_proxy::kLocal; + size = 0; + alignment = 0; + mem_flags = 0; + engine_flag = 0; + va_hint = 0; + user_ptr = nullptr; + dmabuf_fd = -1; + } + + GpuMemoryCreateFlags flags; + rocr_proxy::AllocDomain domain; + gpusize size; + gpusize alignment; + int mem_flags; + int engine_flag; + int dmabuf_fd; // Import from dmabuf + + void *user_ptr; + gpusize va_hint; +}; + +struct GpuMemoryDesc { + GpuMemoryDesc() { + gpu_addr = 0; + cpu_addr = nullptr; + client_size = 0; + size = alignment = 0; + flags.reserved = 0; + mem_flags = 0; + engine_flag = 0; + } + + rocr_proxy::AllocDomain domain; + LUID adapter_luid; // Where is the 
backing store location + gpusize gpu_addr; + void *cpu_addr; + gpusize client_size; // user request size + gpusize size; + gpusize alignment; + + union { + struct { + uint32_t is_virtual : 1; + uint32_t is_shared : 1; + uint32_t is_external : 1; + uint32_t is_physical_only : 1; + uint32_t is_locked : 1; + + uint32_t unused : 27; + }; + + uint32_t reserved; + } flags; + + int mem_flags; + int engine_flag; +}; + +struct SharedHandleInfo { + rocr_proxy::AllocDomain domain; + LUID adapter_luid; + gpusize client_size; // user request size + uint64_t size; + uint32_t flags; + int mem_flags; +}; + +using GpuMemoryHandle = void *; + +class GpuMemory { +public: + static size_t CalcChunkNumbers(gpusize size); + + ErrorCode Init(const GpuMemoryCreateInfo &create_info); + + WDDMDevice *GetDevice() const { return device_; } + gpusize Size() const { return desc_.size; } + gpusize ClientSize() const { return desc_.client_size; } + uint64_t GpuAddress() const { return desc_.gpu_addr; } + void *CpuAddress() const { return desc_.cpu_addr; } + + inline bool IsLocal() const { return desc_.domain == rocr_proxy::kLocal; } + inline bool IsUserMemory() const { return desc_.domain == rocr_proxy::kUserMemory; } + inline bool IsSystem() const { return desc_.domain == rocr_proxy::kSystem; } + inline bool IsUserQueue() const { return desc_.domain == rocr_proxy::kUserQueue; } + inline bool IsPhysicalOnly() const { return desc_.flags.is_physical_only; } + inline bool IsVirtual() const { return desc_.flags.is_virtual; } + inline bool IsShared() const { return desc_.flags.is_shared; } + inline bool IsExternal() const { return desc_.flags.is_external; } + + inline uint32_t Flags() const { return desc_.flags.reserved; } + inline int GetAllocInfo() const { return desc_.mem_flags; } + inline bool IsFineGrain() const { return (desc_.mem_flags & rocr_proxy::kFineGrain); } + inline bool IsSameAdapter(const LUID &luid) const { + return (desc_.adapter_luid.HighPart == luid.HighPart && + 
desc_.adapter_luid.LowPart == luid.LowPart); + } + + WinAllocationHandle GetAllocationHandle(size_t index) const { return alloc_handles_ptr_[index]; } + size_t NumChunks() const { return num_allocations_; } + + const GpuMemoryHandle GetGpuMemoryHandle() const { + return reinterpret_cast(const_cast(this)); + } + + static GpuMemory *Convert(GpuMemoryHandle handle) { return reinterpret_cast(handle); } + + ErrorCode ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize va_size, gpusize alignment); + ErrorCode FreeGpuVirtualAddress(gpusize va_start_address, gpusize va_size); + + ErrorCode MapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0); + ErrorCode UnmapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0); + + ErrorCode MakeResident(); + ErrorCode Evict(); + + ErrorCode ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags = SHARED_ALLOCATION_ALL_ACCESS); + ErrorCode ImportPhysicalHandle(int dmabuf_fd); + ~GpuMemory(); +protected: + explicit GpuMemory(WDDMDevice *device); +private: + ErrorCode CreatePhysicalMemory(); + ErrorCode FreePhysicalMemory(); + + uint64_t AdjustSize(gpusize size) const; +private: + friend class WDDMDevice; + + WDDMDevice *const device_; + + GpuMemoryDesc desc_; + + size_t num_allocations_; + WinAllocationHandle *alloc_handles_ptr_; + WinAllocationHandle alloc_handle_; // Optimization for num_allocations_ is 1 + + WinResourceHandle resource_; // Handle to a resource object that wraps the allocation. Used for shared resources + + DISALLOW_COPY_AND_ASSIGN(GpuMemory); +}; + +} // namespace core +} // namespace rocr + +#endif diff --git a/inc/wddm/queue.h b/inc/wddm/queue.h new file mode 100644 index 0000000000..51c9dfd453 --- /dev/null +++ b/inc/wddm/queue.h @@ -0,0 +1,284 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. 
All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// +#ifndef _WDDM_QUEUE_H_ +#define _WDDM_QUEUE_H_ + +#include +#include +#include "inc/wddm/types.h" +#include "inc/wddm/device.h" +#include "inc/wddm/gpu_memory.h" +#include "hsa-runtime/inc/hsa_ext_amd.h" +#include "hsa-runtime/inc/amd_hsa_queue.h" +#include "hsa-runtime/inc/amd_hsa_signal.h" +#include "inc/wddm/cmd_util.h" + +namespace rocr { +namespace core { + +class Queue; +class WDDMDevice; + +class WDDMQueue { +public: + WDDMQueue(WDDMDevice *device, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws = true) : + device(device), + context(NULL), + queue(NULL), + syncobj(NULL), + sync_addr(NULL), + cmdbuf(0), + cmdbuf_addr(0), + cmdbuf_size(cmdbuf_size), + queue_engine(engine), + use_hws(use_hws), + prio(rocr_proxy::kNormal) { + + } + + virtual ~WDDMQueue() { } + + virtual hsa_status_t Init(void) = 0; + virtual hsa_status_t Fini(void) = 0; + + hsa_status_t SwsInit(void); + hsa_status_t SwsFini(void); + hsa_status_t SwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value); + + hsa_status_t HwsInit(void); + hsa_status_t HwsFini(void); + hsa_status_t HwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value); + hsa_status_t SetPriority(hsa_amd_queue_priority_t priority); + + uint64_t *GetSyncAddr(void) const { return sync_addr; } + uint64_t GetCmdbufAddr(void) const { return cmdbuf_addr; } + + rocr_proxy::SchedLevel ConvertSchedLevel(hsa_amd_queue_priority_t prio) const { + switch (prio) { + case HSA_AMD_QUEUE_PRIORITY_LOW: + return rocr_proxy::kLow; + case HSA_AMD_QUEUE_PRIORITY_HIGH: + return rocr_proxy::kHigh; + case HSA_AMD_QUEUE_PRIORITY_NORMAL: + default: + return rocr_proxy::kNormal; + } + } + + WDDMDevice *device; + + D3DKMT_HANDLE context; + D3DKMT_HANDLE queue; + + D3DKMT_HANDLE syncobj; + uint64_t *sync_addr; + + GpuMemoryHandle cmdbuf; + uint64_t cmdbuf_addr; + uint32_t cmdbuf_size; + + GpuMemoryHandle queue_mem; 
+ uint64_t queue_addr; + + uint32_t queue_engine; + + bool use_hws; + rocr_proxy::SchedLevel prio; +}; + +class ComputeQueue : public WDDMQueue { +public: + ComputeQueue(WDDMDevice *device, + void *ring, + uint64_t ring_size, + std::atomic *ring_wptr, + std::atomic *ring_rptr, + volatile int64_t *error_addr, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws = true); + + ~ComputeQueue(); + + virtual hsa_status_t Init(void); + virtual hsa_status_t Fini(void); + virtual hsa_status_t Submit(void); + + void* GetRing(void) const { return ring; } + uint64_t GetRingSize(void) const { return ring_size; } + std::atomic* GetRingWptr(void) const { return ring_wptr; } + std::atomic* GetRingRptr(void) const { return ring_rptr; } + + uint64_t GetAqlWriteIndex(void) const { return cmdbuf_aql_frame_write_index; } + uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size; } + + bool IsInvalidPacket(void) const { + uint16_t *packet = (uint16_t *)((char *)ring + + (cmdbuf_aql_frame_write_index % ring_size) * 64); + return ((*packet >> HSA_PACKET_HEADER_TYPE) & ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1)) + == HSA_PACKET_TYPE_INVALID; + } + + hsa_status_t Process(void); + uint64_t * GetDoorbellPtr() const { return (uint64_t *)&doorbell_signal_.value; } + void RingDoorbell(); +private: + hsa_status_t KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet); + hsa_status_t BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or = false); + struct amd_aql_pm4_ib { + uint16_t header; + uint16_t ven_hdr; + uint32_t ib_jump_cmd[4]; + uint32_t dw_cnt_remain; + uint32_t reserved[8]; + hsa_signal_t completion_signal; + }; + hsa_status_t VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet); + hsa_status_t SwitchAql2PM4(void); + + hsa_status_t PreSubmit(void); + hsa_status_t EndSubmit(void); + + void *ring; + uint64_t ring_size; + std::atomic *ring_wptr; + std::atomic *ring_rptr; + + // ib_start_addr is the current ib start address + 
uint64_t ib_start_addr; + + // ib_size is the current ib size. + uint64_t ib_size; + + // record the last submitted aql frame write index + uint64_t sync_point; + + uint64_t cmdbuf_aql_frame_write_index; + uint32_t cmdbuf_aql_frame_size; + + bool needs_barrier; + bool ready_to_submit; + + CmdUtil cmd_util; + +private: + bool EnableProfiling() { + return AMD_HSA_BITS_GET(amd_queue_rocr_->queue_properties, AMD_QUEUE_PROPERTIES_ENABLE_PROFILING); + } + void HandleError(hsa_status_t status); + bool UpdateScratch(uint32_t private_segment_size, bool wave32); + + uint32_t UpdateIndexStride(uint32_t srd, bool wave32); + + void *ScratchBase() { return scratch_base_; } + + void AppendCmdbufSratchBaseOffset(int offset) { + scratch_base_offset_array_.push_back(offset); + } + + bool RelocateCmdbufScratchBase(uint64_t addr); + + uint32_t ScratchSizePerWave() { return scratch_size_per_wave_; } + uint64_t GetKernelObjAddr(uint64_t addr) const; + void InitScratchSRD(); + GpuMemoryHandle amd_queue_mem_; + amd_queue_t *amd_queue_; + amd_queue_t *amd_queue_rocr_; + amd_signal_t doorbell_signal_; + volatile std::atomic *error_code_; + std::thread aql_to_pm4_thread_; + bool thread_stop_; + std::mutex thread_cond_lock_; + std::condition_variable thread_cond_; + static void AqlToPm4Thread(ComputeQueue *queue); + + uint32_t scratch_waves_; + uint32_t scratch_size_per_wave_; + uint32_t scratch_size_; + void *scratch_base_; + GpuMemoryHandle scratch_mem_; + + std::vector scratch_base_offset_array_; +}; + +class SDMAQueue : public WDDMQueue { +public: + SDMAQueue(WDDMDevice *device, + uint64_t cmdbuf_size, + uint32_t engine, + bool use_hws = true) : + WDDMQueue(device, cmdbuf_size, engine, use_hws), + rptr_next(0), + ib_size(0), + ib_start_addr(0) { + + } + + virtual ~SDMAQueue() { } + + hsa_status_t Init(void); + hsa_status_t Fini(void); + hsa_status_t Submit(void); + + int PreparePacket(uint32_t offset, uint64_t size); + + void WaitQueue(void) { + device->CpuWait(&syncobj, &rptr_next, 1, 
false); + } + +private: + uint64_t rptr_next; + uint64_t ib_size; + uint64_t ib_start_addr; +}; + +} +} + +#endif diff --git a/inc/wddm/status.h b/inc/wddm/status.h new file mode 100644 index 0000000000..96808622ef --- /dev/null +++ b/inc/wddm/status.h @@ -0,0 +1,59 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _ROCR_CORE_INC_WDDM_STATUS_H_ +#define _ROCR_CORE_INC_WDDM_STATUS_H_ + +enum class ErrorCode { + Success, + DeviceLost, + UnSupported, + NotReady, + OutOfMemory, + OutOfGpuMemory, + Timeout, + SyscallFail, + InvalidateParams, + Unknown, +}; + +#endif diff --git a/inc/wddm/thunks.h b/inc/wddm/thunks.h new file mode 100644 index 0000000000..15556a8ab5 --- /dev/null +++ b/inc/wddm/thunks.h @@ -0,0 +1,232 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _ROCR_CORE_INC_WDDM_THUNKS_H_ +#define _ROCR_CORE_INC_WDDM_THUNKS_H_ + +#include "inc/wddm/status.h" +#include "inc/wddm/types.h" + +namespace rocr { +namespace core { + +inline ErrorCode TranslateNtStatus(NTSTATUS status) { + switch (status) { + case STATUS_SUCCESS: + return ErrorCode::Success; + case STATUS_PENDING: + return ErrorCode::NotReady; + case STATUS_NO_MEMORY: + return ErrorCode::OutOfMemory; + case STATUS_DEVICE_REMOVED: + return ErrorCode::DeviceLost; + case STATUS_GRAPHICS_NO_VIDEO_MEMORY: + return ErrorCode::OutOfGpuMemory; + case STATUS_TIMEOUT: + return ErrorCode::Timeout; + case STATUS_INVALID_PARAMETER: + return ErrorCode::InvalidateParams; + default: + break; + } + return ErrorCode::Unknown; +} + +namespace thunk { + +typedef D3DKMT_CREATEALLOCATION CreateAllocationArgs; +typedef D3DKMT_CREATECONTEXT CreateContextArgs; +typedef D3DKMT_CREATECONTEXTVIRTUAL CreateContextVirtualArgs; +typedef D3DKMT_CREATEPAGINGQUEUE CreatePagingQueueArgs; +typedef D3DKMT_CREATESYNCHRONIZATIONOBJECT CreateSynchronizationObjectArgs; +typedef D3DKMT_CREATESYNCHRONIZATIONOBJECT2 CreateSynchronizationObject2Args; +typedef D3DKMT_ESCAPE EscapeArgs; +typedef D3DKMT_EVICT 
EvictArgs; +typedef D3DKMT_FREEGPUVIRTUALADDRESS FreeGpuVirtualAddressArgs; +typedef D3DKMT_LOCK LockArgs; +typedef D3DKMT_LOCK2 Lock2Args; +typedef D3DKMT_OPENRESOURCE OpenResourceArgs; +typedef D3DKMT_OPENRESOURCEFROMNTHANDLE OpenResourceFromNtHandleArgs; +typedef D3DKMT_QUERYADAPTERINFO QueryAdapterInfoArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECT SignalSynchronizationObjectArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECT2 SignalSynchronizationObject2Args; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMCPU SignalSynchronizationObjectFromCpuArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU2 SignalSynchronizationObjectFromGpuArgs; +typedef D3DKMT_SUBMITCOMMAND SubmitCommandArgs; +typedef D3DKMT_UNLOCK UnlockArgs; +typedef D3DKMT_UNLOCK2 Unlock2Args; +typedef D3DKMT_UPDATEGPUVIRTUALADDRESS UpdateGpuVirtualAddressArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECT WaitForSynchronizationObjectArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECT2 WaitForSynchronizationObject2Args; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU WaitForSynchronizationObjectFromCpuArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU WaitForSynchronizationObjectFromGpuArgs; +typedef D3DKMT_ACQUIREKEYEDMUTEX AcquireKeyedMutexArgs; +typedef D3DKMT_RELEASEKEYEDMUTEX ReleaseKeyedMutexArgs; +typedef D3DKMT_OPENKEYEDMUTEX OpenKeyedMutexArgs; +typedef D3DKMT_DESTROYKEYEDMUTEX DestroyKeyedMutexArgs; +typedef D3DKMT_QUERYVIDEOMEMORYINFO QueryVideoMemoryInfoArgs; +typedef D3DKMT_CREATEHWQUEUE CreateHwQueueArgs; +typedef D3DKMT_DESTROYHWQUEUE DestroyHwQueueArgs; +typedef D3DKMT_SUBMITCOMMANDTOHWQUEUE SubmitCommandToHwQueueArgs; +typedef D3DKMT_SUBMITPRESENTTOHWQUEUE SubmitPresentToHwQueueArgs; +typedef D3DKMT_SUBMITSIGNALSYNCOBJECTSTOHWQUEUE SubmitSignalSyncObjectsToHwQueueArgs; +typedef D3DKMT_SUBMITWAITFORSYNCOBJECTSTOHWQUEUE SubmitWaitForSyncObjectsToHwQueueArgs; +typedef D3DKMT_CREATESYNCFILE CreateSyncFileArgs; + +inline ErrorCode 
MapGpuVirtualAddress(D3DDDI_MAPGPUVIRTUALADDRESS *args) { + return TranslateNtStatus(D3DKMTMapGpuVirtualAddress(args)); +} + +inline ErrorCode CreateAllocation(CreateAllocationArgs *args) { + return TranslateNtStatus(D3DKMTCreateAllocation2(args)); +} + +inline ErrorCode DestroyAllocation( + WinDeviceHandle device, + WinResourceHandle resource, + size_t num_allocations, + const WinAllocationHandle *alloc_handles) { + + D3DKMT_DESTROYALLOCATION2 args{}; + + memset(&args, 0, sizeof(args)); + args.hDevice = device; + if (resource) { + args.hResource = resource; + } else { + args.phAllocationList = alloc_handles; + args.AllocationCount = num_allocations; + } + + return TranslateNtStatus(D3DKMTDestroyAllocation2(&args)); +} + +inline ErrorCode ReserveGpuVirtualAddress(D3DDDI_RESERVEGPUVIRTUALADDRESS *args) { + return TranslateNtStatus(D3DKMTReserveGpuVirtualAddress(args)); +} + +inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle, + gpusize size, + gpusize base_address, + gpusize *out_addr) { + D3DDDI_RESERVEGPUVIRTUALADDRESS args{}; + args.hPagingQueue = handle; + args.Size = size; + args.BaseAddress = base_address; + + auto code = ReserveGpuVirtualAddress(&args); + if (code == ErrorCode::Success) + *out_addr = args.VirtualAddress; + return code; +} + +inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle, + gpusize size, + gpusize minimum_address, + gpusize maximum_address, + gpusize *out_addr) { + D3DDDI_RESERVEGPUVIRTUALADDRESS args{}; + args.hPagingQueue = handle; + args.Size = size; + args.MinimumAddress = minimum_address; + args.MaximumAddress = maximum_address; + + auto code = ReserveGpuVirtualAddress(&args); + if (code == ErrorCode::Success) + *out_addr = args.VirtualAddress; + return code; +} + +inline ErrorCode FreeGpuVirtualAddress(FreeGpuVirtualAddressArgs *args) { + return TranslateNtStatus(D3DKMTFreeGpuVirtualAddress(args)); +} + +inline ErrorCode FreeGpuVirtualAddress(WinAdapterHandle handle, + gpusize base_address, + gpusize 
size) { + FreeGpuVirtualAddressArgs args{}; + args.hAdapter = handle; + args.Size = size; + args.BaseAddress = base_address; + return FreeGpuVirtualAddress(&args); +} + +inline ErrorCode MakeResident(D3DDDI_MAKERESIDENT *args) { + return TranslateNtStatus(D3DKMTMakeResident(args)); +} + +inline ErrorCode Evict(EvictArgs *args) { + return TranslateNtStatus(D3DKMTEvict(args)); +} + +inline ErrorCode ShareObjects(size_t num_allocations, + WinResourceHandle resource, + uint32_t flags, + int* dmabuf_fd) { + OBJECT_ATTRIBUTES obj_attr; + HANDLE nt_handle; + ErrorCode ret; + + InitializeObjectAttributes(&obj_attr, nullptr, OBJ_INHERIT, nullptr, nullptr); + ret = TranslateNtStatus(D3DKMTShareObjects(num_allocations, + &resource, &obj_attr, flags, &nt_handle)); + if (ret == ErrorCode::Success) + *dmabuf_fd = *(reinterpret_cast(&nt_handle)); + else + *dmabuf_fd = -1; + + return ret; +} + +inline ErrorCode QueryResourceInfoFromNtHandle(D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE *args) { + return TranslateNtStatus(D3DKMTQueryResourceInfoFromNtHandle(args)); +} + +inline ErrorCode OpenResourceFromNtHandle(D3DKMT_OPENRESOURCEFROMNTHANDLE *args) { + return TranslateNtStatus(D3DKMTOpenResourceFromNtHandle(args)); +} + +} // namespace thunk +} // namespace core +} // namespace rocr + +#endif // _ROCR_CORE_INC_WDDM_THUNKS_H_ diff --git a/inc/wddm/types.h b/inc/wddm/types.h new file mode 100644 index 0000000000..cd831158ce --- /dev/null +++ b/inc/wddm/types.h @@ -0,0 +1,101 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _ROCR_CORE_INC_WDDM_TYPES_H_ +#define _ROCR_CORE_INC_WDDM_TYPES_H_ + +#include +#include +#include "inc/rocr_proxy/wddm_types.h" +// windows wchar is 16bit, but linux is 32bit +// seems libdxcore (not dxgkrnl.ko) convert thunk windows wchar to linux one +// so only accept 32bit wchar args. 
note driver private data structure still
+// use 16bit wchar
+#define WCHAR wchar_t
+#define PCWSTR const wchar_t *
+#include
+#undef WCHAR
+#undef PCWSTR
+
+using gpusize = uint64_t; // Used to specify GPU addresses and sizes of GPU allocations
+using WinAllocationHandle = D3DKMT_HANDLE;
+using WinResourceHandle = D3DKMT_HANDLE;
+using WinContextHandle = D3DKMT_HANDLE;
+using WinDeviceHandle = D3DKMT_HANDLE;
+using WinAdapterHandle = D3DKMT_HANDLE;
+
+//reference dk/winnt.h
+#define STANDARD_RIGHTS_REQUIRED (0x000F0000L)
+
+//reference dk/ntdef.h
+#define OBJ_INHERIT (0x00000002L)
+typedef WCHAR *PWCHAR, *LPWCH, *PWCH;
+// Counted wide string, laid out like the ntdef.h structure of the same name.
+typedef struct _UNICODE_STRING {
+  USHORT Length;
+  USHORT MaximumLength;
+#ifdef MIDL_PASS
+  [size_is(MaximumLength / 2), length_is((Length) / 2) ] USHORT * Buffer;
+#else // MIDL_PASS
+  _Field_size_bytes_part_opt_(MaximumLength, Length) PWCH Buffer;
+#endif // MIDL_PASS
+} UNICODE_STRING;
+typedef UNICODE_STRING *PUNICODE_STRING;
+typedef const UNICODE_STRING *PCUNICODE_STRING;
+
+// Object attributes passed to thunks that create NT handles.
+typedef struct _OBJECT_ATTRIBUTES {
+  ULONG Length;
+  HANDLE RootDirectory;
+  PUNICODE_STRING ObjectName;
+  ULONG Attributes;
+  PVOID SecurityDescriptor;
+  PVOID SecurityQualityOfService;
+} OBJECT_ATTRIBUTES;
+// Fills *p: n = object name, a = attribute flags, r = root directory,
+// s = security descriptor. SecurityQualityOfService is always cleared.
+// Kept token-for-token as written so that it stays an identical
+// redefinition of the ntdef.h macro it references. It expands to a bare
+// brace block, so do not use it as the body of an unbraced if/else.
+#define InitializeObjectAttributes( p, n, a, r, s ) { \
+  (p)->Length = sizeof( OBJECT_ATTRIBUTES ); \
+  (p)->RootDirectory = r; \
+  (p)->Attributes = a; \
+  (p)->ObjectName = n; \
+  (p)->SecurityDescriptor = s; \
+  (p)->SecurityQualityOfService = NULL; \
+  }
+
+#endif // _ROCR_CORE_INC_WDDM_TYPES_H_
diff --git a/inc/wddm/va_mgr.h b/inc/wddm/va_mgr.h
new file mode 100644
index 0000000000..12dac08c67
--- /dev/null
+++ b/inc/wddm/va_mgr.h
@@ -0,0 +1,86 @@
+#ifndef VA_MGR_H_
+#define VA_MGR_H_
+
+#include <map>
+#include <mutex>
+#include "util/utils.h"
+
+namespace rocr {
+namespace core {
+
+// Virtual-address range manager.
+//
+// Bookkeeping is held in two containers: `frag_map_` maps a base VA to the
+// fragment starting there, and `free_list_` indexes the free fragments by
+// size. The class is non-copyable.
+class VaMgr {
+public:
+  VaMgr(uint64_t start, uint64_t size, uint64_t min_align);
+  ~VaMgr();
+
+  /* Allocate `bytes` VA, if `align` is not zero, the returned address is aligned by `align`.
+   * If `addr` parameter is not zero, try best to allocate VA from fixed address `addr`.
+   */
+  uint64_t Alloc(uint64_t bytes, uint64_t align, uint64_t addr = 0);
+
+  void Free(uint64_t addr);
+
+private:
+  uint64_t AllocImpl(uint64_t bytes, uint64_t align);
+
+  // One contiguous VA range. A free fragment also remembers its entry in
+  // `free_list_`; a used fragment stores `free_list_.end()` there.
+  struct Fragment {
+    // free_list_ is keyed by size and maps to the base address
+    // (see add_free_fragment), hence <uint64_t, uint64_t>.
+    using ptr = std::multimap<uint64_t, uint64_t>::iterator;
+    ptr free_list_entry_;
+
+    // Size and free flag are packed into a single 64-bit word.
+    struct {
+      uint64_t size : 63;
+      bool is_free : 1;
+    };
+
+    Fragment() : size(0), is_free(false) {}
+    Fragment(ptr iterator, uint64_t len, bool is_free)
+        : free_list_entry_(iterator), size(len), is_free(is_free) {}
+  };
+
+  // Builds a free fragment that is linked to free-list entry `iter`.
+  static inline Fragment make_fragment(typename Fragment::ptr iter, uint64_t len) {
+    return {iter, len, true};
+  }
+
+  // Builds a used fragment (no free-list entry).
+  inline Fragment make_fragment(uint64_t len) { return {free_list_.end(), len, false}; }
+
+  static inline bool is_free(const Fragment& f) { return f.is_free; }
+  // Marks `f` used and detaches it from the free list (the entry itself is
+  // not erased here; see remove_free_list_entry).
+  void set_used(Fragment& f) {
+    f.is_free = false;
+    f.free_list_entry_ = free_list_.end();
+  }
+  // Marks `f` free and records its free-list entry `iter`.
+  static void set_free(Fragment& f, typename Fragment::ptr iter) {
+    f.free_list_entry_ = iter;
+    f.is_free = true;
+  }
+
+  // Erases the fragment's free-list entry, if it has one.
+  inline void remove_free_list_entry(Fragment& frag) {
+    if (frag.free_list_entry_ != free_list_.end()) {
+      free_list_.erase(frag.free_list_entry_);
+      frag.free_list_entry_ = free_list_.end();
+    }
+  }
+
+  // Registers [base, base + size) as free in both containers.
+  inline void add_free_fragment(uint64_t size, uint64_t base) {
+    auto it = free_list_.insert(std::make_pair(size, base));
+    frag_map_[base] = make_fragment(it, size);
+  }
+
+  // Registers [base, base + size) as used (frag_map_ only).
+  inline void add_used_fragment(uint64_t size, uint64_t base) {
+    frag_map_[base] = make_fragment(size);
+  }
+  // Indexed by size
+  std::multimap<uint64_t, uint64_t> free_list_;
+  // Indexed by VA, each fragment has no overlap
+  std::map<uint64_t, Fragment> frag_map_;
+
+  uint64_t min_align_;
+
+  std::mutex lock_; // Mutex protecting allocation and free of va
+
+
+  DISALLOW_COPY_AND_ASSIGN(VaMgr);
+};
+
+} // namespace core
+} // namespace rocr
+#endif // VA_MGR_H_
diff --git a/libdrm.cpp b/libdrm.cpp
new file mode 100644
index 0000000000..2e2cb8aeac
--- /dev/null
+++
b/libdrm.cpp @@ -0,0 +1,69 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+//
+////////////////////////////////////////////////////////////////////////////////
+#include
+#include
+#include
+
+#include "inc/wddm/types.h"
+#include "inc/wddm/device.h"
+#include "libhsakmt.h"
+
+// Returns the WDDM device object for `NodeId` as an opaque
+// HsaAMDGPUDeviceHandle.
+//
+// Returns HSAKMT_STATUS_INVALID_PARAMETER when `DeviceHandle` is null,
+// HSAKMT_STATUS_ERROR when get_wddmdev() has no device for the node, and
+// whatever CHECK_DXG_OPEN() returns when the thunk is not open.
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle(
+    HSAuint32 NodeId, HsaAMDGPUDeviceHandle *DeviceHandle) {
+  CHECK_DXG_OPEN();
+
+  // The output pointer used to be dereferenced unchecked.
+  if (DeviceHandle == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  rocr::core::WDDMDevice *pDevice = get_wddmdev(NodeId);
+  if (pDevice != nullptr) {
+    *DeviceHandle = reinterpret_cast<HsaAMDGPUDeviceHandle>(pDevice);
+    return HSAKMT_STATUS_SUCCESS;
+  }
+  return HSAKMT_STATUS_ERROR;
+}
+
+// Fills `*info` for the device behind `dev`, a handle obtained from
+// hsaKmtGetAMDGPUDeviceHandle().
+//
+// The structure is zeroed and only gpu_counter_freq is populated, with
+// WDDMDevice::GPUCounterFrequency() divided by 1000.
+// Returns 0 on success and -1 when `dev` or `info` is null.
+HSAKMTAPI int hsaKmtamdgpu_query_gpu_info(void *dev,
+                                          struct amdgpu_gpu_info *info) {
+  // Both pointers used to be dereferenced unchecked.
+  if (dev == nullptr || info == nullptr)
+    return -1;
+
+  rocr::core::WDDMDevice *pDevice =
+      reinterpret_cast<rocr::core::WDDMDevice *>(dev);
+  memset(info, 0, sizeof(*info));
+  info->gpu_counter_freq = pDevice->GPUCounterFrequency() / 1000ull;
+  return 0;
+}
diff --git a/libhsakmt.h b/libhsakmt.h
new file mode 100644
index 0000000000..471c056307
--- /dev/null
+++ b/libhsakmt.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIBHSAKMT_H_INCLUDED
+#define LIBHSAKMT_H_INCLUDED
+
+#include
+#include
+#include
+#include "hsakmt/hsakmt.h"
+
+#include "inc/wddm/types.h"
+#include "inc/wddm/device.h"
+
+rocr::core::WDDMDevice* get_wddmdev(uint32_t node_id);
+
+extern unsigned long dxg_open_count;
+extern bool hsakmt_forked;
+extern pthread_mutex_t hsakmt_mutex;
+extern bool is_dgpu;
+extern bool is_svm_api_supported;
+extern int zfb_support;
+extern int vendor_packet_support;
+
+#undef HSAKMTAPI
+#define HSAKMTAPI __attribute__((visibility ("default")))
+
+#if defined(__clang__)
+#if __has_feature(address_sanitizer)
+#define SANITIZER_AMDGPU 1
+#endif
+#endif
+
+/*Avoid pointer-to-int-cast warning*/
+#define PORT_VPTR_TO_UINT64(vptr) ((uint64_t)(unsigned long)(vptr))
+
+/*Avoid int-to-pointer-cast warning*/
+#define PORT_UINT64_TO_VPTR(v) ((void*)(unsigned long)(v))
+
+/* Early-return guard for API entry points: leaves the calling function with
+ * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when the thunk has not been
+ * opened (dxg_open_count == 0) or the process is a forked child.
+ */
+#define CHECK_DXG_OPEN() \
+  do { if (dxg_open_count == 0 || hsakmt_forked) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0)
+
+/* Might be defined in limits.h on platforms where it is constant (used by musl) */
+/* See also: https://pubs.opengroup.org/onlinepubs/7908799/xsh/limits.h.html */
+#ifndef PAGE_SIZE
+extern int PAGE_SIZE;
+#endif
+extern int PAGE_SHIFT;
+
+/* 64KB BigK fragment size for TLB efficiency */
+#define GPU_BIGK_PAGE_SIZE (1 << 16)
+
+/* 2MB huge page size for 4-level page tables on Vega10 and later GPUs */
+#define GPU_HUGE_PAGE_SIZE (2 << 20)
+
+/* Early-return guard: leaves the calling function with
+ * HSAKMT_STATUS_INVALID_PARAMETER when x is not a multiple of PAGE_SIZE.
+ */
+#define CHECK_PAGE_MULTIPLE(x) \
+  do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % PAGE_SIZE) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0)
+
+/* Round x up to a multiple of align; the mask arithmetic requires align to
+ * be a power of two.
+ */
+#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1))
+#define ALIGN_UP_32(x,align) (((uint32_t)(x) + (align) - 1) & ~(uint32_t)((align)-1))
+#define PAGE_ALIGN_UP(x) ALIGN_UP(x,PAGE_SIZE)
+/* Mask with the low n bits set; yields 0 for n == 0. */
+#define BITMASK(n) ((n) ? (UINT64_MAX >> (sizeof(UINT64_MAX) * CHAR_BIT - (n))) : 0)
+/* Element count of a true array (argument parenthesised for safety). */
+#define ARRAY_LEN(array) (sizeof(array) / sizeof((array)[0]))
+
+/* HSA Thunk logging usage */
+extern int hsakmt_debug_level;
+/* Prints to stderr when level <= hsakmt_debug_level. The level argument is
+ * parenthesised so that an expression argument compares as a whole.
+ */
+#define hsakmt_print(level, fmt, ...) \
+  do { if ((level) <= hsakmt_debug_level) fprintf(stderr, fmt, ##__VA_ARGS__); } while (0)
+#define HSAKMT_DEBUG_LEVEL_DEFAULT -1
+#define HSAKMT_DEBUG_LEVEL_ERR 3
+#define HSAKMT_DEBUG_LEVEL_WARNING 4
+#define HSAKMT_DEBUG_LEVEL_INFO 6
+#define HSAKMT_DEBUG_LEVEL_DEBUG 7
+#define pr_err(fmt, ...) \
+  hsakmt_print(HSAKMT_DEBUG_LEVEL_ERR, fmt, ##__VA_ARGS__)
+#define pr_warn(fmt, ...) \
+  hsakmt_print(HSAKMT_DEBUG_LEVEL_WARNING, fmt, ##__VA_ARGS__)
+#define pr_info(fmt, ...) \
+  hsakmt_print(HSAKMT_DEBUG_LEVEL_INFO, fmt, ##__VA_ARGS__)
+#define pr_debug(fmt, ...) \
+  hsakmt_print(HSAKMT_DEBUG_LEVEL_DEBUG, fmt, ##__VA_ARGS__)
+/* The *_once variants use a function-local static flag, so each expansion
+ * site attempts its message at most once. The flag is a plain bool with no
+ * synchronisation.
+ */
+#define pr_err_once(fmt, ...) \
+({ \
+  static bool __print_once; \
+  if (!__print_once) { \
+    __print_once = true; \
+    pr_err(fmt, ##__VA_ARGS__); \
+  } \
+})
+#define pr_warn_once(fmt, ...) \
+({ \
+  static bool __print_once; \
+  if (!__print_once) { \
+    __print_once = true; \
+    pr_warn(fmt, ##__VA_ARGS__); \
+  } \
+})
+
+/* Expects HSA_ENGINE_ID.ui32, returns gfxv (full) in hex */
+#define HSA_GET_GFX_VERSION_FULL(ui32) \
+  ((((ui32).Major) << 16) | (((ui32).Minor) << 8) | ((ui32).Stepping))
+
+HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
+HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
+bool prefer_ats(HSAuint32 node_id);
+uint16_t get_device_id_by_node_id(HSAuint32 node_id);
+uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id);
+uint32_t get_direct_link_cpu(uint32_t gpu_node);
+
+HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties *props);
+HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId,
+                                      HsaNodeProperties *NodeProperties);
+HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId,
+                                        HSAuint32 NumIoLinks,
+                                        HsaIoLinkProperties *IoLinkProperties);
+void topology_setup_is_dgpu_param(HsaNodeProperties *props);
+
+HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
+
+/* MIN/MAX evaluate each argument exactly once. Both temporaries are
+ * declared with typeof(a), so b is converted to a's type before the
+ * comparison - pass arguments of the same type.
+ */
+#define MIN(a, b) ({ \
+  typeof(a) tmp1 = (a), tmp2 = (b); \
+  tmp1 < tmp2 ? tmp1 : tmp2; })
+
+#define MAX(a, b) ({ \
+  typeof(a) tmp1 = (a), tmp2 = (b); \
+  tmp1 > tmp2 ?
tmp1 : tmp2; }) + +uint32_t get_num_sysfs_nodes(void); + +bool is_forked_child(void); + +/* Calculate VGPR and SGPR register file size per CU */ +uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id); +#define SGPR_SIZE_PER_CU 0x4000 + +#endif diff --git a/libhsakmt.ver b/libhsakmt.ver new file mode 100644 index 0000000000..a203bbc90a --- /dev/null +++ b/libhsakmt.ver @@ -0,0 +1,97 @@ +HSAKMT_1 +{ +global: +hsaKmtOpenKFD; +hsaKmtCloseKFD; +hsaKmtGetVersion; +hsaKmtAcquireSystemProperties; +hsaKmtReleaseSystemProperties; +hsaKmtGetNodeProperties; +hsaKmtGetNodeMemoryProperties; +hsaKmtGetNodeCacheProperties; +hsaKmtGetNodeIoLinkProperties; +hsaKmtCreateEvent; +hsaKmtDestroyEvent; +hsaKmtSetEvent; +hsaKmtResetEvent; +hsaKmtQueryEventState; +hsaKmtWaitOnEvent; +hsaKmtWaitOnMultipleEvents; +hsaKmtCreateQueue; +hsaKmtUpdateQueue; +hsaKmtDestroyQueue; +hsaKmtSetQueueCUMask; +hsaKmtSetMemoryPolicy; +hsaKmtAllocMemory; +hsaKmtAllocMemoryAlign; +hsaKmtFreeMemory; +hsaKmtAvailableMemory; +hsaKmtRegisterMemory; +hsaKmtRegisterMemoryToNodes; +hsaKmtRegisterMemoryWithFlags; +hsaKmtRegisterGraphicsHandleToNodes; +hsaKmtShareMemory; +hsaKmtRegisterSharedHandle; +hsaKmtRegisterSharedHandleToNodes; +hsaKmtProcessVMRead; +hsaKmtProcessVMWrite; +hsaKmtDeregisterMemory; +hsaKmtMapMemoryToGPU; +hsaKmtMapMemoryToGPUNodes; +hsaKmtUnmapMemoryToGPU; +hsaKmtDbgRegister; +hsaKmtDbgUnregister; +hsaKmtDbgWavefrontControl; +hsaKmtDbgAddressWatch; +hsaKmtDbgEnable; +hsaKmtDbgDisable; +hsaKmtDbgGetDeviceData; +hsaKmtDbgGetQueueData; +hsaKmtGetClockCounters; +hsaKmtPmcGetCounterProperties; +hsaKmtPmcRegisterTrace; +hsaKmtPmcUnregisterTrace; +hsaKmtPmcAcquireTraceAccess; +hsaKmtPmcReleaseTraceAccess; +hsaKmtPmcStartTrace; +hsaKmtPmcQueryTrace; +hsaKmtPmcStopTrace; +hsaKmtMapGraphicHandle; +hsaKmtUnmapGraphicHandle; +hsaKmtSetTrapHandler; +hsaKmtGetTileConfig; +hsaKmtQueryPointerInfo; +hsaKmtSetMemoryUserData; +hsaKmtGetQueueInfo; +hsaKmtAllocQueueGWS; +hsaKmtRuntimeEnable; +hsaKmtRuntimeDisable; 
+hsaKmtCheckRuntimeDebugSupport; +hsaKmtGetRuntimeCapabilities; +hsaKmtDebugTrapIoctl; +hsaKmtSPMAcquire; +hsaKmtSPMRelease; +hsaKmtSPMSetDestBuffer; +hsaKmtSVMSetAttr; +hsaKmtSVMGetAttr; +hsaKmtSetXNACKMode; +hsaKmtGetXNACKMode; +hsaKmtOpenSMI; +hsaKmtExportDMABufHandle; +hsaKmtWaitOnEvent_Ext; +hsaKmtWaitOnMultipleEvents_Ext; +hsaKmtReplaceAsanHeaderPage; +hsaKmtReturnAsanHeaderPage; +hsaKmtGetAMDGPUDeviceHandle; +hsaKmtPcSamplingQueryCapabilities; +hsaKmtPcSamplingCreate; +hsaKmtPcSamplingDestroy; +hsaKmtPcSamplingStart; +hsaKmtPcSamplingStop; +hsaKmtPcSamplingSupport; +hsaKmtGetVersionCapInfo; +hsaKmtQueueRingDoorbell; +hsaKmtamdgpu_query_gpu_info; +local: *; +}; + diff --git a/memory.cpp b/memory.cpp new file mode 100644 index 0000000000..b6e35af6fa --- /dev/null +++ b/memory.cpp @@ -0,0 +1,554 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "libhsakmt.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "inc/wddm/gpu_memory.h"
+
+/* Bookkeeping record for one entry of allocation_map_. */
+struct Allocation {
+  /* Initialisers are listed in member declaration order, which is the order
+   * in which they actually run.
+   */
+  Allocation()
+      : handle(0), cpu_addr(0), gpu_addr(0), userptr(false), size(0),
+        user_data(nullptr), size_requested(0), node_id(0), mem_flags_value(0) {}
+  Allocation(rocr::core::GpuMemoryHandle handle_arg, void *cpu_addr_arg,
+             uint64_t gpu_addr_arg, size_t size_arg, bool userptr_arg = false,
+             void *user_data_arg = nullptr, size_t user_size_arg = 0,
+             HSAuint32 node_id_arg = 0, HSAuint32 mem_flags_value_arg = 0)
+      : handle(handle_arg), cpu_addr(cpu_addr_arg), gpu_addr(gpu_addr_arg),
+        userptr(userptr_arg), size(size_arg), user_data(user_data_arg),
+        size_requested(user_size_arg), node_id(node_id_arg),
+        mem_flags_value(mem_flags_value_arg) {}
+
+  rocr::core::GpuMemoryHandle handle;
+  void *cpu_addr;
+  uint64_t gpu_addr;
+  bool userptr; /* true for entries created by hsaKmtMapMemoryToGPU */
+  size_t size; /* actual size = align_up(size_requested, granularity) */
+  void *user_data;
+  size_t size_requested; /* size requested by user */
+  HSAuint32 node_id;
+  HSAuint32 mem_flags_value;
+};
+
+/* Address -> allocation record. The key is const void * because lookups are
+ * made with both `void *` and `const void *` arguments.
+ */
+static std::map<const void *, Allocation> allocation_map_;
+/* Guards every access to allocation_map_. */
+static std::mutex allocation_map_lock_;
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node,
+                                              HSAuint32 DefaultPolicy,
+                                              HSAuint32 AlternatePolicy,
+                                              void *MemoryAddressAlternate,
+                                              HSAuint64 MemorySizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Converts an HSA_PAGE_SIZE_* flag to a size in bytes. Unknown flags assert
+ * in debug builds and fall back to 4 KiB.
+ */
+HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) {
+  switch (pageSizeFlags) {
+  case HSA_PAGE_SIZE_4KB:
+    return 4 * 1024;
+  case HSA_PAGE_SIZE_64KB:
+    return 64 * 1024;
+  case HSA_PAGE_SIZE_2MB:
+    return 2 * 1024 * 1024;
+  case HSA_PAGE_SIZE_1GB:
+    return 1024 * 1024 * 1024;
+  default:
+    assert(false);
+    return 4 * 1024;
+  }
+}
+
+/* Same as hsaKmtAllocMemoryAlign() with an alignment of 0. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
+                                          HSAuint64 SizeInBytes,
+                                          HsaMemFlags MemFlags,
+                                          void **MemoryAddress) {
+  return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags,
+                                MemoryAddress);
+}
+
+/* 1 when x is a non-zero power of two, else 0 (argument parenthesised). */
+#define POWER_OF_2(x) (((x) && (!((x) & ((x) - 1)))) ? 1 : 0)
+
+/* Returns true when at least SizeInBytes of system RAM is currently free.
+ *
+ * sysinfo(2) reports freeram in units of mem_unit bytes, so the value is
+ * scaled before the comparison. Returns false when sysinfo() fails.
+ */
+bool isSystemMemoryAvailable(HSAuint64 SizeInBytes) {
+  struct sysinfo info;
+  if (sysinfo(&info) != 0)
+    return false;
+  const uint64_t free_bytes =
+      static_cast<uint64_t>(info.freeram) * info.mem_unit;
+  return SizeInBytes <= free_bytes;
+}
+
+/* Returns true when SizeInBytes does not exceed dev->VramAvail(). */
+bool isLocalMemoryAvailable(rocr::core::WDDMDevice *dev,
+                            HSAuint64 SizeInBytes) {
+  return SizeInBytes <= dev->VramAvail();
+}
+
+/* Allocates GPU-visible memory and returns its GPU address in
+ * *MemoryAddress.
+ *
+ * - System memory is used unless NonPaged is set without zfb_support and
+ *   without GTTAccess; in that case local (VRAM) memory is used.
+ * - The request is rejected with HSAKMT_STATUS_NO_MEMORY when the chosen
+ *   pool reports less free space than SizeInBytes.
+ * - With FixedAddress set, *MemoryAddress must be non-null on entry.
+ * - On success the allocation is recorded in allocation_map_ under the
+ *   returned address.
+ *
+ * NOTE(review): `Alignment` is never read, the device is always
+ * get_wddmdev(1), and PreferredNode is only stored in the record - confirm
+ * this is intended.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
+                                               HSAuint64 SizeInBytes,
+                                               HSAuint64 Alignment,
+                                               HsaMemFlags MemFlags,
+                                               void **MemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (MemFlags.ui32.FixedAddress) {
+    if (*MemoryAddress == nullptr)
+      return HSAKMT_STATUS_INVALID_PARAMETER;
+  } else
+    *MemoryAddress = nullptr;
+
+  rocr::core::WDDMDevice *dev = get_wddmdev(1);
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  rocr::core::GpuMemory *gpu_mem = nullptr;
+  rocr::core::GpuMemoryCreateInfo create_info{};
+  create_info.size = SizeInBytes;
+
+  if (!MemFlags.ui32.NonPaged || zfb_support || MemFlags.ui32.GTTAccess) {
+    /* If allocate VRAM under ZFB mode */
+    if (zfb_support && MemFlags.ui32.NonPaged == 1)
+      MemFlags.ui32.CoarseGrain = 1;
+
+    create_info.domain = rocr_proxy::AllocDomain::kSystem;
+    if (!isSystemMemoryAvailable(SizeInBytes))
+      return HSAKMT_STATUS_NO_MEMORY;
+  } else {
+    create_info.domain = rocr_proxy::AllocDomain::kLocal;
+    if (!isLocalMemoryAvailable(dev, SizeInBytes))
+      return HSAKMT_STATUS_NO_MEMORY;
+  }
+
+  if (!MemFlags.ui32.CoarseGrain)
+    create_info.mem_flags = rocr_proxy::kFineGrain;
+
+  // create_info.mem_flags |= rocr_proxy::kKernarg;
+  create_info.flags.physical_only = MemFlags.ui32.NoAddress;
+  create_info.flags.interprocess = MemFlags.ui32.NoAddress;
+  create_info.flags.locked = 0; //!!(alloc_flags & AllocatePinned);
+
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
+  if (code == ErrorCode::Success) {
+    *MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
+    std::lock_guard<std::mutex> gard(allocation_map_lock_);
+    allocation_map_[*MemoryAddress] = Allocation(
+        gpu_mem->GetGpuMemoryHandle(), *MemoryAddress, (uint64_t)*MemoryAddress,
+        create_info.size, false, nullptr, SizeInBytes,
+        MemFlags.ui32.GTTAccess ? 0 : PreferredNode, MemFlags.Value);
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  return HSAKMT_STATUS_ERROR;
+}
+
+/* Releases the allocation recorded under MemoryAddress.
+ *
+ * The record is removed while holding the map lock; the GpuMemory object is
+ * deleted after the lock is dropped. SizeInBytes is not used.
+ * Returns HSAKMT_STATUS_ERROR for an address that is not in the map.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
+                                         HSAuint64 SizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  rocr::core::GpuMemory *gpu_mem = nullptr;
+  {
+    std::lock_guard<std::mutex> gard(allocation_map_lock_);
+    auto it = allocation_map_.find(MemoryAddress);
+    if (it == allocation_map_.end()) {
+      return HSAKMT_STATUS_ERROR;
+    }
+
+    gpu_mem = rocr::core::GpuMemory::Convert(it->second.handle);
+    allocation_map_.erase(it);
+  }
+
+  delete gpu_mem;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Stores dev->VramAvail() for `Node` in *AvailableBytes. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
+                                              HSAuint64 *AvailableBytes) {
+  CHECK_DXG_OPEN();
+
+  if (!AvailableBytes)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  rocr::core::WDDMDevice *dev = get_wddmdev(Node);
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  *AvailableBytes = dev->VramAvail();
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
+                                             HSAuint64 MemorySizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
+                                                    HSAuint64 MemorySizeInBytes,
+                                                    HSAuint64 NumberOfNodes,
+                                                    HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI
hsaKmtRegisterMemoryWithFlags(
+    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HsaMemFlags MemFlags) {
+  /* Validates the flag combination only; nothing is registered here. */
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("[%s] address %p\n", __func__, MemoryAddress);
+
+  if (MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  // Registered memory should be ordinary paged host memory.
+  if ((MemFlags.ui32.HostAccess != 1) || (MemFlags.ui32.NonPaged == 1))
+    return HSAKMT_STATUS_NOT_SUPPORTED;
+
+  if (!is_dgpu)
+    /* TODO: support mixed APU and dGPU configurations */
+    return HSAKMT_STATUS_NOT_SUPPORTED;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(
+    HSAuint64 GraphicsResourceHandle,
+    HsaGraphicsResourceInfo *GraphicsResourceInfo, HSAuint64 NumberOfNodes,
+    HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
+                                                 HSAuint64 MemorySizeInBytes,
+                                                 int *DMABufFd,
+                                                 HSAuint64 *Offset) {
+  CHECK_DXG_OPEN();
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtShareMemory(void *MemoryAddress, HSAuint64 SizeInBytes,
+                  HsaSharedMemoryHandle *SharedMemoryHandle) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtRegisterSharedHandle(const HsaSharedMemoryHandle *SharedMemoryHandle,
+                           void **MemoryAddress, HSAuint64 *SizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(
+    const HsaSharedMemoryHandle *SharedMemoryHandle, void **MemoryAddress,
+    HSAuint64 *SizeInBytes, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Deprecated: logs an error and returns HSAKMT_STATUS_NOT_IMPLEMENTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(HSAuint32 Pid,
+                                            HsaMemoryRange *LocalMemoryArray,
+                                            HSAuint64 LocalMemoryArrayCount,
+                                            HsaMemoryRange *RemoteMemoryArray,
+                                            HSAuint64 RemoteMemoryArrayCount,
+                                            HSAuint64 *SizeCopied) {
+  pr_err("[%s] Deprecated\n", __func__);
+
+  assert(false);
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/* Deprecated: logs an error and returns HSAKMT_STATUS_NOT_IMPLEMENTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid,
+                                             HsaMemoryRange *LocalMemoryArray,
+                                             HSAuint64 LocalMemoryArrayCount,
+                                             HsaMemoryRange *RemoteMemoryArray,
+                                             HSAuint64 RemoteMemoryArrayCount,
+                                             HSAuint64 *SizeCopied) {
+  pr_err("[%s] Deprecated\n", __func__);
+
+  assert(false);
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/* Only validates the argument; no state is changed. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("[%s] address %p\n", __func__, MemoryAddress);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Makes [MemoryAddress, MemoryAddress + MemorySizeInBytes) GPU accessible
+ * and returns the GPU address in *AlternateVAGPU.
+ *
+ * 1. If the 4 KiB-aligned start is a known non-userptr allocation, the
+ *    address is returned unchanged (error if the allocation is smaller than
+ *    the request).
+ * 2. If MemoryAddress is a known userptr mapping that is large enough, the
+ *    existing GPU address plus the offset into the mapping is returned.
+ * 3. Otherwise the page-aligned range is wrapped in a new kUserMemory
+ *    GpuMemory on device 1 and recorded twice in allocation_map_: under the
+ *    CPU address and under the GPU address.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress,
+                                             HSAuint64 MemorySizeInBytes,
+                                             HSAuint64 *AlternateVAGPU) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress || !AlternateVAGPU) {
+    pr_err("FIXME: mapping NULL pointer\n");
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  uint64_t start = rocr::AlignDown((uint64_t)MemoryAddress, 4096);
+  uint64_t end =
+      rocr::AlignUp((uint64_t)MemoryAddress + MemorySizeInBytes, 4096);
+
+  void *aligned_ptr = (void *)start;
+  size_t aligned_size = end - start;
+
+  {
+    std::lock_guard<std::mutex> gard(allocation_map_lock_);
+    // GTT mem
+    auto it_gtt = allocation_map_.find(aligned_ptr);
+    if (it_gtt != allocation_map_.end()) {
+      if (!it_gtt->second.userptr) {
+        if (it_gtt->second.size >= MemorySizeInBytes) {
+          *AlternateVAGPU = (uint64_t)MemoryAddress;
+          return HSAKMT_STATUS_SUCCESS;
+        } else {
+          return HSAKMT_STATUS_ERROR;
+        }
+      }
+    }
+
+    // userptr mem
+    auto it = allocation_map_.find(MemoryAddress);
+    if (it != allocation_map_.end()) {
+      if (it->second.userptr && it->second.size >= MemorySizeInBytes) {
+        *AlternateVAGPU =
+            (uintptr_t)it->second.gpu_addr +
+            ((uintptr_t)MemoryAddress - (uintptr_t)it->second.cpu_addr);
+        return HSAKMT_STATUS_SUCCESS;
+      }
+    }
+  }
+
+  rocr::core::WDDMDevice *dev = get_wddmdev(1);
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  rocr::core::GpuMemory *gpu_mem = nullptr;
+  rocr::core::GpuMemoryHandle handle = 0;
+  uint64_t addr;
+  rocr::core::GpuMemoryCreateInfo create_info{};
+  create_info.domain = rocr_proxy::kUserMemory;
+  create_info.size = aligned_size;
+  create_info.user_ptr = aligned_ptr;
+
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
+  if (code == ErrorCode::Success) {
+    addr = gpu_mem->GpuAddress();
+    handle = gpu_mem->GetGpuMemoryHandle();
+  } else {
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  {
+    std::lock_guard<std::mutex> guard(allocation_map_lock_);
+    allocation_map_[MemoryAddress] =
+        Allocation(handle, aligned_ptr, addr, aligned_size, true, MemoryAddress,
+                   MemorySizeInBytes);
+    allocation_map_[(void *)addr] =
+        Allocation(handle, aligned_ptr, addr, aligned_size, true, nullptr,
+                   MemorySizeInBytes);
+  }
+
+  *AlternateVAGPU = addr + ((uintptr_t)MemoryAddress - (uintptr_t)aligned_ptr);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Forwards to hsaKmtMapMemoryToGPU(); the flags and node list are ignored. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(
+    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 *AlternateVAGPU,
+    HsaMemMapFlags MemMapFlags, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
+  return hsaKmtMapMemoryToGPU(MemoryAddress, MemorySizeInBytes, AlternateVAGPU);
+}
+
+/* Undoes a userptr mapping created by hsaKmtMapMemoryToGPU().
+ *
+ * A null pointer and non-userptr allocations are accepted and left alone;
+ * an unknown address yields HSAKMT_STATUS_ERROR.
+ *
+ * A userptr mapping is recorded under two keys (the CPU address and the GPU
+ * address) that share one handle. Every record carrying that handle is
+ * removed before the GpuMemory is deleted. The previous code erased the
+ * GPU-address key and then the looked-up iterator: when the caller passed
+ * the GPU address those were the same node (erase of an invalidated
+ * iterator), and the CPU-address record was left behind pointing at freed
+ * memory.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress) {
+    /* Workaround for runtime bug */
+    pr_err("FIXME: Unmapping NULL pointer\n");
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  pr_debug("[%s] address %p\n", __func__, MemoryAddress);
+
+  rocr::core::GpuMemoryHandle handle = nullptr;
+  {
+    std::lock_guard<std::mutex> gard(allocation_map_lock_);
+    auto it = allocation_map_.find(MemoryAddress);
+    if (it == allocation_map_.end()) {
+      return HSAKMT_STATUS_ERROR;
+    }
+
+    if (!it->second.userptr) {
+      return HSAKMT_STATUS_SUCCESS;
+    }
+
+    handle = it->second.handle;
+
+    /* Linear scan: drops both aliases regardless of which one was used for
+     * the lookup. std::map::erase(iterator) returns the next valid iterator.
+     */
+    for (auto cur = allocation_map_.begin(); cur != allocation_map_.end();) {
+      if (cur->second.userptr && cur->second.handle == handle)
+        cur = allocation_map_.erase(cur);
+      else
+        ++cur;
+    }
+  }
+  auto gpu_mem = rocr::core::GpuMemory::Convert(handle);
+  delete gpu_mem;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(HSAuint32 NodeId,
+                                               HSAuint64 GraphicDeviceHandle,
+                                               HSAuint64 GraphicResourceHandle,
+                                               HSAuint64 GraphicResourceOffset,
+                                               HSAuint64 GraphicResourceSize,
+                                               HSAuint64 *FlatMemoryAddress) {
+  /* This API was only ever implemented in KFD for Kaveri and
+   * was never upstreamed. There are no open-source users of
+   * this interface. It has been superseded by
+   * RegisterGraphicsHandleToNodes.
+   */
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId,
+                                                 HSAuint64 FlatMemoryAddress,
+                                                 HSAuint64 SizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId,
+                                            HsaGpuTileConfig *config) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Looks `Pointer` up in allocation_map_ (exact key match) and describes the
+ * allocation in *PointerInfo.
+ *
+ * *PointerInfo is zeroed first. For an unknown pointer its Type is set to
+ * HSA_POINTER_UNKNOWN and HSAKMT_STATUS_ERROR is returned. Userptr mappings
+ * report the page-aligned size, other allocations the size the caller
+ * originally requested.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer,
+                                               HsaPointerInfo *PointerInfo) {
+  CHECK_DXG_OPEN();
+
+  if (!Pointer || !PointerInfo)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("[%s] pointer %p\n", __func__, Pointer);
+
+  memset(PointerInfo, 0, sizeof(HsaPointerInfo));
+
+  /* Copy the record so the lock is not held while filling the result. */
+  Allocation allocation_info;
+  {
+    std::lock_guard<std::mutex> gard(allocation_map_lock_);
+    auto it = allocation_map_.find(Pointer);
+    if (it == allocation_map_.end()) {
+      PointerInfo->Type = HSA_POINTER_UNKNOWN;
+      return HSAKMT_STATUS_ERROR;
+    }
+    allocation_info = it->second;
+  }
+
+  if (allocation_info.userptr) {
+    PointerInfo->Type = HSA_POINTER_REGISTERED_USER;
+    PointerInfo->SizeInBytes = allocation_info.size;
+  } else {
+    PointerInfo->Type = HSA_POINTER_ALLOCATED;
+    PointerInfo->SizeInBytes = allocation_info.size_requested;
+  }
+
+  PointerInfo->Node = allocation_info.node_id;
+  PointerInfo->MemFlags.Value = allocation_info.mem_flags_value;
+  PointerInfo->CPUAddress = allocation_info.cpu_addr;
+  PointerInfo->GPUAddress = allocation_info.gpu_addr;
+  PointerInfo->UserData = allocation_info.user_data;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: asserts in debug builds, reports success otherwise. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer,
+                                                void *UserData) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: asserts in debug builds. Otherwise reports success when
+ * built with SANITIZER_AMDGPU and HSAKMT_STATUS_NOT_SUPPORTED when not.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr) {
+  assert(false);
+#ifdef SANITIZER_AMDGPU
+  pr_debug("[%s] address %p\n", __func__, addr);
+  CHECK_DXG_OPEN();
+
+  return HSAKMT_STATUS_SUCCESS;
+#else
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+#endif
+}
+
+/* Not implemented: same behaviour as hsaKmtReplaceAsanHeaderPage(). */
+HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr) {
+  assert(false);
+#ifdef SANITIZER_AMDGPU
+  pr_debug("[%s] address %p\n", __func__, addr);
+  CHECK_DXG_OPEN();
+
+  return HSAKMT_STATUS_SUCCESS;
+#else
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+#endif
+}
diff --git a/openclose.cpp b/openclose.cpp
new file mode 100644
index 0000000000..b2b3c04460
--- /dev/null
+++ b/openclose.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libhsakmt.h" +#include "inc/hsa/hsa.h" +#include "inc/hsa/hsa_ven_amd_loader.h" + +int (*fn_amdgpu_device_get_fd)(HsaAMDGPUDeviceHandle device_handle); + +hsa_signal_value_t (*fn_hsa_signal_load_relaxed)(hsa_signal_t signal); +hsa_signal_value_t (*fn_hsa_signal_wait_relaxed)( + hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); +void (*fn_hsa_signal_store_screlease)(hsa_signal_t hsa_signal, + hsa_signal_value_t value); +hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)( + const void *device_address, const void **host_address); + +static const char dxg_device_name[] = "/dev/dxg"; +static pid_t parent_pid = -1; +int hsakmt_debug_level; +bool hsakmt_forked; +static int dxg_fd = -1; + +/* is_forked_child detects when the process has forked since the last + * time this function was called. We cannot rely on pthread_atfork + * because the process can fork without calling the fork function in + * libc (using clone or calling the system call directly). 
+ */
+bool is_forked_child(void) {
+  pid_t cur_pid;
+
+  /* A detected fork stays latched until clear_after_fork() resets it. */
+  if (hsakmt_forked)
+    return true;
+
+  cur_pid = getpid();
+
+  /* No PID recorded yet: remember this one as the reference. */
+  if (parent_pid == -1) {
+    parent_pid = cur_pid;
+    return false;
+  }
+
+  if (parent_pid != cur_pid) {
+    hsakmt_forked = true;
+    return true;
+  }
+
+  return false;
+}
+
+/* Callbacks from pthread_atfork: prepare takes hsakmt_mutex before the
+ * fork, the parent releases it afterwards, and the child re-initializes
+ * the mutex and records that it is a forked process.
+ */
+static void prepare_fork_handler(void) { pthread_mutex_lock(&hsakmt_mutex); }
+static void parent_fork_handler(void) { pthread_mutex_unlock(&hsakmt_mutex); }
+static void child_fork_handler(void) {
+  pthread_mutex_init(&hsakmt_mutex, NULL);
+  hsakmt_forked = true;
+}
+
+/* Call this from the child process after fork. This clears all data
+ * that was duplicated from the parent process but is not valid in the
+ * child.
+ * The topology information duplicated from the parent is still valid
+ * in the child process, so it is not cleared.
+ */
+static void clear_after_fork(void) {
+  // TODO: fmm_clear_all_mem();
+  /* -1 means "not open" and 0 is a valid descriptor, so test the sign
+   * rather than the truth value.
+   */
+  if (dxg_fd >= 0) {
+    close(dxg_fd);
+    dxg_fd = -1;
+  }
+  dxg_open_count = 0;
+  parent_pid = -1;
+  hsakmt_forked = false;
+}
+
+/* Derive PAGE_SHIFT from PAGE_SIZE. PAGE_SIZE itself is queried with
+ * sysconf() only when it is not already defined as a macro.
+ */
+static inline void init_page_size(void) {
+#ifndef PAGE_SIZE
+  PAGE_SIZE = sysconf(_SC_PAGESIZE);
+#endif
+  PAGE_SHIFT = ffs(PAGE_SIZE) - 1;
+}
+
+/* Read the settings this library takes from the environment:
+ *   HSAKMT_DEBUG_LEVEL   - log level, accepted only inside the range
+ *                          HSAKMT_DEBUG_LEVEL_ERR..HSAKMT_DEBUG_LEVEL_DEBUG
+ *   HSA_ZFB              - stored in zfb_support
+ *   WSLKMT_VENDOR_PACKET - stored in vendor_packet_support
+ * Always returns HSAKMT_STATUS_SUCCESS.
+ */
+static HSAKMT_STATUS init_vars_from_env(void) {
+  char *envvar;
+  int debug_level;
+
+  /* Normally libraries don't print messages. For debugging purposes, we'll
+   * print messages if an environment variable, HSAKMT_DEBUG_LEVEL, is set.
+   */
+  hsakmt_debug_level = HSAKMT_DEBUG_LEVEL_DEFAULT;
+
+  envvar = getenv("HSAKMT_DEBUG_LEVEL");
+  if (envvar) {
+    debug_level = atoi(envvar);
+    if (debug_level >= HSAKMT_DEBUG_LEVEL_ERR &&
+        debug_level <= HSAKMT_DEBUG_LEVEL_DEBUG)
+      hsakmt_debug_level = debug_level;
+  }
+
+  /* Check whether to support Zero frame buffer */
+  envvar = getenv("HSA_ZFB");
+  if (envvar)
+    zfb_support = atoi(envvar);
+
+  /* Check whether to handle vendor specific aql packet */
+  envvar = getenv("WSLKMT_VENDOR_PACKET");
+  if (envvar)
+    vendor_packet_support = atoi(envvar);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Resolve _sym with dlsym() into the matching fn_<_sym> pointer and make
+ * the enclosing function return HSAKMT_STATUS_ERROR when it is missing.
+ * decltype keeps the cast in step with each pointer's declared type.
+ */
+#define _HSAKMT_LOOKUP_SYMS(_sym) \
+  do { \
+    fn_##_sym = \
+        reinterpret_cast<decltype(fn_##_sym)>(dlsym(RTLD_DEFAULT, #_sym)); \
+    if (!fn_##_sym) { \
+      pr_err("%s not found - %s\n", #_sym, dlerror()); \
+      return HSAKMT_STATUS_ERROR; \
+    } \
+  } while (0)
+
+/* Look up the HSA runtime entry points this library calls back into and
+ * fetch the AMD loader extension's query_host_address function.
+ * Returns HSAKMT_STATUS_ERROR if any of them cannot be obtained.
+ */
+static HSAKMT_STATUS init_symbols(void) {
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_load_relaxed);
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_wait_relaxed);
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_store_screlease);
+
+  hsa_status_t (*fn_hsa_system_get_extension_table)(
+      uint16_t extension, uint16_t version_major, uint16_t version_minor,
+      void *table);
+  _HSAKMT_LOOKUP_SYMS(hsa_system_get_extension_table);
+
+  /* Zero the table and check the result so that a failed query is reported
+   * here instead of leaving an indeterminate function pointer behind.
+   */
+  hsa_ven_amd_loader_1_03_pfn_t table = {};
+  hsa_status_t status = fn_hsa_system_get_extension_table(
+      HSA_EXTENSION_AMD_LOADER, 1, 3, &table);
+  if (status != HSA_STATUS_SUCCESS ||
+      !table.hsa_ven_amd_loader_query_host_address) {
+    pr_err("AMD loader extension table not available (status %d)\n",
+           static_cast<int>(status));
+    return HSAKMT_STATUS_ERROR;
+  }
+  fn_hsa_ven_amd_loader_query_host_address =
+      table.hsa_ven_amd_loader_query_host_address;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Reference amdgpu_device_initialize() so that libdrm_amdgpu is pulled in;
+ * the outputs are discarded.
+ * NOTE(review): presumably the call exists only to force the symbol to be
+ * linked/loaded - confirm. It is given an invalid descriptor (-1) so it
+ * never operates on an arbitrary open file.
+ */
+static void load_libdrm_amdgpu(void) {
+  /* load libdrm_amdgpu */
+  int fd = -1;
+  uint32_t major = 0, minor = 0;
+  amdgpu_device_handle device_handle = {};
+  amdgpu_device_initialize(fd, &major, &minor, &device_handle);
+}
+
+/* Open the DXG device and initialize the library.
+ *
+ * The first call resolves the runtime symbols, reads the environment,
+ * opens /dev/dxg, installs the fork handlers and returns the status of
+ * that initialization. Later calls only increment the reference count and
+ * return HSAKMT_STATUS_KERNEL_ALREADY_OPENED. Every successful call must
+ * be balanced by hsaKmtCloseKFD().
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) {
+  HSAKMT_STATUS result;
+  int fd = -1;
+  HsaSystemProperties sys_props;
+  char *useSvmStr;
+
+  pthread_mutex_lock(&hsakmt_mutex);
+
+  /* If the process has forked, the child process must re-initialize
+   * its connection to DXG. Any references tracked by dxg_open_count
+   * belong to the parent.
+   */
+  if (is_forked_child())
+    clear_after_fork();
+
+  if (dxg_open_count == 0) {
+    static bool atfork_installed = false;
+
+    result = init_symbols();
+    if (result != HSAKMT_STATUS_SUCCESS)
+      goto open_failed;
+
+    load_libdrm_amdgpu();
+
+    result = init_vars_from_env();
+    if (result != HSAKMT_STATUS_SUCCESS)
+      goto open_failed;
+
+    if (dxg_fd < 0) {
+      fd = open(dxg_device_name, O_RDWR | O_CLOEXEC);
+
+      if (fd == -1) {
+        result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
+        goto open_failed;
+      }
+
+      dxg_fd = fd;
+    }
+
+    init_page_size();
+
+    useSvmStr = getenv("HSA_USE_SVM");
+    is_svm_api_supported = !(useSvmStr && !strcmp(useSvmStr, "0")) && false;
+
+    // result = topology_sysfs_get_system_props(&sys_props);
+    if (result != HSAKMT_STATUS_SUCCESS)
+      goto topology_sysfs_failed;
+
+    dxg_open_count = 1;
+
+    if (!atfork_installed) {
+      /* Atfork handlers cannot be uninstalled and
+       * must be installed only once. Otherwise
+       * prepare will deadlock when trying to take
+       * the same lock multiple times.
+       */
+      pthread_atfork(prepare_fork_handler, parent_fork_handler,
+                     child_fork_handler);
+      atfork_installed = true;
+    }
+  } else {
+    dxg_open_count++;
+    result = HSAKMT_STATUS_KERNEL_ALREADY_OPENED;
+  }
+
+  pthread_mutex_unlock(&hsakmt_mutex);
+  return result;
+topology_sysfs_failed:
+  /* Close the descriptor that is actually held and forget it, so a retry
+   * opens the device again instead of reusing a closed fd.
+   */
+  if (dxg_fd >= 0) {
+    close(dxg_fd);
+    dxg_fd = -1;
+  }
+open_failed:
+  pthread_mutex_unlock(&hsakmt_mutex);
+
+  return result;
+}
+
+/* Drop one reference taken by hsaKmtOpenKFD(); the last reference closes
+ * the DXG device. Returns HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when
+ * there is no outstanding reference.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) {
+  HSAKMT_STATUS result;
+
+  pthread_mutex_lock(&hsakmt_mutex);
+
+  if (dxg_open_count > 0) {
+    if (--dxg_open_count == 0) {
+      close(dxg_fd);
+      /* Mark the device closed; hsaKmtOpenKFD() only calls open() when
+       * dxg_fd is negative.
+       */
+      dxg_fd = -1;
+    }
+
+    result = HSAKMT_STATUS_SUCCESS;
+  } else
+    result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
+
+  pthread_mutex_unlock(&hsakmt_mutex);
+
+  return result;
+}
diff --git a/pc_sampling.cpp b/pc_sampling.cpp
new file mode 100644
index 0000000000..247726239b
--- /dev/null
+++ b/pc_sampling.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright © 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include
+#include
+#include
+#include
+#include "libhsakmt.h"
+
+/* PC sampling entry points. None of them is implemented here: each one
+ * runs CHECK_DXG_OPEN() and then returns HSAKMT_STATUS_NOT_SUPPORTED
+ * without reading or writing its arguments.
+ * NOTE(review): CHECK_DXG_OPEN() is defined outside this file; presumably
+ * it returns early when the device has not been opened - confirm.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingSupport(void) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void *sample_info,
+                                  HSAuint32 sample_info_sz, HSAuint32 *size) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId,
+                                               HsaPcSamplingInfo *sample_info,
+                                               HsaPcSamplingTraceId *traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId,
+                                                HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId,
+                                              HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId,
+                                             HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/perfctr.cpp b/perfctr.cpp
new file mode 100644
index 0000000000..9f2b755052
--- /dev/null
+++ b/perfctr.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+#include "libhsakmt.h"
+
+/* Performance-counter (PMC) entry points. Tracing is not implemented in
+ * this library: every function below checks that the device is open and
+ * then returns HSAKMT_STATUS_NOT_SUPPORTED without using its arguments.
+ */
+
+/* Would report the counter properties of a node; not supported. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(
+    HSAuint32 NodeId, HsaCounterProperties **CounterProperties) {
+  /* Same open-device check as every other PMC entry point. */
+  CHECK_DXG_OPEN();
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Registers a set of (HW) counters to be used for tracing/profiling */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
+                                               HSAuint32 NumberOfCounters,
+                                               HsaCounter *Counters,
+                                               HsaPmcTraceRoot *TraceRoot) {
+  CHECK_DXG_OPEN();
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Unregisters a set of (HW) counters used for tracing/profiling */
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
+                                                 HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Would acquire exclusive access to a registered trace; not supported. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
+                                                    HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Would release access to a registered trace; not supported. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId,
+                                                    HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Starts tracing operation on a previously established set of performance
+ * counters */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStartTrace(HSATraceId TraceId,
+                                            void *TraceBuffer,
+                                            HSAuint64 TraceBufferSizeBytes) {
+  CHECK_DXG_OPEN();
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Forces an update of all the counters that a previously started trace
+ * operation has registered */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcQueryTrace(HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stops tracing operation on a previously established set of performance
+ * counters */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/queues.cpp b/queues.cpp
new file mode 100644
index 0000000000..7206e32e99
--- /dev/null
+++ b/queues.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+#include
+#include "libhsakmt.h"
+#include "inc/wddm/device.h"
+#include "inc/wddm/queue.h"
+#include "hsa-runtime/inc/amd_hsa_signal.h"
+
+/* Return the per-CU VGPR size for an engine: 0x60000 when the engine's
+ * major version is 11 or newer, 0x40000 otherwise.
+ * NOTE(review): presumably a size in bytes - confirm against the callers.
+ */
+uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id) {
+  uint32_t vgpr_size = 0x40000;
+
+  if (id.ui32.Major >= 11) {
+    vgpr_size = 0x60000;
+  }
+
+  return vgpr_size;
+}
+
+/* Create a queue on node NodeId and describe it in *QueueResource.
+ *
+ * Only HSA_QUEUE_COMPUTE_AQL is implemented: a ComputeQueue is allocated
+ * and its address is handed back as QueueResource->QueueId, together with
+ * the doorbell pointer. QueuePercentage is not used and Event must be
+ * null.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null QueueResource or an
+ * out-of-range Priority, HSAKMT_STATUS_INVALID_NODE_UNIT when the node has
+ * no device, HSAKMT_STATUS_NOT_SUPPORTED for any other queue type, and
+ * HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(
+    HSAuint32 NodeId, HSA_QUEUE_TYPE Type, HSAuint32 QueuePercentage,
+    HSA_QUEUE_PRIORITY Priority, void *QueueAddress, HSAuint64 QueueSizeInBytes,
+    HsaEvent *Event, HsaQueueResource *QueueResource) {
+  CHECK_DXG_OPEN();
+  assert(Event == nullptr);
+
+  /* Every path below writes through QueueResource. */
+  if (!QueueResource)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
+      Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Checked at run time as well: an assert alone disappears with NDEBUG. */
+  rocr::core::WDDMDevice *device_ = get_wddmdev(NodeId);
+  if (!device_)
+    return HSAKMT_STATUS_INVALID_NODE_UNIT;
+
+  switch (Type) {
+  case HSA_QUEUE_COMPUTE_AQL: {
+    assert(QueueResource->ErrorReason == nullptr);
+    /* Ring size in packets; 64 is presumably the AQL packet size in
+     * bytes - confirm.
+     */
+    uint64_t pkg_num = QueueSizeInBytes / 64;
+    uint32_t cmdbuf_size = device_->GetCmdbufSize();
+    uint32_t queue_engine = device_->GetComputeEngine();
+    bool use_hws = device_->IsHwsEnabled(queue_engine);
+    /* NOTE(review): the template arguments of the two pointer casts below
+     * were lost when this patch text was extracted; restore them from the
+     * ComputeQueue constructor declared in inc/wddm/queue.h.
+     */
+    auto queue_ = new rocr::core::ComputeQueue(
+        device_, QueueAddress, pkg_num,
+        reinterpret_cast *>(QueueResource->Queue_write_ptr_aql),
+        reinterpret_cast *>(QueueResource->Queue_read_ptr_aql),
+        QueueResource->ErrorReason, cmdbuf_size, queue_engine, use_hws);
+
+    /* The queue id handed to callers is the object's address. */
+    QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
+    // for doorbell_signal.hardware_doorbell_ptr
+    QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
+  } break;
+  case HSA_QUEUE_SDMA:
+  default:
+    assert(false);
+    QueueResource->QueueId = 0;
+    QueueResource->Queue_DoorBell = nullptr;
+    /* Report the failure so release builds do not hand back a null queue
+     * as if it had been created.
+     */
+    return HSAKMT_STATUS_NOT_SUPPORTED;
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Validate the arguments of a queue update. No queue state is changed:
+ * the call succeeds once Priority is in range and QueueId is non-zero.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(
+    HSA_QUEUEID QueueId, HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority,
+    void *QueueAddress, HSAuint64 QueueSize, HsaEvent *Event) {
+  CHECK_DXG_OPEN();
+
+  if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
+      Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  auto queue_ = reinterpret_cast<rocr::core::ComputeQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Destroy a queue created by hsaKmtCreateQueue(); QueueId is the address
+ * of the ComputeQueue object and is deleted here.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<rocr::core::ComputeQueue *>(QueueId);
+
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  delete queue_;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Validate a CU mask request. The mask is not applied (not implemented);
+ * CUMaskCount must be a non-zero multiple of 32 and QueueCUMask non-null.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId,
+                                             HSAuint32 CUMaskCount,
+                                             HSAuint32 *QueueCUMask) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<rocr::core::ComputeQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (CUMaskCount == 0 || !QueueCUMask || ((CUMaskCount % 32) != 0))
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("%s not implemented\n", __func__);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: zero-fills *QueueInfo and asserts in debug builds. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfo(HSA_QUEUEID QueueId,
+                                           HsaQueueInfo *QueueInfo) {
+  CHECK_DXG_OPEN();
+
+  if (QueueInfo == NULL)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  memset(QueueInfo, 0, sizeof(*QueueInfo));
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: the trap handler arguments are ignored and success is
+ * reported.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node,
+                                             void *TrapHandlerBaseAddress,
+                                             HSAuint64 TrapHandlerSizeInBytes,
+                                             void *TrapBufferBaseAddress,
+                                             HSAuint64 TrapBufferSizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  pr_debug("%s not implemented\n", __func__);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* GWS allocation is not implemented. *firstGWS is never written, so the
+ * call reports HSAKMT_STATUS_NOT_SUPPORTED instead of success.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, HSAuint32 nGWS,
+                                            HSAuint32 *firstGWS) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<rocr::core::ComputeQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  assert(false);
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Ring the doorbell of the queue identified by QueueId. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueueRingDoorbell(HSA_QUEUEID QueueId) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<rocr::core::ComputeQueue *>(QueueId);
+  if (!queue_)
+    return
HSAKMT_STATUS_INVALID_PARAMETER; + + queue_->RingDoorbell(); + return HSAKMT_STATUS_SUCCESS; +} diff --git a/rocr_proxy/librocr_proxy.a b/rocr_proxy/librocr_proxy.a new file mode 100644 index 0000000000000000000000000000000000000000..57b428938c6e7ab14d99ce517ae323dfa30ec0a6 GIT binary patch literal 365408 zcmd?S33wD$)<0a;-65S+RtN#Y78a3B$if=7urzdJ3nW26FoX~wm_W?J;)q5AN{lG( zsHmuj8!B#qxB()dqT-I|D59e{pyG%#xbpvfC$o`JHpm z-EZBhu5=h&P+mBHXk_o?;D0F^D{-ABC#R>SD9Efx!!U*#Ms&jeAFurB3zw8;FE;Y0 zPspiEP0JrqP*IdWySTEVs4{C@X{ygRGR?Q_4ANAQj%`>uH8p>H{+vbmg^Q{dR{A)X z)Rgi?h2{BW<%^asOHKDJoL9WCXkz)C;)MkzzT)f&Ir*t6qly+67ZznLoV&-s8>qivL0_%PptQKCuyRp( zR&h>2RnAd>U{d0;chnJKrDlDih zUbK+i&R$$fZIH^zqZXAG6fg9ZE?6*m?7wT7w6u}sMFo{b+2zHH5wJd6EY8keoRgZL zU7VAaU(eU{QAHJ%<%^cp(}GS)eRzuGkyAChw78OVMio>R_>jSZ($d-WjHe3Pzl19! z2tJ#OlT$dqXwJBz#YH9cI$D}i9=T{?Wzo{0yy|CU7k$x^qVkDlX9lPD5wAuTloZb{ zM@`M4{#a32P+C@@yI8L@eYr0-q~^G?SY3yUj@ zFD}YoTu?$YKtJEO*_Z}W2ju6_NUqStJoIxa`;;zPT$DAr*f)`;kbdPw#OIe6%`GY~ zT8MF+oA2vWxvZ?nH&OL~di^;yt*ErDq@WNp5BhZ0x#;4)$;G3}CgxI^aw_}JT~xlL zpnMMcK9VLkoDKSm%7^NKJSmW_y{J6Dvb+FOT9z6;g^L!V(^VB#W)*W*izh=jJ2#&) zlulRA!}Owsb7cOiCra&)ISyT)GRrSsSW#47DJ57uVPJm#yoFWyg-e$Ll)rFMd1*mO zelez!0vgO15vln;pIuD295T%N@CfH7(=4SgELxK96b)4}uwHqB%A%<@>_lj`G$~CU ztNIfr_7JbINQ}?Nz$v^izoZD0L-yE0sqhe7-`Pc&+ojS1K9m+yxkGLGd^8>#a4$C> zE#gbQ4FGmEa#bn*7mN99F4=N9>RQpBjB4tPfY(rF!)K87g+dR(0U zz_-$3UkT;(AE?ezW%{322qu^_bf^KP1xxcQFy*KUuh%>JIKrIDbbzx;VJ6FixvjxwLGS8cj5T0hImRxJw1S0|S+jXkR87{{M3b7LH5tz_ zs@WRtv5j}bdX<9Sos;j>9I97(kk&Z$CW=5^V9?UE!hSftF~5L@W(rmS+GAkcp^C8{ zpm*lV+Hxk_L1k6l6XqMMnn0V|o>S8nFRa(JdOxjKARggP9Rx~*6|AUS=7Q{EABw^L z!<|0)T50oBvxmbhpTV|!jURI z3ZYNMMKRD3SNc`801VX=xYP~n9tEJ0YR;tLTj zQt>i`7pZtT!U`3yRAH40(FKhqD!vrqGF*B7+Q0d0mmjUI`f{)DHzXK)eWxfm)~?QX z%D*~cB{BXw-%-DLpw3A5oicsD)qp}e$oFOWp*r7hGhb#aHOIZMbPCv0do!by+NnBY z=TTSfE=sNaz+b!f=+&oBpMJHzMs;#8QsKN#y;vQC2*5KL{9|mpUnq;(<;VTCRfk~< zS9P3=Fv!ezlJt&s@z>>@s`H)nuZ`WZ!nMw87^{3g#%}TbIM8>zDhz^Su{FMvX3g@G z?Js?A8w{VG`Lb;}$6s^2Ywhw+VEzEiY?9DrZt)h)G(MGjZC|BNr 
z+VT76`0G4pb*0?V4N`YF)x6iotpol9`GzJ#J)rJueP5z{2Y^0W=i6WFJKBIWB?UVSqO>Q<8O zvyK?~c0@DPuI!BDniq%~m0kU-siC2}bNjUv+Uy=DCS9*?<+|TVxhvHClCwW-R)pkQ z@*9bILR80e5W|9u-*6`5w!zh_*kavED(?1KJmB86i?s%INi`--P_M?utzOsJ*a7D* zKW@aqt(vIL#<+o>RDF+xHEycPOP=SdEJIMgD%T)&`z%cTp;bwOx|O8+tRqHMe#o`4 z0?9Qm5Dl)%^=B%r6t8yWDyX64ltG{>bAF|&yas7?Ob0P6s46#{$+&IMsY)vDc3Kbp zwSTrM^(w?5D64&8-S=E|YY^0{x|=s6l?UasUT(1ufAl}%U(LzrH4r|Vi3WMm3%5~m zcl|~M+=S$s7l;N|+@ohKTir_Xz`xobjbc)^+dVXq)b8=uzQqIR-}Djc3UCo!0j*cR zAZw6{rUsQsXc1i9yYd?F9pKj9m8%d##5gvrZuRqwr4)Z5M)(20xdSVff9GMhzjl`z zJF#0muGp0TK^7X;yPI8 zgD*5sk3)Aw0%kjvP>MQ3!g?p+0aVdm-+rn=7&=&&-IMIquIUV)_xi-6uSB@MZWX20 z`L_LuI*nbq30h~yZbAKTp#^q*ZQP$mW_$iT)O$u~U4l!>nv7B%i>vW%!!~ByAm9Gj zYhq}@q_r>1z;gmxA;2BQ>O2bllUL^|eQN%cDyXrknKbLx%%HKyHQN=rNt)fRwQFWL zY4aW2EJ<5mXVjdEsG_BKJvnRJJE} z>j5mQ7=papBQ0TN*{a=D=X>5fQ0sfX#<$y)B2vNXe7gtkT5?Z~Z-+{xa@YBGAhCut zcE~uAvO?`}?AGdy*h9?SdDK(=glc)en-?;WhLqyiZ+a$ha+vrb~u8vV8L!ju!02%w#GgY@aVQK z3$Hm9xjf?7*)^x4Dw`flCQtWP?+2tB)qSjefH#nKEH=PrLcWGro+U2pEs|WI=w^Tg zYly2ou=B9XJP;T-*<|SK0301%;MD}*dDvq+NgEK**ldYZ zcDi=&QQtpgrS#WcaoA3-tLCWcuNcjHUgG;OQb)(8-nc$?%bJ70*SzLh#SyW$=Otm_71$9+g$0%Q_HqMu`qkO5@ zLHS({c@=VWe=`-&0Li|7~s`w638PUXtxqzPk_z)66q%}{fOcT zD0*1YL+N`nz12#vH?t+*{@}io2iS?)Z1kGE0|fKeX3s}>YOg^V;g+=Klsk6CS%_2Q zx|=!Ts%%|z$`yMZ)eib$OwB1Xc0~=We^Z4NGK0M1YF_iM%8pZ9vzpgDtFoIZu1(F6 zQ|wwBO4yT`OfJUJ^kH4rw$8T?yBpXg^eIe1Fib7yy^O^cM5?hYU{b4x zxizm$#CK|>|mK47ZN+eck?T&-^#jw-0TqK}2~tc~*)?0V|*j#HOA9{V^Q zU7+KB;$K_4s}{4liD^BIHc=;N6Lpf0yJ%K|8~P0g`Zm?;0e>AG>)>_=V&hopie!i&t7ob_8OE0Pgp@^kHc_1h1;bQ zq+*{SAdWF0*#uWr3gAa^CW(U{4CC>`;+o~BjG+Db&u3~D z(#Jg_90@*KK*C<52{Itj-b@eGsYJbqsc2NDilT?Wwd`adWMrd4;;`x+t~o(xd6A@v zX`+_*beMj*|Ju4yE05P44u|)L5zr<9 z17bvc&Ee}X8b-vEIu`RCHAl?VQBBbkn${foJ>s=h*-b@)dFjXr8RF<(d55_Mkmh6tlHln=9j5{<$Uo1|8g zJk#N5x=F@GB_!yoH7k@18<|yz@$5oSMuB=>lwp}k%1NA~{ zU8aqxj)ScLcOXbc%d?W%HL{JI*TypDD@<>PdBUQ4qrC-<(n=Z%*kdVHIqIm_W zDaM3?#l;A6vV0RJ49_2#HPSyWYm|{aE^mDPh=~)jawjsPPag>e7{=m7#dDGaZx&`` 
z=x^@PduUxprH>jrJ~!V#e8Q-4zMjd$bF+qz8lIi&o18Ua^u+Gn(602a_uP_2cITb!^dZho7RioPORuP z8gCC)l^0c{PQjaTbGR`;{=MGFJQ&TX#A`94h#bzDI@Q*%!n+&kY5A2wW#RV$>%Ab^ z)y8%+I&{ohh^PGMQWw#?l=(&cnrcTFwJ|~V0xwzW?_M@&Pb}rnMUlx|{r0c-N@iDT z+U`A*Dd&ED@+*4=Wj&l;JCpY`b%^bzq)`R*Oy-wYf|LWz-ldx{x1gk=$Ur7RkcM8k z6p!Fi$dC$n7>H3n>SmPTRn7c@@_7}F*bjL{v#|`wSa9(JZ*Zys+_3*6gWuwG?2C8Q z4h`#NhrH2QU$J4mhThV&U+z>kP$>oF1*Q3hyxN^pZx>OB+dF*huGwC1{zxO~k2RA1QX}d9N<6jJ{@G5CYb5=G zM$)qyNk6xd^y@04v|Q!vff5!W-EJe|dB=j#-g)F;6WNkB+`Q?kn~7~1ZLREkLL4}L z#yiV3@(O{Y2eo?JDs2~MOJz@k{fuppPR+5^ah_@Dx2v=Xf)pp%AsZ$n22w&irmc0h z7kUUYVCeWwUnPw&3K0NInQFhY_v~WJb-0)9h0qe-E#&?+79|jxIM6S5wuB+M?swu@lMNE$K_rMX_&R<+;Lu3%} zOdu#$9B09|?%Z3qX|IIg43$a=53=LdAaXSHWsv$~F9i*EP;W4LZHMd6E(E&PYTwb& zmOCNc#IBTo*FU;CGXtF;Y4+<{f;&tHW*#&P&k(gO;XA#&6zH*myP6aJI;w$Fa%1Bl zs1N#)2dcIT^_I6yFOu{@i0mTKXbiC3UiQ}>Ldp(|`PdtF;HZcm#cgxoI+ox53aHYv z|J7)))zskvT@xMJL<0A~0C=G9a5>$32Bj>H(Nm652SOLgqm~PL@hP-M)E&ECOl~<$ z2LH+UZRxa+K9$&wW4AIU>Vy7nc!%Q@`e$$p>GO2W@)Jg7bCl#1N}|st>eaNM={*@a zpW<6qn#gL$qx5vqq^53TRkMZmhu|4eP5FV@z1|$jUbRqO!8+#1!1%)I;q)bF@~D7Q z2kO-?8k##Gy~yJ7|4Ti!K&Af=Dn5{t@)|Rseh+WgC3kbX&q}COoukXs0IJ&S$4fTd z_EGC-)nL~yw+0Ozj35I&y`l-lxx;B|BFc_Nw$Bh`8QIPgW}vP#AEX!l$>s}Le*#+E z*bOaNng+K>Xlw90qc|36kFkapr9o&{7mB$WeGfBN$iQrHDXZ5aArqX6~ z-3=Bqs9o}qt;SwR58&m+akY3Q_EX8QW_R-W!)KhaEa=hBo5m-!Dhz6T=_*>mHL25y zsKZgeLk~Sezf-+A$R}#^&~hL4_dt`<1SIEqr;JPK*}V2*^mx@1^tDqHqm!5if=7D; z#h@-P|Bm)FvDY=d#9hxD;0{VsFX)}joAlmsw3HdWJrZ~&kVM|o$G+WSIsv@&z|}~g z_zi5iWsI>7486wAkwDDDY4;)B>%`)a1zWh$^s?Q!gchVe!y6yN=6(Y+(-R->{v46v zxQzIw%?Kg!W|0J-x2qtC4W}pXib)8g7oI&vY!uSyiXk7}MsJfI#uy2dbd%``!#%i> zKzf{gd~^4ckaE0fljii)G$GX%HVbPt0r9S+e=s4{`#$cwIOl|PE$4)O5i|2pMv?Sq zCQg_E%(N6{AT!CrWH8fJm_f{>2s4n(XGQ*f9 z%;Pd^9^IEFyotkj)GPTGa>egd4_R?FcZCbuvR0?q}U=b ztAv>r9a)Uyd120Pu?Uj2!pvy349t3AX0kgE3UfgO^^MKKn!{t!<(3CQB!&oS6drl+nacu zN?&6QX8HBjR1R;juIBJYYa@p@S$jFW+4_RRTPy>=?C>VuYIWdntu=td+bkc4w_6u- zc!za8hwH3IIlR+4$l+bqNeSif<&$7+isRBz&5 zYY2z?tXvLXwW>IL&ANrd*R8D_?zawd_=a_Y!#Axc{36Di_@>o^!?&z+Rang70c({C 
zAK>t9YmW;5#^F1b2V>ov_>R>TVS1uvCMKiPrYA<3iGw*3Z6?m)NNY2O%95Vg&WxeT zPEYJ$#!xM%Cw4YtsCv^AyP7eg-pz~=_3ma2so#R$WEkOInpn(fF+4ArErv(Eg>wa- zL0XJD=LX;g5v;}NcnT^YyA{`+xIDK(sH9`t*wSYpdCrk`!LLdUGLn~{znCNF+;bsz zavOsAtg+la2QZA}xIA}-=0?&^kaTmT^>Z_gg!cfG45ma-&U}X0vZa|s$iZ~D7Rx_e$lpC`dbU_>i55UBep)i>IhZLfim5hP1k0^wqc zC8b4k{$}f`W>h(PLw`zbJ;&r+pCTLH`5@6CVyN|WvjeTu2_#%nbde$N+ff0I5S)+# zUs6(OlyD(Pcsf|vj`OLMe5krQqYJy5k}?zO!iIoq1w<*sY>*P&C?e9BV~bn_6c0rI z9s#ugsFpw!sh7#40nlTOf}Rb4-gQ8n9aV<%=@kJ{%69~6gX~n*oo{xaq9#zf`1ZPh zDWNrLUw8X05mDU@A`#gUW4D<_i{=`w-wo)Bu@3_Yw3`rccdBS=FvHuXx0P5-VT#p} z!&IvmhiO&@hkdLMIP7a3;jo|e9f$p`UpO3SE#@%8x`M+&*0mfCwr=Nes5M~$*&k;0 z<8Zh&n$t&ElQ|q|UBKZeYXOI&ttN~cV>RW_Z?)qv%j(YISZgEW##v8tINo}Z!wJ@F z9A;b788^uqL1Ejz)?5zzS(`2-zQ45>VFcAc%T{KapRlUPq?r>z6R3{_X1hx)Ok9PR zVCFQtQA)dY-1#2@^l@CCRwYOj2~85129WxuEpfFY@f=ETx1lKoosjSq#6JW>mXho- zBByPE%o9dJ4HDRyxc0O2nqw>ws(n>7buIC!{Ux095O_Kgm!~UbdOHuU_SL+IndWo| zifp0l973?y?Q2+f8AK}_G3~{axR%9hA$l-K>}ms6O4fc8W70|tV+RsncW|7GxO-7_ ze~3+CN}{RDnF$AhWC*%ALeYITpgRmA zUx@gHfcOH4<~U;Ur6PJd>eLidI~hw5^J(L~n+O-w24m zhUnW6@w+UhS~Z)*U{Kq2aLQ2`5_y_FV-3D4VjEM zlarAB;>g9Jh07i?;%3t^gN6Slkex zl>%G_;nfa+I<)e)vjDjIHvsS~fvAJJ=+2Szk_^S;6oJP@!Q*!U{lbBh1syp+kTmg| zzXRpPf-8neS>--L19b$TzXNj2xit7_4L%9bvPR(b+#atfUJa$29YvlHq->Kk(BlB? 
zc0d7VGWC{)DLV274LKO7aSoDZ7#(@DhMWu3UmPSk zTdykCYVex@y}v$OHO@K>`4UiXIY?>}-G%iU@*ALjc95aY-lrj(VcF=4ODdp2XE$o_ z5r9sv4_D4UtRYK)y2L?-I{TQ0yalL@4l>l)CpF{_px$?o4LZ9;gMSa`>H2Wx>~;;= zsu>m}Tn%@Fmo(%^pmH2!sIxC?$WoxLaF7i;yH|s+19Ve;xN=tQ)ws{@0qO$>8CtUk zG@B=YG7{`!g*vPDaBMRPsNN2;L1)ztkKtK>POA@B&VH)7PytkpgA8@{3k`WEP>(vu zP-nl=kb8mp#6dRb?B6x`KLKrmRXC)J%PzS6NkP&S*##hNsJeOjKW#Rptq@KM@vpt% zYEJoiyAww~UObx}3x7txX)>WWCDaE!JTUZ?k^m@OG=^Lb7*S29fuEDF^kB~Bi30QZn8#mcp~v#_WP&Aj2R^VXX0#5Kb3eXhd(F2 z$KfxDH*xrD;?o@dmiRq|9cEi}7V7O#XkA-IVUe|g!@1TL4(D0>Ih=2O!C|rW3x^A= zxQhsXp>;NgC6=GVQmdH5i>#F#mRpZ-SZVF&u*&+D!^KuiIq5I8&f;*HHJHOott<{} ztn)csVO_-GO6wX9ud?pqaFz88hjrG+9Im#Ga(JEf2f{dNYtws&88xD%+S+?Jn9;OG z^G%Rf-8T7J7)CiRS5gJY3d9V)5Afb)M$>jXfrOIsl*t=yQU9xSMN;1qr0ybCX*_C1 z|9qjkSN47{8eckS(v=*=WG6WYB_B?cs69MV&Zdcod`UuaL?m9_WAYYSmhi+u zGQCs+;|_Fv+}QLohaXK-_YcV(^Nt|il?2|7cg5_6(TN1^Ln$3Mzy>BnB;J6_(~GKx zAaV~Qmv&Df3M9>DQ`fTf3&AfYg)&lLd*c6fQOu~)JeDwnyc5ymP03YntG7Ypq(-)L?%QDK zWirOObNNO5B8xwS=!hfc;_}&7$6YKY{4Wq2Nw6@UYEq|w1CMj2f z=niEmm39{-@7Piwveht8H!64EVq*$EfSWLh{b7x$fpIGE-72Gt&QR55- zW{iX5O#5@eW=p2?A(Pt`l1bAD;6ee^HVRNDaSuBH_Li$(ELJ+#-E!+?2C-vb5lF+NSicRw$ zU{Bd7{wA5xaR;|)6r68?OWfguBvIX`RL_BMfCFHE1@OKCOn|U32p~n*Y<;Pq%K*E! 
z0Vvm!SXKKla|2*6HUgD7SMW__`7+-F_8SN9^gSKgS3yq$7TMk|@?2^Gv6Za=$q=SE z09x5}Gr0;i8i*+liYi7&O;xB8ATDuGl$nmYTcK_M;!YbSM{;~0-0=YcTnIgLD*(G} zkc(SFKu>6(PXPGZ0de;f(DNF|OvbJimoz$$Faf=xfw};Y>VUYK1QhS)sg!*h3&0cy zL{%l5|BhV*M2nnR2Ebwh@mwZGQbbOJ*IWbHEhOi4N8|%UPQ%sQ4A~Y(e=r@^bQe2!_k!)dO^x3=`br9O_GcVW47vmy*(~*M8L&;KM#PX3z#tG zXpnIP)=E{V8Ji1P3CXomm1z1`LUv7vewE0%)Egn&6e3@)$@f6EKSX||X8$Y5j)%yv z)8tJ$q7&fK{a-6`8jofd$a;pz@6zO>AsZJWzfY6Tg{;Jp%Lscy?pdR>gX-a;7jI*{9KuuSoyF*|K= zK%>2A5YXHSvY#}Z>N5u8#I$1{kAZ(=^b1_y5rP~Ln~ca}1Q`iK9&xxd8&ZyGT1MRg zNU=exg6C+ip9@(w$+a=CNYh^cSy_nw#hU)rkgW;PUnO#$ULJ<*8CzeU@~+oFZvk+~ z0e$O26-Y^L*FYx$IPHM6DenOd)Cx;$XIxSz+HCfM1{w-LmIKnJ;WsrX;rdJ11$#NatEZXCu=m&9RO^0K)k53`&zxN*Wj-K zddPu?*4t(cc@n78K}c0^J2ghs92y-gb~&L0YBXs3DI56^6T0twar0>cg5& zd*J&7>x98uBSdjmSuXc%%J0Dz92M*H3vkq)wlzri~(R~4&=zyJJrB#?{ zeC=q=1N!m?Gy<(C8pj-s+kk$^(FowAT7Q$`Qr70nKpi9`?{ByP9GBEytw{vW3Fdcz z{p=W}zM`rkP5_jR+3Xy=j)hC#$h6Vq>5vT!k)JJc%G}I>Y#Pb6&3ah?0ok$; z{b3^K^4tj7gSI?yug|rs+#RpMcLDm219xV0g)G&O$AS9IL55c8r5ZA^7fwBKNtMzX zrRQF!Av1x>c95aDKdK=Y0CkyzbaK}XKc~TO1N5Q#a8<1R8uDeJ-gS_n&K}Z`-vaf! 
zgA8@{cMaL1H#YpZqyidr){5jl#Z5LE&?)ud%GvfBvJ9wd2N~*YFAaG+P!BuEP-iD; z$lXAFO2P->g)y$c@a=oI>-i{ zeNuz32Xu3NxN>&4hI|dELk=?3*>^SMNkXOC6%gv|KQ&}Kpwb;=gU))aKsOi<=*;?X zl)E`rRa+3g;no^X(XiB0Cgfr8|>IbH|q0|!aPQW@W(fldGrhNBs2 ze8-%=`Pi(%+XFhZ5x7kEWJ2Y$OH-T*r9wxMYEFR;X`tl*+~9x$&S>zTH25Y!_cQ{R zX#{J2$TQdrD21O-B~W#$_u4x3{tyQ_ym{=@FNv}x|W(zod>ALl&*`-w$qWC@G)|Alic+eyv7ekj@(_# zO*{^QYG2$04>f{x%GM=xt}d^t(*rrPLE;eYB|!T&N2yde&bh()Qd$VDhUy( zH;G7eRfy;oF-YFs>xg)R@-L{c3GxH{;hZ!9EJNXoU%WfsUG2eJ_2N*khT`%{)u5w3ych7joymQv4s z7?RB)(yv)cU2z{IZ-z*ZJC?tK}!Zi@z8Vp%V zkjIGJZwu_Vl)E8;2Rgrg@%MrD`v`dbq6v)g(6nE_*bVFtxLmy;ak+t2FE8Nr3$l|y z!U)R7+tBM5q}~@w0d*I#N~59IFA5zr=}L~`k$U}te7KZA=~NRUQmAlc-nbyA*CPQApEh~Y?UIil+7V_nrG5ncF~ljKsYYCrk|=?WHjI{GxlyZzIL!Gm zcV?(p4#H+e-ix@re-L&7GhSW-!}6Jt*AT)AIM-o-`fzzhP%+Rwu)CPcl4(etV@q6P z0^(8@FNLVa5sxGBUr4)($n}n35($iif1x-QNy4T@%tNAkKLxR&=~`zH!i6it@Ia& zYoj2`b>yTWQ)gQ$eb_t!xfTFX<)GLK|CWwgn)j{LqDBHuWK&47W!Osdhb$~O2(2h6 za)kWg2$g^%)N?peVn?=87|Ka)YYP{0NCff0)w$2*Ph10O?DTVz&wox)x`MxN8Pndy! zo8htfGURXC7#G(R^B@InJDW|_x?(~9ir~}}h zKrO0JN$QDkE)e++ip@v|yU!f54U2$e+(Xa=G_M4?#@0~1@qRP>7+PGweFRJx^C^&L z3H%IY%`Z8G4dEG#Y`u-^Ly~g87wKr0lJfVsej+LBidPSqDZiqqlF>xTh%jcG5xD=u zC8h+t(d=uSYlEi>m@sA*$V39OWieKu87qct5y{yvkuT8nYazQXM88txT`d-F)rQzTO@yKXR{k*y+h=8X!0z`COUHI z#Se+xEJAu2$*Al+fv6tyf&yF#;VnS`m)5WTs!*GNc+y6x_WDY5VLxQ=kX&nm??q0N zxOp71e>n1P=-o#62p&gGMq&NOr8y|geUF(|;q=&M@>rRG<^Yh94QbqK_P@;0I7>7L zXf6P$A`S9RuY_I#a@9e2OE7@mQwSU7Ax|mSM{#WWZ50 zCjt0{Kw4KU(L7D?;l)^7dOa@D^!q@TL2^z13Qa#3vgslEYeY`_aI+G!i$nBp68TXV zYU(D)Zg=#hE8Qn@b2HMPvvu`O@{|I+4dEva;3b+O#KNlz@DB)2*#J3Z3cn(Zs+gyu z_|f>)H!i7T8d{QzRAaO}_h#fEpvDFxIXgkN(~w0#RRtq?Rup7^4S5|<8-kI6Syte; zXz&*SeKiQKhQ$*a=nDXT3<9~RSgKm~3DS%hgJl?(l$+KJ1vsPt-62e|0ZKMd9#OkS zBYlc3lO2LKen%=)2@p$yP+}(BG`ap=t0BD80nn~OYvJ(U73wJ<_BkkOiqIUx3teLH zQ^3A)pj4AmKjDuHfTjhP+mG7{Tv~Sy&y~E`cqhoxZ8=w+OKay3q;QNJ57f*?k)H^X z+j9v}R}s<~r}8TDa_Wa)qu{dRx*rfJ9PgP}dcu*uNYdSbmNMkAvFkmge@4c6nxGw{ z>gJC)-Q0BwayM2z!I1X~vn_yW19{INjEjM1k$Yh+B)8!JOmaZHh2m|mfI13jE&z*z 
zL4l1^_`~WB;SL}k!G*qxJuxD32On@z?%+F-I9xtP!E^DufPQN05u@wzO}}%aaRKSd zjmGLB`pGa>Ms?reb$#zzhI8eKblStEaJk%3B<_iYe=si3@2GRW#gRr6kR>b4BmmAM zPFLvW$&U_4jXB|)?EbsY4QV*eQe8hBTEGohITi90^nByQAd_1Ba%~{fJkO0jYs###l_YwT_+%}zF`2I zOCV=RD4*2(hSO0K<{SVP*dQgpTa2tkRhcUwyE;VvxyZ9yLcRgAhePB?HTiDH-m~TW zUX=E#nk!b;FY3MKcR-!Ak;;<!YT<2Fd~8f`&jn zG|*B2Y8(&`t-%E)C5CH281*~K?GA|Vu_1V=b>T9@mFY;Ih2$kiDz5sq3_gPFvk>_N zk#isTnPiye>b2Zt4b%>RZViEE3WzIpBmh}9NLA{5OX^ON+l+9WH${v~$t6DamAc5kd@lUZvU)OU{s(?in<&Gl#U6jm3vRmmW zk8?4AKE=m+x|=y7GjePhS(qd((e-*)6cv%I(|7wsSpbstH{uq3%uhU`rOxR3y(9~! zj~R=(#h6PN8ig+6T*MmjsZN!26V&yD;wlM&imkIuF3E$0B4Wf3aV$(03-3A>IHqMr z7RAiJ#6S9`V7=P>U1ao~L6#|%nNm;m*+KD-4eOP)Hc(mgQ9@}7y|NPA+#~2qg)Ztb zdK(sr`8twummx(Lu}1t{5FVjj_Xe_{?;D09rEuk9iayB~;;GmZQ*Q=L(Wemu-U>)e zm5M3)sGk7XAu;fJzyN(qQIm^qFKG_v1yHGIX?A)SBY`BzD4Nl=fJXy;fl(Z$vMbR7 zZtpvt3^Y*q+ zjRD=ly~IS4``se=TbqiBWX2;Biy3)b*e!~gD3L@n6DLdzGYP`TCs|tx6UUM^!o)L^ zEKEWaeJ`u4FwJABZ>Jaut>9ELIo0(uE(?97lFO-5i@0Lwo0Wlz;qj>CrCg`~0l4Kkrj0>rKXGseJ4m0$3}mIo+QnvUCdzQ5BcqtXS+_vM z=z5tMgU2}reM?i68Ia7fLyV)(YicgCTy!5b%K*6PsCF5|DOF$p|Y}jy=0O(7g4Flo? 
zK%WQ|0IwLDYtv<1)3-y}2YbH1IWVT_d!kCl3uV`P`Phuwk3KOfu9IC^+`F!4=V|7q zFOLSYRpLF4cCn+)V_A&vcC_g`rGZi@?M;q0eZI6_-VZz4^i|V(dB5#w)5lIlTg&?` zN1MKZS}*Tk9BukEY9McEGI9TuV`7|RB2ee@c2Ua|(@Mtj=uP~U!}LUtX$?j~dSaMq z%|T=_^_}i#Sw0HQi2lEykI9AN5fj5HSO?qp;_?`@e-+7OmTW=lPFvy%3y7z&_in-_m9!hIad%mrZ@~&X5jeu-%6nLT` zEjhA^xScVSV7>+NJ^`s9@==F~R~+%Tko@RKd3Q%ElaX)AAJ(@U;&RW^V@b>s$I#N@eV`nu`RHvEXLZ1de zy$wGisho1s)&EN2h=%b8=__FLVZW$V&6qW3NS*; z`)mNx2ox|YpfQ@IaR8jxkfm{&rHcT#q#;X_1Vn9T-U7fn8zeWE)El_B(2;xfoCBRM-Ja@A${QyjNJw!xO`t*yEY8+j3c-8Lw= z%dnA8A^XOb>qb@O6>J3_#AzSuRyP?otP-UqN zpN&C)%k!^T)0W6m+X1OPZHddI6A)RMWU0-BXsj&`UTW!tB50{4oOuz*C8Q7sv5R8P zQcH&n&Qi-yLiMG#mb4!S+3sjl%voxiRzfZcvea_^hXFZ33REx`#s1Aw8;_dlfJ+qk zl!df{mRd?MGXTmYAQdENsU`7LNG@=sQe*2~uwz_})Z1|dzpzQEW9ZyL&p7zmjgdgF z+p0w+C6eAGp=C5~poDJ^kZ^?J=@rz8Tac(Pr*w$)6S9&QaP{Rh4(42Rp2Wn+Bv=1y z%PGCinlJ@9*vl!s6)W+feFq>@d)XXjU0jO zTU)LhQF9;{D;%}g6c-oEMX|sfSn8Bb&4HZlKww6Z2Kg=;O{mXAsyUGJz5sw?0#R-0 zATL|!bH#Ds;UJ#A zw;xY`S0RbWKw|mB{PCB7Eg)pl0E<4T|A3j4(eiR68cFn__@o^m@l2TgHA0Qno@X_G z0)F#o?b)TpeNO!Bq-{>TYa9AwFoNmEe>W##67rCQ?uqM=i^O}jqC*LZpVJz*AS&Lc zX$J@2x7j&PykE0{PP~7D&xsFc7O+1sVYZW=(QKg;ACz#h6Zba_tGk@|Ijy!h z@m{UobmFP4ZgAphNe?^m^rT%*yid|0C!Ue$G+OV$&75-e9-_sEYVk2GordkL)lctp z19jDVtQJ2{)19uR7d8LN@nLS_KM?PVHa2FZqsh<$X`EFjQsT@IX_Gp%73Cr zjto;qF(aphDL!W8U@&DgGjcGPGKQHHG3{q&fG}CiWD0XGGk#&ldg&%OTbOaoOcG`S zGn0kMW+q3N$;{*mlgmt=Fy}KfMVJ}PoF_~^GgE~rU}l;yh0IJBrihvIg(+cXhA^ef z%rug@;x3M%TLSuv0z7%hQ?6>BI#QM=zK8U9`tt*^G~T`e$A8?)ls|Swrwz=vH{$<5 z{6RCGjsTMA1SXzH5>-hO{h6eAx;ss>qChAXPanukqQ44|M6vk45n+~MGrXyT-Q*p8 z>z18P9TLGS_JZ=FQe!B`Ju=KwhlPy>YDQe@C{|hTsQ6;2j~c8pn&V!jGA3-9s7zoL zI&F|#CN`saS5&e&ZmUcR+XNIQFsx`P?%3mC;(q{n$Rxk$8@#+BlwBci<^fZug>6Yj z#E9<*`mouKrTMO=6p}W5u~!huESejeiIBeEE3F}^#eBwmVMuBTpUTp=eFZC74|d1g z2NU$E-+)_u@+Rancy2txCoZWo+%Z(m^f_SZ0bJ2y=W-YK4*Eo}ZvFyyELAgoG1x_Q z%tX#yq+^ciCQ%RQmR z*@T(|VzDclzECVHAvLCvx<`(V#lk-GF%naEHj;^?E{@Ga@G7OUKNK_KX;dU5Vx&I7 zRs3C$T+EC78UFJ`^tt1JzR0B#TS%ekJIHz^J|UGspGB54e_rh;P?nKRTe3x`13& 
z&}F=vi7X2e@f;nO78#d@bRJ!4QT(^w6cZgwfAUTZue2C9sb7v1?5=uXDnSd3MoRny zB=H~Cqd)n^a}5%QA}*K(POK6!2GVz-8Q4rldwOnDav_1z5mOBNAgfj)K}u%Ku0 zsyIns!RBNc@PRoZ{dWOGl-mG`8S%F}rFhfD1Bkwxts8cU9(_hzl6kPEdnH*Ksk58D zs?Cn6Mq0}iOyAhnQ*Y-~`r>w=5UQQub!vY zTs(dCI*Gn#kVId6Nt#K7(gu<=i6QjC=y>{dems4OAfA4xJN-glMD&|_ zCF;|Z82Zt?))am2h$n42MUm8`WYaLNFn7Y82wrU|$9Q@vG=Y8%o}_ivivqz`Di0P%Pt0?cSif2>L3gIOuzR^%fI zLo|y+wfL~6^p~9kGlDJ3%+${(Y5u@1OTVnx!>}i_UBBz&GOE;U*YAcHa$RP-emBOD zMR~m4?+f~ z-n2B1j6o7dMorC4wId=zO1hoW$By)6v!j`gVVcEsEYtB!Co;`uI+(_E%`O#MvH zWjc;2ElYUchw1rDXE2?^w20|krVH7b@xI(tW*0FnV|o$Oa;6nbE16a?UCeX|(_1up zE63L|y^ZPZOy$G*>o|TV)4Q0;H#63A{2r$FGL^@x8#pe<7;<3n0H?^u8y@2L!%QDx zD!Z&lIsO>a$C++s`UKM_nLfqzX{OI;bT3y3{toqJMl9`UhM=>S&|N$J1_u6xd1*&E zpvL?g(rZ}nQVA|&M!K}gaTilJQxDTHrcIc7nMN>;WNI;`dlDQ|GmT-2GhO_PW5*Dw zW>&JOpOIVql)*NjQgBv@OIuq}>K)HJtUv4cyWsZaz|ZHXOLENi+>t zBZ`CrpA|07BkXozlfA+|Cu~ZT%m@RYZ{jBN+Tb5p=AHeQV1KwUXRfJm#{8N3QZ>c+ zc&1I6He;H=v^mrNy@`hQjX1?+dN$LpOl9in&hZ{hdon$TX)mU|nWiw60hz|}bf$fn z_GQ|SX@8~zm=0u`!E_MQKQpO~VB|=qG6#*~xR2>*rhcX}H_0p`GnC9VGEd2jGf}nt z$6j|8S{@?>w~zz>>U{@|KEXTib8px^i1{g*nEmL0(Kb_v}FO1C`XZA(i`oRP)qUw9XhEP?XTyIWP%%uD_kyIauk49fV%sB8ai#z6}AM1A;g z6}TDnDMSs7AVW_@#Vr4GMK)q-0%!-cy{r2S)yG?ijFqiu8^YTLPsYn}-(PAN&8em{ zb~mR*+*I5v!kHr5Bb-aPy~4SL+b5hyxL1V>6Ye$P=uZ_Phu4J*7jD0B@(ss~H-w82 z*#Y4qg?m>xOSt!iixTbw;i856P`DW34hk15+(*L233o`ic;P-4uBmXJ2-i%wzY3Qi z+^51d7w)idiNbv@Tnpij2-i}$FNA9)+?T>73HLYVT&;!sN@Q(?`&zhm!W|W^y=u#E z+`ZAp+?KA4V{zYHX1C?>=5&N@x8=9OskZ!1IMtTl3#Z!hgm9`Ye-KW!y*->nL$+~1~HQtn&9cVQqh(7uKcx zeL+~;!50-+Irx&Wwu3vAE<3o(wEET94!*2_%E8@=RSv$QSmodz#VQB)3Tr#KPf3-7 zuL>L1OzP-01#T(q>k8aP*!{wq?WqReG_4OE2j5aa<=_FuDhJRf8W2YuDgGrQ1&S5E;*y;iJ$iSC~0j#!izu=n8XWmhpy3ZJ$W> z%X^KCLuNE}cGbl{Hr@RJ)piXTNv=ic=O%S}q4js!^|$j4Y*K@aj&V&``Cl+9uE;&b zneQ6J#s9q>aa&b^7rNY3U3vwUx-zc(FO>Dn<@?_&Svzj}phE>!*6UsF^U;cWS#NZ$ z`H#!0tls5vkAV%{>U!5M+iFnfQyTZX+;@g(Jm?yGm8$dlcT;l5sdrJio6_qrnM*AP zEuA=?$+ViOy1lmV9#7lrQn)*AKJt~-Cc@o;mJoYrMY`SOlUnEc6XqnBna2R-GSWA6|SQorwZ4}-GO#aGlc8x 
z?m&C03(jUY&vJJlH@&lj?c#37?0jL*R&3eiO;_QBOm1H?db#fgM}MQ3mf34PanIG- zi@|js+03cAc8y}?cX=7tD^^~Q$hbkVY9_u>v7Y=fqq2;Qn-m)+^ZU(;RrB;MidCDQ zTNNw6oyb_LST&2^rr1a^b%$atVb>{Et?G9wHd>^2DK^ApZ6rwY$VJJo-Q<8sdIho z3A+Ot&Ab_(dLmX)%#-oCC%OVL@$86)zglJQ&%RKss*x`htCrTk2^+4~(yxTY;wgRP zGtXJHcQRVFg?W#W*bIU6?#al#eOOq#Iy?8|Fn))`RL*oztZMjns}wFmxTV5b!d)U)lh zgi9ChJ>mLwCa{7P?7W;7j&$}2a6hDVdJWHB&;Ve-bzppnds6UnIP z$$({xe-veZBZQxA^OoFr4)VEQLJ@|LQ*>`MXm!j^KzJb)8nmc$`k&cRp4Lph8T0p> zl*#|h*#C!iS6l%dnaa+iGspk$JxLwK{_AI`a<=+EI7wv-*-R%foy;_cX)e<|rc;=n z$8;*wX-uax)t~;%zH1{bT!j!nO?_q4b$tH-oW%mrZ+LYndvP||L=Y5wT1oP%5)pk zXPIti`W(~enZCgEMW!z?-NAGx(_KtoX1bf{D@^w=-OF?z(^r}9XZi-yH<`Z0^Z?Vh znZCpHU8e6beZOA(15Wvn=|QF+F+IfeW2T=l{VUT?nSRFfFw@VO9%1@7re87rn(0xd z-!T0<(_>7JGyRt7cTB%$dV=W>O#i|3N2Vv4{>1d3O#j976w{xX{=)QEroS=$o#`J; zPlwZjV#s&2ObJ{PxFzsN5QcS>)}3FlLhw2jBTwWbZ&rNJ&F0|q%|{{NEQFdIcQJJ{ z^)Q89{AkGEaVZy&*^Dg24YV#ib{&`56d-E;->qUy=V#hOW z%Cs5N1g6cICNgcwv=!4NrmdN_VJdc{Dh1h|)7wiC_n;)QJF@pkJFrLk#7=eFsk9L~ zZl~fM4B~&Y`EcQ*ZdJCL3E#%%kr%xv($nULNe>!q^Qdm>QR8ghKA0ZT%!<1aiM*{F z(%foFD8(fTCpY3lS_tPAWJ{|lsYQv|`)sr7(eaR0mYcHTJa^bal!2@tu=!}c$%p2+ zeu982PVdEL5NNuq%i0D&dRKQF8I;S=-L58sfR!=lQHgt8O|qRtHOgiof!hG5?4(&{ z^O3?=+kAvFGQr*CCZtK*3uc3ju>xDXVZCBVejrs+)kFdfWv2-BfVhcV4$I-Kc9rlXmTVd`g^#q?aJW0{U)I-cnS zrW2WFGo8eAGSeKUxlHqz&S6@_bS~3{OcyaNV|o$Oa;6nbE16a?UCeX|(_1upE63L| zy^ZPZOz&X2j_I9D?_zp4)Ada6VR|pq`ROdnwSAk&AKKFstHrkj{P%Jea& zk2BrO^a-XbvdvvL?)upg;okx9baqnJf(i-L-@UIAZm|SQBv>lJG6^o0;4czfBEfPAE|uUi z392QyT!JeksF7fW1S=)DQi7`_SS3NN1nQ66)CqTu1gj-bf92*n;nqlSy#(sd+}tSK zO%mKJf%-c)w+gpbg4-mxU4lC#SSP`q65J)h-4d*q;2sI?mEb-JHb{VPX=D1q534wM zK!OJ)ct`^E5za@1+a$rG5`i{Q(6E{yt;*%0QC4u_$HhADd`E8M4s|4y>`_Bru zU4rK%P~X~rLAVzscu9gC66};yxB-kUtUJ3R|@Tvr_N$|P^>f`%w2=}H0 zZ%J@K0`%4t*b2C^t?bc%|MA1RnDTWi9U zRLKGn$zsBkG)bA5J$w?YXQ!m5*pYq`NlO`^B1$wZEf5jRv_5u9pHxX1HOiMGS|f7C zb7XjKmgI$q!no%~|9Pm?|kbb_6NPF)bqMdHrm#7E$1SQw5Xb8!ye2>&ua{ zBO+N$$Qw6~GtHWiJDnr)k|e!{Rd|ou0!7P>Mup9L0^1pPVa)M+gSIpJZH6BIf9sax 
z|M_OB124k$w^V0wdKac=YgFHAb?5XROnWjthiNaSy_u#kO=T)uGx1sM>)W<>W*G zX?v)^cSIg->PTlZ4aU12wGocj_p z?za2YeV_YMQ+tBszDzi|3v^#DoZJPvuMke%^|@d8}<-J3BO@!W~dC|p5;{!s+QQ3!7 zb1k8d2(=PAOsJjEmxSn2rSTOZx>RX=O(;Y)zag}d(07FBQl;@PLZ?&N_k_+Q^dq5j z3H?OqLP9?i+D1sTN$Fos$WG`gLNSDf2ssGtBot5RMnZ{%ZXx6(w2P2Q=uSc&LiZA~ z2<;`5Oz0s(DYhg$cYKskDwVxND2>oTLg|EFBa}htO+uN34iOq{i%08tUp_ivB6mA2S?7@NPcw+tsx%9sd#21k)@ z@n_A2OTc3a8|-N+ZAK6bgJX+%hPiT=bTsn~Q41-EEh>z-xsVcaT9rmrY_WhL#}*42 za%{1PA;%Vr31+}e8}9)bDTE&owTFsD&I))(!JZl(oz;i4^4tS1Z$N^7)AqPBb8FIkW%#Z_~ zbqqP+X<^6#Pb)(Xc!CT$;91X*1D-a99PqR=;7ZJCGOjXyxD)Oh$~J}!@gfMZ zJA1pa4SvUCv9ZG#gAcPf817`~pwjCYngp+Bm^0)33F$AQDrzkAw&Ku*9Q4C$vt%=#j z$A)m+;P-T(i>RvE~~^9V=TjR!`I&|+WnUyQ*0_1KLxXl(*t;C8tUcJp#TG`EfR zOeCGrCc6xS?c8p&-RKx@cbeUNCfx1}`*qx@OFkgzg-M@e!Cg+{Pr;9`Q1TK&F@Yw<@w#kkmq+dL!RF~40#p3m!K~} zOMV_iLDPMJGPSfPAxlm7LA&uFK&d`p?zuKpw#JIPY>=RM18NF^cip?}y5F-FfL^p?F4XEc+&*xU9+fEhZHR&@l13!ne=tke>)(5f4K)GA~0mGK(P_nU5hGSu#U*A}I_xC`e_gG?yXArehd#Y?{ZAW7BaAIW`?n(8jUp1TN)ZbRt6zMkf)( zV6+5;LMAgMDPBfpbRsl0$v}CrKxQV*dr%5wPLh}cDH2F2_sVcCLBRq!noF6f^B6Ky z%NR0K{RC6uwB#p2;6zb*6h#$D#y%)wiWVe&Mif;giPypuEv9Z_nW7~G?L1~RK?m!avxNU~K!5&NtYl3v@V98gzM5jq=o zN8L$9sO=oV^iUbcEWL!-Nu5ZDom5{^5emiXX|Ju&`jd*#DY1(RQN;}MUjs=+(tokX z+CYU9$YY&EXbPc?gbE04N-Dz0p_tI-q#|VbOhP9o72&NwrR?cWN%{fGqCDNHN!I>< zZ}3sRA=^3Yh*i~sbg|RC94f$mVbCcqf)uR3mpS7$Ly9!Q(!pa(%p(nxsau}bg$E}01}sA)8;hwhZUFhI+Nss zB*e3Q&QzQZ^JTv?6)VEsR>%oAB;|Vc19Rdbr|~3I=BVR$PV-!-0i5`~Q=9=Q;KU!C zaovz2PCU-!Gkk&}bK;K#vtXxL0!?T(PpR2Vfh;wfKRJyofRU^6GtP%0M=nLY)_Jf# zTxnE?%+EWGE1-&`HQRX6X}W)>IPkJF^18z-gji1w5Mq8DbS7Z1psqW->NNfYeJbVq zo73DD?(}u154pjr_;;rd3FJ=SaQcv+-044@nRo!k%J;4_6VL7~630i*Or$e|(5Frx z4p2(s*a=A~j$RhWXTTT8i{#4}&V!H>$rqXC`_`1?zH^%MACZ##!71iI3MBWFGj0N; zNOC_rqYlC}7eSVsP7o#cIgCb-Xu$a0)Z5aqZRnh=F$nr9bR+n`PROJ0syDRkqKqTYiU2>z(=mbJ` zhU=pWJ6+!R|3S-uq5EClzl4VlMGid>Iker_?6U6ogJjUJT<%*S1Ef|(H02P4^VDYL;_Y+UWKu6SJW`x+LXfjt=45@MYmBE&lV8$ztp z+X=Bw?;ylFy^|2@^mT+-r>`f(I(-8nw)Ptdv4-D7XbMsGTS5h_2}@Xqsj!-mJ#8PT 
zr6UdXp2=M8^5IR~Oy)6EZyU8*L+Els#}c}V&~b!@2nEOxcc!7S+(LQXE*C2MUP8St zAHGQFr9{cb6REYOpRJ>wr}D+6jg&os;l(Z&>g*J#g(_LV**CjfD8n*_w@{-H!&?b% zWOy6FQyAV(@N|Z|2%gDsH$k`^KvKDf;DrqDAh?a;odhpucn`s=7~bddrGxFk&Am-~ zU4ws-2KA6DX*2j9FsR?T+yO|DK|K=fOdcKKOn&e3mOx!n*~duA+Nf;fPcHB4*ju*o zXIJX&kRnDu;;!?Kw zcNp>p{auE<34D(sUs`&fA#dM5V949|4;k|I{Ue6FegBvtZ{I&*$lLc%8S-xPGlsm| z{G1{0HosuVyUoK4c{l$hK}V{V{542`B>uJXVIQGTG;e|Nv&$Oyw33`=(&|Fu)(NqD zvJqnUWDsJzw-aK!7lhb7#Smim6ibNRlYF-PGuI9l4@p%4fwHtNiz)IY1n^=3-M zyc!6xHZ&4qCa)mGOkPQdwV{a+vwRgHX8CGDtPRJQDaZ&Gz#1wm(7dTeimdUK&7Pp+2;ad6>- zfmYKJLK-iS z%ScJb`0y5Psw)9^ zj#RS9m2mZ=3f%8Xz)cdBgvE2`l-x<5N*H$#6896v;9BB7&9xQ+ z6N9&Q*n-I z?tE5KalR?8h7?e7p&557q=<@(qN%tznu<%Jso2Jlqkut%90gp;kfVUh7;+SFIl;78 zCjJUD;rsAReywKmMZ`?5GL5&@OdQ5FrfEJ0GjSNdF-07t;IjM=bORYA; z*AbkjlIUUeiLi^1WJWgU4^N^X2a$pzp zF!jk!<#%S%HfZ-1IgUrn6tocytO{?h#U7RY;7!6j@x(l;nSu zQtpQ=r6kXRk^p3)UNFu5MAXZsxR;1JV8&esslfb!e$6;&T6hq}FAp+aB`A0={8xeo zN4I|?XhJx@8hRiqURPAqLYAW9P1BeUP*UMA-ZIT;FG|yR#}wls1*Y+y8D~MN2dVhb zj6691*i1&J#>)E%Avkk>Gd$#H$~s<&81k@bJOz-fBiH!KG(UP-S;yC=6%oGt)3i`h zJis@mg&S42j&G^vxujX&5xS7jzXOBtrP zEmXELhN)D#h+!JRYKG|qYZ;CrsDgzIf=yhSNpKCrEVqS^v9Dz~n&9aSvkC5GnB%ta znBq}}xo+!wP!M!I_`*|6OIHJrv!k0ycqQz5;?Ss!F>YfHOiJ35X-ssR8_8*iJ2D zB@Ee(9mSB{*i3>D7SKFqxozjFel25;+e|y4<}o*F9!K*$Xjb#w?pt6?lY1$1+ul^Y zoNmmAS%rJ4bf;pM%$o&-cs*N0h}WztLZvjqVs|P&>t4$vEOFb$K{qpy`F5kmopDh5 z^IErvgA_4hxm&(aRxzTEAv2<$Av2fcHzo63TO@(8Ub z#C&aYXCNt4XpL=mXFP$3Z=mjW%S&G&hQ02Lm!Whc!#;O9KB-k;P}_cYItD?-3_}D< z8Ezn0!SE!~v}%SM-RY>ZwG21A(@{qn7@k6vUt+kGs5;2-OoFd5ypZ6V3@;*hh~Xvf zgxy!b{qct8LYu_NN=t?pRiLfm^dyFRXAr-ma54+>a9CpzK0p|G2ibQ@-g3I4EdPv4-EO3?{S8F%=ZLCKIZ!) 
zLq6tvk|E!;^AtmVWbSE(=_HgtF{JlU8-Hd|1M+Wcidt+q#!na&mETvDeCckchp|<1BR?3A2MVW z`G_H_$j1y>MLuE3D)K2qR*}ycvWk4ckX7U`LspS58M2Ce#gJ9xYlf^M|76H2@(n{) zk#8BYihReARpeg`Sw+5Q$SU#!!6Ywp`bT%dKft(%?LWDBk9`2LRQRoVjHdyzRoXm` zHzdpL9`SXU<-!wXl`#yN<*^KzbNL_M!jD=9dv+;T^pxO95Vk4wTfs(14M20C0SvILWCR;`tL$-``g7`3LE_6sF zWJHmW=`nmzB-J_F80|4r-c)`$+cOrO9((B=&)6XVytl|DA^*`7)nvOUdZ$TB;cA=}eDhHOt| z4B4LM6NH^9c$P|!=nr#ZA(f~NZWTi&a7nmbav?}G897MuuZfcXQjd`b6*qt;`_ zy(QgQy+?dZ>~HkMy&4{Lg(qEV0keN4LEOFH86IF24b6J9+7tU*C}OmR23OJeaUgLh zWM{h8V}3!Sw|K;dkb)@(J#o)MiiE%3Bb}*@VH-i6rQOburQN}hrF}d@mi7q@S=yZp zS=wC;nVH=TS=v1W;mSF2wAYg`4QM2AqKDm99%L!E)$cKq0m^xvZVY(LU%jn3vcZ#& z9A*BTo8j_FKdm^`uHxP2*6{(v&;xi<}G;a08-5zdmn@1jBa`W2>K?Bpc z!xQ&HxZ0gj)$UT&XwY4rgfpQg_i?u;0lQZ>x7_H7tp}KlG_tF^&tohAL_x02c)()@ zVJg(ugPt_p6EfQGNyAyPMm+3ELr&&V&EI?ErA*$R{K1okolK#l_5nyrIfr=2Cp?A= z@>n8&_awh7CGw_67?2W@cmIf%$Xg8AyS~kk?d=dlwzqc}vc0{_knQa~g1Ga$1gIdD z`5;QTA9{=}P{c;^smDAU=%R^!&J_uL!A^wGVNWJbg^lD(Pdxq=YBrLuJn@SMfF)G( zYmW!t9aqW>`KQM=1{$*S`IpE16cAiB_|YTYgA~}n&z`uKAVs{|=q`WWLFa?m)QAqR~U7;?}!ksK3|yWO}+CZw2c$sh z7J1_?hZJ$OinL5|bumNc>Jo;$OjR>vCf6`zt}Z3$PhAMWjz6gdZ41l zuNi9@vS(=~h}Yac2-DP}XnF%_Vs-|-w&#bbX!DxS5f#UK#nVJZr#J5YFcn?Yv5J|x z89IoH9)`@YUWRaEK@apmoas|k1R+aN5%Str0%Trn@R~C}lDydD6@`!jUYzWW%Yqc~ z;uPvw@#0qm(UW`~9_KVQ&PNgBoZ+>-uEr@dwtCGUpiffdv%CcnihMR9R^)RCu_B*K zh&{=9-U8G>R_pT#u_9kUD4RyOklIZkbdk3J1^{$l~Gh*6-D*c-hx+881{eHcx`deCre|< zYyJ@s`1KuL@d%_q8rON_c0r1yaecHjZWxxvR;WW#ze!1B6J#l=-|V%W0FY;LtJh5b zM9t)OZzj%}XR?bB^MAKD6A5CD@9}0%0jpBa+uq@|y$FTuY2WqsLYF}B2VQY3q=?`T zy-`npeMAt|rx@xaA%Vsx-h_xpm_GH|$3qcQ`Z+iqKqQ?n*i#ZZOo(~@rPsponeMN= z@%WZ8b<6(OUgKA)PrLDr*If0fQ$fptELI?7cfQN^t&XX)!ZwNUF+0%AGTjricd2Dq)LaYPHmJgZ7 zi%E**L#owko@yD}0ZP(#8X1<^2c*$BnU)U$qb%z4bSfKd#pBbDXKF5-$`<%1gF$t< zBHJ=%L7wc@VdProK0wqmFN#rPqZu{MGA@Cdte+Ds^K&Wc$(HyLvPslah&9T+O(lr> z`4ZG2_T^iV`dMJvpM)Z2e-X?Px+Q9+)2tX36Jq_GVaf5C!zGpvC&3&(%8JLQJC$9{ zw2VyXlSi0snQw(hm=iTZY19aF3GoO=6XFr(5n>sZMUUsVjMLP3+l+F{O#ed7sKUxb zz%#0};_>O$(=``TGzki#{J@l|YtPH;%DP 
z0VyJ8jpcn4>?MZbu>@@~@K?l*faRJ56^VqkR6?McfR^?ekfNnwckos#t*mJhFeT*&YO%ZEY5g)^x1Ld%B- z2G20O$nxRRQC&i%7h68`ES(H50e=L_ns^kIZnJ#&cJnC=2dz=4-kl6DwUSYne_(hS z5cx5*ok?vkw|ppvDGaZ$e7Lx-Vfbsyw-%;w%1lkZGab`XVlA}w?NxX!OYm09*blvR zA$JbxcH^LR;@8rQ{$`20pfYLRYnD9ZAn>|14PR-5al^ZocqLrvy~s-M6JRxZl^ti= zg;3i*9tShtuyXNFso7aW-=r$%k{14h(1nEFvQqHX9@_}LZOMb;I|&`KeE7uay=h48 zUdsE>O2ubVA0qUTm4Hv7K1#!WY^C4~?T>4B==isM99scN7HRyL>L=E(kVaEfsYWD<|QwRpF?9kSFC{4dMc29P`c!7^jN zkzD*4_8pLdj&)z$C;wC(+k8>GYJ(xi1$KrU7YK$N7sN5--B3J3K56DQ zymj(03iz6^SEe;jh%$nzN%w37_)G{*VNKO@5(#$=zk z2vRiu6ki(lM_{Ti@>cQ*RK!C~B;-hI$3Eo~)%m`Rh$pHGd>J@E2AwAU4U$r?yliSk zK4UeuMA9mZVxQ^#R?VU$YEDP_GVt#wsjI#-ea8FHF7nV~wr@GK{ofBQcvQd7$cDPC zbc=oFQ-Hv9t9=Q-hZHE?QeWI2NRc)#BN3@{}KeWvp}HQ7}@y3j)cTuq4MmScP#+^?vqukjf#LoZ#}uLa)eINwXq&8SYJ z#V6K4iU?@+O-5!D2>K=?UkR-DO-99G0j`Z2xY;)u4cneJ2|D6;A9eU9;#-F7A)@Vg z-$dl|GlWj?O+s>B69L2LS4RzIKhL2x_uMz4==pW4yDI83d-z}0(>($?_d8` zfc)gF6mQfYH;K}Lp_I1&rid#OZ8%90<@B96KfBkVB$-c~p z&oqC44v8}fnPcVVi%??b*uwjPGpwpfnFbCmb*9X4WtyGet9EAQ=!kZ1LaZ~M%+c}> zy-}ZDLL7Tqgm~BP%N&i-Og4{~oN2_t2s4oh4&6x2G@lI*n3g#W|8g3Wot`-jhd-T$ z7?n8K3JRlHAC5t2$&3egC_^u7ht% zgpfskC;j5vDRW%%+x*41TjqG=w^x2!^4llBljS$PS~~KZ{?16`cbcq8f6^vf?aQ9c zdjR+WUhjoGaU-P2>%Ev4bz9d<40)-2nIJy1+zCy{wH%;Kmh-_p8}gY|_pf>8kAO%c zUd!`+JiK1Lo;M2#VAcJ5o)7e?C6O{WFhXUB5JyN@EHe)wFa2cGPmcT%^b;kh3czLY7>>4oyVp>}^ z6n8%S`cRf-zwurPcC65LLOPZ0AY>!7laN8^Izo0r*Ao(iZXgsx=te@Zgl>wL<2eZZ zmdfG?-ApK+&@F@#2;E93kS$Vq4yAs3>y|(PJxNpKt`2%&qEP9*}Gvx_F%#=S8Vx~MP5uRJ#0)5e7Pt#!Wa^#CLmiRJFH!R4fg$ay$`TpU z?!7FDA?@DFoD6CAUgly*yZ17aA?@DFk_qD5X&(V{Ww26~s*`B}xubRQYao*Hvvv8} z8Ui_bCi0I!E|pLWQ)Y`ohMqZmEx0UAkG&6?(GI+96xF1I;Ia%oQyw46NT@86$_mnu zUvvqqY%Jx?q#EN1@oflY6LjO|VU|zT%{PeUMY>oGDWY_`?nVtHP#g(n5I`Z#*2PX} z0YWO(<1T^}3283>g$ag76U6iQ)^Pu2{1+yO!hAgm4PQ%}02OGHQFfG`gcQ(4in5t{ z62454wj5=CJr+q-Gb-1Obb!sMQ2v;NZy*Mqi)joQ{IN0PA*$99(>GWUe84Bbha;aLgyRh=tF6k0KDBMN0N5r6Z>pB{6*suDF^L@(Jfj~uZ|BtFR`z_80xU!p0D=1Ww-s9d6A zMimmxWK=0pDWe4vl`&c=(+so)!i5_BfoJ5Z@3P|)gqqPz}#i&`LXBe%M=s89$31}t<8MRs$ 
zfla^9C@5=cHqw{%k}^A^Hi-nI_P85BlO2pYBrF-P0>ugC&G4CbFvn|Ey z)MnOjn{L&{hL%mygMD-{?whIZ_~E+OMAtnR27+2s@#UOUH*vV`bK$zUCd$rj=z4jC z%&B-s6}37{i$BgF0ggcA3~aI$IxL&~lEl$~ql`@85i#u(T{3MtHbbTn=cwCfsSI1L z!*cAO`bHMnVn9mri@Umrs$UuIN{;(2tV%=yx-gskx+hmn!+*SiT1bD#)4oY}WBGpU z0TVpER$Xcl=DZRvDnQoRAC=T2g{mzh`2 z3TJ*l7W$BxXDtn9{u47##O9ms1sysuT(ItBW}IXZkKG?0dmC1i%!?qcJ6wE6bg>+u z8F}{yERaoP-Z7Z>T6E>pkQ)1P{FVa@kD|HXYlhGP1A+7#Tj2edSY)Bh+z2fbQt6oI zo(QIveiwep)EAig3BOjf!q+R(`MmU`fp~#7pB~MFx55(3NSV67)Yh#m-|B~ zwR&)%bHQC%n{c4}upSnquRa>8pdYg?#e#%+kasG6eNhEfupk}3+z74vpvtThv9t}E z$v=~F5VCmJfNls~h1@S>YZex5(Qkv?I0HF?ANQtZ(9e%h;ueb`6$^)e>6l8#g7kHb zQ03kLqz=R4>wXJZmX6T97p83$W90;F`|f&x=)l$H%CZT?)@!)f6jq#{a(DMz36ux~Qj%&)OR)~}$&UfD6W#ZMaj zX!_8DQTZ#v`5!^c3`F)*ZMy7zuyX`%*Gh6)U@@YzUD;APGDzt|DUdq_^L!vr+hDtv z`)RGLBEQgc<3}I524h#}lq4{F#wJc+ujCa+I@@KU!+kt#^>EbkNzzy`TT{nLkp<_Oi2tS$YO_)~=;DLrMA4h?e zJe*-mdQsEn1p>`l!n+oB2crR82QZJ|S}nH)EKq%hSyKXC6k$up7~8n%%$|}T*J)ZK z9gxc=Gq|BXbUNfpR{ns_NDc#UaXmAI$XyGg!u{`Oj{6)sunjh8xf>z~A7=D7&`WsA z(FycJ#8K0pL%lM`P5Ugu%yGBfM|cs{ciSg3=e3e2VfWJtvHQeO})umFe`t!l%T@g`No_#`nedu~s z_f(~dH!b3h;0K(%{OCiULOm%L@p;U|$7S6`QlZ4xF|K34=7GC}zfl-uJU>FlMd7|? 
z!{rT<@n?GOG#VFhujcqn&wm{FZ5vz*xEt}&26Q1MHyOOH`jmwG+75k5c8e$UtSqPk zI-qbq;k63SBTV8Gf7DYil4I8{K1mvcrsX~lgR9T9@ECs$)BCKRJxnjq_pF{z^sUv( z_eW^fut{XX<(m;E@l&)({1i5cdRI9-oj})nO5Su*x=$=u4}=2M7oq zvF;hFu9{!&wMSH=*P$M%$@BWCC^dRspAw};&+Da8YV?BcV>QxB5^??!bjYw4=+}qR z;5yxi`FCq#rJhdnU8|+zs-F8fe5nt}SAO)NPG}tMlrSohwxpND+|%LspMpGmk@ zE9vxW+67RkKGmGihptlOTq(uTq?H^CR{uam-Os{xzfpBj^^C#2r%Abs*J84YAs8|S z@5GFL4Kng^Nj3&=IgK2_KVm$7s9_td)%re!NW~6UDKp@o3`1stzCEM|0v%esVV`s~ zAnkk|hr&-8UC)$h*%zerq2-XqZkS6tHgWbhAglK#$v*xfn;Z4FYFM(kmg}j~GD7NgsL}a%D%v%0*hq z28b5jqi7~L;Demhhi-;kNdp!~(11;i!K+S_B8fJH1T2**?4=4YLBbCz465^>U(@~w z!>W%8PxPS|Ava>au%zCj%p59%UdoR?RFW~0@iCA?lLm!>1^#C}W7wI3^Z!|&PuySo zJKX{IP%`_QAhNB3@yp+iaInMPphu3wF_K*FV`^sN;L@Aj4zl?H4EcHrPfP26M8dPI)1Rx5c40`!d$ZL_!&edsLJ z*2X@*uYa+oT?)Ck!ar$^iBNK(9QSx*vV3yk5gnW1=P&$=>0yWVcgStPK7dyZ*UH~o zEf-Hm)Mqp&AS{F(x>RlZ$+nCeAR8xJj%Oq4qa>>jtq<2d(Ke3iHWjt?4rulhZ6)%w zWU#iVg>WTdQeS(YEejiC-|qXNZ#TBi#iJJWd5IJH&?Dhd*4Xmp0Rl%a_G81n$aUgb zs5rcV(_9eCYM3Xtd9yVOJ96VOP&ppgs83mVARipF)70y1v*f`IhbM5FQ;k~^So^eG zJl0X4kHc-x47Z(S8^c6WU$bn}@tDCj*pP#@?Q?8CtPE2pTxyF&2WJ~>m6Ny{$|5G= z##16W6D*K3Db}p>Af3B8f)Km_fFFG*DO*iqye&71kny&nC_*OMEUX+RWKuLCxlk5K z2p&O6La;y*G98}vfOPI}BM5=$iwXHN6cGm!i4a_lRK^@Ua4`n&<%}d-7LtbkkZL$> zd3a=H4DLFOxMbQa>=4!`!fr)uwdC5HP&Q2Lqml@Fnl0;V*=CLA#zQelJ=T=euhy*R zAnn@w9-INGOgP+p2hzE(MNH{^ni25!>Krwt!+K5>Zx8FUN7|dJe{v~FLoZ5#Xt2ZrPu>y!_(8;cvvTAg9UOnjheL`(zz*6qCUfc z?PpL#1An6Du~92rOc*p9&+^pg2Ttfid*IBC>OQOIkfv5EJcF3PlX_NPgm_ra&*{0i zNDPbhIlYuJZeg)LujbiAVtrBJHG~f+yq54mgT?f- zIr!0st{tN$a*>`zlV<5$q)&~K&PDn>Jbg3hAf0a!s{2A)0A7_IuGs4;^7>lki41J1>>TJ;%pP)Z-dL)57V9vw^0 zW9T%3vvD*Gz~mO@nSLI?&t_Jw*p zzW&b`y!kZ9cH`fVcH?6iZhQ%#6g%=pioHs+1f=C_%yRMMTzwwogg$f*luN z@n%WrJs#}X7(D%C2_Ilt{IEGp^eXNHc(RvC{&q8j1$YFnKCgzyIjqLHbt;W>VL%Rh z>wMCO(*tsx+p^j>;jlXc6@9^wwkr`&6OF;4fE2oIe5BB6vY00)yI8YukX$_SSDz)E z)Q84T;CWs-4ac_+e*EpJl7K50pw8I`-~9H}i}khTSG@@RRb%@mNVLO0`?2ulo{uS% zEfo#`JWCbYPJ`4etU-mBLh9yl;Si+KuY)gZxW>Hz_o*6uS?%gnya`6TL=SG%uJNKo 
zNgb~ha_qs=zdKd(WV_^veekPqPnAqL3T8IK60VQKJHPCM-+p_lYLaCaKv&no zKl?WLat~l?m}R#B+^Y(0cSC9x)}WS8K3+C1^(AS0wo%_M@5y{dXSW_;uvj+qlVhc)3|%JE{D z`p7*M0DWl9Wah;3l*lS)TSbtYgF*tPU`bZADMg6%mtqEWAKCv4rIUP%ac zn~R|X^%=tnFnP$8yWu&qzjm!J893GoxhKOv$^9u%a-nRyMVEiH7LJ#&8Gd%~FQ&8b z3qJbLosipreL&>b&?SG@YLCel`koxDgC?Oq$(+!K7SjB}oY67mz*R^QX*$r zBWY<^85x#tsu4$}L=6NlcT~Kp^5VTEnUNiLJBOK<6 z%#st#f!h%^EjwIUa*fE#DIPIa?lgU&*R6YE%TiB~6BrZOELwJXky0pvDUA)+1-0mQ zDjAQ<*9^(p@sVOVS5H7TObAN?$U-_NhC6`_m~D=(`k4g%fbwYC9-QdOoP z?@$ksZ&M{BbL^;9#K~!Lx{*2gaQkb*(FukVuJeX_l>;ko7DnnKP@`!@(62-3J`OZp z>MqO~b+Cm^d^qnLeExGggf>&=uL62rPWYrfK)*>?TtoY;gM zJ4;$Ja89_B@I1)YN@W32PHc-{?p&!f!<|7}9QTcX^4ELK|jT5N+va7PJtdJkwq*;V?0;SVFBZy z%QDbp(QDR5HUPzw{A*!Xwj4&%>d?HgIqG(Oq=ad{4MHoFG&RDXY6GJ{m%1W6V<1D$ zcqJJ{n2a2_Wf)l3UPwOuoJwjzMXO*$=M3Ey2vU)(WGILgu0QlQfd)0>Y@Hn zXxv{fsv#-8Qo9o)vVa!S#Xi!X>>L{s+%HXqr-f=AngwLd&PhRSNYb={=vkhlgPvo% z4YHb)m!_S>oXr`9t=M;Bt&Q9|CyRN`9EW?dR7oB;;gake4I6HjK2>&)bZgqls=mOU zY_tx*3l4^}Y2-+O|7v6*s)UW`haDBUnDN-3(e0Eo)3npLjZ_k$JkRNRSpHIeNYNR3 zc%{k#N54fc))q=5+)7~$Fz-yAq%>R35f_=WAcss@)6SM@T)xlIr5^_aU~@wjoJ;*H zFNO+$wq=!4QuYEE~`OngUn%baGrc#>G15mSClc z$DB);WM1E9=DZ0PlD5G(s1(vgq*O(%E`#6&$2z?U#^_O_8&ll633m`(3f*!fBt6e% zIs`&sp_l8tFv3i((80tb{Nbd7dtnd`Jf^Cx?*$S#Y6DN3Tj6v|XlEFjpiex?^|% zxz-Z7atKY^Iie7i=(=IW;s%&|J@U^XyM|e#SZ;_Q4)KlD7t==b!~B~@j6ufpTYV1j zR?}|QXGg@tx9H#oP%`igL%L$ud+*`4^8SJv@zB- z#1AVA1P)@W-$h_edj$F=8&+dsokvxllIs*0{XS|skv9HV)Fe>;suV=}8muJnKYYN> zB0i(jBLF1T4o&5%ou0yKJE^d`wyE&Q`f5+E%6%%qzu-xtKDR`n;S0tpTU{zwSY2XQ zSlu?D@OqWtU;hY1fI?2 zaB_V(;iTJxsG~@|jM?CrCdR%iG9B5%w_6yyMB*yp-zk>v64@c)-Gd_ZzAJnn1PlJf!i$_I zZxHDrk+>7(n!HLBZHIC4JYw=zkuHDR=y%$H$V5D4K#X!660txRjA}DBPg9cj8u9yt zaX?N9hF=JC@`};h#o`?zce_}*L*#E477Vjq6t5DE+r@(IV(MHG`i&S{Dn`94;$eb= zsxdO(E1UzU%x1ob4T(l5GV;X={ITVFL<%MxIU*i^_kq2@&ybh^KcLiDsBaf(_)`iG zg9D*eVjPu8z28n%N=1IH7#k8zJ3;!3#mb!sfKvFLwnL2DCg$xCIorje9U^nPDB2-% zLSozwkrfgJJJA%2c8f_{MA{xP?|>M$MaP;gB7k9lZ?cW 
zu%uWfZ54$F#MA>~_5mSY5wnrDC1M%=yet+~i=}Uij@4oY@NvphB5$V<+eP&fqPc;F=q#IxpW7n>#h?$+eOw&OdP8cZ8DL1J+gkRqfm$gVr;?iw>1t6 zvQ{BAv!Lp(hg7ObOx!6F539}w|7k=tbj z(0uNGbdK|}|F6aP4d4aIS%ugOW`L&}4b0gs#_SfkyZ4LOKjAQAmI&u{B7P6DdUBCt zyNG>HO!#GMAGG$CimZ0A#Ia1&mWoN+M3G}aOmqwi=Rs~?2~v-B)ch~)C%2)=3j4BL6Gp z%R*w5^hVPh*6!UfSLh<|pco6|9u%oieU(^T0PbV1Ft&>E10sHxFg9>Phs4n$_Df*p zTnDbkddzxzl1Pg&o;i6NtgE_luiaLpmu_(!AtLx7Mi-@yC8Er+-4=U5wgOAUJ_L^{LpU>{>t~>G-hMHjyLJd$ zahsF4Jgtu49|< zEfM3l2jI^PrS!w@Ac7P0$@!no*p?tpM`8iXA7 zeIkCEurC&+he*uv(X8I$fWYueG3u2gGUIPaa-Q#sltaS3Po%FGw(TNyiO70L6de*% z4~rb2(^D#v2SmX>F$Gl6wqCA_wtaHZ0nd*<-Pi|T_LymM9gT@u5_8CwaDFZExbDSis0pSH1I`WDA zBS!eMLwA&l+;>EN2z|Es3V5+;Vn&!TQDOWbkmKGjGTGop@Zy?SUF3tD_le>yN2K=( zs4xLsGQ=GrQL~-uBt-N6M^Gx)HzF2d)m6gTCd3k|=7`p*{{hBjfO8n*I5I7dLwUO6 zj$y`*X#c5X`~b9He?+bPED0>Y^?{gvP`LI9?-KCqAjr972P5WvWwaRLqA7hGCRFsW zh`pZXF{1Qc@Fde5ry$dkL&COA#4HsPzZ4m(#6%;$Nt7NIu{qQ*F`7@8fG9E?uR*9g z&u5E6O&`(oRv6c7BsoIBgBy;>!|Uz3@O>>}e@CSw1o6(l5yXgUdtvgbE7um##5Bhakpe$kAi_B!Pp*cMM|~+q)3VUEe&eE(`YXGe*L4P4wIwwb zfy(;E`3oACmIo@T>MF|{%KfzsjdhiQg_R9~s-+8-X*FG~wY{Ak>->Ff^#i@VJ$<3d z?v@qhojollXn~qQU$8US+#l2etCudR4;>W*#K)YN-zu z7PhoE_XR?I%^jiss+P+7X85X{(YL;%yS1`*NlRB3P}8v7U)8Xxwlc81v9fMepr*09 zp{lmpzqE4Mipsib|EfSmW%;s-%0PW%ZSAtU2CcEC0;W*4AW&P^5P&N6Rm+xY1D!R^ z8y7WHcXXeyvZFQBuJ!jew*+gNdwYYewZXpH)?oC!D;rkI**4Vq%PRxbl`AT%wVICZ z70vab=Dv`>v!kuMD+n(-?k(zN22=!FdZ3q6yMreMnmd6L&7q#Y|CR&6uHMk5$_-jz z(aO5Yddc#R{_0>`bIYb09A)`Huy0d&Gf1GXITUPd=;#Xehnl;3`?d0_Ktok^)zXE5 z8e~dsT~&?0ZWUxzR{WBe3u_x|Y8nF|-g>0;KNr=^-k#pTy5^P>fYyrX!1soLpXT1t z(_P!w(bLxv+Em`z+}{t(>*yNj(iR1qdjkuBhvm&J?LpMIz=G!f5VfuVfh}tY)K@gD zs93giVRfZe)!h;5Xzr}<*c@E8zP35k(q7#GwI#778-gvsP5p1yU#&dzA{ zLqn@=3M{DfquteO{)Vc+f@O6z{)WK(!jgrBC3S@*O@$?a`lXPmIA7ZrM7;sd`$Hj2 z)pYdt`#bxB&8?g22D-b!=#I#^lD=R+@VBz7Zb^q$v0!CWU}W6 zYS`2ZCbLm%KWRnRg3jhP(6%06$;J>!dP7BXs97Tk^f#l_HU|S6nmfVr>RW)Y?%ECz zUU=FgIv1;L>zzTnLV`W$Y z0t>4B3&BRC6dWg6SkTwg(uXYAxT$D*eS6P9XKO`AKibv8-U0LrJp-Z2j@qU5fx?1{ 
z;D(NtU{&|}p2}!-TehM)P~T8lSGf#+{0)^_HCRyRijMvc=%+r|hOQ1xQ0oq~^uu7m zmS8JbM%dGqS65XoZ3t8}m1_Z1)#^pCf`C^6r(9XLq6$1{`LdeYWlNzFE)T)()>9jH*?|4`7dJcLL_^JLSOW%v5uH{Pmf#f2+nYPOgRLO_eyx6`@*qW8Ul8=HKN#o$ABIa)Y1p`C z&xThZ^z?OsXABpDugsQia?+IS{!r`e+2m_WA)hnCYaHesoh|><)N=6e88y9vxc;98 z1#Q`-EF|cWqqw3-QdC^Wud&U zAH1&=lw9_B;bK+AFeFe}GrzK;0#;q5IKq>S@I@V!g@s@eUCkQ<(g}m>8kSn*$aTdF zi{}^niVcVWW9rP}AM|e~x=pb89Pj{{BjMnVB?%|5PML!C0-} z^($*zz{@Xa?&|2=q%9rjf`z)HH`oth6|qrMlhan9C#YUlzC=beP@%l1t9Kw2l&aj& zwK-Corx(GTx~qD+x(58A76>odydgp*vmLSeOKO7an+G~W%mFanNXeDY;N_ABMY1r7 zJP5YVhsEUt>Er4l<_&iBgo5bXAxK1)wnlZV1r`R@_xAL|8(+8%t%mL3#9>nF!S=NJ z@Xx+4S_~jPXcw7uoHmg3T7@BD{=`XQEq%j3r1_O$k*1?)-78A;cs1_!r$Nz z)Ko33gB1>9-lY{)kON-a-vq1mhN@Q7UC_`f86392@&GMsE9)9oNvF`!vaq?Ut65tQ z3VTvZwA?UUThLftU0+^TS-Dh_)mTq8`!%ppSZ!J$O!v1yul-u}BDFY{2bNScX_ByV zSQ{&0nSsp}&F4S0M(IXcJNl0w=xzz&=1GH>Wk4_lku1d6t-(-p2Sgy@wUd|Z6^+$P z{7bdKigJHV09LRE^;N4Yj}-TS$MA0el9mrNx7N2ex56F_orX5hu}-N`LrZUUp)^U>J65}h zEup^xcBYkeAf*K;-ADy%oY_=30vEw{0#-k8>Gkugmeu#mXo?z`yJUm5u(L>RDCJZk zT&4{*sLR6I`rcr$wI9^8H`HDa`@2qDlfrwD!lNQL)PW5h!IO@(tFA08!sRaj+prMC zU2#5)L3%B~8s@+lGK7)N5f-rqlpF)m+U@7g_YUH*ZjuHIm47GA_excx^ zlccvF-ZE4)Erk7g1DJYEU{M7GTVQq#aOeQrcHVf8IFXRy;Cwg%fMsMM?b6XjHAvT| zt#1p4S~PIMD_|j74-wpgp1%4OHCk8ChM+9$kJz;1nuB3HY*#h}=MSu3AM9JvJZvPe z?T4_ocG*hs0*y;6Bh{&5I)5JK^zKJo+{zp4>QL2aL5o=Z zA*iRtK7e79e?A=*7k{A&r8V{1HwcRuj07aAl0tJAi*tXW6eShvAkr|OlP$Fk~t*fgfJ^*Aq{BivF z@t@ta_Zhq-m`~+>nohruF){e=bhTYq4&uMQV^JNV$9dG9dV`f%&55O2VNPFuQ;0iS zt^TLf{z)_m$?}r7z59DIp3Q6CJE?ym2@bY_#EkTeQ0&^rUJmW*KW~6qa^%HrOz|B5 zCjRsH{J=ODD)0#(ZJS#h=axTTwg2u8Zye?x*}*ofCOXA&woKyf43|4o9L~G-@+sfx zI1K4Dli}keisG8r1PiH)V+l=~9MJZ0o9ipddF%Y7eGW2peLHr~Jk^{jy#*MG@0X8E zQw{;SK^j_9C@Gg_5IHmuG4M>!^=N`)9mPPglI30PB%N#;-?i;r1hb z6);*>Wq7UI=Hrc%^WiUzfBUz}hhHj{|9$ws-~ICIV*7AE{oB{+{gi-`zmC@9yT5(^ zc=YSh;V-{_{pHu&Zo9$1M;rV)yM4cW*!=o%|Ah1ZRe_HU-vs#_fBO}{XshVyuLP%H zp4m6XL$74V@U)inbw(RX0sZvpZZ^InG%u`!oX!B> 
zOmX_H0g<*89n|xuNh%qcM5bZRh;t|S{?SGXf(G(^!7=%Q`IJ-JtR$1!Bc{?KS*G0*jRqJSE4#s}jk>7Sfyr8I|g>9er2jm2opry4Hfat?|n#?gNlxlc}6kvBsrQtc% zx(z=_J)tvQSjDLfm8-r#QdCY|_-U)wO;6;GX(+00ls8#RfyNBTe71^LLV^O8;jG=@ z;UyBq4(JchHe~+OgwjdyG%T=#_L|q1HCf{N@c0;0P_Ok{$!a_!$l0Wl-zI-O5Z`Q{ z&w%6R?jWo2P}d(A*}=sH>3+c;Y1CT)Mk7PstdXYx0cGJ&2oP6}x+5UK+GMoiR45Mo z=PokbIlEe}zs;8b6=oI*U@j=UxnTbD`Nb7+mP8I8XJ8>o;*rYV^El4TWi@qcB1 z`nT*)zh-~>Te$PJLxZ*)s9~vSf7m;g%A3Qz|79UIkuZ^gMm6C|RjZn$m?^8;e+1)c z{h3IeqoV(=>c0>4--j>8isxjaSy{|7I7yb?Q7n5JJ#Q}VHC^H@*UvTEoctyi3VMYQ z^kLqpUy}EAmm4DI<^7jzjsKaX1GRGTxpvNl>&P+c^)GtKWc>)FKYNVNQKO#>hwY{u z5;Gqd?6UgOIpv>Y5J)Zd?)U|eB&jN4+}>f(b71{e?i-i5*63~4kCWtSOflerDj2nk zl~nnf>(Nub20zB-_;eAY?lbcTQ;yw z9v;L@Ytea8j3AA7xwzL%8RH~|EyiQ4+CUoe!uPI|c_kf7dDT8pX03-^1-vX)x#$ks zo%SDQ-3Vg_l1yI=2i@TyFycfs4a(~ZfS4V>Y@Ca&uAPIF30f7RS(GZN_mJwdG)IRl zO1wl}sZuqLNyx`j!_JFHQfi>m=+V_GUjWX>t&bj8mitTI_n^={871~-$^xHW3A`&8 z0|dZ874AYtC%9@hJ831YTKT&i4DQZH_tQ#piOHTQ##=3qe-0G~e@w2A8yX!;0TmA+ zh$1Lfce=j%x=I!|#9mY>3HO0aWc&LMHw$XC+}m^h=E@BgIca8A3L?RfIQX!EVKD~H zp&&;NAr&9a+j8}6we9r-WvJ!7ptySPJR_pfX%ZG)>UBxMWE!9Qd{EDkC>+wGL;b`{ z?xvZ0u)#Xk5;(%w%UZ2Z=08X9HSEnUXZY^XT;N~9I?FB@M&Y7g5p?-Sb@@*IIO*u& zd+Jwz-zT9$IJy@kZoYi-pCHKMdX>sar9v*}keeO%mPkbri>F(ibVlDhR1;=;2%rRP zPS(o@C>BxZuIaonQ@qcr4F1e$kJ*&xZUm)&b zM~SIcx7Y&~vA;bOSfkyWwJs!Or(~}%$9f84CDxaus&yzKAWK_K^L(;)25wMb@mLHe z2drrQ4%tnx0m;;5y?>mo&)COS>tdGh;nugTI?wWz|4&EpDoL8ux$F)t^NXV;@O-lx zm3M*RkvJ;<4h}bpu=4NVg!g<0$Gqn|hkL*Ce(!gV_I~HX-tT27Fme+q28JwGIKSuRz5VCSq*np}dNFE!GOR?hrFjLh}z z>c934)oC84;XyA;HGdHHvdw>OU~48Px8E(RVYB<=(f!=$4&X|V#3Uz5`iqAM4wU1( z<1yarhTw277kQ@OS$u{sWPAQ{@KMOqhyANYYGFSG)UOym3WSDP5!jf2womlhg)AwC zMHP!K%+j;t_CQcSUqb4P6lIdRhl_VSoD)ALj-u2#NK)ddMj@0-rl6RaOEgC<@(+{(#a^13=VgU3tV2 zZ!-w;MOW#$!)B-EKoJW-Vg%)%QO&{bNLhFiRChFre396hoju1C3*ZF#J^SL-Hpf&)LA~lk*P3r6Y+Gk^Pu*|QCzR&c zLs1B#*;+NLN~F+Ntq}S8kMY4THFJm;q+~Hflt_F95%rJuTN0?15PQR2*S` zFf0_k>{uC0U*@%U*Ar6&Erppv&i30}!0U&5xu4U?FXWrQhig=?o*Y2fGr9~McQI1c zRGfRra1>6m7T@_$0aL4lb2@9m*MdhxC02l&bSh{bj|=av^?f{lm`o>^wchCdD+qxA 
zo~b+aDL4P*Tu2d0a;f<`uEP?D?<>5UCk3UWw9=lYvtq`4?Yngomx zadvTn0R%ZvtV76E>A660XA}hor$2F^W;=m|F|r?y4#7#2v=GH82}$kJE;s2|=uX&1 z^BAu+CVrDyD>(yQ>Hz`9C+#BCqXrXa(G96cxy3HvaBx4}ZB2GsasUa~Rm zjTSv(ws+>A0K|lv4(xnS$Z$VdE>1@q1*xEhUDl+Zi61UNRI*;!A9$iIpjo{~-S&0s z%y*QeKo0(&AqF>dxur3W*P@TH)2h%hBc(xa1WL%< zmbKQBNV%x zeQn%t=l#E*w4||%ysBT*Th4Je2ullQFNMBZpT-<7@R+3$L)gY@d$OIw08? z++E!L!{)qB2Z8j~HA`RLp9RS}zex;!ao?QUr)ghE2X01l_z4g30S^(D|5neE$8l2f z9@W|{w*kB4T2_rYJ^?M#CQMh<4DU$b+$~quRVc(&q^ME<;mjg7ND{Lsa8*NUdGJdvPetTFAWCcTpcaT@w{kGOaYhl;OgDwc|9#x(L z&n8G_UdE*D?-L#f{sCYDX;fq+ggJ!K>3@L-*um@^w@-#HKeT+RKKt60HpbjK_71AY zdJd#7S;LB8t-l;n=ZgP+B*p5fbbyB6sCmO>O};c_ikq+u-R(6Jgp96Dki*P61C(yCZ1km-%AG_^E}=PTE~V> zNvSmDlwIl8$Tx$<%({xQR>{DtXu-dlezJmp@C`Bh%I~^XtJJ>loc<$z=PWskv3zfN zS+%d}MtC1u`ZTv!%IcC%73t%KJt12uAV^77otvgLKs}TtXHxrhd3@g&qaQvxwfM0H zMd%j56R{U7q#6i~ItKNglS7b7fV#0ubSC`|@$*WFr*<(+2R(Xl(+0j>!bvb~ydx!I zp;M=$tu*o$eou(3{nPd_4IK0m%d1fI<>p9h(@6#vI}5t>(ht9Bi)r>%I}N}5p17+8 zLFPe(1cECT!8=+_4Qp*XJ(eL=WHmC172J7sQ_9RSS1#Nx4+Iu1M1GZG%fZ*5<3%V5 zj}xKP3HyWOy}lf+ComMiEhozdF=fF4mN-Rw`=oU%scn}uT3xIH6=QQv zoE;#*KMSiBDj^DR3&O^$QN7&F6BUNZAoUXp5}f~fN2mu%2KVKX%$osMzDrZ%otwGg z=s>z8wn@@QYsFq_k4_$v3uDSP!R>%VEVU3voQsX@H_2VhE|a3UrcGtnOIhZk6d?`X zkKo7vgSRSZpuh9lcxZLj;(;i;Xl^bkizevdR{0igrF}TUjbEpTb0N6TLKG~%#fOR_ ztW+F|+)r~$K~@yTAv<2~Uh9>FR>eF$OrQmk5XK3HDdkZh7D+v@-JNlpW%a|yY8;)W z!(%WrSV)^K3+j^R__GV07+W9=vwmuYuJRsFEs*h}94y)f1tiN-f^Mw`e5UZLC#}&j z1G*p!FD}bM?4Vhv(*}$p@r?^h`%@ADWgzXO^twh48c5LH$>8HSI<#_P-REemGNFv& z{l+)(iM^;>>PVIsA6++t7qHWZ*)-005;CnGz*C2tB#$S;#*J8 zFF-%P#GBG=^fPpVa}Rp`@HG))qPaAE2t~&n1f2)0Uxr^YYTbvQtBU4XYT!0mu0sMm`EEqm8d`fDF|~ z_un?**4jx#JSUgUiB7aelT_^`I!kF0@6zGdM_S6MmxN(|pGx+-h#TRw4}#BVumU7& zqE+9_I^Y~OA>s6>-%6zrAzW};Y#5VM`R6xS!WN>=i}NJ*5Rmy0m>2B2K4L7L*c?uP&V!!wWVOkH;-T?M<5e z3l^&qul$?hA|VI6G;<*o|M9Y@bER~WIH$5DFYlj+`drdKuZB%qe4yB*$8~vy97j2z z?W^_uyK%VXV79q%-!G`8E3IFXR(-m7m&NI4IzWEMow8D6y+l~cvIyxjQdA{Ncdu&u z4q|u;W7DlEgFi!ogX&|h;-AE%NG~ze6BSbN#2`AS8uM>pl9s%-?gc&(h!O=T$PtLK 
znF7@lbRbSfY!saI5WTGy5@al+$Bp9|ZKytG4``$Xx_qQmOFj@^0cRc}0~7CXS{pXo z7g;hWd?<(DN59nVNS#?$M*i!mSYDyXsVZoy_`iO)L-X3WJ!`)Uj!T6XDIq{mBDAh# zxYv12B5P7lYPFhhutJd<^Lod4QAs~lidNZWG5B4oEPJW!bD zAbsayoC)PYs`DlE@Y6W~32Wk)iaPT2yZcLIRw@OIVQfI3*&=I; zbUAc}`n#KP4$jyEIZ>`<-T@*@{tEYy(Y7R+=7cOCXsR(@Lh8Eb{fd=?q2hc6-8ViOLxPLU6P?@f1({pm!b8YH!;?s5$i*eSpS_)gDIgF{-Ew7%X z1O_ohIglTO(hjzD_xW=5$$4Q{$^oV*!NP%xw7*F4qLI+;XlKf{Ddwma`k;{c`^g`y zu*6zobtDgwo}oKk{}aXCb=d2@_qB?{5%R>Vc6fGJxL(8!jO!~1F0xi&u|klb75jN{gEVSMHCdaln~6zR z2u+hv3Ns;&EKpi=btreb%_O=ioxhs9{g3+61^Sf42w==qaA|4K!)@K#7}Put+~e3` zR=m~)wOXrrYc8c{So2?#5*c3FtVfG9xtBr+wT{!}W~+JXTmfKJzKtDvKNEnEXHk!| zm=T8fAf8^6*5s?&_xz6h^Eb`G#APiJ7jY#SDURho6DBpM?K6@26h3)!G&eN_b21MszBhT)ATsUw!>qCMYnSj)Wm=VZRlMny^gul~r_bO$aC$Ea#N6{d`96N~ zt21^|8^OoxR^NRTO#(iClJ(bi5J7Xygzje5Uv8o#-m~O~u;rgOhzsPa7zJOJ+Dmmr z*0O~SK(cFnE7x|nvf8au?JHg6!!f2E-$D3mW|h*hx(KP}!!QN;5V;YE!)RD&hNc|; zi3lk>Nmlxfi0n5>ds7=v>F79tcx)y%p$$(NM>wk4jp#RKsMYGwwG22xi>3TgG7r9!oUBJHbVlC##N)?(R9k4& zY?0G1_kOs)yxl%g=!nlW-f|n#w&Fg_nu7V)4a`Xqslg`@_^_q`{oEs}f zlo>P1{XTCasiT)?a+r2@S7Y#+)%Z!UjwYpS-Vo6$kl zCkoyUz;KH7*15)=*{UG|%nAwV5p%-}+$l3S)am&MZKN|pl3vWX2d~TQNOdGlz*gGJ zgHy=x^^{Q7cUXzgT>HF1eJFT7!QIxBk#8`L0EgdLP2<{N4je?A=Xr5l+-cI6V6dj3 zj+(h!U}hm|Id|(ZLCi#QDI*_B9O`V;Yi-SuxpgtXB0vY<#M;v1Lq*Dk?pEy*K4F(e zIr0?9acz1PM9Nhv$vbdGJ`o{3sB(Q`wNFzOF`AtE8jn0lx)jte89VLKST;O{A8j8?!Zrkw4<=l%VRjBS9U$gwj$Zdmm3q! z-?xRZD)Q1kh<)Ru9|*>gF=x0dz2$O&vDC_vsxe0%C{Hw8!|C8cbU8cy6Yc7R&TO9} z2Ez|gyb%i&o7eH# z;WBcUlT0z{$z5-ByOj(%-Hh=AZts1eV%8q9)u}`Zg`oV^I zO5bDyy($!r=@E5#@Pl_~?VAw)EDOFp*cpTi>_ug#a9(v1q_`?YIH^7N^J=J! 
z6&MDa1;LOF6Q8`UIn$4AN|O|kADRHl^?yzdpXd`>-*u4et?>Y znUMKJo;th_Va-&W&{c;@sjl(WzFU(2497B+r`BS;bE-l*2-vpXQ|dlryO~%|y2ECg z)CLA=k9(;Kg?<%I4N}1bC#}m<1HQ8e-!K45=_3nC8O#N z&!4~dVg_Swhr!5n00!}t*h%O?^k!Gy7y8cx#3nGQ!+Gqduz_Wg!2MTsy>S#gxHw^4 zR@#vmn5*Q7!RpNO&s6rt-M4AbC<4m1sXf$88(!;ygsykXSjeJl2V_m5oq}_PbaQoCQn; zRrx3Aq-m3^e7!Z>+rB^iEWro(9jJ^7ioW?4QJcMxF4o{dsR(yi)%Lk-F6Zc2ZBxqk z6gmN!SnM8cD=tdR_*J`F@~6=5z3CMreuZ7W$W@Px|59rtL3Y@KTT9Q>B<&qsU&m)v zP!Kdh7%?VdBBTg|HumLeeOB!*=dZ&oJk z$ix}btW~b8d!VB%EPrtXM54nels(`oTRLtlB#;`3Mg|b(R1+n1=)#unMzf(rGK~~y z*;r*8?eP?)JM)TB(i$G1Sig6_-*CZiPFql`vkz3+>0@mvUovBw$ZR>E5=HSs97~y? zzUN7v+(=!S@Qh}$-E5#9Q7^4IJU)?y*S7gG36nki3=3Bb%_^b6J$WLD%PQB7$XpxGls!2%W);F2>>w_c3rJ9P1$G*t#VQz16f=r zm6U$+&k&mk&*GVHi$G1fkFfWpYSRk3b9(ikt0}$L!gtEUc$Nh+a@N!3PUNeXw^Wh9 zO}KM2d`_D^rz^(si)JBe-TJaBqLlz z860b$Tc%au1<6BH9G0R7b$Jy%N<5|GK~mDQbtu%q#VMJ0CypGfan`r2*)?duQB%iHn{#eo*MQ9|e9M7<8m6cCrS_?37O6oG2y z$Y@hg(RT6NCqTq-!F-NOoY4g-;*B)9D1nYk5ga5%g)_?{uPh7}UOEx_z;jF=7y^C4 z>C2|PanXfI_JiR%&VlQFhk&WPO@1@XL=FxO1uWs2Yy=q;bY65);>#;|k+1A&<4xkO zo0?)9C}s?3!U5s1pADtjafT*1eOfqHHB_(ln0PbcBNk6yF>v7QGHnKuGT3~lPR&s1 zso~NOKVE)pe0Pw%XIW{;y1koeo&WWPKdYkIz7+6nF>UODBCI`OwQaUli;;x?0^6Mo*h{ zoKeuip31kl(^Xox+zqkc=fkHVSinLm4MpDHG6_apPlv=s?v1q7P{N9Z>8xag)A+qq z4a})iG#J;|cNOK^TeRPHDT+fzA6}86>AR{Ag?V$l+DP-h6Drd)Se_UvwL&@>*VD_H zyX4LBxOVTR2@b|KiZoE=*};@dFsOabiD#)ra>=#%(+<~hhQ1X0%0H5H+PjaU!rKzz ztm0WEKY9qlE7{E-n9nzkkXy-qdf`L=hXe8V(D&Wk1!(*l5xjF61gHS_At#evN8x~2 z6p@&KkyJMU1Z}60-zE0;ze`k<-ae&|uKUDl2H^ye(|RXQ-1XyDJW&U{Eld$as|G<-B`jzLDh(Br&A=`u*$Dk0$ zD&;Hmv4j}H;hI0ws=|NFS;YMU%Uiz@p0q1k5=a>yafMUHVjI`*ZHYV`EpRiJq4Fxshn z=ufxqs1RvMVWHK1TnKDl(Jwcb3-~py#h-unKC3XC(PH!m$(@lzl8!g(NS1WwAB9rg zaXv+nL8vkuG&V)#s`U#U)##C>5G(98YUzO#cibTR68=lhWDH6HrLenAPUyc+Bm0?A zRRbgV(##HtC*nC3uB}m-sKRuRf&%4bG5sPAW}a0MDY5ymj+_z`X}1zvHrh_`wxdYv zbs#-$+I0O7_-JtkD0mwLLR1__U(>(5bb8R2n8>tGkZ!{{aQo~&k#bgS~ zFQsqVI_*@~0%h)z|EQr4oz;e8&vu^C;C1me`H zwe{Z#f!jA?Ts+RJ-%p1tp`=eEFIaoPZ4LToKe0e4j5oC3+P|vOBV2pI#zKY%?O8WL 
z_Vsgj#B{c?l$7aY3*925NhzbEb;ArVG_mqa=!bpbnq~{3bT}{ZKrYGHUV#FnEl8DR zVx1@hJw$ajR*8@>?@G82UFkwDRLwzSynHS979(D73!mCE_^w8?Eoq%7ak<(+Zrgsy zc$Sh}Xu)Ng3+bQV3>*0s@t)o?W}d|off%LM z!$CqilI&QWyU++i27yx+l*T9ZEh73%tV)_w6lef0!ceOHA=Q$GO5HgWK*8Pc)sb&X*TGcWUbhmj-6 zn{g33Na2YPsX!!P(D@h_^Y1^~*u2L96dHhBUI@tNQXq0_sV0bw4;zZ>-U-0vW4xg& zK*!5NCsA?$XXtXZ7En>#u7zhfle}N$P5vW`w}0e2KaYJoE<^#~GFszk6mk-TdLWd< zL6MW&aq+OV(p286IG@(&U0O^jf{ts)JzXusr3vZ^A;JFPCztIkP;%lniT3?)mDk$h zqX))LfeH`G&Ws=VG;UU3P%Tvm6p5>;8a|HVGiG&ok9lDTqriCD-hN!C!UQ2k=w*x9 zeVO(>#;`wOR~)%;lenn@n@aYlYWAlCp-+hMv&obj$=y7Wf&(OpY&P~H@>V&EP$EEG z@%{HUqK!E_am@5V#0R(|MeNC823S!oAHPfdpDNj(st}lElm-1W2p+#JY|hS_n|h!J zHc#szI!?W;UeRQ#n5fus*UjgeHe-M8p*JDeOIE7gX;M7X-Y&a>g-TFBcXq;6C4G&= zlJ2XS#vh+r7&Od&n+Os}5{BeHAs6|wMszac{!RB$X{`b8y0utv-{OP7<7KS0YO`Bw zd`6i-pV1j4JluM^+(86QmB)fSBb~lJKmX~nj+u*bZc;UZz%kHCB&c#nwfJ8D0XP{d z)2hQL=;OjEtzdm8*{oZ71feB8%w3Qp4mt*nPoFs6Vnb75mnV*6;nBWa+$;a+=4A9v zS92PvEe4Sv-O5~zpAAkzL$dW zYS-=d?C+);%S4XgxvvTS!&qg9R48&z%^D+aJ;%k$Pr}N@gQ^>c{dMQYvB_5m0)&U< zcp0u6D8DyF{;gbJ!7_RW7EGQj@~e1k@gI_hUKY#fC0X( zgrX3jE;e`7#?_CYvVCF{{29LaFmo&6*XgCwCFpW0$u&bGfBjKul#F z%N|Xzl$4z~4aU5|S87bj0oSmlBsZU}C{oD8&e|MPM!6cD{iIBV#HEf=SbM)@N=tyU z#dC=AyK-4M8ls~2CNrJ6vY!u|&ihBS$has9a$$4^#HHh(rNvS@M#M$ zE>+oM4RJnth>Pj{0U#e2GVC3Av29>q7fUiRwanFo?LX z-Prp3@%QhQ3Loc9OAW?JaSOHokw6)4`$W!FhPz~S@zbj&8e-m-L?2hXhT$^{IT;uU z)pI+S1#SOO+yif3n1W$#bQUOiDdIOhjzEVJIQS_7Yl1E{c;IuKbJlE$;MOz=#ghdd zBH(|R`oPlHYVNF!*|$VWGg`ws&b{(uR_}ic7|;FYe$vE%kCmBiWb!}O^o`-}?)8K`^sb6Tgir!l#5S{Rx`Y%5t{a1OF z-PbMsedF5%POXNdy#}GmIXf0aPc-kW43o)R#W%Y(xKb@8jUf^moq`IiA&*6%T;Nn3 zqe&OqRgcc|Pk4Y(rO42zIF(1MAN8el5JjzLiF6=6D5PDsl+xP7K`WZpf04l z0^4YRLc-7T>epY24_k;ygB$JeZkw! 
z8-nxxDZVA7G@V|xbB<=P&rU(pM1ke_pkrb-uDoe=xA+^XIW(6`nkwgW7lIWVpseH_ zynDUjWFBnfY3{I4_2iS8f25aacJJzc^xx#jo`@dPnXIgQoO8j*Rp(o-^jNZ$EW#*M zp_H~JEQ%7pt$+lMTd8&4pvQK9#C!>zfPFXn!#y#!;n7@GbdJU65mR@OJ-q0ew5Aj_ zwNHFQB^=8ryGFsEbQ!G$%Andv7&88M8Q{^d0AVgeFuvbfSm^??&Z5nq}@6dt1m!BkLog5=L0VPAF<*EVV?{f>+kdpjUiRQuf=0-#{r_C3Ytu)&}E|8bkvLUsgGog4NOM1}0*E5$WYE0}5 z3=q#9pKg$sdKlalH5#PqLrni8hUNanHZViy;$(ENH=t>UDCmgz$Ic<_59$_l>rn|n zDO8MQykKo*Jae%D|C(%J)nfWioQo8s&StF-!cOB#aJ}@0|5cylF$1VJ#yt$Q+~mq~ z4=;P5V?#k)cbtZ57-fB&gHs*fLy|5cMtdF1#ehbDvs1_J?Ke_!n&vPguCru;1h!hy zp3qaa8jVGv-dhdBL<4)NLhZ-#hI^36@~z4{825&qWW~4+O-(Y{D5ZZNJqIbq;<~J- zkqIJ8dAs=2_DE)#!W4a0Os$1}O{Ny;RKt_Tgkg*i{qk^fc!XJ{fbvh1t z9uJGjx5eaSdXJ9FcG_@)j@K6iM$WBXZ2~nED>SIDTs=uCY5Sz3R*tsa4HybzU$FT(*PW0AV z$!VuA#;-Ru23GqX?xLVXFGe1aAn;E#lmp)^ssBFTfRPTEO0AkRLgeZVA)sAQ*LPWY ziEN3WvLXq~(Nz6k=cXu7#ZniY*@k-Zwlk8V4K$OlVh$4Kw6y->v)0CA`D=lbQ3#X7 z$h;nN3!|C*Xcdr&_?juoB}pyQ-Gz*<-5Fg0kS9?0b17HcrB-M29vhumQlf8m2wkFS z-rP*IFqQZGTyNhus1TJf2*~CY`KH@GurT4>MH3c!^R6E>%j zB{+>*3^%l92lTdOY)Kb5(U)jTPu|IVc{7^VrDm9%3@I?5c2AD4HMgntdNqJNL_U)# z@ip(RHZr%w!PsP3VQU9U*N@QMIL9V{f3@j+T|M;Yi1 zo+>*>M!J6XC33yBcG7<~UN5Lw$Vg^TMP&vDp_{^?GVaDzA%bt!8*c0#LwGpTT1Uas zOZf>XOCd_92K>4YcBu6QN`IsIY+|W`dx5f$@g0i`1nL6D{Ho^7=@%zwu1LT|Y~D0Q zufEE{DZKDDC-4R=jQ-lJAXas5`=waBpvNpuP2e#ZrYVP58AygDyP<6OQZHrU2RwU* zMf3ak@?rjMbe}0}UVb(3l;X3jvMK{f;R~C7zx7g&$09kW9ID~XLLIh=V{q8>1vN=A zHrzx#n!xNax^p$t@8odi;8)q)u@ZsU#4H4WaMM+)W%^W*k zn>grm9r&@AMiF06vJFkG<$I5ZtSCHxn>iUjsn?nuC!sd|`G zv`?n`^dOfQ-JIh9vI@R>RXO^5#&O_It6qkwXNZYXTl`;Oh)N}zP_#v_Aq;>)twWP| zV2f@?b8u%}%sf{^svN-W@kXRN6ec;!9zE_D{iLiUL8RNW|D3g2t;b&fUD}1=zq+7r z{B0lJ5H-e&21}K38)c-Bf>q%OvFTd&ZgHl4z;h4*>7yCJ)utQ=FI}sc3aYbL`H)i1 zjN;4Ux$c9C^9#`fp^jOe*KU~CumBB^rr5J)KPbXS$(X5+I~t&P7PmzoLL6Hg@!R8F zY@=nR!T-v}(9JD+EFNCXDIRimiSF?0nO}&TDIAoiN2*XI@j>#8U^42F$_A&|(2kP8 zMCaI1+2jX~FJO`~(40+II*8^A3J=`STTWc0$Ji^hm(BM2_g@4s_&s4xAq?QttIp`p z0(qpo7O$X>PrC?_v_7L)fCg>KdaMuwO{Rh+U^46TZ?M3j?93B^N3DM3ogyqk!A+|w 
zI-rl0;PwqsLVt(W^1npin5AaG<0UV+ToY^D%)X{8U~!%jc7%k899lZ#&<&hzCenpo zvU)#Vt9HE5@={hD*Y>55FlhFq3zG~-cyU8`N4s+KRQ;@$%P0Gnr0R=>fd>Do7ngDk z!^A%NVPZEh1V`)j=$XkOPp%3pU`kekk$N^+_0SaHqF0lo zU^Dh--T6u%{SMDAn3u!<;{zm#kzo%GiGaIdtUZo-^q88u4|}2lp7TAN(1KNBY8gzI zXhV-Gv`9Hg+ci_2LM@!1I^&p{$x7G+xb}SPUi^MaFv{i#%DemI&6m~G)dYESvvph( zDOi^NUsFaultsqJ1ukv){t_C$1TE>ApAi;(n-+qjI&@0Y;6KiW~F>%PTnSNLoB2g zrDHVY3Pkg3RG>!yofbMKT%8O>z~DWLP-^i0B8O$_Dt&EK(GVhaD)WXt1V7zx!FkcU z@>)NRchCcu-Vqp!hG%yS(r=^Eckl2AClJlpPI)G3LJ5(4xr;oE=G z<>&k@BMbEZYA?R_HtT?QBz=(Q7hIuXS4ezZg!ZOgZ%Rn~!@8m^)E>9EhZQC&+m00` zWtL#+Gk@>K{bmuqd_-TUb;*QiwMdPEK_O*^?UkUbZ?qAH20pETgO6Krfrm`J(12E_P7VY?NR+K_7C>2>w^GFl zmCJw^qnsF$EhVf4G&5v z8fu{Sw`w3SZb`7mhm}*qk*b$deCVOtBhA?~9XI=AbJF6|Oah{t67rfuYOyEA2qLLy zospA{pEZWrNDXx`UQvy^`ns}wy#eKF(ZktQ6v;w=mcAH)W@uy17Jg56Rw~e;!=UWb z2ZkkB@c=eTuX|~|T?4=Ck3>(K9x0Ob-zrD{{tK+3DW|6VO~d~FYQ|=bZl#)U3ac`t z&brIQpS>Q2D_}^%QnXn4odfsiWtLGqY{c%ZVz*o=&rxxHXq&hx#41|mcBuL$u}WFk z`3Yo+D1o-)6ux{={^i5+FTYPn6Q>+5Mo<8+M{q(k7q}9s&Bot~vY+@{1+j?Bny__R zy~|d!QqYYEzx6)c$oK`1_J8Zc{%?KU|E*7)9)og=fttXQr_DxJCmczR7TEmgLsg?0 zr}4M;FI#p0vQ_sl8i%Uz1by#&ee!Z$O}J`j)BCU6JL{!-EaMF6 zUCarHu`9q%?$m|eNY6Y|rPH9YS?mR)_|C3s;YKtl(DEn12xO?$= zDhRG`7b~<^l7%7PIgluN2&g=5oY*RyjuG=f7)!LX1e{8tAshEK+yHuse%NNKj(RDxH&2xP+W&@d=S49>fg4h+FA*fBcn0Muo&y!eDGxmK#^;po__Wn%q{E1@>u=0uIR~URT(llh7Ve>uHZ$<&(mypTI-P^zECxkZd5l&+kv*s12fn*AxhXT(HxTh z+}eL4g7{_cE2`-*Fo>#ydsZO(6F9(F4JUXktO>y3PeBv^Fd=omHi zUWxhb-k79gezXZK9C&|LJL%KnC_?!t2e!|aOTrfu8;CD{AqJeacCpz2msJ@f4SjH` zDgs4<1VU6EAcQ^Rkkl+3;YY>)1T#j1tDcF^A%dAvrBVFNFe=)Fia3>}HyanaB#NeK zHHUE)4>uXiHN@o!IxuwVT~A*{IeqtZXvNPXYI=n343_6-cT+L4Xc9@CdSkN|=>oc^wHaI0>JUVmOe~(vmA{G1R zGuYnvqMU)>3%!LxnOFPW9F4WaXSG@aGY|Z_WsGj*c*zn*0FAQ{^|RW!SS}rGIH1u{ zbTxL`)SRqkZxYK|21`8KuoatLfL|I&`vVqCNdC6EmJ{5n` z-TSd2F;uGS=P`TGCwgQe{NXuDu1z@y{umf#&NKpS=J(N|CuPJd=BTTEYPdAm?J_I; z{9C6XA3RW_8`p=WEJ|Ogw;s-($Cz?y-9&S%PVDv^I|waUaUFn zo=HuM%m8&JnL-|Iuk9zuISNMD+6J4aC+ElM;oG%PD^DO!Q<8N?ZKjl6<*tu@K-y!jA)GudKVLG@G 
z>N5GV__Z&d?&S*WS`8wKqkJs>(7L*X!F45FxSqAKQw;!Vl>P%st_A#A2ee_TLn)JL zwV{0!#SS_VYr6?gVEypquy=S}(y^Z1_e|tk^JlT59z=APV0<$beRA^_KMPrgifFIB zl9dv1)K|WJ*@$z6DuQ$CsQ2OABp)2=7HDI%H){6uuYHn3*8>BFMPjAgMHJtO1l;Y< zLw~#YKfAh>gEbUQVNyKY-fEZ>UABM{Xl7M&1s)+l9RK^$%tf0t^L9K{x`;jJk^BS<_=|V`6uN@2ELV4xW7HzR~V1qn|WJbyTgk z5g>lFN>)b|engsQ5pMr}pkyTjT1Jw!+~P|0^6H{F_)*YJoy@C6w&?<67u><->m=!QLUQ^&n0of?m{Vsm*~0NBa*q;UxssU?v(}8!3_QE4cygtDK;6A$uItWPT@@}onmL30}own#m14sKCrzA7? zyYN5%&*U?no`!R!(S$Cm@;(qa5tIz=EfwX_`SL+Pz`zwkq z?U|$0hA@g@!BTEyc7U*o7PF)eg^@)R6+3r>3#~-nCiSQS(hV{rP!b?OnV-Fn*VpMW zJ)&oDz&&@qEdm+$)2(u)Lsx7~b!3EDHtlRU)<8qYsl{?uN0g8^oAr~SDbUOyo4JN{nX=Her8Qt1=WCogtHeUPqqn#X)w#T$&X(OI=pyGVNTakSk20Q}p(?H4_V@U?n1=5U>>vN23{Z(aS z--`ZokUXejE&XM)B|57@ZU_uiXpl@tX|$P*@e+>QoY`QB=6dacaX_LM$iga@_`lHa zL;Py&DlsTmK8uN-Mrgo{!#X80MhjAtI#*C4;msEF*I#u$U_SN3ukx>d%YTN>(Ok(! z5MOO^=`d+Dzk9kie8ng%u2IFXqB0Mfr6ulY{G8*|*uvZMO#dpOajx=r}kEZk&jVAi3+HzJr&nxD}a5kx_9e;)-Wo*&|d z@wlki0`QlVlcP7)j#|p4o&>kUg*X`9KTA-sr#+QM^hBhWjH08sUvyH0$a2}f0M?MM zdNwc@Luha4o48!i-C249tn2gj20;mPz6%A+*B3gzNeO{0luH_4P5FrOnO&A>Mbyl0 zHu2swMxR4SZ;i8Y7HRr4zPquxQBWc`GrW5W3Ml$Pn3MkEdw;ZUZ0GMZrEJ%aNx`w? 
zr&pRQ>d_68$v|z=1qeSIac6z=iv@fK;`s(s?w0VTRmvE9pi|gK6e?wG_SIBGJ&yS+ zRO6hvO!jzO_Zlc_56fKhr%Zb=v(}=o21|05jI6VWfR&b={Nn!o;}xpb!>B-o#D&=~ z-s25*-XlxRey1w4oX&24le+c4%OrsRWfn-P;(z=h z1Es_UrAOBA64=ltof?ukyO9$drfpeIivD76?{~BXhR5F9-;3ZF)u++i_3@T;z34mq z;hNba8=YT7C7ayhi*O`CYN-_$gYpavG!mcl&1F3el8rzO3dCTzQnD&p!=Q`3+%Y%) z<3AvBJUE{I+iv8ch(F75l@YYrK&i(kria|$W~2|mh2sYqnIjcMS5|p#7WC7+E2r~P zMkyYH&e5emkxo$Hqp6&ys!D3u-{p$m&7_A%beNA04P=G4PDi-}GNt1_T<5#!Q(Vx*l-H=9C7ef40(jl^dG8||z73lnhKt7ZKw&~ty-C?hL z(Qj1}cRR(KP4AaUzwHx7W?&n=IY{!2IFhZ!@4x3D$UcsldYV4BLt~xb@L0N8M$pSr z6vm2aRaN8c7)Gk5m3Q_}vLnmW_v8NZ`=KmwyNU&pyMmM9$D%HgCBRucD^0FDb1hOZ z4^rf5ppNmvE;uEMlyWWDLwXAfjTc=gz8Cw1_Bq~wb~*tb`pI_mwbuO4VSnHw(d4g( zrOI;|&XenA=bj6(S*FwKP4f*Y&nu)+J$J@HeqGi+g40wybWWV8(G1UQF_xF2G%(ckH$s(q@dtM6J{JnC>gCL}fqZgdY>`G7!(F`s8p6_|Jg0&tf5k6MN(C;NZG2}j0sdiTNC+$^` zVF(#M_ym%%W_Knp$u&@&7_?lfBh8_g(t_!a#p@F~=^}9me0NOKVc8Z?rOHS1B2*)R zhG*}HE>|{kPlPZ$B$i1R;e`FrCp!tlhh%bFFHZU5=tq{W*?gELqr&&W9>EbHkEP zSsS?caG!w?b7<$Ad`;qzZ=XdBaQ)Twx5z^H7L@iq96i~XS6uq+T#>iNhENi6ou(~A zY0QD#Bl0_vF;^S^sicKXKi)z7%S6f2rGDmdDYx5gOe!oV${Iqq6W)q}WV~?t^-1H= zA2rdGh+ybU;C{<8n|`~~@VI4p;Vz>N>WHc2{5w zjaP*hPV7koO-}3?e!2XTV79pe5Xj1I$}5|)Dl}#3%e?joYe~uUjkLhYht*HLk>F)Z z@FGJT_+l`v2#4zMHz^V&HX=sFQH!JIYKOxw|W)Y3S~4W(*v7~%h%^z9Hcd${Fp)SI9W z(o1BIrBL7+0O>&xlf3`Ye9hxWKk?F3c6*Hju>ypO$v5f%ekf7erluf1XLo;UeM?p| zXxdbKhvzbTia{5D&zT0t1xXi$G>Ku@X9amP-Vq%xbWypvJ%eQ<8Ce8cp`cS69 zih2a4LM?#)$Q-?KJQjiP+)^24$w!||4UN+1Jh{Mku^B=V{czl_C`MP{#EbKDI9^EJ zcH0W4_Abtfk~q}@%Pe&yYsdYp3Hc_K?ve-a(sZ&}rgwx)%=mbWG!ZmqL)O%dO~dU9 z?)r6JwXQ*FK{C`S^}~syk8d2KyvI^yOx|SYvdV1q`u)x^$u|4kA0(a2?yx4kqRMv6 z*eqPzLT&Wo-0qPQJj6N6ADF6ar(CV4Hb`#eh2&)CLBaLMGXq+852o)q5@c2tp;W54 zMThtdhP(KZ(M0$Q@nPt}))S8DWt(xf&_5jql3arEVae1HW!*+?@D zi@~H_{7kX5k%t~8-HfGLOhGpVEfS`kT8-i_nYxwKIOHgq8qbsM`V!fGnp~y&pJSSN zj`}2%Tx`(dCdJJlNAhIZo2~|nKv4Bs?`xQxI^J8YW@-s7?H+d%Ye4Q}wV6xCG@JYo ziFazU+0FLxaf`<1MoWYCAiSWf!Say!uxIg_J0)`WkJ~57Y@-S{7OmKW{47e6T`PC` 
zcWk`Kah<^#$?UF|(*dx1kh$DCvbe02%@~#vWTA2-1eJmUMVc4L1r>Z%b4_DuRLsiU1sM93eN5R`PVz?#^K5 z^+#>eRa8+pcMPPero91gA$!-LK-+?BOIP6Kc(ibw5P`1LTs9tVIL#}YB)dR(Z{+oM zf;~UJh8>e_2PEt%kAv;S#zuw0OtM>#LDS1XNO!-$Lvs}ZXdU~cXR0EQU7^{ z!)LO_!6(f|Kua&7B1o&fTo?=4!?3RhC9$4U5{nny>+ue2mmbqr_$}i=sHxLF8of2Q z_;RqG;CjKjrGYg|wUnVN)YkAr^vPR4X83 z!+!lx>X;hGR0epUBvnJn^$Z!A8qcgOt484*NkFowHtj(&na|}@ zC%*K}$&jlOTtMvOPIFpGzNWL4{Y>?ON*Z`Js5aO(iS>9cQQrT&M|BEU^nbq16M;!Q z3L(X$0i{Cxw%4f~Na!s*E^BOm6wc;`x<#O_5Ib)T&dhm{k}wuCZ>RHR842wqBG^fo zGC?|gS^TQB@wbn^IEnKHv(peDecLv8VF`bXG(jsheo8S9#%@zDVKGz)B@w=p_C{Gp zV_r@;e!6tCSWB?~%Npu28I=8h{oncsKabv*5qLS&RueUOkQ5l!)9cZaCfASG5jQ(T zHA~8W1X03=h`&#lhxJPE-2w?sTie6io|!yRzp!RQ!l%-5pXzb_PM*`4lIg%Ol__fS z^dTFr`W|P!elDYrz>DiH#CN%kia2Z^{is-ud}xsyhpt8x50plrk?Pe1N6Gradc1p)hs!EzFE(p%a4KesLm{=&TP;;$=ng1!-NrEkRu}lZ?>*69F7N= zoq+%6Z;3C_ItC57hm)ZbD339+#-%{9&qWhPZk8lU%0W>mWG#(0w$3m|)4jk_mI24g zU}wqpSa#vo$Xx36t(xL+VO8Fra7^maw(r4Bj>X$Oc|7WWks*nvB&`3~N!<^sf5@mU zN$Pv4Vv0tIAjiv#0sG26E;_iBieneSdXeAB5E=KetV2RWFiSa|aN4fEr{2{xc5hzp zCDIFceSo||XkRMUZclX=y|R%#wCg{tGpI*4@Ab}R<#eUr^qc5F&u0j2Q91QlxEB8> z#tbL=3-6gA@pcOCm?(a6^QRrR*JcgAzAg+2nLA-P7bonkxtJ7b-8e17s(8r^5!xL| zbvm1%U{>3XrHL`o+IkIf-juQu%wfoHgXCMERkiO}%EtFvPbzXg(C& zBBn!C=*aP+W*QKPb^F1Ny{+qAftT?D_JhNfnz#Mc0(#ciTQgU}!6*asJ*=tw>$(f3 z&d#7+Ayp)_jpkrm5U<{Q>+$2$`9`crPjI$hhTs{4)gsiNxEF7KXBR9-YuIr@=6T9_9y7P1>RyLe#s_v^`#`J!xB2(l&Zrk-U>Ei_9$$1CJeCt$k?p zMe?qQKPL3CDq*TrUY0g@IZw0$K5zUug-#K-%3;k0;3~fOq9sb1Pzr?B(Hj&4fP}6- z{@wHKgz;wRrOT;Jg0-g#2Hv6i+8C6ClM#~=U&`Ak9|v4_mRyS0+Ey+*SGFPtO^IEt ze8Ehqd_^R%IqV-Mk0Z$y#k1bMu?vJ~xiFIX2d0~W)EK)~OH^^WcuXCbuCy|{y!n%? 
zCn?Ct`c9I-uSvbCDH@;xm&rtpiz~Vp(Q&tknnm&Wp2$IKJPkr?y zmfwCO*b!{ioW2L3MS?omi~dJ;gUoJczRWqn2#^snqM&Ah8AavvsOMGRN-ixB_|;^x z@IiU;a*1SD^7Zz6r5aMKB**Nw8sGp8kY$B1#3%xy%pVr_F-T?&3Xx%OfrtyTC5(2l zj4R0`8?uUk-5{DsX|6^fvW9J@fYsysIXt(nJUEJ!`T8<{l`KiGldJ6Y4b3vLM)ib#^)yPpLd;T4J9fJM2UindpDnxtINJpQ}9;v1gKtBAWgRNE3X z@Dzh$t%1M0gbjvt^r%19#CAKKuDNG2P>Ixs84(ihh+(7{`A?IlvRh0qHLDbPALYIE z4)QLbSFOaMTuAH}Pz|h}`bGcCxp4&~AagdOpY(6&RHMX5 zOsjNQ0~w{@a7wwfEZ>eE=G&Z5LkL{rXCU|Cd+{^^;qW`Zh{e;GDW&)y{|##>NUTP< zC73Y0^)1AQz|?n7zw}`ioVirS8p=}@` z56tf3abEp?$|P5e9Bm$gE4=Np2FjVmk(Znlng#4-f7YxxFN-&TIawYR|4rYq(7gHYn8@NT^ZN{!bCz#_)RNc|~uzNFWu}*UdM;G z@ibabGS*G74cSfdOi^*xI>eGIVMm%+mp^n{ROV|MKT;qxB&v z4@BCB*}h~-%PQky1D-GLr5e@~iw(WjP+T_?iKPjwS=m~@5hPFz_7ELQ8O+=7AGdq0 zpV;8Hzo}@m{}3c9sXpmgYL24Qa$P|^@acA48>}EdZIbScYTM*&C{=?ZbgF@dUU*4! z11Z+_*`jjPI=g|cS0UdQx|EkYlYFm52w*WmEff5*m-)#9t1c(_Ji5>>2+9tarw!>V z#~Mvd(P&cT0)h`Z@;Y^@j0$S+Xq35E)r@bxy#M-({>ukJ(y>>^vK(b?|^I|a-%aPxR&T@kY=XQB6^Td0i zDkI~nsQ1{MKk4Ev*BS~Rq(q-=&}$7Er@oRLo84ZG+1i-iR<8$ysKGt-D;qwdtyjc4 znJ!*INl|yaK;7`6Ja&6LHEYN$pXI~XyNi42Mh%vw-tYiQf4WusI}#AeIyN5eAJ9fY zNq99awnyP}3d#_bN<80VO%~B+M)dOVKr$2%(ukV+Y#a?b5v|3`+ zJ9v3Pm?lnGGU)UcvXzL?2xHTuPyg@Vyb)@y7tO^e4{ON6?!VnHpYGErI&60uhskRB zM3{{0hKNvmlvZq3Ix1Ou%_*C&kom6G4ACj-l~82JWa-#iOV9JXqSP07Z^Ap3$+khT zBASbwjCzNul*lSFx51wicdZpz>%-Afwy3Y62HV&^e)mZSbE}JZeKhD)--rj^Wr&&s@l2w9T(1>uq8?}vCJsnV($a;R~BI6d{Mf%z1s?Q9_{ zL)!jeO}b@Q;&u8iZWZG$+xbRV01|8OL5DKTXh!gESqJiEQGkU}8o-Y^2iHpF&q+>0 ze>eddWO|^Ic^w0U!%S$SW91YD%T_9T&`Ca`9McjhEy>3x=P zMV0wjfBz{5;mwkkoIR;q^(eXLWe^BZZ!<>RJW)&|=O>jFn9W$d(J~eg1`8onC`#I? 
zXx4g@8PwhrCM|rucSZi_Jv-HQ4LS(KNY#AI2Wr{OAvaM=H$~7+ii;gmbbx-^76xCe zsrbid_>JiWNa#EsWvs`*jTt5)aB+NifFmjUPcffn=g6Rb%(_TYz%76_59&|9#)Sg93C=OY%l zF?fNvp+LKZ2ilP%6|NQQ;1Z(b>l|vqV8e-mpnj4h~Q>fHiYINiy#f>f50PShQ^v z(3qfoPDHDtj$=#K1rboH5{=>X(ikI6^m!c(H4n*8X~_s{U1HQO`p8X5*|T*~=E@YG z#+<|-2ivK?A(rQs-fLKRK(2K%4e2h(#ud>;E+tJnZyJ)X7Y=X_I_2 zF=QRfxkGQ#6T3K0yX|hPpMr+D zAU|1d5lIejVHV2>%{+@8RMpvCSyD00&9c{y{BBT49>%kF(CO9IO#3giKr3Ssbi@}x zVNNSTOQ@kY-f+Ju1+GFw6C^|41ks6JEMDY7TT0`0yQOhOMsLYU8onm!DqO7pqocBf}3h>GohU9M$lN=7ErC&fCvpapu0z^rX=SwZZd~M07HsdFmZv>&^s42<$`$0 z&ib!uMMCF4wk<(Z(9vku6-sK--RKeXQy0 zGgOtEu2D?Yu4W8^Nb6`Jnr)EKSYto!?L0yB3!Ns&;wl|+OtW-o1tmejG4Oep%rz+) z{3>fp%#hN9(L8H`q<6-3=rL~YOq6LCM_cOHY9_wC&a~t`XqjO#XKMpD)I#dx#cDlX z$v~&oX%0QInW-Rg%C>=;V11Pvm)0B|)@R!qR33V_<83!?KSsGHlKbI_)!f_tbR zh+U5#Z#Yvs#m+c1cZTQ3C*h5Go}HF044QGmoF6o2?OXzB$yPGXMzB*U>xd8y*8xW( zqGUi{4FAzNFSkodpOAv2l~Zu>&DN1P3G)~$KHk7#osOZvFqzKv!<_Un3k&-(KNc?+ z4OI2cqN!CBoKZ6;;@mH;B(u^Uz2w{Z=y3-8WO%%2u4q7X_XHHkj3`!ar=0GYMKqC! znGr?PT926wXkcg_6y0-cq9D@Mnx&Ygi}OBlHF=Dk1>N%9vAT>&CBY5N4>6pKw^9M{ zAl0fj4t$$V@1#KV6T;U8CZ;blUmO%6%Q~M@htYK+;mD}0dbHg zv1G>{_N#Q_33)>MUykdk7vM~So;fzx1WRt)$L83{j8H+S4B`+h+S+;Bwa*e;-o1CANrK&eN+lV~!*M?&suT zUGe>JTf5m{R{Py@z0KHDiivulbY#R5EKFAODlof@&)aSS>Y1&jV@jkYNUWU5gft)< zce+b5$wtb+*wBo7qDpr5^h&&T&wReMH(*+=xqE{D$~lV2ol;ZEl9$d5=tyZkvUXIY zayOBCYNsref1wFoDEs`9d#a9YUY6xf#AaIV>F`p9>ku>t!I%_+9_E+yZeieUCcQKH zLTt;+gLJZ-@SetHf0%2LW)8Mprce9YrNp+vdhy(}7E)5-8*4`u{l${C#luz$F9 zl#5yGVvzQv=5XY?z%qICHgQ$!P^dYYC%Fiz@m*PUpfuKMEFg{fr&oXM4_c#EA)Kg4 z9h_bn)(QVyXY9F}>d;DR2`GsMfz;2OE{VAf8@n}~__>;pfF9V*V@Td88&0S9iZFDL zKNip>-z;sp^u;rYZ~UcNMJ`89qlVQ6!<>@;2{PvJ2{xU*u#ppqT29c&EI2SqO@0bd z?I#>pW{DIa;Lq|wgKuLF^WZ9DA%L!|Rfw~{J)ohjy#Nfxm{TsG9>{M?1Z6D{6e+mB z&U>E9Tpe*`GTSUUnQl8%QMb*MA6yRLm~U^q(mjpckky1|i)g)s-}h|f~Q z5_fs2?BC7mUP9?!UBIs*&7}}Xd&(W|cGw^0I~c!Lu!F&MxRxdW$LT^Hd!WdCK+RB8 zAozK4rkSnTrH0o>bE&mXVq+1vyO@9)@q@+@jUjOk-f!!u&vtiyIu~|B_sGMgYL{?$ zP*r!c80T6;elY|2pht7F(fWB5me8GxjRV=x*4)WrRLP`|Cy7^gv{3*z51SN~e>66- 
zPK@ZKw1x!Y;i4K}@rX5^Tp*RlgGa5_^g`+yWOEPnW0;7^eqSg&g{(}p2W;pAHncZ` z&L37AoGcfxdEXXe90`FDJz(Lmh(xC^5??N6n3})rMfVhLMQqwwi3rYVP)oS$&TVJ* zeL4|yQ#h=U-8|M2 zg(p4BOg%}kRzfrIp}AHbkh_{aicw8EeQy^t)xi9k2Dj_W@Jlt&D|Qe6-{DBi1ig4I ziN7Gs9M{@skO$+RJnd4|AVA2A1;K}I9y87&Fw9!ljOcnc3`Qm!AO;q!TIv+CJeSBU zgPPj1{7A}u5Z=Bol{d?^(4Q>qt11c1l(CtvseZM7l;R^+LAkaC9FXI#1W|d|)Nh{T zqmtrvkthgUO{tbm`V#@3kueS9Au&u_Ec$lobMeCG%>-~THP>HKXL>EJ3hHh9@4YBL zM!O+RT_s7`ivi}WMQIQYp2T^(R$cG+bWMyzr-m?TeDy+F&8e@pF6hHB?AmW$NjDF5 z;ss)k@aL3q8U#B?zeNU9mX+lY78*PzE%z~+PeaY%Nf|I;@tY>iQrt`KvQ#Y!8_ozE zTp*zSP0pE{=@%p{KI27@q$X`puOi$9LQKI&tfPyNdCOAl7%o2EQa_w4cy`*%0THCg zmhvczmQo;p(1Nzf0Da7GGjkDHS+KTQQ5fn;fWa695wmjPphX)w;8rq%-Xrb|Jq4UF zeuyi>BA}0%To9V>R$rPOG5&u;+sl=A7lIBIXOTs4-QUm6|h?p#2 ze@X@q>zietk(gd5$#JYM7s7KnmiDCHbVb@JdHvfyCdrSU+o-)w$BpR5OCtsFIeVK1 zwPgZG27k!3VyZHDZhxDoODGDg5j zOP|mUzW~nu#jt+5tp>cRBv_DW!jxvKb>3T;rRs_;P=bpR7tT0fo{SIeKG+p)=!x z=B+=$`D5~skzJKQG?y?%FQ^R0`dAC!#PW1n0&?M*U{7lw>kJl>NiV|8B4063ytZD0 z&vXA~OVKk(w12=2D%&MjcQJwU0tzg+oQBO#WfctxG!_|E5H=VWwGuePk^+vt$0?xN z(U_*5kp&~|@o23)N7!>eg45GeSndr)p)ye+(W+P^nh$c(H7?J>#-i!}P3L@MU)A*a z7dq8Clr=>Pxt=l?+f!LXS?u;X6LNuG9bChl$1xH1_!039FV}mveDVmTxuX)jRe{3J zmyQ|eEJn=PTnlHj2KfKf8$HSFS`UW&7XonPqI5gCCQCAl)RGqT9;95({aHv z^kwqCK-Mcb`8pfp9pp@du+nUb?e6)`T^st63u)7YxGg9LzN4g=g3T6%LOr$R`BQd! zo3uWOPUZKEjOFgqzV6~=vVNqU3Cu%spB*`(um&IlzW|V+0ttUxr_M+BBc|-=WjzrQ z7yIC^-2ojgB^|~RFwm? 
z)gS5+J3k_mH^eAEfOG*f0Q#IduBa4s2+ z1g6xC?-98*L^GikW3*675NQY_;SA6VpHIk})y}ERSgVgh$ia`DQ+R9VB>_AZHw$Ka4KnX^b;CG$Z*1{v#&Fx;H9$Xc+Cq5iL=Q*T@c|Ul{1%5W`mvB0e~d z=0e~{R_qPYPNrM6>k?&_T-b4Kg#epvY(*okx5mN3%J=4|q+~9TkWKL--zz?7792-g z=8j2`bIQW@Nt|c5GgQeiSXo^oiG<688{@XeUm6Dw(k3DatdG95yEns2lnBc{Gp0*2 zjAEra3bxgHt3JKQev3v%q=?*1RtAR-m~WMXlL4-wmeq!SX+>K7S;Qr(gf2A$CR=~* zJFs%%0p4&_UJX(Gus__CI>GJACfjoXmfM>ljs2ksF;}LWU zf}iZ8Cf0e^h1}gH1zArg{pnZu>CvT#RyR8vk~37Gsu~L?JN`d$J6h2wnXa^CJx)`^ zaey4OzXn&er8-TZ_bnylyX91HlLE%k9$hk(sByJ*^jR3cMq>pEAFG@7x14A%=0($X zsa0kJ z<4wY|SvjCGmsehB_B-!07%ZGU9$tH`Dznj&t(w!}^^xS*~3<0-M>7!XAzFKx;88OM5AsJ(G!H$-U$qbP_<0F95ZeS`dpNN9e z8#H@Xw@+kPTHcsWqE@KrBM1T$9s-MI0=Lr97*~6a<#T}i3UNxb?jcqP351;+E%pte zFhM46DZ{tpkFIV%T&c`%m+FML=u2FNUOJJ$m*$(8n+4yHoVvuX2yh@((T92S$>PKI zJqbHXT=(N=#}o+RydmYQ7DXPW1ULC5|0@D{Xx!j_WieZwp|S$3mHsiAcORo1jt6qa zNk&A<#krLe&W6GNv&*_72DJa zEnxUjktPLwn4i}TdA2&0S~e9FwA)npe7s^%-ZuLpB^@eU;aKUzM@D#w_H~E= zai4irK98*GQB?OE-_RZQ4dB%yGf*haR5I)1i(DO{cD27^~!D?Zx%);ZA)QUl!hb$X`yH0=YrT7j>pd zN4WdE`HJZHI&G$X2kWLMt+5ENl|f-Au?C1hb{ zz*@?&hEua|aP&MEQA-Lt9`R@$$Yk?j;xvU$lVcG8QmP>LiW#ZWVI#1lYoqk+Mpx-= zJZ;vc`a@Ds53-@x8shixo?uNe$=Wm%BBqHgW5Jakx?gacr8hv?gZ>(*EXTQUGnDW4 zdOK=$W;F8ka=!M7GS{*&Ir*ZHvRRoZhSK4HZ8se-xM(#7^PC*^JRi{+R4c$tC*8Z} zjcD7>pF_K@Wo05xg-e8*Pef_72fb^E8XzIrpZ+EFEdmhG8hZP?kYAhV(B%Fpx;)a> zNzV>>Lue~7u>IDFbk2#J2PVUd^KyP{2=e|sS>>Jcd3e9PpK>Z9+7t#<5TZyG z5eN2%a1>%#!7?GA2VwY#jUbCNbzF;0JT2s&_yg_Eju8i<`BCH+CD6$|SA*pfsb*Ds z%1!^ihTC9oqY=mIITXl|n#2WyY4vunDd&(A;~609Y5VXFTgkDnEDYK-gK>%^de(lm13D3 zwY%9Lp~^L={TTwLs*&rha}DDwY$Ll((kMe2eh_3oGo#}p}VUigw(Fb zFhOstByjf7b{K4kj9=FPT**Yki?UD`(g&cw@Y)-u4S5rE;g%aQ&iEd=2r9Y|*+@@^ zv}kgd6dC<+k4arfX>`wO%-C|1AHdETS-CcVZQ^eX$2wH5ZGF#<)6YD1p`8{ngzW8t z0C;zV(Hoa)XB>mfTX@gHyVIa+6A&+nPsxq7ZlORD0bZrvVGaa` zcjvJB4hnXSci@(&&<#fIY&n{^ZUlVH)o3d*He@8UtHCFy8e{)1Tml&)Y5Bxp>DbvI zB#Pby!L=Vk?m8F&=NvMJYT^zY?Az9l4nHeOYI#L+OZ%o2B^>rtz&!Xv`TuEEN(XR zfM9=+=aihbKhkXX+@qkcpD8E$cSn?I`*I>r+rlWq$4HV9uI63WYUHJZleLE$U 
z&tn}1754@Y8KmVcm>xRMO|U{oVOW?}A54gqgT@Jq)ye)p*3JaLr|SLxA4Zm}ks`8{ zNCq>;8ZsD+b<7xRj4?A9W*D{So##IHoO929o^y^bEqW6^&KR#?z*4uck{x;E_#xNHA2i^tEdrD9@S*secYtE9*QDt3jOun?z zf|l63z}tYgH-cTz$#9KHD=Q<~*le%sX?kIj-0Sr}d_fyJC{ea}O&gszwqUJflDEjz zGxq5-Y0`weEbp*WvsXh>^dCFtEwSS^2b%*$=y?vsS^BbU%OY(Cme4MC@UW`aiw8OAv)r|w;XN(Y%a z?NT^%ilXR$ZNU~;3i=WOiW@Wil=}_h4 zl!H0vK->@=-;xn4T_jsLj^PGY<7K&o+sz6GYsj;F73crgRE2(IF(wcNoBz3KmbjS0 zX_f>TlgO}EoA}?2ZBELQe)NB7BK+&~8l@qD^BU!>Lm4&Aj7do;c*nurZ}E?`TmNl# zDX{Bt_R4t2#r04uKmNZwL1#uJPm&=QwPeSL>9(7()KbQ6`Q%3Hq{APNa!397&t|(w zl>eNXim@z_G95_;COX&($usFmitgJjLAHe&l$aRX4Ov628C;$yY}~k^cm&V|#>z!e{aMx80 z+ATxsKW~nAq8I?V3R)WlZZ|bIyrj$4boTh)mKL6YVo-9hBhV!kxh)mF1FXexDJU&S zB7zB)ZbXTy%A0|=@eQ##DLiXa%<;V~{*%((Qd*6Cv7>WGLQ4?QTB>~os}6ico|hz!o($FPswq~IbGc}PRwK}()H-3;@_^D2JIi`hW))mVm(D& zvnS=`=n$F>3MR-tY{?lKziI#%6jiiV3~y5HP$cd=#ZtrT78(NM|o$gM~gK;3%^e6 z=B#Ul#4#%1L|S|QwvoMgu*$mND2&n0NQ*mB-DTrJZxck<`SiAADsICP-KO0eD~fTe zIF#E_{9z~AzskW@Kr>G=M2Ut%Kig#g61H1;;mVk7VOvK;xw{wiMws1P;7_E zvy7LnrVJ*^zVqJPt1sIO5&J`1ev;&d6$)PSzYR}B%OEol={R~r1;M7z5lOAwe@ICG z_6fG8Pezxd3c*s!XxXnTOWv=|q zOW>#z{@0?l!0<{2i)He|CP0$iRs}o_(k;eI6|%;H=Ro8j1R3*{eV-EJ;+(~W4Ef0t z4L8{Cf5B4i`(Chn-#XFcn+CGUA3adswQFL$jkIKDFE7pJ`H)f3C@3y7%2+Kmf}*6{ za*IIq(7=B^a?eJ2+936gn2OHJos{6MXY%%@ljAX6m(Y!y$!0KaYn7Z>nPHW>LfXVq zhD?)cAXAp}iG=LC?^a{EB?8_sgUmg36_A$QWs+f-7sIO=IcR6T+8;DV=&L> zW}{`iAUoitLT{Q=9A%S0-?&)M!D1ptGgS_a(H5CYi7`{V zj!ToHjAX-;)RYu4Q_a7@(gZrxHv3&JgG0GCq}d$ zB=M`j!CHcmGDlT8v`KP>1Cicj2xj?2PbqGb=oL6Q;q}$N-f2(7?fq*P8#du5ZeG{mCdg`Kv1lcpjGj3Gd#=lL>vg0$hsFnjS z^5vDsPV6m#os{MFba0be$*EqU{T~J%ifN2Wmp)3i(eS;yxE3-&=0A0h99xSY#E8z@ z1X<40@n#l7&nU{4IT1;{;$nIPzXPxF;xMVSO!k!Oq zq$#+7%Se#Eo%W3E$U9Vf-j?g$$r5>zG|(`1zV^)QtjqF`JsmejAOi%F&Ac2Yx!I*j z+Woe0LrdKkSt_bTS-2s)Z_4>Q-H@PYB2}llooR)nKiRq3*9ehiA6uwow^PgEK#b(a595-|$lMnfF8kCp)oN8bDyNCfwn}Q88Q{2YZUYuC#4;yVI z$#HQ~gvI9ckfY}?%$krP=Qv9LB3n)1{_X*7y&NTllv^a^jeP%${!-`-bz~0_v7AdQ zSS=J(GW3=Lh{ z{nRMLH#a5|+OnISo?E439^L|&B9q&Z!S;MkM^f;45&nECU7ljv7QB?j+nohVADucK 
zYjKUML^&#jPC!XY8apx1HNqvFlihiAVmzJawu43o(8ewLITPy49FTZW6#B|lvMtW`m_O2xQm%@J92MpW;rKNhg^b-TpK5F zosfdftHk(3<|Rf4&1nX0T!v1F*L1{wexaV9keA{rP=6wHn}@@&CMcKbFi+6YUb4b1 zS`0q>69uAQ{4=) zuXGWg$*FfjbxF z0~i?Ma*{hWTu1)0+F8X~gYowRJ-Zr+2^bKIDhTThcPI9^US zbyE@YqOiZPv%1Qil!}ir_?Uo?jAHh`>=P$lil|Ty#TP}gv?xK$w57G`DlzXAVi~)O zcaws?8h&s>w7e*p%IJa3+hJ_xVoh?mx5-r>N!B?q>kO6m-2Q9P8O|hJj{U(g1=4y& zUj{=Y!9{_a%#?v?F*w%q_kx`ZTQTmfd-a3s&-E3r5;Ejfq^K*P(FFpS4{8B{gVL=6()UMw6eW7`jH^AaE zzxP+yVvF{-&T_D~TbUl}?jloPe%-R71(O{eB3gHnqXOj=#H?vuW%nPPS?g+J3vP9{E7-EGODk`DPG(~z;Ynv#(uy}=UaVzC zD-1@sAw}5|!A;A^yi7rXc8TquP&9GKt~$XDvY-TRN16C6fqf57 zXe|e8OEA0x4aB%yN~?@C+n9z6e}DZa3#a`BX1_^E8Y&dX#oHdat&rRTw1HWLYtW?4 zA0wmosWKMphQfn$r?;|8N(d>jySnqSy^Yk|!fY9jbVumLx#GL{gVj(><)_OU&E$-} zDVE~gMo(@eN9r7XFY@NxTvLQMGgz!sq<7ehE7YA~$=}$lVr2`Kp#*woI&)pIx0KIm zIkK7)U9NxIMY4Tfyg@GCn+rM@5;J(H*SwO{vj)@v$`%7+*(%Wst8(6*}gVmDckE4i&; zm)DSIPzNnU3O0k5ZNZQM(2@5}>Geh)3pd~OPIYnHQ@MS`XRkX&fG5TZzu9`hJ))92KJbNo2}dW|G7;Tm<*eJ@NBlZg#-GN@I&62wmba0Xmhx@< zgPox__fh+AFJB=&x>iX1P%lWbFGjccR>ick9G77yuXSr{U*t@azH)?1x7hA(P{wtg zT@yIT{g7@pdXhy>aByqrqWfsH{lnauGrEu!l=hGiDL@|N?Nj0H-;9F-WY8lh`Xakw zIT5CYBJtrey|g!FMM~jeeE;AfS?>AV48DZ#O90$JvNOl9)wWE6lW%Ds&)<$N)(K|3kOYb(ga0-0Xq9=SJtHQRL+l@`Ch&WBW=UUYd)x*;l}~II3VzL6>_A?OKr`3btbM zCOgFV#oGf)mcHp2joVT{CiN$dm08=qQa>Xp1-XB-0(KQ^y916L5=JJ=f+r*(uHCtM1X_``RoeE34Rh|FiNy z^Sqqw=IIktGV)T>nx|ZGMRVD2xj5fR*Ft7#Wppe%$B8$WG9_Ck6^psOx$77e>mX4D zE3#M|62J59zN3Q~-hOx%pUizV58 zctYybBpC&5HjZU-ax;>!yiDBOf+6|P-?`GE#N--|IT07PiJs05Ov;wu?rxhh)t|{3 zV<)+R0R1WFkrYlq3ll?*Tp2FO40ITsm!6Ruo<6aYXp!#F7;%(<$xN+$NlKEDQE#U_ z_sjo1$y=BqP}GPal>Kt6H3X(i@upH;s?cpW1M>D|zIsghSaB}cOVllbl;7T)cei*A zD*ov6+`N(T9FZ6!8?Vc@X(_TBq5SefC7&AR+)Vi{gKrXPGD4M_G%3s7La)lp8QwiE zFcQzX6AOR2+NkiSa0C?s5+?bQl!PPI^rlJj315esZ}8Lc)YQxZpQpA=&662$=WY4w z|CT#$g24QgWas^iaTxTJuPJhWT7I&9OG%e+C|h)3RDMgtB*_``Cp*2~IAmVtgv@l8 zXncv@iy?VG?gsCi8TleJY-01EQ(}Q0UAwlg*En&soVS=)FCwB@RI?W05qaLvh?$X* z&00h?i@aPN0)bM2K%k2Jog;re`XvTR1_Ik03zI*0osW+|iRtly5>u;`s8yj%*`*}{ 
zf!e~|pSvz_|HM?PGPh*cilzICfc$iB9UPadj{C>l6ARt1c3iGY+&|_%tk8Xi<8pO( z|3s_)lF^=k(|@<)az(m-%ssR4{^uQ+E8P8K?&AvG4>&H@F!zr=zvKWXB zaPI5XJ<@%eU&1ag;RKxfE_Lr6xuL+lsk-0d z5`ejf%`0oq@1ySdLHFZ@Isav-`_v%!Nd@{Bs{2#!vq1kNB~AZ&bw3ky|4#+??^5^X ze)ln+vll<#tNRhZd&$oW?ysPk;)0;~t5^_!P1Su%(EX1U+}}stzYlUhSm2(a?w$Sa zQwsgJP~G46yO-QqaQ}LB9}wh@1mwMcb+7ApA5^ISdv$N)cbD{@E+X#!3a(&~D^z}N zs_w0vkiDObLj8Tzy}IAMPoaB;x>xnP$HKjyu!ZXWu;1O~$2$vkZ1@`;E)09ry`x`8 zZsGkW)IHAct{J|buqv*glVBF3e!L>@9iCSAMt=8^h0m{1#yJE6p~{lB>fX!=S^T6G-aka$L&e`zb@%EvyWeF* zQRm`srMiaZ7#Uim-`sMih?c2fBRTBpxY`4%GQ^wd+mURz1r#ww53^51JC z3BO1ADlHGawvq6C%3HgLlFPzfLZ0XJH}x`BAkf)!#Xn&IuQc@$<6bFO48O7nb093> z)m}y1!UCO&FvwpOf=7qoT|)3~A-JW6VCS$v&tRax!UDZQaIdy1;uaR@TZB0f7U&;> zdo7?MZef9hBFup>wD60#g#`u|VUQmZf)5SBhlk)JLhz9x_^1#(IRqaaf~SVyF839^ z!UE%pLLp&+@gewx5IiFUpBRE?h2YsCcy0)u7lP-9;Fg;Uox5tj-ryDXfq+*hh;xqe z(icHioxLAn0V{P1^|)_gftkTTe}x5Z2*GEE;I33GdW8k%7J;{jKV8Yh*ApJnLeJ;; zy2rI}^YL?}+-%^hcP>!3iXJblU%Wb9+^z`0V}%#h9taHfsL@gYc?Su0E*G z^ZM-a{VCd;}^^4Us-+8>9YSq4*{3>9W&Xw>x{R@>JQ9W&i7pLbcmCvmX zL9b6PUuP&^pb2w}>Umgs=L-S8SN?(W46U!kp6jEZ}4G+lA7`ZQ!e~%?I%wFRZWYtNc=r7uMIul|SS0!us0o+~;wN z-9VtL%C|h%J-4vFw)8yGrlSa|`S1 zbd?zA@xuDr-sAlqx6lLv77t%~ys#d>O!btk;Pj6wcy1&8R#$j&>Aa&SpIjhs`6a>Q zR@wyuUO!W$Zt!?&fqZNI_KwHxt?HPmxRtEv{4>5l-r}vb$Hx_Li-(aO&nVz&`fZ`d z3+wT*%AfUkR)Ksg{dUyj*#+F{iyD=je+uhy3vV-zPbiSr>>3y;ytsIq6@uRtg5T@u z&nwVx@$iDjtrXP$u1H-`x%lU{6yBhqH&9sKCWVk68-icwxYk_-o{}#ML&)D3fhKK z{7~OQOJ@xgm!DqLtF8TaJ5#%u@o{3cE>=~(yNlk|)_)rg5_UI6*+MmEG>jSj;D~GE z-4~xWR_58Vr|I*GoJz-GjPw}$psDb^VY=-j_I zlPmra8@=KVooOYTmV2(j*1=ts`wTF|0;0&Ci^Xi@Ab09!L%PN@^t0WD>7iXCoU0fr z%IZm(ZA(q}4ia>WLy8H8SnL-rbP=Nq(dvK05iRj%bT!WZknI15Lkj5`ku5`Pu2tOA zupGdjT484DUbrlgO&K>qj$KR7ur+K+Zh@g3pHcwbk>dpmfY>7;&;BIAh}NB-;!S)&Oq5`_?frUA;HtnTiJ|o>tx)!)qwAgJ8R?yqs||S)DQ=rhk!~ zAL-9WzU);rOGUJi;{~Ff*StL8h_j^$Y$SZvJNFUWkt4h+kIqDlGPh@tK5ilcKr$QANHU~&`(QsD@`Q*W@hF1Z{+(TTY1U8YcaP;T@aV)wnnE%MYQroGJ@`9ab8H=wXi^u zz3X+mge0-MMuP1#TGnSvX`A5RO~E;P#ZW?Slrvw6jZcs1?+Q!V!ovNQbwFd>P6x6o 
zMvjMza#j)8{!{!|ByC4T`j0@4?Uf!A6BD1_Evx@P7be-zPD~J{$nFTT&`VH!@sE*Y zqt9{i;C$kx3b(63@KLbsynPg$6-#Wo_o#fgL@7?zg2)4r_KNt>mMjPtS@&+Nu_nYArUkgMe#l%=Ji$+V7m)pHp=BrO=VRCv*(eMO?#dmA*p5T41T>)iP zQ8C+KOFdHbK*M621U~wl>%6s!p$tX6dhh8IxR9n3N+yeGg zq%kQnWNn=nod4jr%=8%VY^?uqtF(YQE&h)YA_dIfB&fop?mYF}Jk;1sp?FT%6 z6s+e=N_H8yh13sH3dcyUlqi1iC5HzDxnKgg!w!nw`FGf#3HZO zpNfkF$ZOmd#DPm&l6<6diR3d{P7}hGN6xqsi3u{U9P-Jya@{$|&MTe@DgS(@r%0r@ zO+CCQkPwNrwWX7!5!uFXFE z1Le2r|B>XQJZbsId0Z}guIWFQ_}dT*R8{Ui{QBGYxEnRJ=bG)3bN0u>qds|mJbbR) z{PVp&_ZV^W=V{^=51wJB-0PhE{_G&*=9P@V^G^hE3vWB+?4RL2K1d#A?sZr(cFX3*;}C@pX4O{eJnYecX)- zoBob6F77TD&wf4W#Q)NKw~V;rvo8|&l*&Kf5_ivVS92K`cbC22IU1jxh+BO2B>uk2 z$0_IdJW2AVzl4m7`=4w2%M-8QJSJB)<*a{{kGr`#i_Z)nccbjaC&)OfyPTe@<}nzQ6pL{HPFoehBWyN8RQ0_|roL9aOgG_R;6oB5v{1khrt+aaXu< zj-NRsf49mnBX0Uv5#Ouw4~Ed+R_EiKm@9|PwzUUw(?5WCzLv8?m9zhM`?x>9{1$@O zIp4(w;(3&O+WVRog5MZ|zu@C3K0WU!xA4wT|9{3@?s3-<;(Jy8girrKku!gmsqWq8 zrIQ37f859Y{@JG7{BuC{zeC*O|0Ck&pD%s-{r)-a<9`3lxuDRW{`CBWkNf@kvU2lh z2lt}o+C$v@`7v?x=T|=cet-Vv<9>gp={&4^oZmlFl$(EY)jxBIn}3!PH~*~k>G%8R z2_N_SXO>PVv455;H~*|x|J+a9{PP&`!z#bor{C|Nbv3={_R`fT`D*FyedX@CF6kIQ z;FDkB;|G26D}DS2ANT9|Re2MUz0)Uu)~Ek2AFn0f-Q|MQNB+z|T78Ml9?BW{@UD9L z$-Hd{9v6a-3&F1s!Clzh#n0UUT!g<7f`1W$pANyRXno7iZ5)Dk4#9^I*Jeo|D+Hex zg0Bg|w}#+*h-p>6;cbDR1$F=~zqnaz%P&pU-SJ*NUc<-x`gm<0&+zePKA!30 zkv=}r$6cOvmwWCc`7{4G8FxAEm%rAd;92tBY3`t0?)A8IX#J}W4$9@^^W@LuU0vfY$MfaS_;d&5a(s&X8NbCr zxg4J=f5z82D3{~@aJ}tOaDP01<>S-jyXpDS$DJRHm(hIZ^k3uSwSC;L=L#S9>xuSp zzaCp}=JdO|%bxp$Pu?HTjYP&>PTrp${PFD4jOp>Gx0yaZO1`_x>GA8Y@6n?18Rg@# z^4;`XK684m_i?kFa{LA#Pm=HMa(tHjnI4xn-No|$^x)UyPY<)@KGXA%r=_T#XMFs6 zpZv}c{A(Zg$4@gK_vef5KJJ&da?yp$pC0_>kDt4CfV-Uh9Qm_w`Q!OUACHvp?sD>T z<b@9uJX=E@?)T>cAHPDryUWSn zEPp27%0amtUnqaZ{pog*kB9PqJ&|#j)3aFqOpi;e?s9yI{2A}(pj?hGl|SQVDdxCK zW5)gY?p7c7m!EzfBQoxCdi;EtN5PlLchhg>r;}gqBHrbS z6<0=^opf2?z`61nt>4{8CH?MrH~Cwk?e%It?!s7Kf2*^n$HMNMti9^=bdkRb4$5Wm z>3CO<;Gbst-JZL+n)7j>U?qz2m$V?Ua?kjST40z?;}5w=lB*X!Jb$`y<*HzR;x*?0 zj3B;66XrPLht7dNU0!pSJ$LF4`0dJGcNxDX7V<7nxyyLhOYz&K8Fv})5{=*Y;G=-= 
zQ-fP8_e?(fdgyCGJBZ&B5AZJW@ZHe!8S(yG!M`KEOymDF@x#fGKgSgc za#{TE*Yr@8c-hY(SC{zsc2L}u__)6y-;Vg^5|HmsJlADPxe|yc4gk0Qi23t&E$}i) z{*>n5>BPrq{NF@;v1at;#GlK;bMGadUlsgu;?K-NxSk_kZY<>AApTfY$nPUQOUsQz z#1Cl$bLctHXYugIMbI;r_$R+W&ek%U{3l}|e?7_9)bjQw;#X=1cQx_D??C^9#5Z0C z{T~oN5drzHh?{?o5&!IC$p1|II*sQFI=^7yGXGQ~{)85&U5TeW3jO_wTlt?%{KJZn zA4fcF8p1n?_<`<_zlM1InvlPV_=R2%R@wl>P5v(8f87Q>PZ6Kp9Q;}0M|Z(LyNO$S z;(g*aj(nK-ixZQY2cxsmIfGWfLjV*al{{4#xyb%~q&WyJSnqXpU{;b&p@@~C|pQqQx@7cub%>rLT+~gl3zI{6M?;t*>DfH|l{$*?MgT$9> zyX7eHaMu}?D@+@97Orag+;fTVYXtvXLcFrpJ57i$z8&(3#1B=+bFU`epcmvP5O1vQ zmZ`+|4}$zG;_quiegW|xwESO5{N*XovxfM3Z6|CZe)cBF?<9W9GVmkBlkY}&1Lg2( z@qD%{x`piLbl}Je>IFSAlmX-trpgw|Pc;?zAf*pGNXuR{_r?zNQWM0^%>m zgWpR0qd&l(Aa3#eH1URY;m`Mp*VB&00pb(dLjQ5%->E-sKGeb+brkaD9FWWSYg(Vy zCcdf({BtSs_@8ip4Dr{OL%tvJp<3<@BA)Xu^rREd*7o*<@^E*~u8A}5O6Z?T{FJtr z<`Cce74E;C_*BimtB4<*5BUwmP5x=(o3tKzhq#qv`-mUY4(mzciyy&re}BunF;pYeGJPc$juLVu+i3AL380K>Uv(Zt|JLFMABny^*-dFC>0L zF7&S>Zt=N+_|d8WG>|CZ5}j zcnM8!-HG252mO7C_x%id(uh}9e`XUObRp!Y5wEcs{1)Qt4}h;EzD(Ph8;EZk2KlFn zZ%Kv!-y!};W619#e&AZjA0ysqD&&7AUS9iO6}9}c__y-^eBz(YhyJF-N4Xp2YDs*< zUEsZmzq|lEp7?eh$Qe((Yj?;`As*0v#7yFG*^pmDe5aDlt=jcGfC&Y8L zemqEgV+7*o5904>`B1hpo@4R6aXWrjA^zJ#;Ejn_=nUS1_{6?=ZY=TEry)Opc$1yr z@2C%ZZzO1o2+l4oM(>R~tNcIPr@5`&!~Ae+``Ok=#(s7S~mamph;%@;4LTsQLdf;`45Wp3TI|YrXIxag+a?_}ISCQ$pJ{7C$Cmj`))|;(fIsZt`u3 z_j(KZhY?@(E%<2Sv#)~wImAug_Q5cJZf^Kv@usH4nBf- zMV&~W6oTgyzjhP!EF*4u?j-);50HPF_(eJ&@hWjkSDz5yITm^j6K{1d_$lI+eoAQj z(ca5Ct-mS~e;^Zj&L@7;Z16_JtG)u>jCkU?c<;T5?@5OIkPtkH_%ov*KZSVmK=2!h zpQ{Eew-Rr@4Dt^W@1p5pJ@Jw*qU72^eD!4Toy0HEd}nr&7SE>tD9NwY3B3~94>0*& zI(}G|_>@z4Ze8M*FPaiJJuQj9HV5ye7x7<*fe#?Q_5(aOop_%?;910{#DmWxK63%| zFCl(R%fq$Af7S8r$B6ILboDmz%WL7e9}q7$3eP=4{I544f1LOPEw9R`K6@`K8$iA? 
z@g5oAO^H{$4LpkYws4JS;#U5r5I^t*^h_e&T*vG3iOkOW@yR_QUrOg6ES|r}1#d`v-AU+aN_>fqNAx55nrt15)s5re>>uzl!xBI#7#blcymo}*Ah?F{CFer95sMl zP29?twZu;hLHc}!xXJG%UVR4qe}uTnpCEop+cOpG1;?k!S0}z`667L@n|yoXu{!=X znD_%af0so3EzMt3h}(Gnb;LhXJu8Tt{5{0G_dtBUNc`2vfOj1r-t|(*|3tiGV{kM4 zvvg?V_GRlspYiv#{;fyc#{a{Ko1Rw0lSe~uU*b0YpGbV^eUQ&2Zt{7=JKO{LMZ|3$ zCvNjQ`NRw6cZgSCi}2hc@Dtp22 zAinE2@GZnm{zc-$)ZX|L;wFEPc=;IU4>ZK5rB9PDOMKW!$TcK>JQeRNocO~D;Jt{O zd_3{k7LcDt+~jW{e)JZ|-%s4+A0^&U$7$anZt{DG$E`+qPZBrz--(y(1OHs8}NU?zh^h(M-VspG~%asLVgx;lV3pmtd4&@N!;Y0A%5Ta2=6Dv zP5vP9HZhPlLw}1OlP}u{{xx1+%l{_CO+J!%UQ_7rPyB~$=p9V_dF`LhByRF^iQn)E z^gK-5iFJq;wJwq@n1&Zx#w$t$l}f9YZHI; ze8_hq-m5Hlcj9ITKaBX>^$@Pn#G4HTpHJN6Zz0}I$2YeVH~F`SzqlCw{Drv5|3&<( z>mlE;2|g`;Og@}=DeZR;C2sP`#NSi9(0Rm7ekt)DY6ti{ag*Oc{I-YT@9&74{7=N= z7ec=76~%|Q5%EkNuS+0q`iB#Lxe4CyO~g%p3Gw+lp7|VclYfPHxTeqJ#7+KJ;;+OZ zoOPN8hxgvm_}!5BS4+U-h$nsuKA3pKAKrZnV-5ud2@ z0;7rFn1<(0CvNr|vxxtoGfZag%RF{Op^MA57fjlZcNUjP!N`@wgkHXFl--so;+iH~FWCKivfK z9}_qEuZZ9M4CG6-z^BE7$yXqreh6~Sh@1UJYvM0OA-qF~oBSx^LpnkJCgLW)gm`{4 z$UjNk|5Mv%9}+kD&xu#A1OJrPaYsu(CSQqok8H?aMcm|j z5dT`|cT$O)d;Yzm$;2jUPHY6 zcszF*aU0*hllZzbkl#xDMqP*dDe|Lv{QlkN9Y9m(C;p3 z$WI`CXgT<7;xRqJZzjH}mBv5uK{rGGQR2Be4)_Z3iZ-AM!JZ-*q?iFC)HFCot|LJ|!CRPZHmo3%-^3B_r|NcZffC0p#})-=O*KB=P({ zkpGQ%>JsR0(iZOaUT$j+-hsHSzw1hT-vY=FB5v{{h-Z|8d>(Nt=VuV#tmW`Z;I`0&cxGwvro>HuOXBM_ z9d;*f@_mUXWFS7%h<~E>YzFbAOW>b5#P3zRh55wSYQ1wmaZCSC5WipNxn2vwcN2eS z6#V}Mano~{_{-HHf0nq(m%b9=v-h<>3G#J_n|wp!-;RKMN8%>mh4=;6;dxgR->>yl zCh-x|AfHS8ouzp0&BRUqR^nR+LeDzlR-QaZ+{Wv+6TeUUgC7x})ervpoOqa~pOeH* z{&(Wfc7~p+9fIGB$zMc#vbNJAiJN?T;@z^Ke=u>AA3=PizOOvuCVwsQJ7&V)w-Gn_ zRm3mKfc&$>O@2G^t%D%{Iq~^gzaJ(({RV{hEb%L+gWLKHOaCh#g#Pn8;?wvNZU46* z{@#Vq)0X)4S-3xuc%_x#BZyDe`RMD2+c>~t;&Zf|`H=X?b~1nig-2sJ)C$!`9R$Cd>U?6-q|61Vc_Jgvtpo=r~;;uq-na4X{0{^>#7 z^z|fC~<4IZzR6s zCZyYUh@1R}#K+|Yyz3NklRrazypC_wjK+NypJsPapZJREkh_Yw$@d`sYwHr;4Wo$L zcxEPX8{f+#o_!k6T}0f*A@3w^dhR10UIThwCO&0O3D4nk;$^g){DrvL!G?8#K8rWA zUoB63-(cv!gt*zkwjgeL+7h3v?Tt9%X4g7|_`b*BzbxV=Kb837YDc=1xY@z3B!2q| 
z=-)ux>|nPNA2l5Edx)DI?0(`qH$nacakGOxP2B7rs&vJD_Fin9xd!p@v3M^nh+F@z zE%85wLT?;#vr`>HymS=wPbO~i(};f;2K_6Dn_cTY#J9Wz`KO4RUF&ni8)?7zJ>q6Z z`U&wDw7mU^xY@P-PJF7iuc~T2VDWGEqIHRzy=WuiKYWgGbt7)}s{@Feo}t9s#X?U$ z@s`uUXA)obGx#dvi;}@#BA(R%@AnhpW_Nm!_)phD&mY81J}d_A7H^BR{H#UX>`pHs z{^@zR|4QN}-<9~F?T{Z%-0UAxh{sn&c&{OD^0SGD*N6VQh?||{{lx2i2>IuUoBR&q zXBxskpAt9wuCIyzpzXBZm}ld;f$sRU_}`U}-wlYHU0f65O}0S34{?(pNWA|Okk2P> z@-vA)7X|qTiJSe~W5m;ReBdqOCcl^XjqgDJDdJ|IbB6fn%Mt(8b==hA&Ft7OCf@EV z=#L=2tSxvu;!kLMW-#$5&%tw(i05AbJyVGvY5{&d@rc*KR}*i$AABwGH{Jt(lXzY2 zcf3n{gNqQkjuPLe_ChC#XH0?oxjJ8A@%h_Y@T$Z=)A@@i;xjdUcOZW166hI8d|fN> zF~n1J+-x3kIqTEA788H}Bk;AvSL%56dg8xmKjkIj;aac1PQ3F(=>LMaty?@yyq%7# zm(qH`;?v~I6K{PZ^oJAQsq^5ih&N4xd^~ZJA4Ytc+ObU`-o66T^L51M?t=bB#BKa` zJ@LDAe(fpZ)y{?fSBaaRBgAc7;#cCQo`QeQ5dUc}BYWA^zD+=pRBnts3|!;#IXBlS|y>XAp0q^0yIBejEN-Mf}yf z!Jj3bq5j!UJW}WTJ||xCH1r%M{%9n^RjN1K&y^3?#5kt*t`&&CG#m1l6MsJ!Jc9Vg zn*RF{AEWE^2ND0RH2jf6+~ltzK5Y-=R}s(G_RoXFOZ12QcH(2TJb8=w25oO0CjQYS z2=8&?T|S2XYC7*{@!5Pmcm(lpE`y$U;$<~o3?tsVFZ5&+H~HzrYpMJS;^}js=N{q} zwt+uQ+~i*(en9&X2Z-;!7ZF><0KJf#f^<3O<2&n9eWf5I^1t@;4K|UB~U05#Mz?Kh@Y?X z#h(&?sRH!>Ox*gje-i&;2IQ;v!>7g1<)7jA#l$Df1#eIMwl~1L5dT2y{UqWe6YyTf z60bN0dTu7(OXF=F@e4kH{5Il4wBP*(@#Wf%`hxhhPod`s@rSiNe2$hM7C)`Dzgdy^ z)oKsig!lu8p+A!NDIMqOMf?U$pYg=^=ONy%BfdTk`WF%}ul5HkiQ72*1H>&qK1n?C zHt2tmxaoh3xar?V-1HwL-u4;j|B-mFQsBQ4f4∓X)0!y_X}$A>WYr)7qZtK>Xr& zA>WI*$tM$USQgKnNZjIKI&q7Kxx_6VmJvU#{n>kon}4?-3uQ_2XB>pU`!?$BEaS34i`d{QBnLR}8?Xy~m29@!Xcgt9FN6 zXX4$z1@BLM(<1Oe#J|(>a|-cwgYewzh;KdtJqwA)-46NNh;P>R`D)_(=R?o)#J4sB ze}(ulwW~Zt{AV49|Bm>j&ETIhYNu)OnYjS^D-vI${kzME-+Tb_9f|iG3_g^2f8{q2 z|4rvZZzdjI5qcgXK3Mr1#Aoe={2}6#v>jU_0qz#w@;aYej`%3GPiR2=(HgkF9q|=M z!MhWmq4^?%__fO*e=G5>yTBhO{!?@#=h)!-9|AJTroEyN#J`;a?`-_-%(+CqGnwr>v*pPh~89wJ^v%Zn0;!Qn0O z0)Agi{I^r!4T%r^1iT~hxtf2+5$~q`rrE@w)_Jxh9Zr1zFz|82n`-*cBi`*I$j>6)W)t`l z;@zJBzngfK*TL5j@6#FldE(R5K65YeC~dEvB!2l3=&7pZljWDQ8m>sa*bIhu}>^ z@RlKXZ{lyMU1fX-K0XAW9D>g$UUDk@zcd7YI0Szp1b?0QpSrI6?GXH%5d4P_{2VP0 
zEdKYU!v7UQ@XJH+79n^);w{x~A&>alI!JHV63^=l|I8L{|NDW zwf(uBcprEt@Fwv_T2FsT+}i)gh|jZrI`R72uBfW>-WLDXU#LsGV4qUrroScefVP+7 zh}%5r2;ychol1PR`g1OEvpZTw-0Y?AB!1D;cwbKwxB2uJiLa;!`F+IC)BUDCB|ftu zURc+wai7KK9<8s36Yu{x+ZVo}uG8Wk z;@|AbY7)2n(vY~N^9bUW-Z~Pu{1Qvt(s?5BLt6iiCSIu?;%zE%o9~!KeC3M>=Mv(p z&w#Ha-bUru5Fa)X&)r1al@jKrtCFMBT)iQn)Z z;*SCO$&_Go5(%4UoT;xXIs1{9A3WK26-l znO`D)k-pyp#BE&YTjI8l){n#+z5st(JKf?xQQy}E+I~0waVF$j63^FoxRUt7osb_w zJW2JBBHm>>K5CjRqW=-*D<^3hJ>mM`8TzOg0r z942n^$B6%><58tY;XaFJ%SY9STRy5qywiEOza??YN1cgVKI+N*I_Mci-15aZ;ywO^ z{Po05ejf4Nnr98(wYgb=N{Ir&<;l!<7 z9ZCGv!SMf;#I0T3nRq#^cVdZKySg9ovu{KHVB*%U9!K2rS1xhOFV_>dbiR=Ia2>C^ zgSfSmpCE4SqgE^xyH^ImAtVA@SxazmB-o*H01;R6xFX zleo#hOZH>i6%7|M?8`oFqQ!4)EWJ z_t5@A*jTt*{M&OY5Vz;nCw@}fr5%a?ZtYd#ox<_n(}>Ti1AojT{!nGeFD8Dm&hOkt zyiOSOtRqDO-etHV@Zzq1I)+28cuY5iHzlV7K>(KKP@y!b$ z-$2`m_FkT;2Ki>hZ`JYS_QX%cKu>q#ck~C3C*J4}=uaVDR^w+9@uaPgzlQkGm%(o$ z-sVB@WyBwT68s+GR~-R=lz3EK@TZ9{zX$vk;vI4j-gk(P&4>Ia#LKB&`eEYf+V1*^ z_<75q=PdDWSA*NUioM5@}ncv5 zytmfZQ;C17;~O^;zvm+8zm@p;i^1<9K6gF%W5nA&4*n+b$~xcCTR$zHFV*(mK;oOe zfS!rO?HtFM#DCKH&YOsvo_mQOABy+&B=KyuyLgtk>3N^{B25op5FcC(`pam$%)*sZdUv+cg?qWr|${y&g z-2AzBA>;-VZ>#k}3h~3ALw*wRF&fX;5dZQ?$S)*bwL17p;yv1euOmL$xGjiLcZA`!Vr&&A(p~e_Q)6KM>!s63_jOcwg;Dm(GK`#dG+@_+5>7OXc;5 zw^JTY{3_+WiNCNG`Uew#N!u0Eh__w=`J0Ik)A6HQh>tA;`4z;6>-w5i#D8B0`TL1i z?}GQZj`)hkkhk^R7H?HQ2H#Bb$(rw;BVJL*on9vXinhmIC;sFi=-*9zO%D9|9`S=3 z&mR$QsCMR`6R*7<`oAWA`Q6~(5zqY#{72%iYJc-r;uY2I`cLAgl!xi~hs9?vZMT#o z{=3?pRwkY`82+hFyi^&)Pc7n&+e5xS@z=fwZ$x~=hv4DFr``tMl6bjO;O&TSxd8sT ziufJ3g7+l;zCL#Vaa+$kjCk+Y@Z1#Qjnr-`gLrG@dBkIs&m=xj`Ax)AlrJS-N7MQ3 z#22_1E!W+|cYWdEzgxgKzOA;o#2X|*{t)rcv>rW5yhKaLpCCSR9Q^YO@!p-l z&k%oE$9qdofj$e@yuJ8cmiPnjfmb3P^Dg-L#FMpLx|sN@IzDp=@y2_g=W^l$^nEoa z-u)`bw<13DUhpf4_qi23n)nc1uhfJ116n@!C4Q04cO($+QVaTr5^tD>__y&ni|1HP z=V>Hg{y6kZAl`H)^h_q+=_SZdA%0#x@N0?p(e~|Z;{D4(|9s*pbHSGoU#Ii?w-L{} z6wkej_yXnk5`RGXL&WX;$j6B9SNW%ipH%(~@d^#$pO=VVrrhc=doP`p?YgLt0u651cN@GenahWI+=6^U=s zend6mZ!51!Jf#)FRhM|0j(apD{;Q_vPQ>qsg`NS#=WG9B2=RQ)?<0vnrTzC*;=P(c 
z|9IlD^0SDS9tb{<`25b`i;3r|p5?^r?|}TB#5xs`P2|XK$KhzfC-AX)P+j}n(&-n&=UMJpH)5C7!t9wKKJ>pw* z9P%ULwQhv`=fqoRfBzfe*XlUNY2rU=z4jOJIofV4H4V?Pd^GMI{H{ZMp{A2?;#aEO z+Lgq+X*t}B_?%aue<<;`6~NPpZ|RQrl21Hb$J6E!U##o;7ZV?)`Efb%2YTVTcM^}* z{?{7f52~HXTH>#7fS&clFVKGRM&b#Y&bJaDnhZTJ5`VBZ;$a8zpF2bTE#lwr2j4^d z!~W2-k9f4!?+1vF{|$P+BL30q;75pmR}cIc@xkwdpCW!iGW_#9@vY}UJ}@1hmOkIo zbrYqDzkUF66^NJ8@}Vm6WrHAJgZSL`;B|;!`YQOP#D{9SZ9;s>c*wUPep>ezYE3*` z$2U38e@~K+s|wza_z|cH3?#1I<-joFJ+-_VMZDU1P&9^k2aTT$ z;)k^U$|hd&3ixv>@#Z&x&m`WZ68IeAG1{J7K)l-u$S)W#1Ck=?k3(PfbiZ& z{4e$Y!^D%doPC`56Xl_26Y&dYfImyT{~h4lh`)Cl{(p`5EpLJEB7V2dQ|~1{eH-NW z6K~uD{4?TVI$n2(xUDBVO1!$R^F2YlZVbZp3-KrP{hlE{peN)@UITww`oBc`ePxN4 z`x$bTh__t;em?Q`QP6)8@!Pe&s7L&c?$C1?@mIC~)s%QsO%IX8@740KE%8fJ;h#># z7pt9TH{u66!#}-JTQg$FC(ERi+FFf*Sd-Ln>zlm zoOt#|=(&&h_uqnVAl|b#{JDep8STftM|}1^==qxXva7*=CEg+#{7>TT=i|9yGvIFN zu(i&Yl_UOU1mr3cuU7}WI`N4wg4ZJc>^wZTKJgtLAm51i6r+6U6h9@Z4XB?;HYthWMI9 z_^0Hx__X{LrTd1JB_6Boz)OkuPQd*wh!0WTk$5xBzrBekOoBg$51pB_n(tmE-e1Qb-Xnfc z+ml}s@7EXpJVCsg`u_~^DcY|rcU|y%+@a%97ZRVN>9Zm6y6y4Ymc(aV2Odp)PZ)TA z;**<$k0Ab-hHC=x=X4xq8u1UeBRn?|-=yP6w-Iln{=c7ir5~Z^N#bkPfxkq&!v)~G zi9b6S{1f7L>3G6X;we``{uknlvk*UF*TdcN-Dr)ss>Ca6Jzk&qui7qZK|J+h=)H>g zsJY<%h)>k+73|@6i6yoy5$)CJ#1ytk&quZgcxJ!gqqJXD$m zeHNb<4;K@+cxX!8;^9i-77x9Mzq1F=8%q3gZHJ5_ZsDCm+`>DTxTV`Q#4Wr}5V!EY zNZi7^o4AGd0C5ZNcf>8czZ19cmYW?M{}$dF#4Wsyh+BB0iCcK%h+BB=+%tRc7T$>@ zZ{fX`xP^BiaSQKV#4Ws!5V!C?L)^mq25}4Te&Xe{oe-E49G^Y4+^a;qek|l_5l?Sb z!gB~Get0%`OX3qBg`RfAU+M>b74hw>z+;FXxCp#A@!ZSsT$|sv_qBQv{#b)3m zh|g&PoBAU+{MI?>U`O9;;(gw{GG%v zyB&ND@kf)u*Ag%P8~A$S9d%r2Bk>_WKz=Ln4Tr&BB%ZPqd1~-G1nQfOxXILgIDZY|BX{F@b%_7a3j9*yGqs=4g!s?>;m;Pt*K`JNO?+7r z^mibB%_#6L#1k4pPaonpXnq_G2)9CLH-Q!i*>y2+cz|P=Zh^HL`k0W00TJVv?V|5%jllX{R5Uv@-7mWtLmH5=P(0>o{MYX{n zBfh>b`18cG%0mB6;t!62{Kv%CX}>8F4T=B!HSUih zUioG4uEf``29GEHWOF<>nfSr%{`JJIzx+J$ z8>YjbZxU}e9{elfAL@HQPW+6f|1-pIDFHo=7rk{Ad9e6nLE6amN6CbK}lKqK)rRClT;th3Pbpr8O zS3&ipM?Bv#LK9E?kE0dJ-o*!i6?0L;U(g&&Vl~j#LsF!{S)Fh={Uf* z#PhX(`YZA0v>j4Fi0FVuEgZ{p9q4f$cj 
z@5l!qNBlLdr}Bv}*L*jJ_)A(IK1DpBA&+ytN{}q%q`EhfkGgzjQCS&Cgi;oTvPLl7CG5o9l>|(Rh1;_$6v5u$g$r zdU)=0#D6;kzMc46oxt}JZ?5xM`-vad@rTcdKceOBA>y^#LH|+WTAT+?5dTEm1HTYY zycBxQ5Z|l)tddLMZt?uHmNR9EkLv-sO2ivzKl*&)XI_W=MZ~Yx{#QNXIfIa{E+c;K zROo3+{QFzLBZ)83=e8wYP20(~&fdcNoz}D6NPe{T3wjan(iP8*BYsbR@Il0%jRn7& z_}%pp-qFMlE(ae+JasShOe8+}dhlH4+CG_1JWSL7^~7J%dTK85!z#`F8cztjH9$HY%*yXy<$Pig!18{)4;!$02>U)TrXJxP4gOz_jhA5cAi z5&!8s=r5)7HkL2GnFl@R65sm(7Zb0d{hH;($Bl>loy05Z zdt5`jiLPH;OFXYX^sFad^Lp@&#N%5d{##0{NmTZj}Tv`^8&|+uZV&CDdL&^z<(z`T<7BgT7O!8U#0D| z(!~3OBiamMINPPf>#4n26XG4NfSwk_8$AU7v?e}D z%kvJz`{hGV7vkY}LQhZPCA1vtM|_%&M-3#t@?+>3M!at{o;!;8bALj94DqenPR<~{ z>PpCG6K}o_d@Au4bMf4n#A|B5V-E4&r=e#7@%KwW&r;&oX#4hd;?LdyJ$DoTY#{X9 zM?6aV5f2k@ukG;1i9ga0dNvWCs`d4=%wK|kwh{lbHuStkd|({pcM)&&Gx%QOTQb4- z6F)Nt{4?S|wE#ave5sDR9wlBz^XUoVXFq_RUx+_`JNOym4Ru_!r1tx*97x{@`Le_h zE<^lOB7Xi4kUyVzjR^3Ih*ydPuSfh(tv4?tKBGM3n-c$A$KfN1KdAFHZHdp<_t=Se zna7~N8}X}h5#C`t}328j(GQY@QK8aXgfBS z_;uGneme1l?%>xG5AOs%mv}uLZ&^ruuD-8Zi8s;l$(6*zuZRBC#M`$5e}H(x_lTcI zh&SyE`3=OQbewq$@fpJ)|2**2Km4yMR|A zZu6QK5I+zP`P#(4)%vIb@k@1_qA~Fcm%^XTh#%DPgec;<+RkiGe1VP^btXP70{XiX zfA>}RzYp;$nr`EXuWJK6Lx@Md2R$Q+_nHIwRO00{e#R63Q0HZ`h_B3s{(RzPw0}2) z_}x*EpGEvn{XLI(LU+h7CcgA1@a4qo4+p=K_y^kVT0?xq3dpY|{-dt%TTlEPO@|wa z|JnfYxs`ZT9shih_?jws?hfLW7lFS;e9J4~dx(FQ2ELE@lq3O#D)HU-LC;L$+eU)V zAznx0X94k9FF<}N@v7qx54RKVSs(mv;*UH5J@*lB_9Xbj#BbN}=f{aZFaYwKh~L#2 z{8{4hT90fa-c!dt{;#yVfUn~C;sqYu-95Mkx8Om7LxMXYgg`<`0!e`2!7XTuLyNmZ zk>U=e6fG2r1Su{lP$>R(@9!Ks{J44j^WOcmT<-UtIWs%EJ3A{0N682LfIPpFkBNr= zM!vWo{0jNH{OE^UvswH8riN^ksp=&CXMAdsoTkg-pJpc z{8u@?2a@Y^C;iBe$nk3wxjt{UguIPh$2dd2Qm)V3BCjOpJNL=AC&T)ENk_Wy1@|F;S@LZM;PuJBk@K%fZo#aPqqW$~H-}%FjlHdCg`A?G9l zKgmxONBl$b7t(Kkk;lo5Jg><697mq_CLV|nB%dPl6a9Arb$yi#M4mnr-|QuPAbHkW$Ulty zx!i{uL%u=o7fd8Svkm#bCC^w2`Dc^2lk03hkbi8B_@(6S#=%#Ur<3!7jpSwIc)E@J zp0sl}d1H69{~&o=H`IHaykaH9pCXT}jQHQkFSdp2b2_@d&aXnA8x()&4g4;-mmC)! 
zlb5fG{Ljh5YA0|0B8EG~`Jj^C@ko{(d5zXQ}VIggj{|ewrMQGLf&5`9Kcx zS~rjg)dr;}e1 zpG*Eqd?9(-U1Z;p<>7r6-S;|Q26XKk@{VM2g;|9SWk*64l{Ljg&_C)*}@;P$6h%*o6wEtgq#plH2iKRWM z$ScWps*L30y-{xt@_{ms%}0K%A?htkKK4AkH2K8u;1$UuYr#Fp&&d6+y5zksBEAXv z-G%U$;pCh5S1i zug;NYDv0>Y8%!|#zlu8a0OB`;nF@h{2aOZGM{%P56h1Dg5-_l_*I;|zg*8LN1j-=^D5-}dy#69cam|t0eSUg z=+CC)?c};;8}f&V(S9HDl(mtk8+rF|co6v<>1X|YqT2sYByJ$ZPpFQ1hm(7JgZ$&j zTYDmY3i*Axo->m?Bn|R>Pky!%@+>8PFXQ1F@-4Dlo5}xKi#$J(uax8Ce)1M_zIKfK zd%13Zirlj$+JAvOQwG#~mHcH@_-*prXWf?x0CB^8OR68`CWGMWdqO;dCAAg^{67`X@5m~N|Dc(>x%B=N0K9db@I|- z$Ww=Wo{ayE$vqk%z6E)f^6>WLtBb+Akniw9djiRCH-z^hpDO!%Kk`e`o+0Gd>!RLK z;wTs^P) zhwMQ73G$Dt;b+Oyj6-`ak>|gR_-OL2a@@H~zE=9-3HchCNBvEHSLSu^$lJ*LB;F6G zU-!Eo|B=t+C-1=1k*}2bOm^}mWwE^Z$@5M|p5o+NBH$Is4~ct_pBJxBep|d5`3v#( zPa3c-h_N| z6ZBha@}W Iwvg4~D@5$)8C6Q1bp+5kHVTp3G-Pk|&yp_Dm#iE6*#=AirB4@jsBq zmE-0La#tD0Hj)R)`Rq^R9mNlle~{(iKQ2YJvI#1|mX7cY+EixT8l^6$lO zlkXRQK<*~{Q4IMbiGNA{+Z?p>9r-^pkBz$s-|KdsQSOr_CVzGpaVg2ql#b*0B0c%3 z=7`TqUgq*hylgai z{Qk%jMczuDE0{*!RL)yxk*5eip6|(PPl7KYFXDs#Tt)u#HN^DW{@vSih7rlA5Dn(E#wnqK6a4&@&&}7 zC677?zd`O-3m!v$yd~Q6kv!vgc*-Rxr`toRR`{HQJoQ+(8+kj~Un-NgoPziUA-t&|=e0p|reQvN6dE8oPkN#d^&2zQ^yg9|slX1zH ze7Ia!2_gSU=4pe;mkvk%3FL8QyP88DXgADyEhqO|0soQwDH8Ez$rH=@cn$I;84>SCUb82> zAGxmgdE~#!aeO6th+Ho{L9Y3)lWYDL{w0JZ;JU$%FQE zBX`XTA4Qz_6koq7mMfY(_j<%X zA@`^Xe@EVMF!Cgo{a)*hN`?5$5ljTY48u^YvsObavA4zeN~G@d{*)^GLb4oKD{5}%aaF>gx4ky+zxL>-d~<~>P$XN zJeWK@7xD}uA2JF)o_t3%d=`1f`0!=qyN|)Qke?AhK)$Uc;!l(B%mKef9(Dl!h`et@ z_#5&MG9O613g7Gcx?2OEGm^KM3(rS>d>_0tdE47?5AxbS!JCjjnt}bj19_5oi0?r@ zO0Ks@kWc7~_|fFW#=^fNA2krZm^^Vmv}Y5!r(9RxPk!|d@|-5m{tx^*d7qT06$dj79+Ze>> zB2Rn}UYuO#+qKAbUet!Xh8(B-$t%n8bPRd@J*ZcIZcN*sRj!*YrTAxZzQ2WCw)2DJ zXXSkDEO|n?UU7pwpN!8jxek30&+f_=Lr)c|=%XvW#^1tNx>qdSt3jI)-JW%dG zHX!$uWKKK?Z6WeAlBXPb>M^Ld1-VxxTcSeLidjdBj%4ZzIns+uJ?z>1`4Jl6*~0v?r;|TeW{4$@uI>{%aWW zR3`5$=amh}f4+?Pw&e9?dk7$3SPk+0$XCmAHKWOo1R#Doc{(|+Eg{$61HXe@w})Hg zA0p8|G30CIdPv$0s9*cdOSVr}^1U*Du1Ma$1M=4;FL?!dJCpB|=fi`^?*t-#1o{0F 
z@M+}bW8e$PBgVnEk?VGKoxHeQZ+%RjP3GanHX@()PpUjv-b&>2_9Lzy`Fskx268vVJF#{~3Zj^T>~uM*CNjpY4J8?c^o1!ha?&CH-)kyxS1O zUmzcv5PpTc!AbZ{@>?>V-yR35QW{Ve&UG;qK&%XJfhQk}scx_*Uct ze}wy!H=G9#C*Ri|K7w556|>0oII@etSr+{q}}j`z?t)Poe$n zAo?QFQlU)0)1$o*BSS~+u?axqh?awjf+Mn~t&&d7OmE<>Netv>{ z$!N6aGP&0Klw9kLFWZaur~bPmX~C;v(21FOmZk>kz|^34xX?=kW^3E&sVbzX6s z{H@Gqo|D(_gZ=&e#7T9@~X05dy_wGjyyi(<)l5q(D6GCt=auU85AOOfmR$AjEk zp3CwgcRz)CJCP@U2M;0-ZVn$ro>j)53FLabSU|4Fi*@8FIwSv1^24%T&XN~Ri+;F4 zzFh8u$B^fe>vbQ=XBI=fDP?}4>*a86?b%>EYMl|0TY)Eh>A zN9KXU$?w)c{8aLdiI9H*dAFzV_2fr}AkQB1IC9*sr81f8qJbh0-cr@xw zD)#}j|M%`jJ2R8V&4l;@$51$j0zpo2VNuE4DJOgB`g0?>{vNQsJ?MW-+llJp4*?)79r}zi?YmnDn4EH9_xd-k;uJwkK zYd?%5FCga^Gsu19{A~sK2Dx6bj{IkNK4A-a|8iLGd&t+w{&<3XytL;M`Nc`dbC`^6z!CEIe(29%$P;&kpCz9tKfip5yqIiP zZ^@S;jZ4zKXs@m>*UI>ugoApxHE&iY&_)I zPM%19zW6A)w*Lybw*N2kyfUx*NIqV!OQ+cPdB44SgwNT?n@XO-Gs*M1wLmX-yaGuNWM|7qn0Bd-aU?^KrM3Zx2ELU&pzb3UV_NOFQGjH z$+e%ykvEd@Y9{#~9g$}#dCSpp9A6wJ*X2D+9@!u9FUWO#jwj~<+Rr;=J*FdXCG&7s za_vud@{=+yd6EYfM|+x+@09(;mwa$D#D|a|a^0_gBCqxg^&TZZDA!4@lV|in{A2O~ zvb=A}y$4}=lN>@h?dRPk;hD&Ts>1V=_n3?FWynMGBfbWCC%KN~MZVP?c{-9mmGjnb z$e*o2e1Gx;%iv?kUll@oW{~UnxrAKD&pqTLt0Dg{?^)mUKa){S|zexLepUlHw zQhYMGZWjMA%4xiglbOlq$UL(E`5DmM$oE`>CzA6*?f+5t z;Tg$y$nVXlL9WLUZ*o13bSBs1ND#RmM<$ckNrd*yXP5J!t>gpbIC6+QJO}bzBi|sO zU5?^=?FSv7Q;}b6inyHQUR~hD$j{03y(;9hXCr1GyeYmXg<$=h?Q9$H+K)kUUFowDTp!)~ z^T__yl>B-{7)U;}HS&xj|8o(1CV73yzl^-69H+LB?~?0#2gn!5 zeD4hT&V8u&Ik}F9ael`4y8YLY@h~g7j)#TFbv&#=URV0xi#%n0)E7h^U#?3>kq=0T z{b&yP%#_Hpg1q>z@U7%2e30i5`BpiPK1-hCJH$tm|C0{>guI6b>U~EZwHcoDguGjC zwXOgA$n)(L$Y-rZTpe;9H`ubZ7aFX-wu)M z{Ov5c&flWRb^i8(T<2YJ<$iBc=}%p*)Z{~D+|Nm_^D#GaosU%_*ZEjIa-EO0BG>s? 
zS8|VL3}6j zihbdc zzPAZHANhg#@G|6AvchYUhZjJ-P03RfM|@}UkwxGk8rdGT|A!Su|Kue9rv|(j`RyLaQ<=O*LwJ4i zc(UGGlP4^Mcz^Qeh2df3FJyfUBM*}Ej>+VD{dPWik{Zarn*4~ge+T&wvOOFnPn!^V zej`sa0e*{o^G^6*jb*K_BVxRBmZ8;=fdQ5W!$SkzFx*#Px5MV zeYiQfZg0Nio#nZdh2)#YW4YFm_i=-tCT}+tew}>&aQGAQ8gd=)J$e2sh);eN@7izL z{@mocpOz;tn+@e_k#F=v|A&(=_eJ~&@?E9j)5x_w%gMDpyU0(;b4SO?7vDhsf5^2x z3D4ns?FVg7R&pI5N|3u&!}3-kKPuNP8x6B|OUV4BJGrj+KIAKUOMjA=ko|Nj z`N*q?|AD-koDZ%gFL)mDJIQssI!>N2BjPWTr|^gWN&ch*`t3P+h9ZcMBlYWg`CYc- zROCm;AwCCrI(goyD0vQzX{rZm3)a8{1N#YS+2L_=l(|iq`%3#^=6;< zjhE}*S;zyXBCZhm{7LW%>A^fc!hTj=G+FWh2DzA&->p?IgL5dsoPd$a5gc<-WA;7kedt7V^ikyambsn27e2 zBiH+bwaGuoe%h2gZZPubeR6G2Pzm&Z5XI;5h7TfdCGq3PyUKiZCV8RSsCOCpa)0=b z*IM7}!;dB%|stO{R1-mxuwJ^5(4uD*x- zZ5Y~fl6;=rr@lfy<1O+(B=08Ut=X+@`jn=J;=MsbKU*P=g4)2vE)C>b~Tf{Sb6lrGIHH+?~redM0=i-UzYiBoXhxL z`%@0fE~&`h$b2#<`Pk7YUzFTe<_VR^b$h5s-Yx@{t2KEU59IGgo}`wzg|z?H4O24$d9yzpCqp(`{Nbz#`5_g`SG~u z|2O1Y!{JG;p&nf?I<96R*XyMP$n`o*W%5Jkk+&ZCqy%sua<^IVVDj<>;e*K+N&iHV zUv@|QT=Jj*Ebl7v5wbn+ATK5J_+#Wpdn3<9@?8y3?;UdO&$r~d-zAapPy7FXoWEow zUsDnJi;*9zjP_I^eV)>pCjZa{zLR`~Y(GDf>vMzG$#ool zOrA-OFK@{;Pf}T*+MjLZdQ>LzcrxFuNZv;78`dN5{RGR^n%wU%csKHd?csgMuM~xk zB!3hD|Caoo%zGD+9}9`&D6ovUdUM1K5e@4DACGvkHuazC`NpTC` zYyZDpg3sB=|Gp0|Os>y?R3QJ^2l1Zd(>uePlYgHT?eQhgu@~_nSnzsMgvgeSg@ckSnqa$PkOxh|I*d1!rw4@H1jjN6Dv5MEnKvp}paMl0T4f;{~}cSCTs@r~R+TolN9w zjw3D~xw~BdFHL@}E8;!KAIo_}6LOuubs$d_i99{XQ;tIaM3C!pjUgW($M^5ZJ5NXc zt>i~#o~%F5s{Nz+FH^ks+f(vOZpi0NoZ-fCO_Z+i34PQF_1-^IO$`n4Z+ zb;sv4^3rV(UyZzn>=%v6b^UfB*W+L>^4@aaXE1p)IWA5h-@6;_ zoK60;0@}Hpe85M<|42Sfu1_8$f0_h&&X8}Jg*?~EAISOFV{+|>cjTAlenAqMAL;s< zgKAxhl84Iru0$SxG}=?2eCk4Y8}dr!;oZq|S4YKt$+HZD>vN#mPQBkfo#N;0L;gkN z9^b;Zl577QW|#gtOP)ME+Vg^Ze+T4`_WN3QKzPOk0QMXv2RL;g4y>b*wZMaCZ&nLj4A+H5ynk1wgnwcdQ>T5m=2 z^$G1>aB-FJVVI!coan*DCZ+{$o1b@UqL=#D)Mh7 z*W=nD^2L`Bf0n$F-LcjyntbFmEbkL?Jr2Gn->LH?nJ?)2Dk9ebGLxT{>i`AFv&nVc zvgA?2(9W9Vhh<*sO|IL65Bcc7k*6nlb@}L%^KT;8?d>SJ{+_t=#P7~Y5cpxpl&LB7Km>vtOYD)Ar4 z-CU7>8~N)H_#yJCvOS+Ak6efNXz~M*XwM6BUGH(9qMY{sNSU{!CLi1taXHC%CPMyV 
z>vsfs?lNfS6!J+8;op;Ql>S*mK065Y?j(OA z*AtGBFLOu!>*Vic9DYKs?T;6Oa@zm8eWoD~m*Zwm@{4lb?M7Zq?(bA0pCs+6N3Q#O zEAm~brT@vb{bA&dW&R&YK421-a}xPA59FUm-Xa0wSCOCff^R3kE#t`%@?YdRpY!C} z{#)d`2ctbPzP1M189|<4 z0DK~O?+);#au*N8Pa)U!xSm|Mn_c9tavV89{#4e>CGsJ1-gl2Y zj?AxKk&g;NI}^%$NBi@-%-=GQpOW*$eB^s%yedOptr^x!P4d(F942|q!e~!t@&_`W zhmcR5kN6?vZ{_$jk$kG0zt1C&9*R7x$?r^r|3rQ$3H&(u-?H8QPF_RC`Mcx`P9x7t z^5qrLZwX%DdtEP?a^iD(^1H?1dC8j$hnFV5FUwnl{G>nPdy{*~@p=gP-lK@0NxonZ zdJhwUCWH*ZDbtFNB(TB z^e6e_Xm}m+#c$v($o1b(=|cWM?uU*fFLDEUrjkFchvnK#uH(!xay<{aN?!R_$iI>N+(P(1^6zAPK1Du4w$E$iy1t&1>-tLk3iW9JUu%hW zW+0zA4sm(O8_E5(Qsi0dA-+1fwx1E)%$+ewV$UDk-c%QtB^xG?P?VkkypdRgK&7Yn;_bSBYCin8fdMQI*RnF@xlBa5g zJk`mwhrvC`_21EHNZuwj;+v5#j)41+H~$D9Mm{k<+CPQ7n;b`eAlLrcLay8Gaq_+S zkpBYtq&{fRZSrqqe)5bw;2h#zUgMqDS1R(qzDHbka{c`~<;ZIlL3~Z}-()`TMcz<; z4!R?`-beU`Ja=y7A4*-q|B3r|QMln!~4lb4k10BOmmBu9K^a^FG7pOZYi z4&w8ZHyy`#!4) zQgSajK5ixtkn7L;$p_2NIi4mj_#N^;AitR${(=14gy^5NGQZRHvP9ytk`K;}JZ|LE zyTeP9AC~h$ck;U}QExT!s)OOR$@9y2+K{}v^g~B-KUrUXOfK0wQqHrsktdexy1U7Xh#w^HI1KGMPQElP{1kZ&S>E5sugLN45AvLHo$UsB z%JIm5mprS?TON~dl=b_Z{F%%vUXz#l4f#KkKVJY(@E-5Fe|3}lddbMw=SBagA%8as zo{4mQd}nf>WN1%6@;Gw+b~Jg)?8q~nT#pN@$n`nZ?c}{1qTa*g zNyfs@k{asT#t9-$z$Bn&Y9$?D#KTjw=W7`Paatg{v&x_xgNfgyqK)t z{p5e%LY|}K3vGk=la zZHM@O$P-*c{*UCT#1qCrIo%GQN}d$t9&$fDJ^7p6$e)e;t{?L1&yi^UR&xEYFvYi# z{3Xe|h*u!*DPE1duec|9qoU#^=qBbOhqap_DRBiG@B$@Sj}7(}i=Z#acq zpWFGKe0h7cb0hfzIlk;A?_3h`C&~M*fd4^WcPQ4^eR6H*dva}OQW@uTy*%2C{F%rH z7lP*_|2Yx76uC=Iw5J;RFLEB*kbFco#J3?&asc`L$ur1#M<{vTjffvguK&*XMDmWq z5kHq)mwzRBbZ^9OBOlfa?LSPuR<4_!BVTb8d2W!`O9p>R?m7bb-;)oJ?I)ScgLFOC zE|2&u#%Px}XGPeyWWe_nEJe<|{@1^KfL$p16>#uDhaOXRIn!0(Y48U}wwey=_}VM4rX z|Ho;DdNYt`l;>6QkvCk3_%h@VWu94+T>Garx%N*8`NsjsKZrbo9Ph@F*OYN-Cb@1` zOUbKkLjKL8c`eo1%}sta2kI+Co>_ig&z=07 zT>q>_URKU$J;}Go{nnP`t>rwcJ^35Ce&S31Os?MskoT19SNeOxwBIszlkJ4!3(L4V zlKhP9rxVDJ$@%zP^5e4p7LwPJ>+Q?QcgX#wwdC{0pq-n^H_G$dJIHs4?<3deB(ITU zuXAyEM1IT{^}Zo@mwro}1mA0a)+>$A8Oe3tn4i3z^g}6fuS;lAZSs$D{hs~um|F!$QQ}-&LO|}5b-O>OWlKSC7&bL(+-g*nuPeX 
z(7~QBd;dsFGtARO8=fGZ`>LE zd5e6T{Cv$r^0#ulctgGkX}U&+ny3iFWDoM#wxcGkH?~(`u`&NfyRixMt(_-*DuM(NS^oP8S5iYYI#mg`?<+- z)SI9Dd=11`B5xc3_adKC4|zh#tM*2GfAZ}K5I>dt@HoWJBHz>yzJa`ON%$f1+oO>G zDtX!@@LS{wLg0VPaoEl(kJLE6@RR*g`)!#VcY2d|l)OX8*A7DdiR3#D!{?A|{1WoW zXvFU%UtSS@l00!H^v`+n;ok6jeH+FZ0#T zXMGm6gdE$2IxYqv(v%)t}e3zT>Efl{Y3;YDd&u<6+gZy@PwDX0L zrz%1ned`6@tzcas|515B#)8nSN`NKBT)HZ^3F2;PZPJZRz|4f z&y^Hk=RUsNMD7^~|Ha5t$;gvRuKU{gWfOFKTitMZB*o#a#JPV)8Sxd2{PQT@TgK0& za_#4fdM*NIe;+Groa%sa+>Q{=tEaT@j@^P+M&gVv+(nh`2<+_m_Rm$*I z;@S_||J}&7|MmW=#%uqNrg-iD#YVkyi_1~_b;C;-?I|GRqTSBohF2BWdbOXM8S%x8 z_`q1=hZym0M*Q?x;#V2*MUD9VvBcjo;)@vZ?_-HCof+%R{v}&)uIv4dxV>BjjC$WwyspQ*S^n*Z{6>5waqR~k z4{MQklX*xZ@{)4C-+_FQRMJP>u05YoZxqFAI~N-9a%sa+YI`j4cgg)_K9e9TmRFZc z^XDP|5Q?}W;`XOJMmv2B&u#cn$}>ymMU%+2-hD)>AWb4*LvrX zYo0aa!zIsFBY!fhog3n#{nre)+wUgve^Z{Fa@`|={60BdF8%k^vWkD|2U(e=C|=uN zgZ!i`+S8Og(=oU&xz^j8T$gvCxZNImc_Uwpx%#0yuG|p=@FkXH@@fPEkOQA?lYDre=`wz>X6S} z0B=QpSUWL5#C3s_U@8PJg6M5>4v~3 zkx!lt|DN1+7TU90+>WyQVVzw6JWuhP4j}$Exvs|-t+vO>7`yYV*CE4gDlh%{ z2j$V_dPv?)?svZ>*LoAk^&zcyfz(^baGUm}vRmWgQe9m8U-NsBYkPdiwcbD@zrDQP zQtu$cZ7Pw-uGPimTgvmXDAw;Ha$S!b$)D6j9{qg@+Wxn)ot&ijyKw0WJuJ`-elIwa2p!_=}e-!!r z1Zd}C%A@seBG>I_Ke_JL*U5Fg-xs%++TNbsvG^{@3x3`Y&17DYSDgEy47uLlY(&1T zEtV^gd~I#iH#!zy5sU96*YWKC9?;m?;>`>g=E_FGAD?KgMn zw~7?6{Z@}$`>nT;$L_Z!_8-=3EX8ZT%^=?<&rd8N*W>jjay?$}CqLI7^_~~k_1Hw( z^N`|SwncxvCfEHjQIXI4Q^)PB;+p>v!dyyGyymY?uKAl$o)(fPFqZg!+{f;56ltG%ot@#%TX^ zNQgWc-SDoi`(0Ua`&*f^I8rsFc)i}@AB%^O>-CdifJ=~dS7w} z`Qc&6v){-ghbTuL&1o;MhePm(OhaAl(y~iCzHr6Y4mb0ZG+cAq(ypFfQaPzv~KEuuHho=oUuR}%~ZeEX#G2B)D z?s&uMar9?y!{s)+!}A#KYQ*O?TyFC_;`15qVZ@u)^X2xoBffwU-`R*a?^EFKYM(!_E6a`wTblE1fpHxREE?aPxjvjNv7Xc-^?|ug&{$sb!)gTiNW&}IAg?KgoA;v^ z7;fJ0-e9 zd}v^}w!tR3T;*ctF~^5aM!Y#b1Q>3P4`GIz$Ir5w1!x$sp93Q3{ZjKLfncI=a zRbuS7W%75s{ki4u_M16gY2VuZll_PFvg6J1;jlyD=J;^RaC3aPWVks#+%nu8A08WS zjt{R4H^+xKb_1=K{br62$qYBg2f3Z@h}Uh#E-#mR9d3>fGR<_jIX;vz@|)vBRm08k zp|0WP_|VjFb9`uTxH&%f8*YvdA%>gdLx00{U$B>VgyH7+Fwt;xe3)stIX)~j+#DZP 
z8*YvdKN@b14|@$a#|Pae?RkZ#^?ygZ@zzVXCG{cTPuqa{YH?k6>U+ht&($A@YhS6y zllJK{s>|&^mugZ@T~4oEYLd(CESEau+P@9RH%h%t$oGghBflZuiac#xKs)k!;$6x6 zh|A@57hSHA;=vR@R$Rw7jh`$YLGiQ2b==eV72?Aw{zq{g4>f+5cofAS71wc6<1dKM zp!hrDbIJb}UqBu|9$+bXX7N?zxy9F$yNPchFDJf(yqfqP@99xCoZK1|$`e4Ka#@|ogZz@v7(=f2JyY9>jdtpN*plRWpWT~W@B8;6*XIzX zkn8gv8_4x}nA7C?yiE+bK6jK_=3n;L_W%0)RY`Jve$1QPLkJbuipK9t|?wJRZ$6+&mubGTb~KT`}A|9=$W%JRW6Fj7Hn-H;+dihMUKu&W4-E zqe#QeEF2%cqg}FGOxKF#_enH-SKYeP_JiJtCpPIg5{ytp-!^8Z; z>o*E2<>^_gv}b>p;-B(Z#hNs99$BiCPh+31AwGT~eS*V1t;C@rsMAr{{;QqF(&9Zs z{Cs;-(&pji|E;!sZ1sK~Q|{lYbUE$zS*iN^`-O*u)(>p%TR+g#vtqA4VF5l}eEq)h z^a}iJfy?+b_YDpV4;=WwW%|FyX*w4Ry3m?z2;r)C0dwMmu{=uBjJMfDJmMZNZ)T^hj zpVjqNBkMP?+U(gZuy!x67FN+uRaWR0657uFGB zXItVI9a+|~lI?BJY>Sb2_mfcux>H7a}Aa8N@Zquxx)oyBuwd>Hf zY3*~3ef$Fa{l4+(=^yMFdde7QDKHFAf-Be=R(44L4FUBXw zUiGQYuNm3??Q?5Tp@NRKe%kwdeC_d~q;)K`kCxV8V69;5$Y|}2Exs7Rj6L$pwwitV z>pHlS$1FfMj zyrO@wZRW|EsBqCXsEBwx4Qaw>C@dhRCWuoW^aza`*!IP z>aX!ap}xW0{jEuzW3PvW26yin7#99H_6!X6*F4t2UjJ&dkK?mnNKlY}ur*T*4hi=! 
zUbjihVqxLdY(zd=2k7FxLPL64hiz@2e}GT7P~RXcYxmHQKD{uz{{R1TxOJT}jgvGU zVlr)ZiLyS|xBi#Y#ZT|Cm9OWde4Vc-?=9u!a-*aE#$QpsgOl=gzoLAAlk)AqqI|fM z@^+j4*Zzx?@@D^k^A+XCIw{}tE6Pui@@D@BeMR|sPU;W-ite$ge2h*-8DwzM{O|UpCu6;w#Gca?*bzzoNX}e>RtY z)K`?3$K^l$F0Lr6?C7s3|DBWekNt}B3!L=-n6D_m!b$l_Ur}D~qnQ0S?kmdga!6Mfvki%G>Mke;vP~o%G+tuPA@dN%?7CQC{y$nf?FmSCrT10?hK$os@UA z%xFzu{FdyZ-I-cdlP^##Wx=`Llhk-?oIZyjq*qZ{Cx#zvJ@N#khPm zX}@_-$SzOSxCF#fzJrv{YZo$JT;9+6&e8w$#jK|25$$KY|Lyx1-2byIvzLjAyF^O; z$>si#728t*xO8p3g$-W!^inKl5^Kx6E1p9czJ#OWK93mtDV|i|dcJ%vt?0QvZLg|39qq&g%b1 z>euD8>$mUgaQ%s`%AM7pGQSmUz07+}_Geyx`#D}`^`~z6gy%5(iCW2wKE)Stnw z)Ogu#HDC7qecO2bwUP2^&0y>AT;6^j%USszQho)M=jFGb^K!QQ95owfh8ll52AD#-2c9!vdQq<(L+ zGVAYF_-C&!vDEL<>dWn?wpEhrpBPL1o2337w7^`y{hYP4{^a@}$5Q{S)>!_|Qqr!!p7lR3e-)`ZEcMrOQh!sc zAlKjBTKUeFf2-90U&p^LR(WUJPlVKO9)FrUsegVf^*@&S&1V|y<>uv|6-)h7oYdda zN&RPIsXw1QRKnvgum6k*ocCY(|FL|FSjyLv^5(M`_VRQ6_VdQh`hSI!YasBpl$Ij~C=Vbe9Z{^|b?^G=HKa%?S_|NrcNa(!(5HLO-ha6MhOyKiAobUi#z$H20Z!^45=;F5AO8x&ee#OL6{-u=X?U&o1DT(v-pI9pXuk~;Ly9&{mIH~`rRX(NV*jmvx`_Cy-zqVJu z(a-vRKe*6B3meb|`U6!AUyUddMAL)%03E=g2E|&W5N&WkxQNrH-2RW(# zb}aRe?0|BuQ+(bsuK!gm<^KpoWHhZXE^q%llwVkrv|e`mhg$!0 z|0PTM?`qUP%*x00yT(%g$Y9juk=oIWPk(Uz z*wh9P3qV4B(3O>{2t7^`h$1!h1Rd{6=EsBIt%uQ8PW{RtL5~) zZY7e zmnl---P%6Q7ncvS$~)VBT$-Ze_F3~Y`(>}c@z(!ZUiW`{dAM$iT#gk&|1GcwC+juQ t`oF&QKbN +#include +#include "libhsakmt.h" + +HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode) { + CHECK_DXG_OPEN(); + // Used for profiling tools + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer( + HSAuint32 PreferredNode, HSAuint32 SizeInBytes, HSAuint32 *timeout, + HSAuint32 *SizeCopied, void *DestMemoryAddress, bool *isSPMDataLoss) { + CHECK_DXG_OPEN(); + // Used for profiling tools + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode) { + CHECK_DXG_OPEN(); + // Used for profiling tools + return HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git a/svm.cpp b/svm.cpp new file 
mode 100644 index 0000000000..9a45f89de9 --- /dev/null +++ b/svm.cpp @@ -0,0 +1,52 @@ +/* + * Copyright © 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#include "libhsakmt.h" + +/* Helper functions for calling KFD SVM ioctl */ + +HSAKMT_STATUS HSAKMTAPI hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, + unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, + unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetXNACKMode(HSAint32 enable) { + CHECK_DXG_OPEN(); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetXNACKMode(HSAint32 *enable) { + CHECK_DXG_OPEN(); + *enable = false; + return HSAKMT_STATUS_SUCCESS; +} diff --git a/time.cpp b/time.cpp new file mode 100644 index 0000000000..477d04d0ad --- /dev/null +++ b/time.cpp @@ -0,0 +1,52 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include +#include +#include "libhsakmt.h" +#include "inc/wddm/device.h" + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, + HsaClockCounters *Counters) { + HSAKMT_STATUS result = HSAKMT_STATUS_SUCCESS; + + CHECK_DXG_OPEN(); + + std::memset(Counters, 0, sizeof(*Counters)); + + rocr::core::WDDMDevice *device_ = get_wddmdev(NodeId); + assert(device_); + device_->GetClockCounters(&Counters->GPUClockCounter, nullptr); + + struct timespec ts; + if (clock_gettime(CLOCK_REALTIME, &ts) == 0) + Counters->CPUClockCounter = ts.tv_sec * 1e9 + ts.tv_nsec; + if (clock_gettime(CLOCK_BOOTTIME, &ts) == 0) + Counters->SystemClockCounter = ts.tv_sec * 1e9 + ts.tv_nsec; + Counters->SystemClockFrequencyHz = 1000000000; + + return result; +} diff --git a/topology.cpp b/topology.cpp new file mode 100644 index 0000000000..bfa5a22c01 --- /dev/null +++ b/topology.cpp @@ -0,0 +1,1698 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * Copyright 2016-2018 Raptor Engineering, LLC. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "libhsakmt.h" +#include "inc/wddm/types.h" +#include "inc/wddm/device.h" + +/* Number of memory banks added by thunk on top of topology + * This only includes static heaps like LDS, scratch and SVM, + * not for MMIO_REMAP heap. MMIO_REMAP memory bank is reported + * dynamically based on whether mmio aperture was mapped + * successfully on this node. 
+ */ +#define NUM_OF_IGPU_HEAPS 3 +#define NUM_OF_DGPU_HEAPS 3 + +typedef struct { + HsaNodeProperties node; + HsaMemoryProperties *mem; /* node->NumBanks elements */ + HsaCacheProperties *cache; + HsaIoLinkProperties *link; +} node_props_t; + +static HsaSystemProperties *g_system; +static node_props_t *g_props; + +static std::vector wdevices_; +static uint32_t wdevice_num_; +static uint32_t num_sysfs_nodes; + +static int processor_vendor = -1; +/* Supported System Vendors */ +enum SUPPORTED_PROCESSOR_VENDORS { + GENUINE_INTEL = 0, + AUTHENTIC_AMD, + IBM_POWER +}; +/* Adding newline to make the search easier */ +static const char *supported_processor_vendor_name[] = { + "GenuineIntel\n", + "AuthenticAMD\n", + "\n" // POWER requires a different search method +}; + +static HSAKMT_STATUS topology_take_snapshot(void); +static void topology_drop_snapshot(void); + +/* information from /proc/cpuinfo */ +struct proc_cpuinfo { + uint32_t proc_num; /* processor */ + uint32_t apicid; /* apicid */ + char model_name[HSA_PUBLIC_NAME_SIZE]; /* model name */ +}; + +/* CPU cache table for all CPUs on the system. Each entry has the relative CPU + * info and caches connected to that CPU. + */ +typedef struct cpu_cacheinfo { + uint32_t len; /* length of the table = number of online procs */ + int32_t proc_num; /* this cpu's processor number */ + uint32_t num_caches; /* number of caches reported by this cpu */ + HsaCacheProperties *cache_prop; /* a list of cache properties */ +} cpu_cacheinfo_t; + +static void free_properties(node_props_t *props, int size) { + if (props) { + int i; + for (i = 0; i < size; i++) { + free(props[i].mem); + free(props[i].cache); + free(props[i].link); + } + + free(props); + } +} + +/* num_subdirs - find the number of sub-directories in the specified path + * @dirpath - directory path to find sub-directories underneath + * @prefix - only count sub-directory names starting with prefix. + * Use blank string, "", to count all. 
+ * Return - number of sub-directories + */ +static int num_subdirs(char *dirpath, char *prefix) { + int count = 0; + DIR *dirp; + struct dirent *dir; + int prefix_len = strlen(prefix); + + dirp = opendir(dirpath); + if (dirp) { + while ((dir = readdir(dirp)) != 0) { + if ((strcmp(dir->d_name, ".") == 0) || (strcmp(dir->d_name, "..") == 0)) + continue; + if (prefix_len && strncmp(dir->d_name, prefix, prefix_len)) + continue; + count++; + } + closedir(dirp); + } + + return count; +} + +/* fscanf_dec - read a file whose content is a decimal number + * @file [IN ] file to read + * @num [OUT] number in the file + */ +static HSAKMT_STATUS fscanf_dec(char *file, uint32_t *num) { + FILE *fd; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + fd = fopen(file, "r"); + if (!fd) { + pr_err("Failed to open %s\n", file); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + if (fscanf(fd, "%u", num) != 1) { + pr_err("Failed to parse %s as a decimal.\n", file); + ret = HSAKMT_STATUS_ERROR; + } + + fclose(fd); + return ret; +} + +/* fscanf_str - read a file whose content is a string + * @file [IN ] file to read + * @str [OUT] string in the file + */ +static HSAKMT_STATUS fscanf_str(char *file, char *str) { + FILE *fd; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + fd = fopen(file, "r"); + if (!fd) { + pr_err("Failed to open %s\n", file); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + if (fscanf(fd, "%s", str) != 1) { + pr_err("Failed to parse %s as a string.\n", file); + ret = HSAKMT_STATUS_ERROR; + } + + fclose(fd); + return ret; +} + +/* fscanf_size - read a file whose content represents size as a string + * @file [IN ] file to read + * @bytes [OUT] sizes in bytes + */ +static HSAKMT_STATUS fscanf_size(char *file, uint32_t *bytes) { + FILE *fd; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + char unit; + int n; + + fd = fopen(file, "r"); + if (!fd) { + pr_err("Failed to open %s\n", file); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + + n = fscanf(fd, "%u%c", bytes, &unit); + if (n 
< 1) { + pr_err("Failed to parse %s\n", file); + ret = HSAKMT_STATUS_ERROR; + } + + if (n == 2) { + switch (unit) { + case 'K': + *bytes <<= 10; + break; + case 'M': + *bytes <<= 20; + break; + case 'G': + *bytes <<= 30; + break; + default: + ret = HSAKMT_STATUS_ERROR; + break; + } + } + + fclose(fd); + return ret; +} + +/* cpumap_to_cpu_ci - translate shared_cpu_map string + cpuinfo->apicid into + * SiblingMap in cache + * @shared_cpu_map [IN ] shared_cpu_map string + * @cpuinfo [IN ] cpuinfo to get apicid + * @this_cache [OUT] CPU cache to fill in SiblingMap + */ +static void cpumap_to_cpu_ci(char *shared_cpu_map, struct proc_cpuinfo *cpuinfo, + HsaCacheProperties *this_cache) { + int num_hexs, bit; + uint32_t proc, apicid, mask; + char *ch_ptr; + + /* shared_cpu_map is shown as ...X3,X2,X1 Each X is a hex without 0x + * and it's up to 8 characters(32 bits). For the first 32 CPUs(actually + * procs), it's presented in X1. The next 32 is in X2, and so on. + */ + num_hexs = (strlen(shared_cpu_map) + 8) / 9; /* 8 characters + "," */ + ch_ptr = strtok(shared_cpu_map, ","); + while (num_hexs-- > 0) { + mask = strtol(ch_ptr, NULL, 16); /* each X */ + for (bit = 0; bit < 32; bit++) { + if (!((1 << bit) & mask)) + continue; + proc = num_hexs * 32 + bit; + apicid = cpuinfo[proc].apicid; + if (apicid >= HSA_CPU_SIBLINGS) { + pr_warn("SiblingMap buffer %d is too small\n", HSA_CPU_SIBLINGS); + continue; + } + this_cache->SiblingMap[apicid] = 1; + } + ch_ptr = strtok(NULL, ","); + } +} + +/* get_cpu_cache_info - get specified CPU's cache information from sysfs + * @prefix [IN] sysfs path for target cpu cache, + * /sys/devices/system/node/nodeX/cpuY/cache + * @cpuinfo [IN] /proc/cpuinfo data to get apicid + * @cpu_ci: CPU specified. This parameter is an input and also an output. 
+ * [IN] cpu_ci->num_caches: number of index dirs + * [OUT] cpu_ci->cache_info: to store cache info collected + * [OUT] cpu_ci->num_caches: reduces when shared with other cpu(s) + * Return: number of cache reported from this cpu + */ +static int get_cpu_cache_info(const char *prefix, struct proc_cpuinfo *cpuinfo, + cpu_cacheinfo_t *cpu_ci) { + int idx, num_idx, n; + HsaCacheProperties *this_cache; + char path[256], str[256]; + bool is_power9 = false; + + if (processor_vendor == IBM_POWER) { + if (strcmp(cpuinfo[0].model_name, "POWER9") == 0) { + is_power9 = true; + } + } + + this_cache = cpu_ci->cache_prop; + num_idx = cpu_ci->num_caches; + for (idx = 0; idx < num_idx; idx++) { + /* If this cache is shared by multiple CPUs, we only need + * to list it in the first CPU. + */ + if (is_power9) { + // POWER9 has SMT4 + if (cpu_ci->proc_num & 0x3) { + /* proc is not 0,4,8,etc. Skip and reduce the cache count. */ + --cpu_ci->num_caches; + continue; + } + } else { + snprintf(path, 256, "%s/index%d/shared_cpu_list", prefix, idx); + /* shared_cpu_list is shown as n1,n2... or n1-n2,n3-n4... + * For both cases, this cache is listed to proc n1 only. + */ + fscanf_dec(path, (uint32_t *)&n); + if (cpu_ci->proc_num != n) { + /* proc is not n1. Skip and reduce the cache count. 
*/ + --cpu_ci->num_caches; + continue; + } + this_cache->ProcessorIdLow = cpuinfo[cpu_ci->proc_num].apicid; + } + + /* CacheLevel */ + snprintf(path, 256, "%s/index%d/level", prefix, idx); + fscanf_dec(path, &this_cache->CacheLevel); + /* CacheType */ + snprintf(path, 256, "%s/index%d/type", prefix, idx); + + memset(str, 0, sizeof(str)); + fscanf_str(path, str); + if (!strcmp(str, "Data")) + this_cache->CacheType.ui32.Data = 1; + if (!strcmp(str, "Instruction")) + this_cache->CacheType.ui32.Instruction = 1; + if (!strcmp(str, "Unified")) { + this_cache->CacheType.ui32.Data = 1; + this_cache->CacheType.ui32.Instruction = 1; + } + this_cache->CacheType.ui32.CPU = 1; + /* CacheSize */ + snprintf(path, 256, "%s/index%d/size", prefix, idx); + fscanf_size(path, &this_cache->CacheSize); + /* CacheLineSize */ + snprintf(path, 256, "%s/index%d/coherency_line_size", prefix, idx); + fscanf_dec(path, &this_cache->CacheLineSize); + /* CacheAssociativity */ + snprintf(path, 256, "%s/index%d/ways_of_associativity", prefix, idx); + fscanf_dec(path, &this_cache->CacheAssociativity); + /* CacheLinesPerTag */ + snprintf(path, 256, "%s/index%d/physical_line_partition", prefix, idx); + fscanf_dec(path, &this_cache->CacheLinesPerTag); + /* CacheSiblings */ + snprintf(path, 256, "%s/index%d/shared_cpu_map", prefix, idx); + fscanf_str(path, str); + cpumap_to_cpu_ci(str, cpuinfo, this_cache); + + ++this_cache; + } + + return cpu_ci->num_caches; +} + +static HSAKMT_STATUS topology_map_node_id(uint32_t node_id, + rocr::core::WDDMDevice *&device) { + uint32_t idx = node_id; + if ((!wdevices_.size()) || (!node_id) || (node_id >= num_sysfs_nodes)) + return HSAKMT_STATUS_NOT_SUPPORTED; + + device = wdevices_[node_id - 1]; + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties *props) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + bool is_node_supported = true; + uint32_t num_supported_nodes = 0; + + assert(props); + std::memset(props, 0, 
sizeof(*props)); + + D3DKMT_ADAPTERINFO *adapters; + int num_adapters; + if (rocr::core::WDDMGetAdapters(adapters, num_adapters) != STATUS_SUCCESS) { + pr_err("Failed to get adapters\n"); + ret = HSAKMT_STATUS_ERROR; + goto err; + } + + num_sysfs_nodes = num_adapters + 1; + + for (auto device : wdevices_) + delete device; + wdevices_.clear(); + + for (uint32_t i = 0; i < num_adapters; i++) { + rocr::core::WDDMDevice *device = new rocr::core::WDDMDevice( + adapters[i].hAdapter, adapters[i].AdapterLuid); + assert(device && "Create WDDM Device fail"); + wdevices_.push_back(device); + } + props->NumNodes = num_sysfs_nodes; + + delete[] adapters; + return ret; +err: + return ret; +} + +void topology_setup_is_dgpu_param(HsaNodeProperties *props) { + /* if we found a dGPU node, then treat the whole system as dGPU */ + if (!props->NumCPUCores && props->NumFComputeCores) + is_dgpu = true; +} + +static HSAKMT_STATUS topology_get_cpu_model_name(HsaNodeProperties *props, + struct proc_cpuinfo *cpuinfo, + int num_procs) { + int i, j; + + if (!props) { + pr_err("Invalid props to get cpu model name\n"); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + + for (i = 0; i < num_procs; i++, cpuinfo++) { + if (props->CComputeIdLo == cpuinfo->apicid) { + if (!props->DeviceId) /* CPU-only node */ + strncpy((char *)props->AMDName, cpuinfo->model_name, + sizeof(props->AMDName)); + /* Convert from UTF8 to UTF16 */ + for (j = 0; + cpuinfo->model_name[j] != '\0' && j < HSA_PUBLIC_NAME_SIZE - 1; j++) + props->MarketingName[j] = cpuinfo->model_name[j]; + props->MarketingName[j] = '\0'; + return HSAKMT_STATUS_SUCCESS; + } + } + + return HSAKMT_STATUS_ERROR; +} + +static int topology_search_processor_vendor(const char *processor_name) { + unsigned int i; + + for (i = 0; i < ARRAY_LEN(supported_processor_vendor_name); i++) { + if (!strcmp(processor_name, supported_processor_vendor_name[i])) + return i; + if (!strcmp(processor_name, "POWER9, altivec supported\n")) + return IBM_POWER; + } + return -1; 
+} + +/* topology_parse_cpuinfo - Parse /proc/cpuinfo and fill up required + * topology information + * cpuinfo [OUT]: output buffer to hold cpu information + * num_procs: number of processors the output buffer can hold + */ +static HSAKMT_STATUS topology_parse_cpuinfo(struct proc_cpuinfo *cpuinfo, + uint32_t num_procs) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + FILE *fd; + char read_buf[256]; + char *p; + uint32_t proc = 0; + size_t p_len; + const char *proc_cpuinfo_path = "/proc/cpuinfo"; + + if (!cpuinfo) { + pr_err("CPU information will be missing\n"); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + + fd = fopen(proc_cpuinfo_path, "r"); + if (!fd) { + pr_err("Failed to open [%s]. Unable to get CPU information", + proc_cpuinfo_path); + return HSAKMT_STATUS_ERROR; + } + +#ifdef __PPC64__ + char *p2; + + /* Each line in /proc/cpuinfo that read_buf is constructed, the format + * is like this: + * "token : value\n" + * where token is our target like vendor_id, model name, apicid ... 
+ * and value is the answer + */ + while (fgets(read_buf, sizeof(read_buf), fd)) { + /* processor number */ + if (!strncmp("processor ", read_buf, sizeof("processor ") - 1)) { + p = strchr(read_buf, ':'); + p += 2; /* remove ": " */ + proc = atoi(p); + if (proc >= num_procs) { + pr_warn("cpuinfo contains processor %d larger than %u\n", proc, + num_procs); + ret = HSAKMT_STATUS_NO_MEMORY; + goto exit; + } + continue; + } + + /* vendor name / model name */ + if (!strncmp("cpu ", read_buf, sizeof("cpu ") - 1) && + (processor_vendor == -1)) { + p = strchr(read_buf, ':'); + p += 2; /* remove ": " */ + processor_vendor = topology_search_processor_vendor(p); + + p2 = strchr(p, ','); + if (p2 != NULL) { + p2++; + *p2 = 0; + } + if (strlen(p) < HSA_PUBLIC_NAME_SIZE) { + /* -1 to remove \n from p */ + strncpy(cpuinfo[proc].model_name, p, strlen(p) - 1); + cpuinfo[proc].model_name[strlen(p) - 1] = '\0'; + } else + strncpy(cpuinfo[proc].model_name, p, HSA_PUBLIC_NAME_SIZE); + continue; + } + } +#else + /* Each line in /proc/cpuinfo that read_buf is constructed, the format + * is like this: + * "token : value\n" + * where token is our target like vendor_id, model name, apicid ... 
+ * and value is the answer + */ + while (fgets(read_buf, sizeof(read_buf), fd)) { + /* processor number */ + if (!strncmp("processor", read_buf, sizeof("processor") - 1)) { + p = strchr(read_buf, ':'); + p += 2; /* remove ": " */ + proc = atoi(p); + if (proc >= num_procs) { + pr_warn("cpuinfo contains processor %d larger than %u\n", proc, + num_procs); + ret = HSAKMT_STATUS_NO_MEMORY; + goto exit; + } + continue; + } + + /* vendor name */ + if (!strncmp("vendor_id", read_buf, sizeof("vendor_id") - 1) && + (processor_vendor == -1)) { + p = strchr(read_buf, ':'); + p += 2; /* remove ": " */ + processor_vendor = topology_search_processor_vendor(p); + continue; + } + + /* model name */ + if (!strncmp("model name", read_buf, sizeof("model name") - 1)) { + p = strchr(read_buf, ':'); + p += 2; /* remove ": " */ + p_len = strlen(p); + if (p_len > HSA_PUBLIC_NAME_SIZE) + p_len = HSA_PUBLIC_NAME_SIZE; + memcpy(cpuinfo[proc].model_name, p, p_len); + cpuinfo[proc].model_name[p_len - 1] = '\0'; + continue; + } + + /* apicid */ + if (!strncmp("apicid", read_buf, sizeof("apicid") - 1)) { + p = strchr(read_buf, ':'); + p += 2; /* remove ": " */ + cpuinfo[proc].apicid = atoi(p); + } + } +#endif + + if (processor_vendor < 0) { + pr_err("Failed to get Processor Vendor. 
Setting to %s", + supported_processor_vendor_name[GENUINE_INTEL]); + processor_vendor = GENUINE_INTEL; + } + +exit: + fclose(fd); + return ret; +} + +static HSAKMT_STATUS topology_get_cpu_maxfreq(uint32_t *max_freq) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + std::ifstream cpuinfo_max_freq( + "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"); + if (!cpuinfo_max_freq) { + std::ifstream cpuinfo("/proc/cpuinfo"); + if (!cpuinfo) { + std::cerr << "Failed to open /proc/cpuinfo\n"; + return HSAKMT_STATUS_ERROR; + } + + std::string line; + double freq_max_ = 0; + while (std::getline(cpuinfo, line)) { + if (line.substr(0, 7) == "cpu MHz") { + double freq = std::stod(line.substr(line.find(':') + 2)); + if (freq > freq_max_) { + freq_max_ = freq; + } + } + } + *max_freq = static_cast(freq_max_); + } else { + std::string line; + std::getline(cpuinfo_max_freq, line); + *max_freq = static_cast(std::stod(line) / 1000); + } + + return ret; +} + +static int log2_int(int x) { + int result = 0; + while (x >>= 1) { + result++; + } + return result; +} + +static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, + HsaNodeProperties *props, + bool *p2p_links, + uint32_t *num_p2pLinks) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + assert(props); + memset(props, 0, sizeof(*props)); + if (p2p_links) + *p2p_links = false; + if (num_p2pLinks) + *num_p2pLinks = 0; + + topology_get_cpu_maxfreq(&props->MaxEngineClockMhzCCompute); + + if (node_id == 0) { + /* CPU node */ + props->NumCPUCores = sysconf(_SC_NPROCESSORS_ONLN); + props->NumMemoryBanks = 1; + props->KFDGpuID = 0; + return HSAKMT_STATUS_SUCCESS; + } + + /* gpu node */ + rocr::core::WDDMDevice *device; + ret = topology_map_node_id(node_id, device); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + + props->NumCPUCores = 0; + props->NumFComputeCores = device->SimdPerCu() * device->ComputeUnitCount(); + props->NumMemoryBanks = 1; + props->NumCaches = 3; + props->NumIOLinks = 1; + props->CComputeIdLo = 0; + 
props->FComputeIdLo = 0; + props->Capability.ui32.ASICRevision = device->AsicRevision(); + props->Capability.ui32.WatchPointsTotalBits = + log2_int(device->WatchPointsNum()); + props->MaxWavesPerSIMD = device->WavePerCu() / device->SimdPerCu(); + props->LDSSizeInKB = device->LdsSize() / 1024; + props->GDSSizeInKB = 0; + props->WaveFrontSize = device->WavefrontSize(); + props->NumShaderBanks = device->NumShaderEngine(); + props->NumArrays = device->ShaderArrayPerShaderEngine(); + props->NumCUPerArray = device->ComputeUnitCount() / props->NumArrays; + props->NumSIMDPerCU = device->SimdPerCu(); + props->MaxSlotsScratchCU = device->MaxScratchSlotsPerCu(); + props->VendorId = 0x1002; + props->DeviceId = device->DeviceId(); + props->LocationId = device->PciBusAddr(); + props->LocalMemSize = 0; + props->MaxEngineClockMhzFCompute = device->MaxEngineClockMhz(); + props->DrmRenderMinor = node_id; + + { + int i; + const char *name = device->ProductName(); + for (i = 0; name[i] != 0 && i < HSA_PUBLIC_NAME_SIZE - 1; i++) + props->MarketingName[i] = name[i]; + props->MarketingName[i] = '\0'; + } + props->uCodeEngineVersions.uCodeSDMA = device->GetSdmaFwVersion(); + props->DebugProperties.Value = 0; + props->HiveID = 0; + props->NumSdmaEngines = device->NumSdmaEngine(); + props->NumSdmaXgmiEngines = 0; + props->NumSdmaQueuesPerEngine = 6; // TODO + props->NumCpQueues = device->GetNumCpQueues(); + props->NumGws = 0; + props->Integrated = !(device->IsDgpu()); + props->Domain = device->Domain(); + props->UniqueID = atol(device->Uuid()); // TODO + props->NumXcc = 1; + props->KFDGpuID = device->DeviceId(); // TODO + props->FamilyID = device->GfxFamily(); + + props->EngineId.ui32.uCode = device->GetMecFwVersion(); + char *envvar = getenv("HSA_OVERRIDE_GFX_VERSION"); + if (envvar) { + char dummy = '\0'; + uint32_t major = 0, minor = 0, step = 0; + /* HSA_OVERRIDE_GFX_VERSION=major.minor.stepping */ + if ((sscanf(envvar, "%u.%u.%u%c", &major, &minor, &step, &dummy) != 3) || + (major > 63 
|| minor > 255 || step > 255)) { + pr_err("HSA_OVERRIDE_GFX_VERSION %s is invalid\n", envvar); + return HSAKMT_STATUS_ERROR; + } + props->EngineId.ui32.Major = major & 0x3f; + props->EngineId.ui32.Minor = minor & 0xff; + props->EngineId.ui32.Stepping = step & 0xff; + } else { + props->EngineId.ui32.Major = device->Major(); + props->EngineId.ui32.Minor = device->Minor(); + props->EngineId.ui32.Stepping = device->Stepping(); + } + + snprintf((char *)props->AMDName, sizeof(props->AMDName) - 1, "GFX%06x", + HSA_GET_GFX_VERSION_FULL(props->EngineId.ui32)); + + if (!is_svm_api_supported) + props->Capability.ui32.SVMAPISupported = 0; + props->Capability.ui32.DoorbellType = 2; + + /* Get VGPR/SGPR size in byte per CU */ + props->SGPRSizePerCU = SGPR_SIZE_PER_CU; + props->VGPRSizePerCU = get_vgpr_size_per_cu(props->EngineId); + + if (props->NumFComputeCores) + assert(props->EngineId.ui32.Major && + "HSA_OVERRIDE_GFX_VERSION may be needed"); + + return ret; +} + +static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id, + uint32_t mem_id, + HsaMemoryProperties *props) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + assert(props); + std::memset(props, 0, sizeof(*props)); + if (node_id == 0) { + /* CPU node */ + props->HeapType = HSA_HEAPTYPE_SYSTEM; + + struct sysinfo info; + sysinfo(&info); + props->SizeInBytes = info.totalram; + + props->Flags.MemoryProperty = 0; + props->Width = 64; + props->MemoryClockMax = 2133; + return HSAKMT_STATUS_SUCCESS; + } + + rocr::core::WDDMDevice *device; + ret = topology_map_node_id(node_id, device); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + + props->HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE; + props->SizeInBytes = device->LocalHeapSize(); + props->Width = device->MemoryBusWidth(); + props->MemoryClockMax = device->MaxMemoryClockMhz(); + + return ret; +} + +/* topology_destroy_temp_cpu_cache_list - + * Free the memory allocated in topology_create_temp_cpu_cache_list(). 
+ */ +static void +topology_destroy_temp_cpu_cache_list(cpu_cacheinfo_t *temp_cpu_ci_list) { + uint32_t n; + cpu_cacheinfo_t *p_temp_cpu_ci_list = temp_cpu_ci_list; + cpu_cacheinfo_t *cpu_ci = p_temp_cpu_ci_list; + + if (p_temp_cpu_ci_list) { + for (n = 0; n < p_temp_cpu_ci_list->len; n++, cpu_ci++) + free(cpu_ci->cache_prop); + free(p_temp_cpu_ci_list); + } + + p_temp_cpu_ci_list = NULL; +} + +/* topology_create_temp_cpu_cache_list - Create a temporary cpu-cache list to + * store cpu cache information. This list will be used to copy + * HsaCacheProperties in the CPU node. Two buffers are allocated + * inside this function: cpu_ci list and cache_prop under each + * cpu_ci. Must call topology_destroy_temp_cpu_cache_list to free + * the memory after the information is copied. + * @node [IN] CPU node number + * @cpuinfo [IN] /proc/cpuinfo data + * @temp_cpu_ci_list [OUT] cpu-cache-info list with data filled + * Return: total number of caches under this CPU node + */ +static int +topology_create_temp_cpu_cache_list(int node, struct proc_cpuinfo *cpuinfo, + cpu_cacheinfo_t **temp_cpu_ci_list) { + /* Get max path size from /sys/devices/system/node/node%d/%s/cache + * below, which will max out according to the largest filename, + * which can be present twice in the string above. 29 is for the prefix + * and the +6 is for the cache suffix + */ +#ifndef MAXNAMLEN +/* MAXNAMLEN is the BSD name for NAME_MAX. 
glibc aliases this as NAME_MAX, but + * not musl */ +#define MAXNAMLEN NAME_MAX +#endif + constexpr uint32_t MAXPATHSIZE = 29 + MAXNAMLEN + (MAXNAMLEN + 6); + cpu_cacheinfo_t *p_temp_cpu_ci_list; /* a list of cpu_ci */ + char path[MAXPATHSIZE], node_dir[MAXPATHSIZE]; + int max_cpus; + cpu_cacheinfo_t *this_cpu; /* one cpu_ci in cpu_ci_list */ + int cache_cnt = 0; + DIR *dirp = NULL; + struct dirent *dir; + char *p; + + if (!temp_cpu_ci_list) { + pr_err("Invalid temp_cpu_ci_list\n"); + return cache_cnt; + } + *temp_cpu_ci_list = NULL; + + /* Get info from /sys/devices/system/node/nodeX/cpuY/cache */ + int node_real = node; + if (processor_vendor == IBM_POWER) { + if (!strcmp(cpuinfo[0].model_name, "POWER9")) { + node_real = node * 8; + } + } + snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/node/node%d", node_real); + /* Other than cpuY folders, this dir also has cpulist and cpumap */ + max_cpus = num_subdirs(node_dir, "cpu"); + if (max_cpus <= 0) { + /* If CONFIG_NUMA is not enabled in the kernel, + * /sys/devices/system/node doesn't exist. 
+ */ + if (node) { /* CPU node must be 0 or something is wrong */ + pr_err("Fail to get cpu* dirs under %s.", node_dir); + goto exit; + } + /* Fall back to use /sys/devices/system/cpu */ + snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/cpu"); + max_cpus = num_subdirs(node_dir, "cpu"); + if (max_cpus <= 0) { + pr_err("Fail to get cpu* dirs under %s\n", node_dir); + goto exit; + } + } + + p_temp_cpu_ci_list = + (cpu_cacheinfo_t *)calloc(max_cpus, sizeof(cpu_cacheinfo_t)); + if (!p_temp_cpu_ci_list) { + pr_err("Fail to allocate p_temp_cpu_ci_list\n"); + goto exit; + } + p_temp_cpu_ci_list->len = 0; + + this_cpu = p_temp_cpu_ci_list; + dirp = opendir(node_dir); + while ((dir = readdir(dirp)) != 0) { + if (strncmp(dir->d_name, "cpu", 3)) + continue; + if (!isdigit(dir->d_name[3])) /* ignore files like cpulist */ + continue; + snprintf(path, MAXPATHSIZE, "%s/%s/cache", node_dir, dir->d_name); + this_cpu->num_caches = num_subdirs(path, "index"); + this_cpu->cache_prop = (HsaCacheProperties *)calloc( + this_cpu->num_caches, sizeof(HsaCacheProperties)); + if (!this_cpu->cache_prop) { + pr_err("Fail to allocate cache_info\n"); + goto exit; + } + p = &dir->d_name[3]; + this_cpu->proc_num = atoi(p); + cache_cnt += get_cpu_cache_info(path, cpuinfo, this_cpu); + ++p_temp_cpu_ci_list->len; + ++this_cpu; + } + *temp_cpu_ci_list = p_temp_cpu_ci_list; + +exit: + if (dirp) + closedir(dirp); + return cache_cnt; +} + +/* topology_get_cpu_cache_props - Read CPU cache information from sysfs + * @node [IN] CPU node number + * @cpuinfo [IN] /proc/cpuinfo data + * @tbl [OUT] the node table to fill up + * Return: HSAKMT_STATUS_SUCCESS in success or error number in failure + */ +static HSAKMT_STATUS topology_get_cpu_cache_props(int node, + struct proc_cpuinfo *cpuinfo, + node_props_t *tbl) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + cpu_cacheinfo_t *cpu_ci_list = NULL; + uint32_t n, cache_cnt, i; + cpu_cacheinfo_t *cpu_ci; + HsaCacheProperties *this_cache; + + tbl->node.NumCaches = 
+ topology_create_temp_cpu_cache_list(node, cpuinfo, &cpu_ci_list); + if (!tbl->node.NumCaches) { + /* For "Intel Meteor lake Mobile", the cache info is not in sysfs, + * That means /sys/devices/system/node/node%d/%s/cache does not exist. + * This is not treated as an error here. + */ + pr_debug("CPU cache info is not available for node %d \n", node); + goto exit; + } + + tbl->cache = (HsaCacheProperties *)calloc(tbl->node.NumCaches, + sizeof(HsaCacheProperties)); + if (!tbl->cache) { + ret = HSAKMT_STATUS_NO_MEMORY; + goto exit; + } + + /* Now fill in the information to cache properties. */ + cache_cnt = 0; + cpu_ci = cpu_ci_list; + for (n = 0; n < cpu_ci_list->len; n++, cpu_ci++) { + this_cache = cpu_ci->cache_prop; + for (i = 0; i < cpu_ci->num_caches; i++, this_cache++) { + memcpy(&tbl->cache[cache_cnt++], this_cache, sizeof(HsaCacheProperties)); + if (cache_cnt >= tbl->node.NumCaches) + goto exit; + } + } + +exit: + topology_destroy_temp_cpu_cache_list(cpu_ci_list); + + return ret; +} + +/* For a given Node @node_id the function gets @iolink_id information i.e. parses + * the following sysfs entry + * ./nodes/@node_id/io_links/@iolink_id/properties. @node_id has to be valid + * accessible node. + * + * If node_to specified by the @iolink_id is not accessible the function returns + * HSAKMT_STATUS_NOT_SUPPORTED. If node_to is accessible, then node_to is mapped + * from sysfs_node to user_node and returns HSAKMT_STATUS_SUCCESS. 
+ */ +static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id, + uint32_t iolink_id, + HsaIoLinkProperties *props, + bool p2pLink) { + rocr::core::WDDMDevice *device; + topology_map_node_id(node_id, device); + + std::memset(props, 0, sizeof(*props)); + props->IoLinkType = HSA_IOLINKTYPE_PCIEXPRESS; + props->VersionMajor = props->VersionMinor = 0; + props->NodeFrom = node_id; + props->NodeTo = 0; + props->Weight = 20; + props->Flags.ui32.Override = 1; + props->Flags.ui32.NonCoherent = 1; + props->Flags.ui32.NoAtomics32bit = !(device->SupportPlatformAtomic()); + props->Flags.ui32.NoAtomics64bit = !(device->SupportPlatformAtomic()); + + return HSAKMT_STATUS_SUCCESS; +} + +/* topology_get_free_io_link_slot_for_node - For the given node_id, find the + * next available free slot to add an io_link + */ +static HsaIoLinkProperties * +topology_get_free_io_link_slot_for_node(uint32_t node_id, + const HsaSystemProperties *sys_props, + node_props_t *node_props) { + HsaIoLinkProperties *props; + + if (node_id >= sys_props->NumNodes) { + pr_err("Invalid node [%d]\n", node_id); + return NULL; + } + + props = node_props[node_id].link; + if (!props) { + pr_err("No io_link reported for Node [%d]\n", node_id); + return NULL; + } + + if (node_props[node_id].node.NumIOLinks >= sys_props->NumNodes - 1) { + pr_err("No more space for io_link for Node [%d]\n", node_id); + return NULL; + } + + return &props[node_props[node_id].node.NumIOLinks]; +} + +/* topology_add_io_link_for_node - If a free slot is available, + * add io_link for the given Node. 
+ * TODO: Add other members of HsaIoLinkProperties + */ +static HSAKMT_STATUS topology_add_io_link_for_node( + uint32_t node_from, const HsaSystemProperties *sys_props, + node_props_t *node_props, HSA_IOLINKTYPE IoLinkType, uint32_t node_to, + uint32_t Weight) { + HsaIoLinkProperties *props; + + props = + topology_get_free_io_link_slot_for_node(node_from, sys_props, node_props); + if (!props) + return HSAKMT_STATUS_NO_MEMORY; + + props->IoLinkType = IoLinkType; + props->NodeFrom = node_from; + props->NodeTo = node_to; + props->Weight = Weight; + node_props[node_from].node.NumIOLinks++; + + return HSAKMT_STATUS_SUCCESS; +} + +/* Find the CPU that this GPU (gpu_node) directly connects to */ +static int32_t gpu_get_direct_link_cpu(uint32_t gpu_node, + node_props_t *node_props) { + HsaIoLinkProperties *props = node_props[gpu_node].link; + uint32_t i; + + if (!node_props[gpu_node].node.KFDGpuID || !props || + node_props[gpu_node].node.NumIOLinks == 0) + return -1; + + for (i = 0; i < node_props[gpu_node].node.NumIOLinks; i++) + if (props[i].IoLinkType == HSA_IOLINKTYPE_PCIEXPRESS && + props[i].Weight <= 20) /* >20 is GPU->CPU->GPU */ + return props[i].NodeTo; + + return -1; +} + +/* Get node1->node2 IO link information. This should be a direct link that has + * been created in the kernel. 
+ */ +static HSAKMT_STATUS get_direct_iolink_info(uint32_t node1, uint32_t node2, + node_props_t *node_props, + HSAuint32 *weight, + HSA_IOLINKTYPE *type) { + HsaIoLinkProperties *props = node_props[node1].link; + uint32_t i; + + if (!props) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + for (i = 0; i < node_props[node1].node.NumIOLinks; i++) + if (props[i].NodeTo == node2) { + if (weight) + *weight = props[i].Weight; + if (type) + *type = props[i].IoLinkType; + return HSAKMT_STATUS_SUCCESS; + } + + return HSAKMT_STATUS_INVALID_PARAMETER; +} + +static HSAKMT_STATUS get_indirect_iolink_info(uint32_t node1, uint32_t node2, + node_props_t *node_props, + HSAuint32 *weight, + HSA_IOLINKTYPE *type) { + int32_t dir_cpu1 = -1, dir_cpu2 = -1; + HSAuint32 weight1 = 0, weight2 = 0, weight3 = 0; + HSAKMT_STATUS ret; + uint32_t i; + + *weight = 0; + *type = HSA_IOLINKTYPE_UNDEFINED; + + if (node1 == node2) + return HSAKMT_STATUS_INVALID_PARAMETER; + + /* CPU->CPU is not an indirect link */ + if (!node_props[node1].node.KFDGpuID && !node_props[node2].node.KFDGpuID) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + if (node_props[node1].node.HiveID && node_props[node2].node.HiveID && + node_props[node1].node.HiveID == node_props[node2].node.HiveID) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (node_props[node1].node.KFDGpuID) + dir_cpu1 = gpu_get_direct_link_cpu(node1, node_props); + if (node_props[node2].node.KFDGpuID) + dir_cpu2 = gpu_get_direct_link_cpu(node2, node_props); + + if (dir_cpu1 < 0 && dir_cpu2 < 0) + return HSAKMT_STATUS_ERROR; + + /* if the node2(dst) is GPU , it need to be large bar for host access*/ + if (node_props[node2].node.KFDGpuID) { + for (i = 0; i < node_props[node2].node.NumMemoryBanks; ++i) + if (node_props[node2].mem[i].HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC) + break; + if (i >= node_props[node2].node.NumMemoryBanks) + return HSAKMT_STATUS_ERROR; + } + /* Possible topology: + * GPU --(weight1) -- CPU -- (weight2) -- GPU + * GPU --(weight1) -- CPU 
-- (weight2) -- CPU -- (weight3) -- GPU + * GPU --(weight1) -- CPU -- (weight2) -- CPU + * CPU -- (weight2) -- CPU -- (weight3) -- GPU + */ + if (dir_cpu1 >= 0) { /* GPU->CPU ... */ + if (dir_cpu2 >= 0) { + if (dir_cpu1 == dir_cpu2) /* GPU->CPU->GPU*/ { + ret = + get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = + get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type); + } else /* GPU->CPU->CPU->GPU*/ { + ret = + get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = get_direct_iolink_info(dir_cpu1, dir_cpu2, node_props, &weight2, + type); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + /* On QPI interconnection, GPUs can't access + * each other if they are attached to different + * CPU sockets. CPU<->CPU weight larger than 20 + * means the two CPUs are in different sockets. + */ + if (*type == HSA_IOLINK_TYPE_QPI_1_1 && weight2 > 20) + return HSAKMT_STATUS_NOT_SUPPORTED; + ret = + get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL); + } + } else /* GPU->CPU->CPU */ { + ret = get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type); + } + } else { /* CPU->CPU->GPU */ + ret = get_direct_iolink_info(node1, dir_cpu2, node_props, &weight2, type); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL); + } + + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + + *weight = weight1 + weight2 + weight3; + return HSAKMT_STATUS_SUCCESS; +} + +static void +topology_create_indirect_gpu_links(const HsaSystemProperties *sys_props, + node_props_t *node_props) { + + uint32_t i, j; + HSAuint32 weight; + HSA_IOLINKTYPE type; + + for (i = 0; i < sys_props->NumNodes - 1; i++) { + for (j = i + 1; j < 
sys_props->NumNodes; j++) { + get_indirect_iolink_info(i, j, node_props, &weight, &type); + if (!weight) + goto try_alt_dir; + if (topology_add_io_link_for_node(i, sys_props, node_props, type, j, + weight) != HSAKMT_STATUS_SUCCESS) + pr_err("Fail to add IO link %d->%d\n", i, j); + try_alt_dir: + get_indirect_iolink_info(j, i, node_props, &weight, &type); + if (!weight) + continue; + if (topology_add_io_link_for_node(j, sys_props, node_props, type, i, + weight) != HSAKMT_STATUS_SUCCESS) + pr_err("Fail to add IO link %d->%d\n", j, i); + } + } +} + +HSAKMT_STATUS topology_take_snapshot(void) { + uint32_t i, mem_id, cache_id; + HsaSystemProperties sys_props; + node_props_t *temp_props = 0; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + struct proc_cpuinfo *cpuinfo; + const uint32_t num_procs = sysconf(_SC_NPROCESSORS_ONLN); + uint32_t num_ioLinks; + bool p2p_links = false; + uint32_t num_p2pLinks = 0; + + cpuinfo = (proc_cpuinfo *)calloc(num_procs, sizeof(struct proc_cpuinfo)); + if (!cpuinfo) { + pr_err("Fail to allocate memory for CPU info\n"); + return HSAKMT_STATUS_NO_MEMORY; + } + topology_parse_cpuinfo(cpuinfo, num_procs); + + ret = topology_sysfs_get_system_props(&sys_props); + if (ret != HSAKMT_STATUS_SUCCESS) + goto err; + if (sys_props.NumNodes > 0) { + temp_props = + (node_props_t *)calloc(sys_props.NumNodes * sizeof(node_props_t), 1); + if (!temp_props) { + ret = HSAKMT_STATUS_NO_MEMORY; + goto err; + } + for (i = 0; i < sys_props.NumNodes; i++) { + rocr::core::WDDMDevice *device_; + topology_map_node_id(i, device_); + + ret = topology_sysfs_get_node_props(i, &temp_props[i].node, &p2p_links, + &num_p2pLinks); + if (ret != HSAKMT_STATUS_SUCCESS) { + free_properties(temp_props, i); + goto err; + } + + if (temp_props[i].node.NumCPUCores) + topology_get_cpu_model_name(&temp_props[i].node, cpuinfo, num_procs); + + if (temp_props[i].node.NumMemoryBanks) { + temp_props[i].mem = (HsaMemoryProperties *)calloc( + temp_props[i].node.NumMemoryBanks * 
sizeof(HsaMemoryProperties), 1); + if (!temp_props[i].mem) { + ret = HSAKMT_STATUS_NO_MEMORY; + free_properties(temp_props, i + 1); + goto err; + } + for (mem_id = 0; mem_id < temp_props[i].node.NumMemoryBanks; mem_id++) { + ret = topology_sysfs_get_mem_props(i, mem_id, + &temp_props[i].mem[mem_id]); + if (ret != HSAKMT_STATUS_SUCCESS) { + free_properties(temp_props, i + 1); + goto err; + } + } + } + + if (temp_props[i].node.NumCaches) { + temp_props[i].cache = (HsaCacheProperties *)calloc( + temp_props[i].node.NumCaches * sizeof(HsaCacheProperties), 1); + if (!temp_props[i].cache) { + ret = HSAKMT_STATUS_NO_MEMORY; + free_properties(temp_props, i + 1); + goto err; + } + for (int j = 0; j < 3; j++) { + temp_props[i].cache[j].CacheType.ui32.Data = 1; + temp_props[i].cache[j].CacheType.ui32.HSACU = 1; + temp_props[i].cache[j].CacheLevel = j + 1; + } + temp_props[i].cache[0].CacheSize = device_->GetL1CacheSize() / 1024; + temp_props[i].cache[1].CacheSize = device_->GetL2CacheSize() / 1024; + temp_props[i].cache[2].CacheSize = device_->GetL3CacheSize() / 1024; + } else if (!temp_props[i].node.KFDGpuID) { /* a CPU node */ + ret = topology_get_cpu_cache_props(i, cpuinfo, &temp_props[i]); + if (ret != HSAKMT_STATUS_SUCCESS) { + free_properties(temp_props, i + 1); + goto err; + } + } + + /* To simplify, allocate maximum needed memory for io_links for each node. + * This removes the need for realloc when indirect and QPI links are added + * later + */ + temp_props[i].link = (HsaIoLinkProperties *)calloc( + sys_props.NumNodes - 1, sizeof(HsaIoLinkProperties)); + if (!temp_props[i].link) { + ret = HSAKMT_STATUS_NO_MEMORY; + free_properties(temp_props, i + 1); + goto err; + } + num_ioLinks = temp_props[i].node.NumIOLinks - num_p2pLinks; + uint32_t link_id = 0; + + if (num_ioLinks) { + uint32_t sys_link_id = 0; + + /* Parse all the sysfs specified io links. 
Skip the ones where the + * remote node (node_to) is not accessible + */ + while (sys_link_id < num_ioLinks && link_id < sys_props.NumNodes - 1) { + ret = topology_sysfs_get_iolink_props( + i, sys_link_id++, &temp_props[i].link[link_id], false); + if (ret == HSAKMT_STATUS_NOT_SUPPORTED) { + ret = HSAKMT_STATUS_SUCCESS; + continue; + } else if (ret != HSAKMT_STATUS_SUCCESS) { + free_properties(temp_props, i + 1); + goto err; + } + link_id++; + } + /* sysfs specifies all the io links. Limit the number to valid ones */ + temp_props[i].node.NumIOLinks = link_id; + } + + if (num_p2pLinks) { + uint32_t sys_link_id = 0; + + /* Parse all the sysfs specified p2p links. + */ + while (sys_link_id < num_p2pLinks && link_id < sys_props.NumNodes - 1) { + ret = topology_sysfs_get_iolink_props( + i, sys_link_id++, &temp_props[i].link[link_id], true); + if (ret == HSAKMT_STATUS_NOT_SUPPORTED) { + ret = HSAKMT_STATUS_SUCCESS; + continue; + } else if (ret != HSAKMT_STATUS_SUCCESS) { + free_properties(temp_props, i + 1); + goto err; + } + link_id++; + } + temp_props[i].node.NumIOLinks = link_id; + } + } + } + + if (!p2p_links) { + /* All direct IO links are created in the kernel. Here we need to + * connect GPU<->GPU or GPU<->CPU indirect IO links. + */ + topology_create_indirect_gpu_links(&sys_props, temp_props); + } + + if (!g_system) { + g_system = (HsaSystemProperties *)malloc(sizeof(HsaSystemProperties)); + if (!g_system) { + free_properties(temp_props, sys_props.NumNodes); + ret = HSAKMT_STATUS_NO_MEMORY; + goto err; + } + } + + *g_system = sys_props; + if (g_props) + free(g_props); + g_props = temp_props; +err: + free(cpuinfo); + return ret; +} + +/* Drop the Snashot of the HSA topology information. Assume lock is held. 
*/ +void topology_drop_snapshot(void) { + if (!!g_system != !!g_props) + pr_warn("Probably inconsistency?\n"); + + if (g_props) { + /* Remove state */ + free_properties(g_props, g_system->NumNodes); + g_props = NULL; + } + + free(g_system); + g_system = NULL; + + for (auto device : wdevices_) + delete device; + wdevices_.clear(); +} + +HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id) { + if (!g_props || !g_system || g_system->NumNodes <= nodeid) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + if (gpu_id) + *gpu_id = g_props[nodeid].node.KFDGpuID; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id) { + uint64_t node_idx; + + for (node_idx = 0; node_idx < g_system->NumNodes; node_idx++) { + if (g_props[node_idx].node.KFDGpuID == gpu_id) { + *node_id = node_idx; + return HSAKMT_STATUS_SUCCESS; + } + } + + return HSAKMT_STATUS_INVALID_NODE_UNIT; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties) { + HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; + + CHECK_DXG_OPEN(); + + if (!SystemProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pthread_mutex_lock(&hsakmt_mutex); + + /* We already have a valid snapshot. Avoid double initialization that + * would leak memory. 
+ */ + if (g_system) { + *SystemProperties = *g_system; + goto out; + } + + err = topology_take_snapshot(); + if (err != HSAKMT_STATUS_SUCCESS) + goto out; + + assert(g_system); + + // err = fmm_init_process_apertures(g_system->NumNodes); + // TODO: Determine if it is a dGPU + is_dgpu = true; + if (err != HSAKMT_STATUS_SUCCESS) + goto init_process_apertures_failed; + + // err = init_process_doorbells(g_system->NumNodes); + if (err != HSAKMT_STATUS_SUCCESS) + goto init_doorbells_failed; + + *SystemProperties = *g_system; + + goto out; + +init_doorbells_failed: + // fmm_destroy_process_apertures(); +init_process_apertures_failed: + topology_drop_snapshot(); + +out: + pthread_mutex_unlock(&hsakmt_mutex); + return err; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void) { + pthread_mutex_lock(&hsakmt_mutex); + + topology_drop_snapshot(); + + pthread_mutex_unlock(&hsakmt_mutex); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId, + HsaNodeProperties *NodeProperties) { + if (!g_system || !g_props || NodeId >= g_system->NumNodes) + return HSAKMT_STATUS_ERROR; + + *NodeProperties = g_props[NodeId].node; + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetNodeProperties(HSAuint32 NodeId, HsaNodeProperties *NodeProperties) { + HSAKMT_STATUS err; + uint32_t gpu_id; + + if (!NodeProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + pthread_mutex_lock(&hsakmt_mutex); + + err = validate_nodeid(NodeId, &gpu_id); + if (err != HSAKMT_STATUS_SUCCESS) + goto out; + + err = topology_get_node_props(NodeId, NodeProperties); + if (err != HSAKMT_STATUS_SUCCESS) + goto out; + /* For CPU only node don't add any additional GPU memory banks. 
*/ + if (gpu_id) { + uint64_t base, limit; + if (!(NodeProperties->Integrated)) + NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS; + else + NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS; + // TODO: for apu + /*if (fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, &base, + &limit) == HSAKMT_STATUS_SUCCESS) + NodeProperties->NumMemoryBanks += 1;*/ + } + +out: + pthread_mutex_unlock(&hsakmt_mutex); + return err; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, HSAuint32 NumBanks, + HsaMemoryProperties *MemoryProperties) { + HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; + uint32_t i; + + if (!MemoryProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + pthread_mutex_lock(&hsakmt_mutex); + + memset(MemoryProperties, 0, NumBanks * sizeof(HsaMemoryProperties)); + for (i = 0; i < MIN(g_props[NodeId].node.NumMemoryBanks, NumBanks); i++) { + assert(g_props[NodeId].mem); + MemoryProperties[i] = g_props[NodeId].mem[i]; + } + + /* The following memory banks does not apply to CPU only node */ + rocr::core::WDDMDevice *device_ = get_wddmdev(NodeId); + if (device_ == nullptr) + goto out; + + /*Add LDS*/ + if (i < NumBanks) { + MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS; + MemoryProperties[i].VirtualBaseAddress = device_->SharedApertureBase(); + MemoryProperties[i].SizeInBytes = g_props[NodeId].node.LDSSizeInKB * 1024; + i++; + } + + /* Add SCRATCH */ + if (i < NumBanks) { + MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_SCRATCH; + MemoryProperties[i].VirtualBaseAddress = device_->PrivateApertureBase(); + MemoryProperties[i].SizeInBytes = device_->PrivateApertureSize(); + i++; + } + +out: + pthread_mutex_unlock(&hsakmt_mutex); + return err; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties( + HSAuint32 NodeId, HSAuint32 ProcessorId, HSAuint32 NumCaches, + HsaCacheProperties *CacheProperties) { + HSAKMT_STATUS err; + uint32_t i; + + if (!CacheProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + 
CHECK_DXG_OPEN(); + pthread_mutex_lock(&hsakmt_mutex); + + /* KFD ADD page 18, snapshot protocol violation */ + if (!g_system || NodeId >= g_system->NumNodes) { + err = HSAKMT_STATUS_INVALID_NODE_UNIT; + goto out; + } + + if (NumCaches > g_props[NodeId].node.NumCaches) { + err = HSAKMT_STATUS_INVALID_PARAMETER; + goto out; + } + + for (i = 0; i < MIN(g_props[NodeId].node.NumCaches, NumCaches); i++) { + assert(g_props[NodeId].cache); + CacheProperties[i] = g_props[NodeId].cache[i]; + } + + err = HSAKMT_STATUS_SUCCESS; + +out: + pthread_mutex_unlock(&hsakmt_mutex); + return err; +} + +HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId, HSAuint32 NumIoLinks, + HsaIoLinkProperties *IoLinkProperties) { + if (!g_system || !g_props || NodeId >= g_system->NumNodes) + return HSAKMT_STATUS_ERROR; + + memcpy(IoLinkProperties, g_props[NodeId].link, + NumIoLinks * sizeof(*IoLinkProperties)); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, HSAuint32 NumIoLinks, + HsaIoLinkProperties *IoLinkProperties) { + HSAKMT_STATUS err; + + if (!IoLinkProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + + pthread_mutex_lock(&hsakmt_mutex); + + /* KFD ADD page 18, snapshot protocol violation */ + if (!g_system || NodeId >= g_system->NumNodes) { + err = HSAKMT_STATUS_INVALID_NODE_UNIT; + goto out; + } + + if (NumIoLinks > g_props[NodeId].node.NumIOLinks) { + err = HSAKMT_STATUS_INVALID_PARAMETER; + goto out; + } + + assert(g_props[NodeId].link); + err = topology_get_iolink_props(NodeId, NumIoLinks, IoLinkProperties); + +out: + pthread_mutex_unlock(&hsakmt_mutex); + return err; +} + +uint16_t get_device_id_by_node_id(HSAuint32 node_id) { + if (!g_props || !g_system || g_system->NumNodes <= node_id) + return 0; + + return g_props[node_id].node.DeviceId; +} + +bool prefer_ats(HSAuint32 node_id) { + return g_props[node_id].node.Capability.ui32.HSAMMUPresent && + g_props[node_id].node.NumCPUCores && + 
g_props[node_id].node.NumFComputeCores; +} + +uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id) { + unsigned int i; + + if (!g_props || !g_system) + return 0; + + for (i = 0; i < g_system->NumNodes; i++) { + if (g_props[i].node.KFDGpuID == gpu_id) + return g_props[i].node.DeviceId; + } + + return 0; +} + +uint32_t get_direct_link_cpu(uint32_t gpu_node) { + HSAuint64 size = 0; + int32_t cpu_id; + HSAuint32 i; + + cpu_id = gpu_get_direct_link_cpu(gpu_node, g_props); + if (cpu_id == -1) + return INVALID_NODEID; + + assert(g_props[cpu_id].mem); + + for (i = 0; i < g_props[cpu_id].node.NumMemoryBanks; i++) + size += g_props[cpu_id].mem[i].SizeInBytes; + + return size ? (uint32_t)cpu_id : INVALID_NODEID; +} + +HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array, + uint32_t NumberOfNodes, + uint32_t *NodeArray) { + HSAKMT_STATUS ret; + unsigned int i; + + if (NumberOfNodes == 0 || !NodeArray || !gpu_id_array) + return HSAKMT_STATUS_INVALID_PARAMETER; + + /* Translate Node IDs to gpu_ids */ + *gpu_id_array = (uint32_t *)malloc(NumberOfNodes * sizeof(uint32_t)); + if (!(*gpu_id_array)) + return HSAKMT_STATUS_NO_MEMORY; + for (i = 0; i < NumberOfNodes; i++) { + ret = validate_nodeid(NodeArray[i], *gpu_id_array + i); + if (ret != HSAKMT_STATUS_SUCCESS) { + free(*gpu_id_array); + break; + } + } + + return ret; +} + +uint32_t get_num_sysfs_nodes(void) { return num_sysfs_nodes; } + +rocr::core::WDDMDevice *get_wddmdev(uint32_t node_id) { + if ((!wdevices_.size()) || (!node_id) || (node_id >= num_sysfs_nodes)) + return nullptr; + + return wdevices_[node_id - 1]; +} diff --git a/util/atomic_helpers.h b/util/atomic_helpers.h new file mode 100644 index 0000000000..89cef6a638 --- /dev/null +++ b/util/atomic_helpers.h @@ -0,0 +1,519 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. 
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +/* + Helpers to use native types with C++11 atomic operations. + Fixes GCC builtin functionality for x86 with respect to WC and non-temporal + stores. 
+*/
+#ifndef HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
+#define HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
+
+#include <atomic>
+#include "utils.h"
+
+// ALWAYS_CONSERVATIVE will very likely overfence your code.
+// For use as a debugging aid only.
+#define ALWAYS_CONSERVATIVE 0
+
+#if !ALWAYS_CONSERVATIVE
+#if defined(__x86_64__) || defined(_M_X64)
+#define X64_ORDER_WC 1
+#endif
+#if X64_ORDER_WC
+#include <immintrin.h>
+#endif
+#endif
+
+namespace rocr {
+namespace atomic {
+
+static constexpr int c11ToBuiltInFlags(std::memory_order order)  // Maps a C++11 order to a __ATOMIC_* flag.
+{
+#if ALWAYS_CONSERVATIVE
+  return __ATOMIC_RELAXED;  // Ordering is provided by the explicit Pre/PostFence calls instead.
+#elif X64_ORDER_WC
+  return __ATOMIC_RELAXED;  // Ordering is provided by the explicit Pre/PostFence calls instead.
+#else
+  return (order == std::memory_order_relaxed) ? __ATOMIC_RELAXED :
+         (order == std::memory_order_acquire) ? __ATOMIC_ACQUIRE :
+         (order == std::memory_order_release) ? __ATOMIC_RELEASE :
+         (order == std::memory_order_seq_cst) ? __ATOMIC_SEQ_CST :
+         (order == std::memory_order_consume) ? __ATOMIC_CONSUME :
+         (order == std::memory_order_acq_rel) ? __ATOMIC_ACQ_REL :
+         __ATOMIC_SEQ_CST;
+#endif
+}
+
+static __forceinline void PreFence(std::memory_order order) {  // Fence issued before the atomic builtin.
+#if ALWAYS_CONSERVATIVE
+  switch (order) {
+    case std::memory_order_release:
+    case std::memory_order_seq_cst:
+    case std::memory_order_acq_rel:
+      __atomic_thread_fence(__ATOMIC_SEQ_CST);
+    default:;
+  }
+#elif X64_ORDER_WC
+  switch (order) {
+    case std::memory_order_release:
+    case std::memory_order_seq_cst:
+    case std::memory_order_acq_rel:
+      _mm_sfence();
+    default:;
+  }
+#endif
+}
+
+static __forceinline void PostFence(std::memory_order order) {  // Fence issued after the atomic builtin.
+#if ALWAYS_CONSERVATIVE
+  switch (order) {
+    case std::memory_order_seq_cst:
+    case std::memory_order_acq_rel:
+    case std::memory_order_acquire:
+      __atomic_thread_fence(__ATOMIC_SEQ_CST);
+    default:;
+  }
+#elif X64_ORDER_WC
+  switch (order) {
+    case std::memory_order_seq_cst:
+      return _mm_mfence();
+    case std::memory_order_acq_rel:
+    case std::memory_order_acquire:
+      return _mm_lfence();
+    default:;
+  }
+#endif
+}
+
+static __forceinline void Fence(std::memory_order order=std::memory_order_seq_cst) {  // Stand-alone fence.
+#if ALWAYS_CONSERVATIVE
+  __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#elif X64_ORDER_WC
+  switch (order) {
+    case std::memory_order_seq_cst:
+    case std::memory_order_acq_rel:
+      return _mm_mfence();
+    case std::memory_order_acquire:
+      return _mm_lfence();
+    case std::memory_order_release:
+      return _mm_sfence();
+    default:;
+  }
+#else
+  std::atomic_thread_fence(order);
+#endif
+}
+
+template <class T>
+static __forceinline void BasicCheck(const T* ptr) {  // Compile-time check that sizeof(T) is always lock free.
+  constexpr bool value = __atomic_always_lock_free(sizeof(T), 0);
+  static_assert(value, "Atomic type may not be compatible with peripheral atomics.");
+}
+
+template <class T>
+static __forceinline void BasicCheck(const volatile T* ptr) {  // Overload of the check for volatile pointers.
+  constexpr bool value = __atomic_always_lock_free(sizeof(T), 0);
+  static_assert(value, "Atomic type may not be compatible with peripheral atomics.");
+}
+
+/// @brief: Load value of type T atomically with specified memory order.
+/// @param: ptr(Input), a pointer to type T.
+/// @param: order(Input), memory order with atomic load, relaxed by default.
+/// @return: T, loaded value.
+template <class T>
+static __forceinline T
+    Load(const T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  T ret;
+  PreFence(order);
+  __atomic_load(ptr, &ret, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Function overloading, for more info, see previous one.
+/// @param: ptr(Input), a pointer to volatile type T.
+/// @param: order(Input), memory order with atomic load, relaxed by default.
+/// @return: T, loaded value.
+template <class T>
+static __forceinline T
+    Load(const volatile T* ptr,
+         std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  T ret;
+  PreFence(order);
+  __atomic_load(ptr, &ret, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Store value of type T with specified memory order.
+/// @param: ptr(Input), a pointer to instance which will be stored.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order with atomic store, relaxed by default.
+/// @return: void.
+template <class T>
+static __forceinline void Store(
+    T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  __atomic_store(ptr, &val, c11ToBuiltInFlags(order));
+  PostFence(order);
+}
+
+/// @brief: Function overloading, for more info, see previous one.
+/// @param: ptr(Input), a pointer to volatile instance which will be stored.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order with atomic store, relaxed by default.
+/// @return: void.
+template <class T>
+static __forceinline void Store(
+    volatile T* ptr, T val,
+    std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  __atomic_store(ptr, &val, c11ToBuiltInFlags(order));
+  PostFence(order);
+}
+
+/// @brief: Compare and swap value atomically with specified memory order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value to be stored if *ptr equals expected.
+/// @param: expected(Input), value which is expected.
+/// @param: order(Input), memory order on success, relaxed by default.
+/// @return: T, value observed at ptr; equals expected only if the swap happened.
+template <class T>
+static __forceinline T
+    Cas(T* ptr, T val, T expected,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  __atomic_compare_exchange(ptr, &expected, &val, false, c11ToBuiltInFlags(order), __ATOMIC_RELAXED);
+  PostFence(order);
+  return expected;
+}
+
+/// @brief: Function overloading, for more info, see previous one.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value to be stored if condition is satisfied.
+/// @param: expected(Input), value which is expected.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, observed value of type T.
+template <class T>
+static __forceinline T
+    Cas(volatile T* ptr, T val, T expected,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  __atomic_compare_exchange(ptr, &expected, &val, false, c11ToBuiltInFlags(order), __ATOMIC_RELAXED);
+  PostFence(order);
+  return expected;
+}
+
+/// @brief: Exchange the value atomically with specified memory order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value prior to the exchange.
+template <class T>
+static __forceinline T
+    Exchange(T* ptr, T val,
+             std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  T ret;
+  PreFence(order);
+  __atomic_exchange(ptr, &val, &ret, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Function overloading, for more info, see previous one.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value prior to the exchange.
+template <class T>
+static __forceinline T
+    Exchange(volatile T* ptr, T val,
+             std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  T ret;
+  PreFence(order);
+  __atomic_exchange(ptr, &val, &ret, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Add value to variable atomically with specified memory order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value to be added.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value of the variable prior to the addition.
+template <class T>
+static __forceinline T
+    Add(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Subtract value from the variable atomically with specified memory
+/// order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value to be subtracted.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the subtraction.
+template <class T>
+static __forceinline T
+    Sub(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bit And operation on variable atomically with specified memory
+/// order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value which is ANDed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    And(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bit Or operation on variable atomically with specified memory order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value which is ORed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Or(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bit Xor operation on variable atomically with specified memory
+/// order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value which is XORed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Xor(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Increase the value of variable atomically with specified memory
+/// order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Increment(T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Decrease the value of the variable atomically with specified memory
+/// order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Decrement(T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Add value to variable atomically with specified memory order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value to be added.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value of the variable prior to the addition.
+template <class T>
+static __forceinline T
+    Add(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Subtract value from the variable atomically with specified memory
+/// order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value to be subtracted.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the subtraction.
+template <class T>
+static __forceinline T
+    Sub(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bit And operation on variable atomically with specified memory
+/// order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value which is ANDed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    And(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bit Or operation on variable atomically with specified memory order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value which is ORed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T Or(volatile T* ptr, T val,
+                          std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bit Xor operation on variable atomically with specified memory
+/// order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value which is XORed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Xor(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Increase the value of variable atomically with specified memory
+/// order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Increment(volatile T* ptr,
+              std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Decrease the value of the variable atomically with specified memory
+/// order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Decrement(volatile T* ptr,
+              std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+}  // namespace atomic
+}  // namespace rocr
+
+#ifdef X64_ORDER_WC
+#undef X64_ORDER_WC
+#endif
+
+#ifdef ALWAYS_CONSERVATIVE
+#undef ALWAYS_CONSERVATIVE
+#endif
+
+#endif  // HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
diff --git a/util/flag.cpp b/util/flag.cpp
new file mode 100644
index 0000000000..c0ac8bd970
--- /dev/null
+++ b/util/flag.cpp
@@ -0,0 +1,226 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2021-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "core/util/flag.h"
+#include "core/util/utils.h"
+#include "core/util/os.h"
+
+#include <algorithm>
+#include <cstdarg>
+#include <locale>
+#include <sstream>
+#include <thread>
+
+namespace rocr {
+FILE* log_file = stderr;
+uint8_t log_flags[8];
+
+void log_printf(const char* file, int line, const char* format, ...) {
+  va_list ap;
+  std::stringstream str_thrd_id;
+  str_thrd_id << std::hex << std::this_thread::get_id();
+  va_start(ap, format);
+  char message[4096];  // Longer messages are truncated by vsnprintf.
+  vsnprintf(message, sizeof(message), format, ap);
+  va_end(ap);
+  fprintf(log_file, ":%-25s:%-4d: %010lld us: [pid:%-5d tid:0x%s] [***rocr***] %s\n",
+          file, line, os::ReadAccurateClock()/1000ULL, os::GetProcessId(),
+          str_thrd_id.str().c_str(), message);
+  fflush(log_file);
+}
+
+// Split at separators. Consumes str; a trailing separator yields no empty tail element.
+static std::vector<std::string> split(std::string& str, char sep) {
+  std::vector<std::string> ret;
+  while (!str.empty()) {
+    size_t pos = str.find(sep);
+    if (pos == std::string::npos) {
+      ret.push_back(str);
+      return ret;
+    }
+    ret.push_back(str.substr(0, pos));
+    str.erase(0, pos + 1);
+  }
+  return ret;
+}
+
+// Parse id,id-id,... strings into a sorted id list; returns an empty list on any syntax error.
+static std::vector<uint32_t> get_elements(std::string& str, uint32_t maxElement) {
+  std::vector<uint32_t> ret;
+  MAKE_NAMED_SCOPE_GUARD(error, [&]() { ret.clear(); });
+
+  std::vector<std::string> ranges = split(str, ',');
+  for (auto& item : ranges) {
+    auto range = split(item, '-');
+    // failure: empty element (e.g. "1,,2") or too many -'s.
+    if (range.empty() || range.size() > 2) return ret;
+
+    char* end;
+    uint32_t index = strtoul(range[0].c_str(), &end, 10);
+    // Invalid syntax - id's must be non-empty and base 10 digits only.
+    if (end == range[0].c_str() || *end != '\0') return ret;
+    if (index <= maxElement) ret.push_back(index);
+
+    if (range.size() == 2) {
+      uint32_t secondindex = strtoul(range[1].c_str(), &end, 10);
+      if (end == range[1].c_str() || *end != '\0') return ret;  // bad syntax
+      if (secondindex < index) return ret;  // inverted range
+      secondindex = Min(secondindex, maxElement);
+      for (uint32_t i = index + 1; i < secondindex + 1; i++) ret.push_back(i);
+    }
+  }
+
+  // Confirm no duplicate ids.
+  std::sort(ret.begin(), ret.end());
+  if (std::adjacent_find(ret.begin(), ret.end()) != ret.end()) return ret;
+
+  // Good parse, keep result.
+  error.Dismiss();
+  return ret;
+}
+
+/*
+Parse env var per the following syntax, all whitespace is ignored:
+
+ID = [0-9][0-9]*                         ex. base 10 numbers
+ID_list = (ID | ID-ID)[, (ID | ID-ID)]*  ex. 0,2-4,7
+GPU_list = ID_list                       ex. 0,2-4,7
+CU_list = 0x[0-F]* | ID_list             ex. 0x337F OR 0,2-4,7
+CU_Set = GPU_list : CU_list              ex. 0,2-4,7:0-15,32-47 OR 0,2-4,7:0x337F
+HSA_CU_MASK = CU_Set [; CU_Set]*         ex. 0,2-4,7:0-15,32-47; 3-9:0x337F
+
+GPU indexes are taken post ROCR_VISIBLE_DEVICES reordering.
+Listed or bit set CUs will be enabled at queue creation on the associated GPU.
+All other CUs on the associated GPUs will be disabled.
+CU masks of unlisted GPUs are not restricted.
+
+Repeating a GPU or CU ID is a syntax error.
+Parsing stops at the first CU_Set that has a syntax error, that set and all
+following sets are ignored.
+Specifying a mask with no usable CUs (CU_list is 0x0) is a syntax error.
+Users should use ROCR_VISIBLE_DEVICES if they want to exclude use of a
+particular GPU.
+*/
+void Flag::parse_masks(std::string& var, uint32_t maxGpu, uint32_t maxCU) {
+  if (var.empty()) return;
+
+  // Remove whitespace
+  auto end = std::remove_if(var.begin(), var.end(),
+                            [](char c) { return std::isspace(c, std::locale::classic()); });
+  var.erase(end, var.end());
+
+  // Switch to uppercase (cast first: toupper is undefined for negative char values)
+  for (auto& c : var) c = static_cast<char>(toupper(static_cast<unsigned char>(c)));
+
+  // Iterate over cu sets
+  auto sets = split(var, ';');
+  for (auto& set : sets) {
+    auto parts = split(set, ':');
+    if (parts.size() != 2) return;
+
+    // temp storage for cu_set parsing.
+    std::vector<uint32_t> gpu_index;
+    std::vector<uint32_t> mask;
+
+    // parse cu list first, check for bitmask format
+    if (parts[1][1] == 'X') {
+      // Confirm hex format and strip prefix
+      auto& cu = parts[1];
+      if (cu[0] != '0') return;
+      cu.erase(0, 2);
+
+      // Ensure all valid hex characters
+      for (auto& c : cu) {
+        if (!isxdigit(static_cast<unsigned char>(c))) return;
+      }
+
+      // Convert to uint32_t, lsb first.
+      size_t len = cu.length();
+      while (len != 0) {
+        size_t trim = Min(len, size_t(8));
+        len -= trim;
+        auto tmp = cu.substr(len, trim);
+        auto chunk = stoul(tmp, nullptr, 16);
+        mask.push_back(static_cast<uint32_t>(chunk));  // at most 8 hex digits, fits in 32 bits
+      }
+
+      // Trim dwords beyond maxCUs
+      uint32_t maxDwords = maxCU / 32 + 1;
+      if (maxDwords < mask.size()) mask.resize(maxDwords);
+
+      // Trim leading (most significant) zero dwords
+      while (!mask.empty() && mask.back() == 0) mask.pop_back();
+
+      // Mask 0x0 is an error.
+      if (mask.empty()) return;
+
+    } else {
+      // parse cu lists
+      auto cu_indices = get_elements(parts[1], maxCU);
+      if (cu_indices.empty()) return;
+      uint32_t maxdword = cu_indices.back() / 32 + 1;
+      mask.resize(maxdword, 0);
+      for (auto id : cu_indices) {
+        uint32_t index, offset;
+        index = id / 32;
+        offset = id % 32;
+        mask[index] |= 1ul << offset;
+      }
+    }
+
+    // parse device list
+    gpu_index = get_elements(parts[0], maxGpu);
+    if (gpu_index.empty()) return;
+
+    // Ensure that no GPU was repeated across cu_sets
+    for (auto id : gpu_index) {
+      if (cu_mask_.find(id) != cu_mask_.end()) return;
+    }
+
+    // Insert into map
+    for (auto id : gpu_index) {
+      cu_mask_[id] = mask;
+    }
+  }
+}
+
+}  // namespace rocr
diff --git a/util/flag.h b/util/flag.h
new file mode 100644
index 0000000000..46dcf89cae
--- /dev/null
+++ b/util/flag.h
@@ -0,0 +1,360 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2021, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_INC_FLAG_H_
+#define HSA_RUNTIME_CORE_INC_FLAG_H_
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "core/util/os.h"
+#include "core/util/utils.h"
+
+namespace rocr {
+
+class Flag {  // Snapshot of the HSA_* / ROCR_* environment settings, read by Refresh().
+ public:
+  enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT };
+  enum SRAMECC_ENABLE { SRAMECC_DISABLED, SRAMECC_ENABLED, SRAMECC_DEFAULT };
+
+  // The values are meaningful and chosen to satisfy the thunk API.
+  enum XNACK_REQUEST { XNACK_DISABLE = 0, XNACK_ENABLE = 1, XNACK_UNCHANGED = 2 };
+  static_assert(XNACK_DISABLE == 0, "XNACK_REQUEST enum values improperly changed.");
+  static_assert(XNACK_ENABLE == 1, "XNACK_REQUEST enum values improperly changed.");
+
+  // Lift limit for 2.10 release RCCL workaround.
+  const size_t DEFAULT_SCRATCH_SINGLE_LIMIT = 146800640;  // small_limit >> 2;
+
+  explicit Flag() { Refresh(); }
+
+  virtual ~Flag() {}
+
+  void Refresh() {  // Re-reads every setting except HSA_CU_MASK from the environment.
+    std::string var = os::GetEnvVar("HSA_CHECK_FLAT_SCRATCH");
+    check_flat_scratch_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_ENABLE_VM_FAULT_MESSAGE");
+    enable_vm_fault_message_ = (var != "0");
+
+    var = os::GetEnvVar("HSA_ENABLE_QUEUE_FAULT_MESSAGE");
+    enable_queue_fault_message_ = (var != "0");
+
+    var = os::GetEnvVar("HSA_ENABLE_INTERRUPT");
+    enable_interrupt_ = (var != "0");
+
+    var = os::GetEnvVar("HSA_ENABLE_SDMA");
+    enable_sdma_ = (var == "0") ? SDMA_DISABLE : ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
+
+    var = os::GetEnvVar("HSA_ENABLE_PEER_SDMA");
+    enable_peer_sdma_ = (var == "0") ? SDMA_DISABLE : ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
+
+    var = os::GetEnvVar("HSA_ENABLE_SDMA_GANG");
+    enable_sdma_gang_ = (var == "0") ? SDMA_DISABLE :
+                        ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
+
+    var = os::GetEnvVar("HSA_ENABLE_SDMA_COPY_SIZE_OVERRIDE");
+    enable_sdma_copy_size_override_ = (var == "0") ? SDMA_DISABLE :
+                        ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
+
+    visible_gpus_ = os::GetEnvVar("ROCR_VISIBLE_DEVICES");
+    filter_visible_gpus_ = os::IsEnvVarSet("ROCR_VISIBLE_DEVICES");
+
+    var = os::GetEnvVar("HSA_RUNNING_UNDER_VALGRIND");
+    running_valgrind_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_SDMA_WAIT_IDLE");
+    sdma_wait_idle_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_MAX_QUEUES");
+    max_queues_ = static_cast<uint32_t>(atoi(var.c_str()));
+
+    // Maximum amount of scratch mem that can be used per process per gpu
+    var = os::GetEnvVar("HSA_SCRATCH_MEM");
+    scratch_mem_size_ = static_cast<size_t>(strtoull(var.c_str(), nullptr, 10));  // may exceed INT_MAX
+
+    // Scratch memory sizes > HSA_SCRATCH_SINGLE_LIMIT will trigger a use-once scheme
+    // We also reserve HSA_SCRATCH_SINGLE_LIMIT per process per gpu to guarantee we
+    // have sufficient memory for scratch in case user tried to allocate all device
+    // memory
+    if (os::IsEnvVarSet("HSA_SCRATCH_SINGLE_LIMIT")) {
+      var = os::GetEnvVar("HSA_SCRATCH_SINGLE_LIMIT");
+      scratch_single_limit_ = static_cast<size_t>(strtoull(var.c_str(), nullptr, 10));
+    } else {
+      scratch_single_limit_ = DEFAULT_SCRATCH_SINGLE_LIMIT;
+    }
+
+    tools_lib_names_ = os::GetEnvVar("HSA_TOOLS_LIB");
+
+    var = os::GetEnvVar("HSA_TOOLS_REPORT_LOAD_FAILURE");
+
+    ifdebug {
+      report_tool_load_failures_ = (var == "1");
+    } else {
+      report_tool_load_failures_ = (var != "0");
+    }
+
+    var = os::GetEnvVar("HSA_DISABLE_FRAGMENT_ALLOCATOR");
+    disable_fragment_alloc_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_ENABLE_SDMA_HDP_FLUSH");
+    enable_sdma_hdp_flush_ = (var != "0");
+
+    var = os::GetEnvVar("HSA_REV_COPY_DIR");
+    rev_copy_dir_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_FORCE_FINE_GRAIN_PCIE");
+    fine_grain_pcie_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_NO_SCRATCH_RECLAIM");
+    no_scratch_reclaim_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_NO_SCRATCH_THREAD_LIMITER");
+    no_scratch_thread_limit_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_DISABLE_IMAGE");
+    disable_image_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_DISABLE_PC_SAMPLING");
+    disable_pc_sampling_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_LOADER_ENABLE_MMAP_URI");
+    loader_enable_mmap_uri_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_FORCE_SDMA_SIZE");
+    force_sdma_size_ = var.empty() ? size_t(1024 * 1024) : static_cast<size_t>(strtoull(var.c_str(), nullptr, 10));
+
+    var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT");
+    check_sramecc_validity_ = (var != "1");
+
+    // Legal values are zero "0" or one "1". Any other value will
+    // be interpreted as not defining the env variable.
+    var = os::GetEnvVar("HSA_XNACK");
+    xnack_ = (var == "0") ? XNACK_DISABLE : ((var == "1") ? XNACK_ENABLE : XNACK_UNCHANGED);
+
+    var = os::GetEnvVar("HSA_ENABLE_DEBUG");
+    debug_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_CU_MASK_SKIP_INIT");
+    cu_mask_skip_init_ = (var == "1");
+
+    // Temporary opt-in for corrected HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT behavior.
+    // Will become opt-out and possibly removed in future releases.
+    var = os::GetEnvVar("HSA_COOP_CU_COUNT");
+    coop_cu_count_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_DISCOVER_COPY_AGENTS");
+    discover_copy_agents_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_SVM_PROFILE");
+    svm_profile_ = var;
+
+    var = os::GetEnvVar("HSA_ENABLE_SRAMECC");
+    sramecc_enable_ =
+        (var == "0") ? SRAMECC_DISABLED : ((var == "1") ? SRAMECC_ENABLED : SRAMECC_DEFAULT);
+
+    var = os::GetEnvVar("HSA_IMAGE_PRINT_SRD");
+    image_print_srd_ = (var == "1");
+
+    var = os::GetEnvVar("HSA_ENABLE_MWAITX");
+    enable_mwaitx_ = (var == "1");
+
+    // Temporary environment variable to disable CPU affinity override
+    // Will either rename to HSA_OVERRIDE_CPU_AFFINITY later or remove completely.
+    var = os::GetEnvVar("HSA_OVERRIDE_CPU_AFFINITY_DEBUG");
+    override_cpu_affinity_ = (var != "0");
+  }
+
+  void parse_masks(uint32_t maxGpu, uint32_t maxCU) {  // Parses HSA_CU_MASK into the per-GPU CU masks.
+    std::string var = os::GetEnvVar("HSA_CU_MASK");
+    parse_masks(var, maxGpu, maxCU);
+  }
+
+  bool check_flat_scratch() const { return check_flat_scratch_; }
+
+  bool enable_vm_fault_message() const { return enable_vm_fault_message_; }
+
+  bool enable_queue_fault_message() const { return enable_queue_fault_message_; }
+
+  bool enable_interrupt() const { return enable_interrupt_; }
+
+  bool enable_sdma_hdp_flush() const { return enable_sdma_hdp_flush_; }
+
+  bool running_valgrind() const { return running_valgrind_; }
+
+  bool sdma_wait_idle() const { return sdma_wait_idle_; }
+
+  bool report_tool_load_failures() const { return report_tool_load_failures_; }
+
+  bool disable_fragment_alloc() const { return disable_fragment_alloc_; }
+
+  bool rev_copy_dir() const { return rev_copy_dir_; }
+
+  bool fine_grain_pcie() const { return fine_grain_pcie_; }
+
+  bool no_scratch_reclaim() const { return no_scratch_reclaim_; }
+
+  bool no_scratch_thread_limiter() const { return no_scratch_thread_limit_; }
+
+  SDMA_OVERRIDE enable_sdma() const { return enable_sdma_; }
+
+  SDMA_OVERRIDE enable_peer_sdma() const { return enable_peer_sdma_; }
+
+  SDMA_OVERRIDE enable_sdma_gang() const { return enable_sdma_gang_; }
+
+  SDMA_OVERRIDE enable_sdma_copy_size_override() const { return enable_sdma_copy_size_override_; }
+
+  std::string visible_gpus() const { return visible_gpus_; }
+
+  bool filter_visible_gpus() const { return filter_visible_gpus_; }
+
+  uint32_t max_queues() const { return max_queues_; }
+
+  size_t scratch_mem_size() const { return scratch_mem_size_; }
+
+  size_t scratch_single_limit() const { return scratch_single_limit_; }
+
+  std::string tools_lib_names() const { return tools_lib_names_; }
+
+  bool disable_image() const { return disable_image_; }
+
+  bool disable_pc_sampling() const { return disable_pc_sampling_; }
+
+  bool loader_enable_mmap_uri() const { return loader_enable_mmap_uri_; }
+
+  size_t force_sdma_size() const { return force_sdma_size_; }
+
+  bool check_sramecc_validity() const { return check_sramecc_validity_; }
+
+  bool override_cpu_affinity() const { return override_cpu_affinity_; }
+
+  bool image_print_srd() const { return image_print_srd_; }
+
+  bool check_mwaitx(bool mwaitx_supported) {  // Not const: clears the flag when mwaitx is unsupported.
+    if (enable_mwaitx_ && !mwaitx_supported) enable_mwaitx_ = false;
+
+    return enable_mwaitx_;
+  }
+
+  XNACK_REQUEST xnack() const { return xnack_; }
+
+  bool debug() const { return debug_; }
+
+  const std::vector<uint32_t>& cu_mask(uint32_t gpu_index) const {  // Empty vector if no mask is recorded.
+    static const std::vector<uint32_t> empty;
+    auto it = cu_mask_.find(gpu_index);
+    if (it == cu_mask_.end()) return empty;
+    return it->second;
+  }
+
+  bool cu_mask_skip_init() const { return cu_mask_skip_init_; }
+
+  bool coop_cu_count() const { return coop_cu_count_; }
+
+  bool discover_copy_agents() const { return discover_copy_agents_; }
+
+  const std::string& svm_profile() const { return svm_profile_; }
+
+  SRAMECC_ENABLE sramecc_enable() const { return sramecc_enable_; }
+
+ private:
+  bool check_flat_scratch_;
+  bool enable_vm_fault_message_;
+  bool enable_interrupt_;
+  bool enable_sdma_hdp_flush_;
+  bool running_valgrind_;
+  bool sdma_wait_idle_;
+  bool enable_queue_fault_message_;
+  bool report_tool_load_failures_;
+  bool disable_fragment_alloc_;
+  bool rev_copy_dir_;
+  bool fine_grain_pcie_;
+  bool no_scratch_reclaim_;
+  bool no_scratch_thread_limit_;
+  bool disable_image_;
+  bool disable_pc_sampling_;
+  bool loader_enable_mmap_uri_;
+  bool check_sramecc_validity_;
+  bool debug_;
+  bool cu_mask_skip_init_;
+  bool coop_cu_count_;
+  bool discover_copy_agents_;
+  bool override_cpu_affinity_;
+  bool image_print_srd_;
+  bool enable_mwaitx_;
+
+  SDMA_OVERRIDE enable_sdma_;
+  SDMA_OVERRIDE enable_peer_sdma_;
+  SDMA_OVERRIDE enable_sdma_gang_;
+  SDMA_OVERRIDE enable_sdma_copy_size_override_;
+
+  bool filter_visible_gpus_;
+  std::string visible_gpus_;
+
+  uint32_t max_queues_;
+
+  size_t scratch_mem_size_;
+  size_t scratch_single_limit_;
+
+  std::string tools_lib_names_;
+  std::string svm_profile_;
+
+  size_t force_sdma_size_;
+
+  // Indicates user preference for Xnack state.
+  XNACK_REQUEST xnack_;
+
+  SRAMECC_ENABLE sramecc_enable_;
+
+  // Map GPU index post RVD to its default cu mask.
+  std::map<uint32_t, std::vector<uint32_t>> cu_mask_;
+
+  void parse_masks(std::string& args, uint32_t maxGpu, uint32_t maxCU);
+
+  DISALLOW_COPY_AND_ASSIGN(Flag);
+};
+
+}  // namespace rocr
+
+#endif  // header guard
diff --git a/util/lazy_ptr.h b/util/lazy_ptr.h
new file mode 100644
index 0000000000..2aef6a3bf3
--- /dev/null
+++ b/util/lazy_ptr.h
@@ -0,0 +1,155 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ +#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ + +#include +#include +#include + +#include "core/util/locks.h" +#include "core/util/utils.h" + +namespace rocr { + +/* + * Wrapper for a std::unique_ptr that initializes its object at first use.
+ */ +template class lazy_ptr { + public: + lazy_ptr() {} + + explicit lazy_ptr(std::function Constructor) { reset(Constructor); } + + lazy_ptr(lazy_ptr&& rhs) { + obj = std::move(rhs.obj); + func = std::move(rhs.func); + } + + lazy_ptr& operator=(lazy_ptr&& rhs) { + obj = std::move(rhs.obj); + func = std::move(rhs.func); + } + + lazy_ptr(lazy_ptr&) = delete; + lazy_ptr& operator=(lazy_ptr&) = delete; + + void reset(std::function Constructor = nullptr) { + obj.reset(); + func = Constructor; + } + + void reset(T* ptr) { + obj.reset(ptr); + func = nullptr; + } + + bool operator==(T* rhs) const { return obj.get() == rhs; } + bool operator!=(T* rhs) const { return obj.get() != rhs; } + + const std::unique_ptr& operator->() const { + make(true); + assert(obj != nullptr && "Null dereference through lazy_ptr."); + return obj; + } + + std::unique_ptr& operator*() { + make(true); + return obj; + } + + const std::unique_ptr& operator*() const { + make(true); + return obj; + } + + /* + * Ensures that the object is created or is being created. + * This is useful when early construction of the object is required. + */ + void touch() const { make(false); } + + // Tells if the lazy object has been constructed or not. + // Construction may fail silently (return nullptr). + bool created() const { + std::atomic_thread_fence(std::memory_order_acquire); + return func == nullptr; + } + + // Tells if the lazy object exists or not. + bool empty() const { + std::atomic_thread_fence(std::memory_order_acquire); + return obj == nullptr; + } + + private: + mutable std::unique_ptr obj; + mutable std::function func; + mutable KernelMutex lock; + + // Separated from make to improve inlining. 
+ void make_body(bool block) const { + if (block) { + lock.Acquire(); + } else if (!lock.Try()) { + return; + } + MAKE_SCOPE_GUARD([&]() { lock.Release(); }); + if (func == nullptr) return; + T* ptr = func(); + obj.reset(ptr); + std::atomic_thread_fence(std::memory_order_release); + func = nullptr; + } + + __forceinline void make(bool block) const { + if (!created()) { + make_body(block); + } + } + +}; + +} // namespace rocr + +#endif // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ diff --git a/util/lnx/os_linux.cpp b/util/lnx/os_linux.cpp new file mode 100644 index 0000000000..aecca6c0fd --- /dev/null +++ b/util/lnx/os_linux.cpp @@ -0,0 +1,771 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifdef __linux__ +#include "core/util/os.h" +#include "core/util/utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "core/inc/runtime.h" +#if defined(__i386__) || defined(__x86_64__) +#include +#endif + +namespace rocr { +namespace os { + +struct ThreadArgs { + void* entry_args; + ThreadEntry entry_function; +}; + +void* __stdcall ThreadTrampoline(void* arg) { + ThreadArgs* ar = (ThreadArgs*)arg; + ThreadEntry CallMe = ar->entry_function; + void* Data = ar->entry_args; + delete ar; + CallMe(Data); + return nullptr; +} + +// Thread container allows multiple waits and separate close (destroy). 
+class os_thread { + public: + explicit os_thread(ThreadEntry function, void* threadArgument, uint stackSize) + : thread(0), lock(nullptr), state(RUNNING) { + int err; + std::unique_ptr args(new ThreadArgs); + lock = CreateMutex(); + if (lock == nullptr) return; + + args->entry_args = threadArgument; + args->entry_function = function; + + pthread_attr_t attrib; + err = pthread_attr_init(&attrib); + if (err != 0) { + fprintf(stderr, "pthread_attr_init failed: %s\n", strerror(err)); + return; + } + + if (stackSize != 0) { + stackSize = Max(uint(PTHREAD_STACK_MIN), stackSize); + stackSize = AlignUp(stackSize, 4096); + err = pthread_attr_setstacksize(&attrib, stackSize); + if (err != 0) { + fprintf(stderr, "pthread_attr_setstacksize failed: %s\n", strerror(err)); + err = pthread_attr_destroy(&attrib); + if (err != 0) { + fprintf(stderr, "pthread_attr_destroy failed: %s\n", strerror(err)); + return; + } + } + } + + int cores = 0; + cpu_set_t* cpuset = nullptr; + + if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) { + cores = get_nprocs_conf(); + cpuset = CPU_ALLOC(cores); + if (cpuset == nullptr) { + fprintf(stderr, "CPU_ALLOC failed: %s\n", strerror(errno)); + return; + } + CPU_ZERO_S(CPU_ALLOC_SIZE(cores), cpuset); + for (int i = 0; i < cores; i++) { + CPU_SET_S(i, CPU_ALLOC_SIZE(cores), cpuset); + } + err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset); + CPU_FREE(cpuset); + if (err != 0) { + fprintf(stderr, "pthread_setaffinity_np failed: %s\n", strerror(err)); + return; + } + } + + err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); + + // Probably a stack size error since system limits can be different from PTHREAD_STACK_MIN + // Attempt to grow the stack within reason. 
+ if ((err == EINVAL) && stackSize != 0) { + while (stackSize < 20 * 1024 * 1024) { + stackSize *= 2; + err = pthread_attr_setstacksize(&attrib, stackSize); + if (err != 0) { + fprintf(stderr, "pthread_attr_setstacksize failed: %s\n", strerror(err)); + return; + } + err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); + if (err != EINVAL) break; + debug_print("pthread_create returned EINVAL, doubling stack size\n"); + } + } + + if (err == 0) + args.release(); + else + thread = 0; + + err = pthread_attr_destroy(&attrib); + if (err != 0) { + fprintf(stderr, "pthread_attr_destroy failed: %s\n", strerror(err)); + } + } + + os_thread(os_thread&& rhs) { + thread = rhs.thread; + lock = rhs.lock; + state = int(rhs.state); + rhs.thread = 0; + rhs.lock = nullptr; + } + + os_thread(os_thread&) = delete; + + ~os_thread() { + if (lock != nullptr) DestroyMutex(lock); + if ((state == RUNNING) && (thread != 0)) { + int err = pthread_detach(thread); + if (err != 0) fprintf(stderr, "pthread_detach failed: %s\n", strerror(err)); + } + } + + bool Valid() { return (lock != nullptr) && (thread != 0); } + + bool Wait() { + if (state == FINISHED) return true; + AcquireMutex(lock); + if (state == FINISHED) { + ReleaseMutex(lock); + return true; + } + int err = pthread_join(thread, NULL); + bool success = (err == 0); + if (success) state = FINISHED; + ReleaseMutex(lock); + return success; + } + + private: + pthread_t thread; + Mutex lock; + std::atomic state; + enum { FINISHED = 0, RUNNING = 1 }; +}; + +static_assert(sizeof(LibHandle) == sizeof(void*), "OS abstraction size mismatch"); +static_assert(sizeof(Semaphore) == sizeof(sem_t*), "OS abstraction size mismatch"); +static_assert(sizeof(Mutex) == sizeof(pthread_mutex_t*), "OS abstraction size mismatch"); +static_assert(sizeof(SharedMutex) == sizeof(pthread_rwlock_t*), "OS abstraction size mismatch"); +static_assert(sizeof(Thread) == sizeof(os_thread*), "OS abstraction size mismatch"); + +LibHandle LoadLib(std::string 
filename) { + void* ret = dlopen(filename.c_str(), RTLD_LAZY); + if (ret == nullptr) debug_print("LoadLib(%s) failed: %s\n", filename.c_str(), dlerror()); + return *(LibHandle*)&ret; +} + +void* GetExportAddress(LibHandle lib, std::string export_name) { + void* ret = dlsym(*(void**)&lib, export_name.c_str()); + + // dlsym searches the given library and all the library's load dependencies. + // Remaining code limits symbol lookup to only the library handle given. + // This lookup pattern matches Windows. + if (ret == NULL) return ret; + + link_map* map; + int err = dlinfo(*(void**)&lib, RTLD_DI_LINKMAP, &map); + if (err == -1) { + fprintf(stderr, "dlinfo failed: %s\n", dlerror()); + return nullptr; + } + + Dl_info info; + err = dladdr(ret, &info); + if (err == 0) { + fprintf(stderr, "dladdr failed.\n"); + return nullptr; + } + + if (strcmp(info.dli_fname, map->l_name) == 0) return ret; + + return NULL; +} + +void CloseLib(LibHandle lib) { dlclose(*(void**)&lib); } + +/* + * @brief Look for a symbol called "HSA_AMD_TOOL_PRIORITY" across all loaded + * shared libraries, and if found, store the name of the library + * + * @param[in]: info A dl_phdr_info struct pointer, which contains information + * about library's load address, header, and name. + * + * @param[in]: size integer size of dl_phdr_info struct + * + * @param[out]: data copy of the data argument to dl_phdr_iterate call + * + * @retval:: Return 0 on Success. If callback returns a non-zero value, + * dl_iterate_phdr() will stop processing, even if there are unprocessed + * shared objects. + */ + +static int callback(struct dl_phdr_info* info, size_t size, void* data) { + std::vector* loadedToolsLib = (std::vector*)data; + assert(loadedToolsLib != nullptr); + /* + * Check if lib name is not empty and its not a "vdso.so" lib, + * The vDSO is a special shared object file that is built into the Linux kernel. 
+ * It is not a regular shared library and thus does not have all the properties + * of regular shared libraries. The way the vDSO is loaded and organized in memory + * is different from regular shared libraries and it's not guaranteed that it + * will have a specific segment or section. Hence its skipped. + */ + + if ((info) && (info->dlpi_name[0] != '\0')) { + if (std::string(info->dlpi_name).find("vdso.so") != std::string::npos) return 0; + + /* + * Iterate through the program headers of the loaded lib and check for PT_DYNAMIC program + * header. If the PT_DYNAMIC program header is found, use dlpi_addr and dlpi_phdr members + * of dl_phdr_info struct to get the address of the dynamic section of the loaded + * library in memory + */ + + for (int i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type == PT_DYNAMIC) { + Elf64_Dyn* dyn_section = (Elf64_Dyn*)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr); + + char* strings = nullptr; + Elf64_Xword limit = 0; + + /* + * The dynamic section is searched for DT_STRTAB (address of string table), + * and DT_STRSZ (size of string table) + * DT_NULL - Marks the end of the _DYNAMIC array + */ + + for (int j = 0;; j++) { + if (dyn_section[j].d_tag == DT_NULL) break; + + if (dyn_section[j].d_tag == DT_STRTAB) strings = (char*)(dyn_section[j].d_un.d_ptr); + + if (dyn_section[j].d_tag == DT_STRSZ) limit = dyn_section[j].d_un.d_val; + } + + if (strings == nullptr) debug_print("String table not found"); + + /* + * Hacky lookup, if string and symbol tables are found, + * iterate through the strings in string table and check if + * any string matches "HSA_AMD_TOOL_PRIORITY". 
+ * If yes, then add the name of the library to the vector of + * lib names + */ + if (strings != nullptr) { + char* end = strings + limit; + while (strings < end) { + if (strcmp(strings, "HSA_AMD_TOOL_PRIORITY") == 0) { + loadedToolsLib->push_back(info->dlpi_name); + return 0; + } + strings += (strlen(strings) + 1); + } + } + } + } + } + return 0; +} + +std::vector GetLoadedToolsLib() { + std::vector ret; + std::vector names; + + /* Iterate through all of the loaded shared libraries in the process */ + dl_iterate_phdr(callback, &names); + + if (!names.empty()) { + for (auto& name : names) ret.push_back(LoadLib(name)); + } + + return ret; +} + +std::string GetLibraryName(LibHandle lib) { + link_map *map; + if(dlinfo(lib, RTLD_DI_LINKMAP, &map)!=0) + return ""; + return map->l_name; +} + +Semaphore CreateSemaphore() { + sem_t *sem = new sem_t; + sem_init(sem, 0, 0); + return *(Semaphore*)&sem; +} + +bool WaitSemaphore(Semaphore sem) { + while(sem_wait(*(sem_t**)&sem)) + if (errno != EINTR) return false; + + return true; +} + +void PostSemaphore(Semaphore sem) { + if (sem_post(*(sem_t**)&sem)) + assert(false && "Failed to post semaphore"); +} + +void DestroySemaphore(Semaphore sem) { + sem_destroy(*(sem_t**)&sem); + delete *(sem_t**)&sem; +} + +Mutex CreateMutex() { + pthread_mutex_t* mutex = new pthread_mutex_t; + pthread_mutex_init(mutex, NULL); + return *(Mutex*)&mutex; +} + +bool TryAcquireMutex(Mutex lock) { + return pthread_mutex_trylock(*(pthread_mutex_t**)&lock) == 0; +} + +bool AcquireMutex(Mutex lock) { + return pthread_mutex_lock(*(pthread_mutex_t**)&lock) == 0; +} + +void ReleaseMutex(Mutex lock) { + pthread_mutex_unlock(*(pthread_mutex_t**)&lock); +} + +void DestroyMutex(Mutex lock) { + pthread_mutex_destroy(*(pthread_mutex_t**)&lock); + delete *(pthread_mutex_t**)&lock; +} + +void Sleep(int delay_in_millisec) { usleep(delay_in_millisec * 1000); } + +void uSleep(int delayInUs) { usleep(delayInUs); } + +void YieldThread() { sched_yield(); } + +Thread 
CreateThread(ThreadEntry function, void* threadArgument, uint stackSize) { + os_thread* result = new os_thread(function, threadArgument, stackSize); + if (!result->Valid()) { + delete result; + return nullptr; + } + + return reinterpret_cast(result); +} + +void CloseThread(Thread thread) { delete reinterpret_cast(thread); } + +bool WaitForThread(Thread thread) { return reinterpret_cast(thread)->Wait(); } + +bool WaitForAllThreads(Thread* threads, uint threadCount) { + for (uint i = 0; i < threadCount; i++) WaitForThread(threads[i]); + return true; +} + +bool IsEnvVarSet(std::string env_var_name) { + char* buff = NULL; + buff = getenv(env_var_name.c_str()); + return (buff != NULL); +} + +void SetEnvVar(std::string env_var_name, std::string env_var_value) { + setenv(env_var_name.c_str(), env_var_value.c_str(), 1); +} + +int GetProcessId() { + return ::getpid(); +} + +std::string GetEnvVar(std::string env_var_name) { + char* buff; + buff = getenv(env_var_name.c_str()); + std::string ret; + if (buff) { + ret = buff; + } + return ret; +} + +size_t GetUserModeVirtualMemorySize() { +#ifdef _LP64 + // https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt : + // user space is 0000000000000000 - 00007fffffffffff (=47 bits) + return (size_t)(0x800000000000); +#else + return (size_t)(0xffffffff); // ~4GB +#endif +} + +size_t GetUsablePhysicalHostMemorySize() { + struct sysinfo info = {0}; + if (sysinfo(&info) != 0) { + return 0; + } + + const size_t physical_size = + static_cast(info.totalram * info.mem_unit); + return std::min(GetUserModeVirtualMemorySize(), physical_size); +} + +uintptr_t GetUserModeVirtualMemoryBase() { return (uintptr_t)0; } + +// Os event implementation +typedef struct EventDescriptor_ { + pthread_cond_t event; + pthread_mutex_t mutex; + bool state; + bool auto_reset; +} EventDescriptor; + +EventHandle CreateOsEvent(bool auto_reset, bool init_state) { + EventDescriptor* eventDescrp; + eventDescrp = (EventDescriptor*)malloc(sizeof(EventDescriptor)); + 
+ pthread_mutex_init(&eventDescrp->mutex, NULL); + pthread_cond_init(&eventDescrp->event, NULL); + eventDescrp->auto_reset = auto_reset; + eventDescrp->state = init_state; + + EventHandle handle = reinterpret_cast(eventDescrp); + + return handle; +} + +int DestroyOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + int ret_code = pthread_cond_destroy(&eventDescrp->event); + ret_code |= pthread_mutex_destroy(&eventDescrp->mutex); + free(eventDescrp); + return ret_code; +} + +int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) { + if (event == NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + // Event wait time is 0 and state is non-signaled, return directly + if (milli_seconds == 0) { + int tmp_ret = pthread_mutex_trylock(&eventDescrp->mutex); + if (tmp_ret == EBUSY) { + // Timeout + return 1; + } + } + + int ret_code = 0; + pthread_mutex_lock(&eventDescrp->mutex); + if (!eventDescrp->state) { + if (milli_seconds == 0) { + ret_code = 1; + } else { + struct timespec ts; + struct timeval tp; + + ret_code = gettimeofday(&tp, NULL); + ts.tv_sec = tp.tv_sec; + ts.tv_nsec = tp.tv_usec * 1000; + + unsigned int sec = milli_seconds / 1000; + unsigned int mSec = milli_seconds % 1000; + + ts.tv_sec += sec; + ts.tv_nsec += mSec * 1000000; + + // More then one second, add 1 sec to the tv_sec elem + if (ts.tv_nsec > 1000000000) { + ts.tv_sec += 1; + ts.tv_nsec = ts.tv_nsec - 1000000000; + } + + ret_code = + pthread_cond_timedwait(&eventDescrp->event, &eventDescrp->mutex, &ts); + // Time out + if (ret_code == 110) { + ret_code = 0x14003; // 1 means time out in HSA + } + + if (ret_code == 0 && eventDescrp->auto_reset) { + eventDescrp->state = false; + } + } + } else if (eventDescrp->auto_reset) { + eventDescrp->state = false; + } + pthread_mutex_unlock(&eventDescrp->mutex); + + return ret_code; +} + +int SetOsEvent(EventHandle event) { + if (event == 
NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + int ret_code = 0; + ret_code = pthread_mutex_lock(&eventDescrp->mutex); + eventDescrp->state = true; + ret_code = pthread_mutex_unlock(&eventDescrp->mutex); + ret_code |= pthread_cond_signal(&eventDescrp->event); + + return ret_code; +} + +int ResetOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + int ret_code = 0; + ret_code = pthread_mutex_lock(&eventDescrp->mutex); + eventDescrp->state = false; + ret_code = pthread_mutex_unlock(&eventDescrp->mutex); + + return ret_code; +} + +static double invPeriod = 0.0; + +uint64_t ReadAccurateClock() { + if (invPeriod == 0.0) AccurateClockFrequency(); + timespec time; + int err = clock_gettime(CLOCK_MONOTONIC_RAW, &time); + if (err != 0) { + perror("clock_gettime(CLOCK_MONOTONIC_RAW,...) failed"); + abort(); + } + return (uint64_t(time.tv_sec) * 1000000000ull + uint64_t(time.tv_nsec)) * invPeriod; +} + +uint64_t AccurateClockFrequency() { + static clockid_t clock = CLOCK_MONOTONIC; + static std::atomic first(true); + // Check kernel version - not a concurrency concern. + // use non-RAW for getres due to bug in older 2.6.x kernels + if (first.load(std::memory_order_acquire)) { + utsname kernelInfo; + if (uname(&kernelInfo) == 0) { + try { + std::string ver = kernelInfo.release; + size_t idx; + int major = std::stoi(ver, &idx); + int minor = std::stoi(ver.substr(idx + 1)); + if ((major >= 4) && (minor >= 4)) { + clock = CLOCK_MONOTONIC_RAW; + } + } catch (...) { + // Kernel version string doesn't conform to the standard pattern. + // Keep using the "safe" (non-RAW) clock. 
+ } + } + first.store(false, std::memory_order_release); + } + timespec time; + int err = clock_getres(clock, &time); + if (err != 0) { + perror("clock_getres failed"); + abort(); + } + if (time.tv_sec != 0 || time.tv_nsec >= 0xFFFFFFFF) { + fprintf(stderr, + "clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low " + "frequency (<1Hz).\n"); + abort(); + } + if (invPeriod == 0.0) invPeriod = 1.0 / double(time.tv_nsec); + return 1000000000ull / uint64_t(time.tv_nsec); +} + +SharedMutex CreateSharedMutex() { + pthread_rwlockattr_t attrib; + int err = pthread_rwlockattr_init(&attrib); + if (err != 0) { + fprintf(stderr, "rw lock attribute init failed: %s\n", strerror(err)); + return nullptr; + } + +#ifdef __GLIBC__ + err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); + if (err != 0) { + fprintf(stderr, "Set rw lock attribute failure: %s\n", strerror(err)); + return nullptr; + } +#else + err = pthread_rwlockattr_setkind(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); + if (err != 0) { + fprintf(stderr, "Set rw lock attribute failure: %s\n", strerror(err)); + return nullptr; + } +#endif + + pthread_rwlock_t* lock = new pthread_rwlock_t; + err = pthread_rwlock_init(lock, &attrib); + if (err != 0) { + fprintf(stderr, "rw lock init failed: %s\n", strerror(err)); + return nullptr; + } + + pthread_rwlockattr_destroy(&attrib); + return lock; +} + +bool TryAcquireSharedMutex(SharedMutex lock) { + int err = pthread_rwlock_trywrlock(*(pthread_rwlock_t**)&lock); + return err == 0; +} + +bool AcquireSharedMutex(SharedMutex lock) { + int err = pthread_rwlock_wrlock(*(pthread_rwlock_t**)&lock); + return err == 0; +} + +void ReleaseSharedMutex(SharedMutex lock) { + int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock); + if (err != 0) { + fprintf(stderr, "SharedMutex unlock failed: %s\n", strerror(err)); + abort(); + } +} + +bool TrySharedAcquireSharedMutex(SharedMutex lock) { + int err = 
pthread_rwlock_tryrdlock(*(pthread_rwlock_t**)&lock); + return err == 0; +} + +bool SharedAcquireSharedMutex(SharedMutex lock) { + int err = pthread_rwlock_rdlock(*(pthread_rwlock_t**)&lock); + return err == 0; +} + +void SharedReleaseSharedMutex(SharedMutex lock) { + int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock); + if (err != 0) { + fprintf(stderr, "SharedMutex unlock failed: %s\n", strerror(err)); + abort(); + } +} + +void DestroySharedMutex(SharedMutex lock) { + pthread_rwlock_destroy(*(pthread_rwlock_t**)&lock); + delete *(pthread_rwlock_t**)&lock; +} + +static uint64_t sys_clock_period_ = 0; + +uint64_t ReadSystemClock() { + struct timespec ts; + clock_gettime(CLOCK_BOOTTIME, &ts); + uint64_t time = (uint64_t(ts.tv_sec) * 1000000000 + uint64_t(ts.tv_nsec)); + if (sys_clock_period_ != 1) + return time / sys_clock_period_; + else + return time; +} + +uint64_t SystemClockFrequency() { + struct timespec ts; + clock_getres(CLOCK_BOOTTIME, &ts); + sys_clock_period_ = (uint64_t(ts.tv_sec) * 1000000000 + uint64_t(ts.tv_nsec)); + return 1000000000 / sys_clock_period_; +} + +bool ParseCpuID(cpuid_t* cpuinfo) { +#if defined(__i386__) || defined(__x86_64__) + uint32_t eax, ebx, ecx, edx, max_eax = 0; + memset(cpuinfo, 0, sizeof(*cpuinfo)); + + /* Make sure current CPU supports at least EAX 4 */ + if (!__get_cpuid_max(0x80000004, NULL)) return false; + + // Manufacturer ID is a twelve-character ASCII string stored in order EBX, EDX, ECX. 
+ if (!__get_cpuid(0, &max_eax, (uint32_t*)&cpuinfo->ManufacturerID[0], + (uint32_t*)&cpuinfo->ManufacturerID[8], + (uint32_t*)&cpuinfo->ManufacturerID[4])) { + return false; + } + + if (!strcmp(cpuinfo->ManufacturerID, "AuthenticAMD")) { + if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) { + cpuinfo->mwaitx = !!((ecx >> 29) & 0x1); + } + } + return true; +#else + return false; +#endif +} + +} // namespace os +} // namespace rocr + +#endif diff --git a/util/locks.h b/util/locks.h new file mode 100644 index 0000000000..6c0de49a07 --- /dev/null +++ b/util/locks.h @@ -0,0 +1,290 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Library of syncronization primitives - to be added to as needed. + +#ifndef HSA_RUNTIME_CORE_UTIL_LOCKS_H_ +#define HSA_RUNTIME_CORE_UTIL_LOCKS_H_ + +#include "utils.h" +#include "os.h" + +namespace rocr { + +class HybridMutex { + public: + HybridMutex():lock_(0) { + sem_ = os::CreateSemaphore(); + } + + ~HybridMutex() { + os::DestroySemaphore(sem_); + } + + bool Try() { + int old = 0; + return lock_.compare_exchange_strong(old, 1); + } + + bool Acquire() { + int cnt = maxSpinIterPause + maxSpinIterYield; + + int old = 0; + while (!lock_.compare_exchange_strong(old, 1)) { + cnt--; + if (cnt > maxSpinIterPause) { + _mm_pause(); + } else if (cnt-- > maxSpinIterYield) { + os::YieldThread(); + } else { + os::WaitSemaphore(sem_); + cnt = maxSpinIterPause + maxSpinIterYield; + } + old = 0; + } + return true; + } + + void Release() { + int old = 1; + if (lock_.compare_exchange_strong(old, 0)) + os::PostSemaphore(sem_); + } + + private: + std::atomic lock_; + os::Semaphore sem_; + const uint32_t maxSpinIterPause = 55; + const uint32_t maxSpinIterYield = 55; + + /// @brief: Disable copiable and assignable ability. + DISALLOW_COPY_AND_ASSIGN(HybridMutex); +}; + + +/// @brief: a class represents a kernel mutex. 
+/// Uses the kernel's scheduler to keep the waiting thread from being scheduled
+/// until the lock is released (Best for long waits, though anything using
+/// a kernel object is a long wait).
+class KernelMutex {
+ public:
+  KernelMutex() { lock_ = os::CreateMutex(); }
+  ~KernelMutex() { os::DestroyMutex(lock_); }
+
+  bool Try() { return os::TryAcquireMutex(lock_); }
+  bool Acquire() { return os::AcquireMutex(lock_); }
+  void Release() { os::ReleaseMutex(lock_); }
+
+ private:
+  os::Mutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelMutex);
+};
+
+/// @brief: represents a spin lock.
+/// For very short hold durations on the order of the thread scheduling
+/// quanta or less.
+class SpinMutex {
+ public:
+  SpinMutex() { lock_ = 0; }
+
+  /// @brief: Makes one attempt to take the lock; true if it was taken.
+  bool Try() {
+    int old = 0;
+    return lock_.compare_exchange_strong(old, 1);
+  }
+  /// @brief: Retries until the lock is taken, yielding the thread between
+  /// attempts. Always returns true.
+  bool Acquire() {
+    int old = 0;
+    while (!lock_.compare_exchange_strong(old, 1))
+    {
+      // A failed compare-exchange stored the observed value into 'old'.
+      old=0;
+      os::YieldThread();
+    }
+    return true;
+  }
+  void Release() { lock_ = 0; }
+
+ private:
+  std::atomic lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(SpinMutex);
+};
+
+/// @brief: wraps an OS event handle obtained from os::CreateOsEvent(true, true).
+class KernelEvent {
+ public:
+  KernelEvent() { evt_ = os::CreateOsEvent(true, true); }
+  ~KernelEvent() { os::DestroyOsEvent(evt_); }
+
+  // Both queries compare the os::WaitForOsEvent result with 0 — presumably 0
+  // means the event was signalled; confirm against the os implementation.
+  // IsSet polls with a zero timeout, WaitForSet passes 0xFFFFFFFF.
+  bool IsSet() { return os::WaitForOsEvent(evt_, 0)==0; }
+  bool WaitForSet() { return os::WaitForOsEvent(evt_, 0xFFFFFFFF)==0; }
+  void Set() { os::SetOsEvent(evt_); }
+  void Reset() { os::ResetOsEvent(evt_); }
+
+ private:
+  os::EventHandle evt_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelEvent);
+};
+
+/// @brief: represents a yielding shared mutex.
+/// aka read/write mutex
+class KernelSharedMutex {
+ public:
+  /// @brief: Interfaces ScopedAcquire to shared operations.
+  /// Forwards Try/Acquire/Release to the owner's shared-mode operations.
+  class Shared {
+   public:
+    explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
+    bool Try() { return lock_->TryShared(); }
+    bool Acquire() { return lock_->AcquireShared(); }
+    void Release() { lock_->ReleaseShared(); }
+
+   private:
+    KernelSharedMutex* lock_;
+  };
+
+  KernelSharedMutex() { lock_ = os::CreateSharedMutex(); }
+  ~KernelSharedMutex() { os::DestroySharedMutex(lock_); }
+
+  // Exclusive mode operations
+  bool Try() { return os::TryAcquireSharedMutex(lock_); }
+  bool Acquire() { return os::AcquireSharedMutex(lock_); }
+  void Release() { os::ReleaseSharedMutex(lock_); }
+
+  // Shared mode operations
+  bool TryShared() { return os::TrySharedAcquireSharedMutex(lock_); }
+  bool AcquireShared() { return os::SharedAcquireSharedMutex(lock_); }
+  void ReleaseShared() { os::SharedReleaseSharedMutex(lock_); }
+
+  // Return shared operations interface
+  Shared shared() { return Shared(this); }
+
+ private:
+  os::SharedMutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
+};
+
+// NOTE(review): template parameter and argument lists (the text between angle
+// brackets) appear to be missing from this copy of the file, e.g. in
+// "template class isMutex" below and "std::atomic lock_" above. Confirm
+// against the original header before relying on these declarations.
+
+/// @brief: Type trait to identify mutex types
+template class isMutex {
+ public:
+  enum { value = false };
+};
+template <> class isMutex {
+ public:
+  enum { value = true };
+};
+template <> class isMutex {
+ public:
+  enum { value = true };
+};
+template <> class isMutex {
+ public:
+  enum { value = true };
+};
+template <> class isMutex {
+ public:
+  enum { value = true };
+};
+
+/// @brief: A class that behaves as a lock in a scope. When trying to enter
+/// the critical section, create an object of this class. After the control
+/// path goes out of the scope, it will release the lock automatically.
+template class ScopedAcquire {
+ public:
+  /// @brief: When constructing, acquire the lock.
+  /// @param: lock(Input), pointer to an existing lock.
+  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
+    static_assert(isMutex::value, "ScopedAcquire requires a mutex type.");
+    lock_.Acquire();
+  }
+  /// @brief: When constructing, acquire the lock.
+  /// @param: lock(Input), a lock object taken by value; the static_assert
+  /// rejects types for which isMutex reports true.
+  explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
+    static_assert(!isMutex::value, "Mutex types are not copyable.");
+    lock_.Acquire();
+  }
+
+  /// @brief: when destructing, release the lock unless Release() was already
+  /// called.
+  ~ScopedAcquire() {
+    if (doRelease) lock_.Release();
+  }
+
+  /// @brief: Release the lock early. Avoid using when possible.
+  void Release() {
+    lock_.Release();
+    doRelease = false;
+  }
+
+ private:
+  /// @brief: Adapts between pointers to mutex types and mutex pointer types.
+  template class container {
+   public:
+    container(T* lock) : lock_(lock) {}
+    __forceinline bool Acquire() { return lock_->Acquire(); }
+    __forceinline void Release() { return lock_->Release(); }
+
+   private:
+    T* lock_;
+  };
+
+  /// @brief: Specialization for mutex pointer types.
+  template class container {
+   public:
+    container(T lock) : lock_(lock) {}
+    __forceinline bool Acquire() { return lock_.Acquire(); }
+    __forceinline void Release() { return lock_.Release(); }
+
+   private:
+    T lock_;
+  };
+
+  container::value> lock_;
+  // True until Release() is called; tells the destructor whether to unlock.
+  bool doRelease;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
+};
+
+}  // namespace rocr
+
+#endif  // HSA_RUNTIME_CORE_UTIL_LOCKS_H_
diff --git a/util/os.h b/util/os.h
new file mode 100644
index 0000000000..2eec51a34e
--- /dev/null
+++ b/util/os.h
@@ -0,0 +1,327 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Minimal operating system abstraction interfaces. 
+
+#ifndef HSA_RUNTIME_CORE_UTIL_OS_H_
+#define HSA_RUNTIME_CORE_UTIL_OS_H_
+
+// NOTE(review): the operands of the next two #include lines are missing from
+// this copy of the file (presumably angle-bracket headers) — confirm against
+// the original header.
+#include
+#include
+#include "utils.h"
+
+namespace rocr {
+namespace os {
+typedef void* LibHandle;
+typedef void* Semaphore;
+typedef void* Mutex;
+typedef void* SharedMutex;
+typedef void* Thread;
+typedef void* EventHandle;
+
+enum class os_t { OS_WIN = 0, OS_LINUX, COUNT };
+/// @brief: Converts an os_t enumerator to its underlying integer value.
+static __forceinline std::underlying_type::type os_index(os_t val) {
+  return std::underlying_type::type(val);
+}
+
+#ifdef _WIN32
+static const os_t current_os = os_t::OS_WIN;
+#elif __linux__
+static const os_t current_os = os_t::OS_LINUX;
+#else
+static_assert(false, "Operating System not detected!");
+#endif
+
+/// @brief: Loads dynamic library based on file name. Return value will be NULL
+/// if failed.
+/// @param: filename(Input), file name of the library.
+/// @return: LibHandle.
+LibHandle LoadLib(std::string filename);
+
+/// @brief: Gets the address of exported symbol. Returns NULL if failed.
+/// @param: lib(Input), library handle to export from.
+/// @param: export_name(Input), the name of the exported symbol.
+/// @return: void*.
+void* GetExportAddress(LibHandle lib, std::string export_name);
+
+/// @brief: Unloads the dynamic library.
+/// @param: lib(Input), library handle which will be unloaded.
+void CloseLib(LibHandle lib);
+
+/// @brief: Lists loaded tool libraries that contain
+/// symbol HSA_AMD_TOOL_PRIORITY
+/// @return: List of library handles
+std::vector GetLoadedToolsLib();
+
+/// @brief: Returns the library's path name.
+/// @param: lib(Input), library handle
+/// @return: Path name of library
+std::string GetLibraryName(LibHandle lib);
+
+/// @brief: Creates a Semaphore, will return NULL if failed.
+/// @param: void.
+/// @return: Semaphore.
+Semaphore CreateSemaphore();
+
+/// @brief: Waits for the semaphore. This is a blocking wait.
+/// If the Semaphore is signalled, this function will return.
+/// @param: sem(Input), handle to the semaphore.
+/// @return: bool (presumably true on success — confirm against the
+/// implementation).
+bool WaitSemaphore(Semaphore sem);
+
+/// @brief: Post/Signal/Wake-up the semaphore
+/// @param: sem(Input), handle to the semaphore.
+/// @return: void.
+void PostSemaphore(Semaphore sem);
+
+/// @brief: Destroys the semaphore.
+/// @param: sem(Input), handle to the semaphore.
+/// @return: void.
+void DestroySemaphore(Semaphore sem);
+
+/// @brief: Creates a mutex, will return NULL if failed.
+/// @param: void.
+/// @return: Mutex.
+Mutex CreateMutex();
+
+/// @brief: Tries to acquire the mutex once; returns true if successful.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TryAcquireMutex(Mutex lock);
+
+/// @brief: Acquires the mutex, if the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireMutex(Mutex lock);
+
+/// @brief: Releases the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseMutex(Mutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroyMutex(Mutex lock);
+
+/// @brief: Creates a shared mutex, will return NULL if failed.
+/// @param: void.
+/// @return: SharedMutex.
+SharedMutex CreateSharedMutex();
+
+/// @brief: Tries to acquire the mutex in exclusive mode once; returns true if successful.
+/// @param: lock(Input), handle to the shared mutex.
+/// @return: bool.
+bool TryAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in exclusive mode, if the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from exclusive mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Tries to acquire the mutex in shared mode once; returns true if successful.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TrySharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in shared mode, if the mutex is in exclusive mode, it will wait until it
+/// is released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool SharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from shared mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void SharedReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroySharedMutex(SharedMutex lock);
+
+/// @brief: Puts current thread to sleep.
+/// @param: delayInMs(Input), time in milliseconds for sleeping.
+/// @return: void.
+void Sleep(int delayInMs);
+
+/// @brief: Puts current thread to sleep.
+/// @param: delayInUs(Input), time in microseconds for sleeping (unit taken
+/// from the parameter name).
+/// @return: void.
+void uSleep(int delayInUs);
+
+/// @brief: Yields current thread.
+/// @param: void.
+/// @return: void.
+void YieldThread();
+
+typedef void (*ThreadEntry)(void*);
+
+/// @brief: Creates a thread; will return NULL if failed.
+/// @param: entry_function(Input), a pointer to the function which the thread
+/// starts from.
+/// @param: entry_argument(Input), a pointer to the argument of the thread
+/// function.
+/// @param: stack_size(Input), size of the thread's stack, 0 by default.
+/// @return: Thread, a handle to thread created.
+Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
+                    uint stack_size = 0);
+
+/// @brief: Destroys the thread.
+/// @param: thread(Input), handle of the thread to be destroyed.
+/// @return: void.
+void CloseThread(Thread thread);
+
+/// @brief: Waits for specific thread to finish, if successful, return true.
+/// @param: thread(Input), handle of the thread to wait for.
+/// @return: bool.
+bool WaitForThread(Thread thread);
+
+/// @brief: Waits for multiple threads to finish, if successful, return true.
+/// @param: threads(Input), a pointer to a list of thread handles.
+/// @param: thread_count(Input), number of threads to be waited on.
+/// @return: bool.
+bool WaitForAllThreads(Thread* threads, uint thread_count);
+
+/// @brief: Determines if environment key is set.
+/// @param: env_var_name(Input), name of the environment variable.
+/// @return: bool, true for binding any value to environment key,
+/// including an empty string. False otherwise
+bool IsEnvVarSet(std::string env_var_name);
+
+/// @brief: Sets the environment variable.
+/// @param: env_var_name(Input), name of the environment variable.
+/// @param: env_var_value(Input), value of the environment variable.
+/// @return: void.
+void SetEnvVar(std::string env_var_name, std::string env_var_value);
+
+/// @brief: Gets the value of an environment variable.
+/// @param: env_var_name(Input), name of the environment variable.
+/// @return: std::string, value of the environment variable, returned as string.
+std::string GetEnvVar(std::string env_var_name);
+
+/// @brief: Gets the process ID.
+/// @param: void
+/// @return: int, process ID returned as int.
+int GetProcessId();
+
+/// @brief: Gets the max virtual memory size accessible to the application.
+/// @param: void.
+/// @return: size_t, size of the accessible memory to the application.
+size_t GetUserModeVirtualMemorySize();
+
+/// @brief: Gets the max physical host system memory size.
+/// @param: void.
+/// @return: size_t, size of the physical host system memory.
+size_t GetUsablePhysicalHostMemorySize();
+
+/// @brief: Gets the virtual memory base address. It is hardcoded to 0.
+/// @param: void.
+/// @return: uintptr_t, always 0.
+uintptr_t GetUserModeVirtualMemoryBase();
+
+/// @brief os event api, create an event
+/// @param: auto_reset whether an event can reset the status automatically
+/// @param: init_state initial state of the event
+/// @return: event handle
+EventHandle CreateOsEvent(bool auto_reset, bool init_state);
+
+/// @brief os event api, destroy an event
+/// @param: event Event handle
+/// @return: whether destroy is correct
+int DestroyOsEvent(EventHandle event);
+
+/// @brief os event api, wait on event
+/// @param: event Event handle
+/// @param: milli_seconds wait time
+/// @return: Indicate success or timeout
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds);
+
+/// @brief os event api, set event state
+/// @param: event Event handle
+/// @return: Whether event set is correct
+int SetOsEvent(EventHandle event);
+
+/// @brief os event api, reset event state
+/// @param: event Event handle
+/// @return: Whether event reset is correct
+int ResetOsEvent(EventHandle event);
+
+/// @brief reads a clock which is deemed to be accurate for elapsed time
+/// measurements, though not necessarily fast to query
+/// @return clock counter value
+uint64_t ReadAccurateClock();
+
+/// @brief retrieves the frequency in Hz of the unit used in ReadAccurateClock.
+/// It does not necessarily reflect the resolution of the clock, but is the
+/// value needed to convert a difference in the clock's counter value to elapsed
+/// seconds. This frequency does not change at runtime.
+/// @return returns the frequency
+uint64_t AccurateClockFrequency();
+
+/// @brief read the system clock which serves as the HSA system clock
+/// counter in KFD.
+uint64_t ReadSystemClock();
+
+/// @brief read the system clock frequency
+uint64_t SystemClockFrequency();
+
+typedef struct cpuid_s {
+  char ManufacturerID[13];  // 12 char, NULL terminated
+  bool mwaitx;
+} cpuid_t;
+
+/// @brief parse CPUID
+/// @param: cpuinfo struct to be filled
+/// @return: bool (presumably true on success — confirm against the
+/// implementation).
+bool ParseCpuID(cpuid_t* cpuinfo);
+
+}  // namespace os
+}  // namespace rocr
+
+#endif  // HSA_RUNTIME_CORE_UTIL_OS_H_
diff --git a/util/simple_heap.h b/util/simple_heap.h
new file mode 100644
index 0000000000..6c7822bcdb
--- /dev/null
+++ b/util/simple_heap.h
@@ -0,0 +1,363 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// A simple best fit memory allocator with eager compaction. Manages block sub-allocation.
+// For use when memory efficiency is more important than allocation speed.
+// O(log n) time.
+
+#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+
+// NOTE(review): the operands of the next three #include lines, and template
+// parameter/argument lists further down (e.g. "template class SimpleHeap",
+// "std::multimap free_list_", "reinterpret_cast(base)"), are missing from
+// this copy of the file — confirm against the original header.
+#include
+#include
+#include
+
+#include "core/util/utils.h"
+
+namespace rocr {
+
+template class SimpleHeap {
+ private:
+  // One contiguous range inside a block, either in use or free.
+  struct Fragment_T {
+    typedef std::multimap::iterator ptr_t;
+    // Position of this fragment in free_list_, or free_list_.end() when the
+    // fragment has no free-list record.
+    ptr_t free_list_entry_;
+    struct {
+      size_t size : 62;
+      // Set by discard(): the containing block must not be reused.
+      bool discard : 1;
+      bool free : 1;
+    };
+
+    Fragment_T(ptr_t Iterator, size_t Len, bool Free)
+        : free_list_entry_(Iterator), size(Len), discard(false), free(Free) {}
+    Fragment_T() = default;
+  };
+
+  // A whole block as handed out by (and returned to) the block allocator.
+  struct Block {
+    uintptr_t base_ptr_;
+    size_t length_;
+
+    Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {}
+    Block() = default;
+  };
+
+  Allocator block_allocator_;
+
+  // Free fragments as (size, address) pairs; alloc() searches it with
+  // lower_bound on the requested size.
+  std::multimap free_list_;
+  // Per-block fragment maps, indexed first by block base address and then by
+  // fragment address.
+  std::map> block_list_;
+  // Completely free blocks kept for reuse.
+  std::deque block_cache_;
+
+  // Size of blocks that are at least partially in use.
+  size_t in_use_size_;
+  // Total size of block cache
+  size_t cache_size_;
+
+  __forceinline bool isFree(const Fragment_T& node) { return node.free; }
+  __forceinline void setUsed(Fragment_T& node) {
+    node.free = false;
+    node.free_list_entry_ = free_list_.end();
+  }
+  __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) {
+    node.free_list_entry_ = Iterator;
+    node.free = true;
+  }
+  // Makes an in-use fragment of Len bytes.
+  __forceinline Fragment_T makeFragment(size_t Len) {
+    return Fragment_T(free_list_.end(), Len, false);
+  }
+  // Makes a free fragment of Len bytes whose free-list record is Iterator.
+  __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) {
+    return Fragment_T(Iterator, Len, true);
+  }
+  // Erases the fragment's free-list record, if it has one.
+  __forceinline void removeFreeListEntry(Fragment_T& node) {
+    if (node.free_list_entry_ != free_list_.end()) {
+      free_list_.erase(node.free_list_entry_);
+      node.free_list_entry_ = free_list_.end();
+    }
+  }
+  // Marks the fragment as discarded and removes it from the free list.
+  __forceinline void discard(Fragment_T& node) {
+    removeFreeListEntry(node);
+    node.discard = true;
+  }
+
+ public:
+  explicit SimpleHeap(const Allocator& BlockAllocator = Allocator())
+      : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {}
+  ~SimpleHeap() {
+    trim();
+    // Leak here may be due to the user. Check is for debugging only.
+    // assert(in_use_size_ == 0 && "Leak in SimpleHeap.");
+  }
+
+  SimpleHeap(const SimpleHeap& rhs) = delete;
+  SimpleHeap(SimpleHeap&& rhs) = delete;
+  SimpleHeap& operator=(const SimpleHeap& rhs) = delete;
+  SimpleHeap& operator=(SimpleHeap&& rhs) = delete;
+
+  // Allocates 'bytes' bytes. Tries, in order: a free fragment of an existing
+  // block, a block from the block cache, and a new block from the block
+  // allocator. Requests larger than default_block_size() get a block that is
+  // discarded immediately so nothing else is sub-allocated from it.
+  void* alloc(size_t bytes) {
+    // Find best fit.
+    uintptr_t base;
+    size_t size;
+    // For bytes >= 2MB, the requested mem should be aligned
+    size_t align_bytes = bytes;
+    const int retry = bytes >= GPU_HUGE_PAGE_SIZE ? 1 : 0;
+    size_t align = bytes >= GPU_HUGE_PAGE_SIZE ? GPU_HUGE_PAGE_SIZE : DEFAULT_GPU_PAGE_SIZE;
+
+    for (int i = 0; i <= retry; i++) {
+      auto free_fragment = free_list_.lower_bound(align_bytes);
+      if (free_fragment == free_list_.end()) break;
+
+      uintptr_t addr = free_fragment->second;
+      size = free_fragment->first;
+
+      assert(size >= bytes && "SimpleHeap: map lower_bound failure.");
+
+      // Find the containing block and fragment
+      auto it = block_list_.upper_bound(addr);
+      it--;
+      auto& frag_map = it->second;
+      const auto& fragment = frag_map.find(addr);
+
+      assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap.");
+      assert(size == fragment->second.size && "Inconsistency in SimpleHeap.");
+
+      // Offset of addr past the previous 'align' boundary (the mask assumes
+      // 'align' is a power of two — TODO confirm the two page-size constants).
+      size_t delta = addr & (align - 1);
+      if (!delta) {
+        // The fragment address is already aligned.
+        base = addr;
+        free_list_.erase(free_fragment);
+        // Sub-allocate from fragment.
+        fragment->second.size = bytes;
+        setUsed(fragment->second);
+        // Record remaining free space.
+        if (size > bytes) {
+          free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
+          frag_map[base + bytes] = makeFragment(free_fragment, size - bytes);
+        }
+      } else {
+        // If this is the first request and the requested size is not enough for alignment,
+        // then request for a bigger hole and do trim.
+        if (i == 0 && size < bytes + align - delta) {
+          align_bytes += align;
+          continue;
+        }
+
+        uintptr_t aligned_base = addr + align - delta;
+        base = aligned_base;
+
+        // Erase the old free list
+        free_list_.erase(free_fragment);
+
+        // fragment 1 - free
+        free_fragment = free_list_.insert(std::make_pair(aligned_base - addr, addr));
+        frag_map[addr] = makeFragment(free_fragment, aligned_base - addr);
+
+        // fragment 2 - used
+        frag_map[base] = makeFragment(bytes);
+
+        // fragment 3 - free
+        if (size > aligned_base - addr + bytes) {
+          free_fragment = free_list_.insert(std::make_pair(size - (aligned_base - addr) - bytes, aligned_base + bytes));
+          frag_map[aligned_base + bytes] = makeFragment(free_fragment, size - (aligned_base - addr) - bytes);
+        }
+      }
+      return reinterpret_cast(base);
+    }
+
+    // No usable fragment, check block cache
+    if (bytes < default_block_size() && !block_cache_.empty()) {
+      const auto& block = block_cache_.back();
+      base = block.base_ptr_;
+      size = block.length_;
+      block_cache_.pop_back();
+      cache_size_ -= size;
+    } else {  // Alloc new block - new block may be larger than default.
+      void* ptr = block_allocator_.alloc(bytes, size);
+      base = reinterpret_cast(ptr);
+      assert(ptr != nullptr && "Block allocation failed, Allocator is expected to throw.");
+    }
+
+    in_use_size_ += size;
+    assert(size >= bytes && "Alloc exceeds block size.");
+    // Sub alloc and insert free region.
+    if (size > bytes) {
+      auto free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
+      block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes);
+    }
+    // Track used region
+    block_list_[base][base] = makeFragment(bytes);
+
+    // Disallow multiple suballocation from large blocks.
+    // Prevents a small allocation from retaining a large block.
+    if (bytes > default_block_size()) {
+      bool err = discardBlock(reinterpret_cast(base));
+      assert(err && "Large block discard failed.");
+    }
+
+    return reinterpret_cast(base);
+  }
+
+  // Frees the fragment that starts at ptr and merges it with free neighbours.
+  // A block left with a single fragment is handed back to the block allocator
+  // (if discarded) or moved to the block cache.
+  // Returns false if ptr is not the start of an in-use fragment; true
+  // otherwise, including for nullptr.
+  bool free(void* ptr) {
+    if (ptr == nullptr) return true;
+
+    uintptr_t base = reinterpret_cast(ptr);
+
+    // Find fragment and validate.
+    auto frag_map_it = block_list_.upper_bound(base);
+    if (frag_map_it == block_list_.begin()) return false;
+    frag_map_it--;
+    auto& frag_map = frag_map_it->second;
+    auto fragment = frag_map.find(base);
+    if (fragment == frag_map.end() || isFree(fragment->second)) return false;
+
+    bool discard = fragment->second.discard;
+
+    // Merge lower
+    if (fragment != frag_map.begin()) {
+      auto lower = fragment;
+      lower--;
+      if (isFree(lower->second)) {
+        removeFreeListEntry(lower->second);
+        lower->second.size += fragment->second.size;
+        frag_map.erase(fragment);
+        fragment = lower;
+      }
+    }
+
+    // Merge upper
+    {
+      auto upper = fragment;
+      upper++;
+      if ((upper != frag_map.end()) && isFree(upper->second)) {
+        removeFreeListEntry(upper->second);
+        fragment->second.size += upper->second.size;
+        frag_map.erase(upper);
+      }
+    }
+
+    // Release whole free blocks.
+    if (frag_map.size() == 1) {
+      Block block(fragment->first, fragment->second.size);
+      block_list_.erase(frag_map_it);
+
+      // Discard or add to the block cache.
+      if (discard) {
+        block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_);
+      } else {
+        block_cache_.push_back(block);
+        cache_size_ += block.length_;
+        in_use_size_ -= block.length_;
+      }
+
+      balance();
+
+      // Don't publish free space since block was moved to the cache.
+      return true;
+    }
+
+    // Don't report free memory if discarding the fragment.
+    // NOTE(review): on this path the fragment's 'free' flag is left unchanged
+    // unless it was merged into a free lower neighbour. It looks like a
+    // discarded block holding several in-use fragments could then never
+    // collapse to a single fragment and be released — confirm.
+    if (discard) return true;
+
+    // Report free fragment
+    const auto& freeEntry =
+        free_list_.insert(std::make_pair(size_t(fragment->second.size), fragment->first));
+    setFree(fragment->second, freeEntry);
+
+    return true;
+  }
+
+  // Returns the oldest cached blocks to the block allocator while more than
+  // one block is cached and the cache exceeds twice the in-use size.
+  void balance() {
+    // Release old blocks when over cache limit.
+    while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) {
+      const auto& block = block_cache_.front();
+      block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_);
+      cache_size_ -= block.length_;
+      block_cache_.pop_front();
+    }
+  }
+
+  // Returns every cached block to the block allocator and empties the cache.
+  void trim() {
+    for (const auto& block : block_cache_)
+      block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_);
+    block_cache_.clear();
+    cache_size_ = 0;
+  }
+
+  size_t cache_size() const { return cache_size_; }
+
+  size_t default_block_size() const { return block_allocator_.block_size(); }
+
+  // Prevent reuse of the block containing ptr. No further fragments will be allocated from the
+  // block and the block will not be added to the block cache when it is free.
+  // Returns false if ptr does not fall inside a tracked block; true otherwise,
+  // including for nullptr and for blocks that are already discarded.
+  bool discardBlock(void* ptr) {
+    if (ptr == nullptr) return true;
+
+    uintptr_t base = reinterpret_cast(ptr);
+
+    // Find block and validate.
+    auto frag_map_it = block_list_.upper_bound(base);
+    if (frag_map_it == block_list_.begin()) return false;
+    frag_map_it--;
+    auto& frag_map = frag_map_it->second;
+    if ((base < frag_map.begin()->first) ||
+        (frag_map.rbegin()->first + frag_map.rbegin()->second.size <= base))
+      return false;
+
+    // Is block already discarded?
+    if (frag_map.begin()->second.discard) return true;
+
+    // Mark all fragments for discard and compute block size. Removes freelist records for all
+    // fragments in the block.
+    size_t size = 0;
+    for (auto& frag : frag_map) {
+      discard(frag.second);
+      size += frag.second.size;
+    }
+
+    // Remove discarded block from in-use tracking and rebalance the block cache.
+    in_use_size_ -= size;
+    balance();
+
+    return true;
+  }
+};
+
+}  // namespace rocr
+
+#endif  // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
diff --git a/util/small_heap.cpp b/util/small_heap.cpp
new file mode 100644
index 0000000000..9fe5da5fab
--- /dev/null
+++ b/util/small_heap.cpp
@@ -0,0 +1,185 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "small_heap.h"
+
+namespace rocr {
+
+// Inserts node into freelist after place.
+// Assumes node will not be an end of the list (list has guard nodes).
+void SmallHeap::insertafter(SmallHeap::iterator_t place, SmallHeap::iterator_t node) {
+  assert(place->first < node->first && "Order violation");
+  assert(isfree(place->second) && "Freelist operation error.");
+  iterator_t next = place->second.next;
+  node->second.next = next;
+  node->second.prior = place;
+  place->second.next = node;
+  next->second.prior = node;
+}
+
+// Removes node from freelist.
+// Assumes node will not be an end of the list (list has guard nodes).
+void SmallHeap::remove(SmallHeap::iterator_t node) {
+  assert(isfree(node->second) && "Freelist operation error.");
+  node->second.prior->second.next = node->second.next;
+  node->second.next->second.prior = node->second.prior;
+  // Must come last: the two lines above still read the node's own links.
+  setused(node->second);
+}
+
+// Merges two free nodes when high starts exactly where low ends.
+// Returns high if merge failed or the merged node.
+SmallHeap::memory_t::iterator SmallHeap::merge(SmallHeap::memory_t::iterator low,
+                                               SmallHeap::memory_t::iterator high) {
+  assert(isfree(low->second) && "Merge with allocated block");
+  assert(isfree(high->second) && "Merge with allocated block");
+
+  // Not adjacent in memory: nothing to merge.
+  if ((char*)low->first + low->second.len != (char*)high->first) return high;
+
+  assert(!islastfree(high->second) && "Illegal merge.");
+
+  // Absorb high into low and unlink high from the free list.
+  low->second.len += high->second.len;
+  low->second.next = high->second.next;
+  high->second.next->second.prior = low;
+
+  memory.erase(high);
+  return low;
+}
+
+// Returns the allocation starting at ptr to the heap and merges it with
+// adjacent free nodes. nullptr is ignored; an address the heap does not track
+// asserts and is otherwise ignored.
+void SmallHeap::free(void* ptr) {
+  if (ptr == nullptr) return;
+
+  auto iterator = memory.find(ptr);
+
+  // Check for illegal free
+  // NOTE(review): only unknown addresses are rejected here; a node that is
+  // already on the free list is not detected — presumably callers never free
+  // the same pointer twice; confirm.
+  if (iterator == memory.end()) {
+    assert(false && "Illegal free.");
+    return;
+  }
+
+  // Return memory to total and link node into free list
+  total_free += iterator->second.len;
+
+  // Could also traverse the free list which might be faster in some cases.
+  // Walk down the address-ordered map to the nearest free node below.
+  auto before = iterator;
+  before--;
+  while (!isfree(before->second)) before--;
+  assert(before->second.next->first > iterator->first && "Inconsistency in small heap.");
+  insertafter(before, iterator);
+
+  // Attempt compaction
+  iterator = merge(before, iterator);
+  merge(iterator, iterator->second.next);
+
+  // Forget ptr as a high allocation (alloc_high records its results in 'high').
+  high.erase(ptr);
+}
+
+// First-fit allocation walking the free list forward from firstfree().
+// Returns nullptr when bytes is 0, exceeds total_free, or no single free node
+// is large enough.
+void* SmallHeap::alloc(size_t bytes) {
+  // Is enough memory available?
+  if ((bytes > total_free) || (bytes == 0)) return nullptr;
+
+  iterator_t current;
+
+  // Walk the free list and allocate at first fitting location
+  current = firstfree();
+  while (!islastfree(current->second)) {
+    if (bytes <= current->second.len) {
+      // Decrement from total
+      total_free -= bytes;
+
+      // Split node
+      if (bytes != current->second.len) {
+        // The tail of the node stays free as a new node linked after it.
+        void* remaining = (char*)current->first + bytes;
+        Node& node = memory[remaining];
+        node.len = current->second.len - bytes;
+        current->second.len = bytes;
+        insertafter(current, memory.find(remaining));
+      }
+
+      remove(current);
+      return current->first;
+    }
+    current = current->second.next;
+  }
+  assert(current->second.len == 0 && "Freelist corruption.");
+
+  // Can't service the request due to fragmentation
+  return nullptr;
+}
+
+// Like alloc(), but walks the free list backward from lastfree() and carves
+// the allocation from the top end of the fitting node. The returned address
+// is recorded in 'high'.
+void* SmallHeap::alloc_high(size_t bytes) {
+  // Is enough memory available?
+  if ((bytes > total_free) || (bytes == 0)) return nullptr;
+
+  iterator_t current;
+
+  // Walk the free list and allocate at first fitting location
+  current = lastfree();
+  while (!isfirstfree(current->second)) {
+    if (bytes <= current->second.len) {
+      // Decrement from total
+      total_free -= bytes;
+
+      void* alloc;
+      // Split node
+      if (bytes != current->second.len) {
+        // The free node keeps its address and shrinks; the new used node
+        // takes the top 'bytes' of it and is never linked into the free list.
+        alloc = (char*)current->first + current->second.len - bytes;
+        current->second.len -= bytes;
+        Node& node = memory[alloc];
+        node.len = bytes;
+        setused(node);
+      } else {
+        alloc = current->first;
+        remove(current);
+      }
+
+      high.insert(alloc);
+      return alloc;
+    }
+    current = current->second.prior;
+  }
+  assert(current->second.len == 0 && "Freelist corruption.");
+
+  // Can't service the request due to fragmentation
+  return nullptr;
+}
+
+}  // namespace rocr
diff --git a/util/small_heap.h b/util/small_heap.h
new file mode 100644
index 0000000000..e1f5d7bdeb
--- /dev/null
+++ b/util/small_heap.h
@@ -0,0 +1,131 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open
Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// A simple first fit memory allocator with eager compaction. For use with few +// items (where list iteration is faster than trees). 
namespace rocr {

/// @brief: A simple first-fit memory allocator with eager compaction, meant for
/// a small number of blocks (where walking a list beats a tree). Not thread
/// safe.
///
/// Every block is a Node stored in `memory`, keyed by its start address. Two
/// zero-length sentinel nodes are created by the constructor: one at address 0
/// (head of the free list) and one at the all-ones address (tail of the free
/// list). Free blocks are chained between them through `next` / `prior`.
class SmallHeap {
 private:
  struct Node;
  typedef std::map<void*, Node> memory_t;
  typedef memory_t::iterator iterator_t;

  struct Node {
    size_t len;        // Size of the block in bytes (0 for the sentinels).
    iterator_t next;   // Next free block; memory.begin() marks a used block.
    iterator_t prior;  // Previous free block.
  };

  SmallHeap(const SmallHeap& rhs) = delete;
  SmallHeap& operator=(const SmallHeap& rhs) = delete;

  void* const pool;     // Base address managed by this heap.
  const size_t length;  // Total size of the pool in bytes.

  size_t total_free;     // Sum of the lengths of all free blocks.
  memory_t memory;       // All blocks (free and used) plus the two sentinels.
  std::set<void*> high;  // Addresses recorded by alloc_high, plus a sentinel.

  // A node is "used" when its next link points at memory.begin().
  __forceinline bool isfree(const Node& node) const { return node.next != memory.begin(); }
  // memory.end() in `next` / `prior` marks the tail / head of the free list.
  __forceinline bool islastfree(const Node& node) const { return node.next == memory.end(); }
  __forceinline bool isfirstfree(const Node& node) const { return node.prior == memory.end(); }
  __forceinline void setlastfree(Node& node) { node.next = memory.end(); }
  __forceinline void setfirstfree(Node& node) { node.prior = memory.end(); }
  __forceinline void setused(Node& node) { node.next = memory.begin(); }

  // First / last real entries of the free list, reached through the sentinels.
  __forceinline iterator_t firstfree() { return memory.begin()->second.next; }
  __forceinline iterator_t lastfree() { return memory.rbegin()->second.prior; }
  void insertafter(iterator_t place, iterator_t node);
  void remove(iterator_t node);
  iterator_t merge(iterator_t low, iterator_t high);

 public:
  /// @brief: Creates an empty heap that manages no memory. No sentinel nodes
  /// are created, so `memory` and `high` stay empty.
  SmallHeap() : pool(nullptr), length(0), total_free(0) {}

  /// @brief: Creates a heap managing `length` bytes starting at `base`.
  /// @param: base(Input), start of the pool; must not be null or all-ones.
  /// @param: length(Input), size of the pool in bytes.
  SmallHeap(void* base, size_t length)
      : pool(base), length(length), total_free(length) {
    assert(pool != nullptr && "Invalid base address.");
    assert(pool != (void*)0xFFFFFFFFFFFFFFFFull && "Invalid base address.");
    assert((char*)pool + length != (char*)0xFFFFFFFFFFFFFFFFull && "Invalid pool bounds.");

    // Head sentinel, the single free block covering the pool, tail sentinel.
    Node& start = memory[nullptr];
    Node& node = memory[pool];
    Node& end = memory[(void*)0xFFFFFFFFFFFFFFFFull];

    start.len = 0;
    start.next = memory.find(pool);
    setfirstfree(start);

    node.len = length;
    node.prior = memory.begin();
    node.next = --memory.end();

    end.len = 0;
    end.prior = start.next;
    setlastfree(end);

    // Sentinel so that `high` is never empty for a constructed pool.
    high.insert((void*)0xFFFFFFFFFFFFFFFFull);
  }

  void* alloc(size_t bytes);
  void* alloc_high(size_t bytes);
  void free(void* ptr);

  void* base() const { return pool; }
  size_t size() const { return length; }
  size_t remaining() const { return total_free; }

  /// @brief: Lowest address currently recorded in `high`. When nothing was
  /// recorded this is the all-ones sentinel.
  /// A default-constructed heap has an empty `high` set; dereferencing
  /// begin() there would be undefined behaviour, so the same sentinel value
  /// is returned for it.
  void* high_split() const {
    return high.empty() ? (void*)0xFFFFFFFFFFFFFFFFull : *high.begin();
  }
};

}  // namespace rocr
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/util/timer.h" + +namespace rocr { +namespace timer { + +accurate_clock::init::init() { + freq = os::AccurateClockFrequency(); + accurate_clock::period_ns = 1e9 / double(freq); +} + +// Calibrates the fast clock using the accurate clock. 
+fast_clock::init::init() { + typedef accurate_clock clock; + clock::duration delay(std::chrono::milliseconds(1)); + + // calibrate clock + fast_clock::raw_rep min = 0; + clock::duration elapsed; + + do { + elapsed = clock::duration::max(); + + for (int t = 0; t < 10; t++) { + fast_clock::raw_rep r1, r2; + clock::time_point t0, t1, t2, t3; + + t0 = clock::now(); + std::atomic_signal_fence(std::memory_order_acq_rel); + r1 = fast_clock::raw_now(); + std::atomic_signal_fence(std::memory_order_acq_rel); + t1 = clock::now(); + std::atomic_signal_fence(std::memory_order_acq_rel); + + do { + t2 = clock::now(); + } while (t2 - t1 < delay); + + std::atomic_signal_fence(std::memory_order_acq_rel); + r2 = fast_clock::raw_now(); + std::atomic_signal_fence(std::memory_order_acq_rel); + t3 = clock::now(); + + // If elapsed time is shorter than last recorded time and both the start + // and end times are confirmed correlated then record the clock readings. + // This protects against inaccuracy due to thread switching + if ((t3 - t1 < elapsed) && ((t1 - t0) * 10 < (t2 - t1)) && + ((t3 - t2) * 10 < (t2 - t1))) { + elapsed = t3 - t1; + min = r2 - r1; + } + } + delay += delay; + } while (min < 1000); + + fast_clock::freq = double(min) / duration_in_seconds(elapsed); + fast_clock::period_ps = 1e12 / fast_clock::freq; + // printf("Timer setup took %f ms\n", duration_in_seconds(elapsed)*1000.0f); + // printf("Fast clock frequency: %f MHz\n", double(fast_clock::freq)/1e6); +} + +double accurate_clock::period_ns; +accurate_clock::raw_frequency accurate_clock::freq; +accurate_clock::init accurate_clock::accurate_clock_init; + +double fast_clock::period_ps; +fast_clock::raw_frequency fast_clock::freq; +fast_clock::init fast_clock::fast_clock_init; +} // namespace timer +} // namespace rocr diff --git a/util/timer.h b/util/timer.h new file mode 100644 index 0000000000..155a11a393 --- /dev/null +++ b/util/timer.h @@ -0,0 +1,173 @@ 
+//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_UTIL_TIMER_H_ +#define HSA_RUNTIME_CORE_UTIL_TIMER_H_ + +#include "core/util/utils.h" +#include "core/util/os.h" +#include +#include +#include + +namespace rocr { +namespace timer { + +// Needed to patch around a mixed arithmetic bug in MSVC's duration_cast as of +// VS 2013. +template +struct wide_type { + typedef double type; +}; +template <> +struct wide_type { + typedef uintmax_t type; +}; +template <> +struct wide_type { + typedef intmax_t type; +}; + +template +static __forceinline To + duration_cast(const std::chrono::duration& d) { + typedef typename wide_type::value, + std::is_signed::value>::type wide; + typedef std::chrono::duration unit_convert_t; + + unit_convert_t temp = std::chrono::duration_cast(d); + return To(static_cast(temp.count())); +} +// End patch + +template +static __forceinline double duration_in_seconds( + std::chrono::duration delta) { + typedef std::chrono::duration> seconds; + return seconds(delta).count(); +} + +template +static __forceinline rep duration_from_seconds(double delta) { + typedef std::chrono::duration> seconds; + return std::chrono::duration_cast(seconds(delta)); +} + +// Provices a C++11 standard clock interface to the os::AccurateClock functions +class accurate_clock { + public: + typedef double rep; + typedef std::nano period; + typedef std::chrono::duration duration; + typedef std::chrono::time_point time_point; + + static const bool is_steady = true; + + static __forceinline time_point now() { + return time_point(duration(raw_now() * period_ns)); + } + + // These two extra APIs and types let us use clocks without conversion to the + // arbitrary period unit + typedef uint64_t raw_rep; + typedef uint64_t raw_frequency; + + static __forceinline raw_rep raw_now() { return os::ReadAccurateClock(); } + static __forceinline raw_frequency raw_freq() { return freq; } + + private: + static double period_ns; + 
static raw_frequency freq; + + class init { + public: + init(); + }; + static init accurate_clock_init; +}; + +// Provices a C++11 standard clock interface to the lowest latency approximate +// clock +class fast_clock { + public: + typedef double rep; + typedef std::pico period; + typedef std::chrono::duration duration; + typedef std::chrono::time_point time_point; + + static const bool is_steady = true; + + static __forceinline time_point now() { + return time_point(duration(raw_now() * period_ps)); + } + + // These two extra APIs and types let us use clocks without conversion to the + // arbitrary period unit + typedef uint64_t raw_rep; + typedef double raw_frequency; + +#if defined(__x86_64__) || defined(_M_X64) + static __forceinline raw_rep raw_now() { return __rdtsc(); } + static __forceinline raw_frequency raw_freq() { return freq; } +#else + static __forceinline raw_rep raw_now() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return (raw_rep(ts.tv_sec) * 1000000000 + raw_rep(ts.tv_nsec)); + } + static __forceinline raw_frequency raw_freq() { return 1.e-9; } +#endif + + private: + static double period_ps; + static raw_frequency freq; + + class init { + public: + init(); + }; + static init fast_clock_init; +}; +} // namespace timer +} // namespace rocr + +#endif diff --git a/util/utils.h b/util/utils.h new file mode 100644 index 0000000000..c2fa8ef32c --- /dev/null +++ b/util/utils.h @@ -0,0 +1,424 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
// Generally useful utility functions

// Compiler intrinsics are pulled in at global scope. Including a system header
// from inside a namespace would declare its contents in that namespace the
// first time it is seen and nowhere at all on later inclusions.
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
#include <x86intrin.h>
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include "intrin.h"
#endif

namespace rocr {
extern FILE* log_file;
extern uint8_t log_flags[8];

typedef unsigned int uint;
typedef uint64_t uint64;

#if defined(__GNUC__)

// 2MB huge page size
#define GPU_HUGE_PAGE_SIZE (2 << 20)

// 4KB page size
#define DEFAULT_GPU_PAGE_SIZE (1 << 12)

// Map the MSVC spellings used throughout the code base onto GCC attributes.
#define __forceinline __inline__ __attribute__((always_inline))
#define __declspec(x) __attribute__((x))
#undef __stdcall
#define __stdcall  // __attribute__((__stdcall__))
#define __ALIGNED__(x) __attribute__((aligned(x)))

void log_printf(const char* file, int line, const char* format, ...);

/// @brief: MSVC-compatible aligned allocation on top of the C library.
/// @param: size(Input), number of bytes to allocate.
/// @param: alignment(Input), required alignment of the returned block.
/// @return: void*, the block, or NULL on failure. Release with _aligned_free.
static __forceinline void* _aligned_malloc(size_t size, size_t alignment) {
#ifdef _ISOC11_SOURCE
  return aligned_alloc(alignment, size);
#else
  void* mem = NULL;
  if (0 != posix_memalign(&mem, alignment, size)) return NULL;
  return mem;
#endif
}

/// @brief: Releases a block obtained from _aligned_malloc.
static __forceinline void _aligned_free(void* ptr) { free(ptr); }

#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#define __ALIGNED__(x) __declspec(align(x))
#if (_MSC_VER < 1800)  // < VS 2013
static __forceinline unsigned long long int strtoull(const char* str, char** endptr, int base) {
  return static_cast<unsigned long long>(_strtoui64(str, endptr, base));
}
#endif
#if (_MSC_VER < 1900)  // < VS 2015
#define thread_local __declspec(thread)
#endif
#else
#error "Compiler and/or processor not identified."
#endif

#define STRING2(x) #x
#define STRING(x) STRING2(x)

#define PASTE2(x, y) x##y
#define PASTE(x, y) PASTE2(x, y)

// debug_warning_n(exp, limit): in debug builds, prints a warning to stderr when
// exp is false, at most `limit` times per call site (0 means no limit).
#ifdef NDEBUG
#define debug_warning_n(exp, limit) \
  do {                              \
  } while (false)
#else
#define debug_warning_n(exp, limit)                                                          \
  do {                                                                                       \
    static std::atomic<int> count(0);                                                        \
    if (!(exp) && (limit == 0 || count < limit)) {                                           \
      fprintf(stderr, "Warning: " STRING(exp) " in %s, " __FILE__ ":" STRING(__LINE__) "\n", \
              __PRETTY_FUNCTION__);                                                          \
      count++;                                                                               \
    }                                                                                        \
  } while (false)
#endif
#define debug_warning(exp) debug_warning_n((exp), 0)

// debug_print(fmt, ...): fprintf to stderr in debug builds, nothing otherwise.
#ifdef NDEBUG
#define debug_print(fmt, ...) \
  do {                        \
  } while (false)
#else
#define debug_print(fmt, ...)            \
  do {                                   \
    fprintf(stderr, fmt, ##__VA_ARGS__); \
  } while (false)
#endif

// `ifdebug stmt;` runs stmt only in debug builds.
#ifdef NDEBUG
#define ifdebug if (false)
#else
#define ifdebug if (true)
#endif

// __FILE__ without its directory part.
#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)

// NOTE(review): the expansion ends in `while (false);`, so `LogPrint(...);`
// yields an extra empty statement and cannot sit directly between an `if` and
// its `else`. Left unchanged because call sites written without a trailing
// semicolon would stop compiling if it were removed - confirm before changing.
#define LogPrint(flag, format, ...)                                    \
  do {                                                                 \
    if (hsa_flag_isset64(log_flags, flag))                             \
      rocr::log_printf(__FILENAME__, __LINE__, format, ##__VA_ARGS__); \
  } while (false);

// A macro to disallow the copy and move constructor and operator= functions
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName&) = delete;      \
  TypeName(TypeName&&) = delete;           \
  void operator=(const TypeName&) = delete; \
  void operator=(TypeName&&) = delete;

/// @brief: Runs a callable when it goes out of scope unless Dismiss() was
/// called. Copying or assigning from another guard transfers the pending
/// release: the source guard is dismissed.
template <typename lambda>
class ScopeGuard {
 public:
  explicit __forceinline ScopeGuard(const lambda& release)
      : release_(release), dismiss_(false) {}

  // Transfer constructor. Members are initialised directly rather than
  // default-constructed and then assigned, since closure types are neither
  // default constructible nor assignable.
  ScopeGuard(ScopeGuard& rhs) : release_(rhs.release_), dismiss_(rhs.dismiss_) {
    rhs.dismiss_ = true;
  }

  __forceinline ~ScopeGuard() {
    if (!dismiss_) release_();
  }

  // Transfer assignment. The release previously held by *this is dropped
  // without being run. Self-assignment is a no-op (it must not dismiss).
  __forceinline ScopeGuard& operator=(ScopeGuard& rhs) {
    if (this != &rhs) {
      dismiss_ = rhs.dismiss_;
      release_ = rhs.release_;
      rhs.dismiss_ = true;
    }
    return *this;
  }

  /// @brief: Cancels the release; the destructor will do nothing.
  __forceinline void Dismiss() { dismiss_ = true; }

 private:
  lambda release_;
  bool dismiss_;
};

/// @brief: Creates a ScopeGuard for the given callable, deducing its type.
template <class lambda>
static __forceinline ScopeGuard<lambda> MakeScopeGuard(lambda rel) {
  return ScopeGuard<lambda>(rel);
}

// MAKE_SCOPE_GUARD(callable) declares an anonymous guard in the current scope;
// MAKE_NAMED_SCOPE_GUARD(name, callable) declares one that can be dismissed.
#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \
  auto lname = __VA_ARGS__;                        \
  ScopeGuard<decltype(lname)> sname(lname);
#define MAKE_SCOPE_GUARD(...)                                   \
  MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), \
                          PASTE(scopeGuard, __COUNTER__), __VA_ARGS__)
#define MAKE_NAMED_SCOPE_GUARD(name, ...) \
  MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), name, __VA_ARGS__)

/// @brief: Finds out the min one of two inputs, input must support ">"
/// operator.
/// @param: a(Input), a reference to type T.
/// @param: b(Input), a reference to type T.
/// @return: T.
template <class T>
static __forceinline T Min(const T& a, const T& b) {
  return (a > b) ? b : a;
}

/// @brief: Minimum of three or more inputs of the same leading type.
template <class T, class... Arg>
static __forceinline T Min(const T& a, const T& b, Arg... args) {
  return Min(a, Min(b, args...));
}

/// @brief: Find out the max one of two inputs, input must support ">" operator.
/// @param: a(Input), a reference to type T.
/// @param: b(Input), a reference to type T.
/// @return: T.
template <class T>
static __forceinline T Max(const T& a, const T& b) {
  return (b > a) ? b : a;
}

/// @brief: Maximum of three or more inputs of the same leading type.
template <class T, class... Arg>
static __forceinline T Max(const T& a, const T& b, Arg... args) {
  return Max(a, Max(b, args...));
}

/// @brief: Functor that deletes the object it is given, e.g. for use with
/// algorithms over containers of owning raw pointers.
/// @param: ptr(Input), a pointer to memory space that was allocated with new.
/// @return: void.
struct DeleteObject {
  template <typename T>
  void operator()(const T* ptr) const {
    delete ptr;
  }
};

/// @brief: Checks if a value is power of two, if it is, return true. Note that
/// 0 also yields true, so callers that must exclude 0 have to test for it.
/// @param: val(Input), the data to be checked.
/// @return: bool.
template <typename T>
static __forceinline bool IsPowerOfTwo(T val) {
  return (val & (val - 1)) == 0;
}

}  // namespace rocr
namespace rocr {

/// @brief: Calculates the floor value aligned based on parameter of alignment.
/// If value is at the boundary of alignment, it is unchanged.
/// @param: value(Input), value to be calculated.
/// @param: alignment(Input), alignment value; must not be 0.
/// @return: T.
template <typename T>
static __forceinline T AlignDown(T value, size_t alignment) {
  assert(alignment != 0 && "Alignment must be non-zero.");
  return (T)((value / alignment) * alignment);
}

/// @brief: Same as the value overload of AlignDown, but for a pointer.
/// @param: value(Input), pointer to type T.
/// @param: alignment(Input), alignment value; must not be 0.
/// @return: T*, pointer to type T.
template <typename T>
static __forceinline T* AlignDown(T* value, size_t alignment) {
  // Work on the unsigned integer image of the address.
  return reinterpret_cast<T*>(AlignDown(reinterpret_cast<uintptr_t>(value), alignment));
}

/// @brief: Calculates the ceiling value aligned based on parameter of
/// alignment.
/// If value is at the boundary of alignment, it is unchanged.
/// @param: value(Input), value to be calculated.
/// @param: alignment(Input), alignment value; must not be 0.
/// @return: T.
template <typename T>
static __forceinline T AlignUp(T value, size_t alignment) {
  return AlignDown((T)(value + alignment - 1), alignment);
}

/// @brief: Same as the value overload of AlignUp, but for a pointer.
/// @param: value(Input), pointer to type T.
/// @param: alignment(Input), alignment value; must not be 0.
/// @return: T*, pointer to type T.
template <typename T>
static __forceinline T* AlignUp(T* value, size_t alignment) {
  // The bump is done on the integer image of the address so that no pointer
  // arithmetic past the end of the pointed-to object is performed.
  return reinterpret_cast<T*>(
      AlignDown(reinterpret_cast<uintptr_t>(value) + alignment - 1, alignment));
}

/// @brief: Checks if the input value is at the boundary of alignment.
/// @param: value(Input), value to be checked.
/// @param: alignment(Input), alignment value; must not be 0.
/// @return: bool, true if value is a multiple of alignment.
template <typename T>
static __forceinline bool IsMultipleOf(T value, size_t alignment) {
  return (AlignUp(value, alignment) == value);
}

}  // namespace rocr
namespace rocr {

/// @brief: Checks if the address held by a pointer is a multiple of alignment.
/// @param: value(Input), pointer to type T.
/// @param: alignment(Input), alignment value; must not be 0.
/// @return: bool.
template <typename T>
static __forceinline bool IsMultipleOf(T* value, size_t alignment) {
  return (reinterpret_cast<uintptr_t>(value) % alignment) == 0;
}

/// @brief: Smallest power of two that is greater than or equal to value.
/// @param: value(Input), 0 yields 1.
/// @return: uint32_t. Wraps to 0 when value is larger than 2^31.
static __forceinline uint32_t NextPow2(uint32_t value) {
  if (value == 0) return 1;
  // Smear the highest set bit of (value - 1) into every lower bit, then add 1.
  uint32_t v = value - 1;
  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;
  return v + 1;
}

/// @brief: 64-bit variant of NextPow2.
/// @param: value(Input), 0 yields 1.
/// @return: uint64_t. Wraps to 0 when value is larger than 2^63.
static __forceinline uint64_t NextPow2(uint64_t value) {
  if (value == 0) return 1;
  uint64_t v = value - 1;
  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;
  v |= v >> 32;
  return v + 1;
}

/// @brief: True if str is an empty C string. A null pointer counts as empty.
static __forceinline bool strIsEmpty(const char* str) noexcept {
  return str == nullptr || str[0] == '\0';
}

/// @brief: Removes leading whitespace (per the classic "C" locale) in place.
/// @return: std::string&, the same string that was passed in.
static __forceinline std::string& ltrim(std::string& s) {
  auto it = std::find_if(s.begin(), s.end(),
                         [](char c) { return !std::isspace(c, std::locale::classic()); });
  s.erase(s.begin(), it);
  return s;
}

/// @brief: Removes trailing whitespace (per the classic "C" locale) in place.
/// @return: std::string&, the same string that was passed in.
static __forceinline std::string& rtrim(std::string& s) {
  auto it = std::find_if(s.rbegin(), s.rend(),
                         [](char c) { return !std::isspace(c, std::locale::classic()); });
  s.erase(it.base(), s.end());
  return s;
}

/// @brief: Removes leading and trailing whitespace in place.
/// @return: std::string&, the same string that was passed in.
static __forceinline std::string& trim(std::string& s) { return ltrim(rtrim(s)); }

}  // namespace rocr

/// @brief: Extracts bits [highBit:lowBit] of an integer, shifted down so that
/// lowBit lands in bit 0, truncated to 32 bits.
template <uint32_t lowBit, uint32_t highBit, typename T>
static __forceinline uint32_t BitSelect(T p) {
  static_assert(sizeof(T) <= sizeof(uintptr_t), "Type out of range.");
  static_assert(highBit < sizeof(uintptr_t) * 8, "Bit index out of range.");

  uintptr_t ptr = p;
  // When highBit is the top bit there is nothing to mask off, and building the
  // mask would shift by the full width.
  if (highBit != (sizeof(uintptr_t) * 8 - 1))
    return (uint32_t)((ptr & ((1ull << (highBit + 1)) - 1)) >> lowBit);
  else
    return (uint32_t)(ptr >> lowBit);
}

/// @brief: Bits [15:8] of the address.
inline uint32_t PtrLow16Shift8(const void* p) {
  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
  return (uint32_t)((ptr & 0xFFFFULL) >> 8);
}

/// @brief: The address shifted right by 16, truncated to 32 bits.
inline uint32_t PtrHigh64Shift16(const void* p) {
  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
  return (uint32_t)((ptr & 0xFFFFFFFFFFFF0000ULL) >> 16);
}

/// @brief: Bits [39:8] of the address.
inline uint32_t PtrLow40Shift8(const void* p) {
  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
  return (uint32_t)((ptr & 0xFFFFFFFFFFULL) >> 8);
}

/// @brief: Bits [63:40] of the address.
inline uint32_t PtrHigh64Shift40(const void* p) {
  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
  return (uint32_t)((ptr & 0xFFFFFF0000000000ULL) >> 40);
}

/// @brief: Bits [47:40] of the address.
static inline uint8_t Ptr48High8(const void* p) {
  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
  return (uint8_t)((ptr & 0xFF0000000000ULL) >> 40);
}

/// @brief: Bits [39:8] of the address. Debug builds assert that the address
/// fits in 48 bits and has its low 8 bits clear.
static inline uint32_t Ptr48Low32(const void* p) {
  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
  assert((ptr & 0xFFFFFFFFFF00ULL) == ptr);
  return (uint32_t)((ptr & 0xFFFFFFFFFFULL) >> 8);
}

/// @brief: Low 32 bits of the address.
inline uint32_t PtrLow32(const void* p) {
  return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p));
}

/// @brief: High 32 bits of the address when HSA_LARGE_MODEL is defined,
/// otherwise 0.
// NOTE(review): a 64-bit build that does not define HSA_LARGE_MODEL gets 0
// here - confirm the macro is set by this project's build.
inline uint32_t PtrHigh32(const void* p) {
  uint32_t ptr = 0;
#ifdef HSA_LARGE_MODEL
  ptr = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p) >> 32);
#endif
  return ptr;
}

/// @brief: Upper 32 bits of a 64-bit value.
inline uint32_t HighPart(uint64_t value) {
  return static_cast<uint32_t>((value & 0xFFFFFFFF00000000) >> 32);
}

/// @brief: Lower 32 bits of a 64-bit value.
inline uint32_t LowPart(uint64_t value) {
  return static_cast<uint32_t>(value & 0x00000000FFFFFFFF);
}
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifdef _WIN32 // Are we compiling for windows? 
#ifdef _WIN32  // Windows-only backend of the rocr::os abstraction.
namespace rocr {
namespace os {

// The opaque handle types declared by the OS abstraction header are
// reinterpreted as Win32 handles throughout this file, so their sizes must
// match.
static_assert(sizeof(LibHandle) == sizeof(HMODULE), "OS abstraction size mismatch");
static_assert(sizeof(LibHandle) == sizeof(::HANDLE), "OS abstraction size mismatch");
static_assert(sizeof(Semaphore) == sizeof(::HANDLE), "OS abstraction size mismatch");
static_assert(sizeof(Mutex) == sizeof(::HANDLE), "OS abstraction size mismatch");
static_assert(sizeof(Thread) == sizeof(::HANDLE), "OS abstraction size mismatch");
static_assert(sizeof(EventHandle) == sizeof(::HANDLE), "OS abstraction size mismatch");

// Loads a dynamic library by file name.
LibHandle LoadLib(std::string filename) {
  HMODULE ret = LoadLibrary(filename.c_str());
  return *(LibHandle*)&ret;
}

// Looks up an exported symbol in a loaded library.
void* GetExportAddress(LibHandle lib, std::string export_name) {
  return reinterpret_cast<void*>(GetProcAddress(*(HMODULE*)&lib, export_name.c_str()));
}

void CloseLib(LibHandle lib) { FreeLibrary(*(::HMODULE*)&lib); }

// Not implemented on Windows. Fails at run time like the shared-mutex stubs in
// this file; a static_assert(false) here would reject the whole translation
// unit at compile time. An implementation could use EnumProcessModulesEx.
std::vector<LibHandle> GetLoadedLibs() {
  assert(false && "Not implemented.");
  abort();
  return std::vector<LibHandle>();
}

// Not implemented on Windows; see GetLoadedLibs.
std::string GetLibraryName(LibHandle lib) {
  assert(false && "Not implemented.");
  abort();
  return std::string();
}

// Creates an unnamed counting semaphore with an initial count of 0.
// The Win32 function is called by its explicit, globally qualified name so the
// call cannot resolve to this zero-argument function.
// NOTE(review): <windows.h> defines CreateSemaphore as a macro, which also
// renames this definition; the file only #undefs Yield and CreateMutex -
// confirm against the declaration in the OS abstraction header.
Semaphore CreateSemaphore() {
  ::HANDLE sem = ::CreateSemaphoreA(NULL, 0, LONG_MAX, NULL);
  assert(sem != NULL && "CreateSemaphore failed");

  return *(Semaphore*)&sem;
}

// Blocks until the semaphore is signalled.
bool WaitSemaphore(Semaphore sem) {
  return WaitForSingleObject(*(::HANDLE*)&sem, INFINITE) == WAIT_OBJECT_0;
}

// Increments the semaphore count by one.
void PostSemaphore(Semaphore sem) { ReleaseSemaphore(*(::HANDLE*)&sem, 1, NULL); }

// Closes the semaphore handle. The handle is passed by value, so the caller's
// copy cannot be cleared here.
void DestroySemaphore(Semaphore sem) {
  if (!CloseHandle(*(::HANDLE*)&sem)) {
    assert(false && "CloseHandle() failed");
  }
}

// The mutex is an auto-reset event that starts signalled: a successful wait
// takes the lock, SetEvent releases it.
Mutex CreateMutex() {
  ::HANDLE evt = CreateEvent(NULL, false, true, NULL);
  return *(Mutex*)&evt;
}

bool TryAcquireMutex(Mutex lock) {
  return WaitForSingleObject(*(::HANDLE*)&lock, 0) == WAIT_OBJECT_0;
}

bool AcquireMutex(Mutex lock) {
  return WaitForSingleObject(*(::HANDLE*)&lock, INFINITE) == WAIT_OBJECT_0;
}

void ReleaseMutex(Mutex lock) { SetEvent(*(::HANDLE*)&lock); }

void DestroyMutex(Mutex lock) { CloseHandle(*(::HANDLE*)&lock); }

void Sleep(int delay_in_millisecond) { ::Sleep(delay_in_millisecond); }

// Millisecond granularity only: delays below 1000us become Sleep(0).
void uSleep(int delayInUs) { ::Sleep(delayInUs / 1000); }

void YieldThread() { ::Sleep(0); }

// Heap-allocated hand-off from CreateThread to ThreadTrampoline.
struct ThreadArgs {
  void* entry_args;
  ThreadEntry entry_function;
};

// Adapts the _beginthreadex signature to ThreadEntry and frees the hand-off
// block before running the user function.
unsigned __stdcall ThreadTrampoline(void* arg) {
  ThreadArgs* thread_args = (ThreadArgs*)arg;
  ThreadEntry entry = thread_args->entry_function;
  void* data = thread_args->entry_args;
  delete thread_args;
  entry(data);
  _endthreadex(0);
  return 0;
}

// Starts a thread running entry_function(entry_argument).
Thread CreateThread(ThreadEntry entry_function, void* entry_argument, uint stack_size) {
  ThreadArgs* thread_args = new ThreadArgs();
  thread_args->entry_args = entry_argument;
  thread_args->entry_function = entry_function;
  uintptr_t ret = _beginthreadex(NULL, stack_size, ThreadTrampoline, thread_args, 0, NULL);
  // _beginthreadex returns 0 on failure; the trampoline never runs in that
  // case, so the hand-off block has to be released here.
  if (ret == 0) delete thread_args;
  return *(Thread*)&ret;
}

void CloseThread(Thread thread) { CloseHandle(*(::HANDLE*)&thread); }

bool WaitForThread(Thread thread) {
  return WaitForSingleObject(*(::HANDLE*)&thread, INFINITE) == WAIT_OBJECT_0;
}

// Waits until every thread in the array has finished. The array is
// reinterpreted as an array of handles (sizes checked by the static_assert at
// the top of this file).
bool WaitForAllThreads(Thread* threads, uint thread_count) {
  return WaitForMultipleObjects(thread_count, reinterpret_cast<const ::HANDLE*>(threads), TRUE,
                                INFINITE) == WAIT_OBJECT_0;
}

void SetEnvVar(std::string env_var_name, std::string env_var_value) {
  SetEnvironmentVariable(env_var_name.c_str(), env_var_value.c_str());
}

// Returns the value of an environment variable, or "" when the size query
// reports 0.
std::string GetEnvVar(std::string env_var_name) {
  // First call: query the required buffer size, terminator included.
  DWORD char_count = GetEnvironmentVariable(env_var_name.c_str(), NULL, 0);
  if (char_count == 0) return "";
  char* buff = (char*)alloca(sizeof(char) * char_count);
  GetEnvironmentVariable(env_var_name.c_str(), buff, char_count);
  buff[char_count - 1] = '\0';
  std::string ret = buff;
  return ret;
}

size_t GetUserModeVirtualMemorySize() {
  SYSTEM_INFO system_info = {0};
  GetSystemInfo(&system_info);
  return ((size_t)system_info.lpMaximumApplicationAddress + 1);
}

// Total physical memory, capped at the user-mode address space size; 0 when
// the query fails.
size_t GetUsablePhysicalHostMemorySize() {
  MEMORYSTATUSEX memory_status = {0};
  memory_status.dwLength = sizeof(memory_status);
  if (GlobalMemoryStatusEx(&memory_status) == 0) {
    return 0;
  }

  const size_t physical_size = static_cast<size_t>(memory_status.ullTotalPhys);
  return std::min(GetUserModeVirtualMemorySize(), physical_size);
}

uintptr_t GetUserModeVirtualMemoryBase() { return (uintptr_t)0; }

// Os event wrappers
EventHandle CreateOsEvent(bool auto_reset, bool init_state) {
  // CreateEvent takes a "manual reset" flag, hence the negation.
  EventHandle evt = reinterpret_cast<EventHandle>(
      CreateEvent(NULL, (BOOL)(!auto_reset), (BOOL)init_state, NULL));
  return evt;
}

int DestroyOsEvent(EventHandle event) {
  if (event == NULL) {
    return -1;
  }
  return CloseHandle(reinterpret_cast<::HANDLE>(event));
}

// Returns -1 for a null event, 0x14003 on timeout, otherwise the raw
// WaitForSingleObject result.
int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {
  if (event == NULL) {
    return -1;
  }

  int ret_code = WaitForSingleObject(reinterpret_cast<::HANDLE>(event), milli_seconds);
  if (ret_code == WAIT_TIMEOUT) {
    ret_code = 0x14003;  // 0x14003 indicates timeout
  }
  return ret_code;
}

int SetOsEvent(EventHandle event) {
  if (event == NULL) {
    return -1;
  }
  return SetEvent(reinterpret_cast<::HANDLE>(event));
}

int ResetOsEvent(EventHandle event) {
  if (event == NULL) {
    return -1;
  }
  return ResetEvent(reinterpret_cast<::HANDLE>(event));
}

// Current value of the performance counter, in counter ticks.
uint64_t ReadAccurateClock() {
  uint64_t ret;
  QueryPerformanceCounter((LARGE_INTEGER*)&ret);
  return ret;
}

// Performance counter ticks per second.
uint64_t AccurateClockFrequency() {
  uint64_t ret;
  QueryPerformanceFrequency((LARGE_INTEGER*)&ret);
  return ret;
}

// Shared (reader/writer) mutexes are not implemented on Windows.
SharedMutex CreateSharedMutex() {
  assert(false && "Not implemented.");
  abort();
  return nullptr;
}

bool TryAcquireSharedMutex(SharedMutex lock) {
  assert(false && "Not implemented.");
  abort();
  return false;
}

}  // namespace os
}  // namespace rocr
#endif  // _WIN32
&& "Not implemented."); + abort(); + return false; +} + +void ReleaseSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); +} + +bool TrySharedAcquireSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); + return false; +} + +bool SharedAcquireSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); + return false; +} + +void SharedReleaseSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); +} + +void DestroySharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); +} + +uint64_t ReadSystemClock() { + assert(false && "Not implemented."); + abort(); + return 0; +} + +uint64_t SystemClockFrequency() { + assert(false && "Not implemented."); + abort(); + return 0; +} + +bool ParseCpuID(cpuid_t* cpuinfo) { + assert(false && "Not implemented."); + abort(); + return false; +} + +} // namespace os +} // namespace rocr + +#endif diff --git a/version.cpp b/version.cpp new file mode 100644 index 0000000000..b71d036fc4 --- /dev/null +++ b/version.cpp @@ -0,0 +1,52 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "libhsakmt.h" + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetVersion(HsaVersionInfo *VersionInfo) { + CHECK_DXG_OPEN(); + + VersionInfo->KernelInterfaceMajorVersion = 1; + VersionInfo->KernelInterfaceMinorVersion = 16; + + return HSAKMT_STATUS_SUCCESS; +} +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetVersionCapInfo(HsaVersionCapability *VersionCapInfo) { + CHECK_DXG_OPEN(); + + VersionCapInfo->Value = 0; + /*VersionCapInfo->ui64.InterruptSignal = 0; + VersionCapInfo->ui64.Sdma = 0; + VersionCapInfo->ui64.SdmaXgmi = 0; + VersionCapInfo->ui64.Image = 0; + VersionCapInfo->ui64.EventAge = 0; + VersionCapInfo->ui64.Scratch = 0; + VersionCapInfo->ui64.Vmem = 0; + VersionCapInfo->ui64.dmabuf = 0; + VersionCapInfo->ui64.XNack = 0;*/ + + return HSAKMT_STATUS_SUCCESS; +} \ No newline at end of file diff --git a/wddm/cmd_util.cpp b/wddm/cmd_util.cpp new file mode 100644 index 0000000000..20e0231977 --- /dev/null +++ b/wddm/cmd_util.cpp @@ -0,0 +1,281 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */ + +#include "inc/wddm/cmd_util.h" + +namespace rocr { +namespace core { + +/* + * Builds a COPY_DATA packet that copies data. 
+ */ +size_t CmdUtil::BuildCopyData( + uint64_t *pDstAddr, + void *pBuffer, + uint32_t dstSel, + uint32_t dstCachePolicy, + uint32_t srcSel, + uint32_t srcCachePolicy, + uint32_t countSel, + uint32_t wrConfirm) { + PM4MEC_COPY_DATA copy_data = {0}; + + GenerateCmdHeader(©_data, IT_COPY_DATA); + copy_data.bitfields2.dst_sel = dstSel; + copy_data.bitfields2.src_sel = srcSel; + copy_data.bitfields2.dst_cache_policy = dstCachePolicy; + copy_data.bitfields2.src_cache_policy = srcCachePolicy; + copy_data.bitfields2.count_sel = countSel; + copy_data.bitfields2.wr_confirm = wrConfirm; + copy_data.bitfields5c.dst_64b_addr_lo = (PtrLow32(pDstAddr) >> 3); + copy_data.dst_addr_hi = PtrHigh32(pDstAddr); + memcpy(pBuffer, ©_data, sizeof(copy_data)); + + return sizeof(copy_data); +} + +/* + * Builds a EVENT_WRITE packet. + * Applications can use Barrier command to ensure their + * command is executed only after all other commands have + * completed their execution. + */ +size_t CmdUtil::BuildBarrier( + void *pBuffer, + uint32_t eventIndex, + uint32_t eventType) { + BarrierTemplate barrier = {0}; + + GenerateCmdHeader(&barrier.event_write, IT_EVENT_WRITE); + barrier.event_write.bitfields2.event_index = eventIndex; + barrier.event_write.bitfields2.event_type = eventType; + memcpy(pBuffer, &barrier, sizeof(barrier)); + + return sizeof(barrier); +} + +/* + * Builds a ACQUIRE_MEM packet. + * Users can submit this command to + * invalidate Gpu caches - L1 and or L2. + */ +size_t CmdUtil::BuildAcquireMem( + uint8_t major, + void *pBuffer) { + size_t ret; + if (major == 9) { + gfx9::AcquireMemTemplate acq = {0}; + GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM); + // Specify the size of memory to invalidate. Size is + // specified in terms of 256 byte chunks. A coher_size + // of 0xFFFFFFFF actually specified 0xFFFFFFFF00 (40 bits) + // of memory. The field coher_size_hi specifies memory from + // bits 40-64 for a total of 256 TB. 
+ acq.acquire_mem.coher_size = 0xFFFFFFFF; + acq.acquire_mem.bitfields4.coher_size_hi = 0xFF; + // Specify the address of memory to invalidate. The + // address must be 256 byte aligned. + acq.acquire_mem.coher_base_lo = 0; + acq.acquire_mem.bitfields6.coher_base_hi = 0; + // Specify the poll interval for determing if operation is complete + acq.acquire_mem.bitfields7.poll_interval = 4; + acq.acquire_mem.bitfields2.coher_cntl = + (1 << 29) | // CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK + (1 << 27) | // CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK + (1 << 28); // CP_COHER_CNTL__SH_KCACHE_VOL_ACTION_ENA_MASK + memcpy(pBuffer, &acq, sizeof(acq)); + ret = sizeof(acq); + } else if (major >= 10) { + gfx10::AcquireMemTemplate acq = {0}; + GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM); + acq.acquire_mem.coher_size = 0xFFFFFFFF; + acq.acquire_mem.bitfields4.coher_size_hi = 0xFF; + acq.acquire_mem.coher_base_lo = 0; + acq.acquire_mem.bitfields6.coher_base_hi = 0; + acq.acquire_mem.bitfields7.poll_interval = 4; + acq.acquire_mem.bitfields8.gcr_cntl = + (1 << 16) | // SEQ = FORWARD + (1 << 15) | // GL2_WB + (1 << 14) | // GL2_INV + (1 << 9) | // GL1_INV + (1 << 8) | // GLV_INV + (1 << 7) | // GLK_INV + (1 << 6) | // GLK_WB + (1 << 5) | // GLM_INV + (1 << 4) | // GLM_WB + (1 << 0); // GLI_INV = ALL + memcpy(pBuffer, &acq, sizeof(acq)); + ret = sizeof(acq); + } + + return ret; +} + +/* + * Builds a scratch packet. 
+ */ +size_t CmdUtil::BuildScratch( + void *pScratchBase, + void *pBuffer) { + struct SetScratchTemplate scratch = {0}; + + GenerateSetShRegHeader(&scratch, mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO); + scratch.scratch_lo = Ptr48Low32(pScratchBase); + scratch.scratch_hi = Ptr48High8(pScratchBase); + memcpy(pBuffer, &scratch, sizeof(scratch)); + + return sizeof(scratch); +} + +/** + * @ Set Compute Shader parameter for gfx11 and above + */ +size_t CmdUtil::BuildComputeShaderParams(void *pBuffer) { + struct DispatchProgramResourceRegs compute_shader_params = {0}; + + GenerateSetShRegHeader(&compute_shader_params, mmCOMPUTE_PGM_RSRC3); + // IMAGE_OP: Indicates the compute program contains an image op + // instruction and should be stalled by its WAIT_SYNC fence. + compute_shader_params.compute_pgm_rsrc3 = (1 << 31); + + memcpy(pBuffer, &compute_shader_params, sizeof(compute_shader_params)); + + return sizeof(compute_shader_params); +} + + +/* + * Builds a dispatch packet. + */ +size_t CmdUtil::BuildDispatch( + struct DispatchInfo *pInfo, + void *pBuffer) { + DispatchTemplate dispatch = {0}; + + GenerateSetShRegHeader(&dispatch.dimension_regs, mmCOMPUTE_NUM_THREAD_X); + dispatch.dimension_regs.compute_num_thread_x = pInfo->pPacket->workgroup_size_x; + dispatch.dimension_regs.compute_num_thread_y = pInfo->pPacket->workgroup_size_y; + dispatch.dimension_regs.compute_num_thread_z = pInfo->pPacket->workgroup_size_z; + + // TODO: Add AQL packet index for debugger + // Debugger requires AQL packet index in COMPUTE_DISPATCH_PKT_ADDR_LO + GenerateSetShRegHeader(&dispatch.program_regs, mmCOMPUTE_PGM_LO); + dispatch.program_regs.compute_pgm_lo = Ptr48Low32(pInfo->pEntry); + dispatch.program_regs.compute_pgm_hi = Ptr48High8(pInfo->pEntry); + + GenerateSetShRegHeader(&dispatch.program_resource_regs, mmCOMPUTE_PGM_RSRC1); + dispatch.program_resource_regs.compute_pgm_rsrc1 = pInfo->pKernelObject->compute_pgm_rsrc1; + dispatch.program_resource_regs.compute_pgm_rsrc2 = + (pInfo->ldsBlks << 
15) | pInfo->pKernelObject->compute_pgm_rsrc2; + + GenerateSetShRegHeader(&dispatch.resource_regs, mmCOMPUTE_RESOURCE_LIMITS); + dispatch.resource_regs.compute_resource_limits = 0x3ff; + dispatch.resource_regs.compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + dispatch.resource_regs.compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + dispatch.resource_regs.compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + dispatch.resource_regs.compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + dispatch.resource_regs.compute_tmpring_size = pInfo->pAmdQueue->compute_tmpring_size; + + GenerateSetShRegHeader(&dispatch.compute_user_data_regs, mmCOMPUTE_USER_DATA_0); + + uint32_t sgpr_no = 0; + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) { + assert(pInfo->major < 11); + pInfo->scratchBaseOffset[pInfo->offsetCnt++] = + offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) + + sgpr_no * sizeof(uint32_t); + + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + pInfo->pAmdQueue->scratch_resource_descriptor[0]; + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + pInfo->pAmdQueue->scratch_resource_descriptor[1]; + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + pInfo->pAmdQueue->scratch_resource_descriptor[2]; + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + pInfo->srd; + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR)) { + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pPacket); + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pPacket); + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pAmdQueue); + 
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pAmdQueue); + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + PtrLow32(pInfo->pPacket->kernarg_address); + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + PtrHigh32(pInfo->pPacket->kernarg_address); + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID)) { + // This feature may be enabled as a side effect of indirect calls. + // However, the compiler team confirmed that the dispatch id itself is not used, + // so safe to send 0 for each dispatch. + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = NULL; + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = NULL; + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT)) { + assert(pInfo->major < 11); + pInfo->scratchBaseOffset[pInfo->offsetCnt++] = + offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) + + sgpr_no * sizeof(uint32_t); + + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + PtrLow32(pInfo->pScratchBase); + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + PtrHigh32(pInfo->pScratchBase); + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)) { + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + pInfo->scratchSizePerWave / (pInfo->wave32 ? 
32 : 64); + } + + GenerateCmdHeader(&dispatch.dispatch_direct, IT_DISPATCH_DIRECT); + dispatch.dispatch_direct.dispatch_initiator = + (1 << 0) | // COMPUTE_SHADER_EN + (1 << 2) | // FORCE_START_AT_000 + (1 << 5); // USE_THREAD_DIMENSIONS + if (pInfo->wave32) dispatch.dispatch_direct.dispatch_initiator |= (1 << 15); // CS_W32_EN + dispatch.dispatch_direct.dim_x = pInfo->pPacket->grid_size_x; + dispatch.dispatch_direct.dim_y = pInfo->pPacket->grid_size_y; + dispatch.dispatch_direct.dim_z = pInfo->pPacket->grid_size_z; + memcpy(pBuffer, &dispatch, sizeof(dispatch)); + + return sizeof(dispatch); +} + +/* + * Builds a ATOMIC_MEM packet. + * Users can submit this command + * to perform atomic operations. + */ +size_t CmdUtil::BuildAtomicMem( + uint64_t *pAddr, + uint32_t atomic, + void *pBuffer, + uint32_t cachePolicy, + uint64_t srcData) { + AtomicTemplate atom = {0}; + + GenerateCmdHeader(&atom.atomic, IT_ATOMIC_MEM); + atom.atomic.addr_lo = PtrLow32(pAddr); + atom.atomic.addr_hi = PtrHigh32(pAddr); + atom.atomic.bitfields2.atomic = atomic; + atom.atomic.bitfields2.cache_policy = cachePolicy; + atom.atomic.src_data_lo = LowPart(srcData); + atom.atomic.src_data_hi = HighPart(srcData); + memcpy(pBuffer, &atom, sizeof(atom)); + + return sizeof(atom); +} + +} // namespace core +} // namespace rocr diff --git a/wddm/device.cpp b/wddm/device.cpp new file mode 100644 index 0000000000..753f100a92 --- /dev/null +++ b/wddm/device.cpp @@ -0,0 +1,879 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include + +#include +#include +#include +#include +#include +#include "inc/wddm/status.h" +#include "inc/wddm/types.h" +#include "inc/wddm/device.h" +#include "inc/wddm/queue.h" + +namespace rocr { +namespace core { + +const uint32_t WDDMDevice::cmdbuf_aql_frame_num_ = 0x1000; + +WDDMDevice::WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid) + : adapter_(adapter), adapter_luid_(adapter_luid) { + memset(&device_info_, 0, sizeof(device_info_)); + + ParseDeviceInfo(); + CreateDevice(); + SetPowerOptimization(false); + CreatePagingQueue(); + ReserveLocalHeapSpace(); + ReserveSystemHeapSpace(); + InitVaMgr(); + InitCmdbufInfo(); +} + +WDDMDevice::~WDDMDevice() { + FreeLocalHeapSpace(); + FreeSystemHeapSpace(); + DestroyPagingQueue(); + SetPowerOptimization(true); + DestroyDevice(); + + DestroyDeviceInfo(); +} + +static NTSTATUS WDDMQueryAdapter(D3DKMT_HANDLE adapter, KMTQUERYADAPTERINFOTYPE type, + void *data, int size) +{ + D3DKMT_QUERYADAPTERINFO args = {0}; + + args.hAdapter = adapter; + args.Type = type; + args.pPrivateDriverData = data; + args.PrivateDriverDataSize = size; + + return D3DKMTQueryAdapterInfo(&args); +} + +uint64_t WDDMDevice::VramAvail(void) { + D3DKMT_QUERYSTATISTICS stats; + NTSTATUS ret; + uint64_t usedVis = 0; + uint64_t usedInv = 0; + + // wait fence complete + uint64_t value = page_fence_value_.load(); + if(!CpuWait(&page_syncobj_, &value, 1, false)) + return HSA_STATUS_ERROR; + + // local cpu-visible memory + memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS)); + stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT; + stats.AdapterLuid = adapter_luid_; + stats.QuerySegment.SegmentId = 0; + ret = D3DKMTQueryStatistics(&stats); + if (ret == 0) + usedVis = stats.QueryResult.SegmentInformation.BytesResident; + + // local invisible memory + memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS)); + stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT; + stats.AdapterLuid = 
adapter_luid_; + stats.QuerySegment.SegmentId = 1; + + ret = D3DKMTQueryStatistics(&stats); + if (ret == 0) + usedInv = stats.QueryResult.SegmentInformation.BytesResident; + + return LocalHeapSize() - usedVis - usedInv; +} + +bool WDDMDevice::CreateDevice(void) { + D3DKMT_CREATEDEVICE args = {0}; + args.hAdapter = adapter_; + + NTSTATUS ret = D3DKMTCreateDevice(&args); + if (ret == STATUS_SUCCESS) { + device_ = args.hDevice; + return true; + } + + fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret); + return false; +} + +bool WDDMDevice::DestroyDevice(void) { + D3DKMT_DESTROYDEVICE args = {0}; + args.hDevice = device_; + + NTSTATUS ret = D3DKMTDestroyDevice(&args); + if (ret == STATUS_SUCCESS) + return true; + + fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret); + return false; +} + +bool WDDMDevice::CreatePagingQueue(void) { + D3DKMT_CREATEPAGINGQUEUE args = {0}; + args.hDevice = device_; + args.Priority = D3DDDI_PAGINGQUEUE_PRIORITY_NORMAL; + + NTSTATUS ret = D3DKMTCreatePagingQueue(&args); + if (ret == STATUS_SUCCESS) { + page_queue_ = args.hPagingQueue; + page_syncobj_ = args.hSyncObject; + page_fence_addr_ = (uint64_t *)args.FenceValueCPUVirtualAddress; + page_fence_value_ = 0; + return true; + } + + fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret); + return false; +} + +bool WDDMDevice::DestroyPagingQueue(void) { + D3DDDI_DESTROYPAGINGQUEUE args = {0}; + args.hPagingQueue = page_queue_; + + NTSTATUS ret = D3DKMTDestroyPagingQueue(&args); + if (ret == STATUS_SUCCESS) + return true; + + fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret); + return false; +} + +bool WDDMDevice::CommitSystemHeapSpace(void* addr, int64_t size, bool lock) { + int32_t protFlags = PROT_READ | PROT_WRITE | PROT_EXEC; + int32_t mapFlags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED| + MAP_NORESERVE|MAP_UNINITIALIZED; + if (lock) + mapFlags |= MAP_LOCKED; + void* paddr = mmap(addr, size, protFlags, mapFlags, -1, 0); + if (paddr == MAP_FAILED) { + fprintf(stderr, "%s fail to commit %s addr = 
%p, paddr = %p\n", + __FUNCTION__, (lock ? "locked" : ""), addr, paddr); + return false; + } + assert(addr == paddr); + + /*if (!Runtime::runtime_singleton_->PinWARequired()) + return true;*/ + + /* + * Do not make the pages in this range available to the child + * after a fork(2). This is useful to prevent copy-on-write + * semantics from changing the physical location of a page if + * the parent writes to it after a fork(2). (Such page + * relocations cause problems for hardware that DMAs into the + * page.) + * + * https://man7.org/linux/man-pages/man2/madvise.2.html + */ + if (madvise(addr, size, MADV_DONTFORK)) + fprintf(stderr, "%s fail to set MADV_DONTFORK for addr = %p\n", + __FUNCTION__, addr); + + return true; +} + +bool WDDMDevice::DecommitSystemHeapSpace(void* addr, int64_t size) { + int32_t protFlags = PROT_NONE; + int32_t mapFlags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED| + MAP_NORESERVE|MAP_UNINITIALIZED; + void* paddr = mmap(addr, size, protFlags, mapFlags, -1, 0); + if (paddr == MAP_FAILED) { + fprintf(stderr, "%s fail to decommit addr = %p, paddr = %p\n", + __FUNCTION__, addr, paddr); + return false; + } + assert(addr == paddr); + return true; +} + +bool WDDMDevice::ReserveSystemHeapSpace() { + struct sysinfo info; + int ret = sysinfo(&info); + uint64_t max_ram = 0x10000000000; + uint64_t alignment = 0x100000000; + assert(!ret); + + int32_t protFlags = PROT_NONE; + // minimum of reserve size is 8G, maximum of reserve size is 1T. 
+ system_heap_space_size_ = std::min(AlignUp(info.totalram, alignment) * 2, max_ram); + void* cpu = mmap(NULL, system_heap_space_size_, protFlags, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (cpu == MAP_FAILED) { + fprintf(stderr, "%s fail to reserve system_heap_space_size_ = %lx \n", + __FUNCTION__, system_heap_space_size_); + return false; + } + + system_heap_space_start_ = (uint64_t)cpu; + return true; +} + +bool WDDMDevice::FreeSystemHeapSpace(void) { + void *cpu = (void *)system_heap_space_start_; + if (munmap(cpu, system_heap_space_size_ != 0)) { + fprintf(stderr, "%s fail to unmap = %p \n", __FUNCTION__, cpu); + return false; + } + return true; +} + +/* + * To find the avaliable same range for cpu + * virtual space and gpu virtual space. + * sys_va_size of cpu va range is larger 1G + * than gpu va range, otherwise ReserveGPUVirtualAddress + * will return error. + */ +bool WDDMDevice::ReserveLocalHeapSpace(void) { + uint64_t sys_va[16] = {0}; + uint64_t local_va; + uint64_t sys_va_size; + int match_index = -1; + uint64_t align = 0x40000000; /* 1G */ + void* ptr = NULL; + + local_heap_space_start_ = 0; + local_heap_space_size_ = AlignUp(LocalHeapSize(), align) * 4; + sys_va_size = local_heap_space_size_ + align; + + /* it will retry 16 times to find the avaliable range. 
*/ + for (int i = 0; i < 16; i++) { + local_va = 0; + ptr = mmap(NULL, sys_va_size , PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (ptr == MAP_FAILED) { + fprintf(stderr, "%s fail to reserve cpu va in %d time!\n", + __FUNCTION__, i); + break; + } + + sys_va[i] = (uint64_t)ptr; + + if (thunk::ReserveGpuVirtualAddress( + adapter_, local_heap_space_size_, + (uint64_t)ptr, + (uint64_t)ptr + sys_va_size, &local_va) == ErrorCode::Success) { + + match_index = i; + local_heap_space_start_ = local_va; + debug_print("%s success to reserve gpu va %lx and va cpu %p in %d time\n", + __FUNCTION__, local_va, ptr, i); + break; + } else { + debug_print("%s fail to reserve gpu va for cpu va %p in %d time!\n", + __FUNCTION__, ptr, i); + } + } + + if (match_index >= 0) { + /* release cpu unused ranges*/ + uint64_t left_size = local_va - sys_va[match_index]; + uint64_t right_size = align - left_size; + if ((left_size > 0) && munmap((void*)sys_va[match_index], left_size)) + fprintf(stderr, "%s fail to unmap left %lx with size %lx\n", + __FUNCTION__, sys_va[match_index], left_size); + if ((right_size > 0) && munmap((void*)(local_va + local_heap_space_size_), right_size)) + fprintf(stderr, "%s fail to unmap right %lx with size %lx\n", + __FUNCTION__, (local_va + local_heap_space_size_), right_size); + } else { + fprintf(stderr, "%s fail to reserve Local Heap Space !\n", + __FUNCTION__); + } + + /* free match fail address for cpu va */ + int free = match_index >= 0 ? 
match_index : 16; + for (int j = 0; j < free; j++) { + if (sys_va[j] != 0 && munmap((void*)sys_va[j], sys_va_size)) { + fprintf(stderr, "%s fail to unmap %d %lx\n", __FUNCTION__, j, sys_va[j]); + } + } + + return match_index >= 0; +} + +bool WDDMDevice::FreeLocalHeapSpace(void) { + thunk::FreeGpuVirtualAddress(adapter_, local_heap_space_start_, local_heap_space_size_); + void *cpu = (void *)local_heap_space_start_; + return munmap(cpu, local_heap_space_size_) == 0; +} + +void WDDMDevice::InitVaMgr() { + uint32_t min_align = 4096; + local_va_mgr_ = std::make_unique(local_heap_space_start_, local_heap_space_size_, min_align); +} + +void WDDMDevice::SetPowerOptimization(bool restore) { + void *priv_data; + int priv_size; + + priv_size = rocr_proxy::CreatePowerOptPrivData(&priv_data, restore); + + D3DKMT_ESCAPE d3dkmt_escape; + memset(&d3dkmt_escape, 0, sizeof(d3dkmt_escape)); + + d3dkmt_escape.hAdapter = adapter_; + d3dkmt_escape.hDevice = device_; + d3dkmt_escape.hContext = 0; //KMD only use device to identify the process + d3dkmt_escape.Type = D3DKMT_ESCAPE_DRIVERPRIVATE; + d3dkmt_escape.pPrivateDriverData = priv_data; + d3dkmt_escape.PrivateDriverDataSize = priv_size; + d3dkmt_escape.Flags.HardwareAccess = true; + + NTSTATUS status = D3DKMTEscape(&d3dkmt_escape); + debug_print("%s status %d restore %d\n", __FUNCTION__, status, restore); + rocr_proxy::DestroyPrivData(priv_data); +} + +ErrorCode WDDMDevice::ReserveGpuVirtualAddress(const rocr_proxy::AllocDomain domain, + gpusize hit_base_addr, gpusize size, + gpusize *out_gpu_virt_addr, gpusize alignment, bool lock) { + gpusize gpu_addr = 0; + ErrorCode code = ErrorCode::Success; + + if (domain == rocr_proxy::kSystem) { + + code = thunk::ReserveGpuVirtualAddress(adapter_, size, + system_heap_space_start_, + system_heap_space_start_ + system_heap_space_size_, + &gpu_addr); + if (code != ErrorCode::Success) + return code; + + if (!CommitSystemHeapSpace((void*)gpu_addr, size, lock)) { + 
thunk::FreeGpuVirtualAddress(adapter_, gpu_addr, size); + code = ErrorCode::SyscallFail; + } + } else { + uint64_t align = alignment == 0 ? (64 * 1024) : alignment; // default 64K alignment + if (domain == rocr_proxy::kLocal && size >= GPU_HUGE_PAGE_SIZE) + align = GPU_HUGE_PAGE_SIZE; + + gpu_addr = local_va_mgr_->Alloc(size, align, hit_base_addr); + if (gpu_addr == 0) + code = ErrorCode::OutOfGpuMemory; + + } + + *out_gpu_virt_addr = (code == ErrorCode::Success) ? gpu_addr : 0; + return code; +} + +ErrorCode WDDMDevice::FreeGpuVirtualAddress(const rocr_proxy::AllocDomain domain, + gpusize gpu_addr, gpusize size) { + auto code = ErrorCode::Success; + + if (domain == rocr_proxy::kSystem) { + + DecommitSystemHeapSpace((void *)gpu_addr, size); + + thunk::FreeGpuVirtualAddressArgs free_args{}; + free_args.hAdapter = adapter_; + free_args.BaseAddress = gpu_addr; + free_args.Size = size; + + code = thunk::FreeGpuVirtualAddress(&free_args); + } else { + local_va_mgr_->Free(gpu_addr); + } + + return code; +} + +void WDDMDevice::UpdatePageFence(uint64_t fence_value) { + uint64_t current = page_fence_value_.load(); + + // atomically set fence value when target is bigger than current one + do { + if (current >= fence_value) + break; + } while (!page_fence_value_.compare_exchange_weak(current, fence_value)); +} + +ErrorCode WDDMDevice::CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem) { + ErrorCode ret; + + *gpu_mem = nullptr; + auto mem = new GpuMemory(this); + if (create_info.dmabuf_fd > 0) + ret = mem->ImportPhysicalHandle(create_info.dmabuf_fd); + else + ret = mem->Init(create_info); + if (ret == ErrorCode::Success) + *gpu_mem = mem; + + return ret; +} + +void *WDDMDevice::Lock(D3DKMT_HANDLE handle) { + D3DKMT_LOCK2 args = {0}; + args.hDevice = device_; + args.hAllocation = handle; + + NTSTATUS ret = D3DKMTLock2(&args); + if (ret == STATUS_SUCCESS) + return args.pData; + + fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret); + return NULL; +} + 
+// Calls D3DKMTUnlock2 on `handle` for this device.
+// Returns true on STATUS_SUCCESS; otherwise logs the status and returns false.
+bool WDDMDevice::Unlock(D3DKMT_HANDLE handle) {
+  D3DKMT_UNLOCK2 args = {0};
+  args.hDevice = device_;
+  args.hAllocation = handle;
+
+  NTSTATUS ret = D3DKMTUnlock2(&args);
+  if (ret != STATUS_SUCCESS) {
+    fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret);
+    return false;
+  }
+
+  return true;
+}
+
+// Creates a virtual context for `engine` and stores its handle in *handle.
+// Returns false when the engine has no ordinal (EngineOrdinal < 0) or when
+// D3DKMTCreateContextVirtual fails (the status is logged to stderr).
+bool WDDMDevice::CreateContext(int engine, D3DKMT_HANDLE *handle) {
+  const int ordinal = EngineOrdinal(engine, &device_info_);
+  if (ordinal < 0)
+    return false;
+
+  // Driver-private blob for the create call; released below on every path.
+  void *priv_data;
+  const bool FwManagedGfxState = SupportStateShadowingByCpFw();
+  const int priv_size = rocr_proxy::CreateContextPrivData(&priv_data, FwManagedGfxState);
+
+  D3DKMT_CREATECONTEXTVIRTUAL args = {0};
+  args.hDevice = device_;
+  args.EngineAffinity = 1 << 0;
+  args.NodeOrdinal = ordinal;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+  args.ClientHint = D3DKMT_CLIENTHINT_OPENCL;
+
+  if (IsHwsEnabled(engine)) {
+    args.Flags.HwQueueSupported = 1;
+  } else {
+    args.Flags.DisableGpuTimeout = rocr_proxy::ShouldDisableGpuTimeout(engine, &device_info_);
+  }
+
+  NTSTATUS ret = D3DKMTCreateContextVirtual(&args);
+  const bool created = (ret == STATUS_SUCCESS);
+  if (created)
+    *handle = args.hContext;
+
+  rocr_proxy::DestroyPrivData(priv_data);
+
+  if (!created)
+    fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret);
+  return created;
+}
+
+// Destroys the context `handle` via D3DKMTDestroyContext.
+// Returns true on STATUS_SUCCESS; otherwise logs the status and returns false.
+bool WDDMDevice::DestroyContext(D3DKMT_HANDLE handle) {
+  D3DKMT_DESTROYCONTEXT args = {0};
+  args.hContext = handle;
+
+  NTSTATUS ret = D3DKMTDestroyContext(&args);
+  if (ret != STATUS_SUCCESS) {
+    fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret);
+    return false;
+  }
+
+  return true;
+}
+
+// Submits a GPU-side wait on `count` sync objects for the queue's context.
+//   syncobjs - array of `count` sync object handles.
+//   values   - array of `count` monitored fence values, one per handle.
+// Returns true on STATUS_SUCCESS; otherwise logs the status and returns false.
+bool WDDMDevice::GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs,
+                         uint64_t *values, int count) {
+  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU args = {0};
+  args.hContext = queue->context;
+  args.ObjectCount = count;
+  args.ObjectHandleArray = syncobjs;
+  args.MonitoredFenceValueArray = values;
+
+  NTSTATUS ret = D3DKMTWaitForSynchronizationObjectFromGpu(&args);
+  if (ret != STATUS_SUCCESS) {
+    fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret);
+    return false;
+  }
+
+  return true;
+}
+
+// Submits a GPU-side signal of `count` sync objects from `context`.
+//   syncobjs - array of `count` sync object handles.
+//   value    - array of `count` monitored fence values, one per handle.
+// Returns true on STATUS_SUCCESS; otherwise logs the status and returns false.
+bool WDDMDevice::GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs,
+                           uint64_t *value, int count) {
+  D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU args = {0};
+  args.hContext = context;
+  args.ObjectCount = count;
+  args.ObjectHandleArray = syncobjs;
+  args.MonitoredFenceValueArray = value;
+
+  NTSTATUS ret = D3DKMTSignalSynchronizationObjectFromGpu(&args);
+  if (ret != STATUS_SUCCESS) {
+    fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret);
+    return false;
+  }
+
+  return true;
+}
+
+// Waits from the CPU on `count` sync objects of this device.
+//   value    - array of `count` fence values, one per handle.
+//   wait_any - forwarded to Flags.WaitAny of the wait request.
+// Returns true on STATUS_SUCCESS; otherwise logs the status and returns false.
+bool WDDMDevice::CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value,
+                         int count, bool wait_any) {
+  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU args = {0};
+  args.hDevice = device_;
+  args.ObjectCount = count;
+  args.ObjectHandleArray = syncobjs;
+  args.FenceValueArray = value;
+  args.Flags.WaitAny = wait_any;
+
+  NTSTATUS ret = D3DKMTWaitForSynchronizationObjectFromCpu(&args);
+  if (ret != STATUS_SUCCESS) {
+    fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret);
+    return false;
+  }
+
+  return true;
+}
+
+// CPU-waits on page_syncobj_ for the fence value currently stored in
+// page_fence_value_. Returns the result of CpuWait.
+bool WDDMDevice::WaitOnPagingFenceFromCpu() {
+  uint64_t target = page_fence_value_.load();
+  return CpuWait(&page_syncobj_, &target, 1, false);
+}
+
+bool WDDMDevice::CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr) {
+  D3DKMT_CREATESYNCHRONIZATIONOBJECT2 args = {0};
+  args.hDevice = device_;
+  args.Info.Type = D3DDDI_MONITORED_FENCE;
+  args.Info.MonitoredFence.EngineAffinity = 1 << 0;
+
+  NTSTATUS ret = D3DKMTCreateSynchronizationObject2(&args);
+  if (ret == STATUS_SUCCESS) {
+    *handle = args.hSyncObject;
+    *addr = (uint64_t *)args.Info.MonitoredFence.FenceValueCPUVirtualAddress;
+    debug_print("create syncobj cpu addr=%p gpu addr=%" PRIx64 "\n",
+                args.Info.MonitoredFence.FenceValueCPUVirtualAddress,
+                
args.Info.MonitoredFence.FenceValueGPUVirtualAddress);
+    return true;
+  }
+
+  fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret);
+  return false;
+}
+
+// Destroys the sync object `handle`; a failure is only logged to stderr.
+void WDDMDevice::DestroySyncobj(D3DKMT_HANDLE handle) {
+  D3DKMT_DESTROYSYNCHRONIZATIONOBJECT args = {0};
+  args.hSyncObject = handle;
+
+  NTSTATUS ret = D3DKMTDestroySynchronizationObject(&args);
+  if (ret != STATUS_SUCCESS)
+    fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret);
+}
+
+// Computes cmdbuf_aql_frame_size_ (sum of the packet templates listed below,
+// rounded up to 0x10) and cmdbuf_size_ (frame count * frame size, rounded up
+// to 0x1000) from device_info_.major.
+// NOTE(review): for major < 9 no base value is assigned before the `+=`
+// below, so the result then depends on the member's prior value.
+void WDDMDevice::InitCmdbufInfo(void) {
+  if (device_info_.major == 9) {
+    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx9::AcquireMemTemplate);
+  } else if (device_info_.major >= 10) {
+    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx10::AcquireMemTemplate);
+  }
+
+  if (device_info_.major >= 11)
+    cmdbuf_aql_frame_size_ += sizeof(SetScratchTemplate);
+
+  cmdbuf_aql_frame_size_ +=
+    sizeof(PM4MEC_COPY_DATA) * 2 +
+    sizeof(BarrierTemplate) * 2 +
+    sizeof(DispatchTemplate) +
+    sizeof(AtomicTemplate) * 2;
+  cmdbuf_aql_frame_size_ = AlignUp(cmdbuf_aql_frame_size_, 0x10);
+
+  cmdbuf_size_ = AlignUp(cmdbuf_aql_frame_num_ * cmdbuf_aql_frame_size_, 0x1000);
+}
+
+// Returns pkt->group_segment_size expressed in 512-byte blocks, rounded up.
+uint32_t WDDMDevice::LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt) {
+  static const uint32_t blk_sz = 512;
+  uint32_t total_sz = pkt->group_segment_size;
+  uint32_t blk_num = (total_sz + blk_sz - 1) / blk_sz;
+  return blk_num;
+}
+
+// Enumerates adapters and returns those whose registry ChipType contains
+// "AMD" and which rocr_proxy::QueryAdapterSupported accepts.
+//   adapters     - receives a new[]-allocated array (release with delete[]),
+//                  or NULL when no adapter is present or the per-adapter
+//                  query fails.
+//   num_adapters - receives the number of valid entries in `adapters`.
+// Returns STATUS_SUCCESS, or the failing NTSTATUS of the enumeration/query.
+NTSTATUS WDDMGetAdapters(D3DKMT_ADAPTERINFO *&adapters, int &num_adapters)
+{
+  // First call with no buffer only reports the adapter count.
+  D3DKMT_ENUMADAPTERS2 args = {0};
+  NTSTATUS ret = D3DKMTEnumAdapters2(&args);
+  if (ret != STATUS_SUCCESS)
+    return ret;
+
+  if (!args.NumAdapters) {
+    adapters = NULL;
+    num_adapters = 0;
+    return STATUS_SUCCESS;
+  }
+
+  D3DKMT_ADAPTERINFO *info = new D3DKMT_ADAPTERINFO[args.NumAdapters];
+  if (!info)
+    return STATUS_NO_MEMORY;
+
+  // Second call fills the buffer.
+  args.pAdapters = info;
+  ret = D3DKMTEnumAdapters2(&args);
+  if (ret != STATUS_SUCCESS) {
+    delete[] info;  // array form: `info` came from new[]
+    return ret;
+  }
+
+  adapters = new D3DKMT_ADAPTERINFO[args.NumAdapters];
+  if (!adapters) {
+    delete[] info;
+    // `ret` is STATUS_SUCCESS at this point; report the allocation failure.
+    return STATUS_NO_MEMORY;
+  }
+
+  num_adapters = 0;
+  for (int i = 0; i < args.NumAdapters; i++) {
+    D3DKMT_ADAPTERREGISTRYINFO query = {0};
+
+    ret = WDDMQueryAdapter(info[i].hAdapter, KMTQAITYPE_ADAPTERREGISTRYINFO,
+                           &query, sizeof(query));
+    if (ret != STATUS_SUCCESS) {
+      // Leave the outputs consistent: no array, no entries.
+      delete[] adapters;
+      adapters = NULL;
+      num_adapters = 0;
+      delete[] info;
+      return ret;
+    }
+
+    const bool supported = rocr_proxy::QueryAdapterSupported(info[i].hAdapter);
+
+    if (std::wcsstr(query.ChipType, L"AMD") && supported) {
+      adapters[num_adapters++] = info[i];
+    }
+  }
+
+  delete[] info;
+  return STATUS_SUCCESS;
+}
+
+// Zeroes device_info_ and fills it via rocr_proxy::ParseAdapterInfo.
+// Returns that call's result.
+bool WDDMDevice::ParseDeviceInfo() {
+  bool ret;
+
+  memset(&device_info_, 0, sizeof(device_info_));
+  ret = rocr_proxy::ParseAdapterInfo(adapter_, &device_info_);
+  if (!ret)
+    return false;
+
+  return true;
+}
+
+// Frees device_info_.adapter_info.
+void WDDMDevice::DestroyDeviceInfo() {
+  free(device_info_.adapter_info);
+}
+
+// Issues a driver-private D3DKMTEscape and, when it succeeds, lets
+// rocr_proxy::QueryCalibratedTimestamps fill *gpu and *cpu from the reply.
+// On failure the status is only written through debug_print and the outputs
+// are not touched by this function.
+void WDDMDevice::GetClockCounters(uint64_t *gpu, uint64_t *cpu) {
+  void *priv_data;
+  int priv_size;
+
+  priv_size = rocr_proxy::CreateCalibratedTimestampsPrivData(&priv_data);
+
+  D3DKMT_ESCAPE d3dkmt_escape;
+  memset(&d3dkmt_escape, 0, sizeof(d3dkmt_escape));
+
+  d3dkmt_escape.hAdapter = adapter_;
+  d3dkmt_escape.hDevice = device_;
+  d3dkmt_escape.hContext = 0; //KMD only use device to identify the process
+  d3dkmt_escape.Type = D3DKMT_ESCAPE_DRIVERPRIVATE;
+  d3dkmt_escape.pPrivateDriverData = priv_data;
+  d3dkmt_escape.PrivateDriverDataSize = priv_size;
+  d3dkmt_escape.Flags.HardwareAccess = true;
+
+  NTSTATUS status = D3DKMTEscape(&d3dkmt_escape);
+  if (status) {
+    debug_print("%s status %d \n", __FUNCTION__, status);
+  } else {
+    rocr_proxy::QueryCalibratedTimestamps(priv_data, gpu, cpu);
+  }
+  rocr_proxy::DestroyPrivData(priv_data);
+}
+
+bool WDDMDevice::CreateQueue(WDDMQueue *queue) {
+  if (!CreateContext(queue->queue_engine, &queue->context))
+    return false;
+
+  GpuMemoryCreateInfo create_info{};
+  create_info.size = queue->cmdbuf_size;
+  create_info.domain = rocr_proxy::kSystem;
+
+  GpuMemory *gpu_mem = nullptr;
+  auto code = 
CreateGpuMemory(create_info, &gpu_mem); + if (code != ErrorCode::Success) + goto err_out0; + + queue->cmdbuf = gpu_mem->GetGpuMemoryHandle(); + queue->cmdbuf_addr = gpu_mem->GpuAddress(); + + if (queue->Init()) + goto err_out1; + + return true; + +err_out1: + delete gpu_mem; +err_out0: + DestroyContext(queue->context); + + return false; +} + +void WDDMDevice::DestroyQueue(WDDMQueue *queue) { + + queue->Fini(); + + auto cmdbuf_mem = GpuMemory::Convert(queue->cmdbuf); + delete cmdbuf_mem; + + DestroyContext(queue->context); +} + +bool WDDMDevice::SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value) { + void *priv_data; + int priv_size; + + priv_size = rocr_proxy::CreateSubmitPrivData(&priv_data, queue->queue, command_addr, command_size, false); + + D3DKMT_SUBMITCOMMAND args = {0}; + args.Commands = command_addr; + args.CommandLength = command_size; + args.BroadcastContextCount = 1; + args.BroadcastContext[0] = queue->context; + args.pPrivateDriverData = priv_data; + args.PrivateDriverDataSize = priv_size; + + NTSTATUS ret = D3DKMTSubmitCommand(&args); + if (ret != STATUS_SUCCESS) { + fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret); + rocr_proxy::DestroyPrivData(priv_data); + return false; + } + + rocr_proxy::DestroyPrivData(priv_data); + + if (!GpuSignal(queue->context, &queue->syncobj, &fence_value, 1)) + return false; + + return true; +} + +bool WDDMDevice::CreateHwQueue(WDDMQueue *queue) { + void *priv_data; + int priv_size; + + bool FwManagedGfxState = SupportStateShadowingByCpFw(); + priv_size = rocr_proxy::CreateHwQueuePrivData(&priv_data, queue->context, + FwManagedGfxState, queue->prio); + + D3DKMT_CREATEHWQUEUE createHwQueue = {0}; + createHwQueue.hHwContext = queue->context; + createHwQueue.Flags.DisableGpuTimeout = rocr_proxy::ShouldDisableGpuTimeout(queue->queue_engine, &device_info_); + createHwQueue.pPrivateDriverData = priv_data; + createHwQueue.PrivateDriverDataSize = priv_size; + + NTSTATUS ret = 
D3DKMTCreateHwQueue(&createHwQueue); + if (ret != STATUS_SUCCESS) { + fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret); + rocr_proxy::DestroyPrivData(priv_data); + return false; + } + + rocr_proxy::DestroyPrivData(priv_data); + + queue->queue = createHwQueue.hHwQueue; + queue->syncobj = createHwQueue.hHwQueueProgressFence; + queue->sync_addr = (uint64_t *)createHwQueue.HwQueueProgressFenceCPUVirtualAddress; + + return true; +} + +bool WDDMDevice::DestroyHwQueue(WDDMQueue *queue) { + D3DKMT_DESTROYHWQUEUE DestroyHwQueue = { + .hHwQueue = queue->queue, + }; + + NTSTATUS ret = D3DKMTDestroyHwQueue(&DestroyHwQueue); + if (ret != STATUS_SUCCESS) { + fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret); + return false; + } + + return true; +} + +bool WDDMDevice::SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value) { + void *priv_data; + int priv_size; + + priv_size = rocr_proxy::CreateSubmitPrivData(&priv_data, queue->queue, command_addr, command_size, true); + + D3DKMT_SUBMITCOMMANDTOHWQUEUE args = {0}; + args.hHwQueue = queue->queue; + args.HwQueueProgressFenceId = fence_value; + args.CommandBuffer = command_addr; + args.CommandLength = command_size; + args.pPrivateDriverData = priv_data; + args.PrivateDriverDataSize = priv_size; + + NTSTATUS ret = D3DKMTSubmitCommandToHwQueue(&args); + if (ret != STATUS_SUCCESS) { + fprintf(stderr, "%s fail %x\n", __FUNCTION__, ret); + rocr_proxy::DestroyPrivData(priv_data); + return false; + } + + rocr_proxy::DestroyPrivData(priv_data); + + return true; +} + +} // namespace core +} // namespace rocr diff --git a/wddm/gpu_memory.cpp b/wddm/gpu_memory.cpp new file mode 100644 index 0000000000..42aeaf68d6 --- /dev/null +++ b/wddm/gpu_memory.cpp @@ -0,0 +1,467 @@ +#include +#include +#include "inc/wddm/gpu_memory.h" +#include "inc/wddm/device.h" +#include "util/utils.h" + +using namespace std; + +namespace rocr { +namespace core { + +size_t GpuMemory::CalcChunkNumbers(gpusize size) { + 
const auto chunk_size = core::WDDMDevice::GpuMemoryChunkSize; + return (size + chunk_size - 1) / chunk_size; +} + +gpusize GpuMemory::AdjustSize(gpusize size) const { + const auto &device_info = device_->DeviceInfo(); + + if (device_info.enable_big_page_alignment && desc_.domain == rocr_proxy::kLocal) { + uint32_t alignment = device_info.big_page_alignment_size; + // BigPage is only supported for allocations > bigPageMinAlignment. + // Also, if bigPageMinAlignment == 0, BigPage optimization is not supported per KMD. + // We do either LargePage or BigPage alignment, whichever has a higher value. + if ((device_info.hw_big_page_min_alignment_size > 0) && (size > device_info.hw_big_page_min_alignment_size)) { + alignment = std::max(alignment, device_info.hw_big_page_min_alignment_size); + if (size > device_info.hw_big_page_alignment_size) + alignment = std::max(alignment, device_info.hw_big_page_alignment_size); + } + if (alignment > 0) + size = AlignUp(size, alignment); + } else { + const size_t min_size = 4096; + size = AlignUp(size, min_size); + } + return size; +} + +GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) { + num_allocations_ = 0; + alloc_handles_ptr_ = nullptr; + alloc_handle_ = 0; + resource_ = 0; +} + +GpuMemory::~GpuMemory() { + FreeGpuVirtualAddress(GpuAddress(), Size()); + FreePhysicalMemory(); +} + +ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) { + desc_.domain = create_info.domain; + desc_.adapter_luid = device_->GetLuid(); + desc_.client_size = create_info.size; + desc_.alignment = create_info.alignment; + desc_.mem_flags = create_info.mem_flags; + desc_.engine_flag = create_info.engine_flag; + desc_.flags.is_virtual = create_info.flags.virtual_alloc; + desc_.flags.is_physical_only = create_info.flags.physical_only; + desc_.flags.is_shared = create_info.flags.interprocess; + desc_.flags.is_locked = create_info.flags.locked; + + desc_.size = AdjustSize(desc_.client_size); + + if (IsUserMemory() || IsSystem()) + 
desc_.cpu_addr = create_info.user_ptr; + + num_allocations_ = CalcChunkNumbers(Size()); + if (num_allocations_ == 1) + alloc_handles_ptr_ = &alloc_handle_; + else + alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_]; + + memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle)); + + auto code = ErrorCode::Success; + + if (IsPhysicalOnly()) { + code = CreatePhysicalMemory(); + return code; + } + + code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment); + if (IsVirtual() || (code != ErrorCode::Success)) + return code; + + bool physical_created = false; + + auto guard = MakeScopeGuard([this, &physical_created, &code]() { + if (code != ErrorCode::Success) { + + if (physical_created) { + FreePhysicalMemory(); + } + FreeGpuVirtualAddress(GpuAddress(), Size()); + } + }); + (void)guard; + + code = CreatePhysicalMemory(); + if (code != ErrorCode::Success) + return code; + + physical_created = true; + + code = MapGpuVirtualAddress(GpuAddress(), Size()); + if (code != ErrorCode::Success) + return code; + + code = MakeResident(); + if (code != ErrorCode::Success) + return code; + + if (!GetDevice()->WaitOnPagingFenceFromCpu()) + code = ErrorCode::Unknown; + + return code; +} + +ErrorCode GpuMemory::UnmapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) { + auto code = ErrorCode::Success; + size_t i = 0; + auto map_addr = addr; + auto map_size = size; + + while (offset >= core::WDDMDevice::GpuMemoryChunkSize) { + offset -= core::WDDMDevice::GpuMemoryChunkSize; + i += 1; + } + + while (map_size > 0) { + auto block_size = std::min(map_size, core::WDDMDevice::GpuMemoryChunkSize); + + D3DDDI_MAPGPUVIRTUALADDRESS args{}; + + args.hPagingQueue = device_->PagingQueue(); + args.BaseAddress = map_addr; + args.hAllocation = GetAllocationHandle(i); + args.SizeInPages = block_size / 0x1000; + args.Protection.NoAccess = 1; + + code = thunk::MapGpuVirtualAddress(&args); + + if (code == ErrorCode::NotReady) + 
device_->UpdatePageFence(args.PagingFenceValue); + else if (code != ErrorCode::Success) + break; + + map_addr += block_size; + map_size -= block_size; + offset = 0; // reset second unmapped allocation offset to zero + i += 1; + } + + return code; +} + +ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) { + + auto code = ErrorCode::Success; + size_t i = 0; + auto map_addr = addr; + auto map_size = size; + const size_t _4K = 0x1000; + + while (offset >= core::WDDMDevice::GpuMemoryChunkSize) { + offset -= core::WDDMDevice::GpuMemoryChunkSize; + i += 1; + } + const size_t first_chunk = i; + const auto first_chunk_offset = offset; + /* Found two limitation for local vram: + * 1. invisible vram va has to be 64K aligned, otherwise map gpu va fail + * 2. visible vram can not be cpu mapped when command submission or after gpu mapped + */ + while (map_size > 0) { + auto block_size = std::min(map_size, core::WDDMDevice::GpuMemoryChunkSize); + + D3DDDI_MAPGPUVIRTUALADDRESS args{}; + + args.hPagingQueue = device_->PagingQueue(); + args.BaseAddress = map_addr; + args.hAllocation = GetAllocationHandle(i); + args.OffsetInPages = offset / _4K; + args.SizeInPages = block_size / _4K; + args.Protection.Write = 1; + + code = thunk::MapGpuVirtualAddress(&args); + + if (code != ErrorCode::Success) { + if (code == ErrorCode::NotReady) { + const uint64_t fence_value = args.PagingFenceValue; + device_->UpdatePageFence(fence_value); + code = ErrorCode::Success; + } else + break; + } + + map_addr += block_size; + map_size -= block_size; + offset = 0; // reset second mapped allocation offset to zero + i++; + } + + if (code != ErrorCode::Success) { + // Map failed, unmap partial mapped block + offset = first_chunk_offset; + map_addr = addr; + map_size = size; + for (size_t j = first_chunk; j < i; j++) { + auto block_size = std::min(map_size, core::WDDMDevice::GpuMemoryChunkSize); + + D3DDDI_MAPGPUVIRTUALADDRESS args{}; + + args.hPagingQueue = 
device_->PagingQueue(); + args.BaseAddress = map_addr; + args.hAllocation = 0; + args.OffsetInPages = offset / _4K; + args.SizeInPages = block_size / _4K; + args.Protection.NoAccess = 1; + + auto unmap_code = thunk::MapGpuVirtualAddress(&args); + if (unmap_code == ErrorCode::NotReady) + device_->UpdatePageFence(args.PagingFenceValue); + + map_addr += block_size; + map_size -= block_size; + } + } + return code; +} + +ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) { + gpusize gpu_virt_addr = 0; + auto status = device_->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &gpu_virt_addr, alignment, + desc_.flags.is_locked); + if (status == ErrorCode::Success) { + desc_.gpu_addr = gpu_virt_addr; + + if (IsSystem()) + desc_.cpu_addr = reinterpret_cast(desc_.gpu_addr); + } + return status; +} + +ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) { + return base_addr != 0 ? + device_->FreeGpuVirtualAddress(desc_.domain, base_addr, size) : + ErrorCode::Success; +} + +ErrorCode GpuMemory::CreatePhysicalMemory() { + + assert(!IsVirtual() && NumChunks() > 0); + + const auto num_allocations = NumChunks(); + void *priv_drv_data; + void *alloc_priv; + int priv_drv_data_size; + int alloc_priv_data_size; + + if (!rocr_proxy::CreatePrivateAllocInfo(NumChunks(), &priv_drv_data, &alloc_priv, + &priv_drv_data_size, &alloc_priv_data_size)) + return ErrorCode::OutOfMemory; + + auto alloc_info = reinterpret_cast( + static_cast(priv_drv_data) + priv_drv_data_size * num_allocations); + + size_t size = desc_.size; + uint64_t addr = desc_.gpu_addr; + char *cpu_addr = static_cast(desc_.cpu_addr); + const auto &device_info = GetDevice()->DeviceInfo(); + + for (size_t i = 0; i < num_allocations; i++) { + + void* priv_data = (void*)((char*)priv_drv_data + priv_drv_data_size * i); + size_t block_size = std::min(size, core::WDDMDevice::GpuMemoryChunkSize); + + if (IsUserMemory() || IsSystem()) { + 
rocr_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, 0, desc_.mem_flags, desc_.engine_flag, device_info); + alloc_info[i].pSystemMem = static_cast(cpu_addr); + cpu_addr += block_size; + } else { + rocr_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, addr, desc_.mem_flags, desc_.engine_flag, device_info); + } + + size -= block_size; + addr += block_size; + + alloc_info[i].pPrivateDriverData = priv_data; + alloc_info[i].PrivateDriverDataSize = priv_drv_data_size; + alloc_info[i].VidPnSourceId = D3DDDI_ID_UNINITIALIZED; + } + + D3DKMT_CREATEALLOCATION args = {}; + args.hDevice = device_->DeviceHandle(); + args.pPrivateDriverData = alloc_priv; + args.PrivateDriverDataSize = alloc_priv_data_size; + args.NumAllocations = num_allocations; + args.pAllocationInfo2 = alloc_info; + + SharedHandleInfo shared_info; + if (IsShared()) { + shared_info.size = desc_.size; + shared_info.client_size = desc_.client_size; + shared_info.domain = desc_.domain; + shared_info.adapter_luid = desc_.adapter_luid; + shared_info.flags = reinterpret_cast(desc_.flags.reserved); + shared_info.mem_flags = desc_.mem_flags; + args.pPrivateRuntimeData = &shared_info; + args.PrivateRuntimeDataSize = sizeof(shared_info); + args.Flags.NtSecuritySharing = 1; + args.Flags.CreateShared = 1; + args.Flags.CreateResource = 1; + } + + auto status = thunk::CreateAllocation(&args); + if (status == ErrorCode::Success) { + for (size_t i = 0; i < num_allocations; i++) + alloc_handles_ptr_[i] = alloc_info[i].hAllocation; + + resource_ = args.hResource; + } + rocr_proxy::DestroyPrivateAllocInfo(priv_drv_data, alloc_priv); + return status; +} + +ErrorCode GpuMemory::FreePhysicalMemory() { + auto code = ErrorCode::Success; + + if (alloc_handles_ptr_ == nullptr || (NumChunks() == 1 && *alloc_handles_ptr_ == 0)) + return code; + + code = thunk::DestroyAllocation(device_->DeviceHandle(), + resource_, + NumChunks(), + alloc_handles_ptr_); + if (NumChunks() > 1) + delete[] alloc_handles_ptr_; + + 
alloc_handles_ptr_ = nullptr; + return code; +} + +ErrorCode GpuMemory::MakeResident() { + + D3DDDI_MAKERESIDENT args = {}; + args.hPagingQueue = device_->PagingQueue(); + args.NumAllocations = NumChunks(); + args.AllocationList = alloc_handles_ptr_; + args.Flags.CantTrimFurther = 1; + + auto code = thunk::MakeResident(&args); + if (code == ErrorCode::NotReady) { + const auto fence_value = args.PagingFenceValue; + device_->UpdatePageFence(fence_value); + code = ErrorCode::Success; + } + return code; +} + +ErrorCode GpuMemory::Evict() { + + D3DKMT_EVICT args = {}; + args.hDevice = device_->DeviceHandle(); + args.NumAllocations = NumChunks(); + args.AllocationList = alloc_handles_ptr_; + + return thunk::Evict(&args); +} + +ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) { + if (IsShared()) + return thunk::ShareObjects(num_allocations_, resource_, flags, dmabuf_fd); + else + return ErrorCode::UnSupported; +} + + +ErrorCode GpuMemory::ImportPhysicalHandle(int dmabuf_fd) { + D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE query_args; + + if (dmabuf_fd <= 0) + return ErrorCode::InvalidateParams; + + memset(&query_args, 0, sizeof(query_args)); + query_args.hDevice = device_->DeviceHandle(); + query_args.hNtHandle = reinterpret_cast(dmabuf_fd); + auto ret = thunk::QueryResourceInfoFromNtHandle(&query_args); + if (ret != ErrorCode::Success) { + debug_print("%s query resource info from nt handle failed %d\n", __FUNCTION__, static_cast(ret)); + return ErrorCode::InvalidateParams; + } + debug_print("wsl-rocr: import from nt handle %d, get allocation number %d," + " runtime data size %" PRIx64 " total driver data size %" PRIx64 "resource data size=%" PRIx64 "\n", + dmabuf_fd, + query_args.NumAllocations, + query_args.PrivateRuntimeDataSize, + query_args.TotalPrivateDriverDataSize, + query_args.ResourcePrivateDriverDataSize); + + SharedHandleInfo shared_info; + if(sizeof(shared_info) != query_args.PrivateRuntimeDataSize) { + debug_print("%s shared hanle info size 
mismatch:%d vs %ld\n", + __FUNCTION__, query_args.PrivateRuntimeDataSize, sizeof(shared_info)); + return ErrorCode::UnSupported; + } + + uint32_t total_size = query_args.NumAllocations * sizeof(D3DDDI_OPENALLOCATIONINFO2) + + query_args.TotalPrivateDriverDataSize + + query_args.ResourcePrivateDriverDataSize; + D3DDDI_OPENALLOCATIONINFO2 *open_info = + reinterpret_cast (calloc(1, total_size)); + if (!open_info) { + debug_print("%s alloc open_info failed, NumAllocations:%d\n", + __FUNCTION__, query_args.NumAllocations); + return ErrorCode::OutOfMemory; + } + + alloc_handles_ptr_ = new WinAllocationHandle[query_args.NumAllocations]; + + D3DKMT_OPENRESOURCEFROMNTHANDLE open_args; + memset(&open_args, 0, sizeof(open_args)); + open_args.hDevice = query_args.hDevice; + open_args.hNtHandle = query_args.hNtHandle; + open_args.NumAllocations = query_args.NumAllocations; + open_args.pOpenAllocationInfo2 = open_info; + open_args.TotalPrivateDriverDataBufferSize = query_args.TotalPrivateDriverDataSize; + open_args.pTotalPrivateDriverDataBuffer = reinterpret_cast + (open_args.pOpenAllocationInfo2 + open_args.NumAllocations); + open_args.ResourcePrivateDriverDataSize = query_args.ResourcePrivateDriverDataSize; + open_args.pResourcePrivateDriverData = reinterpret_cast + (((uint64_t)open_args.pTotalPrivateDriverDataBuffer) + + open_args.TotalPrivateDriverDataBufferSize); + open_args.PrivateRuntimeDataSize = query_args.PrivateRuntimeDataSize; + open_args.pPrivateRuntimeData = reinterpret_cast (&shared_info); + + ret = thunk::OpenResourceFromNtHandle(&open_args); + if (ret != ErrorCode::Success) { + ret = ErrorCode::InvalidateParams; + debug_print("%s open resource failed %d\n", __FUNCTION__, static_cast(ret)); + goto err_out; + } + + desc_.size = shared_info.size; + desc_.client_size = shared_info.client_size; + desc_.domain = shared_info.domain; + desc_.flags.reserved = shared_info.flags; + desc_.mem_flags = shared_info.mem_flags; + desc_.adapter_luid = shared_info.adapter_luid; + 
resource_ = open_args.hResource; + num_allocations_ = open_args.NumAllocations; + for (int i = 0; i < num_allocations_; i++) + alloc_handles_ptr_[i] = open_info[i].hAllocation; + + free(open_info); + return ErrorCode::Success; + +err_out: + delete[] alloc_handles_ptr_; + alloc_handles_ptr_ = nullptr; + free(open_info); + return ret; +} + +} // namespace code +} // namespace rocr diff --git a/wddm/queue.cpp b/wddm/queue.cpp new file mode 100644 index 0000000000..e6856fbd75 --- /dev/null +++ b/wddm/queue.cpp @@ -0,0 +1,989 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include + +#include "inc/wddm/queue.h" +#include "inc/registers.h" +#include "libhsakmt.h" + +namespace rocr { +namespace core { + +hsa_status_t WDDMQueue::SwsInit(void) { + if (!device->CreateSyncobj(&syncobj, &sync_addr)) + return HSA_STATUS_ERROR; + + if (device->AllocUserQueueMemFromUMD()) { + + GpuMemory *gpu_mem = nullptr; + GpuMemoryCreateInfo create_info{}; + + create_info.domain = rocr_proxy::kUserQueue; + create_info.size = device->GetSwsQueueSize(); + create_info.engine_flag = rocr_proxy::QueueEngine2EngineFlag(queue_engine); + + auto code = device->CreateGpuMemory(create_info, &gpu_mem); + if (code != ErrorCode::Success) { + device->DestroySyncobj(syncobj); + return HSA_STATUS_ERROR; + } + + queue_mem = gpu_mem->GetGpuMemoryHandle(); + queue = gpu_mem->GetAllocationHandle(0); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::SwsFini(void) { + device->DestroySyncobj(syncobj); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::SwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value) { + if (!device->SubmitToSwQueue(this, command_addr, command_size, fence_value)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::HwsInit(void) { + if (!device->CreateHwQueue(this)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::HwsFini(void) { + if 
(!device->DestroyHwQueue(this)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::HwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value) { + if (!device->SubmitToHwQueue(this, command_addr, command_size, fence_value)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::SetPriority(hsa_amd_queue_priority_t priority) { + if (!use_hws) + return HSA_STATUS_SUCCESS; + + rocr_proxy::SchedLevel new_prio = ConvertSchedLevel(priority); + if (prio == new_prio) + return HSA_STATUS_SUCCESS; + + debug_print("set prio %d -> %d\n", prio, new_prio); + device->DestroyHwQueue(this); + + prio = new_prio; + return HwsInit(); +} + +extern "C" void (*fn_hsa_signal_store_screlease)(hsa_signal_t hsa_signal, hsa_signal_value_t value); +void ComputeQueue::HandleError(hsa_status_t status) { + hsa_signal_t sig = amd_queue_rocr_->queue_inactive_signal; + hsa_signal_value_t val = -1; + + struct queue_error_t { + uint32_t code; + hsa_status_t status; + }; + static const queue_error_t QueueErrors[] = { + {2, HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS}, + {4, HSA_STATUS_ERROR_INVALID_ALLOCATION}, + {8, HSA_STATUS_ERROR_INVALID_CODE_OBJECT}, + //{16, HSA_STATUS_ERROR_INVALID_ARGUMENT}, + {32, HSA_STATUS_ERROR_INVALID_PACKET_FORMAT}, + {64, HSA_STATUS_ERROR_INVALID_ARGUMENT}, + //{128, HSA_STATUS_ERROR_OUT_OF_REGISTERS}, + //{0x20000000, HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION}, + //{0x40000000, HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION}, + {0x80000000, HSA_STATUS_ERROR_EXCEPTION}, + }; + for (std::size_t i = 0; i < sizeof(QueueErrors) / sizeof(QueueErrors[0]); ++i) { + if (QueueErrors[i].status == status) { + val = QueueErrors[i].code; + debug_print("error %d, sig_val %d\n", status, val); + break; + } + } + + if (sig.handle) { + fn_hsa_signal_store_screlease(sig, val); + } + if (error_code_) { + error_code_->store(val, std::memory_order_release); + } +} + +void ComputeQueue::AqlToPm4Thread(ComputeQueue 
*queue) { + + ComputeQueue *cq = queue; + // This timing system is used for sleeping this Thread + // when one packet is invalid for about 2 seconds. + std::chrono::steady_clock::time_point start_time, time; + // Set the polling timeout value for 2 seconds + const std::chrono::milliseconds kMaxElapsed(2000); + uint64_t current_position = cq->GetAqlWriteIndex(); + bool sleep = false; + start_time = std::chrono::steady_clock::now(); + + while (true) { + if (!cq->IsInvalidPacket()) { + hsa_status_t status = cq->Process(); + if (status != HSA_STATUS_SUCCESS) { + fprintf(stderr, "process compute queue fail status = %08x\n", status); + queue->HandleError(status); + break; + } + } + + std::unique_lock lock(queue->thread_cond_lock_); + if (current_position == cq->GetAqlWriteIndex()) { + time = std::chrono::steady_clock::now(); + if (time - start_time > kMaxElapsed) + sleep = true; + } else { + start_time = std::chrono::steady_clock::now(); + current_position = cq->GetAqlWriteIndex(); + sleep = false; + } + // CPU wait for valid packet + if (cq->GetRingWptr()->load() <= cq->GetRingRptr()->load() || + (sleep && cq->IsInvalidPacket())) { + if (queue->thread_stop_) + break; + debug_print("wait %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n", + queue->ring, cq->GetRingWptr()->load(), cq->GetRingRptr()->load()); + queue->thread_cond_.wait(lock); + } + } + + debug_print("aql to pm4 thread %p exit\n", queue->ring); +} + +ComputeQueue::ComputeQueue(WDDMDevice *device, + void *ring, + uint64_t ring_size, + std::atomic *ring_wptr, + std::atomic *ring_rptr, + volatile int64_t *error_addr, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws) : + WDDMQueue(device, cmdbuf_size, engine, use_hws), + ring(ring), + ring_size(ring_size), + ring_wptr(ring_wptr), + ring_rptr(ring_rptr), + error_code_(reinterpret_cast*>(error_addr)), + ib_start_addr(0), + ib_size(0), + sync_point(0), + cmdbuf_aql_frame_write_index(0), + cmdbuf_aql_frame_size(0), + needs_barrier(true), + ready_to_submit(false), + 
thread_stop_(false), + scratch_waves_(device->MaxScratchSlotsPerCu() * device->ComputeUnitCount()), + scratch_size_per_wave_(0), + scratch_size_(0), + scratch_base_(nullptr) { + + bool ret = device->CreateQueue(this); + assert(ret); + + GpuMemoryCreateInfo create_info{}; + create_info.size = PAGE_SIZE; + create_info.domain = rocr_proxy::kSystem; + GpuMemory *gpu_mem = nullptr; + auto code = device->CreateGpuMemory(create_info, &gpu_mem); + assert(code == ErrorCode::Success); + amd_queue_mem_ = gpu_mem->GetGpuMemoryHandle(); + amd_queue_ = reinterpret_cast(gpu_mem->GpuAddress()); + + aql_to_pm4_thread_ = std::thread(AqlToPm4Thread, this); + amd_queue_rocr_ = (amd_queue_t*)((char*)ring_rptr - offsetof(amd_queue_t, read_dispatch_id)); +} + +ComputeQueue::~ComputeQueue() { + thread_cond_lock_.lock(); + thread_stop_ = true; + thread_cond_lock_.unlock(); + thread_cond_.notify_one(); + aql_to_pm4_thread_.join(); + + //doorbell_signal_->Release(); + + device->DestroyQueue(this); + + if (scratch_base_) { + auto scratch_gpu_mem = GpuMemory::Convert(scratch_mem_); + delete scratch_gpu_mem; + } + + auto amd_queue_gpu_mem = GpuMemory::Convert(amd_queue_mem_); + delete amd_queue_gpu_mem; +} + +void ComputeQueue::InitScratchSRD() { + // Populate scratch resource descriptor + SQ_BUF_RSRC_WORD0 srd0; + + uintptr_t scratch_base = uintptr_t(scratch_base_); + srd0.bits.BASE_ADDRESS = scratch_base; + + uint32_t srd1_u32; + + if (device->Major() < 11) { + SQ_BUF_RSRC_WORD1 srd1; + + srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32; + srd1.bits.STRIDE = 0; + srd1.bits.CACHE_SWIZZLE = 0; + srd1.bits.SWIZZLE_ENABLE = 1; + + srd1_u32 = srd1.u32All; + } else { + SQ_BUF_RSRC_WORD1_GFX11 srd1; + + srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32; + srd1.bits.STRIDE = 0; + srd1.bits.SWIZZLE_ENABLE = 1; + + srd1_u32 = srd1.u32All; + } + + SQ_BUF_RSRC_WORD2 srd2; + + srd2.bits.NUM_RECORDS = scratch_size_; + + uint32_t srd3_u32; + + if (device->Major() < 10) { + SQ_BUF_RSRC_WORD3 srd3; + + 
srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT; + srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32; + srd3.bits.ELEMENT_SIZE = 1; // 4 + srd3.bits.INDEX_STRIDE = 3; // 64 + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.ATC__CI__VI = 0; + srd3.bits.HASH_ENABLE = 0; + srd3.bits.HEAP = 0; + srd3.bits.MTYPE__CI__VI = 0; + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } else if (device->Major() == 10) { + SQ_BUF_RSRC_WORD3_GFX10 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.RESOURCE_LEVEL = 1; + srd3.bits.RESERVED2 = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } else if (device->Major() == 11) { + SQ_BUF_RSRC_WORD3_GFX11 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.RESERVED2 = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } else { + SQ_BUF_RSRC_WORD3_GFX12 srd3; + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.WRITE_COMPRESS_ENABLE = 0; + srd3.bits.COMPRESSION_EN = 0; + srd3.bits.COMPRESSION_ACCESS_MODE = 0; + 
srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } + + // Update Queue's Scratch descriptor's property + amd_queue_->scratch_resource_descriptor[0] = srd0.u32All; + amd_queue_->scratch_resource_descriptor[1] = srd1_u32; + amd_queue_->scratch_resource_descriptor[2] = srd2.u32All; + amd_queue_->scratch_resource_descriptor[3] = srd3_u32; + + // Populate flat scratch parameters in amd_queue_. + amd_queue_->scratch_backing_memory_location = scratch_base; + amd_queue_->scratch_backing_memory_byte_size = scratch_size_; + + // For backwards compatibility this field records the per-lane scratch + // for a 64 lane wavefront. If scratch was allocated for 32 lane waves + // then the effective size for a 64 lane wave is halved. + amd_queue_->scratch_wave64_lane_byte_size = scratch_size_per_wave_ / 64; + + if (device->Major() < 11) { + COMPUTE_TMPRING_SIZE tmpring_size; + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / 1024; + tmpring_size.bits.WAVES = scratch_waves_; + + amd_queue_->compute_tmpring_size = tmpring_size.u32All; + } else if (device->Major() == 11) { + COMPUTE_TMPRING_SIZE_GFX11 tmpring_size; + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ >> 8; + tmpring_size.bits.WAVES = scratch_waves_ / device->NumShaderEngine(); + + amd_queue_->compute_tmpring_size = tmpring_size.u32All; + } else { + COMPUTE_TMPRING_SIZE_GFX12 tmpring_size = {}; + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ >> 8; + tmpring_size.bits.WAVES = scratch_waves_ / device->NumShaderEngine(); + + amd_queue_->compute_tmpring_size = tmpring_size.u32All; + } + + return; +} + +bool ComputeQueue::UpdateScratch(uint32_t private_segment_size, bool wave32) { + const uint32_t wavefront = wave32 ? 
32 : 64; + const uint32_t alignment = 1024 / wavefront; + private_segment_size = AlignUp(private_segment_size, alignment); + + uint32_t scratch_size_per_wave = private_segment_size * wavefront; + uint32_t scratch_size = scratch_size_per_wave * scratch_waves_; + + if (scratch_size_ >= scratch_size) + return true; + + debug_print("need realloc scratch buffer, size %x -> %x\n", + scratch_size_, scratch_size); + + GpuMemoryCreateInfo create_info{}; + create_info.size = scratch_size; + create_info.domain = rocr_proxy::kLocal; + GpuMemory *gpu_mem = nullptr; + auto code = device->CreateGpuMemory(create_info, &gpu_mem); + if (code != ErrorCode::Success) + return false; + + if (scratch_base_) { + auto scratch_gpu_mem = GpuMemory::Convert(scratch_mem_); + delete scratch_gpu_mem; + } + + scratch_size_per_wave_ = scratch_size_per_wave; + scratch_size_ = scratch_size; + scratch_base_ = reinterpret_cast(gpu_mem->GpuAddress()); + scratch_mem_ = gpu_mem->GetGpuMemoryHandle(); + + InitScratchSRD(); + return true; +} + +bool ComputeQueue::RelocateCmdbufScratchBase(uint64_t addr) { + if (scratch_base_offset_array_.empty()) + return true; + + for (size_t i = 0; i < scratch_base_offset_array_.size(); i++) { + uint32_t *p_compute_user_data = + reinterpret_cast(addr + scratch_base_offset_array_[i]); + if (device->Major() >= 11) { + p_compute_user_data[0] = Ptr48Low32(scratch_base_); + p_compute_user_data[1] = Ptr48High8(scratch_base_); + } else { + p_compute_user_data[0] = PtrLow32(scratch_base_); + p_compute_user_data[1] = (p_compute_user_data[1] & 0xffff0000) | PtrHigh32(scratch_base_); + } + } + scratch_base_offset_array_.clear(); + + return true; +} + +uint32_t ComputeQueue::UpdateIndexStride(uint32_t srd, bool wave32) { + + assert(device->Major() < 13); + + if (device->Major() == 10) { + SQ_BUF_RSRC_WORD3_GFX10 srd3; + + srd3.u32All = srd; + srd3.bits.INDEX_STRIDE = wave32 ? 
2 : 3; + + return srd3.u32All; + } else if (device->Major() == 11) { + SQ_BUF_RSRC_WORD3_GFX11 srd3; + + srd3.u32All = srd; + srd3.bits.INDEX_STRIDE = wave32 ? 2 : 3; + + return srd3.u32All; + } else if (device->Major() == 12) { + SQ_BUF_RSRC_WORD3_GFX12 srd3; + + srd3.u32All = srd; + srd3.bits.INDEX_STRIDE = wave32 ? 2 : 3; + + return srd3.u32All; + } + + return srd; +} + +extern "C" hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); +uint64_t ComputeQueue::GetKernelObjAddr(uint64_t addr) const { +//TODO: convert dev_addr to host_addr + uint64_t host_addr = 0; + auto ret = fn_hsa_ven_amd_loader_query_host_address(reinterpret_cast(addr), + reinterpret_cast(&host_addr)); + if (ret == HSA_STATUS_ERROR_INVALID_ARGUMENT) { + return NULL; + } + + return host_addr; +} + +void ComputeQueue::RingDoorbell() { + thread_cond_lock_.lock(); + thread_cond_lock_.unlock(); + debug_print("notify %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n", + ring, GetRingWptr()->load(), GetRingRptr()->load()); + thread_cond_.notify_one(); +} + +hsa_status_t ComputeQueue::Init(void) { + hsa_status_t ret = use_hws ? HwsInit() : SwsInit(); + if (ret) + return ret; + + ib_start_addr = cmdbuf_addr; + cmdbuf_aql_frame_size = device->GetAqlFrameSize(); + + return ret; +} + +hsa_status_t ComputeQueue::Fini(void) { + return use_hws ? 
HwsFini() : SwsFini(); +} + +hsa_status_t ComputeQueue::PreSubmit(void) { + if (!device->WaitPagingFence(this)) + return HSA_STATUS_ERROR; + + RelocateCmdbufScratchBase(ib_start_addr); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ComputeQueue::EndSubmit(void) { + // record last submitted cmdbuf_aql_frame_write_index to see if GPU is hungry + sync_point = cmdbuf_aql_frame_write_index; + + ib_start_addr = cmdbuf_addr + + (cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum()) * + cmdbuf_aql_frame_size; + ib_size = 0; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ComputeQueue::Submit(void) { + hsa_status_t ret = PreSubmit(); + if (ret) + return HSA_STATUS_ERROR; + + ret = use_hws ? + HwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index) : + SwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index); + if (ret) + return HSA_STATUS_ERROR; + + ret = EndSubmit(); + if (ret) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t +ComputeQueue::KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet) { + debug_print("queue %p kernel dispatch head=%x setup=%x wx=%x wy=%x wz=%x " + "gx=%x gy=%x gz=%x ps=%x gs=%x ko=%" PRIx64 " ka=%p cs=%" PRIx64 "\n", + ring, packet->header, + packet->setup, packet->workgroup_size_x, packet->workgroup_size_y, + packet->workgroup_size_z, packet->grid_size_x, packet->grid_size_y, + packet->grid_size_z, packet->private_segment_size, + packet->group_segment_size, packet->kernel_object, packet->kernarg_address, + packet->completion_signal.handle); + + if (packet->workgroup_size_x > 1024 || + packet->workgroup_size_y > 1024 || + packet->workgroup_size_z > 1024) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + int major = device->Major(); + int i = ib_size; + + const amd_kernel_code_t* kernel_object = + (const amd_kernel_code_t *)GetKernelObjAddr(packet->kernel_object); + if (kernel_object == NULL) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + void* entry = 
(void*)(packet->kernel_object + kernel_object->kernel_code_entry_byte_offset); + assert((size_t)entry % AMD_ISA_ALIGN_BYTES == 0); + + debug_print("kernel object property=%x entry=%p lds=%x+%x\n", + kernel_object->kernel_code_properties, entry, + kernel_object->workgroup_group_segment_byte_size, + packet->group_segment_size); + + if (packet->setup == 0 || packet->setup > 3) + return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; + if (packet->group_segment_size > device->LdsSize()) + return HSA_STATUS_ERROR_INVALID_ALLOCATION; + + uint32_t lds_blks = device->LdsBlocks(packet); + if (lds_blks > 128) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + const bool wave32 = + AMD_HSA_BITS_GET(kernel_object->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32); + + assert(packet->private_segment_size >= kernel_object->workitem_private_segment_byte_size); + UpdateScratch(packet->private_segment_size, wave32); + + amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle; + + // Record start timestamp when enabling profiling + if (signal && EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i); + + // Build a barrier packet if it is requested + const bool is_barrier_packet = (packet->header >> HSA_PACKET_HEADER_BARRIER) & 0x1; + if (is_barrier_packet && needs_barrier) + i += cmd_util.BuildBarrier(cpu + i); + + // flush cache + i += cmd_util.BuildAcquireMem(major, cpu + i); + + if (major >= 11) { + AppendCmdbufSratchBaseOffset( + i + offsetof(struct SetScratchTemplate, scratch_lo)); + + i += cmd_util.BuildScratch(ScratchBase(), cpu + i); + i += cmd_util.BuildComputeShaderParams(cpu + i); + } + + struct DispatchInfo info; + info.major = major; + info.pPacket = packet; + info.pEntry = entry; + info.pKernelObject = kernel_object; + info.ldsBlks = lds_blks; + info.pAmdQueue = amd_queue_; + info.wave32 = wave32; + info.srd = UpdateIndexStride( + info.pAmdQueue->scratch_resource_descriptor[3], wave32); + info.pScratchBase = 
ScratchBase(); + info.scratchSizePerWave = ScratchSizePerWave(); + memset(info.scratchBaseOffset, 0, sizeof(info.scratchBaseOffset)); + info.offsetCnt = 0; + + size_t size; + size = cmd_util.BuildDispatch(&info, cpu + i); + for (int j = 0; j < info.offsetCnt; j++) + AppendCmdbufSratchBaseOffset(i + info.scratchBaseOffset[j]); + i += size; + + needs_barrier = (packet->completion_signal.handle == 0); + + if (signal) { + // wait cs done + i += cmd_util.BuildBarrier(cpu + i); + + // Record end timestamp when enabling profiling + if (EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i); + + // flush cache + i += cmd_util.BuildAcquireMem(major, cpu + i); + + assert(signal->kind == AMD_SIGNAL_KIND_USER); + uint64_t *signal_addr = (uint64_t *)&signal->value; + debug_print("signal value=%" PRIx64 "\n", signal->value); + + i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1); + } + + // The ring_rptr is used to record pm4 queue rptr value, + // dispatch readptr position, this is used to share rptr with + // aql queue. + i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i); + + ib_size = i; + cmdbuf_aql_frame_write_index++; + packet->header = HSA_PACKET_TYPE_INVALID; + + return HSA_STATUS_SUCCESS; +} + +extern "C" hsa_signal_value_t (*fn_hsa_signal_load_relaxed)( + hsa_signal_t signal); +extern "C" hsa_signal_value_t (*fn_hsa_signal_wait_relaxed)( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); +hsa_status_t +ComputeQueue::BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or) { + debug_print("queue %p %s head=%x dep %" PRIx64 " %" PRIx64 " %" PRIx64 + " %" PRIx64 " %" PRIx64 " cs=%" PRIx64"\n", + ring, is_or ? 
"or" : "and", + packet->header, packet->dep_signal[0].handle, + packet->dep_signal[1].handle, packet->dep_signal[2].handle, + packet->dep_signal[3].handle, packet->dep_signal[4].handle, + packet->completion_signal.handle); + // fix me: can we use gpu packet? + if (is_or) { + bool unsignaled = true; + hsa_signal_t sig[5]; + int n = 0; + for (int i = 0; i < 5; i++) { + if (packet->dep_signal[i].handle) + sig[n++] = packet->dep_signal[i]; + } + + while (n) { + for (int i = 0; i < n; i++) { + if (!fn_hsa_signal_load_relaxed(sig[i])) { + unsignaled = false; + break; + } + } + if (!unsignaled) + break; + + std::this_thread::sleep_for(std::chrono::microseconds(20)); + } + } else { + for (int i = 0; i < 5; i++) { + if (!packet->dep_signal[i].handle) + continue; + + hsa_signal_value_t value = + fn_hsa_signal_wait_relaxed(packet->dep_signal[i], HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + assert(value == 0); + } + } + + int major = device->Major(); + int i = ib_size; + + if (packet->completion_signal.handle != 0) { + amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle; + assert(signal->kind == AMD_SIGNAL_KIND_USER); + uint64_t *signal_addr = (uint64_t *)&signal->value; + debug_print("signal value=%" PRIx64 "\n", signal->value); + + // Record start timestamp when enabling profiling + if (EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i); + + if (needs_barrier) + i += cmd_util.BuildBarrier(cpu + i); + + needs_barrier = false; + + // Record end timestamp when enabling profiling + if (EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i); + + // flush cache + i += cmd_util.BuildAcquireMem(major, cpu + i); + + i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1); + } + + // The ring_rptr is used to record pm4 queue rptr value, + // dispatch readptr position, this is used to share rptr with + // aql queue. 
+ i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i); + + ib_size = i; + cmdbuf_aql_frame_write_index++; + packet->header = HSA_PACKET_TYPE_INVALID; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ComputeQueue::VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet) { + constexpr uint32_t AMD_AQL_FORMAT_PM4_IB = 0x1; + assert(packet->ven_hdr == AMD_AQL_FORMAT_PM4_IB); + + uint8_t op = (packet->ib_jump_cmd[0] >> PM4_OPCODE_SHIFT) & 0xff; + assert(op == IT_INDIRECT_BUFFER); + uint32_t* pm4_addr = reinterpret_cast((static_cast(packet->ib_jump_cmd[2]) << 32) | (static_cast(packet->ib_jump_cmd[1]) & ~3ull)); + uint32_t pm4_size = packet->ib_jump_cmd[3]&0xfffff; + debug_print("queue %p %s VENDOR_SPECIFIC pkt pm4_addr %p pm4_size %" PRIx64 " cs=%" PRIx64"\n", + ring, vendor_packet_support ? "process" : "skip", pm4_addr, pm4_size, + packet->completion_signal.handle); + for (int i = 0; i < pm4_size; i++) { + debug_print("pm4_addr[%d]=%#x\n", i, pm4_addr[i]); + } + + if (vendor_packet_support) { + int major = device->Major(); + int i = ib_size; + + memcpy(cpu+i, pm4_addr, pm4_size * sizeof(uint32_t)); + i += pm4_size * sizeof(uint32_t); + + if (packet->completion_signal.handle != 0) { + amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle; + assert(signal->kind == AMD_SIGNAL_KIND_USER); + uint64_t *signal_addr = (uint64_t *)&signal->value; + debug_print("signal value=%" PRIx64 "\n", signal->value); + + // Record start timestamp when enabling profiling + if (EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i); + + //if (needs_barrier) + i += cmd_util.BuildBarrier(cpu + i); + + //needs_barrier = false; + + // Record end timestamp when enabling profiling + if (EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i); + + // flush cache + i += cmd_util.BuildAcquireMem(major, cpu + i); + + i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, 
cache_policy__mec_atomic_mem__bypass, -1); + } + + // The ring_rptr is used to record pm4 queue rptr value, + // dispatch readptr position, this is used to share rptr with + // aql queue. + i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i); + + ib_size = i; + } else { + if (packet->completion_signal.handle != 0) { + fn_hsa_signal_store_screlease(packet->completion_signal, 0); + } + } + + cmdbuf_aql_frame_write_index++; + packet->header = HSA_PACKET_TYPE_INVALID; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ComputeQueue::SwitchAql2PM4(void) { + + uint16_t *packet = (uint16_t *) ((char *)ring + + (cmdbuf_aql_frame_write_index % ring_size) * 64); + uint16_t header = (*packet >> HSA_PACKET_HEADER_TYPE); + header &= (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; + hsa_kernel_dispatch_packet_t *aql_packet = + (hsa_kernel_dispatch_packet_t *)packet; + hsa_status_t ret; + + switch (header) { + case HSA_PACKET_TYPE_KERNEL_DISPATCH: + ret = KernelDispatchAqlToPm4((char *)ib_start_addr, aql_packet); + if (ret != HSA_STATUS_SUCCESS) + return ret; + + // Stop merging packages util below conditions are met: + // 1) The kernel with completion signal; + // 2) The cmdbuf_aql_frame_write_index reaches the end of cmdbuf + // 3) The queue is empty now, submit the package right now. 
+ if (!(aql_packet->completion_signal.handle) && + (cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum()) && + (*sync_addr != sync_point)) + return HSA_STATUS_SUCCESS; + + break; + case HSA_PACKET_TYPE_BARRIER_AND: + BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet); + break; + case HSA_PACKET_TYPE_BARRIER_OR: + BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet, true); + break; + case HSA_PACKET_TYPE_VENDOR_SPECIFIC: + VendorSpecificAqlToPm4((char *)ib_start_addr, (amd_aql_pm4_ib *)aql_packet); + break; + case HSA_PACKET_TYPE_INVALID: + // When packets are submitted out of order, the format field of current AQL packet + // may not have been updated yet and is still INVALID. Return HSA_STATUS_SUCCESS and + // do not process AQL packets before the packet format field is updated. + assert(false && "Should not reach here, HSA_PACKET_TYPE_INVALID has been filtered in upper layer"); + return HSA_STATUS_SUCCESS; + default: + return HSA_STATUS_ERROR_INVALID_PACKET_FORMAT; + } + + ready_to_submit = true; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ComputeQueue::Process(void) { + + while (cmdbuf_aql_frame_write_index < ring_wptr->load() && + !IsInvalidPacket()) { + debug_print("process %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n", + ring, ring_wptr->load(), ring_rptr->load()); + + hsa_status_t ret; + + // wait for next few cmdbuf slots to be free + // If wptr catch up the rptr in the cmdbuf, this needs wait for the rptr to free the cmdbuf. + // Here the wptr comes from queue->cmdbuf_aql_frame_write_index, while rptr comes from *queue->sync_addr. 
+ if (*sync_addr + WDDMDevice::GetAqlFrameNum() <= cmdbuf_aql_frame_write_index) { + uint64_t value = cmdbuf_aql_frame_write_index - WDDMDevice::GetAqlFrameNum() + 1; + if (!device->CpuWait(&syncobj, &value, 1, false)) + return HSA_STATUS_ERROR; + } + + ret = SwitchAql2PM4(); + if (ret != HSA_STATUS_SUCCESS) + return ret; + + if (!ready_to_submit) + continue; + + ret = Submit(); + if (ret != HSA_STATUS_SUCCESS) + return ret; + + ready_to_submit = false; + + debug_print("done %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n", + ring, ring_wptr->load(), ring_rptr->load()); + + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t SDMAQueue::Init(void) { + hsa_status_t ret = use_hws ? HwsInit() : SwsInit(); + if (ret) + return ret; + + std::memset((char *)cmdbuf_addr, 0, cmdbuf_size); + + return ret; +} + +hsa_status_t SDMAQueue::Fini(void) { + return use_hws ? HwsFini() : SwsFini(); +} + +int SDMAQueue::PreparePacket(uint32_t offset, uint64_t size) { + ib_start_addr = cmdbuf_addr + offset; + ib_size = size; + rptr_next += ib_size; + + return STATUS_SUCCESS; +} + +hsa_status_t SDMAQueue::Submit(void) { + if (!device->WaitPagingFence(this)) + return HSA_STATUS_ERROR; + + int ret = use_hws ? 
+ HwsSubmit(ib_start_addr, ib_size, rptr_next) : + SwsSubmit(ib_start_addr, ib_size, rptr_next); + if (ret) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +} +} diff --git a/wddm/va_mgr.cpp b/wddm/va_mgr.cpp new file mode 100644 index 0000000000..426eaa2a5f --- /dev/null +++ b/wddm/va_mgr.cpp @@ -0,0 +1,163 @@ +#include +#include +#include +#include "inc/wddm/va_mgr.h" + +using namespace std; + +namespace rocr { +namespace core { + +VaMgr::VaMgr(uint64_t start, uint64_t size, uint64_t min_align) { + min_align_ = min_align; + auto free_it = free_list_.insert(make_pair(size, start)); + frag_map_[start] = make_fragment(free_it, size); +} + +VaMgr::~VaMgr() { + + assert(free_list_.size() == 1); + assert(frag_map_.size() == 1); + + free_list_.clear(); + frag_map_.clear(); +} + +uint64_t VaMgr::Alloc(uint64_t bytes, uint64_t align, uint64_t addr) { + + if (addr > 0 && + (align == 0 || (addr % align) == 0)) { + + lock_guard gard(lock_); + auto frag_it = frag_map_.upper_bound(addr); + assert(frag_it != frag_map_.begin()); + --frag_it; + + while (frag_it != frag_map_.begin()) { + const uint64_t base = frag_it->first; + const uint64_t size = frag_it->second.size; + + // Cannot find free fragment contains the target `addr` + if (bytes > size || addr < base || addr + bytes >= base + size || + !is_free(frag_it->second)) { + --frag_it; + continue; + } else if (addr >= base + size) + break; + + + // Try to allocate target `addr` from this free fragment + auto free_it = frag_it->second.free_list_entry_; + assert(free_it != free_list_.end()); + + free_list_.erase(free_it); + frag_it->second.size = bytes; + set_used(frag_it->second); + + // [base, addr) + if (addr > base) add_free_fragment(addr - base, base); + + // [addr, addr + bytes) is used + + // [addr + bytes, base + size) + if (base + size > addr + bytes) add_free_fragment(base + size - addr - bytes, addr + bytes); + + return addr; + } + } + + // Allocate not fixed address + return AllocImpl(bytes, align); +} 
+ +uint64_t VaMgr::AllocImpl(const uint64_t bytes, const uint64_t align) { + uint64_t addr = 0; + uint64_t align_bytes = bytes; + const int retry = align == 0 ? 0 : 1; + const uint64_t new_align = align == 0 ? min_align_ : AlignUp(align, min_align_); + + lock_guard gard(lock_); + for (int i = 0; i <= retry; i++) { + auto free_it = free_list_.lower_bound(align_bytes); + if (free_it == free_list_.end()) break; + + uint64_t base = free_it->second; + uint64_t size = free_it->first; + + assert(size >= align_bytes); + + auto fragment = frag_map_.find(base); + + assert(fragment != frag_map_.end()); + assert(size == fragment->second.size); + + uint64_t delta = align == 0 ? 0 : base % align; + if (delta == 0) { + // already find aligned address + addr = base; + + free_list_.erase(free_it); + fragment->second.size = bytes; + set_used(fragment->second); + + if (size > bytes) add_free_fragment(size - bytes, base + bytes); + + break; + } else if (i == 0) { + align_bytes += new_align; + continue; + } else { + uint64_t aligned_base = base + align - delta; + addr = aligned_base; + + free_list_.erase(free_it); + + add_used_fragment(bytes, aligned_base); + add_free_fragment(aligned_base - base, base); + + if (size > aligned_base - base + bytes) + add_free_fragment(size - (aligned_base - base) - bytes, aligned_base + bytes); + + break; + } + } + return addr; +} + +void VaMgr::Free(uint64_t addr) { + if (addr == 0) return; + + lock_guard gard(lock_); + auto frag_it = frag_map_.find(addr); + if (frag_it == frag_map_.end() || is_free(frag_it->second)) return; + + uint64_t base = addr; + // Merge lower + if (frag_it != frag_map_.begin()) { + auto lower = frag_it; + --lower; + if (is_free(lower->second)) { + remove_free_list_entry(lower->second); + base -= lower->second.size; + lower->second.size += frag_it->second.size; + frag_map_.erase(frag_it); + frag_it = lower; + } + } + // Merge upper + { + auto upper = frag_it; + ++upper; + if (upper != frag_map_.end() && is_free(upper->second)) { 
+ remove_free_list_entry(upper->second); + frag_it->second.size += upper->second.size; + frag_map_.erase(upper); + } + } + uint64_t size = frag_it->second.size; + auto it = free_list_.insert(make_pair(size, base)); + set_free(frag_it->second, it); +} + +} // namespace core +} // namespace rocr