diff --git a/CMakeLists.txt b/CMakeLists.txt index a3a4d999c1..ccf39e8d82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -129,7 +129,8 @@ set ( HSAKMT_SRC "src/debug.c" "src/topology.c" "src/rbtree.c" "src/spm.c" - "src/version.c") + "src/version.c" + "src/svm.c") ## Declare the library target name add_library ( ${HSAKMT_TARGET} "") diff --git a/include/hsakmt.h b/include/hsakmt.h index 83f41cac90..39b1a0c792 100644 --- a/include/hsakmt.h +++ b/include/hsakmt.h @@ -844,6 +844,16 @@ hsaKmtGetQueueSnapshot( HSAuint32 *QssEntries // IN/OUT ); +/** + Send the host trap +*/ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSendHostTrap( + HSAuint32 NodeId, //IN + HSAuint32 Pid //IN + ); + /** Set the trap override mask. When debug trap is enabled by hsaKmtEnableDebugTrap() each wave launched has its initial @@ -1244,6 +1254,37 @@ hsaKmtSPMSetDestBuffer( bool *isSPMDataLoss //OUT ); +/* Helper functions for calling KFD SVM ioctl */ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSVMSetAttr( + void *start_addr, // IN: Start of the virtual address range (page-aligned) + HSAuint64 size, // IN: size (page-aligned) + unsigned int nattr, // IN: number of attributes + HSA_SVM_ATTRIBUTE *attrs // IN: array of attributes +); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSVMGetAttr( + void *start_addr, // IN: Start of the virtual address range (page-aligned) + HSAuint64 size, // IN: size (page aligned) + unsigned int nattr, // IN: number of attributes + HSA_SVM_ATTRIBUTE *attrs // IN/OUT: array of attributes +); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetXNACKMode( + HSAint32 enable // IN: enable/disable XNACK mode. +); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetXNACKMode( + HSAint32 * enable // OUT: returns XNACK value. 
+); + #ifdef __cplusplus } //extern "C" #endif diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h index ef97539ea1..afc811d2c9 100644 --- a/include/hsakmttypes.h +++ b/include/hsakmttypes.h @@ -541,7 +541,8 @@ typedef struct _HsaMemFlags // and optimal alignment requirements unsigned int FixedAddress : 1; // Allocate memory at specified virtual address. Fail if address is not free. unsigned int NoNUMABind: 1; // Don't bind system memory to a specific NUMA node - unsigned int Reserved : 15; + unsigned int Uncached: 1; // Caching flag for fine-grained memory on A+A HW platform + unsigned int Reserved : 14; } ui32; HSAuint32 Value; @@ -1296,6 +1297,35 @@ typedef struct _HsaMemoryRange { HSAuint64 SizeInBytes; // Size of above memory } HsaMemoryRange; +typedef enum _HSA_SVM_FLAGS { + HSA_SVM_FLAG_HOST_ACCESS = 0x00000001, // Guarantee host access to memory + HSA_SVM_FLAG_COHERENT = 0x00000002, // Fine grained coherency between all devices with access + HSA_SVM_FLAG_HIVE_LOCAL = 0x00000004, // Use any GPU in same hive as preferred device + HSA_SVM_FLAG_GPU_RO = 0x00000008, // GPUs only read, allows replication + HSA_SVM_FLAG_GPU_EXEC = 0x00000010, // Allow execution on GPU +} HSA_SVM_FLAGS; + +typedef enum _HSA_SVM_ATTR_TYPE { + HSA_SVM_ATTR_PREFERRED_LOC, // gpuid of the preferred location, 0 for + // system memory, INVALID_NODEID for + // "don't care" + HSA_SVM_ATTR_PREFETCH_LOC, // gpuid of the prefetch location, 0 for + // system memory. 
Setting this triggers an + // immediate prefetch (migration) + HSA_SVM_ATTR_ACCESS, + HSA_SVM_ATTR_ACCESS_IN_PLACE, + HSA_SVM_ATTR_NO_ACCESS, // specify memory access for the gpuid given + // by the attribute value + HSA_SVM_ATTR_SET_FLAGS, // bitmask of flags to set (see HSA_SVM_FLAGS) + HSA_SVM_ATTR_CLR_FLAGS, // bitmask of flags to clear + HSA_SVM_ATTR_GRANULARITY // migration granularity (log2 num pages) +} HSA_SVM_ATTR_TYPE; + +typedef struct _HSA_SVM_ATTRIBUTE { + HSAuint32 type; // attribute type (see enum HSA_SVM_ATTR_TYPE) + HSAuint32 value; // attribute value +} HSA_SVM_ATTRIBUTE; + #pragma pack(pop, hsakmttypes_h) diff --git a/include/linux/kfd_ioctl.h b/include/linux/kfd_ioctl.h index 62a84e4e7e..2d948a68cd 100644 --- a/include/linux/kfd_ioctl.h +++ b/include/linux/kfd_ioctl.h @@ -224,6 +224,7 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_DBG_EV_STATUS_VMFAULT 2 #define KFD_DBG_EV_STATUS_SUSPENDED 4 #define KFD_DBG_EV_STATUS_NEW_QUEUE 8 +#define KFD_DBG_EV_STATUS_HOST_TRAP_TIMEDOUT 16 #define KFD_DBG_EV_FLAG_CLEAR_STATUS 1 #define KFD_INVALID_QUEUEID 0xffffffff @@ -309,6 +310,14 @@ struct kfd_ioctl_dbg_wave_control_args { */ #define KFD_IOC_DBG_TRAP_SET_ADDRESS_WATCH 9 +/* KFD_IOC_DBG_SEND_HOST_TRAP: + * ptr: unused + * data1: unused + * data2: unused + * data3: unused + */ +#define KFD_IOC_DBG_TRAP_SEND_HOST_TRAP 10 + struct kfd_ioctl_dbg_trap_args { __u64 ptr; /* to KFD -- used for pointer arguments: queue arrays */ __u32 pid; /* to KFD */ @@ -479,6 +488,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) +#define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) /* Allocate memory for later SVM (shared virtual memory) mapping. 
* @@ -721,6 +731,166 @@ struct kfd_ioctl_cross_memory_copy_args { __u64 bytes_copied; }; + +/* Guarantee host access to memory */ +#define KFD_IOCTL_SVM_FLAG_HOST_ACCESS 0x00000001 +/* Fine grained coherency between all devices with access */ +#define KFD_IOCTL_SVM_FLAG_COHERENT 0x00000002 +/* Use any GPU in same hive as preferred device */ +#define KFD_IOCTL_SVM_FLAG_HIVE_LOCAL 0x00000004 +/* GPUs only read, allows replication */ +#define KFD_IOCTL_SVM_FLAG_GPU_RO 0x00000008 +/* Allow execution on GPU */ +#define KFD_IOCTL_SVM_FLAG_GPU_EXEC 0x00000010 + +/** + * kfd_ioctl_svm_op - SVM ioctl operations + * + * @KFD_IOCTL_SVM_OP_SET_ATTR: Modify one or more attributes + * @KFD_IOCTL_SVM_OP_GET_ATTR: Query one or more attributes + */ +enum kfd_ioctl_svm_op { + KFD_IOCTL_SVM_OP_SET_ATTR, + KFD_IOCTL_SVM_OP_GET_ATTR +}; + +/** + * kfd_ioctl_svm_attr_type - SVM attribute types + * + * @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: gpuid of the preferred location, 0 for + * system memory + * @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: gpuid of the prefetch location, 0 for + * system memory. Setting this triggers an + * immediate prefetch (migration). + * @KFD_IOCTL_SVM_ATTR_ACCESS: + * @KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: + * @KFD_IOCTL_SVM_ATTR_NO_ACCESS: specify memory access for the gpuid given + * by the attribute value + * @KFD_IOCTL_SVM_ATTR_SET_FLAGS: bitmask of flags to set (see + * KFD_IOCTL_SVM_FLAG_...) 
+ * @KFD_IOCTL_SVM_ATTR_CLR_FLAGS: bitmask of flags to clear + * @KFD_IOCTL_SVM_ATTR_GRANULARITY: migration granularity + * (log2 num pages) + */ +enum kfd_ioctl_svm_attr_type { + KFD_IOCTL_SVM_ATTR_PREFERRED_LOC, + KFD_IOCTL_SVM_ATTR_PREFETCH_LOC, + KFD_IOCTL_SVM_ATTR_ACCESS, + KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE, + KFD_IOCTL_SVM_ATTR_NO_ACCESS, + KFD_IOCTL_SVM_ATTR_SET_FLAGS, + KFD_IOCTL_SVM_ATTR_CLR_FLAGS, + KFD_IOCTL_SVM_ATTR_GRANULARITY +}; + +/** kfd_ioctl_svm_location - Enum for preferred and prefetch locations + * + * GPU IDs are used to specify GPUs as preferred and prefetch locations. + * Below definitions are used for system memory or for leaving the preferred + * location unspecified. + */ +enum kfd_ioctl_svm_location { + KFD_IOCTL_SVM_LOCATION_SYSMEM = 0, + KFD_IOCTL_SVM_LOCATION_UNDEFINED = 0xffffffff +}; + +/** + * kfd_ioctl_svm_attribute - Attributes as pairs of type and value + * + * The meaning of the @value depends on the attribute type. + * + * @type: attribute type (see enum @kfd_ioctl_svm_attr_type) + * @value: attribute value + */ +struct kfd_ioctl_svm_attribute { + __u32 type; + __u32 value; +}; + +/** + * kfd_ioctl_svm_args - Arguments for SVM ioctl + * + * @op specifies the operation to perform (see enum + * @kfd_ioctl_svm_op). @start_addr and @size are common for all + * operations. + * + * A variable number of attributes can be given in @attrs. + * @nattr specifies the number of attributes. New attributes can be + * added in the future without breaking the ABI. If unknown attributes + * are given, the function returns -EINVAL. + * + * @KFD_IOCTL_SVM_OP_SET_ATTR sets attributes for a virtual address + * range. It may overlap existing virtual address ranges. If it does, + * the existing ranges will be split such that the attribute changes + * only apply to the specified address range. 
+ * + * @KFD_IOCTL_SVM_OP_GET_ATTR returns the intersection of attributes + * over all memory in the given range and returns the result as the + * attribute value. If different pages have different preferred or + * prefetch locations, 0xffffffff will be returned for + * @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC or + * @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC respectively. For + * @KFD_IOCTL_SVM_ATTR_SET_FLAGS, flags of all pages will be + * aggregated by bitwise AND. The minimum migration granularity + * throughout the range will be returned for + * @KFD_IOCTL_SVM_ATTR_GRANULARITY. + * + * Querying of accessibility attributes works by initializing the + * attribute type to @KFD_IOCTL_SVM_ATTR_ACCESS and the value to the + * GPUID being queried. Multiple attributes can be given to allow + * querying multiple GPUIDs. The ioctl function overwrites the + * attribute type to indicate the access for the specified GPU. + * + * @KFD_IOCTL_SVM_ATTR_CLR_FLAGS is invalid for + * @KFD_IOCTL_SVM_OP_GET_ATTR. + */ +struct kfd_ioctl_svm_args { + __u64 start_addr; + __u64 size; + __u32 op; + __u32 nattr; + /* Variable length array of attributes */ + struct kfd_ioctl_svm_attribute attrs[0]; +}; + +/** + * kfd_ioctl_set_xnack_mode_args - Arguments for set_xnack_mode + * + * @xnack_enabled: [in/out] Whether to enable XNACK mode for this process + * + * @xnack_enabled indicates whether recoverable page faults should be + * enabled for the current process. 0 means disabled, positive means + * enabled, negative means leave unchanged. If enabled, virtual address + * translations on GFXv9 and later AMD GPUs can return XNACK and retry + * the access until a valid PTE is available. This is used to implement + * device page faults. + * + * On output, @xnack_enabled returns the (new) current mode (0 or + * positive). Therefore, a negative input value can be used to query + * the current mode without changing it. 
+ * + * The XNACK mode fundamentally changes the way SVM managed memory works + * in the driver, with subtle effects on application performance and + * functionality. + * + * Enabling XNACK mode requires shader programs to be compiled + * differently. Furthermore, not all GPUs support changing the mode + * per-process. Therefore changing the mode is only allowed while no + * user mode queues exist in the process. This ensures that no shader + * code is running that may be compiled for the wrong mode. And GPUs + * that cannot change to the requested mode will be disabled by + * failing subsequent requests to create user mode queues. + * + * This ioctl returns the status of the requested xnack mode. + * + * GFXv8 or older GPUs do not support 48 bit virtual addresses or SVM. + * + * Return: 0 on success, -errno on failure + */ +struct kfd_ioctl_set_xnack_mode_args { + __s32 xnack_enabled; +}; + #define AMDKFD_IOCTL_BASE 'K' #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) @@ -818,8 +988,13 @@ struct kfd_ioctl_cross_memory_copy_args { #define AMDKFD_IOC_ALLOC_QUEUE_GWS \ AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) +#define AMDKFD_IOC_SVM AMDKFD_IOWR(0x20, struct kfd_ioctl_svm_args) + +#define AMDKFD_IOC_SET_XNACK_MODE \ + AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x1F +#define AMDKFD_COMMAND_END 0x22 /* non-upstream ioctls */ #define AMDKFD_IOC_IPC_IMPORT_HANDLE \ diff --git a/src/debug.c b/src/debug.c index caddc79a8e..5ed6631932 100644 --- a/src/debug.c +++ b/src/debug.c @@ -731,3 +731,24 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtClearAddressWatch( NULL); return result; } + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSendHostTrap( + HSAuint32 NodeId, //IN + HSAuint32 Pid //IN + ) +{ + int result; + + result = debug_trap(NodeId, + KFD_IOC_DBG_TRAP_SEND_HOST_TRAP, + 0, + 0, + 0, + Pid, + 0, + NULL); + + return result; +} diff --git 
a/src/events.c b/src/events.c index 23fb710523..d4c751c0cc 100644 --- a/src/events.c +++ b/src/events.c @@ -76,7 +76,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, if (is_dgpu && !events_page) { events_page = allocate_exec_aligned_memory_gpu( - KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, true, false); + KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, true, false, true); if (!events_page) { pthread_mutex_unlock(&hsakmt_mutex); return HSAKMT_STATUS_ERROR; diff --git a/src/fmm.c b/src/fmm.c index 172c1dc46b..c9a356d0fd 100644 --- a/src/fmm.c +++ b/src/fmm.c @@ -1186,7 +1186,8 @@ static uint32_t fmm_translate_hsa_to_ioc_flags(HsaMemFlags flags) uint32_t ioc_flags = 0; if (flags.ui32.AQLQueueMemory) - ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM; + ioc_flags |= (KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM | + KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED); if (!flags.ui32.ReadOnly) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE; /* TODO: Since, ROCr interfaces doesn't allow caller to set page @@ -1337,6 +1338,9 @@ void *fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInB if (!flags.ui32.CoarseGrain || svm.disable_cache) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; + if (flags.ui32.Uncached || svm.disable_cache) + ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; + mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset, ioc_flags, &vm_obj); @@ -1548,6 +1552,10 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, void *address, if (!flags.ui32.CoarseGrain || svm.disable_cache) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; + + if (flags.ui32.Uncached || svm.disable_cache) + ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; + ioc_flags |= fmm_translate_hsa_to_ioc_flags(flags); if (flags.ui32.AQLQueueMemory) diff --git a/src/libhsakmt.h b/src/libhsakmt.h index 34e30ec3ec..9c5b91f7b6 100644 --- a/src/libhsakmt.h +++ b/src/libhsakmt.h @@ -26,6 +26,7 @@ #ifndef LIBHSAKMT_H_INCLUDED #define LIBHSAKMT_H_INCLUDED +#include 
"linux/kfd_ioctl.h" #include "hsakmt.h" #include "pci_ids.h" #include @@ -132,6 +133,7 @@ enum asic_family_type { CHIP_NAVY_FLOUNDER, /* 19 */ CHIP_DIMGREY_CAVEFISH, /* 20 */ CHIP_VANGOGH, /* 21 */ + CHIP_ALDEBARAN, /* 22 */ CHIP_LAST }; @@ -170,7 +172,7 @@ HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags); void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId, bool NonPaged, - bool DeviceLocal); + bool DeviceLocal, bool Uncached); void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align); HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes); void destroy_process_doorbells(void); diff --git a/src/libhsakmt.ver b/src/libhsakmt.ver index 97e2cf6beb..66ca6ea5b3 100644 --- a/src/libhsakmt.ver +++ b/src/libhsakmt.ver @@ -61,6 +61,7 @@ hsaKmtEnableDebugTrap; hsaKmtEnableDebugTrapWithPollFd; hsaKmtDisableDebugTrap; hsaKmtQueryDebugEvent; +hsaKmtSendHostTrap; hsaKmtGetQueueSnapshot; hsaKmtSetWaveLaunchTrapOverride; hsaKmtSetWaveLaunchMode; @@ -74,6 +75,10 @@ hsaKmtClearAddressWatch; hsaKmtSPMAcquire; hsaKmtSPMRelease; hsaKmtSPMSetDestBuffer; +hsaKmtSVMSetAttr; +hsaKmtSVMGetAttr; +hsaKmtSetXNACKMode; +hsaKmtGetXNACKMode; local: *; }; diff --git a/src/pmc_table.c b/src/pmc_table.c index 6e76842cda..502804c6cd 100644 --- a/src/pmc_table.c +++ b/src/pmc_table.c @@ -2128,6 +2128,7 @@ HSAKMT_STATUS get_block_properties(uint32_t node_id, case CHIP_RAVEN: case CHIP_RENOIR: case CHIP_ARCTURUS: + case CHIP_ALDEBARAN: *block = vega_blocks[block_id]; break; case CHIP_NAVI10: diff --git a/src/queues.c b/src/queues.c index eb8e061673..1e4188382a 100644 --- a/src/queues.c +++ b/src/queues.c @@ -42,7 +42,8 @@ #define DOORBELL_SIZE_GFX9 8 #define DOORBELLS_PAGE_SIZE(ds) (1024 * (ds)) -#define VGPR_SIZE_PER_CU(asic_family) (asic_family == CHIP_ARCTURUS ? 0x80000 : 0x40000) +#define VGPR_SIZE_PER_CU(asic_family) ((asic_family == CHIP_ARCTURUS || \ + asic_family == CHIP_ALDEBARAN) ? 
0x80000 : 0x40000) #define SGPR_SIZE_PER_CU 0x4000 #define LDS_SIZE_PER_CU 0x10000 #define HWREG_SIZE_PER_CU 0x1000 @@ -147,6 +148,12 @@ const struct device_info arcturus_device_info = { .doorbell_size = DOORBELL_SIZE_GFX9, }; +const struct device_info aldebaran_device_info = { + .asic_family = CHIP_ALDEBARAN, + .eop_buffer_size = 4096, + .doorbell_size = DOORBELL_SIZE_GFX9, +}; + const struct device_info navi10_device_info = { .asic_family = CHIP_NAVI10, .eop_buffer_size = 4096, @@ -205,6 +212,7 @@ static const struct device_info *dev_lookup_table[] = { [CHIP_RAVEN] = &raven_device_info, [CHIP_RENOIR] = &renoir_device_info, [CHIP_ARCTURUS] = &arcturus_device_info, + [CHIP_ALDEBARAN] = &aldebaran_device_info, [CHIP_NAVI10] = &navi10_device_info, [CHIP_NAVI12] = &navi12_device_info, [CHIP_NAVI14] = &navi14_device_info, @@ -469,7 +477,8 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q) void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId, bool nonPaged, - bool DeviceLocal) + bool DeviceLocal, + bool Uncached) { void *mem; HSAuint64 gpu_va; @@ -483,6 +492,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, flags.ui32.NonPaged = nonPaged; flags.ui32.PageSize = HSA_PAGE_SIZE_4KB; flags.ui32.CoarseGrain = DeviceLocal; + flags.ui32.Uncached = Uncached; /* Get the closest cpu_id to GPU NodeId for system memory allocation * nonPaged=1 system memory allocation uses GTT path @@ -532,11 +542,13 @@ void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align) static void *allocate_exec_aligned_memory(uint32_t size, bool use_ats, uint32_t NodeId, - bool DeviceLocal) + bool DeviceLocal, + bool Uncached) { if (!use_ats) return allocate_exec_aligned_memory_gpu(size, PAGE_SIZE, NodeId, - DeviceLocal, DeviceLocal); + DeviceLocal, DeviceLocal, + Uncached); return allocate_exec_aligned_memory_cpu(size); } @@ -578,7 +590,7 @@ static int handle_concrete_asic(struct queue *q, q->eop_buffer = 
allocate_exec_aligned_memory(q->dev_info->eop_buffer_size, q->use_ats, - NodeId, true); + NodeId, true, /* Unused for VRAM */false); if (!q->eop_buffer) return HSAKMT_STATUS_NO_MEMORY; @@ -596,7 +608,7 @@ static int handle_concrete_asic(struct queue *q, q->ctx_save_restore = allocate_exec_aligned_memory(q->ctx_save_restore_size, q->use_ats, - NodeId, false); + NodeId, false, false); if (!q->ctx_save_restore) return HSAKMT_STATUS_NO_MEMORY; @@ -653,7 +665,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId, struct queue *q = allocate_exec_aligned_memory(sizeof(*q), use_ats, - NodeId, false); + NodeId, false, true); if (!q) return HSAKMT_STATUS_NO_MEMORY; diff --git a/src/svm.c b/src/svm.c new file mode 100644 index 0000000000..478217259e --- /dev/null +++ b/src/svm.c @@ -0,0 +1,224 @@ +/* + * Copyright © 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#include "libhsakmt.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Helper functions for calling KFD SVM ioctl */ + +HSAKMT_STATUS HSAKMTAPI +hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) +{ + struct kfd_ioctl_svm_args *args; + HSAuint64 s_attr; + HSAKMT_STATUS r; + HSAuint32 i; + + CHECK_KFD_OPEN(); + + pr_debug("%s: address 0x%p size 0x%lx\n", __func__, start_addr, size); + + if (!start_addr || !size) + return HSAKMT_STATUS_INVALID_PARAMETER; + if ((uint64_t)start_addr & (PAGE_SIZE - 1)) + return HSAKMT_STATUS_INVALID_PARAMETER; + if (size & (PAGE_SIZE - 1)) + return HSAKMT_STATUS_INVALID_PARAMETER; + + s_attr = sizeof(*attrs) * nattr; + args = alloca(sizeof(*args) + s_attr); + + args->start_addr = (uint64_t)start_addr; + args->size = size; + args->op = KFD_IOCTL_SVM_OP_SET_ATTR; + args->nattr = nattr; + memcpy(args->attrs, attrs, s_attr); + + for (i = 0; i < nattr; i++) { + if (attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFERRED_LOC && + attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFETCH_LOC && + attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS && + attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE && + attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS) + continue; + + if (attrs[i].type == KFD_IOCTL_SVM_ATTR_PREFERRED_LOC && + attrs[i].value == INVALID_NODEID) { + args->attrs[i].value = KFD_IOCTL_SVM_LOCATION_UNDEFINED; + continue; + } + + r = validate_nodeid(attrs[i].value, &args->attrs[i].value); + if (r != HSAKMT_STATUS_SUCCESS) { + pr_debug("invalid node ID: %d\n", attrs[i].value); + return r; + } else if (!args->attrs[i].value && + (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS || + attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE || + attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS)) { + pr_debug("CPU node invalid for access attribute\n"); + return HSAKMT_STATUS_INVALID_NODE_UNIT; + } + } + + /* Driver does one copy_from_user, with extra attrs size */ + r = 
kmtIoctl(kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args); + if (r) { + pr_debug("op set range attrs failed %s\n", strerror(errno)); + return HSAKMT_STATUS_ERROR; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) +{ + struct kfd_ioctl_svm_args *args; + HSAuint64 s_attr; + HSAKMT_STATUS r; + HSAuint32 i; + + CHECK_KFD_OPEN(); + + pr_debug("%s: address 0x%p size 0x%lx\n", __func__, start_addr, size); + + if (!start_addr || !size) + return HSAKMT_STATUS_INVALID_PARAMETER; + if ((uint64_t)start_addr & (PAGE_SIZE - 1)) + return HSAKMT_STATUS_INVALID_PARAMETER; + if (size & (PAGE_SIZE - 1)) + return HSAKMT_STATUS_INVALID_PARAMETER; + + s_attr = sizeof(*attrs) * nattr; + args = alloca(sizeof(*args) + s_attr); + + args->start_addr = (uint64_t)start_addr; + args->size = size; + args->op = KFD_IOCTL_SVM_OP_GET_ATTR; + args->nattr = nattr; + memcpy(args->attrs, attrs, s_attr); + + for (i = 0; i < nattr; i++) { + if (attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS && + attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE && + attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS) + continue; + + r = validate_nodeid(attrs[i].value, &args->attrs[i].value); + if (r != HSAKMT_STATUS_SUCCESS) { + pr_debug("invalid node ID: %d\n", attrs[i].value); + return r; + } else if (!args->attrs[i].value) { + pr_debug("CPU node invalid for access attribute\n"); + return HSAKMT_STATUS_INVALID_NODE_UNIT; + } + } + + /* Driver does one copy_from_user, with extra attrs size */ + r = kmtIoctl(kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args); + if (r) { + pr_debug("op get range attrs failed %s\n", strerror(errno)); + return HSAKMT_STATUS_ERROR; + } + + memcpy(attrs, args->attrs, s_attr); + + for (i = 0; i < nattr; i++) { + if (attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFERRED_LOC && + attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFETCH_LOC && + attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS 
&& + attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE && + attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS) + continue; + + switch (attrs[i].value) { + case KFD_IOCTL_SVM_LOCATION_SYSMEM: + attrs[i].value = 0; + break; + case KFD_IOCTL_SVM_LOCATION_UNDEFINED: + attrs[i].value = INVALID_NODEID; + break; + default: + r = gpuid_to_nodeid(attrs[i].value, &attrs[i].value); + if (r != HSAKMT_STATUS_SUCCESS) { + pr_debug("invalid GPU ID: %d\n", + attrs[i].value); + return r; + } + } + } + + return HSAKMT_STATUS_SUCCESS; +} + +static HSAKMT_STATUS +hsaKmtSetGetXNACKMode(HSAint32 * enable) +{ + struct kfd_ioctl_set_xnack_mode_args args; + + CHECK_KFD_OPEN(); + + args.xnack_enabled = *enable; + + if (kmtIoctl(kfd_fd, AMDKFD_IOC_SET_XNACK_MODE, &args)) { + if (errno == EPERM) { + pr_debug("set mode not supported %s\n", + strerror(errno)); + return HSAKMT_STATUS_NOT_SUPPORTED; + } else if (errno == EBUSY) { + pr_debug("kmtIoctl queues not empty %s\n", + strerror(errno)); + } + return HSAKMT_STATUS_ERROR; + } + + *enable = args.xnack_enabled; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtSetXNACKMode(HSAint32 enable) +{ + return hsaKmtSetGetXNACKMode(&enable); +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetXNACKMode(HSAint32 * enable) +{ + *enable = -1; + return hsaKmtSetGetXNACKMode(enable); +} diff --git a/src/topology.c b/src/topology.c index d5c054d24a..8a8cc96272 100644 --- a/src/topology.c +++ b/src/topology.c @@ -224,6 +224,10 @@ static const struct hsa_gfxip_table gfxip_lookup_table[] = { { 0x738C, 9, 0, 8, "Arcturus", CHIP_ARCTURUS }, { 0x738E, 9, 0, 8, "Arcturus", CHIP_ARCTURUS }, { 0x7390, 9, 0, 8, "Arcturus", CHIP_ARCTURUS }, + /* Aldebaran */ + { 0x7408, 9, 0, 10, "Aldebaran", CHIP_ALDEBARAN }, + { 0x740C, 9, 0, 10, "Aldebaran", CHIP_ALDEBARAN }, + { 0x740F, 9, 0, 10, "Aldebaran", CHIP_ALDEBARAN }, /* Navi10 */ { 0x7310, 10, 1, 0, "Navi10", CHIP_NAVI10 }, { 0x7312, 10, 1, 0, "Navi10", CHIP_NAVI10 }, diff --git a/tests/kfdtest/CMakeLists.txt 
b/tests/kfdtest/CMakeLists.txt index 17d4b9bf4a..a8c9f30299 100644 --- a/tests/kfdtest/CMakeLists.txt +++ b/tests/kfdtest/CMakeLists.txt @@ -38,6 +38,16 @@ set ( CPACK_PACKAGE_VERSION_MINOR "0" ) set ( CPACK_PACKAGE_VERSION_PATCH "0" ) set ( CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" ) +## Define default variable and variables for the optional build target hsakmt-dev +set ( SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Location of hsakmt source code." ) +set ( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE STRING "Default installation directory." ) +set ( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" CACHE STRING "Default packaging prefix." ) +set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Default packaging generators." ) + +# Debian package specific variables +set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" ) + + #set ( CMAKE_VERBOSE_MAKEFILE on ) find_package(PkgConfig) @@ -57,29 +67,19 @@ else() include_directories(${DRM_AMDGPU_INCLUDE_DIRS}) endif() -## Define default variable and variables for the optional build target hsakmt-dev -set ( SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Location of hsakmt source code." ) -set ( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE STRING "Default installation directory." ) -set ( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" CACHE STRING "Default packaging prefix." ) -set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Default packaging generators." 
) - -# Debian package specific variables -set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" ) - if( DEFINED ENV{LIBHSAKMT_PATH} ) set ( LIBHSAKMT_PATH $ENV{LIBHSAKMT_PATH} ) message ( "LIBHSAKMT_PATH environment variable is set" ) else() if ( ${ROCM_INSTALL_PATH} ) - set ( ENV{PKG_CONFIG_PATH} ${ROCM_INSTALL_PATH} ) - pkg_check_modules(HSAKMT libhsakmt) + set ( ENV{PKG_CONFIG_PATH} ${ROCM_INSTALL_PATH}/share/pkgconfig ) else() - set ( ENV{PKG_CONFIG_PATH} /opt/rocm/libhsakmt/ ) - pkg_check_modules(HSAKMT libhsakmt) + set ( ENV{PKG_CONFIG_PATH} /opt/rocm/share/pkgconfig ) endif() + + pkg_check_modules(HSAKMT libhsakmt) + if( NOT HSAKMT_FOUND ) - set ( ENV{PKG_CONFIG_PATH} /opt/rocm/libhsakmt/ ) - pkg_check_modules(HSAKMT libhsakmt) set ( LIBHSAKMT_PATH $ENV{OUT_DIR} ) endif() endif() @@ -109,6 +109,7 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp src/GoogleTestExtension.cpp src/IndirectBuffer.cpp src/IsaGenerator.cpp + src/IsaGenerator_Aldebaran.cpp src/IsaGenerator_Gfx10.cpp src/IsaGenerator_Gfx72.cpp src/IsaGenerator_Gfx8.cpp diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude index f526986c87..f2b605cf84 100644 --- a/tests/kfdtest/scripts/kfdtest.exclude +++ b/tests/kfdtest/scripts/kfdtest.exclude @@ -20,6 +20,7 @@ FILTER[core_sws]=\ "KFDQMTest.AllSdmaQueues:"\ "KFDQMTest.AllXgmiSdmaQueues:"\ "KFDQMTest.AllQueues:"\ +"KFDLocalMemoryTest.AccessLocalMem:"\ "KFDEventTest.SignalEvent" # HWS mode @@ -199,6 +200,13 @@ FILTER[arcturus]=\ "KFDQMTest.BasicCuMaskingEven:"\ "KFDEvictTest.BurstyTest" +FILTER[aldebaran]=\ +"$BLACKLIST_ALL_ASICS:"\ +"KFDExceptionTest.FaultStorm:"\ +"KFDEvictTest.BurstyTest:"\ +"KFDMemoryTest.PtraceAccess:"\ +"KFDMemoryTest.DeviceHdpFlush" + FILTER[navi10]=\ "$BLACKLIST_ALL_ASICS:"\ "KFDMemoryTest.MMBench" diff --git a/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt b/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt index db59e3716e..ce8a3cb33f 100644 --- 
a/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt +++ b/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt @@ -39,7 +39,7 @@ set ( SCLIB_SRC ${PROJECT_SOURCE_DIR} ) #endif() include_directories(${SCLIB_SRC}/sp3) -include_directories(${SCLIB_SRC}/sp3/release_headers) +#include_directories(${SCLIB_SRC}/sp3/release_headers) include_directories(${SCLIB_SRC}/sp3/gen) set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-asic.c ) @@ -53,15 +53,17 @@ set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-cipher.c ) set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-vm.c ) aux_source_directory(${SCLIB_SRC}/sp3/gen SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/si SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/ci SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/gfx8 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/gfx81 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/gfx9 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/gfx10 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/release_headers/gfx81 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/release_headers/gfx9 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/release_headers/gfx10 SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/si/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/ci/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx8/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx81/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx9/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx10/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/aldbrn/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx81/arch SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx9/arch SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx10/arch SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/aldbrn/arch SRC_FILES) message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} ) @@ -70,7 +72,7 @@ message( STATUS 
"PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} ) # message(STATUS "${file}") #endforeach() -set ( CMAKE_C_FLAGS "-DSP3_STATIC_LIB -Wno-error -DPUBLIC_RELEASE -DLITTLEENDIAN_CPU -fPIC -DGFX10_BUILD" ) +set ( CMAKE_C_FLAGS "-DSP3_STATIC_LIB -Wno-error -DPUBLIC_RELEASE -DLITTLEENDIAN_CPU -fPIC -DGFX101_BUILD -DALDBRN_BUILD" ) add_library(amdsp3 ${SRC_FILES}) diff --git a/tests/kfdtest/sp3/lib_helper/build_sp3.sh b/tests/kfdtest/sp3/lib_helper/build_sp3.sh index 7cd20ccfb5..f93f145da6 100755 --- a/tests/kfdtest/sp3/lib_helper/build_sp3.sh +++ b/tests/kfdtest/sp3/lib_helper/build_sp3.sh @@ -44,7 +44,7 @@ popd rsync --progress -a build/libamdsp3.a $LIB_OUTPUT # Put the intermediate header files in the current folder for further processing -rsync --progress -a $SP3_PROJECT/sp3/sp3.h . +rsync --progress -a $SP3_PROJECT/sp3/public/lib/sp3.h . # Remove the build folder and CMakeLists.txt put into SP source folder rm -r build diff --git a/tests/kfdtest/sp3/sp3.h b/tests/kfdtest/sp3/sp3.h index d6235be5d8..513167d595 100644 --- a/tests/kfdtest/sp3/sp3.h +++ b/tests/kfdtest/sp3/sp3.h @@ -54,7 +54,9 @@ enum sp3_shtype { SP3_SHTYPE_HS = 4, SP3_SHTYPE_LS = 5, SP3_SHTYPE_CS = 6, +#ifdef NAVI10LITE_BUILD SP3_SHTYPE_ACV = 7, +#endif }; /// Assorted constants used by sp3 API. @@ -107,10 +109,12 @@ struct sp3_shader { uint32_t size; ///< Size of the compiled shader, in 32-bit words. uint32_t nsgprs; ///< Number of scalar GPRs used. uint32_t nvgprs; ///< Number of vector GPRs used. - uint32_t nsvgprs; ///< Number of shared vector GPRs used. + uint32_t nsvgprs; ///< Number of shared vector GPRs used (only available in certain projects). + uint32_t naccvgprs; ///< Number of accumulator vector GPRs used (only available in certain projects). 
uint32_t nsgprs_manual_alloc; uint32_t nvgprs_manual_alloc; uint32_t nsvgprs_manual_alloc; + uint32_t naccvgprs_manual_alloc; uint32_t trap_present; uint32_t user_sgpr_count; uint32_t scratch_en; @@ -209,6 +213,13 @@ SP3_EXPORT struct sp3_context *sp3_new(void); /// /// Currently supported options: /// +/// stdlib (string) -- absolute path to standard library files. May be a colon-separated list +/// of paths that will be used to search for stdlib files. Used by sp3_parse_library(). +/// +/// The following options are deprecated because they take integer arguments; you should use +/// sp3_set_option_int() for these settings going forward. They will continue to be accepted by +/// this API to support legacy users. +/// /// Werror (boolean) -- indicates whether warnings should be treated as errors. /// /// wave_size (integer) -- sets the wave size being used by the draw calls that will be using @@ -220,11 +231,53 @@ SP3_EXPORT struct sp3_context *sp3_new(void); /// /// omit_code_end (boolean) -- omit generation of the S_CODE_END footer. /// +/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders. This is a +/// dangerous option to allow in general so you must explicitly enable this option, otherwise +/// the raw_bits() function will always error out. +/// SP3_EXPORT void sp3_set_option( struct sp3_context *state, const char *option, const char *value); +/// Set option for sp3. +/// +/// @param state sp3 context. +/// @param option Option name. Unknown options will raise an error. +/// @param value Option value. +/// +/// Currently supported options: +/// +/// Werror (boolean) -- indicates whether warnings should be treated as errors. +/// +/// wave_size (integer) -- sets the wave size being used by the draw calls that will be using +/// this shader. Ignored in certain ASICs. You may set this to 32, 64 or the special value 0 +/// to indicate no preference on wave size. 
The shader will be checked to ensure it is +/// compatible with the size specified here. +/// +/// omit_version (boolean) -- omit generation of the S_VERSION opcode. +/// +/// omit_code_end (boolean) -- omit generation of the S_CODE_END footer. +/// +/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders. This is a +/// dangerous option to allow in general so you must explicitly enable this option, otherwise +/// the raw_bits() function will always error out. +/// +/// secure_mode (boolean) -- run in secure mode. Disables macro language features in assembly +/// path including calls to custom functions. Useful if sp3 is used as a backend to a web-based +/// assembly tool. +/// +/// debug_encoding (boolean) -- if true, debug encoding selection logic for assembly. Only +/// supported in 10.4+ backends. +/// +/// no_vs_export_check (boolean) -- if true, disable VS export sanity check. Only supported in +/// 10.4+ backends. +/// +SP3_EXPORT void sp3_set_option_int( + struct sp3_context *state, + const char *option, + int32_t value); + /// Parse a file into a context. /// /// Use sp3_compile to generate binary microcode after the shader is parsed. 
diff --git a/tests/kfdtest/src/BaseQueue.cpp b/tests/kfdtest/src/BaseQueue.cpp index e66d3dd784..dd1620168c 100644 --- a/tests/kfdtest/src/BaseQueue.cpp +++ b/tests/kfdtest/src/BaseQueue.cpp @@ -48,7 +48,8 @@ HSAKMT_STATUS BaseQueue::Create(unsigned int NodeId, unsigned int size, HSAuint6 memset(&m_Resources, 0, sizeof(m_Resources)); - m_QueueBuf = new HsaMemoryBuffer(size, NodeId, true/*zero*/, false/*local*/, true/*exec*/); + m_QueueBuf = new HsaMemoryBuffer(size, NodeId, true/*zero*/, false/*local*/, true/*exec*/, + /*isScratch */ false, /* isReadOnly */false, /* isUncached */true); if (type == HSA_QUEUE_COMPUTE_AQL) { m_Resources.Queue_read_ptr_aql = &pointers[0]; diff --git a/tests/kfdtest/src/Dispatch.cpp b/tests/kfdtest/src/Dispatch.cpp index 6ed67ce39b..3aa3892f38 100644 --- a/tests/kfdtest/src/Dispatch.cpp +++ b/tests/kfdtest/src/Dispatch.cpp @@ -30,6 +30,8 @@ #include "KFDBaseComponentTest.hpp" +#define mmCOMPUTE_PGM_RSRC3 0x2e2d + Dispatch::Dispatch(const HsaMemoryBuffer& isaBuf, const bool eventAutoReset) :m_IsaBuf(isaBuf), m_IndirectBuf(PACKETTYPE_PM4, PAGE_SIZE / sizeof(unsigned int), isaBuf.Node()), m_DimX(1), m_DimY(1), m_DimZ(1), m_pArg1(NULL), m_pArg2(NULL), m_pEop(NULL), m_ScratchEn(false), @@ -218,6 +220,12 @@ void Dispatch::BuildIb() { m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC1, COMPUTE_PGM_RSRC, ARRAY_SIZE(COMPUTE_PGM_RSRC))); + if (m_FamilyId == FAMILY_AL) { + const unsigned int COMPUTE_PGM_RSRC3[] = {9}; + m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC3, COMPUTE_PGM_RSRC3, + ARRAY_SIZE(COMPUTE_PGM_RSRC3))); + } + m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_RESOURCE_LIMITS, COMPUTE_RESOURCE_LIMITS, ARRAY_SIZE(COMPUTE_RESOURCE_LIMITS))); m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_TMPRING_SIZE, COMPUTE_TMPRING_SIZE, diff --git a/tests/kfdtest/src/IndirectBuffer.cpp b/tests/kfdtest/src/IndirectBuffer.cpp index b820230b0d..4e3907cc5b 100644 --- 
a/tests/kfdtest/src/IndirectBuffer.cpp +++ b/tests/kfdtest/src/IndirectBuffer.cpp @@ -30,7 +30,8 @@ IndirectBuffer::IndirectBuffer(PACKETTYPE type, unsigned int sizeInDWords, unsigned int NodeId) :m_NumOfPackets(0), m_MaxSize(sizeInDWords), m_ActualSize(0), m_PacketTypeAllowed(type) { m_IndirectBuf = new HsaMemoryBuffer(sizeInDWords*sizeof(unsigned int), NodeId, true/*zero*/, - false/*local*/, true/*exec*/); + false/*local*/, true/*exec*/, false/*isScratch*/, + false/*isReadOnly*/, true/*isUncached*/); } IndirectBuffer::~IndirectBuffer(void) { diff --git a/tests/kfdtest/src/IsaGenerator.cpp b/tests/kfdtest/src/IsaGenerator.cpp index 9c7376a0a4..3e69b5f9df 100644 --- a/tests/kfdtest/src/IsaGenerator.cpp +++ b/tests/kfdtest/src/IsaGenerator.cpp @@ -30,6 +30,7 @@ #include "IsaGenerator_Gfx8.hpp" #include "IsaGenerator_Gfx9.hpp" #include "IsaGenerator_Gfx10.hpp" +#include "IsaGenerator_Aldebaran.hpp" #include "GoogleTestExtension.hpp" @@ -93,6 +94,8 @@ IsaGenerator* IsaGenerator::Create(unsigned int familyId) { case FAMILY_RV: case FAMILY_AR: return new IsaGenerator_Gfx9; + case FAMILY_AL: + return new IsaGenerator_Aldbrn; case FAMILY_NV: return new IsaGenerator_Gfx10; diff --git a/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp b/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp new file mode 100644 index 0000000000..2c377f9111 --- /dev/null +++ b/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "IsaGenerator_Aldebaran.hpp" + +#include +#include + +const std::string IsaGenerator_Aldbrn::ASIC_NAME = "ALDEBARAN"; + +/* The binaries are generated from the following ISA */ +#if 0 +/* flat_atomic_inc is not supported by some PCIe links, use flat_atomic_add instead */ +shader atomic_add +asic(ALDEBARAN) +type(CS) + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, 1 + flat_atomic_add v3, v[0:1], v2 slc glc scc + s_waitcnt 0 + s_endpgm +end + +shader copy_dword +asic(ALDEBARAN) +type(CS) +/* copy the parameters from scalar registers to vector registers */ + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, s2 + v_mov_b32 v3, s3 +/* copy a dword between the passed addresses */ + flat_load_dword v4, v[0:1] slc glc + s_waitcnt 0 + flat_store_dword v[2:3], v4 slc glc + s_endpgm +end + +shader main +asic(ALDEBARAN) +type(CS) +loop: + s_branch loop + s_endpgm +end + + +#endif + +const uint32_t IsaGenerator_Aldbrn::NOOP_ISA[] = { + 0xbf810000 +}; + +const uint32_t IsaGenerator_Aldbrn::COPY_DWORD_ISA[] = { + 0x7e000200, 0x7e020201, + 0x7e040202, 0x7e060203, + 0xdc530000, 0x047f0000, + 0xbf8c0000, 0xdc730000, + 0x007f0402, 0xbf810000 +}; + +const uint32_t IsaGenerator_Aldbrn::INFINITE_LOOP_ISA[] = { + 0xbf82ffff, 0xbf810000 +}; + +const uint32_t IsaGenerator_Aldbrn::ATOMIC_ADD_ISA[] = { + 0x7e000200, 0x7e020201, + 0x7e040281, 0xdf0b0000, + 0x037f0200, 0xbf8c0000, + 0xbf810000, 0x00000000 +}; + +void IsaGenerator_Aldbrn::GetNoopIsa(HsaMemoryBuffer& rBuf) { + std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As()); +} + +void IsaGenerator_Aldbrn::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { + std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As()); +} + +void IsaGenerator_Aldbrn::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { + std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As()); +} + +void IsaGenerator_Aldbrn::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { + std::copy(ATOMIC_ADD_ISA, 
ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As()); +} + +const std::string& IsaGenerator_Aldbrn::GetAsicName() { + return ASIC_NAME; +} + diff --git a/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp b/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp new file mode 100644 index 0000000000..5571b91c26 --- /dev/null +++ b/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#ifndef _ISAGENERATOR_ALDEBARAN_H_ +#define _ISAGENERATOR_ALDEBARAN_H_ + +#include +#include "IsaGenerator.hpp" + +class IsaGenerator_Aldbrn : public IsaGenerator { + public: + virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); + virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); + virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); + virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); + + protected: + virtual const std::string& GetAsicName(); + + private: + static const std::string ASIC_NAME; + + static const uint32_t NOOP_ISA[]; + static const uint32_t COPY_DWORD_ISA[]; + static const uint32_t INFINITE_LOOP_ISA[]; + static const uint32_t ATOMIC_ADD_ISA[]; +}; + +#endif // _ISAGENERATOR_ALDEBARAN_H_ diff --git a/tests/kfdtest/src/KFDCWSRTest.cpp b/tests/kfdtest/src/KFDCWSRTest.cpp index 8306d05fff..daa92c9823 100644 --- a/tests/kfdtest/src/KFDCWSRTest.cpp +++ b/tests/kfdtest/src/KFDCWSRTest.cpp @@ -198,11 +198,13 @@ TEST_F(KFDCWSRTest, BasicTest) { int i; for (i = 0 ; i < wave_number; ++i) { if (result1[i] != count1) { - LOG() << "Dispatch 1, work item " << i << ' ' << result1[i] << std::endl; + LOG() << "Dispatch 1, work item [" << std::dec << i << "] " + << result1[i] << " != " << count1 << std::endl; break; } if (result2[i] != count2) { - LOG() << "Dispatch 2, work item " << i << ' ' << result2[i] << std::endl; + LOG() << "Dispatch 2, work item [" << std::dec << i << "] " + << result2[i] << " != " << count2 << std::endl; break; } } diff --git a/tests/kfdtest/src/KFDLocalMemoryTest.cpp b/tests/kfdtest/src/KFDLocalMemoryTest.cpp index 4c86594ab9..6af6765ac3 100644 --- a/tests/kfdtest/src/KFDLocalMemoryTest.cpp +++ b/tests/kfdtest/src/KFDLocalMemoryTest.cpp @@ -50,6 +50,32 @@ void KFDLocalMemoryTest::TearDown() { ROUTINE_END } +TEST_F(KFDLocalMemoryTest, AccessLocalMem) { + TEST_START(TESTPROFILE_RUNALL) + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; + + //local memory + 
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false, true); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + + PM4Queue queue; + + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + + queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0, 0)); + + queue.Wait4PacketConsumption(event); + + hsaKmtDestroyEvent(event); + EXPECT_SUCCESS(queue.Destroy()); + + + TEST_END +} + TEST_F(KFDLocalMemoryTest, BasicTest) { TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); TEST_START(TESTPROFILE_RUNALL); diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp index d27d796eb7..7b1e18a5f9 100644 --- a/tests/kfdtest/src/KFDMemoryTest.cpp +++ b/tests/kfdtest/src/KFDMemoryTest.cpp @@ -108,6 +108,29 @@ wave_size(32)\n\ end\n\ "; +const char* aldbrn_ScratchCopyDword = +"\ +shader ScratchCopyDword\n\ +asic(ALDEBARAN)\n\ +type(CS)\n\ +/*copy the parameters from scalar registers to vector registers*/\n\ + v_mov_b32 v0, s0\n\ + v_mov_b32 v1, s1\n\ + v_mov_b32 v2, s2\n\ + v_mov_b32 v3, s3\n\ +/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\ + s_mov_b32 flat_scratch_lo, s4\n\ + s_mov_b32 flat_scratch_hi, s5\n\ +/*copy a dword between the passed addresses*/\n\ + flat_load_dword v4, v[0:1] slc\n\ + s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ + flat_store_dword v[2:3], v4 slc\n\ + \n\ + s_endpgm\n\ + \n\ +end\n\ +"; + /* Continuously poll src buffer and check buffer value @@ -131,6 +154,32 @@ type(CS)\n\ end\n\ "; +/* Similar to gfx9_PollMemory except that the buffer + * polled can be non-coherent memory. SCC system-level + * cache coherence is not supported in scalar (smem) path. 
+ * Use vmem operations with scc + */ +const char* gfx9_PollNCMemory = +"\ +shader ReadMemory\n\ +asic(ALDEBARAN)\n\ +wave_size(32)\n\ +type(CS)\n\ +/* Assume src address in s0, s1 and dst address in s2, s3*/\n\ + v_mov_b32 v6, 0x5678\n\ + v_mov_b32 v0, s0\n\ + v_mov_b32 v1, s1\n\ + LOOP:\n\ + flat_load_dword v4, v[0:1] scc\n\ + v_cmp_eq_u32 vcc, v4, v6\n\ + s_cbranch_vccz LOOP\n\ + v_mov_b32 v0, s2\n\ + v_mov_b32 v1, s3\n\ + flat_store_dword v[0:1], v6 scc\n\ + s_endpgm\n\ + end\n\ +"; + const char* gfx10_PollMemory = "\ shader ReadMemory\n\ @@ -226,6 +275,81 @@ type(CS)\n\ end\n\ "; +/* Continuously poll the flag at src buffer + * After the flag of s[0:1] is 1 filled, + * copy the value from s[0:1]+4 to dst buffer + */ +const char* gfx9_PollAndCopy = +"\ +shader CopyMemory\n\ +wave_size(32)\n\ +type(CS)\n\ +/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\ + s_movk_i32 s18, 0x1\n\ + LOOP:\n\ + s_load_dword s16, s[0:1], 0x0 glc\n\ + s_cmp_eq_i32 s16, s18\n\ + s_cbranch_scc0 LOOP\n\ + s_load_dword s17, s[0:1], 0x4 glc\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + s_store_dword s17, s[2:3], 0x0 glc:1\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + s_endpgm\n\ + end\n\ +"; + +const char* gfx9aldbrn_PollAndCopy = +"\ +shader CopyMemory\n\ +wave_size(32)\n\ +type(CS)\n\ +/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\ + v_mov_b32 v0, s0\n\ + v_mov_b32 v1, s1\n\ + v_mov_b32 v18, 0x1\n\ + LOOP:\n\ + flat_load_dword v16, v[0:1] scc:1\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + v_cmp_eq_i32 vcc, v16, v18\n\ + s_cbranch_vccz LOOP\n\ + buffer_invl2\n\ + s_load_dword s17, s[0:1], 0x4 glc\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + s_store_dword s17, s[2:3], 0x0 glc\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + buffer_wbl2\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + s_endpgm\n\ + end\n\ +"; + +/* Input0: A buffer of at least 2 dwords. + * DW0: used as a signal. Write 0x1 to signal + * DW1: Write the value from 2nd input buffer + * for other device to read. 
+ * Input1: A buffer of at least 2 dwords. + * DW0: used as the value to be written. + */ +const char* gfx9aldbrn_WriteFlagAndValue = +"\ +shader WriteMemory\n\ +wave_size(32)\n\ +type(CS)\n\ +/* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\ + v_mov_b32 v0, s0\n\ + v_mov_b32 v1, s1\n\ + s_load_dword s18, s[2:3], 0x0 glc\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + s_store_dword s18, s[0:1], 0x4 glc\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + buffer_wbl2\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + v_mov_b32 v16, 0x1\n\ + flat_store_dword v[0:1], v16 scc:1\n\ + s_endpgm\n\ + end\n\ +"; + const char* gfx10_WriteAndSignal = "\ shader WriteAndSignal\n\ @@ -389,7 +513,11 @@ TEST_F(KFDMemoryTest, MapUnmapToNodes) { else pReadMemory = gfx10_PollMemory; - m_pIsaGen->CompileShader(pReadMemory, "ReadMemory", isaBuffer); + if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) + /* On A+A system memory is mapped as NC */ + m_pIsaGen->CompileShader(gfx9_PollNCMemory, "ReadMemory", isaBuffer); + else + m_pIsaGen->CompileShader(pReadMemory, "ReadMemory", isaBuffer); PM4Queue pm4Queue; ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); @@ -485,14 +613,18 @@ TEST_F(KFDMemoryTest, AccessPPRMem) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf, 0xABCDEF09, 0x12345678)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf, 0xABCDEF09); WaitOnValue(destBuf + 1, 0x12345678); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); /* This sleep hides the dmesg PPR message storm on Raven, which happens @@ -726,8 +858,10 @@ TEST_F(KFDMemoryTest, FlatScratchAccess) { const char *pScratchCopyDword; if (m_FamilyId < FAMILY_AI) pScratchCopyDword = gfx8_ScratchCopyDword; - else if (m_FamilyId < FAMILY_NV) + else if (m_FamilyId < FAMILY_AL) pScratchCopyDword = gfx9_ScratchCopyDword; + else if (m_FamilyId == 
FAMILY_AL) + pScratchCopyDword = aldbrn_ScratchCopyDword; else pScratchCopyDword = gfx10_ScratchCopyDword; m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer); @@ -1514,6 +1648,7 @@ TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) { mem1 = reinterpret_cast(reinterpret_cast(mem) + VRAM_OFFSET + sizeof(HSAuint64)); PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int *)mem0, data0[0], data0[1])); queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int *)mem1, @@ -1592,8 +1727,10 @@ TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) { const char *pScratchCopyDword; if (m_FamilyId < FAMILY_AI) pScratchCopyDword = gfx8_ScratchCopyDword; - else if (m_FamilyId < FAMILY_NV) + else if (m_FamilyId < FAMILY_AL) pScratchCopyDword = gfx9_ScratchCopyDword; + else if (m_FamilyId == FAMILY_AL) + pScratchCopyDword = aldbrn_ScratchCopyDword; else pScratchCopyDword = gfx10_ScratchCopyDword; @@ -2294,3 +2431,211 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) { TEST_END } + +/* Test is for new cache coherence on Aldebaran. It is to verify + * two GPUs can coherently share a fine grain FB. + */ +TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */); + volatile HSAuint32 *tmp = tmpBuffer.As(); + const int dwSource = 0x40 * sizeof(int); /* At 3rd cache line */ + const int dwLocation = 0x80 * sizeof(int); /* At 5th cache line */ + + if (m_FamilyId != FAMILY_AL) { + LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl; + return; + } + + const std::vector gpuNodes = m_NodeInfo.GetNodesWithGPU(); + if (gpuNodes.size() < 2) { + LOG() << "Skipping test: At least two GPUs are required." 
<< std::endl; + return; + } + + HSAuint32 nondefaultNode; + for (unsigned i = 0; i < gpuNodes.size(); i++) { + if (gpuNodes.at(i) != defaultGPUNode) { + nondefaultNode = gpuNodes.at(i); + break; + } + } + + unsigned int nodes[2] = {defaultGPUNode, nondefaultNode}; + + /* Allocate a local FB */ + HsaMemoryBuffer buffer(PAGE_SIZE, defaultGPUNode, false/*zero*/, true/*local*/, false/*exec*/); + buffer.MapMemToNodes(&nodes[0], 2); + SDMAQueue sdmaQueue; + ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); + buffer.Fill(0, sdmaQueue, 0, PAGE_SIZE); + buffer.Fill(0x5678, sdmaQueue, dwSource, 4); + + /* Read buffer[0] as flag from local shader to fill cache line (64 dws) + * which should have 0 at buffer[1] + */ + PM4Queue queue; + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); + m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); + Dispatch dispatch(isaBuffer); + dispatch.SetArgs(buffer.As(), buffer.As()+dwLocation); + dispatch.Submit(queue); + + /* Delay 100ms to make sure shader executed*/ + Delay(100); + + /* Using remote shader to write the flag and copy value from dwSource + * to dwLocation in buffer. 
+ * Local shader should get the flag and execute CopyMemory + */ + PM4Queue queue1; + ASSERT_SUCCESS(queue1.Create(nondefaultNode)); + HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/); + m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1); + Dispatch dispatch1(isaBuffer1); + dispatch1.SetArgs(buffer.As(), buffer.As()+dwSource); + dispatch1.Submit(queue1); + dispatch1.Sync(g_TestTimeOut); + + /* Check test result*/ + dispatch.Sync(g_TestTimeOut); + EXPECT_EQ(buffer.IsPattern(dwLocation, 0x5678, sdmaQueue, tmp), true); + + // Clean up + EXPECT_SUCCESS(queue.Destroy()); + EXPECT_SUCCESS(queue1.Destroy()); + EXPECT_SUCCESS(sdmaQueue.Destroy()); + + TEST_END +} + +/* Test is for new cache coherence on A+A(Aldebaran). It is to verify + * new XGMI coherence HW link in caches between CPU and GPUs + * in local FB with fine grain mode. + */ +TEST_F(KFDMemoryTest, VramCacheCoherenceWithCPU) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + if (m_FamilyId != FAMILY_AL) { + LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl; + return; + } + + HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + const int dwLocation = 0x80; + + if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) { + LOG() << "Skipping test: XGMI link to CPU is required." 
<< std::endl; + return; + } + + unsigned int *buffer; + HsaMemFlags memFlags = {0}; + /* Allocate a fine grain local FB accessed by CPU */ + memFlags.ui32.HostAccess = 1; + memFlags.ui32.NonPaged = 1; + ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, memFlags, + reinterpret_cast(&buffer))); + ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(buffer, PAGE_SIZE, NULL)); + buffer[0] = 0; + buffer[dwLocation] = 0; + + /* Read buffer from shader to fill cache */ + PM4Queue queue; + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); + m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); + Dispatch dispatch(isaBuffer); + dispatch.SetArgs(buffer, buffer+dwLocation); + dispatch.Submit(queue); + + /* Delay 100ms to make sure shader executed*/ + Delay(100); + + /* CPU writes to buffer. Shader should get 0x5678 CPU writes + * after cache invalidating(buffer_invl2) and quits + */ + buffer[1] = 0x5678; + buffer[0] = 1; + + /* Check test result*/ + dispatch.Sync(g_TestTimeOut); + EXPECT_EQ(buffer[dwLocation], 0x5678); + + // Clean up + EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(buffer)); + EXPECT_SUCCESS(hsaKmtFreeMemory(buffer, PAGE_SIZE)); + EXPECT_SUCCESS(queue.Destroy()); + + TEST_END +} + +/* Test is for new cache coherence on Aldebaran. It is to verify + * new XGMI coherence HW link in caches between CPU and GPUs + * in system RAM. + */ +TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + if (m_FamilyId != FAMILY_AL) { + LOG() << "Skipping test: Test requires aldebaran series asics." 
<< std::endl; + return; + } + + unsigned int *fineBuffer = NULL; + unsigned int tmp; + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + const int dwLocation = 0x80; + + ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags, + reinterpret_cast(&fineBuffer))); + ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(fineBuffer, PAGE_SIZE, NULL)); + fineBuffer[0] = 0; + fineBuffer[1] = 0; + /* Read buffer from CPU to fill cache */ + tmp = fineBuffer[dwLocation]; + + /* Read fine grain buffer from shader to fill cache */ + PM4Queue queue; + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); + + if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) + m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); + else + m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer); + + Dispatch dispatch(isaBuffer); + dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation); + dispatch.Submit(queue); + + /* Delay 100ms to make sure shader executed*/ + Delay(100); + + /* CPU writes to buffer. Shader should get what CPU writes and quits*/ + fineBuffer[1] = 0x5678; + fineBuffer[0] = 1; + + /* Check test result, based on KFDEventTest.SignalEvent passed. + * if Sync times out, + * it means coherence issue that GPU doesn't read what CPU wrote. + * if buffer value is not expected, + * it means coherence issue that CPU doesn't read what GPU wrote. 
+ */ + dispatch.Sync(g_TestTimeOut); + EXPECT_EQ(fineBuffer[dwLocation], 0x5678); + + // Clean up + EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(fineBuffer)); + EXPECT_SUCCESS(hsaKmtFreeMemory(fineBuffer, PAGE_SIZE)); + EXPECT_SUCCESS(queue.Destroy()); + + TEST_END +} diff --git a/tests/kfdtest/src/KFDPMTest.cpp b/tests/kfdtest/src/KFDPMTest.cpp index 79b385cf72..98c2348a8c 100644 --- a/tests/kfdtest/src/KFDPMTest.cpp +++ b/tests/kfdtest/src/KFDPMTest.cpp @@ -78,8 +78,11 @@ TEST_F(KFDPMTest, SuspendWithIdleQueueAfterWork) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuffer.As(), 0x1, 0x2)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(&(destBuffer.As()[0]), 0x1); WaitOnValue(&(destBuffer.As()[1]), 0x2); @@ -88,7 +91,7 @@ TEST_F(KFDPMTest, SuspendWithIdleQueueAfterWork) { EXPECT_EQ(true, SuspendAndWakeUp()); queue.PlaceAndSubmitPacket(PM4WriteDataPacket(&(destBuffer.As()[2]), 0x3, 0x4)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); EXPECT_EQ(destBuffer.As()[0], 0); EXPECT_EQ(destBuffer.As()[1], 0); @@ -96,6 +99,7 @@ TEST_F(KFDPMTest, SuspendWithIdleQueueAfterWork) { WaitOnValue(&(destBuffer.As()[2]), 0x3); WaitOnValue(&(destBuffer.As()[3]), 0x4); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); TEST_END diff --git a/tests/kfdtest/src/KFDQMTest.cpp b/tests/kfdtest/src/KFDQMTest.cpp index ae561fccb2..6d4cb7cbef 100644 --- a/tests/kfdtest/src/KFDQMTest.cpp +++ b/tests/kfdtest/src/KFDQMTest.cpp @@ -78,13 +78,16 @@ TEST_F(KFDQMTest, SubmitNopCpQueue) { ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; PM4Queue queue; + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); ASSERT_SUCCESS(queue.Create(defaultGPUNode)); queue.PlaceAndSubmitPacket(PM4NopPacket()); - queue.Wait4PacketConsumption(); 
+ queue.Wait4PacketConsumption(event); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); TEST_END @@ -99,17 +102,19 @@ TEST_F(KFDQMTest, SubmitPacketCpQueue) { HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false); destBuf.Fill(0xFF); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); PM4Queue queue; - ASSERT_SUCCESS(queue.Create(defaultGPUNode)); queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0, 0)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); EXPECT_TRUE(WaitOnValue(destBuf.As(), 0)); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); TEST_END @@ -132,7 +137,7 @@ TEST_F(KFDQMTest, AllCpQueues) { for (unsigned int qidx = 0; qidx < m_numCpQueues; ++qidx) { queues[qidx].PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As()+qidx*2, qidx, qidx)); - + queues[qidx].PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, true, 0, 0)); queues[qidx].Wait4PacketConsumption(); EXPECT_TRUE(WaitOnValue(destBuf.As()+qidx*2, qidx)); @@ -330,6 +335,7 @@ TEST_F(KFDQMTest, AllQueues) { for (i = 0; i < numCpQueues; ++i) { cpQueues[i].PlaceAndSubmitPacket(PM4WriteDataPacket(destBufCp.As()+i*2, i, i)); + cpQueues[i].PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, true, 0, 0)); cpQueues[i].Wait4PacketConsumption(); @@ -460,9 +466,12 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithNullAddress) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0, 0)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 0); @@ -480,10 +489,11 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithNullAddress) { EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); 
WaitOnValue(destBuf.As(), 1); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); TEST_END @@ -544,13 +554,16 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithZeroPercentage) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + PM4WriteDataPacket packet1, packet2; packet1.InitPacket(destBuf.As(), 0, 0); packet2.InitPacket(destBuf.As(), 1, 1); queue.PlaceAndSubmitPacket(packet1); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 0); @@ -568,7 +581,7 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithZeroPercentage) { EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 1); @@ -1228,6 +1241,8 @@ TEST_F(KFDQMTest, CpuWriteCoherence) { HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode); ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); /* The queue might be full and we fail to submit. There is always one word space unused in queue. * So let rptr one step ahead then we continually submit packet. 
@@ -1249,10 +1264,11 @@ TEST_F(KFDQMTest, CpuWriteCoherence) { */ queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0x42, 0x42)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 0x42); + hsaKmtDestroyEvent(event); TEST_END } @@ -1420,18 +1436,22 @@ TEST_F(KFDQMTest, CpQueueWraparound) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + for (unsigned int pktIdx = 0; pktIdx <= PAGE_SIZE/sizeof(PM4WRITE_DATA_CI); ++pktIdx) { queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), pktIdx, pktIdx)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), pktIdx); } for (unsigned int pktIdx = 0; pktIdx <= PAGE_SIZE/sizeof(PM4WRITE_DATA_CI); ++pktIdx) { queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), pktIdx, pktIdx)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), pktIdx); } + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); TEST_END @@ -1669,18 +1689,13 @@ TEST_F(KFDQMTest, P2PTest) { HsaMemFlags memFlags = {0}; HsaMemMapFlags mapFlags = {0}; memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB; - memFlags.ui32.HostAccess = 1; + memFlags.ui32.HostAccess = 0; memFlags.ui32.NonPaged = 1; memFlags.ui32.NoNUMABind = 1; unsigned int end = size / sizeof(HSAuint32) - 1; - if (!m_NodeInfo.IsGPUNodeLargeBar(g_TestDstNodeId) && - m_NodeInfo.AreGPUNodesXGMI(g_TestNodeId, g_TestDstNodeId)) { - memFlags.ui32.HostAccess = 0; - } - /* 1. 
Allocate a system buffer and allow the access to GPUs */ - EXPECT_SUCCESS(hsaKmtAllocMemory(0, size, memFlags, + EXPECT_SUCCESS(hsaKmtAllocMemory(0, size, m_MemoryFlags, reinterpret_cast(&sysBuf))); EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(sysBuf, size, NULL, mapFlags, nodes.size(), (HSAuint32 *)&nodes[0])); diff --git a/tests/kfdtest/src/KFDTestFlags.hpp b/tests/kfdtest/src/KFDTestFlags.hpp index 921b8bc832..9087ba23f8 100644 --- a/tests/kfdtest/src/KFDTestFlags.hpp +++ b/tests/kfdtest/src/KFDTestFlags.hpp @@ -59,6 +59,7 @@ enum KfdFamilyId { FAMILY_AI, // Arctic Islands FAMILY_RV, // Raven FAMILY_AR, // Arcturus + FAMILY_AL, // Aldebaran FAMILY_NV, // Navi10 }; diff --git a/tests/kfdtest/src/KFDTestUtil.cpp b/tests/kfdtest/src/KFDTestUtil.cpp index 1e1b85abda..8651eaa51d 100644 --- a/tests/kfdtest/src/KFDTestUtil.cpp +++ b/tests/kfdtest/src/KFDTestUtil.cpp @@ -149,8 +149,10 @@ unsigned int FamilyIdFromNode(const HsaNodeProperties *props) { familyId = FAMILY_AI; if (props->EngineId.ui32.Stepping == 2) familyId = FAMILY_RV; - if (props->EngineId.ui32.Stepping == 8) + else if (props->EngineId.ui32.Stepping == 8) familyId = FAMILY_AR; + else if (props->EngineId.ui32.Stepping == 10) + familyId = FAMILY_AL; break; case 10: familyId = FAMILY_NV; @@ -201,7 +203,7 @@ HSAuint64 GetSystemTickCountInMicroSec() { const HsaMemoryBuffer HsaMemoryBuffer::Null; HsaMemoryBuffer::HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero, bool isLocal, bool isExec, - bool isScratch, bool isReadOnly) + bool isScratch, bool isReadOnly, bool isUncached) :m_Size(size), m_pUser(NULL), m_pBuf(NULL), @@ -222,11 +224,13 @@ HsaMemoryBuffer::HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero, b m_Flags.ui32.HostAccess = 0; m_Flags.ui32.NonPaged = 1; m_Flags.ui32.CoarseGrain = 1; + EXPECT_EQ(isUncached, 0) << "Uncached flag is relevant only for system or host memory"; } else { m_Flags.ui32.HostAccess = 1; m_Flags.ui32.NonPaged = 0; m_Flags.ui32.CoarseGrain = 0; 
m_Flags.ui32.NoNUMABind = 1; + m_Flags.ui32.Uncached = isUncached; } if (isExec) @@ -667,3 +671,26 @@ int HsaNodeInfo::FindAccessiblePeers(std::vector *peers, } return peers->size(); } + +const bool HsaNodeInfo::IsNodeXGMItoCPU(int node) const { + const HsaNodeProperties *pNodeProperties; + bool ret = false; + + pNodeProperties = GetNodeProperties(node); + if (pNodeProperties && pNodeProperties->NumIOLinks) { + HsaIoLinkProperties *IolinkProperties = new HsaIoLinkProperties[pNodeProperties->NumIOLinks]; + EXPECT_SUCCESS(hsaKmtGetNodeIoLinkProperties(node, pNodeProperties->NumIOLinks, IolinkProperties)); + + for (int linkId = 0; linkId < pNodeProperties->NumIOLinks; linkId++) { + EXPECT_EQ(node, IolinkProperties[linkId].NodeFrom); + const HsaNodeProperties *pNodeProperties0 = + GetNodeProperties(IolinkProperties[linkId].NodeTo); + if (pNodeProperties0->NumFComputeCores == 0 && + IolinkProperties[linkId].IoLinkType == HSA_IOLINK_TYPE_XGMI) + ret = true; + } + delete [] IolinkProperties; + } + + return ret; +} diff --git a/tests/kfdtest/src/KFDTestUtil.hpp b/tests/kfdtest/src/KFDTestUtil.hpp index e55ca95062..2076e27df0 100644 --- a/tests/kfdtest/src/KFDTestUtil.hpp +++ b/tests/kfdtest/src/KFDTestUtil.hpp @@ -66,7 +66,7 @@ class HsaMemoryBuffer { public: HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero = true, bool isLocal = false, - bool isExec = false, bool isScratch = false, bool isReadOnly = false); + bool isExec = false, bool isScratch = false, bool isReadOnly = false, bool isUncached = false); HsaMemoryBuffer(void *addr, HSAuint64 size); template RetType As() { @@ -197,6 +197,11 @@ class HsaNodeInfo { const bool AreGPUNodesXGMI(int node0, int node1) const; int FindAccessiblePeers(std::vector *peers, HSAuint32 node) const; + /* @brief: to determine if the node is XGMI-linked to CPU + * @param: node index of the node we are looking at + * @return: bool true or false + */ + const bool IsNodeXGMItoCPU(int node) const; }; #endif // __KFD__TEST__UTIL__H__ 
diff --git a/tests/kfdtest/src/KFDTopologyTest.cpp b/tests/kfdtest/src/KFDTopologyTest.cpp index 334317e943..c675e4ca88 100644 --- a/tests/kfdtest/src/KFDTopologyTest.cpp +++ b/tests/kfdtest/src/KFDTopologyTest.cpp @@ -58,7 +58,6 @@ TEST_F(KFDTopologyTest , BasicTest) { EXPECT_GT(pNodeProperties->EngineId.ui32.uCode, 0) << "uCode version is 0"; EXPECT_GE(pNodeProperties->EngineId.ui32.Major, 7) << "Major Version is less than 7"; EXPECT_LT(pNodeProperties->EngineId.ui32.Minor, 10) << "Minor Version is greater than 9"; - EXPECT_LT(pNodeProperties->EngineId.ui32.Stepping, 10) << "Stepping is greater than 9"; EXPECT_GT(pNodeProperties->uCodeEngineVersions.uCodeSDMA, 0) << "sDMA firmware version is 0"; } EXPECT_GT(pNodeProperties->NumMemoryBanks, HSAuint32(0)) << "Node index: " << node << "No MemoryBanks.";