diff --git a/CMakeLists.txt b/CMakeLists.txt
index a3a4d999c1..ccf39e8d82 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -129,7 +129,8 @@ set ( HSAKMT_SRC "src/debug.c"
                  "src/topology.c"
                  "src/rbtree.c"
                  "src/spm.c"
-                 "src/version.c")
+                 "src/version.c"
+                 "src/svm.c")
 
 ## Declare the library target name
 add_library ( ${HSAKMT_TARGET} "")
diff --git a/include/hsakmt.h b/include/hsakmt.h
index 83f41cac90..39b1a0c792 100644
--- a/include/hsakmt.h
+++ b/include/hsakmt.h
@@ -844,6 +844,16 @@ hsaKmtGetQueueSnapshot(
     HSAuint32			*QssEntries // IN/OUT
     );
 
+/**
+  Send the host trap
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSendHostTrap(
+    HSAuint32	NodeId, //IN
+    HSAuint32	Pid //IN
+    );
+
 /**
   Set the trap override mask. When debug trap is enabled by
   hsaKmtEnableDebugTrap() each wave launched has its initial
@@ -1244,6 +1254,37 @@ hsaKmtSPMSetDestBuffer(
 	bool        *isSPMDataLoss		//OUT
     );
 
+/* Helper functions for calling KFD SVM ioctl */
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSVMSetAttr(
+    void *start_addr,   // IN: Start of the virtual address range (page-aligned)
+    HSAuint64 size,     // IN: size (page-aligned)
+    unsigned int nattr, // IN: number of attributes
+    HSA_SVM_ATTRIBUTE *attrs  // IN: array of attributes
+);
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSVMGetAttr(
+    void *start_addr,   // IN: Start of the virtual address range (page-aligned)
+    HSAuint64 size,     // IN: size (page-aligned)
+    unsigned int nattr, // IN: number of attributes
+    HSA_SVM_ATTRIBUTE *attrs  // IN/OUT: array of attributes
+);
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSetXNACKMode(
+    HSAint32 enable  // IN: enable/disable XNACK mode.
+);
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetXNACKMode(
+    HSAint32 * enable  // OUT: returns XNACK value.
+);
+
 #ifdef __cplusplus
 }   //extern "C"
 #endif
diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h
index ef97539ea1..afc811d2c9 100644
--- a/include/hsakmttypes.h
+++ b/include/hsakmttypes.h
@@ -541,7 +541,8 @@ typedef struct _HsaMemFlags
 					    // and optimal alignment requirements
             unsigned int FixedAddress : 1; // Allocate memory at specified virtual address. Fail if address is not free.
             unsigned int NoNUMABind:    1; // Don't bind system memory to a specific NUMA node
-            unsigned int Reserved    : 15;
+            unsigned int Uncached:      1; // Caching flag for fine-grained memory on A+A HW platform
+            unsigned int Reserved    : 14;
 
         } ui32;
         HSAuint32 Value;
@@ -1296,6 +1297,35 @@ typedef struct _HsaMemoryRange {
 	HSAuint64          SizeInBytes;      // Size of above memory
 } HsaMemoryRange;
 
+typedef enum _HSA_SVM_FLAGS {
+	HSA_SVM_FLAG_HOST_ACCESS = 0x00000001, // Guarantee host access to memory
+	HSA_SVM_FLAG_COHERENT    = 0x00000002, // Fine grained coherency between all devices with access
+	HSA_SVM_FLAG_HIVE_LOCAL  = 0x00000004, // Use any GPU in same hive as preferred device
+	HSA_SVM_FLAG_GPU_RO      = 0x00000008, // GPUs only read, allows replication
+	HSA_SVM_FLAG_GPU_EXEC    = 0x00000010, // Allow execution on GPU
+} HSA_SVM_FLAGS;
+
+typedef enum _HSA_SVM_ATTR_TYPE {
+	HSA_SVM_ATTR_PREFERRED_LOC,  // gpuid of the preferred location, 0 for
+                                     // system memory, INVALID_NODEID for
+                                     // "don't care"
+	HSA_SVM_ATTR_PREFETCH_LOC,   // gpuid of the prefetch location, 0 for
+                                     // system memory. Setting this triggers an
+                                     // immediate prefetch (migration)
+	HSA_SVM_ATTR_ACCESS,
+	HSA_SVM_ATTR_ACCESS_IN_PLACE,
+	HSA_SVM_ATTR_NO_ACCESS,      // specify memory access for the gpuid given
+                                     // by the attribute value
+	HSA_SVM_ATTR_SET_FLAGS,      // bitmask of flags to set (see HSA_SVM_FLAGS) 
+	HSA_SVM_ATTR_CLR_FLAGS,      // bitmask of flags to clear
+	HSA_SVM_ATTR_GRANULARITY     // migration granularity (log2 num pages)
+} HSA_SVM_ATTR_TYPE;
+
+typedef struct _HSA_SVM_ATTRIBUTE {
+	HSAuint32 type;  // attribute type (see enum HSA_SVM_ATTR_TYPE)
+	HSAuint32 value; // attribute value
+} HSA_SVM_ATTRIBUTE;
+
 #pragma pack(pop, hsakmttypes_h)
 
 
diff --git a/include/linux/kfd_ioctl.h b/include/linux/kfd_ioctl.h
index 62a84e4e7e..2d948a68cd 100644
--- a/include/linux/kfd_ioctl.h
+++ b/include/linux/kfd_ioctl.h
@@ -224,6 +224,7 @@ struct kfd_ioctl_dbg_wave_control_args {
 #define	KFD_DBG_EV_STATUS_VMFAULT	2
 #define	KFD_DBG_EV_STATUS_SUSPENDED	4
 #define KFD_DBG_EV_STATUS_NEW_QUEUE	8
+#define KFD_DBG_EV_STATUS_HOST_TRAP_TIMEDOUT	16
 #define	KFD_DBG_EV_FLAG_CLEAR_STATUS	1
 
 #define KFD_INVALID_QUEUEID	0xffffffff
@@ -309,6 +310,14 @@ struct kfd_ioctl_dbg_wave_control_args {
  */
 #define KFD_IOC_DBG_TRAP_SET_ADDRESS_WATCH 9
 
+/* KFD_IOC_DBG_TRAP_SEND_HOST_TRAP:
+ * ptr:   unused
+ * data1: unused
+ * data2: unused
+ * data3: unused
+ */
+#define KFD_IOC_DBG_TRAP_SEND_HOST_TRAP 10
+
 struct kfd_ioctl_dbg_trap_args {
 	__u64 ptr;     /* to KFD -- used for pointer arguments: queue arrays */
 	__u32 pid;     /* to KFD */
@@ -479,6 +488,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE	(1 << 28)
 #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM	(1 << 27)
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT	(1 << 26)
+#define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED	(1 << 25)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
@@ -721,6 +731,166 @@ struct kfd_ioctl_cross_memory_copy_args {
 	__u64 bytes_copied;
 };
 
+
+/* Guarantee host access to memory */
+#define KFD_IOCTL_SVM_FLAG_HOST_ACCESS 0x00000001
+/* Fine grained coherency between all devices with access */
+#define KFD_IOCTL_SVM_FLAG_COHERENT    0x00000002
+/* Use any GPU in same hive as preferred device */
+#define KFD_IOCTL_SVM_FLAG_HIVE_LOCAL  0x00000004
+/* GPUs only read, allows replication */
+#define KFD_IOCTL_SVM_FLAG_GPU_RO      0x00000008
+/* Allow execution on GPU */
+#define KFD_IOCTL_SVM_FLAG_GPU_EXEC    0x00000010
+
+/**
+ * kfd_ioctl_svm_op - SVM ioctl operations
+ *
+ * @KFD_IOCTL_SVM_OP_SET_ATTR: Modify one or more attributes
+ * @KFD_IOCTL_SVM_OP_GET_ATTR: Query one or more attributes
+ */
+enum kfd_ioctl_svm_op {
+	KFD_IOCTL_SVM_OP_SET_ATTR,
+	KFD_IOCTL_SVM_OP_GET_ATTR
+};
+
+/**
+ * kfd_ioctl_svm_attr_type - SVM attribute types
+ *
+ * @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: gpuid of the preferred location, 0 for
+ *                                    system memory
+ * @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: gpuid of the prefetch location, 0 for
+ *                                   system memory. Setting this triggers an
+ *                                   immediate prefetch (migration).
+ * @KFD_IOCTL_SVM_ATTR_ACCESS:
+ * @KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
+ * @KFD_IOCTL_SVM_ATTR_NO_ACCESS: specify memory access for the gpuid given
+ *                                by the attribute value
+ * @KFD_IOCTL_SVM_ATTR_SET_FLAGS: bitmask of flags to set (see
+ *                                KFD_IOCTL_SVM_FLAG_...)
+ * @KFD_IOCTL_SVM_ATTR_CLR_FLAGS: bitmask of flags to clear
+ * @KFD_IOCTL_SVM_ATTR_GRANULARITY: migration granularity
+ *                                  (log2 num pages)
+ */
+enum kfd_ioctl_svm_attr_type {
+	KFD_IOCTL_SVM_ATTR_PREFERRED_LOC,
+	KFD_IOCTL_SVM_ATTR_PREFETCH_LOC,
+	KFD_IOCTL_SVM_ATTR_ACCESS,
+	KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE,
+	KFD_IOCTL_SVM_ATTR_NO_ACCESS,
+	KFD_IOCTL_SVM_ATTR_SET_FLAGS,
+	KFD_IOCTL_SVM_ATTR_CLR_FLAGS,
+	KFD_IOCTL_SVM_ATTR_GRANULARITY
+};
+
+/** kfd_ioctl_svm_location - Enum for preferred and prefetch locations
+ *
+ * GPU IDs are used to specify GPUs as preferred and prefetch locations.
+ * Below definitions are used for system memory or for leaving the preferred
+ * location unspecified.
+ */
+enum kfd_ioctl_svm_location {
+	KFD_IOCTL_SVM_LOCATION_SYSMEM = 0,
+	KFD_IOCTL_SVM_LOCATION_UNDEFINED = 0xffffffff
+};
+
+/**
+ * kfd_ioctl_svm_attribute - Attributes as pairs of type and value
+ *
+ * The meaning of the @value depends on the attribute type.
+ *
+ * @type: attribute type (see enum @kfd_ioctl_svm_attr_type)
+ * @value: attribute value
+ */
+struct kfd_ioctl_svm_attribute {
+	__u32 type;
+	__u32 value;
+};
+
+/**
+ * kfd_ioctl_svm_args - Arguments for SVM ioctl
+ *
+ * @op specifies the operation to perform (see enum
+ * @kfd_ioctl_svm_op).  @start_addr and @size are common for all
+ * operations.
+ *
+ * A variable number of attributes can be given in @attrs.
+ * @nattr specifies the number of attributes. New attributes can be
+ * added in the future without breaking the ABI. If unknown attributes
+ * are given, the function returns -EINVAL.
+ *
+ * @KFD_IOCTL_SVM_OP_SET_ATTR sets attributes for a virtual address
+ * range. It may overlap existing virtual address ranges. If it does,
+ * the existing ranges will be split such that the attribute changes
+ * only apply to the specified address range.
+ *
+ * @KFD_IOCTL_SVM_OP_GET_ATTR returns the intersection of attributes
+ * over all memory in the given range and returns the result as the
+ * attribute value. If different pages have different preferred or
+ * prefetch locations, 0xffffffff will be returned for
+ * @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC or
+ * @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC respectively. For
+ * @KFD_IOCTL_SVM_ATTR_SET_FLAGS, flags of all pages will be
+ * aggregated by bitwise AND. The minimum migration granularity
+ * throughout the range will be returned for
+ * @KFD_IOCTL_SVM_ATTR_GRANULARITY.
+ *
+ * Querying of accessibility attributes works by initializing the
+ * attribute type to @KFD_IOCTL_SVM_ATTR_ACCESS and the value to the
+ * GPUID being queried. Multiple attributes can be given to allow
+ * querying multiple GPUIDs. The ioctl function overwrites the
+ * attribute type to indicate the access for the specified GPU.
+ *
+ * @KFD_IOCTL_SVM_ATTR_CLR_FLAGS is invalid for
+ * @KFD_IOCTL_SVM_OP_GET_ATTR.
+ */
+struct kfd_ioctl_svm_args {
+	__u64 start_addr;
+	__u64 size;
+	__u32 op;
+	__u32 nattr;
+	/* Variable length array of attributes */
+	struct kfd_ioctl_svm_attribute attrs[0];
+};
+
+/**
+ * kfd_ioctl_set_xnack_mode_args - Arguments for set_xnack_mode
+ *
+ * @xnack_enabled:       [in/out] Whether to enable XNACK mode for this process
+ *
+ * @xnack_enabled indicates whether recoverable page faults should be
+ * enabled for the current process. 0 means disabled, positive means
+ * enabled, negative means leave unchanged. If enabled, virtual address
+ * translations on GFXv9 and later AMD GPUs can return XNACK and retry
+ * the access until a valid PTE is available. This is used to implement
+ * device page faults.
+ *
+ * On output, @xnack_enabled returns the (new) current mode (0 or
+ * positive). Therefore, a negative input value can be used to query
+ * the current mode without changing it.
+ *
+ * The XNACK mode fundamentally changes the way SVM managed memory works
+ * in the driver, with subtle effects on application performance and
+ * functionality.
+ *
+ * Enabling XNACK mode requires shader programs to be compiled
+ * differently.  Furthermore, not all GPUs support changing the mode
+ * per-process. Therefore changing the mode is only allowed while no
+ * user mode queues exist in the process. This ensures that no shader
+ * code is running that may be compiled for the wrong mode. And GPUs
+ * that cannot change to the requested mode will be disabled by
+ * failing subsequent requests to create user mode queues.
+ *
+ * This ioctl returns the status of the requested xnack mode.
+ *
+ * GFXv8 or older GPUs do not support 48 bit virtual addresses or SVM.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+struct kfd_ioctl_set_xnack_mode_args {
+	__s32 xnack_enabled;
+};
+
 #define AMDKFD_IOCTL_BASE 'K'
 #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
 #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
@@ -818,8 +988,13 @@ struct kfd_ioctl_cross_memory_copy_args {
 #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
 		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
 
+#define AMDKFD_IOC_SVM	AMDKFD_IOWR(0x20, struct kfd_ioctl_svm_args)
+
+#define AMDKFD_IOC_SET_XNACK_MODE		\
+		AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x1F
+#define AMDKFD_COMMAND_END		0x22
 
 /* non-upstream ioctls */
 #define AMDKFD_IOC_IPC_IMPORT_HANDLE                                    \
diff --git a/src/debug.c b/src/debug.c
index caddc79a8e..5ed6631932 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -731,3 +731,24 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtClearAddressWatch(
 			NULL);
 	return result;
 }
+
+/*
+ * Issue the KFD_IOC_DBG_TRAP_SEND_HOST_TRAP debug trap operation. Only
+ * NodeId, the operation code and Pid carry information; every other
+ * debug_trap() argument is passed as 0 or NULL.
+ */
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSendHostTrap(
+		HSAuint32	NodeId, //IN
+		HSAuint32	Pid	//IN
+		)
+{
+	return debug_trap(NodeId,
+			  KFD_IOC_DBG_TRAP_SEND_HOST_TRAP,
+			  0,
+			  0,
+			  0,
+			  Pid,
+			  0, NULL);
+}
diff --git a/src/events.c b/src/events.c
index 23fb710523..d4c751c0cc 100644
--- a/src/events.c
+++ b/src/events.c
@@ -76,7 +76,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
 
 	if (is_dgpu && !events_page) {
 		events_page = allocate_exec_aligned_memory_gpu(
-			KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, true, false);
+			KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, true, false, true);
 		if (!events_page) {
 			pthread_mutex_unlock(&hsakmt_mutex);
 			return HSAKMT_STATUS_ERROR;
diff --git a/src/fmm.c b/src/fmm.c
index 172c1dc46b..c9a356d0fd 100644
--- a/src/fmm.c
+++ b/src/fmm.c
@@ -1186,7 +1186,8 @@ static uint32_t fmm_translate_hsa_to_ioc_flags(HsaMemFlags flags)
 	uint32_t ioc_flags = 0;
 
 	if (flags.ui32.AQLQueueMemory)
-		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM;
+		ioc_flags |= (KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM |
+			      KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED);
 	if (!flags.ui32.ReadOnly)
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE;
 	/* TODO: Since, ROCr interfaces doesn't allow caller to set page
@@ -1337,6 +1338,9 @@ void *fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInB
 	if (!flags.ui32.CoarseGrain || svm.disable_cache)
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
 
+	if (flags.ui32.Uncached || svm.disable_cache)
+		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;
+
 	mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset,
 				    ioc_flags, &vm_obj);
 
@@ -1548,6 +1552,10 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, void *address,
 
 	if (!flags.ui32.CoarseGrain || svm.disable_cache)
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
+
+	if (flags.ui32.Uncached || svm.disable_cache)
+		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;
+
 	ioc_flags |= fmm_translate_hsa_to_ioc_flags(flags);
 
 	if (flags.ui32.AQLQueueMemory)
diff --git a/src/libhsakmt.h b/src/libhsakmt.h
index 34e30ec3ec..9c5b91f7b6 100644
--- a/src/libhsakmt.h
+++ b/src/libhsakmt.h
@@ -26,6 +26,7 @@
 #ifndef LIBHSAKMT_H_INCLUDED
 #define LIBHSAKMT_H_INCLUDED
 
+#include "linux/kfd_ioctl.h"
 #include "hsakmt.h"
 #include "pci_ids.h"
 #include <pthread.h>
@@ -132,6 +133,7 @@ enum asic_family_type {
 	CHIP_NAVY_FLOUNDER,	/* 19 */
 	CHIP_DIMGREY_CAVEFISH,	/* 20 */
 	CHIP_VANGOGH,	/* 21 */
+	CHIP_ALDEBARAN, /* 22 */
 	CHIP_LAST
 };
 
@@ -170,7 +172,7 @@ HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
 
 void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
 				       uint32_t NodeId, bool NonPaged,
-				       bool DeviceLocal);
+				       bool DeviceLocal, bool Uncached);
 void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
 HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes);
 void destroy_process_doorbells(void);
diff --git a/src/libhsakmt.ver b/src/libhsakmt.ver
index 97e2cf6beb..66ca6ea5b3 100644
--- a/src/libhsakmt.ver
+++ b/src/libhsakmt.ver
@@ -61,6 +61,7 @@ hsaKmtEnableDebugTrap;
 hsaKmtEnableDebugTrapWithPollFd;
 hsaKmtDisableDebugTrap;
 hsaKmtQueryDebugEvent;
+hsaKmtSendHostTrap;
 hsaKmtGetQueueSnapshot;
 hsaKmtSetWaveLaunchTrapOverride;
 hsaKmtSetWaveLaunchMode;
@@ -74,6 +75,10 @@ hsaKmtClearAddressWatch;
 hsaKmtSPMAcquire;
 hsaKmtSPMRelease;
 hsaKmtSPMSetDestBuffer;
+hsaKmtSVMSetAttr;
+hsaKmtSVMGetAttr;
+hsaKmtSetXNACKMode;
+hsaKmtGetXNACKMode;
 
 local: *;
 };
diff --git a/src/pmc_table.c b/src/pmc_table.c
index 6e76842cda..502804c6cd 100644
--- a/src/pmc_table.c
+++ b/src/pmc_table.c
@@ -2128,6 +2128,7 @@ HSAKMT_STATUS get_block_properties(uint32_t node_id,
 	case CHIP_RAVEN:
 	case CHIP_RENOIR:
 	case CHIP_ARCTURUS:
+	case CHIP_ALDEBARAN:
 		*block = vega_blocks[block_id];
 		break;
 	case CHIP_NAVI10:
diff --git a/src/queues.c b/src/queues.c
index eb8e061673..1e4188382a 100644
--- a/src/queues.c
+++ b/src/queues.c
@@ -42,7 +42,8 @@
 #define DOORBELL_SIZE_GFX9 8
 #define DOORBELLS_PAGE_SIZE(ds) (1024 * (ds))
 
-#define VGPR_SIZE_PER_CU(asic_family)	(asic_family == CHIP_ARCTURUS ? 0x80000 : 0x40000)
+#define VGPR_SIZE_PER_CU(f) /* doubled on Arcturus and Aldebaran */ \
+	(((f) == CHIP_ARCTURUS || (f) == CHIP_ALDEBARAN) ? 0x80000 : 0x40000)
 #define SGPR_SIZE_PER_CU	0x4000
 #define LDS_SIZE_PER_CU		0x10000
 #define HWREG_SIZE_PER_CU	0x1000
@@ -147,6 +148,12 @@ const struct device_info arcturus_device_info = {
 	.doorbell_size = DOORBELL_SIZE_GFX9,
 };
 
+const struct device_info aldebaran_device_info = {
+	.asic_family = CHIP_ALDEBARAN,
+	.eop_buffer_size = 4096,
+	.doorbell_size = DOORBELL_SIZE_GFX9,	/* same as Arcturus */
+};
+
 const struct device_info navi10_device_info = {
 	.asic_family = CHIP_NAVI10,
 	.eop_buffer_size = 4096,
@@ -205,6 +212,7 @@ static const struct device_info *dev_lookup_table[] = {
 	[CHIP_RAVEN] = &raven_device_info,
 	[CHIP_RENOIR] = &renoir_device_info,
 	[CHIP_ARCTURUS] = &arcturus_device_info,
+	[CHIP_ALDEBARAN] = &aldebaran_device_info,
 	[CHIP_NAVI10] = &navi10_device_info,
 	[CHIP_NAVI12] = &navi12_device_info,
 	[CHIP_NAVI14] = &navi14_device_info,
@@ -469,7 +477,8 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q)
 
 void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
 				       uint32_t NodeId, bool nonPaged,
-				       bool DeviceLocal)
+				       bool DeviceLocal,
+				       bool Uncached)
 {
 	void *mem;
 	HSAuint64 gpu_va;
@@ -483,6 +492,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
 	flags.ui32.NonPaged = nonPaged;
 	flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
 	flags.ui32.CoarseGrain = DeviceLocal;
+	flags.ui32.Uncached = Uncached;
 
 	/* Get the closest cpu_id to GPU NodeId for system memory allocation
 	 * nonPaged=1 system memory allocation uses GTT path
@@ -532,11 +542,13 @@ void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align)
 static void *allocate_exec_aligned_memory(uint32_t size,
 					  bool use_ats,
 					  uint32_t NodeId,
-					  bool DeviceLocal)
+					  bool DeviceLocal,
+					  bool Uncached)
 {
 	if (!use_ats)
 		return allocate_exec_aligned_memory_gpu(size, PAGE_SIZE, NodeId,
-							DeviceLocal, DeviceLocal);
+							DeviceLocal, DeviceLocal,
+							Uncached);
 	return allocate_exec_aligned_memory_cpu(size);
 }
 
@@ -578,7 +590,7 @@ static int handle_concrete_asic(struct queue *q,
 		q->eop_buffer =
 				allocate_exec_aligned_memory(q->dev_info->eop_buffer_size,
 				q->use_ats,
-				NodeId, true);
+				NodeId, true, /* Unused for VRAM */false);
 		if (!q->eop_buffer)
 			return HSAKMT_STATUS_NO_MEMORY;
 
@@ -596,7 +608,7 @@ static int handle_concrete_asic(struct queue *q,
 		q->ctx_save_restore =
 			allocate_exec_aligned_memory(q->ctx_save_restore_size,
 							 q->use_ats,
-							 NodeId, false);
+							 NodeId, false, false);
 		if (!q->ctx_save_restore)
 			return HSAKMT_STATUS_NO_MEMORY;
 
@@ -653,7 +665,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId,
 
 	struct queue *q = allocate_exec_aligned_memory(sizeof(*q),
 			use_ats,
-			NodeId, false);
+			NodeId, false, true);
 	if (!q)
 		return HSAKMT_STATUS_NO_MEMORY;
 
diff --git a/src/svm.c b/src/svm.c
new file mode 100644
index 0000000000..478217259e
--- /dev/null
+++ b/src/svm.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright © 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include "libhsakmt.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <errno.h>
+
+/* Helper functions for calling KFD SVM ioctl */
+
+/*
+ * Largest attribute count one SVM ioctl can carry. The attribute array is
+ * appended to struct kfd_ioctl_svm_args and its byte size is added to the
+ * size field of the ioctl command, which is only _IOC_SIZEBITS wide. Going
+ * past this limit would corrupt the other fields of the command word, and
+ * the limit also bounds the alloca() used for the argument buffer.
+ */
+#define SVM_MAX_NATTR \
+	((_IOC_SIZEMASK - sizeof(struct kfd_ioctl_svm_args)) / \
+	 sizeof(struct kfd_ioctl_svm_attribute))
+
+/* True if @type is an access attribute, whose value names a GPU node */
+static bool svm_attr_is_access(HSAuint32 type)
+{
+	return type == KFD_IOCTL_SVM_ATTR_ACCESS ||
+	       type == KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE ||
+	       type == KFD_IOCTL_SVM_ATTR_NO_ACCESS;
+}
+
+/* True if @type is a preferred or prefetch location attribute */
+static bool svm_attr_is_location(HSAuint32 type)
+{
+	return type == KFD_IOCTL_SVM_ATTR_PREFERRED_LOC ||
+	       type == KFD_IOCTL_SVM_ATTR_PREFETCH_LOC;
+}
+
+/*
+ * Parameter checks shared by hsaKmtSVMSetAttr() and hsaKmtSVMGetAttr():
+ * the range must be non-empty and page-aligned, a non-zero @nattr needs an
+ * attribute array, and @nattr must fit in one ioctl (see SVM_MAX_NATTR).
+ */
+static HSAKMT_STATUS svm_check_params(void *start_addr, HSAuint64 size,
+				      unsigned int nattr,
+				      HSA_SVM_ATTRIBUTE *attrs)
+{
+	if (!start_addr || !size)
+		return HSAKMT_STATUS_INVALID_PARAMETER;
+	if ((uint64_t)start_addr & (PAGE_SIZE - 1))
+		return HSAKMT_STATUS_INVALID_PARAMETER;
+	if (size & (PAGE_SIZE - 1))
+		return HSAKMT_STATUS_INVALID_PARAMETER;
+	if (nattr && !attrs)
+		return HSAKMT_STATUS_INVALID_PARAMETER;
+	if (nattr > SVM_MAX_NATTR)
+		return HSAKMT_STATUS_INVALID_PARAMETER;
+
+	return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Set SVM attributes on the page-aligned range [start_addr, start_addr+size).
+ *
+ * Node IDs in location and access attributes are translated to KFD GPU IDs
+ * in a private copy of @attrs; the caller's array is not modified.
+ * INVALID_NODEID as preferred location is passed on as "undefined".
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a bad range or attribute
+ * array, the validate_nodeid() status for an unknown node,
+ * HSAKMT_STATUS_INVALID_NODE_UNIT when an access attribute resolves to GPU
+ * ID 0 (CPU node), and HSAKMT_STATUS_ERROR when the ioctl fails.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr,
+		 HSA_SVM_ATTRIBUTE *attrs)
+{
+	struct kfd_ioctl_svm_args *args;
+	HSAuint64 s_attr;
+	HSAKMT_STATUS r;
+	HSAuint32 i;
+
+	CHECK_KFD_OPEN();
+
+	pr_debug("%s: address %p size 0x%" PRIx64 "\n", __func__, start_addr,
+		 (uint64_t)size);
+
+	r = svm_check_params(start_addr, size, nattr, attrs);
+	if (r != HSAKMT_STATUS_SUCCESS)
+		return r;
+
+	s_attr = sizeof(*attrs) * nattr;
+	args = alloca(sizeof(*args) + s_attr);
+
+	args->start_addr = (uint64_t)start_addr;
+	args->size = size;
+	args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
+	args->nattr = nattr;
+	if (nattr)
+		memcpy(args->attrs, attrs, s_attr);
+
+	for (i = 0; i < nattr; i++) {
+		HSAuint32 type = attrs[i].type;
+
+		if (!svm_attr_is_location(type) && !svm_attr_is_access(type))
+			continue;
+
+		if (type == KFD_IOCTL_SVM_ATTR_PREFERRED_LOC &&
+		    attrs[i].value == INVALID_NODEID) {
+			args->attrs[i].value = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
+			continue;
+		}
+
+		r = validate_nodeid(attrs[i].value, &args->attrs[i].value);
+		if (r != HSAKMT_STATUS_SUCCESS) {
+			pr_debug("invalid node ID: %u\n", attrs[i].value);
+			return r;
+		}
+		if (!args->attrs[i].value && svm_attr_is_access(type)) {
+			pr_debug("CPU node invalid for access attribute\n");
+			return HSAKMT_STATUS_INVALID_NODE_UNIT;
+		}
+	}
+
+	/* Driver does one copy_from_user, with extra attrs size */
+	if (kmtIoctl(kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT),
+		     args)) {
+		pr_debug("op set range attrs failed %s\n", strerror(errno));
+		return HSAKMT_STATUS_ERROR;
+	}
+
+	return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Query SVM attributes of the page-aligned range
+ * [start_addr, start_addr+size).
+ *
+ * On input, access attributes carry the node ID to query; it is translated
+ * to a GPU ID before the ioctl. After a successful ioctl the results are
+ * copied back to @attrs, and GPU IDs in location and access attributes are
+ * translated to node IDs (0 for system memory, INVALID_NODEID for
+ * undefined).
+ *
+ * Returns the same error codes as hsaKmtSVMSetAttr(), plus the
+ * gpuid_to_nodeid() status if a returned GPU ID is unknown.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr,
+		 HSA_SVM_ATTRIBUTE *attrs)
+{
+	struct kfd_ioctl_svm_args *args;
+	HSAuint64 s_attr;
+	HSAKMT_STATUS r;
+	HSAuint32 i;
+
+	CHECK_KFD_OPEN();
+
+	pr_debug("%s: address %p size 0x%" PRIx64 "\n", __func__, start_addr,
+		 (uint64_t)size);
+
+	r = svm_check_params(start_addr, size, nattr, attrs);
+	if (r != HSAKMT_STATUS_SUCCESS)
+		return r;
+
+	s_attr = sizeof(*attrs) * nattr;
+	args = alloca(sizeof(*args) + s_attr);
+
+	args->start_addr = (uint64_t)start_addr;
+	args->size = size;
+	args->op = KFD_IOCTL_SVM_OP_GET_ATTR;
+	args->nattr = nattr;
+	if (nattr)
+		memcpy(args->attrs, attrs, s_attr);
+
+	for (i = 0; i < nattr; i++) {
+		if (!svm_attr_is_access(attrs[i].type))
+			continue;
+
+		r = validate_nodeid(attrs[i].value, &args->attrs[i].value);
+		if (r != HSAKMT_STATUS_SUCCESS) {
+			pr_debug("invalid node ID: %u\n", attrs[i].value);
+			return r;
+		}
+		if (!args->attrs[i].value) {
+			pr_debug("CPU node invalid for access attribute\n");
+			return HSAKMT_STATUS_INVALID_NODE_UNIT;
+		}
+	}
+
+	/* Driver does one copy_from_user, with extra attrs size */
+	if (kmtIoctl(kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT),
+		     args)) {
+		pr_debug("op get range attrs failed %s\n", strerror(errno));
+		return HSAKMT_STATUS_ERROR;
+	}
+
+	if (nattr)
+		memcpy(attrs, args->attrs, s_attr);
+
+	for (i = 0; i < nattr; i++) {
+		HSAuint32 gpu_id = attrs[i].value;
+
+		if (!svm_attr_is_location(attrs[i].type) &&
+		    !svm_attr_is_access(attrs[i].type))
+			continue;
+
+		switch (gpu_id) {
+		case KFD_IOCTL_SVM_LOCATION_SYSMEM:
+			attrs[i].value = 0;
+			break;
+		case KFD_IOCTL_SVM_LOCATION_UNDEFINED:
+			attrs[i].value = INVALID_NODEID;
+			break;
+		default:
+			r = gpuid_to_nodeid(gpu_id, &attrs[i].value);
+			if (r != HSAKMT_STATUS_SUCCESS) {
+				pr_debug("invalid GPU ID: %u\n", gpu_id);
+				return r;
+			}
+		}
+	}
+
+	return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Pass *enable to the SET_XNACK_MODE ioctl and store the mode reported by
+ * the driver back in *enable. Per the ioctl documentation a negative input
+ * leaves the mode unchanged. EPERM maps to HSAKMT_STATUS_NOT_SUPPORTED,
+ * any other ioctl failure to HSAKMT_STATUS_ERROR.
+ */
+static HSAKMT_STATUS
+hsaKmtSetGetXNACKMode(HSAint32 * enable)
+{
+	struct kfd_ioctl_set_xnack_mode_args args;
+
+	CHECK_KFD_OPEN();
+
+	args.xnack_enabled = *enable;
+
+	if (kmtIoctl(kfd_fd, AMDKFD_IOC_SET_XNACK_MODE, &args)) {
+		if (errno == EPERM) {
+			pr_debug("set mode not supported %s\n",
+				 strerror(errno));
+			return HSAKMT_STATUS_NOT_SUPPORTED;
+		} else if (errno == EBUSY) {
+			pr_debug("kmtIoctl queues not empty %s\n",
+				 strerror(errno));
+		}
+		return HSAKMT_STATUS_ERROR;
+	}
+
+	*enable = args.xnack_enabled;
+
+	return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Enable (positive) or disable (0) XNACK mode for the current process */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtSetXNACKMode(HSAint32 enable)
+{
+	return hsaKmtSetGetXNACKMode(&enable);
+}
+
+/* Return the current XNACK mode in *enable without changing it */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetXNACKMode(HSAint32 * enable)
+{
+	if (!enable)
+		return HSAKMT_STATUS_INVALID_PARAMETER;
+
+	/* A negative value queries the mode without modifying it */
+	*enable = -1;
+	return hsaKmtSetGetXNACKMode(enable);
+}
diff --git a/src/topology.c b/src/topology.c
index d5c054d24a..8a8cc96272 100644
--- a/src/topology.c
+++ b/src/topology.c
@@ -224,6 +224,10 @@ static const struct hsa_gfxip_table gfxip_lookup_table[] = {
 	{ 0x738C, 9, 0, 8, "Arcturus", CHIP_ARCTURUS },
 	{ 0x738E, 9, 0, 8, "Arcturus", CHIP_ARCTURUS },
 	{ 0x7390, 9, 0, 8, "Arcturus", CHIP_ARCTURUS },
+	/* Aldebaran */
+	{ 0x7408, 9, 0, 10, "Aldebaran", CHIP_ALDEBARAN },
+	{ 0x740C, 9, 0, 10, "Aldebaran", CHIP_ALDEBARAN },
+	{ 0x740F, 9, 0, 10, "Aldebaran", CHIP_ALDEBARAN },
 	/* Navi10 */
 	{ 0x7310, 10, 1, 0, "Navi10", CHIP_NAVI10 },
 	{ 0x7312, 10, 1, 0, "Navi10", CHIP_NAVI10 },
diff --git a/tests/kfdtest/CMakeLists.txt b/tests/kfdtest/CMakeLists.txt
index 17d4b9bf4a..a8c9f30299 100644
--- a/tests/kfdtest/CMakeLists.txt
+++ b/tests/kfdtest/CMakeLists.txt
@@ -38,6 +38,16 @@ set ( CPACK_PACKAGE_VERSION_MINOR "0" )
 set ( CPACK_PACKAGE_VERSION_PATCH "0" )
 set ( CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" )
 
+## Define default variable and variables for the optional build target hsakmt-dev
+set ( SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Location of hsakmt source code." )
+set ( CMAKE_INSTALL_PREFIX "/opt/rocm"  CACHE STRING "Default installation directory." )
+set ( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}"  CACHE STRING "Default packaging prefix." )
+set ( CPACK_GENERATOR "DEB;RPM"  CACHE STRING "Default packaging generators." )
+
+# Debian package specific variables
+set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" )
+
+
 #set ( CMAKE_VERBOSE_MAKEFILE on )
 
 find_package(PkgConfig)
@@ -57,29 +67,19 @@ else()
     include_directories(${DRM_AMDGPU_INCLUDE_DIRS})
 endif()
 
-## Define default variable and variables for the optional build target hsakmt-dev
-set ( SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Location of hsakmt source code." )
-set ( CMAKE_INSTALL_PREFIX "/opt/rocm"  CACHE STRING "Default installation directory." )
-set ( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}"  CACHE STRING "Default packaging prefix." )
-set ( CPACK_GENERATOR "DEB;RPM"  CACHE STRING "Default packaging generators." )
-
-# Debian package specific variables
-set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" )
-
 if( DEFINED ENV{LIBHSAKMT_PATH} )
     set ( LIBHSAKMT_PATH $ENV{LIBHSAKMT_PATH} )
     message ( "LIBHSAKMT_PATH environment variable is set" )
 else()
     if ( ${ROCM_INSTALL_PATH} )
-       set ( ENV{PKG_CONFIG_PATH} ${ROCM_INSTALL_PATH} )
-       pkg_check_modules(HSAKMT libhsakmt)
+       set ( ENV{PKG_CONFIG_PATH} ${ROCM_INSTALL_PATH}/share/pkgconfig )
     else()
-       set ( ENV{PKG_CONFIG_PATH} /opt/rocm/libhsakmt/ )
-       pkg_check_modules(HSAKMT libhsakmt)
+       set ( ENV{PKG_CONFIG_PATH} /opt/rocm/share/pkgconfig )
     endif()
+
+    pkg_check_modules(HSAKMT libhsakmt)
+
     if( NOT HSAKMT_FOUND )
-       set ( ENV{PKG_CONFIG_PATH} /opt/rocm/libhsakmt/ )
-       pkg_check_modules(HSAKMT libhsakmt)
        set ( LIBHSAKMT_PATH $ENV{OUT_DIR} )
     endif()
 endif()
@@ -109,6 +109,7 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp
   src/GoogleTestExtension.cpp
   src/IndirectBuffer.cpp
   src/IsaGenerator.cpp
+  src/IsaGenerator_Aldebaran.cpp
   src/IsaGenerator_Gfx10.cpp
   src/IsaGenerator_Gfx72.cpp
   src/IsaGenerator_Gfx8.cpp
diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude
index f526986c87..f2b605cf84 100644
--- a/tests/kfdtest/scripts/kfdtest.exclude
+++ b/tests/kfdtest/scripts/kfdtest.exclude
@@ -20,6 +20,7 @@ FILTER[core_sws]=\
 "KFDQMTest.AllSdmaQueues:"\
 "KFDQMTest.AllXgmiSdmaQueues:"\
 "KFDQMTest.AllQueues:"\
+"KFDLocalMemoryTest.AccessLocalMem:"\
 "KFDEventTest.SignalEvent"
 
 # HWS mode
@@ -199,6 +200,13 @@ FILTER[arcturus]=\
 "KFDQMTest.BasicCuMaskingEven:"\
 "KFDEvictTest.BurstyTest"
 
+FILTER[aldebaran]=\
+"$BLACKLIST_ALL_ASICS:"\
+"KFDExceptionTest.FaultStorm:"\
+"KFDEvictTest.BurstyTest:"\
+"KFDMemoryTest.PtraceAccess:"\
+"KFDMemoryTest.DeviceHdpFlush"
+
 FILTER[navi10]=\
 "$BLACKLIST_ALL_ASICS:"\
 "KFDMemoryTest.MMBench"
diff --git a/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt b/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt
index db59e3716e..ce8a3cb33f 100644
--- a/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt
+++ b/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt
@@ -39,7 +39,7 @@ set ( SCLIB_SRC ${PROJECT_SOURCE_DIR} )
 #endif()
 
 include_directories(${SCLIB_SRC}/sp3)
-include_directories(${SCLIB_SRC}/sp3/release_headers)
+#include_directories(${SCLIB_SRC}/sp3/release_headers)
 include_directories(${SCLIB_SRC}/sp3/gen)
 
 set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-asic.c )
@@ -53,15 +53,17 @@ set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-cipher.c )
 set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-vm.c )
 
 aux_source_directory(${SCLIB_SRC}/sp3/gen SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/si SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/ci SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/gfx8 SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/gfx81 SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/gfx9 SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/gfx10 SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/release_headers/gfx81 SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/release_headers/gfx9 SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/release_headers/gfx10 SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/si/lib SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/ci/lib SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx8/lib SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx81/lib SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx9/lib SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx10/lib SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/aldbrn/lib SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx81/arch SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx9/arch SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx10/arch SRC_FILES)
+aux_source_directory(${SCLIB_SRC}/sp3/backend/aldbrn/arch SRC_FILES)
 
 
 message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} )
@@ -70,7 +72,7 @@ message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} )
 #  message(STATUS "${file}")
 #endforeach()
 
-set ( CMAKE_C_FLAGS "-DSP3_STATIC_LIB -Wno-error -DPUBLIC_RELEASE -DLITTLEENDIAN_CPU -fPIC -DGFX10_BUILD" )
+set ( CMAKE_C_FLAGS "-DSP3_STATIC_LIB -Wno-error -DPUBLIC_RELEASE -DLITTLEENDIAN_CPU -fPIC -DGFX101_BUILD -DALDBRN_BUILD" )
 
 add_library(amdsp3 ${SRC_FILES})
 
diff --git a/tests/kfdtest/sp3/lib_helper/build_sp3.sh b/tests/kfdtest/sp3/lib_helper/build_sp3.sh
index 7cd20ccfb5..f93f145da6 100755
--- a/tests/kfdtest/sp3/lib_helper/build_sp3.sh
+++ b/tests/kfdtest/sp3/lib_helper/build_sp3.sh
@@ -44,7 +44,7 @@ popd
 
 rsync --progress -a build/libamdsp3.a $LIB_OUTPUT
 # Put the intermediate header files in the current folder for further processing
-rsync --progress -a $SP3_PROJECT/sp3/sp3.h .
+rsync --progress -a $SP3_PROJECT/sp3/public/lib/sp3.h .
 
 # Remove the build folder and CMakeLists.txt put into SP source folder
 rm -r build
diff --git a/tests/kfdtest/sp3/sp3.h b/tests/kfdtest/sp3/sp3.h
index d6235be5d8..513167d595 100644
--- a/tests/kfdtest/sp3/sp3.h
+++ b/tests/kfdtest/sp3/sp3.h
@@ -54,7 +54,9 @@ enum sp3_shtype {
     SP3_SHTYPE_HS   = 4,
     SP3_SHTYPE_LS   = 5,
     SP3_SHTYPE_CS   = 6,
+#ifdef NAVI10LITE_BUILD
     SP3_SHTYPE_ACV  = 7,
+#endif
 };
 
 /// Assorted constants used by sp3 API.
@@ -107,10 +109,12 @@ struct sp3_shader {
     uint32_t size;              ///< Size of the compiled shader, in 32-bit words.
     uint32_t nsgprs;            ///< Number of scalar GPRs used.
     uint32_t nvgprs;            ///< Number of vector GPRs used.
-    uint32_t nsvgprs;           ///< Number of shared vector GPRs used.
+    uint32_t nsvgprs;           ///< Number of shared vector GPRs used (only available in certain projects).
+    uint32_t naccvgprs;         ///< Number of accumulator vector GPRs used (only available in certain projects).
     uint32_t nsgprs_manual_alloc;
     uint32_t nvgprs_manual_alloc;
     uint32_t nsvgprs_manual_alloc;
+    uint32_t naccvgprs_manual_alloc;
     uint32_t trap_present;
     uint32_t user_sgpr_count;
     uint32_t scratch_en;
@@ -209,6 +213,13 @@ SP3_EXPORT struct sp3_context *sp3_new(void);
 ///
 /// Currently supported options:
 ///
+/// stdlib (string) -- absolute path to standard library files.  May be a colon-separated list
+/// of paths that will be used to search for stdlib files.  Used by sp3_parse_library().
+///
+/// The following options are deprecated because they take integer arguments; you should use
+/// sp3_set_option_int() for these settings going forward.  They will continue to be accepted by
+/// this API to support legacy users.
+///
 /// Werror (boolean) -- indicates whether warnings should be treated as errors.
 ///
 /// wave_size (integer) -- sets the wave size being used by the draw calls that will be using
@@ -220,11 +231,53 @@ SP3_EXPORT struct sp3_context *sp3_new(void);
 ///
 /// omit_code_end (boolean) -- omit generation of the S_CODE_END footer.
 ///
+/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders.  This is a
+/// dangerous option to allow in general so you must explicitly enable this option, otherwise
+/// the raw_bits() function will always error out.
+///
 SP3_EXPORT void sp3_set_option(
     struct sp3_context *state,
     const char *option,
     const char *value);
 
+/// Set option for sp3.
+///
+/// @param state sp3 context.
+/// @param option Option name. Unknown options will raise an error.
+/// @param value Option value.
+///
+/// Currently supported options:
+///
+/// Werror (boolean) -- indicates whether warnings should be treated as errors.
+///
+/// wave_size (integer) -- sets the wave size being used by the draw calls that will be using
+/// this shader.  Ignored in certain ASICs.  You may set this to 32, 64 or the special value 0
+/// to indicate no preference on wave size.  The shader will be checked to ensure it is
+/// compatible with the size specified here.
+///
+/// omit_version (boolean) -- omit generation of the S_VERSION opcode.
+///
+/// omit_code_end (boolean) -- omit generation of the S_CODE_END footer.
+///
+/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders.  This is a
+/// dangerous option to allow in general so you must explicitly enable this option, otherwise
+/// the raw_bits() function will always error out.
+///
+/// secure_mode (boolean) -- run in secure mode. Disables macro language features in assembly
+/// path including calls to custom functions. Useful if sp3 is used as a backend to a web-based
+/// assembly tool.
+///
+/// debug_encoding (boolean) -- if true, debug encoding selection logic for assembly. Only
+/// supported in 10.4+ backends.
+///
+/// no_vs_export_check (boolean) -- if true, disable VS export sanity check.  Only supported in
+/// 10.4+ backends.
+///
+SP3_EXPORT void sp3_set_option_int(
+    struct sp3_context *state,
+    const char *option,
+    int32_t value);
+
 /// Parse a file into a context.
 ///
 /// Use sp3_compile to generate binary microcode after the shader is parsed.
diff --git a/tests/kfdtest/src/BaseQueue.cpp b/tests/kfdtest/src/BaseQueue.cpp
index e66d3dd784..dd1620168c 100644
--- a/tests/kfdtest/src/BaseQueue.cpp
+++ b/tests/kfdtest/src/BaseQueue.cpp
@@ -48,7 +48,8 @@ HSAKMT_STATUS BaseQueue::Create(unsigned int NodeId, unsigned int size, HSAuint6
 
     memset(&m_Resources, 0, sizeof(m_Resources));
 
-    m_QueueBuf = new HsaMemoryBuffer(size, NodeId, true/*zero*/, false/*local*/, true/*exec*/);
+    m_QueueBuf = new HsaMemoryBuffer(size, NodeId, true/*zero*/, false/*local*/, true/*exec*/,
+                        /*isScratch */ false, /* isReadOnly */false, /* isUncached */true);
 
     if (type == HSA_QUEUE_COMPUTE_AQL) {
         m_Resources.Queue_read_ptr_aql = &pointers[0];
diff --git a/tests/kfdtest/src/Dispatch.cpp b/tests/kfdtest/src/Dispatch.cpp
index 6ed67ce39b..3aa3892f38 100644
--- a/tests/kfdtest/src/Dispatch.cpp
+++ b/tests/kfdtest/src/Dispatch.cpp
@@ -30,6 +30,8 @@
 
 #include "KFDBaseComponentTest.hpp"
 
+#define mmCOMPUTE_PGM_RSRC3                                                     0x2e2d
+
 Dispatch::Dispatch(const HsaMemoryBuffer& isaBuf, const bool eventAutoReset)
     :m_IsaBuf(isaBuf), m_IndirectBuf(PACKETTYPE_PM4, PAGE_SIZE / sizeof(unsigned int), isaBuf.Node()),
     m_DimX(1), m_DimY(1), m_DimZ(1), m_pArg1(NULL), m_pArg2(NULL), m_pEop(NULL), m_ScratchEn(false),
@@ -218,6 +220,12 @@ void Dispatch::BuildIb() {
     m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC1, COMPUTE_PGM_RSRC,
                                                   ARRAY_SIZE(COMPUTE_PGM_RSRC)));
 
+    if (m_FamilyId == FAMILY_AL) {
+        const unsigned int COMPUTE_PGM_RSRC3[] = {9};
+        m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC3, COMPUTE_PGM_RSRC3,
+                                                      ARRAY_SIZE(COMPUTE_PGM_RSRC3)));
+    }
+
     m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_RESOURCE_LIMITS, COMPUTE_RESOURCE_LIMITS,
                                                   ARRAY_SIZE(COMPUTE_RESOURCE_LIMITS)));
     m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_TMPRING_SIZE, COMPUTE_TMPRING_SIZE,
diff --git a/tests/kfdtest/src/IndirectBuffer.cpp b/tests/kfdtest/src/IndirectBuffer.cpp
index b820230b0d..4e3907cc5b 100644
--- a/tests/kfdtest/src/IndirectBuffer.cpp
+++ b/tests/kfdtest/src/IndirectBuffer.cpp
@@ -30,7 +30,8 @@
 IndirectBuffer::IndirectBuffer(PACKETTYPE type,  unsigned int sizeInDWords, unsigned int NodeId)
     :m_NumOfPackets(0), m_MaxSize(sizeInDWords), m_ActualSize(0), m_PacketTypeAllowed(type) {
     m_IndirectBuf = new HsaMemoryBuffer(sizeInDWords*sizeof(unsigned int), NodeId, true/*zero*/,
-                                        false/*local*/, true/*exec*/);
+                                        false/*local*/, true/*exec*/, false/*isScratch*/,
+                                        false/*isReadOnly*/, true/*isUncached*/);
 }
 
 IndirectBuffer::~IndirectBuffer(void) {
diff --git a/tests/kfdtest/src/IsaGenerator.cpp b/tests/kfdtest/src/IsaGenerator.cpp
index 9c7376a0a4..3e69b5f9df 100644
--- a/tests/kfdtest/src/IsaGenerator.cpp
+++ b/tests/kfdtest/src/IsaGenerator.cpp
@@ -30,6 +30,7 @@
 #include "IsaGenerator_Gfx8.hpp"
 #include "IsaGenerator_Gfx9.hpp"
 #include "IsaGenerator_Gfx10.hpp"
+#include "IsaGenerator_Aldebaran.hpp"
 
 #include "GoogleTestExtension.hpp"
 
@@ -93,6 +94,8 @@ IsaGenerator* IsaGenerator::Create(unsigned int familyId) {
     case FAMILY_RV:
     case FAMILY_AR:
         return new IsaGenerator_Gfx9;
+    case FAMILY_AL:
+        return new IsaGenerator_Aldbrn;
     case FAMILY_NV:
         return new IsaGenerator_Gfx10;
 
diff --git a/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp b/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp
new file mode 100644
index 0000000000..2c377f9111
--- /dev/null
+++ b/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "IsaGenerator_Aldebaran.hpp"
+
+#include <algorithm>
+#include <string>
+
+const std::string IsaGenerator_Aldbrn::ASIC_NAME = "ALDEBARAN";
+
+/* The binaries are generated from following ISA */
+#if 0
+/* flat_atomic_inc is not supported by some PCIe implementations; use flat_atomic_add instead */
+shader atomic_add
+asic(ALDEBARAN)
+type(CS)
+    v_mov_b32 v0, s0
+    v_mov_b32 v1, s1
+    v_mov_b32 v2, 1
+    flat_atomic_add v3, v[0:1], v2 slc glc scc
+    s_waitcnt 0
+    s_endpgm
+end
+
+shader copy_dword
+asic(ALDEBARAN)
+type(CS)
+/* copy the parameters from scalar registers to vector registers */
+    v_mov_b32 v0, s0
+    v_mov_b32 v1, s1
+    v_mov_b32 v2, s2
+    v_mov_b32 v3, s3
+/* copy a dword between the passed addresses */
+    flat_load_dword v4, v[0:1] slc glc
+    s_waitcnt 0
+    flat_store_dword v[2:3], v4 slc glc
+    s_endpgm
+end
+
+shader main
+asic(ALDEBARAN)
+type(CS)
+loop:
+    s_branch loop
+    s_endpgm
+end
+
+
+#endif
+
+const uint32_t IsaGenerator_Aldbrn::NOOP_ISA[] = {
+    0xbf810000
+};
+
+const uint32_t IsaGenerator_Aldbrn::COPY_DWORD_ISA[] = {
+    0x7e000200, 0x7e020201,
+    0x7e040202, 0x7e060203,
+    0xdc530000, 0x047f0000,
+    0xbf8c0000, 0xdc730000,
+    0x007f0402, 0xbf810000
+};
+
+const uint32_t IsaGenerator_Aldbrn::INFINITE_LOOP_ISA[] = {
+    0xbf82ffff, 0xbf810000
+};
+
+const uint32_t IsaGenerator_Aldbrn::ATOMIC_ADD_ISA[] = {
+    0x7e000200, 0x7e020201,
+    0x7e040281, 0xdf0b0000,
+    0x037f0200, 0xbf8c0000,
+    0xbf810000, 0x00000000
+};
+
+void IsaGenerator_Aldbrn::GetNoopIsa(HsaMemoryBuffer& rBuf) {
+    std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
+}
+
+void IsaGenerator_Aldbrn::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
+    std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
+}
+
+void IsaGenerator_Aldbrn::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
+    std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
+}
+
+void IsaGenerator_Aldbrn::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
+    std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
+}
+
+const std::string& IsaGenerator_Aldbrn::GetAsicName() {
+    return ASIC_NAME;
+}
+
diff --git a/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp b/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp
new file mode 100644
index 0000000000..5571b91c26
--- /dev/null
+++ b/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef _ISAGENERATOR_ALDEBARAN_H_
+#define _ISAGENERATOR_ALDEBARAN_H_
+
+#include <string>
+#include "IsaGenerator.hpp"
+
+class IsaGenerator_Aldbrn : public IsaGenerator {
+ public:
+    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
+    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
+    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
+    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
+
+ protected:
+    virtual const std::string& GetAsicName();
+
+ private:
+    static const std::string ASIC_NAME;
+
+    static const uint32_t NOOP_ISA[];
+    static const uint32_t COPY_DWORD_ISA[];
+    static const uint32_t INFINITE_LOOP_ISA[];
+    static const uint32_t ATOMIC_ADD_ISA[];
+};
+
+#endif  // _ISAGENERATOR_ALDEBARAN_H_
diff --git a/tests/kfdtest/src/KFDCWSRTest.cpp b/tests/kfdtest/src/KFDCWSRTest.cpp
index 8306d05fff..daa92c9823 100644
--- a/tests/kfdtest/src/KFDCWSRTest.cpp
+++ b/tests/kfdtest/src/KFDCWSRTest.cpp
@@ -198,11 +198,13 @@ TEST_F(KFDCWSRTest, BasicTest) {
         int i;
         for (i = 0 ; i < wave_number; ++i) {
              if (result1[i] != count1) {
-                 LOG() << "Dispatch 1, work item " << i << ' ' << result1[i] << std::endl;
+                 LOG() << "Dispatch 1, work item [" << std::dec << i << "] "
+                         << result1[i] << " != " << count1 << std::endl;
                  break;
              }
              if (result2[i] != count2) {
-                 LOG() << "Dispatch 2, work item " << i << ' ' << result2[i] << std::endl;
+                 LOG() << "Dispatch 2, work item [" << std::dec << i << "] "
+                         << result2[i] << " != " << count2 << std::endl;
                  break;
              }
         }
diff --git a/tests/kfdtest/src/KFDLocalMemoryTest.cpp b/tests/kfdtest/src/KFDLocalMemoryTest.cpp
index 4c86594ab9..6af6765ac3 100644
--- a/tests/kfdtest/src/KFDLocalMemoryTest.cpp
+++ b/tests/kfdtest/src/KFDLocalMemoryTest.cpp
@@ -50,6 +50,32 @@ void KFDLocalMemoryTest::TearDown() {
     ROUTINE_END
 }
 
+TEST_F(KFDLocalMemoryTest, AccessLocalMem) {
+    TEST_START(TESTPROFILE_RUNALL)
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+
+    //local memory
+    HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false, true);
+    HsaEvent *event;
+    ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event));
+
+    PM4Queue queue;
+
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+
+    queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), 0, 0));
+
+    queue.Wait4PacketConsumption(event);
+
+    hsaKmtDestroyEvent(event);
+    EXPECT_SUCCESS(queue.Destroy());
+
+
+    TEST_END
+}
+
 TEST_F(KFDLocalMemoryTest, BasicTest) {
     TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
     TEST_START(TESTPROFILE_RUNALL);
diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp
index d27d796eb7..7b1e18a5f9 100644
--- a/tests/kfdtest/src/KFDMemoryTest.cpp
+++ b/tests/kfdtest/src/KFDMemoryTest.cpp
@@ -108,6 +108,29 @@ wave_size(32)\n\
 end\n\
 ";
 
+const char* aldbrn_ScratchCopyDword =
+"\
+shader ScratchCopyDword\n\
+asic(ALDEBARAN)\n\
+type(CS)\n\
+/*copy the parameters from scalar registers to vector registers*/\n\
+    v_mov_b32 v0, s0\n\
+    v_mov_b32 v1, s1\n\
+    v_mov_b32 v2, s2\n\
+    v_mov_b32 v3, s3\n\
+/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
+    s_mov_b32 flat_scratch_lo, s4\n\
+    s_mov_b32 flat_scratch_hi, s5\n\
+/*copy a dword between the passed addresses*/\n\
+    flat_load_dword v4, v[0:1] slc\n\
+    s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
+    flat_store_dword v[2:3], v4 slc\n\
+    \n\
+    s_endpgm\n\
+    \n\
+end\n\
+";
+
 
 
 /* Continuously poll src buffer and check buffer value
@@ -131,6 +154,32 @@ type(CS)\n\
     end\n\
 ";
 
+/* Similar to gfx9_PollMemory except that the buffer
+ * polled can be non-coherent memory. SCC system-level
+ * cache coherence is not supported in scalar (smem) path.
+ * Use vmem operations with scc
+ */
+const char* gfx9_PollNCMemory =
+"\
+shader ReadMemory\n\
+asic(ALDEBARAN)\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume src address in s0, s1 and dst address in s2, s3*/\n\
+    v_mov_b32 v6, 0x5678\n\
+    v_mov_b32 v0, s0\n\
+    v_mov_b32 v1, s1\n\
+    LOOP:\n\
+    flat_load_dword v4, v[0:1] scc\n\
+    v_cmp_eq_u32 vcc, v4, v6\n\
+    s_cbranch_vccz   LOOP\n\
+    v_mov_b32 v0, s2\n\
+    v_mov_b32 v1, s3\n\
+    flat_store_dword v[0:1], v6 scc\n\
+    s_endpgm\n\
+    end\n\
+";
+
 const char* gfx10_PollMemory =
 "\
 shader ReadMemory\n\
@@ -226,6 +275,81 @@ type(CS)\n\
     end\n\
 ";
 
+/* Continuously poll the flag at the start of the src buffer.
+ * Once the flag at s[0:1] reads 1,
+ * copy the value from s[0:1]+4 to the dst buffer.
+ */
+const char* gfx9_PollAndCopy =
+"\
+shader CopyMemory\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
+    s_movk_i32 s18, 0x1\n\
+    LOOP:\n\
+    s_load_dword s16, s[0:1], 0x0 glc\n\
+    s_cmp_eq_i32 s16, s18\n\
+    s_cbranch_scc0   LOOP\n\
+    s_load_dword s17, s[0:1], 0x4 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_store_dword s17, s[2:3], 0x0 glc:1\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_endpgm\n\
+    end\n\
+";
+
+const char* gfx9aldbrn_PollAndCopy =
+"\
+shader CopyMemory\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
+    v_mov_b32 v0, s0\n\
+    v_mov_b32 v1, s1\n\
+    v_mov_b32 v18, 0x1\n\
+    LOOP:\n\
+    flat_load_dword v16, v[0:1] scc:1\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    v_cmp_eq_i32 vcc, v16, v18\n\
+    s_cbranch_vccz   LOOP\n\
+    buffer_invl2\n\
+    s_load_dword s17, s[0:1], 0x4 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_store_dword s17, s[2:3], 0x0 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    buffer_wbl2\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_endpgm\n\
+    end\n\
+";
+
+/* Input0: A buffer of at least 2 dwords.
+ * DW0: used as a signal. Write 0x1 to signal
+ * DW1: Write the value from 2nd input buffer
+ *      for other device to read.
+ * Input1: A buffer of at least 2 dwords.
+ * DW0: used as the value to be written.
+ */
+const char* gfx9aldbrn_WriteFlagAndValue =
+"\
+shader WriteMemory\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\
+    v_mov_b32 v0, s0\n\
+    v_mov_b32 v1, s1\n\
+    s_load_dword s18, s[2:3], 0x0 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_store_dword s18, s[0:1], 0x4 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    buffer_wbl2\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    v_mov_b32 v16, 0x1\n\
+    flat_store_dword v[0:1], v16 scc:1\n\
+    s_endpgm\n\
+    end\n\
+";
+
 const char* gfx10_WriteAndSignal =
 "\
 shader WriteAndSignal\n\
@@ -389,7 +513,11 @@ TEST_F(KFDMemoryTest, MapUnmapToNodes) {
     else
         pReadMemory = gfx10_PollMemory;
 
-    m_pIsaGen->CompileShader(pReadMemory, "ReadMemory", isaBuffer);
+    if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
+        /* On A+A system memory is mapped as NC */
+        m_pIsaGen->CompileShader(gfx9_PollNCMemory, "ReadMemory", isaBuffer);
+    else
+        m_pIsaGen->CompileShader(pReadMemory, "ReadMemory", isaBuffer);
 
     PM4Queue pm4Queue;
     ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
@@ -485,14 +613,18 @@ TEST_F(KFDMemoryTest, AccessPPRMem) {
 
     ASSERT_SUCCESS(queue.Create(defaultGPUNode));
 
+    HsaEvent *event;
+    ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event));
+
     queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf,
                                 0xABCDEF09, 0x12345678));
 
-    queue.Wait4PacketConsumption();
+    queue.Wait4PacketConsumption(event);
 
     WaitOnValue(destBuf, 0xABCDEF09);
     WaitOnValue(destBuf + 1, 0x12345678);
 
+    hsaKmtDestroyEvent(event);
     EXPECT_SUCCESS(queue.Destroy());
 
     /* This sleep hides the dmesg PPR message storm on Raven, which happens
@@ -726,8 +858,10 @@ TEST_F(KFDMemoryTest, FlatScratchAccess) {
     const char *pScratchCopyDword;
     if (m_FamilyId < FAMILY_AI)
         pScratchCopyDword = gfx8_ScratchCopyDword;
-    else if (m_FamilyId < FAMILY_NV)
+    else if (m_FamilyId < FAMILY_AL)
         pScratchCopyDword = gfx9_ScratchCopyDword;
+    else if (m_FamilyId == FAMILY_AL)
+        pScratchCopyDword = aldbrn_ScratchCopyDword;
     else
         pScratchCopyDword = gfx10_ScratchCopyDword;
     m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer);
@@ -1514,6 +1648,7 @@ TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) {
     mem1 = reinterpret_cast<void *>(reinterpret_cast<HSAuint8 *>(mem) + VRAM_OFFSET + sizeof(HSAuint64));
     PM4Queue queue;
     ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+
     queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int *)mem0,
                                                   data0[0], data0[1]));
     queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int *)mem1,
@@ -1592,8 +1727,10 @@ TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) {
     const char *pScratchCopyDword;
     if (m_FamilyId < FAMILY_AI)
         pScratchCopyDword = gfx8_ScratchCopyDword;
-    else if (m_FamilyId < FAMILY_NV)
+    else if (m_FamilyId < FAMILY_AL)
         pScratchCopyDword = gfx9_ScratchCopyDword;
+    else if (m_FamilyId == FAMILY_AL)
+        pScratchCopyDword = aldbrn_ScratchCopyDword;
     else
         pScratchCopyDword = gfx10_ScratchCopyDword;
 
@@ -2294,3 +2431,211 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) {
 
     TEST_END
 }
+
+/* This test covers the new cache coherence on Aldebaran. It verifies
+ * that two GPUs can coherently share a fine-grained FB.
+ */
+TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */);
+    volatile HSAuint32 *tmp = tmpBuffer.As<volatile HSAuint32 *>();
+    const int dwSource = 0x40 * sizeof(int); /* At 3rd cache line */
+    const int dwLocation = 0x80 * sizeof(int); /* At 5th cache line  */
+
+    if (m_FamilyId != FAMILY_AL) {
+        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
+        return;
+    }
+
+    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
+    if (gpuNodes.size() < 2) {
+        LOG() << "Skipping test: At least two GPUs are required." << std::endl;
+        return;
+    }
+
+    HSAuint32 nondefaultNode;
+    for (unsigned i = 0; i < gpuNodes.size(); i++) {
+        if (gpuNodes.at(i) != defaultGPUNode) {
+            nondefaultNode = gpuNodes.at(i);
+            break;
+        }
+    }
+
+    unsigned int nodes[2] = {defaultGPUNode, nondefaultNode};
+
+    /* Allocate a local FB */
+    HsaMemoryBuffer buffer(PAGE_SIZE, defaultGPUNode, false/*zero*/, true/*local*/, false/*exec*/);
+    buffer.MapMemToNodes(&nodes[0], 2);
+    SDMAQueue sdmaQueue;
+    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
+    buffer.Fill(0, sdmaQueue, 0, PAGE_SIZE);
+    buffer.Fill(0x5678, sdmaQueue, dwSource, 4);
+
+    /* Read buffer[0] as flag from local shader to fill cache line (64 dws)
+     * which should have 0 at buffer[1]
+     */
+    PM4Queue queue;
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+    m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+    Dispatch dispatch(isaBuffer);
+    dispatch.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwLocation);
+    dispatch.Submit(queue);
+
+    /* Delay 100ms to make sure shader executed*/
+    Delay(100);
+
+    /* Using remote shader to write the flag and copy value from dwSource
+     * to dwLocation in buffer.
+     * Local shader should get the flag and execute CopyMemory
+     */
+    PM4Queue queue1;
+    ASSERT_SUCCESS(queue1.Create(nondefaultNode));
+    HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/);
+    m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1);
+    Dispatch dispatch1(isaBuffer1);
+    dispatch1.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwSource);
+    dispatch1.Submit(queue1);
+    dispatch1.Sync(g_TestTimeOut);
+
+    /* Check test result*/
+    dispatch.Sync(g_TestTimeOut);
+    EXPECT_EQ(buffer.IsPattern(dwLocation, 0x5678, sdmaQueue, tmp), true);
+
+    // Clean up
+    EXPECT_SUCCESS(queue.Destroy());
+    EXPECT_SUCCESS(queue1.Destroy());
+    EXPECT_SUCCESS(sdmaQueue.Destroy());
+
+    TEST_END
+}
+
+/* This test covers the new cache coherence on A+A (Aldebaran). It verifies
+ * the new XGMI coherence HW link between the CPU and GPU caches
+ * for local FB in fine-grained mode.
+ */
+TEST_F(KFDMemoryTest, VramCacheCoherenceWithCPU) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (m_FamilyId != FAMILY_AL) {
+        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
+        return;
+    }
+
+    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    const int dwLocation = 0x80;
+
+    if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) {
+        LOG() << "Skipping test: XGMI link to CPU is required." << std::endl;
+        return;
+    }
+
+    unsigned int *buffer;
+    HsaMemFlags memFlags = {0};
+    /* Allocate a fine grain local FB accessed by CPU */
+    memFlags.ui32.HostAccess = 1;
+    memFlags.ui32.NonPaged = 1;
+    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, memFlags,
+            reinterpret_cast<void**>(&buffer)));
+    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(buffer, PAGE_SIZE, NULL));
+    buffer[0] = 0;
+    buffer[dwLocation] = 0;
+
+    /* Read buffer from shader to fill cache */
+    PM4Queue queue;
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+    m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+    Dispatch dispatch(isaBuffer);
+    dispatch.SetArgs(buffer, buffer+dwLocation);
+    dispatch.Submit(queue);
+
+    /* Delay 100ms to make sure shader executed*/
+    Delay(100);
+
+    /* CPU writes to buffer. Shader should read the 0x5678 written by the CPU
+     * after the cache invalidate (buffer_invl2), and then exit.
+     */
+    buffer[1] = 0x5678;
+    buffer[0] = 1;
+
+    /* Check test result*/
+    dispatch.Sync(g_TestTimeOut);
+    EXPECT_EQ(buffer[dwLocation], 0x5678);
+
+    // Clean up
+    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(buffer));
+    EXPECT_SUCCESS(hsaKmtFreeMemory(buffer, PAGE_SIZE));
+    EXPECT_SUCCESS(queue.Destroy());
+
+    TEST_END
+}
+
+/* This test covers the new cache coherence on Aldebaran. It verifies
+ * the new XGMI coherence HW link between the CPU and GPU caches
+ * for system RAM.
+ */
+TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (m_FamilyId != FAMILY_AL) {
+        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
+        return;
+    }
+
+    unsigned int *fineBuffer = NULL;
+    unsigned int tmp;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    const int dwLocation = 0x80;
+
+    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags,
+                       reinterpret_cast<void**>(&fineBuffer)));
+    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(fineBuffer, PAGE_SIZE, NULL));
+    fineBuffer[0] = 0;
+    fineBuffer[1] = 0;
+    /* Read buffer from CPU to fill cache */
+    tmp = fineBuffer[dwLocation];
+
+    /* Read fine grain buffer from shader to fill cache */
+    PM4Queue queue;
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+
+    if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
+        m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+    else
+        m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer);
+
+    Dispatch dispatch(isaBuffer);
+    dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation);
+    dispatch.Submit(queue);
+
+    /* Delay 100ms to make sure shader executed*/
+    Delay(100);
+
+    /* CPU writes to buffer. Shader should read what the CPU wrote and then exit. */
+    fineBuffer[1] = 0x5678;
+    fineBuffer[0] = 1;
+
+    /* Check test result, assuming KFDEventTest.SignalEvent has passed.
+     * If Sync times out,
+     * there is a coherence issue: the GPU did not read what the CPU wrote.
+     * If the buffer value is not the expected one,
+     * there is a coherence issue: the CPU did not read what the GPU wrote.
+     */
+    dispatch.Sync(g_TestTimeOut);
+    EXPECT_EQ(fineBuffer[dwLocation], 0x5678);
+
+    // Clean up
+    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(fineBuffer));
+    EXPECT_SUCCESS(hsaKmtFreeMemory(fineBuffer, PAGE_SIZE));
+    EXPECT_SUCCESS(queue.Destroy());
+
+    TEST_END
+}
diff --git a/tests/kfdtest/src/KFDPMTest.cpp b/tests/kfdtest/src/KFDPMTest.cpp
index 79b385cf72..98c2348a8c 100644
--- a/tests/kfdtest/src/KFDPMTest.cpp
+++ b/tests/kfdtest/src/KFDPMTest.cpp
@@ -78,8 +78,11 @@ TEST_F(KFDPMTest, SuspendWithIdleQueueAfterWork) {
 
     ASSERT_SUCCESS(queue.Create(defaultGPUNode));
 
+    HsaEvent *event;
+    ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event));
+
     queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuffer.As<unsigned int*>(), 0x1, 0x2));
-    queue.Wait4PacketConsumption();
+    queue.Wait4PacketConsumption(event);
     WaitOnValue(&(destBuffer.As<unsigned int*>()[0]), 0x1);
     WaitOnValue(&(destBuffer.As<unsigned int*>()[1]), 0x2);
 
@@ -88,7 +91,7 @@ TEST_F(KFDPMTest, SuspendWithIdleQueueAfterWork) {
     EXPECT_EQ(true, SuspendAndWakeUp());
 
     queue.PlaceAndSubmitPacket(PM4WriteDataPacket(&(destBuffer.As<unsigned int*>()[2]), 0x3, 0x4));
-    queue.Wait4PacketConsumption();
+    queue.Wait4PacketConsumption(event);
 
     EXPECT_EQ(destBuffer.As<unsigned int*>()[0], 0);
     EXPECT_EQ(destBuffer.As<unsigned int*>()[1], 0);
@@ -96,6 +99,7 @@ TEST_F(KFDPMTest, SuspendWithIdleQueueAfterWork) {
     WaitOnValue(&(destBuffer.As<unsigned int*>()[2]), 0x3);
     WaitOnValue(&(destBuffer.As<unsigned int*>()[3]), 0x4);
 
+    hsaKmtDestroyEvent(event);
     EXPECT_SUCCESS(queue.Destroy());
 
     TEST_END
diff --git a/tests/kfdtest/src/KFDQMTest.cpp b/tests/kfdtest/src/KFDQMTest.cpp
index ae561fccb2..6d4cb7cbef 100644
--- a/tests/kfdtest/src/KFDQMTest.cpp
+++ b/tests/kfdtest/src/KFDQMTest.cpp
@@ -78,13 +78,16 @@ TEST_F(KFDQMTest, SubmitNopCpQueue) {
     ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
 
     PM4Queue queue;
+    HsaEvent *event;
+    ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event));
 
     ASSERT_SUCCESS(queue.Create(defaultGPUNode));
 
     queue.PlaceAndSubmitPacket(PM4NopPacket());
 
-    queue.Wait4PacketConsumption();
+    queue.Wait4PacketConsumption(event);
 
+    hsaKmtDestroyEvent(event);
     EXPECT_SUCCESS(queue.Destroy());
 
     TEST_END
@@ -99,17 +102,19 @@ TEST_F(KFDQMTest, SubmitPacketCpQueue) {
     HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false);
 
     destBuf.Fill(0xFF);
+    HsaEvent *event;
+    ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event));
 
     PM4Queue queue;
-
     ASSERT_SUCCESS(queue.Create(defaultGPUNode));
 
     queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), 0, 0));
 
-    queue.Wait4PacketConsumption();
+    queue.Wait4PacketConsumption(event);
 
     EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int*>(), 0));
 
+    hsaKmtDestroyEvent(event);
     EXPECT_SUCCESS(queue.Destroy());
 
     TEST_END
@@ -132,7 +137,7 @@ TEST_F(KFDQMTest, AllCpQueues) {
 
     for (unsigned int qidx = 0; qidx < m_numCpQueues; ++qidx) {
         queues[qidx].PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>()+qidx*2, qidx, qidx));
-
+        queues[qidx].PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, true, 0, 0));
         queues[qidx].Wait4PacketConsumption();
 
         EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int*>()+qidx*2, qidx));
@@ -330,6 +335,7 @@ TEST_F(KFDQMTest, AllQueues) {
 
     for (i = 0; i < numCpQueues; ++i) {
         cpQueues[i].PlaceAndSubmitPacket(PM4WriteDataPacket(destBufCp.As<unsigned int*>()+i*2, i, i));
+        cpQueues[i].PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, true, 0, 0));
 
         cpQueues[i].Wait4PacketConsumption();
 
@@ -460,9 +466,12 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithNullAddress) {
 
     ASSERT_SUCCESS(queue.Create(defaultGPUNode));
 
+    HsaEvent *event;
+    ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event));
+
     queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), 0, 0));
 
-    queue.Wait4PacketConsumption();
+    queue.Wait4PacketConsumption(event);
 
     WaitOnValue(destBuf.As<unsigned int*>(), 0);
 
@@ -480,10 +489,11 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithNullAddress) {
 
     EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false));
 
-    queue.Wait4PacketConsumption();
+    queue.Wait4PacketConsumption(event);
 
     WaitOnValue(destBuf.As<unsigned int*>(), 1);
 
+    hsaKmtDestroyEvent(event);
     EXPECT_SUCCESS(queue.Destroy());
 
     TEST_END
@@ -544,13 +554,16 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithZeroPercentage) {
 
     ASSERT_SUCCESS(queue.Create(defaultGPUNode));
 
+    HsaEvent *event;
+    ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event));
+
     PM4WriteDataPacket packet1, packet2;
     packet1.InitPacket(destBuf.As<unsigned int*>(), 0, 0);
     packet2.InitPacket(destBuf.As<unsigned int*>(), 1, 1);
 
     queue.PlaceAndSubmitPacket(packet1);
 
-    queue.Wait4PacketConsumption();
+    queue.Wait4PacketConsumption(event);
 
     WaitOnValue(destBuf.As<unsigned int*>(), 0);
 
@@ -568,7 +581,7 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithZeroPercentage) {
 
     EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false));
 
-    queue.Wait4PacketConsumption();
+    queue.Wait4PacketConsumption(event);
 
     WaitOnValue(destBuf.As<unsigned int*>(), 1);
 
@@ -1228,6 +1241,8 @@ TEST_F(KFDQMTest, CpuWriteCoherence) {
     HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode);
 
     ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    HsaEvent *event;
+    ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event));
 
     /* The queue might be full and we fail to submit. There is always one word space unused in queue.
      * So let rptr one step ahead then we continually submit packet.
@@ -1249,10 +1264,11 @@ TEST_F(KFDQMTest, CpuWriteCoherence) {
      */
     queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), 0x42, 0x42));
 
-    queue.Wait4PacketConsumption();
+    queue.Wait4PacketConsumption(event);
 
     WaitOnValue(destBuf.As<unsigned int*>(), 0x42);
 
+    hsaKmtDestroyEvent(event);
     TEST_END
 }
 
@@ -1420,18 +1436,22 @@ TEST_F(KFDQMTest, CpQueueWraparound) {
 
     ASSERT_SUCCESS(queue.Create(defaultGPUNode));
 
+    HsaEvent *event;
+    ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event));
+
     for (unsigned int pktIdx = 0; pktIdx <= PAGE_SIZE/sizeof(PM4WRITE_DATA_CI); ++pktIdx) {
         queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), pktIdx, pktIdx));
-        queue.Wait4PacketConsumption();
+        queue.Wait4PacketConsumption(event);
         WaitOnValue(destBuf.As<unsigned int*>(), pktIdx);
     }
 
     for (unsigned int pktIdx = 0; pktIdx <= PAGE_SIZE/sizeof(PM4WRITE_DATA_CI); ++pktIdx) {
         queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), pktIdx, pktIdx));
-        queue.Wait4PacketConsumption();
+        queue.Wait4PacketConsumption(event);
         WaitOnValue(destBuf.As<unsigned int*>(), pktIdx);
     }
 
+    hsaKmtDestroyEvent(event);
     EXPECT_SUCCESS(queue.Destroy());
 
     TEST_END
@@ -1669,18 +1689,13 @@ TEST_F(KFDQMTest, P2PTest) {
     HsaMemFlags memFlags = {0};
     HsaMemMapFlags mapFlags = {0};
     memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
-    memFlags.ui32.HostAccess = 1;
+    memFlags.ui32.HostAccess = 0;
     memFlags.ui32.NonPaged = 1;
     memFlags.ui32.NoNUMABind = 1;
     unsigned int end = size / sizeof(HSAuint32) - 1;
 
-    if (!m_NodeInfo.IsGPUNodeLargeBar(g_TestDstNodeId) &&
-         m_NodeInfo.AreGPUNodesXGMI(g_TestNodeId, g_TestDstNodeId)) {
-        memFlags.ui32.HostAccess = 0;
-    }
-
     /* 1. Allocate a system buffer and allow the access to GPUs */
-    EXPECT_SUCCESS(hsaKmtAllocMemory(0, size, memFlags,
+    EXPECT_SUCCESS(hsaKmtAllocMemory(0, size, m_MemoryFlags,
                                      reinterpret_cast<void **>(&sysBuf)));
     EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(sysBuf, size, NULL,
                                              mapFlags, nodes.size(), (HSAuint32 *)&nodes[0]));
diff --git a/tests/kfdtest/src/KFDTestFlags.hpp b/tests/kfdtest/src/KFDTestFlags.hpp
index 921b8bc832..9087ba23f8 100644
--- a/tests/kfdtest/src/KFDTestFlags.hpp
+++ b/tests/kfdtest/src/KFDTestFlags.hpp
@@ -59,6 +59,7 @@ enum KfdFamilyId {
     FAMILY_AI,  // Arctic Islands
     FAMILY_RV,  // Raven
     FAMILY_AR,  // Arcturus
+    FAMILY_AL,  // Aldebaran
     FAMILY_NV,  // Navi10
 };
 
diff --git a/tests/kfdtest/src/KFDTestUtil.cpp b/tests/kfdtest/src/KFDTestUtil.cpp
index 1e1b85abda..8651eaa51d 100644
--- a/tests/kfdtest/src/KFDTestUtil.cpp
+++ b/tests/kfdtest/src/KFDTestUtil.cpp
@@ -149,8 +149,10 @@ unsigned int FamilyIdFromNode(const HsaNodeProperties *props) {
         familyId = FAMILY_AI;
         if (props->EngineId.ui32.Stepping == 2)
             familyId = FAMILY_RV;
-        if (props->EngineId.ui32.Stepping == 8)
+        else if (props->EngineId.ui32.Stepping == 8)
             familyId = FAMILY_AR;
+        else if (props->EngineId.ui32.Stepping == 10)
+            familyId = FAMILY_AL;
         break;
     case 10:
         familyId = FAMILY_NV;
@@ -201,7 +203,7 @@ HSAuint64 GetSystemTickCountInMicroSec() {
 const HsaMemoryBuffer HsaMemoryBuffer::Null;
 
 HsaMemoryBuffer::HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero, bool isLocal, bool isExec,
-                                 bool isScratch, bool isReadOnly)
+                                 bool isScratch, bool isReadOnly, bool isUncached)
     :m_Size(size),
     m_pUser(NULL),
     m_pBuf(NULL),
@@ -222,11 +224,13 @@ HsaMemoryBuffer::HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero, b
             m_Flags.ui32.HostAccess = 0;
             m_Flags.ui32.NonPaged = 1;
             m_Flags.ui32.CoarseGrain = 1;
+            EXPECT_EQ(isUncached, 0) << "Uncached flag is relevant only for system or host memory";
         } else {
             m_Flags.ui32.HostAccess = 1;
             m_Flags.ui32.NonPaged = 0;
             m_Flags.ui32.CoarseGrain = 0;
             m_Flags.ui32.NoNUMABind = 1;
+            m_Flags.ui32.Uncached = isUncached;
         }
 
         if (isExec)
@@ -667,3 +671,26 @@ int HsaNodeInfo::FindAccessiblePeers(std::vector<int> *peers,
     }
     return peers->size();
 }
+
+/* Returns true if any IO link of `node` is an XGMI link to a node with no compute cores (a CPU). */
+const bool HsaNodeInfo::IsNodeXGMItoCPU(int node) const {
+    const HsaNodeProperties *pNodeProperties = GetNodeProperties(node);
+    if (!pNodeProperties || !pNodeProperties->NumIOLinks)
+        return false;
+
+    /* Value-initialized so a failed query below leaves zeroed, not indeterminate, link entries */
+    std::vector<HsaIoLinkProperties> ioLinks(pNodeProperties->NumIOLinks);
+    EXPECT_SUCCESS(hsaKmtGetNodeIoLinkProperties(node, pNodeProperties->NumIOLinks, ioLinks.data()));
+
+    bool ret = false;
+    for (size_t linkId = 0; linkId < ioLinks.size(); linkId++) {
+        EXPECT_EQ(node, ioLinks[linkId].NodeFrom);
+        const HsaNodeProperties *pPeerProperties = GetNodeProperties(ioLinks[linkId].NodeTo);
+        /* The peer lookup can return NULL (it is NULL-checked for `node` above); skip such links */
+        if (pPeerProperties && pPeerProperties->NumFComputeCores == 0 &&
+                ioLinks[linkId].IoLinkType == HSA_IOLINK_TYPE_XGMI)
+            ret = true;
+    }
+
+    return ret;
+}
diff --git a/tests/kfdtest/src/KFDTestUtil.hpp b/tests/kfdtest/src/KFDTestUtil.hpp
index e55ca95062..2076e27df0 100644
--- a/tests/kfdtest/src/KFDTestUtil.hpp
+++ b/tests/kfdtest/src/KFDTestUtil.hpp
@@ -66,7 +66,7 @@ class HsaMemoryBuffer {
 
  public:
     HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero = true, bool isLocal = false,
-                    bool isExec = false, bool isScratch = false, bool isReadOnly = false);
+                    bool isExec = false, bool isScratch = false, bool isReadOnly = false, bool isUncached = false);
     HsaMemoryBuffer(void *addr, HSAuint64 size);
     template<typename RetType>
     RetType As() {
@@ -197,6 +197,11 @@ class HsaNodeInfo {
     const bool AreGPUNodesXGMI(int node0, int node1) const;
     int FindAccessiblePeers(std::vector<int> *peers,
                                         HSAuint32 node) const;
+    /* @brief: determine whether the node is XGMI-linked to a CPU
+     * @param: node index of the node we are looking at
+     * @return: true if the node has an XGMI IO link to a node without compute cores, false otherwise
+     */
+    const bool IsNodeXGMItoCPU(int node) const;
 };
 
 #endif  // __KFD__TEST__UTIL__H__
diff --git a/tests/kfdtest/src/KFDTopologyTest.cpp b/tests/kfdtest/src/KFDTopologyTest.cpp
index 334317e943..c675e4ca88 100644
--- a/tests/kfdtest/src/KFDTopologyTest.cpp
+++ b/tests/kfdtest/src/KFDTopologyTest.cpp
@@ -58,7 +58,6 @@ TEST_F(KFDTopologyTest , BasicTest) {
                 EXPECT_GT(pNodeProperties->EngineId.ui32.uCode, 0) << "uCode version is 0";
                 EXPECT_GE(pNodeProperties->EngineId.ui32.Major, 7) << "Major Version is less than 7";
                 EXPECT_LT(pNodeProperties->EngineId.ui32.Minor, 10) << "Minor Version is greater than 9";
-                EXPECT_LT(pNodeProperties->EngineId.ui32.Stepping, 10) << "Stepping is greater than 9";
                 EXPECT_GT(pNodeProperties->uCodeEngineVersions.uCodeSDMA, 0) << "sDMA firmware version is 0";
             }
             EXPECT_GT(pNodeProperties->NumMemoryBanks, HSAuint32(0)) << "Node index: " << node << "No MemoryBanks.";