Add pointer attributes API

Add two pointer-attribute APIs. hsaKmtQueryPointerInfo lets the user query memory information through a pointer; the pointer may be any address inside a range known to HSA. hsaKmtSetMemoryUserData lets the user attach data to a pointer in order to add memory-tracking information; the pointer must match the start address of a memory allocation or registration. TODO: This patch implements support on dGPU only; APU support still needs to be added. Change-Id: I4711809274248434901f0794f50ebfa13a7371a8 [ROCm/ROCR-Runtime commit: 51e4d27c37]
2016-09-01 23:25:42 -04:00
@@ -661,6 +661,26 @@ hsaKmtGetTileConfig(
    HsaGpuTileConfig*   config      // IN & OUT
    );

+/**
+  Returns information about the memory range that contains Pointer
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtQueryPointerInfo(
+    const void *        Pointer,        //IN: any address inside a range known to HSA
+    HsaPointerInfo *    PointerInfo     //OUT: filled with the attributes of that range
+    );
+
+/**
+  Associates user data with a memory allocation or registration
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSetMemoryUserData(
+    const void *    Pointer,    //IN: start address of the allocation or registration
+    void *          UserData    //IN: opaque value, reported back in HsaPointerInfo.UserData
+    );
+
 #ifdef __cplusplus
 }   //extern "C"
 #endif
@@ -982,6 +982,28 @@ typedef struct _HsaGpuTileConfig
    HSAuint32 Reserved[7]; /* Round up to 16 dwords for future extension */
 } HsaGpuTileConfig;

+typedef enum _HSA_POINTER_TYPE {         // Memory kind reported in HsaPointerInfo.Type
+    HSA_POINTER_UNKNOWN = 0,             // No tracked allocation or registration matches the pointer
+    HSA_POINTER_ALLOCATED = 1,           // Allocated with hsaKmtAllocMemory (except scratch)
+    HSA_POINTER_REGISTERED_USER = 2,     // Registered user pointer
+    HSA_POINTER_REGISTERED_GRAPHICS = 3  // Registered graphics buffer
+                                         // (hsaKmtRegisterGraphicsToNodes)
+} HSA_POINTER_TYPE;
+
+typedef struct _HsaPointerInfo {         // Result of hsaKmtQueryPointerInfo
+    HSA_POINTER_TYPE   Type;             // Pointer type
+    HSAuint32          Node;             // Node where the memory is located
+    HsaMemFlags        MemFlags;         // Only valid for HSA_POINTER_ALLOCATED
+    void               *CPUAddress;      // Start address for CPU access
+    HSAuint64          GPUAddress;       // Start address for GPU access
+    HSAuint64          SizeInBytes;      // Size in bytes
+    HSAuint32          NRegisteredNodes; // Number of entries in RegisteredNodes
+    HSAuint32          NMappedNodes;     // Number of entries in MappedNodes
+    const HSAuint32    *RegisteredNodes; // Node IDs the memory is registered to; array is owned by the library
+    const HSAuint32    *MappedNodes;     // Node IDs the memory is mapped to; array is owned by the library
+    void               *UserData;        // User data associated with the memory (hsaKmtSetMemoryUserData)
+} HsaPointerInfo;
+
 #pragma pack(pop, hsakmttypes_h)


@@ -49,8 +49,13 @@
 struct vm_object {
 	void *start;
 	void *userptr;
-	uint64_t size;
+	uint64_t userptr_size;
+	uint64_t size; /* size allocated on GPU. When the user requests a random
+		        * size, Thunk aligns it to page size and allocates this
+		        * aligned size on GPU
+		        */
 	uint64_t handle; /* opaque */
+	uint32_t node_id;
 	struct vm_object *next;
 	struct vm_object *prev;
 	uint32_t flags; /* memory allocation flags */
@@ -59,13 +64,17 @@ struct vm_object {
 	 */
 	uint32_t *registered_device_id_array;
 	uint32_t registered_device_id_array_size;
+	uint32_t *registered_node_id_array;
 	/*
 	 * Nodes that mapped already
 	 */
 	uint32_t *mapped_device_id_array;
 	uint32_t mapped_device_id_array_size;
+	uint32_t *mapped_node_id_array;
 	/* Metadata of imported graphics buffers */
 	void *metadata;
+	/* User data associated with the memory */
+	void *user_data;
 };
 typedef struct vm_object vm_object_t;

@@ -181,11 +190,14 @@ static vm_object_t *vm_create_and_init_object(void *start, uint64_t size,
 	if (object) {
 		object->start = start;
 		object->userptr = NULL;
+		object->userptr_size = 0;
 		object->size = size;
 		object->handle = handle;
 		object->next = object->prev = NULL;
 		object->registered_device_id_array_size = 0;
 		object->mapped_device_id_array_size = 0;
+		object->registered_node_id_array = NULL;
+		object->mapped_node_id_array = NULL;
 		object->flags = flags;
 		object->metadata = NULL;
 	}
@@ -276,7 +288,22 @@ static void vm_split_area(manageble_aperture_t *app, vm_area_t *area,
 }

 static vm_object_t *vm_find_object_by_address(manageble_aperture_t *app,
-						void *address, uint64_t size)
+						const void *address)
+{
+	vm_object_t *obj;
+
+	/* Return the first object whose [start, start + size) holds address */
+	for (obj = app->vm_objects; obj; obj = obj->next) {
+		if (address < obj->start)
+			continue;
+		if ((uint64_t)address < (uint64_t)obj->start + obj->size)
+			return obj;
+	}
+	return NULL; /* not found */
+}
+
+static vm_object_t *vm_find_object_by_start_address(manageble_aperture_t *app,
+					const void *address, uint64_t size)
 {
 	vm_object_t *cur = app->vm_objects;

@@ -287,13 +314,13 @@ static vm_object_t *vm_find_object_by_address(manageble_aperture_t *app,
 		if (cur->start == address && (cur->size == size || size == 0))
 			break;
 		cur = cur->next;
-	};
+	}

 	return cur; /* NULL if not found */
 }

 static vm_object_t *vm_find_object_by_userptr(manageble_aperture_t *app,
-						void *address)
+						const void *address)
 {
 	vm_object_t *cur = app->vm_objects;

@@ -466,7 +493,10 @@ static int32_t gpu_mem_find_by_gpu_id(uint32_t gpu_id)
 	return -1;
 }

-static int fmm_allocate_memory_in_device(uint32_t gpu_id, void *mem,
+/* After allocating the memory, return the vm_object created for this memory.
+ * Return NULL if any failure.
+ */
+static vm_object_t *fmm_allocate_memory_in_device(uint32_t gpu_id, void *mem,
 						uint64_t MemorySizeInBytes,
 						manageble_aperture_t *aperture,
 						uint64_t *mmap_offset,
@@ -474,9 +504,10 @@ static int fmm_allocate_memory_in_device(uint32_t gpu_id, void *mem,
 {
 	struct kfd_ioctl_alloc_memory_of_gpu_new_args args;
 	struct kfd_ioctl_free_memory_of_gpu_args free_args;
+	vm_object_t *vm_obj = NULL;

 	if (!mem)
-		return -1;
+		return NULL;

 	/* Allocate memory from amdkfd */
 	args.gpu_id = gpu_id;
@@ -490,26 +521,26 @@ static int fmm_allocate_memory_in_device(uint32_t gpu_id, void *mem,
 		args.mmap_offset = *mmap_offset;

 	if (kmtIoctl(kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU_NEW, &args))
-		return -1;
+		return NULL;

 	/* Allocate object */
 	pthread_mutex_lock(&aperture->fmm_mutex);
-	if (!aperture_allocate_object(aperture, mem, args.handle,
-				      MemorySizeInBytes, flags))
+	if (!(vm_obj = aperture_allocate_object(aperture, mem, args.handle,
+				      MemorySizeInBytes, flags)))
 		goto err_object_allocation_failed;
 	pthread_mutex_unlock(&aperture->fmm_mutex);

 	if (mmap_offset)
 		*mmap_offset = args.mmap_offset;

-	return 0;
+	return vm_obj;

 err_object_allocation_failed:
 	pthread_mutex_unlock(&aperture->fmm_mutex);
 	free_args.handle = args.handle;
 	kmtIoctl(kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args);

-	return -1;
+	return NULL;
 }

 bool fmm_is_inside_some_aperture(void *address)
@@ -696,9 +727,11 @@ void *fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes)

 static void* __fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes,
 		manageble_aperture_t *aperture, uint64_t offset, uint64_t *mmap_offset,
-		uint32_t flags)
+		uint32_t flags, vm_object_t **vm_obj)
 {
 	void *mem = NULL;
+	vm_object_t *obj;
+
 	/* Check that aperture is properly initialized/supported */
 	if (!aperture_is_valid(aperture->base, aperture->limit))
 		return NULL;
@@ -713,8 +746,9 @@ static void* __fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes,
 	 * Now that we have the area reserved, allocate memory in the device
 	 * itself
 	 */
-	if (fmm_allocate_memory_in_device(gpu_id, mem,
-			MemorySizeInBytes, aperture, mmap_offset, flags)) {
+	obj = fmm_allocate_memory_in_device(gpu_id, mem,
+			MemorySizeInBytes, aperture, mmap_offset, flags);
+	if (obj == NULL) {
 		/*
 		 * allocation of memory in device failed.
 		 * Release region in aperture
@@ -726,6 +760,8 @@ static void* __fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes,
 		/* Assign NULL to mem to indicate failure to calling function */
 		mem = NULL;
 	}
+	if (vm_obj)
+		*vm_obj = obj;

 	return mem;
 }
@@ -742,6 +778,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFla
 	uint32_t ioc_flags, offset;
 	uint64_t size, mmap_offset;
 	void *mem;
+	vm_object_t *vm_obj = NULL;

 	/* Retrieve gpu_mem id according to gpu_id */
 	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
@@ -766,7 +803,15 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFla

 	mem = __fmm_allocate_device(gpu_id, size,
 			aperture, offset, &mmap_offset,
-			ioc_flags);
+			ioc_flags, &vm_obj);
+
+	if (mem && vm_obj) {
+		pthread_mutex_lock(&aperture->fmm_mutex);
+		/* Store memory allocation flags, not ioc flags */
+		vm_obj->flags = flags.Value;
+		gpuid_to_nodeid(gpu_id, &vm_obj->node_id);
+		pthread_mutex_unlock(&aperture->fmm_mutex);
+	}

 	if (mem && flags.ui32.HostAccess) {
 		void *ret = mmap(mem, MemorySizeInBytes,
@@ -805,7 +850,7 @@ static void* fmm_allocate_host_cpu(uint64_t MemorySizeInBytes,
 	return mem;
 }

-static void* fmm_allocate_host_gpu(uint64_t MemorySizeInBytes,
+static void* fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
 				   HsaMemFlags flags)
 {
 	void *mem;
@@ -815,6 +860,7 @@ static void* fmm_allocate_host_gpu(uint64_t MemorySizeInBytes,
 	uint64_t size;
 	int32_t i;
 	uint32_t gpu_id;
+	vm_object_t *vm_obj = NULL;

 	i = find_first_dgpu(&gpu_id);
 	if (i < 0)
@@ -833,7 +879,15 @@ static void* fmm_allocate_host_gpu(uint64_t MemorySizeInBytes,

 	mem =  __fmm_allocate_device(gpu_id, size,
 			aperture, 0, &mmap_offset,
-			ioc_flags);
+			ioc_flags, &vm_obj);
+
+	if (mem && vm_obj) {
+		/* Store memory allocation flags, not ioc flags */
+		pthread_mutex_lock(&aperture->fmm_mutex);
+		vm_obj->flags = flags.Value;
+		vm_obj->node_id = node_id;
+		pthread_mutex_unlock(&aperture->fmm_mutex);
+	}

 	if (flags.ui32.HostAccess) {
 		void *ret = mmap(mem, MemorySizeInBytes,
@@ -856,10 +910,11 @@ static void* fmm_allocate_host_gpu(uint64_t MemorySizeInBytes,
 	return mem;
 }

-void* fmm_allocate_host(uint64_t MemorySizeInBytes, HsaMemFlags flags)
+void* fmm_allocate_host(uint32_t node_id, uint64_t MemorySizeInBytes,
+			HsaMemFlags flags)
 {
 	if (is_dgpu)
-		return fmm_allocate_host_gpu(MemorySizeInBytes, flags);
+		return fmm_allocate_host_gpu(node_id, MemorySizeInBytes, flags);
 	return fmm_allocate_host_cpu(MemorySizeInBytes, flags);
 }

@@ -933,7 +988,7 @@ static void __fmm_release(void *address, manageble_aperture_t *aperture)
 	pthread_mutex_lock(&aperture->fmm_mutex);

 	/* Find the object to retrieve the handle */
-	object = vm_find_object_by_address(aperture, address, 0);
+	object = vm_find_object_by_start_address(aperture, address, 0);
 	if (!object) {
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 		return;
@@ -944,6 +999,7 @@ static void __fmm_release(void *address, manageble_aperture_t *aperture)
 				object->registered_device_id_array) {
 			object->mapped_device_id_array_size = 0;
 			object->mapped_device_id_array = NULL;
+			object->mapped_node_id_array = NULL;
 		}
 		free(object->registered_device_id_array);
 		object->registered_device_id_array_size = 0;
@@ -959,6 +1015,13 @@ static void __fmm_release(void *address, manageble_aperture_t *aperture)
 	if (object->metadata)
 		free(object->metadata);

+	if (object->registered_node_id_array)
+		free(object->registered_node_id_array);
+	object->registered_node_id_array = NULL;
+	if (object->mapped_node_id_array)
+		free(object->mapped_node_id_array);
+	object->mapped_node_id_array = NULL;
+
 	if (address >= dgpu_shared_aperture_base &&
 	    address <= dgpu_shared_aperture_limit) {
 		/* Remove any CPU mapping, but keep the address range reserved */
@@ -1289,7 +1352,7 @@ static int _fmm_map_to_gpu_gtt(manageble_aperture_t *aperture,
 	object = obj;
 	if (!object) {
 		/* Find the object to retrieve the handle */
-		object = vm_find_object_by_address(aperture, address, 0);
+		object = vm_find_object_by_start_address(aperture, address, 0);
 		if (!object)
 			goto err_object_not_found;
 	}
@@ -1345,6 +1408,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageble_aperture_t *apertu
 	bool is_debugger = 0;
 	void *mmap_ret = NULL;
 	uint64_t mmap_offset = 0;
+
 	/* Retrieve gpu_mem id according to gpu_id */
 	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
 	if (gpu_mem_id < 0)
@@ -1363,7 +1427,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageble_aperture_t *apertu
 	if (!ret && !is_debugger) {
 		offset = VOID_PTRS_SUB(address, aperture->base);
 		mem = __fmm_allocate_device(gpu_id, size, aperture, offset,
-				NULL, KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE);
+			NULL, KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE, NULL);
 		if (mem == NULL)
 			return -1;

@@ -1416,7 +1480,7 @@ static int _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t *aperture,
 	pthread_mutex_lock(&aperture->fmm_mutex);

 	/* Find the object to retrieve the handle */
-	object = vm_find_object_by_address(aperture, address, 0);
+	object = vm_find_object_by_start_address(aperture, address, 0);
 	if (!object)
 		goto err_object_not_found;

@@ -1583,7 +1647,7 @@ static int _fmm_unmap_from_gpu(manageble_aperture_t *aperture, void *address,
 	/* Find the object to retrieve the handle */
 	object = obj;
 	if (!object) {
-		object = vm_find_object_by_address(aperture, address, 0);
+		object = vm_find_object_by_start_address(aperture, address, 0);
 		if (!object) {
 			ret = -1;
 			goto err;
@@ -1622,6 +1686,9 @@ static int _fmm_unmap_from_gpu(manageble_aperture_t *aperture, void *address,

 	object->mapped_device_id_array = NULL;
 	object->mapped_device_id_array_size = 0;
+	if (object->mapped_node_id_array)
+		free(object->mapped_node_id_array);
+	object->mapped_node_id_array = NULL;

 	if (!obj)
 		pthread_mutex_unlock(&aperture->fmm_mutex);
@@ -1652,7 +1719,7 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
 	pthread_mutex_lock(&aperture->fmm_mutex);

 	/* Find the object to retrieve the handle and size */
-	object = vm_find_object_by_address(aperture, address, 0);
+	object = vm_find_object_by_start_address(aperture, address, 0);
 	if (!object)
 		goto err;

@@ -1678,6 +1745,9 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,

 	object->mapped_device_id_array = NULL;
 	object->mapped_device_id_array_size = 0;
+	if (object->mapped_node_id_array)
+		free(object->mapped_node_id_array);
+	object->mapped_node_id_array = NULL;

 	pthread_mutex_unlock(&aperture->fmm_mutex);

@@ -1937,7 +2007,7 @@ bool fmm_get_handle(void *address, uint64_t *handle)

 	pthread_mutex_lock(&aperture->fmm_mutex);
 	/* Find the object to retrieve the handle */
-	object = vm_find_object_by_address(aperture, address, 0);
+	object = vm_find_object_by_start_address(aperture, address, 0);
 	if (object && handle) {
 		*handle = object->handle;
 		found = true;
@@ -1974,19 +2044,19 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size, vm_obj

 	/* Allocate BO, userptr address is passed in mmap_offset */
 	svm_addr = __fmm_allocate_device(gpu_id, aligned_size, aperture, 0,
-					 &aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
+			 &aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, &obj);
 	if (svm_addr == NULL)
 		return HSAKMT_STATUS_ERROR;

-	/* Find the object and set its userptr address */
-	pthread_mutex_lock(&aperture->fmm_mutex);
-	obj = vm_find_object_by_address(aperture, svm_addr, aligned_size);
-	if (obj == NULL) {
+	if (obj) {
+		pthread_mutex_lock(&aperture->fmm_mutex);
+		obj->userptr = addr;
+		gpuid_to_nodeid(gpu_id, &obj->node_id);
+		obj->userptr_size = size;
 		pthread_mutex_unlock(&aperture->fmm_mutex);
-		return HSAKMT_STATUS_ERROR;
 	}
-	obj->userptr = addr;
-	pthread_mutex_unlock(&aperture->fmm_mutex);
+	else
+		return HSAKMT_STATUS_ERROR;

 	if (obj_ret)
 		*obj_ret = obj;
@@ -2026,7 +2096,7 @@ HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes,

 	if (!object) {
 		pthread_mutex_lock(&aperture->fmm_mutex);
-		object = vm_find_object_by_address(aperture, address, 0);
+		object = vm_find_object_by_start_address(aperture, address, 0);
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 	}

@@ -2117,6 +2187,7 @@ HSAKMT_STATUS fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle,
 		obj->metadata = metadata;
 		obj->registered_device_id_array = gpu_id_array;
 		obj->registered_device_id_array_size = gpu_id_array_size;
+		gpuid_to_nodeid(infoArgs.gpu_id, &obj->node_id);
 	}
 	pthread_mutex_unlock(&aperture->fmm_mutex);
 	if (!obj)
@@ -2205,7 +2276,7 @@ HSAKMT_STATUS fmm_deregister_memory(void *address)

 	pthread_mutex_lock(&aperture->fmm_mutex);

-	object = vm_find_object_by_address(aperture, address, 0);
+	object = vm_find_object_by_start_address(aperture, address, 0);
 	if (!object) {
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 		return HSAKMT_STATUS_MEMORY_NOT_REGISTERED;
@@ -2230,6 +2301,9 @@ HSAKMT_STATUS fmm_deregister_memory(void *address)
 	free(object->registered_device_id_array);
 	object->registered_device_id_array = NULL;
 	object->registered_device_id_array_size = 0;
+	if (object->registered_node_id_array)
+		free(object->registered_node_id_array);
+	object->registered_node_id_array = NULL;

 	pthread_mutex_unlock(&aperture->fmm_mutex);

@@ -2275,7 +2349,7 @@ HSAKMT_STATUS fmm_map_to_gpu_nodes(void *address, uint64_t size,
 	if (userptr && is_dgpu)
 		object = vm_find_object_by_userptr(aperture, address);
 	else
-		object = vm_find_object_by_address(aperture, address, 0);
+		object = vm_find_object_by_start_address(aperture, address, 0);

 	if (!object) {
 		pthread_mutex_unlock(&aperture->fmm_mutex);
@@ -2364,3 +2438,112 @@ HSAKMT_STATUS fmm_map_to_gpu_nodes(void *address, uint64_t size,

 	return 0;
 }
+
+/* Fill *info with the attributes of the vm_object that contains address.
+ * The lookup is by address range first, then by registered userptr, and only
+ * covers the dGPU apertures (TODO: APU).
+ * info->RegisteredNodes and info->MappedNodes point to node ID arrays that
+ * are cached in the vm_object and freed when the memory is deregistered,
+ * unmapped or released; the caller must not free them.
+ * Returns HSAKMT_STATUS_ERROR when no object matches (info->Type stays
+ * HSA_POINTER_UNKNOWN) or when a node ID array cannot be allocated.
+ */
+HSAKMT_STATUS fmm_get_mem_info(const void *address, HsaPointerInfo *info)
+{
+	HSAKMT_STATUS ret = HSAKMT_STATUS_ERROR;
+	uint32_t i, n;
+	manageble_aperture_t *aperture;
+	vm_object_t *vm_obj;
+
+	memset(info, 0, sizeof(HsaPointerInfo));
+	if (address >= svm.dgpu_aperture.base &&
+		address <= svm.dgpu_aperture.limit)
+		aperture = &svm.dgpu_aperture;
+	else if (address >= svm.dgpu_alt_aperture.base &&
+		address <= svm.dgpu_alt_aperture.limit)
+		aperture = &svm.dgpu_alt_aperture;
+	else
+		/* Not in SVM, it can be system memory registered by userptr */
+		aperture = &svm.dgpu_aperture;
+
+	/* Hold fmm_mutex while the object list is walked and the object used */
+	pthread_mutex_lock(&aperture->fmm_mutex);
+	vm_obj = vm_find_object_by_address(aperture, address);
+	if (!vm_obj)
+		vm_obj = vm_find_object_by_userptr(aperture, address);
+	if (!vm_obj)
+		goto exit; /* memset already left Type as HSA_POINTER_UNKNOWN */
+
+	if (vm_obj->metadata)
+		info->Type = HSA_POINTER_REGISTERED_GRAPHICS;
+	else if (vm_obj->userptr)
+		info->Type = HSA_POINTER_REGISTERED_USER;
+	else
+		info->Type = HSA_POINTER_ALLOCATED;
+	info->Node = vm_obj->node_id;
+	info->GPUAddress = (HSAuint64)vm_obj->start;
+	info->SizeInBytes = vm_obj->size;
+	info->UserData = vm_obj->user_data;
+	if (info->Type == HSA_POINTER_REGISTERED_USER) {
+		info->CPUAddress = vm_obj->userptr;
+		info->SizeInBytes = vm_obj->userptr_size;
+		info->GPUAddress += ((HSAuint64)info->CPUAddress & (PAGE_SIZE-1));
+	} else if (info->Type == HSA_POINTER_ALLOCATED) {
+		info->MemFlags.Value = vm_obj->flags;
+		info->CPUAddress = vm_obj->start;
+	}
+
+	/* Registered nodes: translate gpu_ids to node IDs once, then reuse */
+	n = vm_obj->registered_device_id_array_size / sizeof(uint32_t);
+	if (n && !vm_obj->registered_node_id_array) {
+		vm_obj->registered_node_id_array =
+			malloc(vm_obj->registered_device_id_array_size);
+		if (!vm_obj->registered_node_id_array)
+			goto exit;
+		for (i = 0; i < n; i++)
+			gpuid_to_nodeid(vm_obj->registered_device_id_array[i],
+				&vm_obj->registered_node_id_array[i]);
+	}
+	info->NRegisteredNodes = n;
+	info->RegisteredNodes = vm_obj->registered_node_id_array;
+	/* Mapped nodes: same scheme as the registered nodes above */
+	n = vm_obj->mapped_device_id_array_size / sizeof(uint32_t);
+	if (n && !vm_obj->mapped_node_id_array) {
+		vm_obj->mapped_node_id_array =
+			malloc(vm_obj->mapped_device_id_array_size);
+		if (!vm_obj->mapped_node_id_array)
+			goto exit;
+		for (i = 0; i < n; i++)
+			gpuid_to_nodeid(vm_obj->mapped_device_id_array[i],
+				&vm_obj->mapped_node_id_array[i]);
+	}
+	info->NMappedNodes = n;
+	info->MappedNodes = vm_obj->mapped_node_id_array;
+	ret = HSAKMT_STATUS_SUCCESS;
+exit:
+	pthread_mutex_unlock(&aperture->fmm_mutex);
+	return ret;
+}
+
+/* Attach usr_data to the object starting at mem (allocation or registered
+ * userptr), under fmm_mutex; HSAKMT_STATUS_ERROR if none matches. TODO: APU */
+HSAKMT_STATUS fmm_set_mem_user_data(const void *mem, void *usr_data)
+{
+	HSAKMT_STATUS ret = HSAKMT_STATUS_ERROR;
+	manageble_aperture_t *aperture = &svm.dgpu_aperture;
+	vm_object_t *vm_obj;
+
+	if (mem >= svm.dgpu_alt_aperture.base &&
+		mem <= svm.dgpu_alt_aperture.limit)
+		aperture = &svm.dgpu_alt_aperture;
+	pthread_mutex_lock(&aperture->fmm_mutex);
+	vm_obj = vm_find_object_by_start_address(aperture, mem, 0);
+	if (!vm_obj)
+		vm_obj = vm_find_object_by_userptr(aperture, mem);
+	if (vm_obj) {
+		vm_obj->user_data = usr_data;
+		ret = HSAKMT_STATUS_SUCCESS;
+	}
+	pthread_mutex_unlock(&aperture->fmm_mutex);
+	return ret;
+}
@@ -52,7 +52,8 @@ void fmm_destroy_process_apertures(void);
 */
 void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes);
 void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFlags flags);
-void* fmm_allocate_host(uint64_t MemorySizeInBytes, HsaMemFlags flags);
+void* fmm_allocate_host(uint32_t node_id, uint64_t MemorySizeInBytes,
+			HsaMemFlags flags);
 void* fmm_open_graphic_handle(uint32_t gpu_id,
        int32_t graphic_device_handle,
        uint32_t graphic_handle,
@@ -63,6 +64,8 @@ void fmm_release(void* address);
 int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address);
 int fmm_unmap_from_gpu(void *address);
 bool fmm_get_handle(void *address, uint64_t *handle);
+HSAKMT_STATUS fmm_get_mem_info(const void *address, HsaPointerInfo *info);
+HSAKMT_STATUS fmm_set_mem_user_data(const void *mem, void *usr_data);

 /* Topology interface*/
 HSAKMT_STATUS fmm_node_added(HSAuint32 gpu_id);
@@ -48,6 +48,8 @@ hsaKmtMapGraphicHandle;
 hsaKmtUnmapGraphicHandle;
 hsaKmtSetTrapHandler;
 hsaKmtGetTileConfig;
+hsaKmtQueryPointerInfo;
+hsaKmtSetMemoryUserData;

 local: *;
 };
@@ -133,7 +133,8 @@ hsaKmtAllocMemory(
 	}

 	if (gpu_id == 0 && !MemFlags.ui32.Scratch) {
-		*MemoryAddress = fmm_allocate_host(SizeInBytes, MemFlags);
+		*MemoryAddress = fmm_allocate_host(PreferredNode, SizeInBytes,
+						MemFlags);

 		if (*MemoryAddress == NULL)
 			return HSAKMT_STATUS_ERROR;
@@ -161,7 +162,8 @@ hsaKmtAllocMemory(
 	/* Backwards compatibility hack: Allocate system memory if app
 	 * asks for paged memory from a GPU node. */
 	if (gpu_id && !MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
-		*MemoryAddress = fmm_allocate_host(SizeInBytes, MemFlags);
+		*MemoryAddress = fmm_allocate_host(PreferredNode, SizeInBytes,
+						MemFlags);

 		if (*MemoryAddress == NULL)
 			return HSAKMT_STATUS_ERROR;
@@ -422,3 +424,25 @@ hsaKmtGetTileConfig(

 	return HSAKMT_STATUS_SUCCESS;
 }
+
+/* Public entry point: validate PointerInfo, then query the memory manager */
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtQueryPointerInfo(
+	const void	*Pointer,	/* IN */
+	HsaPointerInfo	*PointerInfo	/* OUT */
+)
+{
+	return PointerInfo ? fmm_get_mem_info(Pointer, PointerInfo)
+			   : HSAKMT_STATUS_INVALID_PARAMETER;
+}
+
+/* Public entry point: tag the memory starting at Pointer with UserData */
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSetMemoryUserData(const void *Pointer /* IN */, void *UserData /* IN */)
+{
+	HSAKMT_STATUS status = fmm_set_mem_user_data(Pointer, UserData);
+
+	return status;
+}