diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c index a90fb957e9..6e3d8a1ece 100644 --- a/projects/rocr-runtime/src/fmm.c +++ b/projects/rocr-runtime/src/fmm.c @@ -45,8 +45,8 @@ #define INIT_GPUs_MEM {[0 ... (NUM_OF_SUPPORTED_GPUS-1)] = INIT_GPU_MEM} struct vm_object{ void* start; - HSAuint64 size; - HSAuint64 handle; // opaque + uint64_t size; + uint64_t handle; // opaque struct vm_object* next; struct vm_object* prev; }; @@ -74,7 +74,7 @@ typedef struct { } aperture_t; typedef struct{ - HSAuint32 gpu_id; + uint32_t gpu_id; aperture_t lds_aperture; manageble_aperture_t scratch_aperture; manageble_aperture_t gpuvm_aperture; @@ -183,7 +183,7 @@ static vm_object_t* vm_find_object_by_address(manageble_aperture_t* app, void* a // Look up the appropriate address range containing the given address while(cur){ - if(cur->start == address && cur->size == size) + if(cur->start == address && (cur->size == size || size == 0)) break; cur = cur->next; }; @@ -213,105 +213,96 @@ static bool aperture_is_valid(void* app_base, void* app_limit){ /* * Assumes that fmm_mutex is locked on entry. */ -static int aperture_release(manageble_aperture_t* app, void* address, uint64_t MemorySizeInBytes){ - int rc = -1; +static void aperture_release_area(manageble_aperture_t* app, void* address, uint64_t MemorySizeInBytes){ vm_area_t* area; area = vm_find(app, address); - vm_object_t* object = vm_find_object_by_address(app, address, MemorySizeInBytes); - if (object && area){ - vm_remove_object(app, object); - if (VOID_PTRS_SUB(area->end, area->start) + 1 > MemorySizeInBytes){ // the size of the released block is less than the size of area - if (area->start == address){ // shrink from the start + if(area) { + if(VOID_PTRS_SUB(area->end, area->start) + 1 > MemorySizeInBytes) { // the size of the released block is less than the size of area + if(area->start == address) { // shrink from the start area->start = VOID_PTR_ADD(area->start,MemorySizeInBytes); - } else if (VOID_PTRS_SUB(area->end, address) + 1 == MemorySizeInBytes){ // shrink from the end + } else if(VOID_PTRS_SUB(area->end, address) + 1 == MemorySizeInBytes) { // shrink from the end area->end = VOID_PTR_SUB(area->end, MemorySizeInBytes); } else { // split the area vm_split_area(app, area, address, MemorySizeInBytes); } - rc = 0; - } else if (VOID_PTRS_SUB(area->end, area->start) + 1 == MemorySizeInBytes){ // the size of the released block is exactly the same as the size of area + } else if(VOID_PTRS_SUB(area->end, area->start) + 1 == MemorySizeInBytes) { // the size of the released block is exactly the same as the size of area vm_remove_area(app, area); - rc = 0; - } else { - //Inconsistent data. Fail it? - rc = -1; } } - - return rc; } /* * returns allocated address or NULL. Assumes, that fmm_mutex is locked on entry. */ -static void* aperture_allocate(manageble_aperture_t* app, uint64_t MemorySizeInBytes){ +static void* aperture_allocate_area(manageble_aperture_t* app, uint64_t MemorySizeInBytes, uint64_t offset){ vm_area_t* cur, *next, *new_area, *start; - vm_object_t* new_object; void* new_address = NULL; next = NULL; new_area = NULL; cur = app->vm_ranges; - if (cur){ // not empty + if(cur) { // not empty // Look up the appropriate address space "hole" or end of the list - while(cur){ + while (cur) { next = cur->next; // End of the list reached - if (!next) + if(!next) break; // address space "hole" - if ((VOID_PTRS_SUB(next->start,cur->end) >= MemorySizeInBytes)) + if((VOID_PTRS_SUB(next->start,cur->end) >= MemorySizeInBytes)) break; cur = next; }; // If the new range is inside the reserved aperture - if (VOID_PTRS_SUB(app->limit, cur->end) + 1 >= MemorySizeInBytes){ + if(VOID_PTRS_SUB(app->limit, cur->end) + 1 >= MemorySizeInBytes) { // cur points to the last inspected element: the tail of the list or the found "hole" // Just extend the existing region new_address = VOID_PTR_ADD(cur->end, 1); cur->end = VOID_PTR_ADD(cur->end, MemorySizeInBytes); - } else - new_address = NULL; + } else new_address = NULL; } else { // empty - create the first area - start = (void*)app->base; + start = VOID_PTR_ADD(app->base, offset); // Some offset from the base new_area = vm_create_and_init_area(start, VOID_PTR_ADD(start, (MemorySizeInBytes - 1))); - if (new_area){ + if(new_area) { app->vm_ranges = new_area; new_address = new_area->start; } } - // Allocate new object - if (new_address){ - new_object = vm_create_and_init_object(new_address, MemorySizeInBytes, 0); - if (new_object){ - if (app->vm_objects == NULL){ // empty list - // Update head - app->vm_objects = new_object; - } else { - // Add it before the first element - vm_add_object_before(app->vm_objects, new_object); - // Update head - app->vm_objects = new_object; - } - } else{ - // Failed to allocate object: remove just allocated range and return NULL - aperture_release(app, new_address, MemorySizeInBytes); - new_address = NULL; - } - } - return new_address; } +/* + * returns 0 on success. Assumes, that fmm_mutex is locked on entry. + */ +static int aperture_allocate_object(manageble_aperture_t* app, void* new_address, uint64_t handle, uint64_t MemorySizeInBytes){ + vm_object_t* new_object; + + // Allocate new object + new_object = vm_create_and_init_object(new_address, MemorySizeInBytes, handle); + if(!new_object) + return -1; + + if(app->vm_objects == NULL ) { // empty list + // Update head + app->vm_objects = new_object; + } else { + // Add it before the first element + vm_add_object_before(app->vm_objects, new_object); + // Update head + app->vm_objects = new_object; + } + + return 0; +} static int32_t gpu_mem_find_by_gpu_id(uint32_t gpu_id){ @@ -387,21 +378,12 @@ void fmm_print(uint32_t gpu_id){ void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes){ - - void* mem = NULL; - int32_t i = gpu_mem_find_by_gpu_id(gpu_id); - - // If not found or aperture isn't properly initialized/supported - if(i < 0 || !aperture_is_valid(gpu_mem[i].scratch_aperture.base, gpu_mem[i].scratch_aperture.limit)) - return NULL; - - pthread_mutex_lock(&gpu_mem[i].scratch_aperture.fmm_mutex); - mem = aperture_allocate(&gpu_mem[i].scratch_aperture, MemorySizeInBytes); - pthread_mutex_unlock(&gpu_mem[i].scratch_aperture.fmm_mutex); - - return mem; + // Not supported yet + return NULL; } +// The offset from GPUVM aperture base address to ensure that address 0 (after base subtraction) won't be used +#define GPUVM_APP_OFFSET 0x10000 void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes){ void* mem = NULL; @@ -409,37 +391,88 @@ void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes){ // If not found or aperture isn't properly initialized/supported if(i < 0 || !aperture_is_valid(gpu_mem[i].gpuvm_aperture.base, gpu_mem[i].gpuvm_aperture.limit)) - return NULL; + return NULL ; + // Allocate address space pthread_mutex_lock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); - mem = aperture_allocate(&gpu_mem[i].gpuvm_aperture, MemorySizeInBytes); - pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); + mem = aperture_allocate_area(&gpu_mem[i].gpuvm_aperture, MemorySizeInBytes, GPUVM_APP_OFFSET); + pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); return mem; } +void* fmm_open_graphic_handle(uint32_t gpu_id, + int32_t graphic_device_handle, + uint32_t graphic_handle, + uint64_t MemorySizeInBytes){ -int fmm_release(void* address, uint64_t MemorySizeInBytes){ + void* mem = NULL; + int32_t i = gpu_mem_find_by_gpu_id(gpu_id); + struct kfd_ioctl_open_graphic_handle_args open_graphic_handle_args; + struct kfd_ioctl_unmap_memory_from_gpu_args unmap_args; + + // If not found or aperture isn't properly initialized/supported + if (i < 0 || !aperture_is_valid(gpu_mem[i].gpuvm_aperture.base, gpu_mem[i].gpuvm_aperture.limit)) + return NULL; + + pthread_mutex_lock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); + // Allocate address space + mem = aperture_allocate_area(&gpu_mem[i].gpuvm_aperture, MemorySizeInBytes, GPUVM_APP_OFFSET); + if (!mem) + goto out; + + // Allocate local memory + open_graphic_handle_args.gpu_id = gpu_id; + open_graphic_handle_args.graphic_device_fd = graphic_device_handle; + open_graphic_handle_args.graphic_handle = graphic_handle; + open_graphic_handle_args.va_addr = VOID_PTRS_SUB(mem, gpu_mem[i].gpuvm_aperture.base); + if (kmtIoctl(kfd_fd, AMDKFD_IOC_OPEN_GRAPHIC_HANDLE, &open_graphic_handle_args)) + goto release_area; + + // Allocate object + if (aperture_allocate_object(&gpu_mem[i].gpuvm_aperture, mem, open_graphic_handle_args.handle, MemorySizeInBytes)) + goto release_mem; + + pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); + + // That's all. Just return the new address + return mem; + +release_mem: + unmap_args.handle = open_graphic_handle_args.handle; + kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &unmap_args); +release_area: + aperture_release_area(&gpu_mem[i].gpuvm_aperture, mem, MemorySizeInBytes); +out: + pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); + + return NULL ; +} + + +void fmm_release(void* address, uint64_t MemorySizeInBytes){ uint32_t i; - int32_t rc = -1; + bool found = false; - for(i = 0; i < NUM_OF_SUPPORTED_GPUS; i++){ + for (i = 0; i < NUM_OF_SUPPORTED_GPUS && !found; i++) { if(gpu_mem[i].gpu_id == NON_VALID_GPU_ID) continue; - if (address >= gpu_mem[i].gpuvm_aperture.base && address <= gpu_mem[i].gpuvm_aperture.limit){ - pthread_mutex_lock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); - rc = aperture_release(&gpu_mem[i].gpuvm_aperture, address, MemorySizeInBytes); - pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); + if(address >= gpu_mem[i].gpuvm_aperture.base && address <= gpu_mem[i].gpuvm_aperture.limit) { + found = true; + pthread_mutex_lock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); + aperture_release_area(&gpu_mem[i].gpuvm_aperture, address, MemorySizeInBytes); + pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); fmm_print(gpu_mem[i].gpu_id); - } else if (address >= gpu_mem[i].scratch_aperture.base && address <= gpu_mem[i].scratch_aperture.limit) - pthread_mutex_lock(&gpu_mem[i].scratch_aperture.fmm_mutex); - rc = aperture_release(&gpu_mem[i].scratch_aperture, address, MemorySizeInBytes); - pthread_mutex_unlock(&gpu_mem[i].scratch_aperture.fmm_mutex); + + } } - return rc; + // If memory address isn't inside of any defined aperture - it refers to the system memory + if (!found) { + free(address); + } } HSAKMT_STATUS fmm_init_process_apertures(){ @@ -484,3 +517,106 @@ HSAuint64 fmm_get_aperture_base(aperture_type_e aperture_type, HSAuint32 gpu_id) } } + + +static bool _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t* aperture, void* address, uint64_t size, uint64_t* gpuvm_address) { + + struct kfd_ioctl_map_memory_to_gpu_args args; + struct kfd_ioctl_unmap_memory_from_gpu_args unmap_args; + + // Check that address space was previously reserved + if (vm_find(aperture, address) == NULL) + return false; + + // Allocate local memory + args.gpu_id = gpu_id; + args.size = size; + args.va_addr = VOID_PTRS_SUB(address, aperture->base); //va_addr is 40 bit GPUVM address + if(kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args)) + return false; + + // Allocate object + pthread_mutex_lock(&aperture->fmm_mutex); + if (aperture_allocate_object(aperture, address, args.handle, size)) + goto err_object_allocation_failed; + pthread_mutex_unlock(&aperture->fmm_mutex); + + *gpuvm_address = args.va_addr; + + return true; + +err_object_allocation_failed: + pthread_mutex_unlock(&aperture->fmm_mutex); + unmap_args.handle = args.handle; + kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &unmap_args); + + *gpuvm_address = 0; + return false; +} + +bool fmm_map_to_gpu(void* address, uint64_t size, uint64_t* gpuvm_address) { + + int32_t i; + uint64_t pi; + + // Find an aperture the requested address belongs to + for(i = 0; i < NUM_OF_SUPPORTED_GPUS; i++){ + if(gpu_mem[i].gpu_id != NON_VALID_GPU_ID){ + if ((address>= gpu_mem[i].gpuvm_aperture.base) && (address<= gpu_mem[i].gpuvm_aperture.limit)) { + // map it + return _fmm_map_to_gpu(gpu_mem[i].gpu_id, &gpu_mem[i].gpuvm_aperture, address, size, gpuvm_address); + } + } + } + + // If address isn't Local memory address, we assume that this is + // system memory address accessed through IOMMU. + // Thus we "prefetch" it + for(pi = 0; pi < size / PAGE_SIZE; pi++) { + ((char*)address)[pi*PAGE_SIZE] = 0; + } + return true; +} + +static bool _fmm_unmap_from_gpu(manageble_aperture_t* aperture, void* address) { + + vm_object_t* object; + struct kfd_ioctl_unmap_memory_from_gpu_args args; + + pthread_mutex_lock(&aperture->fmm_mutex); + + // Find the object to retrieve the handle + object = vm_find_object_by_address(aperture, address, 0); + if (!object) + goto err; + + args.handle = object->handle; + kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args); + + vm_remove_object(aperture, object); + + pthread_mutex_unlock(&aperture->fmm_mutex); + return true; + +err: + pthread_mutex_unlock(&aperture->fmm_mutex); + return false; + +} + +bool fmm_unmap_from_gpu(void* address) { + + int32_t i; + + // Find the aperture the requested address belongs to + for(i = 0; i < NUM_OF_SUPPORTED_GPUS; i++){ + if(gpu_mem[i].gpu_id != NON_VALID_GPU_ID){ + if ((address>= gpu_mem[i].gpuvm_aperture.base) && (address<= gpu_mem[i].gpuvm_aperture.limit)) { + // unmap it + return _fmm_unmap_from_gpu(&gpu_mem[i].gpuvm_aperture, address); + } + } + } + + return true; +} diff --git a/projects/rocr-runtime/src/fmm.h b/projects/rocr-runtime/src/fmm.h index 5924247645..ca0a692ead 100644 --- a/projects/rocr-runtime/src/fmm.h +++ b/projects/rocr-runtime/src/fmm.h @@ -49,9 +49,15 @@ HSAKMT_STATUS fmm_init_process_apertures(void); */ void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes); void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes); +void* fmm_open_graphic_handle(uint32_t gpu_id, + int32_t graphic_device_handle, + uint32_t graphic_handle, + uint64_t MemorySizeInBytes); void fmm_print(uint32_t node); bool fmm_is_inside_some_aperture(void* address); -int fmm_release(void* address, HSAuint64 MemorySizeInBytes); +void fmm_release(void* address, HSAuint64 MemorySizeInBytes); +bool fmm_map_to_gpu(void* address, uint64_t size, uint64_t* gpuvm_address); +bool fmm_unmap_from_gpu(void* address); /* Topology interface*/ HSAKMT_STATUS fmm_node_added(HSAuint32 gpu_id); diff --git a/projects/rocr-runtime/src/memory.c b/projects/rocr-runtime/src/memory.c index d65730a336..9c8e63f55b 100644 --- a/projects/rocr-runtime/src/memory.c +++ b/projects/rocr-runtime/src/memory.c @@ -115,6 +115,13 @@ hsaKmtAllocMemory( else return HSAKMT_STATUS_NO_MEMORY; } + else if(!MemFlags.ui32.HostAccess && MemFlags.ui32.NonPaged){ + *MemoryAddress = fmm_allocate_device(gpu_id, SizeInBytes); + if (*MemoryAddress) + return HSAKMT_STATUS_SUCCESS; + else + return HSAKMT_STATUS_NO_MEMORY; + } else return HSAKMT_STATUS_INVALID_PARAMETER; @@ -127,17 +134,10 @@ hsaKmtFreeMemory( HSAuint64 SizeInBytes //IN ) { - HSAKMT_STATUS hsa_status = HSAKMT_STATUS_SUCCESS; CHECK_KFD_OPEN(); - if (fmm_is_inside_some_aperture(MemoryAddress)){ - if (fmm_release( MemoryAddress, SizeInBytes)) - hsa_status = HSAKMT_STATUS_INVALID_PARAMETER; - } - else - free(MemoryAddress); - - return hsa_status; + fmm_release( MemoryAddress, SizeInBytes); + return HSAKMT_STATUS_SUCCESS; } HSAKMT_STATUS @@ -173,13 +173,16 @@ hsaKmtMapMemoryToGPU( { CHECK_KFD_OPEN(); - // We don't support GPUVM in the stub, there should never be a request for a GPUVA. if (AlternateVAGPU) - { *AlternateVAGPU = 0; + + if (fmm_map_to_gpu(MemoryAddress, MemorySizeInBytes, AlternateVAGPU)){ + return HSAKMT_STATUS_SUCCESS; + } + else { + return HSAKMT_STATUS_NOT_IMPLEMENTED; } - return HSAKMT_STATUS_SUCCESS; } HSAKMT_STATUS @@ -189,6 +192,56 @@ hsaKmtUnmapMemoryToGPU( ) { CHECK_KFD_OPEN(); + if (fmm_unmap_from_gpu(MemoryAddress)) + return HSAKMT_STATUS_SUCCESS; + else + return HSAKMT_STATUS_NOT_IMPLEMENTED; - return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtMapGraphicHandle( + HSAuint32 NodeId, //IN + HSAuint64 GraphicDeviceHandle, //IN + HSAuint64 GraphicResourceHandle, //IN + HSAuint64 GraphicResourceOffset, //IN + HSAuint64 GraphicResourceSize, //IN + HSAuint64* FlatMemoryAddress //OUT + ) +{ + CHECK_KFD_OPEN(); + HSAKMT_STATUS result; + uint32_t gpu_id; + void *graphic_handle; + + if (GraphicResourceOffset != 0) + return HSAKMT_STATUS_NOT_IMPLEMENTED; + + result = validate_nodeid(NodeId, &gpu_id); + if (result != HSAKMT_STATUS_SUCCESS) + return result; + + graphic_handle = fmm_open_graphic_handle(gpu_id, + GraphicDeviceHandle, + GraphicResourceHandle, + GraphicResourceSize); + + *FlatMemoryAddress = PORT_VPTR_TO_UINT64(graphic_handle); + + if (*FlatMemoryAddress) + return HSAKMT_STATUS_SUCCESS; + else + return HSAKMT_STATUS_NO_MEMORY; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtUnmapGraphicHandle( + HSAuint32 NodeId, //IN + HSAuint64 FlatMemoryAddress, //IN + HSAuint64 SizeInBytes //IN + ) +{ + return hsaKmtUnmapMemoryToGPU(PORT_UINT64_TO_VPTR(FlatMemoryAddress)); }