diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c index 6e3d8a1ece..be67d5d41a 100644 --- a/projects/rocr-runtime/src/fmm.c +++ b/projects/rocr-runtime/src/fmm.c @@ -33,58 +33,73 @@ #define NON_VALID_GPU_ID 0 #define ARRAY_LEN(array) (sizeof(array) / sizeof(array[0])) -#define INIT_APERTURE(base_value, limit_value) {.base = (void*)base_value, .limit = (void*)limit_value } -#define INIT_MANAGEBLE_APERTURE(base_value, limit_value) {.base = (void*)base_value,.limit = (void*)limit_value, .vm_ranges = NULL, .vm_objects = NULL, .fmm_mutex = PTHREAD_MUTEX_INITIALIZER} -#define INIT_GPU_MEM \ -{ .gpu_id = NON_VALID_GPU_ID,\ - .lds_aperture = INIT_APERTURE(0, 0), \ - .scratch_aperture = INIT_MANAGEBLE_APERTURE(0, 0),\ - .gpuvm_aperture = INIT_MANAGEBLE_APERTURE(0, 0)\ + +#define INIT_APERTURE(base_value, limit_value) { \ + .base = (void *) base_value, \ + .limit = (void *) limit_value \ + } + +#define INIT_MANAGEBLE_APERTURE(base_value, limit_value) { \ + .base = (void *) base_value, \ + .limit = (void *) limit_value, \ + .vm_ranges = NULL, \ + .vm_objects = NULL, \ + .fmm_mutex = PTHREAD_MUTEX_INITIALIZER \ + } + +#define INIT_GPU_MEM { \ + .gpu_id = NON_VALID_GPU_ID, \ + .lds_aperture = INIT_APERTURE(0, 0), \ + .scratch_aperture = INIT_MANAGEBLE_APERTURE(0, 0), \ + .gpuvm_aperture = INIT_MANAGEBLE_APERTURE(0, 0) \ } #define INIT_GPUs_MEM {[0 ... (NUM_OF_SUPPORTED_GPUS-1)] = INIT_GPU_MEM} -struct vm_object{ - void* start; + +struct vm_object { + void *start; uint64_t size; - uint64_t handle; // opaque - struct vm_object* next; - struct vm_object* prev; + uint64_t handle; /* opaque */ + struct vm_object *next; + struct vm_object *prev; }; typedef struct vm_object vm_object_t; -struct vm_area{ - void* start; - void* end; - struct vm_area* next; - struct vm_area* prev; +struct vm_area { + void *start; + void *end; + struct vm_area *next; + struct vm_area *prev; }; typedef struct vm_area vm_area_t; typedef struct { - void* base; - void* limit; - vm_area_t* vm_ranges; - vm_object_t* vm_objects; + void *base; + void *limit; + vm_area_t *vm_ranges; + vm_object_t *vm_objects; pthread_mutex_t fmm_mutex; } manageble_aperture_t; typedef struct { - void* base; - void* limit; + void *base; + void *limit; } aperture_t; -typedef struct{ +typedef struct { uint32_t gpu_id; aperture_t lds_aperture; manageble_aperture_t scratch_aperture; manageble_aperture_t gpuvm_aperture; -}gpu_mem_t; +} gpu_mem_t; static gpu_mem_t gpu_mem[] = INIT_GPUs_MEM; -static vm_area_t* vm_create_and_init_area(void* start, void* end){ - vm_area_t* area = (vm_area_t*)malloc(sizeof(vm_area_t));// TODO: Memory pool ??? - if (area){ +static vm_area_t *vm_create_and_init_area(void *start, void *end) +{ + vm_area_t *area = (vm_area_t *) malloc(sizeof(vm_area_t)); + + if (area) { area->start = start; area->end = end; area->next = area->prev = NULL; @@ -93,9 +108,12 @@ static vm_area_t* vm_create_and_init_area(void* start, void* end){ return area; } -static vm_object_t* vm_create_and_init_object(void* start, uint64_t size, uint64_t handle){ - vm_object_t* object = (vm_object_t*)malloc(sizeof(vm_object_t)); // TODO: Memory pool ??? - if (object){ +static vm_object_t *vm_create_and_init_object(void *start, uint64_t size, + uint64_t handle) +{ + vm_object_t *object = (vm_object_t *) malloc(sizeof(vm_object_t)); + + if (object) { object->start = start; object->size = size; object->handle = handle; @@ -106,48 +124,49 @@ static vm_object_t* vm_create_and_init_object(void* start, uint64_t size, uint64 } -static void vm_remove_area(manageble_aperture_t* app, vm_area_t* area){ - vm_area_t* next; - vm_area_t* prev; +static void vm_remove_area(manageble_aperture_t *app, vm_area_t *area) +{ + vm_area_t *next; + vm_area_t *prev; next = area->next; prev = area->prev; - if (prev == NULL )// The first element + if (prev == NULL) /* The first element */ app->vm_ranges = next; else prev->next = next; - if(next) // If not the last element + if (next) /* If not the last element */ next->prev = prev; free(area); - } -static void vm_remove_object(manageble_aperture_t* app, vm_object_t* object){ - vm_object_t* next; - vm_object_t* prev; +static void vm_remove_object(manageble_aperture_t *app, vm_object_t *object) +{ + vm_object_t *next; + vm_object_t *prev; next = object->next; prev = object->prev; - if (prev == NULL )// The first element + if (prev == NULL) /* The first element */ app->vm_objects = next; else prev->next = next; - if(next) // If not the last element + if (next) /* If not the last element */ next->prev = prev; free(object); } +static void vm_add_area_after(vm_area_t *after_this, vm_area_t *new_area) +{ + vm_area_t *next = after_this->next; - -static void vm_add_area_after(vm_area_t* after_this, vm_area_t* new_area){ - vm_area_t* next = after_this->next; after_this->next = new_area; new_area->next = next; @@ -156,8 +175,11 @@ static void vm_add_area_after(vm_area_t* after_this, vm_area_t* new_area){ next->prev = new_area; } -static void vm_add_object_before(vm_object_t* before_this, vm_object_t* new_object){ - vm_object_t* prev = before_this->prev; +static void vm_add_object_before(vm_object_t *before_this, + vm_object_t *new_object) +{ + vm_object_t *prev = before_this->prev; + before_this->prev = new_object; new_object->next = before_this; @@ -166,45 +188,54 @@ static void vm_add_object_before(vm_object_t* before_this, vm_object_t* new_obje prev->next = new_object; } -static void vm_split_area(manageble_aperture_t* app, vm_area_t* area, void* address, uint64_t MemorySizeInBytes){ +static void vm_split_area(manageble_aperture_t *app, vm_area_t *area, + void *address, uint64_t MemorySizeInBytes) +{ + /* + * The existing area is split to: [area->start, address - 1] + * and [address + MemorySizeInBytes, area->end] + */ + vm_area_t *new_area = vm_create_and_init_area( + VOID_PTR_ADD(address, MemorySizeInBytes), + area->end); - // The existing area is split to: [area->start, address - 1] and [address + MemorySizeInBytes, area->end] - vm_area_t* new_area = vm_create_and_init_area(VOID_PTR_ADD(address,MemorySizeInBytes), area->end); - - // Shrink the existing area - area->end = VOID_PTR_SUB(address,1); + /* Shrink the existing area */ + area->end = VOID_PTR_SUB(address, 1); vm_add_area_after(area, new_area); - } -static vm_object_t* vm_find_object_by_address(manageble_aperture_t* app, void* address, uint64_t size){ - vm_object_t* cur = app->vm_objects; +static vm_object_t *vm_find_object_by_address(manageble_aperture_t *app, + void *address, uint64_t size) +{ + vm_object_t *cur = app->vm_objects; - // Look up the appropriate address range containing the given address - while(cur){ - if(cur->start == address && (cur->size == size || size == 0)) + /* Look up the appropriate address range containing the given address */ + while (cur) { + if (cur->start == address && (cur->size == size || size == 0)) break; cur = cur->next; }; - return cur; // NULL if not found + return cur; /* NULL if not found */ } -static vm_area_t* vm_find(manageble_aperture_t* app, void* address){ - vm_area_t* cur = app->vm_ranges; +static vm_area_t *vm_find(manageble_aperture_t *app, void *address) +{ + vm_area_t *cur = app->vm_ranges; - // Look up the appropriate address range containing the given address - while(cur){ - if(cur->start <= address && cur->end >= address) + /* Look up the appropriate address range containing the given address */ + while (cur) { + if (cur->start <= address && cur->end >= address) break; cur = cur->next; }; - return cur; // NULL if not found + return cur; /* NULL if not found */ } -static bool aperture_is_valid(void* app_base, void* app_limit){ +static bool aperture_is_valid(void *app_base, void *app_limit) +{ if (app_base && app_limit && app_base < app_limit) return true; return false; @@ -213,329 +244,405 @@ static bool aperture_is_valid(void* app_base, void* app_limit){ /* * Assumes that fmm_mutex is locked on entry. */ -static void aperture_release_area(manageble_aperture_t* app, void* address, uint64_t MemorySizeInBytes){ - vm_area_t* area; +static void aperture_release_area(manageble_aperture_t *app, void *address, + uint64_t MemorySizeInBytes) +{ + vm_area_t *area; + uint64_t SizeOfRegion; area = vm_find(app, address); - if(area) { - if(VOID_PTRS_SUB(area->end, area->start) + 1 > MemorySizeInBytes) { // the size of the released block is less than the size of area - if(area->start == address) { // shrink from the start - area->start = VOID_PTR_ADD(area->start,MemorySizeInBytes); - } else if(VOID_PTRS_SUB(area->end, address) + 1 == MemorySizeInBytes) { // shrink from the end - area->end = VOID_PTR_SUB(area->end, MemorySizeInBytes); - } else { // split the area - vm_split_area(app, area, address, MemorySizeInBytes); - } - } else if(VOID_PTRS_SUB(area->end, area->start) + 1 == MemorySizeInBytes) { // the size of the released block is exactly the same as the size of area - vm_remove_area(app, area); - } + if (!area) + return; + + SizeOfRegion = VOID_PTRS_SUB(area->end, area->start) + 1; + + /* check if block is whole region or part of it */ + if (SizeOfRegion == MemorySizeInBytes) { + vm_remove_area(app, area); + } else if (SizeOfRegion > MemorySizeInBytes) { + /* shrink from the start */ + if (area->start == address) + area->start = + VOID_PTR_ADD(area->start, MemorySizeInBytes); + /* shrink from the end */ + else if (VOID_PTRS_SUB(area->end, address) + 1 == + MemorySizeInBytes) + area->end = VOID_PTR_SUB(area->end, MemorySizeInBytes); + /* split the area */ + else + vm_split_area(app, area, address, MemorySizeInBytes); } } /* - * returns allocated address or NULL. Assumes, that fmm_mutex is locked on entry. + * returns allocated address or NULL. Assumes, that fmm_mutex is locked + * on entry. */ -static void* aperture_allocate_area(manageble_aperture_t* app, uint64_t MemorySizeInBytes, uint64_t offset){ - vm_area_t* cur, *next, *new_area, *start; - void* new_address = NULL; +static void *aperture_allocate_area(manageble_aperture_t *app, + uint64_t MemorySizeInBytes, + uint64_t offset) +{ + vm_area_t *cur, *next, *new_area, *start; + void *new_address = NULL; + next = NULL; new_area = NULL; cur = app->vm_ranges; - if(cur) { // not empty - - // Look up the appropriate address space "hole" or end of the list + if (cur) { /* not empty */ + /* + * Look up the appropriate address space "hole" or end of + * the list + */ while (cur) { next = cur->next; - // End of the list reached - if(!next) + /* End of the list reached */ + if (!next) break; - // address space "hole" - if((VOID_PTRS_SUB(next->start,cur->end) >= MemorySizeInBytes)) + /* address space "hole" */ + if ((VOID_PTRS_SUB(next->start, cur->end) >= + MemorySizeInBytes)) break; cur = next; }; - // If the new range is inside the reserved aperture - if(VOID_PTRS_SUB(app->limit, cur->end) + 1 >= MemorySizeInBytes) { - // cur points to the last inspected element: the tail of the list or the found "hole" - // Just extend the existing region + /* If the new range is inside the reserved aperture */ + if (VOID_PTRS_SUB(app->limit, cur->end) + 1 >= + MemorySizeInBytes) { + /* + * cur points to the last inspected element: the tail + * of the list or the found "hole". + * Just extend the existing region + */ new_address = VOID_PTR_ADD(cur->end, 1); cur->end = VOID_PTR_ADD(cur->end, MemorySizeInBytes); - } else new_address = NULL; - - } else { // empty - create the first area - start = VOID_PTR_ADD(app->base, offset); // Some offset from the base - new_area = vm_create_and_init_area(start, VOID_PTR_ADD(start, (MemorySizeInBytes - 1))); - if(new_area) { + } else { + new_address = NULL; + } + } else { /* empty - create the first area */ + /* Some offset from the base */ + start = VOID_PTR_ADD(app->base, offset); + new_area = vm_create_and_init_area(start, + VOID_PTR_ADD(start, (MemorySizeInBytes - 1))); + if (new_area) { app->vm_ranges = new_area; new_address = new_area->start; } } return new_address; - } -/* - * returns 0 on success. Assumes, that fmm_mutex is locked on entry. - */ -static int aperture_allocate_object(manageble_aperture_t* app, void* new_address, uint64_t handle, uint64_t MemorySizeInBytes){ - vm_object_t* new_object; +/* returns 0 on success. Assumes, that fmm_mutex is locked on entry */ +static int aperture_allocate_object(manageble_aperture_t *app, + void *new_address, + uint64_t handle, + uint64_t MemorySizeInBytes) +{ + vm_object_t *new_object; - // Allocate new object - new_object = vm_create_and_init_object(new_address, MemorySizeInBytes, handle); - if(!new_object) + /* Allocate new object */ + new_object = vm_create_and_init_object(new_address, + MemorySizeInBytes, + handle); + if (!new_object) return -1; - if(app->vm_objects == NULL ) { // empty list - // Update head - app->vm_objects = new_object; - } else { - // Add it before the first element + /* check for non-empty list */ + if (app->vm_objects != NULL) + /* Add it before the first element */ vm_add_object_before(app->vm_objects, new_object); - // Update head - app->vm_objects = new_object; - } + + app->vm_objects = new_object; /* Update head */ return 0; } - -static int32_t gpu_mem_find_by_gpu_id(uint32_t gpu_id){ +static int32_t gpu_mem_find_by_gpu_id(uint32_t gpu_id) +{ int32_t i; - for(i = 0; i < NUM_OF_SUPPORTED_GPUS; i++){ - if(gpu_mem[i].gpu_id == gpu_id) + for (i = 0 ; i < NUM_OF_SUPPORTED_GPUS ; i++) + if (gpu_mem[i].gpu_id == gpu_id) return i; - } return -1; } -bool fmm_is_inside_some_aperture(void* address){ - +bool fmm_is_inside_some_aperture(void *address) +{ int32_t i; - for(i = 0; i < NUM_OF_SUPPORTED_GPUS; i++){ - if(gpu_mem[i].gpu_id != NON_VALID_GPU_ID){ - if ((address>= gpu_mem[i].lds_aperture.base) && (address<= gpu_mem[i].lds_aperture.limit)) - return true; - if ((address>= gpu_mem[i].gpuvm_aperture.base) && (address<= gpu_mem[i].gpuvm_aperture.limit)) - return true; - if ((address>= gpu_mem[i].scratch_aperture.base) && (address<= gpu_mem[i].scratch_aperture.limit)) - return true; - } + for (i = 0 ; i < NUM_OF_SUPPORTED_GPUS ; i++) { + if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID) + continue; + if ((address >= gpu_mem[i].lds_aperture.base) && + (address <= gpu_mem[i].lds_aperture.limit)) + return true; + if ((address >= gpu_mem[i].gpuvm_aperture.base) && + (address <= gpu_mem[i].gpuvm_aperture.limit)) + return true; + if ((address >= gpu_mem[i].scratch_aperture.base) && + (address <= gpu_mem[i].scratch_aperture.limit)) + return true; } return false; } #ifdef DEBUG_PRINT_APERTURE -static void aperture_print(aperture_t* app){ +static void aperture_print(aperture_t *app) +{ printf("\t Base: %p\n", app->base); printf("\t Limit: %p\n", app->limit); } -static void manageble_aperture_print(manageble_aperture_t* app){ - vm_area_t* cur = app->vm_ranges; +static void manageble_aperture_print(manageble_aperture_t *app) +{ + vm_area_t *cur = app->vm_ranges; vm_object_t *object = app->vm_objects; printf("\t Base: %p\n", app->base); printf("\t Limit: %p\n", app->limit); - printf("\t Ranges: \n"); - while(cur){ - printf("\t\t Range [%p - %p] \n", cur->start, cur->end); + printf("\t Ranges:\n"); + while (cur) { + printf("\t\t Range [%p - %p]\n", cur->start, cur->end); cur = cur->next; }; - printf("\t Objects: \n"); - while(object){ - printf("\t\t Object [%p - %" PRIu64 "] \n", object->start, object->size); + printf("\t Objects:\n"); + while (object) { + printf("\t\t Object [%p - %" PRIu64 "]\n", + object->start, object->size); object = object->next; }; } -void fmm_print(uint32_t gpu_id){ +void fmm_print(uint32_t gpu_id) +{ int32_t i = gpu_mem_find_by_gpu_id(gpu_id); - if(i >= 0){ // Found - printf("LDS aperture: \n"); - aperture_print(&gpu_mem[i].lds_aperture); - printf("GPUVM aperture: \n"); - manageble_aperture_print(&gpu_mem[i].gpuvm_aperture); - printf("Scratch aperture: \n"); - manageble_aperture_print(&gpu_mem[i].scratch_aperture); + if (i >= 0) { /* Found */ + printf("LDS aperture:\n"); + aperture_print(&gpu_mem[i].lds_aperture); + printf("GPUVM aperture:\n"); + manageble_aperture_print(&gpu_mem[i].gpuvm_aperture); + printf("Scratch aperture:\n"); + manageble_aperture_print(&gpu_mem[i].scratch_aperture); } } #else -void fmm_print(uint32_t gpu_id){ - +void fmm_print(uint32_t gpu_id) +{ } #endif -void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes){ - // Not supported yet +void *fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes) +{ + /* Not supported yet */ return NULL; } -// The offset from GPUVM aperture base address to ensure that address 0 (after base subtraction) won't be used +/* + * The offset from GPUVM aperture base address to ensure that address 0 + * (after base subtraction) won't be used + */ #define GPUVM_APP_OFFSET 0x10000 -void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes){ - - void* mem = NULL; +void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes) +{ + void *mem = NULL; int32_t i = gpu_mem_find_by_gpu_id(gpu_id); - // If not found or aperture isn't properly initialized/supported - if(i < 0 || !aperture_is_valid(gpu_mem[i].gpuvm_aperture.base, gpu_mem[i].gpuvm_aperture.limit)) - return NULL ; + /* If not found or aperture isn't properly initialized/supported */ + if (i < 0 || + !aperture_is_valid(gpu_mem[i].gpuvm_aperture.base, + gpu_mem[i].gpuvm_aperture.limit)) + return NULL; - // Allocate address space + /* Allocate address space */ pthread_mutex_lock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); - mem = aperture_allocate_area(&gpu_mem[i].gpuvm_aperture, MemorySizeInBytes, GPUVM_APP_OFFSET); + mem = aperture_allocate_area(&gpu_mem[i].gpuvm_aperture, + MemorySizeInBytes, GPUVM_APP_OFFSET); pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); return mem; } -void* fmm_open_graphic_handle(uint32_t gpu_id, +void *fmm_open_graphic_handle(uint32_t gpu_id, int32_t graphic_device_handle, uint32_t graphic_handle, - uint64_t MemorySizeInBytes){ + uint64_t MemorySizeInBytes) +{ - void* mem = NULL; + void *mem = NULL; int32_t i = gpu_mem_find_by_gpu_id(gpu_id); struct kfd_ioctl_open_graphic_handle_args open_graphic_handle_args; struct kfd_ioctl_unmap_memory_from_gpu_args unmap_args; - // If not found or aperture isn't properly initialized/supported - if (i < 0 || !aperture_is_valid(gpu_mem[i].gpuvm_aperture.base, gpu_mem[i].gpuvm_aperture.limit)) + /* If not found or aperture isn't properly initialized/supported */ + if (i < 0 || !aperture_is_valid(gpu_mem[i].gpuvm_aperture.base, + gpu_mem[i].gpuvm_aperture.limit)) return NULL; pthread_mutex_lock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); - // Allocate address space - mem = aperture_allocate_area(&gpu_mem[i].gpuvm_aperture, MemorySizeInBytes, GPUVM_APP_OFFSET); + /* Allocate address space */ + mem = aperture_allocate_area(&gpu_mem[i].gpuvm_aperture, + MemorySizeInBytes, GPUVM_APP_OFFSET); if (!mem) goto out; - // Allocate local memory + /* Allocate local memory */ open_graphic_handle_args.gpu_id = gpu_id; open_graphic_handle_args.graphic_device_fd = graphic_device_handle; open_graphic_handle_args.graphic_handle = graphic_handle; - open_graphic_handle_args.va_addr = VOID_PTRS_SUB(mem, gpu_mem[i].gpuvm_aperture.base); - if (kmtIoctl(kfd_fd, AMDKFD_IOC_OPEN_GRAPHIC_HANDLE, &open_graphic_handle_args)) + open_graphic_handle_args.va_addr = + VOID_PTRS_SUB(mem, gpu_mem[i].gpuvm_aperture.base); + + if (kmtIoctl(kfd_fd, AMDKFD_IOC_OPEN_GRAPHIC_HANDLE, + &open_graphic_handle_args)) goto release_area; - // Allocate object - if (aperture_allocate_object(&gpu_mem[i].gpuvm_aperture, mem, open_graphic_handle_args.handle, MemorySizeInBytes)) + /* Allocate object */ + if (aperture_allocate_object(&gpu_mem[i].gpuvm_aperture, mem, + open_graphic_handle_args.handle, + MemorySizeInBytes)) goto release_mem; pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); - // That's all. Just return the new address + /* That's all. Just return the new address */ return mem; release_mem: unmap_args.handle = open_graphic_handle_args.handle; kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &unmap_args); release_area: - aperture_release_area(&gpu_mem[i].gpuvm_aperture, mem, MemorySizeInBytes); + aperture_release_area(&gpu_mem[i].gpuvm_aperture, mem, + MemorySizeInBytes); out: pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); - return NULL ; + return NULL; } -void fmm_release(void* address, uint64_t MemorySizeInBytes){ - +void fmm_release(void *address, uint64_t MemorySizeInBytes) +{ uint32_t i; bool found = false; - for (i = 0; i < NUM_OF_SUPPORTED_GPUS && !found; i++) { - if(gpu_mem[i].gpu_id == NON_VALID_GPU_ID) + for (i = 0 ; i < NUM_OF_SUPPORTED_GPUS && !found ; i++) { + if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID) continue; - if(address >= gpu_mem[i].gpuvm_aperture.base && address <= gpu_mem[i].gpuvm_aperture.limit) { + if (address >= gpu_mem[i].gpuvm_aperture.base && + address <= gpu_mem[i].gpuvm_aperture.limit) { found = true; pthread_mutex_lock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); - aperture_release_area(&gpu_mem[i].gpuvm_aperture, address, MemorySizeInBytes); + aperture_release_area(&gpu_mem[i].gpuvm_aperture, address, + MemorySizeInBytes); pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); fmm_print(gpu_mem[i].gpu_id); - } } - // If memory address isn't inside of any defined aperture - it refers to the system memory - if (!found) { + /* + * If memory address isn't inside of any defined aperture - it refers + * to the system memory + */ + if (!found) free(address); - } } -HSAKMT_STATUS fmm_init_process_apertures(){ +HSAKMT_STATUS fmm_init_process_apertures(void) +{ struct kfd_ioctl_get_process_apertures_args args; uint8_t node_id; - if (0 == kmtIoctl(kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES, (void*)&args)){ - for(node_id = 0; node_id < args.num_of_nodes; node_id++){ - gpu_mem[node_id].gpu_id = args.process_apertures[node_id].gpu_id; - gpu_mem[node_id].lds_aperture.base = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].lds_base); - gpu_mem[node_id].lds_aperture.limit = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].lds_limit); - gpu_mem[node_id].gpuvm_aperture.base = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].gpuvm_base); - gpu_mem[node_id].gpuvm_aperture.limit = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].gpuvm_limit); - gpu_mem[node_id].scratch_aperture.base = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].scratch_base); - gpu_mem[node_id].scratch_aperture.limit = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].scratch_limit); - } + if (kmtIoctl(kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES, (void *) &args)) + return HSAKMT_STATUS_ERROR; - return HSAKMT_STATUS_SUCCESS; + for (node_id = 0 ; node_id < args.num_of_nodes ; node_id++) { + gpu_mem[node_id].gpu_id = + args.process_apertures[node_id].gpu_id; + + gpu_mem[node_id].lds_aperture.base = + PORT_UINT64_TO_VPTR(args.process_apertures[node_id].lds_base); + + gpu_mem[node_id].lds_aperture.limit = + PORT_UINT64_TO_VPTR(args.process_apertures[node_id].lds_limit); + + gpu_mem[node_id].gpuvm_aperture.base = + PORT_UINT64_TO_VPTR(args.process_apertures[node_id].gpuvm_base); + + gpu_mem[node_id].gpuvm_aperture.limit = + PORT_UINT64_TO_VPTR(args.process_apertures[node_id].gpuvm_limit); + + gpu_mem[node_id].scratch_aperture.base = + PORT_UINT64_TO_VPTR(args.process_apertures[node_id].scratch_base); + + gpu_mem[node_id].scratch_aperture.limit = + PORT_UINT64_TO_VPTR(args.process_apertures[node_id].scratch_limit); } - return HSAKMT_STATUS_ERROR; - + return HSAKMT_STATUS_SUCCESS; } -HSAuint64 fmm_get_aperture_base(aperture_type_e aperture_type, HSAuint32 gpu_id){ +HSAuint64 fmm_get_aperture_base(aperture_type_e aperture_type, HSAuint32 gpu_id) +{ int32_t slot = gpu_mem_find_by_gpu_id(gpu_id); - if (slot<0) + + if (slot < 0) return HSAKMT_STATUS_INVALID_PARAMETER; - switch(aperture_type){ + switch (aperture_type) { case FMM_GPUVM: - return aperture_is_valid(gpu_mem[slot].gpuvm_aperture.base, gpu_mem[slot].gpuvm_aperture.limit) ? PORT_VPTR_TO_UINT64(gpu_mem[slot].gpuvm_aperture.base) : 0; + return aperture_is_valid(gpu_mem[slot].gpuvm_aperture.base, + gpu_mem[slot].gpuvm_aperture.limit) ? + PORT_VPTR_TO_UINT64(gpu_mem[slot].gpuvm_aperture.base) : 0; break; + case FMM_SCRATCH: - return aperture_is_valid(gpu_mem[slot].scratch_aperture.base, gpu_mem[slot].scratch_aperture.limit) ? PORT_VPTR_TO_UINT64(gpu_mem[slot].scratch_aperture.base) : 0; + return aperture_is_valid(gpu_mem[slot].scratch_aperture.base, + gpu_mem[slot].scratch_aperture.limit) ? + PORT_VPTR_TO_UINT64(gpu_mem[slot].scratch_aperture.base) : 0; break; + case FMM_LDS: - return aperture_is_valid(gpu_mem[slot].lds_aperture.base, gpu_mem[slot].lds_aperture.limit) ? PORT_VPTR_TO_UINT64(gpu_mem[slot].lds_aperture.base) : 0; + return aperture_is_valid(gpu_mem[slot].lds_aperture.base, + gpu_mem[slot].lds_aperture.limit) ? + PORT_VPTR_TO_UINT64(gpu_mem[slot].lds_aperture.base) : 0; break; + default: return 0; } - } - -static bool _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t* aperture, void* address, uint64_t size, uint64_t* gpuvm_address) { - +static int _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t *aperture, + void *address, uint64_t size, + uint64_t *gpuvm_address) +{ struct kfd_ioctl_map_memory_to_gpu_args args; struct kfd_ioctl_unmap_memory_from_gpu_args unmap_args; - // Check that address space was previously reserved + /* Check that address space was previously reserved */ if (vm_find(aperture, address) == NULL) - return false; + return -1; - // Allocate local memory + /* Allocate local memory */ args.gpu_id = gpu_id; args.size = size; - args.va_addr = VOID_PTRS_SUB(address, aperture->base); //va_addr is 40 bit GPUVM address - if(kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args)) - return false; - // Allocate object + /* va_addr is 40 bit GPUVM address */ + args.va_addr = VOID_PTRS_SUB(address, aperture->base); + if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args)) + goto err_map_ioctl_failed; + + /* Allocate object */ pthread_mutex_lock(&aperture->fmm_mutex); if (aperture_allocate_object(aperture, address, args.handle, size)) goto err_object_allocation_failed; @@ -543,49 +650,54 @@ static bool _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t* aperture, voi *gpuvm_address = args.va_addr; - return true; + return 0; err_object_allocation_failed: pthread_mutex_unlock(&aperture->fmm_mutex); unmap_args.handle = args.handle; kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &unmap_args); +err_map_ioctl_failed: *gpuvm_address = 0; - return false; + return -1; } -bool fmm_map_to_gpu(void* address, uint64_t size, uint64_t* gpuvm_address) { - +int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address) +{ int32_t i; uint64_t pi; - // Find an aperture the requested address belongs to - for(i = 0; i < NUM_OF_SUPPORTED_GPUS; i++){ - if(gpu_mem[i].gpu_id != NON_VALID_GPU_ID){ - if ((address>= gpu_mem[i].gpuvm_aperture.base) && (address<= gpu_mem[i].gpuvm_aperture.limit)) { - // map it - return _fmm_map_to_gpu(gpu_mem[i].gpu_id, &gpu_mem[i].gpuvm_aperture, address, size, gpuvm_address); - } - } + /* Find an aperture the requested address belongs to */ + for (i = 0; i < NUM_OF_SUPPORTED_GPUS; i++) { + if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID) + continue; + + if ((address >= gpu_mem[i].gpuvm_aperture.base) && + (address <= gpu_mem[i].gpuvm_aperture.limit)) + /* map it */ + return _fmm_map_to_gpu(gpu_mem[i].gpu_id, + &gpu_mem[i].gpuvm_aperture, + address, size, gpuvm_address); } - // If address isn't Local memory address, we assume that this is - // system memory address accessed through IOMMU. - // Thus we "prefetch" it - for(pi = 0; pi < size / PAGE_SIZE; pi++) { - ((char*)address)[pi*PAGE_SIZE] = 0; - } - return true; + /* + * If address isn't Local memory address, we assume that this is + * system memory address accessed through IOMMU. Thus we "prefetch" it + */ + for (pi = 0; pi < size / PAGE_SIZE; pi++) + ((char *) address)[pi * PAGE_SIZE] = 0; + + return 0; } -static bool _fmm_unmap_from_gpu(manageble_aperture_t* aperture, void* address) { - - vm_object_t* object; +static int _fmm_unmap_from_gpu(manageble_aperture_t *aperture, void *address) +{ + vm_object_t *object; struct kfd_ioctl_unmap_memory_from_gpu_args args; pthread_mutex_lock(&aperture->fmm_mutex); - // Find the object to retrieve the handle + /* Find the object to retrieve the handle */ object = vm_find_object_by_address(aperture, address, 0); if (!object) goto err; @@ -596,27 +708,28 @@ static bool _fmm_unmap_from_gpu(manageble_aperture_t* aperture, void* address) { vm_remove_object(aperture, object); pthread_mutex_unlock(&aperture->fmm_mutex); - return true; + return 0; err: pthread_mutex_unlock(&aperture->fmm_mutex); - return false; - + return -1; } -bool fmm_unmap_from_gpu(void* address) { - +int fmm_unmap_from_gpu(void *address) +{ int32_t i; - // Find the aperture the requested address belongs to - for(i = 0; i < NUM_OF_SUPPORTED_GPUS; i++){ - if(gpu_mem[i].gpu_id != NON_VALID_GPU_ID){ - if ((address>= gpu_mem[i].gpuvm_aperture.base) && (address<= gpu_mem[i].gpuvm_aperture.limit)) { - // unmap it - return _fmm_unmap_from_gpu(&gpu_mem[i].gpuvm_aperture, address); - } - } + /* Find the aperture the requested address belongs to */ + for (i = 0; i < NUM_OF_SUPPORTED_GPUS; i++) { + if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID) + continue; + + if ((address >= gpu_mem[i].gpuvm_aperture.base) && + (address <= gpu_mem[i].gpuvm_aperture.limit)) + /* unmap it */ + return _fmm_unmap_from_gpu(&gpu_mem[i].gpuvm_aperture, + address); } - return true; + return 0; } diff --git a/projects/rocr-runtime/src/fmm.h b/projects/rocr-runtime/src/fmm.h index ca0a692ead..c3b99d0b9e 100644 --- a/projects/rocr-runtime/src/fmm.h +++ b/projects/rocr-runtime/src/fmm.h @@ -56,8 +56,8 @@ void* fmm_open_graphic_handle(uint32_t gpu_id, void fmm_print(uint32_t node); bool fmm_is_inside_some_aperture(void* address); void fmm_release(void* address, HSAuint64 MemorySizeInBytes); -bool fmm_map_to_gpu(void* address, uint64_t size, uint64_t* gpuvm_address); -bool fmm_unmap_from_gpu(void* address); +int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address); +int fmm_unmap_from_gpu(void *address); /* Topology interface*/ HSAKMT_STATUS fmm_node_added(HSAuint32 gpu_id); diff --git a/projects/rocr-runtime/src/memory.c b/projects/rocr-runtime/src/memory.c index a2c1eb49b6..68d86d5e5e 100644 --- a/projects/rocr-runtime/src/memory.c +++ b/projects/rocr-runtime/src/memory.c @@ -37,13 +37,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy( - HSAuint32 Node, - HSAuint32 DefaultPolicy, - HSAuint32 AlternatePolicy, - void* MemoryAddressAlternate, - HSAuint64 MemorySizeInBytes - ) + HSAuint32 Node, + HSAuint32 DefaultPolicy, + HSAuint32 AlternatePolicy, + void *MemoryAddressAlternate, + HSAuint64 MemorySizeInBytes +) { + struct kfd_ioctl_set_memory_policy_args args; HSAKMT_STATUS result; uint32_t gpu_id; @@ -53,23 +54,31 @@ hsaKmtSetMemoryPolicy( if (result != HSAKMT_STATUS_SUCCESS) return result; - // We accept any legal policy and alternate address location. You get CC everywhere anyway. - if ((DefaultPolicy != HSA_CACHING_CACHED && DefaultPolicy != HSA_CACHING_NONCACHED) - || (AlternatePolicy != HSA_CACHING_CACHED && AlternatePolicy != HSA_CACHING_NONCACHED)) - { + /* + * We accept any legal policy and alternate address location. + * You get CC everywhere anyway. + */ + if ((DefaultPolicy != HSA_CACHING_CACHED && + DefaultPolicy != HSA_CACHING_NONCACHED) || + (AlternatePolicy != HSA_CACHING_CACHED && + AlternatePolicy != HSA_CACHING_NONCACHED)) return HSAKMT_STATUS_INVALID_PARAMETER; - } CHECK_PAGE_MULTIPLE(MemoryAddressAlternate); CHECK_PAGE_MULTIPLE(MemorySizeInBytes); - struct kfd_ioctl_set_memory_policy_args args; memset(&args, 0, sizeof(args)); args.gpu_id = gpu_id; - args.default_policy = (DefaultPolicy == HSA_CACHING_CACHED) ? KFD_IOC_CACHE_POLICY_COHERENT : KFD_IOC_CACHE_POLICY_NONCOHERENT; - args.alternate_policy = (AlternatePolicy == HSA_CACHING_CACHED) ? KFD_IOC_CACHE_POLICY_COHERENT : KFD_IOC_CACHE_POLICY_NONCOHERENT; - args.alternate_aperture_base = (uintptr_t)MemoryAddressAlternate; + args.default_policy = (DefaultPolicy == HSA_CACHING_CACHED) ? + KFD_IOC_CACHE_POLICY_COHERENT : + KFD_IOC_CACHE_POLICY_NONCOHERENT; + + args.alternate_policy = (AlternatePolicy == HSA_CACHING_CACHED) ? + KFD_IOC_CACHE_POLICY_COHERENT : + KFD_IOC_CACHE_POLICY_NONCOHERENT; + + args.alternate_aperture_base = (uintptr_t) MemoryAddressAlternate; args.alternate_aperture_size = MemorySizeInBytes; int err = kmtIoctl(kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args); @@ -79,46 +88,51 @@ hsaKmtSetMemoryPolicy( static HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) { - switch (pageSizeFlags) - { + switch (pageSizeFlags) { case HSA_PAGE_SIZE_4KB: return 4*1024; case HSA_PAGE_SIZE_64KB: return 64*1024; case HSA_PAGE_SIZE_2MB: return 2*1024*1024; case HSA_PAGE_SIZE_1GB: return 1024*1024*1024; - default: assert(false); return 4*1024; + default: + assert(false); + return 4*1024; } } HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory( - HSAuint32 PreferredNode, //IN - HSAuint64 SizeInBytes, //IN (multiple of page size) - HsaMemFlags MemFlags, //IN - void** MemoryAddress //OUT (page-aligned) - ) + HSAuint32 PreferredNode, /* IN */ + HSAuint64 SizeInBytes, /* IN (multiple of page size) */ + HsaMemFlags MemFlags, /* IN */ + void **MemoryAddress /* OUT (page-aligned) */ +) { - CHECK_KFD_OPEN(); HSAKMT_STATUS result; + HSAuint64 page_size; uint32_t gpu_id; int err; + CHECK_KFD_OPEN(); + result = validate_nodeid(PreferredNode, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; - // The required size should be page aligned (GDS?) - HSAuint64 page_size = PageSizeFromFlags(MemFlags.ui32.PageSize); - if ((SizeInBytes & (page_size-1)) && !MemFlags.ui32.GDSMemory){ + /* The required size should be page aligned (GDS?) */ + page_size = PageSizeFromFlags(MemFlags.ui32.PageSize); + if ((SizeInBytes & (page_size-1)) && !MemFlags.ui32.GDSMemory) return HSAKMT_STATUS_INVALID_PARAMETER; - } if (MemFlags.ui32.HostAccess && !MemFlags.ui32.NonPaged) { err = posix_memalign(MemoryAddress, page_size, SizeInBytes); if (err != 0) return HSAKMT_STATUS_NO_MEMORY; + if (MemFlags.ui32.ExecuteAccess) { - err = mprotect(*MemoryAddress, SizeInBytes, PROT_READ | PROT_WRITE | PROT_EXEC); + err = mprotect(*MemoryAddress, SizeInBytes, + PROT_READ | PROT_WRITE | PROT_EXEC); + if (err != 0) { free(*MemoryAddress); return err; @@ -127,10 +141,12 @@ hsaKmtAllocMemory( return HSAKMT_STATUS_SUCCESS; } - if(!MemFlags.ui32.HostAccess && MemFlags.ui32.NonPaged){ - *MemoryAddress = fmm_allocate_device(gpu_id, SizeInBytes); + if (!MemFlags.ui32.HostAccess && MemFlags.ui32.NonPaged) { + *MemoryAddress = fmm_allocate_device(gpu_id, SizeInBytes); + if (*MemoryAddress == NULL) return HSAKMT_STATUS_NO_MEMORY; + return HSAKMT_STATUS_SUCCESS; } @@ -140,22 +156,22 @@ hsaKmtAllocMemory( HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory( - void* MemoryAddress, //IN (page-aligned) - HSAuint64 SizeInBytes //IN - ) + void *MemoryAddress, /* IN (page-aligned) */ + HSAuint64 SizeInBytes /* IN */ +) { CHECK_KFD_OPEN(); - fmm_release( MemoryAddress, SizeInBytes); + fmm_release(MemoryAddress, SizeInBytes); return HSAKMT_STATUS_SUCCESS; } HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory( - void* MemoryAddress, //IN (page-aligned) - HSAuint64 MemorySizeInBytes //IN (page-aligned) - ) + void *MemoryAddress, /* IN (page-aligned) */ + HSAuint64 MemorySizeInBytes /* IN (page-aligned) */ +) { CHECK_KFD_OPEN(); @@ -165,8 +181,8 @@ hsaKmtRegisterMemory( HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory( - void* MemoryAddress //IN - ) + void *MemoryAddress /* IN */ +) { CHECK_KFD_OPEN(); @@ -176,50 +192,47 @@ hsaKmtDeregisterMemory( HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU( - void* MemoryAddress, //IN (page-aligned) - HSAuint64 MemorySizeInBytes, //IN (page-aligned) - HSAuint64* AlternateVAGPU //OUT (page-aligned) - ) + void *MemoryAddress, /* IN (page-aligned) */ + HSAuint64 MemorySizeInBytes, /* IN (page-aligned) */ + HSAuint64 *AlternateVAGPU /* OUT (page-aligned) */ +) { CHECK_KFD_OPEN(); if (AlternateVAGPU) *AlternateVAGPU = 0; - if (fmm_map_to_gpu(MemoryAddress, MemorySizeInBytes, AlternateVAGPU)){ + if (!fmm_map_to_gpu(MemoryAddress, MemorySizeInBytes, AlternateVAGPU)) return HSAKMT_STATUS_SUCCESS; - } - else { + else return HSAKMT_STATUS_ERROR; - } - } HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU( - void* MemoryAddress //IN (page-aligned) - ) + void *MemoryAddress /* IN (page-aligned) */ +) { CHECK_KFD_OPEN(); - if (fmm_unmap_from_gpu(MemoryAddress)) + if (!fmm_unmap_from_gpu(MemoryAddress)) return HSAKMT_STATUS_SUCCESS; else return HSAKMT_STATUS_ERROR; - } HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle( - HSAuint32 NodeId, //IN - HSAuint64 GraphicDeviceHandle, //IN - HSAuint64 GraphicResourceHandle, //IN - HSAuint64 GraphicResourceOffset, //IN - HSAuint64 GraphicResourceSize, //IN - HSAuint64* FlatMemoryAddress //OUT - ) + HSAuint32 NodeId, /* IN */ + HSAuint64 GraphicDeviceHandle, /* IN */ + HSAuint64 GraphicResourceHandle, /* IN */ + HSAuint64 GraphicResourceOffset, /* IN */ + HSAuint64 GraphicResourceSize, /* IN */ + HSAuint64 *FlatMemoryAddress /* OUT */ +) { + CHECK_KFD_OPEN(); HSAKMT_STATUS result; uint32_t gpu_id; @@ -248,10 +261,11 @@ hsaKmtMapGraphicHandle( HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle( - HSAuint32 NodeId, //IN - HSAuint64 FlatMemoryAddress, //IN - HSAuint64 SizeInBytes //IN - ) + HSAuint32 NodeId, /* IN */ + HSAuint64 FlatMemoryAddress, /* IN */ + HSAuint64 SizeInBytes /* IN */ +) { + return hsaKmtUnmapMemoryToGPU(PORT_UINT64_TO_VPTR(FlatMemoryAddress)); }