diff --git a/projects/rocr-runtime/src/events.c b/projects/rocr-runtime/src/events.c index 7cedacb459..22949d595d 100644 --- a/projects/rocr-runtime/src/events.c +++ b/projects/rocr-runtime/src/events.c @@ -74,7 +74,7 @@ hsaKmtCreateEvent( /* dGPU code */ if (is_dgpu && events_page == NULL) { - events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8, 0x9000); + events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8, TONGA_PAGE_SIZE); if (!events_page) { return HSAKMT_STATUS_ERROR; } diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c index 3d5d486d3a..318f6497da 100644 --- a/projects/rocr-runtime/src/fmm.c +++ b/projects/rocr-runtime/src/fmm.c @@ -42,6 +42,7 @@ #define INIT_MANAGEBLE_APERTURE(base_value, limit_value) { \ .base = (void *) base_value, \ .limit = (void *) limit_value, \ + .align = PAGE_SIZE, \ .vm_ranges = NULL, \ .vm_objects = NULL, \ .fmm_mutex = PTHREAD_MUTEX_INITIALIZER \ @@ -78,6 +79,7 @@ typedef struct vm_area vm_area_t; typedef struct { void *base; void *limit; + uint64_t align; vm_area_t *vm_ranges; vm_object_t *vm_objects; pthread_mutex_t fmm_mutex; @@ -103,6 +105,8 @@ typedef struct { } gpu_mem_t; static gpu_mem_t gpu_mem[] = INIT_GPUs_MEM; +static void *dgpu_shared_aperture_base = NULL; +static void *dgpu_shared_aperture_limit = NULL; static HSAKMT_STATUS dgpu_mem_init(uint8_t node_id, void **base, void **limit); static int set_dgpu_aperture(uint32_t node_id, uint64_t base, uint64_t limit); @@ -224,6 +228,8 @@ static vm_object_t *vm_find_object_by_address(manageble_aperture_t *app, { vm_object_t *cur = app->vm_objects; + size = ALIGN_UP(size, app->align); + /* Look up the appropriate address range containing the given address */ while (cur) { if (cur->start == address && (cur->size == size || size == 0)) @@ -264,6 +270,8 @@ static void aperture_release_area(manageble_aperture_t *app, void *address, vm_area_t *area; uint64_t SizeOfRegion; + MemorySizeInBytes = ALIGN_UP(MemorySizeInBytes, app->align); + area = vm_find(app, address); if (!area) return; @@ -302,6 +310,8 @@ static void *aperture_allocate_area(manageble_aperture_t *app, next = NULL; new_area = NULL; + MemorySizeInBytes = ALIGN_UP(MemorySizeInBytes, app->align); + cur = app->vm_ranges; if (cur) { /* not empty */ /* @@ -358,6 +368,8 @@ static int aperture_allocate_object(manageble_aperture_t *app, { vm_object_t *new_object; + MemorySizeInBytes = ALIGN_UP(MemorySizeInBytes, app->align); + /* Allocate new object */ new_object = vm_create_and_init_object(new_address, MemorySizeInBytes, @@ -400,7 +412,7 @@ static int fmm_allocate_memory_in_device(uint32_t gpu_id, void *mem, /* Allocate memory from amdkfd */ args.gpu_id = gpu_id; - args.size = MemorySizeInBytes; + args.size = ALIGN_UP(MemorySizeInBytes, aperture->align); args.flags = flags; args.va_addr = (uint64_t)mem; @@ -596,8 +608,6 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes) if (topology_is_dgpu(get_device_id_by_gpu_id(gpu_id))) { flags = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE; - /* Alignment is needed to match a workaround for a VI HW bug in the kernel */ - MemorySizeInBytes = (MemorySizeInBytes + 0x7fffULL) & ~0x7fffULL; /* * TODO: Once VA limit is raised from 0x200000000 (8GB) use gpuvm_aperture. * In that way the host access range won't be used for local memory @@ -655,10 +665,6 @@ static void* fmm_allocate_host_gpu(uint32_t gpu_id, else aperture = &gpu_mem[gpu_mem_id].dgpu_alt_aperture; /* coherent */ - /* Alignment is needed to match a workaround for a VI HW bug in the kernel */ - /* FIXME: this breaks fmm_release! */ - MemorySizeInBytes = (MemorySizeInBytes + 0x7fffULL) & ~0x7fffULL; - mem = __fmm_allocate_device(gpu_id, MemorySizeInBytes, aperture, 0, &mmap_offset, KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST); @@ -806,6 +812,14 @@ void fmm_release(void *address, uint64_t MemorySizeInBytes) } } + if (found && + address >= dgpu_shared_aperture_base && + address <= dgpu_shared_aperture_limit) { + /* Remove any CPU mapping, but keep the address range reserved */ + mmap(address, MemorySizeInBytes, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED, -1, 0); + } + /* * If memory address isn't inside of any defined aperture - it refers * to the system memory @@ -872,6 +886,7 @@ HSAKMT_STATUS fmm_init_process_apertures(void) &gpu_mem[node_id].dgpu_aperture.limit); set_dgpu_aperture(node_id, (uint64_t)gpu_mem[node_id].dgpu_aperture.base, (uint64_t)gpu_mem[node_id].dgpu_aperture.limit); + gpu_mem[node_id].dgpu_aperture.align = TONGA_PAGE_SIZE; /* Place GPUVM aperture after dGPU aperture * (FK: I think this is broken but leaving it for now) */ @@ -880,6 +895,7 @@ HSAKMT_STATUS fmm_init_process_apertures(void) gpu_mem[node_id].dgpu_aperture.base); gpu_mem[node_id].gpuvm_aperture.limit = VOID_PTR_ADD(gpu_mem[node_id].gpuvm_aperture.limit, (unsigned long)gpu_mem[node_id].gpuvm_aperture.base); + gpu_mem[node_id].gpuvm_aperture.align = TONGA_PAGE_SIZE; /* Use the first 1/4 of the dGPU aperture as * alternate aperture for coherent access. @@ -900,6 +916,7 @@ HSAKMT_STATUS fmm_init_process_apertures(void) fprintf(stderr, "Error! Failed to set alt aperture for node %d\n", node_id); ret = HSAKMT_STATUS_ERROR; } + gpu_mem[node_id].dgpu_alt_aperture.align = TONGA_PAGE_SIZE; } } } @@ -1129,8 +1146,6 @@ int fmm_unmap_from_gpu(void *address) /* Tonga dGPU specific functions */ static bool is_dgpu_mem_init = false; -static void *dgpu_shared_aperture_base = NULL; -static void *dgpu_shared_aperture_limit = NULL; static int set_dgpu_aperture(uint32_t node_id, uint64_t base, uint64_t limit) { @@ -1208,9 +1223,9 @@ static HSAKMT_STATUS dgpu_mem_init(uint8_t node_id, void **base, void **limit) max_len = props.LocalMemSize; found = false; - for (addr = (void *)PAGE_SIZE, ret_addr = NULL; + for (addr = (void *)TONGA_PAGE_SIZE, ret_addr = NULL; ret_addr != addr; - addr = (void *)((unsigned long)addr + 0x8000)) + addr = (void *)((unsigned long)addr + TONGA_PAGE_SIZE)) { ret_addr = reserve_address(addr, max_len); if (!ret_addr) diff --git a/projects/rocr-runtime/src/libhsakmt.h b/projects/rocr-runtime/src/libhsakmt.h index a1399fc65d..1ceb583923 100644 --- a/projects/rocr-runtime/src/libhsakmt.h +++ b/projects/rocr-runtime/src/libhsakmt.h @@ -49,11 +49,14 @@ extern bool is_dgpu; do { if (kfd_open_count == 0) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0) #define PAGE_SIZE 4096 +/* VI HW bug requires this virtual address alignment */ +#define TONGA_PAGE_SIZE 0x8000 #define CHECK_PAGE_MULTIPLE(x) \ do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % PAGE_SIZE) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0) -#define PAGE_ALIGN_UP(x) (((uint64_t)(x) + PAGE_SIZE - 1) & ~(uint64_t)(PAGE_SIZE-1)) +#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1)) +#define PAGE_ALIGN_UP(x) ALIGN_UP(x,PAGE_SIZE) #define BITMASK(n) (((n) < sizeof(1ULL) * CHAR_BIT ? (1ULL << (n)) : 0) - 1ULL) /* @@ -74,7 +77,7 @@ bool topology_is_dgpu(uint16_t gpu_id); HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags); void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align); -void free_exec_aligned_memory_gpu(void *addr, uint32_t size); +void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align); extern int kmtIoctl(int fd, unsigned long request, void *arg); diff --git a/projects/rocr-runtime/src/queues.c b/projects/rocr-runtime/src/queues.c index 9702ccd6ab..7c7c446b81 100644 --- a/projects/rocr-runtime/src/queues.c +++ b/projects/rocr-runtime/src/queues.c @@ -34,8 +34,6 @@ #include #include -#define TONGA_PAGE_SIZE 0x9000 - /* 1024 doorbells, 4 bytes each doorbell */ #define DOORBELLS_PAGE_SIZE 1024 * 4 @@ -201,7 +199,7 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align) flags.ui32.ExecuteAccess = 1; flags.ui32.PageSize = HSA_PAGE_SIZE_4KB; - size += align - (size % align); + size = ALIGN_UP(size, align); ret = hsaKmtAllocMemory(0, size, flags, &mem); if (ret != HSAKMT_STATUS_SUCCESS) { @@ -215,9 +213,9 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align) return mem; } -void free_exec_aligned_memory_gpu(void *addr, uint32_t size) +void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align) { - size += TONGA_PAGE_SIZE - (size % TONGA_PAGE_SIZE); + size = ALIGN_UP(size, align); if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS) { hsaKmtFreeMemory(addr, size);