Allocate paged system memory as userptr
Change-Id: I0864e678681788df37eccd9d7ebc70086e1f93bf
このコミットが含まれているのは:
+1
-1
@@ -83,7 +83,7 @@ hsaKmtCreateEvent(
|
||||
|
||||
if (is_dgpu && events_page == NULL) {
|
||||
events_page = allocate_exec_aligned_memory_gpu(
|
||||
KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0);
|
||||
KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, true);
|
||||
if (!events_page) {
|
||||
pthread_mutex_unlock(&hsakmt_mutex);
|
||||
return HSAKMT_STATUS_ERROR;
|
||||
|
||||
@@ -129,6 +129,9 @@ typedef struct {
|
||||
/* used for coherent (fine-grain) system memory on dGPU,
|
||||
* This aperture is shared by all dGPUs */
|
||||
manageble_aperture_t dgpu_alt_aperture;
|
||||
|
||||
/* whether to use userptr for paged memory */
|
||||
bool userptr_for_paged_mem;
|
||||
} svm_t;
|
||||
|
||||
/* The other apertures are specific to each GPU. gpu_mem_t manages GPU
|
||||
@@ -141,7 +144,8 @@ static void *dgpu_shared_aperture_limit = NULL;
|
||||
|
||||
static svm_t svm = {
|
||||
INIT_MANAGEBLE_APERTURE(0, 0),
|
||||
INIT_MANAGEBLE_APERTURE(0, 0)
|
||||
INIT_MANAGEBLE_APERTURE(0, 0),
|
||||
true
|
||||
};
|
||||
|
||||
/* On APU, for memory allocated on the system memory that GPU doesn't access
|
||||
@@ -1104,7 +1108,7 @@ static void* fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
|
||||
return NULL;
|
||||
|
||||
size = MemorySizeInBytes;
|
||||
ioc_flags = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST;
|
||||
ioc_flags = 0;
|
||||
if (flags.ui32.CoarseGrain)
|
||||
aperture = &svm.dgpu_aperture;
|
||||
else
|
||||
@@ -1114,9 +1118,69 @@ static void* fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
|
||||
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_DGPU_AQL_QUEUE_MEM;
|
||||
}
|
||||
|
||||
mem = __fmm_allocate_device(gpu_id, size,
|
||||
aperture, 0, &mmap_offset,
|
||||
ioc_flags, &vm_obj);
|
||||
/* Paged memory is allocated as a userptr mapping; non-paged
|
||||
* memory is allocated from KFD */
|
||||
if (!flags.ui32.NonPaged && svm.userptr_for_paged_mem) {
|
||||
/* Allocate address space */
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
mem = aperture_allocate_area(aperture, size, 0);
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
if (mem == NULL)
|
||||
return NULL;
|
||||
|
||||
/* Map anonymous pages */
|
||||
if (mmap(mem, MemorySizeInBytes, PROT_READ | PROT_WRITE,
|
||||
MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0)
|
||||
== MAP_FAILED) {
|
||||
/* Release address space */
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
aperture_release_area(aperture, mem, size);
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Create userptr BO */
|
||||
mmap_offset = (uint64_t)mem;
|
||||
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_USERPTR;
|
||||
vm_obj = fmm_allocate_memory_in_device(gpu_id, mem, size,
|
||||
aperture, &mmap_offset,
|
||||
ioc_flags);
|
||||
if (!vm_obj) {
|
||||
/* Release address space */
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
aperture_release_area(aperture, mem, size);
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
/* Remove any CPU mapping, but keep the
|
||||
* address range reserved */
|
||||
mmap(mem, MemorySizeInBytes, PROT_NONE,
|
||||
MAP_ANONYMOUS | MAP_NORESERVE |
|
||||
MAP_PRIVATE | MAP_FIXED, -1, 0);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST;
|
||||
mem = __fmm_allocate_device(gpu_id, size,
|
||||
aperture, 0, &mmap_offset,
|
||||
ioc_flags, &vm_obj);
|
||||
|
||||
if (mem && flags.ui32.HostAccess) {
|
||||
void *ret = mmap(mem, MemorySizeInBytes,
|
||||
PROT_READ | PROT_WRITE,
|
||||
MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
|
||||
if (ret == MAP_FAILED) {
|
||||
__fmm_release(mem, aperture);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (flags.ui32.AQLQueueMemory) {
|
||||
uint64_t my_buf_size = ALIGN_UP(size, aperture->align) / 2;
|
||||
memset(ret, 0, MemorySizeInBytes);
|
||||
mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
|
||||
PROT_READ | PROT_WRITE,
|
||||
MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (mem && vm_obj) {
|
||||
/* Store memory allocation flags, not ioc flags */
|
||||
@@ -1126,24 +1190,6 @@ static void* fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
}
|
||||
|
||||
if (flags.ui32.HostAccess) {
|
||||
void *ret = mmap(mem, MemorySizeInBytes,
|
||||
PROT_READ | PROT_WRITE,
|
||||
MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
|
||||
if (ret == MAP_FAILED) {
|
||||
__fmm_release(mem, aperture);
|
||||
return NULL;
|
||||
}
|
||||
if (flags.ui32.AQLQueueMemory) {
|
||||
uint64_t my_buf_size = ALIGN_UP(size, aperture->align) / 2;
|
||||
memset(ret, 0, MemorySizeInBytes);
|
||||
mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
|
||||
PROT_READ | PROT_WRITE,
|
||||
MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return mem;
|
||||
}
|
||||
|
||||
@@ -1334,6 +1380,7 @@ HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
|
||||
struct kfd_process_device_apertures * process_apertures;
|
||||
HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
|
||||
char *disableCache;
|
||||
char *pagedUserptr;
|
||||
struct pci_access *pacc;
|
||||
|
||||
/* If HSA_DISABLE_CACHE is set to a non-0 value, disable caching */
|
||||
@@ -1341,6 +1388,11 @@ HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
|
||||
if (disableCache && strcmp(disableCache, "0") == 0)
|
||||
disableCache = NULL;
|
||||
|
||||
/* If HSA_USERPTR_FOR_PAGED_MEM is unset or set to a non-0 value,
|
||||
* enable userptr for all paged memory allocations */
|
||||
pagedUserptr = getenv("HSA_USERPTR_FOR_PAGED_MEM");
|
||||
svm.userptr_for_paged_mem = (!pagedUserptr || strcmp(pagedUserptr, "0"));
|
||||
|
||||
/* Trade off - NumNodes includes GPU nodes + CPU Node. So in
|
||||
* systems with CPU node, slightly more memory is allocated than
|
||||
* necessary*/
|
||||
|
||||
+1
-1
@@ -78,7 +78,7 @@ bool topology_is_dgpu(uint16_t device_id);
|
||||
HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
|
||||
|
||||
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
|
||||
uint32_t NodeId);
|
||||
uint32_t NodeId, bool NonPaged);
|
||||
void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
|
||||
HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes);
|
||||
void destroy_process_doorbells(void);
|
||||
|
||||
+4
-2
@@ -397,7 +397,7 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q)
|
||||
}
|
||||
|
||||
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
|
||||
uint32_t NodeId)
|
||||
uint32_t NodeId, bool nonPaged)
|
||||
{
|
||||
void *mem;
|
||||
HSAuint64 gpu_va;
|
||||
@@ -407,6 +407,7 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
|
||||
flags.Value = 0;
|
||||
flags.ui32.HostAccess = 1;
|
||||
flags.ui32.ExecuteAccess = 1;
|
||||
flags.ui32.NonPaged = nonPaged;
|
||||
flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
|
||||
|
||||
size = ALIGN_UP(size, align);
|
||||
@@ -448,7 +449,8 @@ static void* allocate_exec_aligned_memory(uint32_t size,
|
||||
uint32_t NodeId)
|
||||
{
|
||||
if (IS_DGPU(type))
|
||||
return allocate_exec_aligned_memory_gpu(size, align, NodeId);
|
||||
return allocate_exec_aligned_memory_gpu(size, align, NodeId,
|
||||
false);
|
||||
return allocate_exec_aligned_memory_cpu(size, align);
|
||||
}
|
||||
|
||||
|
||||
新しいイシューから参照
ユーザーをブロックする