libhsakmt: Add alignment for memory allocations
Add a new API, hsaKmtAllocMemoryAlign, that takes an optional alignment
parameter for memory allocations. A non-zero alignment must be a power of 2
and greater than or equal to the page size; 0 selects the default alignment.
Change-Id: Ic3fec43b3c4281f74dd33a57ab4143dcf76e1186
Signed-off-by: Chris Freehill <cfreehil@amd.com>
[ROCm/ROCR-Runtime commit: a31e84eaef]
Этот коммит содержится в:
коммит произвёл
Chris Freehill
родитель
f38a5ea841
Коммит
d141223daf
@@ -406,6 +406,21 @@ hsaKmtAllocMemory(
|
||||
void** MemoryAddress //IN/OUT (page-aligned)
|
||||
);
|
||||
|
||||
/**
|
||||
Allocates a memory buffer, with a specific alignment, that may be accessed by the GPU
|
||||
If Alignment is 0, the smallest possible alignment will be used
|
||||
*/
|
||||
|
||||
HSAKMT_STATUS
|
||||
HSAKMTAPI
|
||||
hsaKmtAllocMemoryAlign(
|
||||
HSAuint32 PreferredNode, //IN
|
||||
HSAuint64 SizeInBytes, //IN (multiple of page size)
|
||||
HSAuint64 Alignment, //IN (power of 2 and >= page size)
|
||||
HsaMemFlags MemFlags, //IN
|
||||
void** MemoryAddress //IN/OUT (page-aligned)
|
||||
);
|
||||
|
||||
/**
|
||||
Frees a memory buffer
|
||||
*/
|
||||
|
||||
@@ -856,7 +856,7 @@ static void *aperture_allocate_area_aligned(manageable_aperture_t *app,
|
||||
uint64_t MemorySizeInBytes,
|
||||
uint64_t align)
|
||||
{
|
||||
return app->ops->allocate_area_aligned(app, address, MemorySizeInBytes, align);
|
||||
return app->ops->allocate_area_aligned(app, address, MemorySizeInBytes, align ? align : app->align);
|
||||
}
|
||||
static void *aperture_allocate_area(manageable_aperture_t *app, void *address,
|
||||
uint64_t MemorySizeInBytes)
|
||||
@@ -1448,7 +1448,7 @@ void *fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeIn
|
||||
|
||||
static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes,
|
||||
manageable_aperture_t *aperture, uint64_t *mmap_offset,
|
||||
uint32_t ioc_flags, vm_object_t **vm_obj)
|
||||
uint32_t ioc_flags, uint64_t alignment, vm_object_t **vm_obj)
|
||||
{
|
||||
void *mem = NULL;
|
||||
vm_object_t *obj;
|
||||
@@ -1459,7 +1459,7 @@ static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t Memo
|
||||
|
||||
/* Allocate address space */
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
mem = aperture_allocate_area(aperture, address, MemorySizeInBytes);
|
||||
mem = aperture_allocate_area_aligned(aperture, address, MemorySizeInBytes, alignment);
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
|
||||
/*
|
||||
@@ -1504,7 +1504,7 @@ static void *fmm_map_to_cpu(void *mem, uint64_t size, bool host_access,
|
||||
}
|
||||
|
||||
static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,
|
||||
manageable_aperture_t *aperture, HsaMemFlags mflags)
|
||||
manageable_aperture_t *aperture, uint64_t alignment, HsaMemFlags mflags)
|
||||
{
|
||||
void *mem = NULL;
|
||||
vm_object_t *vm_obj = NULL;
|
||||
@@ -1515,7 +1515,7 @@ static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,
|
||||
|
||||
/* Allocate address space */
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
mem = aperture_allocate_area(aperture, address, size);
|
||||
mem = aperture_allocate_area_aligned(aperture, address, size, alignment);
|
||||
/* assign handle 0 to vm_obj since no memory is allocated */
|
||||
vm_obj = aperture_allocate_object(aperture, mem, 0,
|
||||
size, mflags);
|
||||
@@ -1533,7 +1533,7 @@ static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,
|
||||
}
|
||||
|
||||
void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
|
||||
uint64_t MemorySizeInBytes, HsaMemFlags mflags)
|
||||
uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
|
||||
{
|
||||
manageable_aperture_t *aperture;
|
||||
int32_t gpu_mem_id;
|
||||
@@ -1564,7 +1564,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
|
||||
|
||||
/* special case for va allocation without vram alloc */
|
||||
if (mflags.ui32.OnlyAddress)
|
||||
return fmm_allocate_va(gpu_id, address, size, aperture, mflags);
|
||||
return fmm_allocate_va(gpu_id, address, size, aperture, alignment, mflags);
|
||||
|
||||
/* special case for vram allocation without addr */
|
||||
if(mflags.ui32.NoAddress)
|
||||
@@ -1583,7 +1583,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
|
||||
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT;
|
||||
|
||||
mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset,
|
||||
ioc_flags, &vm_obj);
|
||||
ioc_flags, alignment, &vm_obj);
|
||||
|
||||
if (mem && vm_obj) {
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
@@ -1637,7 +1637,7 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
|
||||
|
||||
mem = __fmm_allocate_device(gpu_id, NULL, MemorySizeInBytes, aperture, NULL,
|
||||
ioc_flags, &vm_obj);
|
||||
ioc_flags, 0, &vm_obj);
|
||||
|
||||
if (mem && vm_obj) {
|
||||
HsaMemFlags mflags;
|
||||
@@ -1768,7 +1768,7 @@ static int bind_mem_to_numa(uint32_t node_id, void *mem,
|
||||
}
|
||||
|
||||
static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *address,
|
||||
uint64_t MemorySizeInBytes, HsaMemFlags mflags)
|
||||
uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
|
||||
{
|
||||
manageable_aperture_t *aperture;
|
||||
vm_object_t *vm_obj = NULL;
|
||||
@@ -1822,7 +1822,7 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr
|
||||
if (!mflags.ui32.NonPaged && svm.userptr_for_paged_mem) {
|
||||
/* Allocate address space */
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
mem = aperture_allocate_area(aperture, address, size);
|
||||
mem = aperture_allocate_area_aligned(aperture, address, size, alignment);
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
if (!mem)
|
||||
return NULL;
|
||||
@@ -1854,7 +1854,7 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr
|
||||
} else {
|
||||
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_GTT;
|
||||
mem = __fmm_allocate_device(preferred_gpu_id, address, size, aperture,
|
||||
&mmap_offset, ioc_flags, &vm_obj);
|
||||
&mmap_offset, ioc_flags, alignment, &vm_obj);
|
||||
|
||||
if (mem && mflags.ui32.HostAccess) {
|
||||
void *ret = fmm_map_to_cpu(mem, MemorySizeInBytes,
|
||||
@@ -1896,10 +1896,16 @@ out_release_area:
|
||||
}
|
||||
|
||||
void *fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address,
|
||||
uint64_t MemorySizeInBytes, HsaMemFlags mflags)
|
||||
uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
|
||||
{
|
||||
if (is_dgpu)
|
||||
return fmm_allocate_host_gpu(gpu_id, node_id, address, MemorySizeInBytes, mflags);
|
||||
return fmm_allocate_host_gpu(gpu_id, node_id, address, MemorySizeInBytes, alignment, mflags);
|
||||
|
||||
if (alignment) {//Alignment not supported on non-dgpu
|
||||
pr_err("Non-default alignment not supported on non-dgpu\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return fmm_allocate_host_cpu(address, MemorySizeInBytes, mflags);
|
||||
}
|
||||
|
||||
@@ -2365,7 +2371,7 @@ static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd)
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
|
||||
mem = __fmm_allocate_device(gpu_id, NULL, PAGE_SIZE, aperture,
|
||||
&mmap_offset, ioc_flags, &vm_obj);
|
||||
&mmap_offset, ioc_flags, 0, &vm_obj);
|
||||
|
||||
if (!mem || !vm_obj)
|
||||
return NULL;
|
||||
@@ -3455,6 +3461,7 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr,
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
|
||||
(coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) |
|
||||
(ext_coherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0),
|
||||
0,
|
||||
&obj);
|
||||
if (!svm_addr)
|
||||
return HSAKMT_STATUS_ERROR;
|
||||
|
||||
@@ -52,10 +52,10 @@ void fmm_destroy_process_apertures(void);
|
||||
/* Memory interface */
|
||||
void *fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes);
|
||||
void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
|
||||
uint64_t MemorySizeInBytes, HsaMemFlags flags);
|
||||
uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags flags);
|
||||
void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_t doorbell_offset);
|
||||
void *fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address, uint64_t MemorySizeInBytes,
|
||||
HsaMemFlags flags);
|
||||
uint64_t alignment, HsaMemFlags flags);
|
||||
void fmm_print(uint32_t node);
|
||||
HSAKMT_STATUS fmm_release(void *address);
|
||||
HSAKMT_STATUS fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address);
|
||||
|
||||
@@ -108,6 +108,17 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
|
||||
HSAuint64 SizeInBytes,
|
||||
HsaMemFlags MemFlags,
|
||||
void **MemoryAddress)
|
||||
{
|
||||
return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags, MemoryAddress);
|
||||
}
|
||||
|
||||
/*
 * Evaluates to 1 when x is a non-zero power of two, 0 otherwise.
 * x is fully parenthesized so that expression arguments (e.g. `a | b`)
 * are tested as a whole instead of being split by operator precedence.
 * x is still evaluated more than once, so do not pass arguments with
 * side effects.
 */
#define POWER_OF_2(x) (((x) != 0) && (((x) & ((x) - 1)) == 0))
|
||||
|
||||
HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
|
||||
HSAuint64 SizeInBytes,
|
||||
HSAuint64 Alignment,
|
||||
HsaMemFlags MemFlags,
|
||||
void **MemoryAddress)
|
||||
{
|
||||
HSAKMT_STATUS result;
|
||||
uint32_t gpu_id;
|
||||
@@ -128,6 +139,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
|
||||
|
||||
page_size = PageSizeFromFlags(MemFlags.ui32.PageSize);
|
||||
|
||||
if (Alignment && (Alignment < page_size || !POWER_OF_2(Alignment)))
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
if (!MemoryAddress || !SizeInBytes || (SizeInBytes & (page_size-1)))
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
@@ -143,6 +157,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
if (MemFlags.ui32.Scratch) {
|
||||
if (Alignment) {
|
||||
// Scratch memory currently forced to SCRATCH_ALIGN
|
||||
pr_err("[%s] Alignment not supported for scratch memory: %d\n", __func__, PreferredNode);
|
||||
return HSAKMT_STATUS_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
*MemoryAddress = fmm_allocate_scratch(gpu_id, *MemoryAddress, SizeInBytes);
|
||||
|
||||
if (!(*MemoryAddress)) {
|
||||
@@ -165,7 +185,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
|
||||
MemFlags.ui32.CoarseGrain = 1;
|
||||
|
||||
*MemoryAddress = fmm_allocate_host(gpu_id, MemFlags.ui32.GTTAccess ? 0 : PreferredNode,
|
||||
*MemoryAddress, SizeInBytes, MemFlags);
|
||||
*MemoryAddress, SizeInBytes, Alignment, MemFlags);
|
||||
|
||||
if (!(*MemoryAddress)) {
|
||||
pr_err("[%s] failed to allocate %lu bytes from host\n",
|
||||
@@ -185,7 +205,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
|
||||
}
|
||||
|
||||
*MemoryAddress = fmm_allocate_device(gpu_id, PreferredNode, *MemoryAddress,
|
||||
SizeInBytes, MemFlags);
|
||||
SizeInBytes, Alignment, MemFlags);
|
||||
|
||||
if (!(*MemoryAddress)) {
|
||||
pr_err("[%s] failed to allocate %lu bytes from device\n",
|
||||
|
||||
@@ -337,7 +337,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t g
|
||||
size = ALIGN_UP(size, align);
|
||||
|
||||
if (DeviceLocal && !zfb_support)
|
||||
mem = fmm_allocate_device(gpu_id, NodeId, mem, size, flags);
|
||||
mem = fmm_allocate_device(gpu_id, NodeId, mem, size, 0, flags);
|
||||
else {
|
||||
/* VRAM under ZFB mode should be supported here without any
|
||||
* additional code
|
||||
@@ -352,7 +352,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t g
|
||||
cpu_id = 0;
|
||||
}
|
||||
}
|
||||
mem = fmm_allocate_host(gpu_id, cpu_id, mem, size, flags);
|
||||
mem = fmm_allocate_host(gpu_id, cpu_id, mem, size, 0, flags);
|
||||
}
|
||||
|
||||
if (!mem) {
|
||||
|
||||
Ссылка в новой задаче
Block a user