libhsakmt: Add alignment for memory allocations

Add a new API, hsaKmtAllocMemoryAlign, that takes an optional alignment
parameter for memory allocations. An alignment of 0 selects the default
alignment; a non-zero alignment must be a power of 2 and greater than or
equal to the page size.

Change-Id: Ic3fec43b3c4281f74dd33a57ab4143dcf76e1186
Signed-off-by: Chris Freehill <cfreehil@amd.com>
这个提交包含在:
David Yat Sin
2024-05-16 16:32:23 +00:00
提交者 Chris Freehill
父节点 1abd02af32
当前提交 a31e84eaef
修改 5 个文件,包含 63 行新增21 行删除
+15
查看文件
@@ -406,6 +406,21 @@ hsaKmtAllocMemory(
void** MemoryAddress //IN/OUT (page-aligned)
);
/**
Allocates a memory buffer with a specific alignment that may be accessed by the GPU.
If Alignment is 0, the default alignment is used, exactly as hsaKmtAllocMemory does
(hsaKmtAllocMemory forwards to this function with Alignment set to 0).
A non-zero Alignment must be a power of 2 and at least the page size selected by
MemFlags; otherwise HSAKMT_STATUS_INVALID_PARAMETER is returned.
A non-zero Alignment is not supported for scratch memory and yields
HSAKMT_STATUS_NOT_IMPLEMENTED.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtAllocMemoryAlign(
HSAuint32 PreferredNode, //IN
HSAuint64 SizeInBytes, //IN (non-zero multiple of page size)
HSAuint64 Alignment, //IN (0 for default, else power of 2 and >= page size)
HsaMemFlags MemFlags, //IN
void** MemoryAddress //IN/OUT (page-aligned)
);
/**
Frees a memory buffer
*/
+22 -15
查看文件
@@ -856,7 +856,7 @@ static void *aperture_allocate_area_aligned(manageable_aperture_t *app,
uint64_t MemorySizeInBytes,
uint64_t align)
{
return app->ops->allocate_area_aligned(app, address, MemorySizeInBytes, align);
return app->ops->allocate_area_aligned(app, address, MemorySizeInBytes, align ? align : app->align);
}
static void *aperture_allocate_area(manageable_aperture_t *app, void *address,
uint64_t MemorySizeInBytes)
@@ -1448,7 +1448,7 @@ void *fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeIn
static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes,
manageable_aperture_t *aperture, uint64_t *mmap_offset,
uint32_t ioc_flags, vm_object_t **vm_obj)
uint32_t ioc_flags, uint64_t alignment, vm_object_t **vm_obj)
{
void *mem = NULL;
vm_object_t *obj;
@@ -1459,7 +1459,7 @@ static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t Memo
/* Allocate address space */
pthread_mutex_lock(&aperture->fmm_mutex);
mem = aperture_allocate_area(aperture, address, MemorySizeInBytes);
mem = aperture_allocate_area_aligned(aperture, address, MemorySizeInBytes, alignment);
pthread_mutex_unlock(&aperture->fmm_mutex);
/*
@@ -1504,7 +1504,7 @@ static void *fmm_map_to_cpu(void *mem, uint64_t size, bool host_access,
}
static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,
manageable_aperture_t *aperture, HsaMemFlags mflags)
manageable_aperture_t *aperture, uint64_t alignment, HsaMemFlags mflags)
{
void *mem = NULL;
vm_object_t *vm_obj = NULL;
@@ -1515,7 +1515,7 @@ static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,
/* Allocate address space */
pthread_mutex_lock(&aperture->fmm_mutex);
mem = aperture_allocate_area(aperture, address, size);
mem = aperture_allocate_area_aligned(aperture, address, size, alignment);
/* assign handle 0 to vm_obj since no memory is allocated */
vm_obj = aperture_allocate_object(aperture, mem, 0,
size, mflags);
@@ -1533,7 +1533,7 @@ static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,
}
void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
uint64_t MemorySizeInBytes, HsaMemFlags mflags)
uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
{
manageable_aperture_t *aperture;
int32_t gpu_mem_id;
@@ -1564,7 +1564,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
/* special case for va allocation without vram alloc */
if (mflags.ui32.OnlyAddress)
return fmm_allocate_va(gpu_id, address, size, aperture, mflags);
return fmm_allocate_va(gpu_id, address, size, aperture, alignment, mflags);
/* special case for vram allocation without addr */
if(mflags.ui32.NoAddress)
@@ -1583,7 +1583,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT;
mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset,
ioc_flags, &vm_obj);
ioc_flags, alignment, &vm_obj);
if (mem && vm_obj) {
pthread_mutex_lock(&aperture->fmm_mutex);
@@ -1637,7 +1637,7 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
mem = __fmm_allocate_device(gpu_id, NULL, MemorySizeInBytes, aperture, NULL,
ioc_flags, &vm_obj);
ioc_flags, 0, &vm_obj);
if (mem && vm_obj) {
HsaMemFlags mflags;
@@ -1768,7 +1768,7 @@ static int bind_mem_to_numa(uint32_t node_id, void *mem,
}
static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *address,
uint64_t MemorySizeInBytes, HsaMemFlags mflags)
uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
{
manageable_aperture_t *aperture;
vm_object_t *vm_obj = NULL;
@@ -1822,7 +1822,7 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr
if (!mflags.ui32.NonPaged && svm.userptr_for_paged_mem) {
/* Allocate address space */
pthread_mutex_lock(&aperture->fmm_mutex);
mem = aperture_allocate_area(aperture, address, size);
mem = aperture_allocate_area_aligned(aperture, address, size, alignment);
pthread_mutex_unlock(&aperture->fmm_mutex);
if (!mem)
return NULL;
@@ -1854,7 +1854,7 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr
} else {
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_GTT;
mem = __fmm_allocate_device(preferred_gpu_id, address, size, aperture,
&mmap_offset, ioc_flags, &vm_obj);
&mmap_offset, ioc_flags, alignment, &vm_obj);
if (mem && mflags.ui32.HostAccess) {
void *ret = fmm_map_to_cpu(mem, MemorySizeInBytes,
@@ -1896,10 +1896,16 @@ out_release_area:
}
void *fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address,
uint64_t MemorySizeInBytes, HsaMemFlags mflags)
uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
{
if (is_dgpu)
return fmm_allocate_host_gpu(gpu_id, node_id, address, MemorySizeInBytes, mflags);
return fmm_allocate_host_gpu(gpu_id, node_id, address, MemorySizeInBytes, alignment, mflags);
if (alignment) {//Alignment not supported on non-dgpu
pr_err("Non-default alignment not supported on non-dgpu\n");
return NULL;
}
return fmm_allocate_host_cpu(address, MemorySizeInBytes, mflags);
}
@@ -2365,7 +2371,7 @@ static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd)
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
mem = __fmm_allocate_device(gpu_id, NULL, PAGE_SIZE, aperture,
&mmap_offset, ioc_flags, &vm_obj);
&mmap_offset, ioc_flags, 0, &vm_obj);
if (!mem || !vm_obj)
return NULL;
@@ -3455,6 +3461,7 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr,
KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
(coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) |
(ext_coherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0),
0,
&obj);
if (!svm_addr)
return HSAKMT_STATUS_ERROR;
+2 -2
查看文件
@@ -52,10 +52,10 @@ void fmm_destroy_process_apertures(void);
/* Memory interface */
void *fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes);
void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
uint64_t MemorySizeInBytes, HsaMemFlags flags);
uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags flags);
void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_t doorbell_offset);
void *fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address, uint64_t MemorySizeInBytes,
HsaMemFlags flags);
uint64_t alignment, HsaMemFlags flags);
void fmm_print(uint32_t node);
HSAKMT_STATUS fmm_release(void *address);
HSAKMT_STATUS fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address);
+22 -2
查看文件
@@ -108,6 +108,17 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
HSAuint64 SizeInBytes,
HsaMemFlags MemFlags,
void **MemoryAddress)
{
return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags, MemoryAddress);
}
/*
 * Evaluates to 1 when x is a non-zero power of two, and to 0 otherwise.
 * Every use of the argument is parenthesized so that an expression such as
 * "a | b" is tested as a whole instead of being split by operator precedence.
 * Note: x is evaluated more than once, so do not pass an argument with side
 * effects.
 */
#define POWER_OF_2(x) (((x) && !((x) & ((x) - 1))) ? 1 : 0)
HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
HSAuint64 SizeInBytes,
HSAuint64 Alignment,
HsaMemFlags MemFlags,
void **MemoryAddress)
{
HSAKMT_STATUS result;
uint32_t gpu_id;
@@ -128,6 +139,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
page_size = PageSizeFromFlags(MemFlags.ui32.PageSize);
if (Alignment && (Alignment < page_size || !POWER_OF_2(Alignment)))
return HSAKMT_STATUS_INVALID_PARAMETER;
if (!MemoryAddress || !SizeInBytes || (SizeInBytes & (page_size-1)))
return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -143,6 +157,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
return HSAKMT_STATUS_INVALID_PARAMETER;
if (MemFlags.ui32.Scratch) {
if (Alignment) {
// Scratch memory currently forced to SCRATCH_ALIGN
pr_err("[%s] Alignment not supported for scratch memory: %d\n", __func__, PreferredNode);
return HSAKMT_STATUS_NOT_IMPLEMENTED;
}
*MemoryAddress = fmm_allocate_scratch(gpu_id, *MemoryAddress, SizeInBytes);
if (!(*MemoryAddress)) {
@@ -165,7 +185,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
MemFlags.ui32.CoarseGrain = 1;
*MemoryAddress = fmm_allocate_host(gpu_id, MemFlags.ui32.GTTAccess ? 0 : PreferredNode,
*MemoryAddress, SizeInBytes, MemFlags);
*MemoryAddress, SizeInBytes, Alignment, MemFlags);
if (!(*MemoryAddress)) {
pr_err("[%s] failed to allocate %lu bytes from host\n",
@@ -185,7 +205,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
}
*MemoryAddress = fmm_allocate_device(gpu_id, PreferredNode, *MemoryAddress,
SizeInBytes, MemFlags);
SizeInBytes, Alignment, MemFlags);
if (!(*MemoryAddress)) {
pr_err("[%s] failed to allocate %lu bytes from device\n",
+2 -2
查看文件
@@ -337,7 +337,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t g
size = ALIGN_UP(size, align);
if (DeviceLocal && !zfb_support)
mem = fmm_allocate_device(gpu_id, NodeId, mem, size, flags);
mem = fmm_allocate_device(gpu_id, NodeId, mem, size, 0, flags);
else {
/* VRAM under ZFB mode should be supported here without any
* additional code
@@ -352,7 +352,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t g
cpu_id = 0;
}
}
mem = fmm_allocate_host(gpu_id, cpu_id, mem, size, flags);
mem = fmm_allocate_host(gpu_id, cpu_id, mem, size, 0, flags);
}
if (!mem) {