Add extended coherence memory flag

Add support for a new memory allocation flag that provides
system-scope coherent atomics.

Change-Id: I426d66223e8d2b570f69b4c0e61145ce9b2290d2


[ROCm/ROCR-Runtime commit: 8e06dce573]
This commit is contained in:
David Yat Sin
2023-07-21 16:35:41 -04:00
والد abc017f83b
کامیت 351cbe9dc7
6فایلهای تغییر یافته به همراه55 افزوده شده و 15 حذف شده
@@ -566,7 +566,8 @@ typedef struct _HsaMemFlags
unsigned int Uncached: 1; // Caching flag for fine-grained memory on A+A HW platform
unsigned int NoAddress: 1; // Only do VRAM allocation and return a handle; do not allocate a virtual address.
unsigned int OnlyAddress: 1; // Only do virtual address allocation, without VRAM allocation.
unsigned int Reserved: 12;
unsigned int ExtendedCoherent: 1; // system-scope coherence on atomic instructions
unsigned int Reserved: 11;
} ui32;
HSAuint32 Value;
@@ -1372,6 +1373,7 @@ typedef enum _HSA_SVM_FLAGS {
HSA_SVM_FLAG_GPU_EXEC = 0x00000010, // Allow execution on GPU
HSA_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020, // GPUs mostly read, may allow similar optimizations as RO, but writes fault
HSA_SVM_FLAG_GPU_ALWAYS_MAPPED = 0x00000040, // Keep GPU memory mapping always valid as if XNACK is disabled
HSA_SVM_FLAG_EXT_COHERENT = 0x00000080, // Fine grained coherency between all devices using device-scope atomics
} HSA_SVM_FLAGS;
typedef enum _HSA_SVM_ATTR_TYPE {
@@ -1027,6 +1027,7 @@ struct kfd_ioctl_acquire_vm_args {
#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27)
#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26)
#define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25)
#define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24)
/* Allocate memory for later SVM (shared virtual memory) mapping.
*
@@ -1375,6 +1376,8 @@ struct kfd_ioctl_cross_memory_copy_args {
#define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020
/* Keep GPU memory mapping always valid as if XNACK is disabled */
#define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040
/* Fine grained coherency between all devices using device-scope atomics */
#define KFD_IOCTL_SVM_FLAG_EXT_COHERENT 0x00000080
/**
* kfd_ioctl_svm_op - SVM ioctl operations
@@ -291,6 +291,8 @@ static HSAKMT_STATUS get_mem_info_svm_api(uint64_t address, uint32_t gpu_id)
pr_err("GPU exec allowed\n");
if (args->attrs[i].value & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)
pr_err("GPU always mapped\n");
if (args->attrs[i].value & KFD_IOCTL_SVM_FLAG_EXT_COHERENT)
pr_err("Extended-scope fine grained coherency between devices\n");
break;
default:
pr_debug("get invalid attr type 0x%x\n", args->attrs[i].type);
+34 -10
مشاهده پرونده
@@ -1017,6 +1017,8 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags)
mflags.ui32.ReadOnly = 1;
if (!(ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT))
mflags.ui32.CoarseGrain = 1;
if (ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT)
mflags.ui32.ExtendedCoherent = 1;
if (ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC)
mflags.ui32.HostAccess = 1;
return mflags;
@@ -1024,7 +1026,8 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags)
static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
uint64_t size,
bool coarse_grain)
bool coarse_grain,
bool ext_coherent)
{
struct kfd_ioctl_svm_args *args;
size_t s_attr;
@@ -1035,15 +1038,17 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
if (!g_first_gpu_mem)
return HSAKMT_STATUS_ERROR;
s_attr = sizeof(struct kfd_ioctl_svm_attribute);
s_attr = 2 * sizeof(struct kfd_ioctl_svm_attribute);
args = alloca(sizeof(*args) + s_attr);
args->start_addr = aligned_addr;
args->size = aligned_size;
args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
args->nattr = 1;
args->nattr = 2;
/* attrs[0]: a coarse-grained request clears the COHERENT flag, otherwise it is set. */
args->attrs[0].type = coarse_grain ?
HSA_SVM_ATTR_CLR_FLAGS : HSA_SVM_ATTR_SET_FLAGS;
args->attrs[0].value = HSA_SVM_FLAG_COHERENT;
/*
 * attrs[1]: set EXT_COHERENT only when the caller asked for it and clear it
 * otherwise. The polarity is the opposite of coarse_grain above: ext_coherent
 * is an opt-in, so true must map to SET_FLAGS.
 */
args->attrs[1].type = ext_coherent ? HSA_SVM_ATTR_SET_FLAGS : HSA_SVM_ATTR_CLR_FLAGS;
args->attrs[1].value = HSA_SVM_FLAG_EXT_COHERENT;
pr_debug("Registering to SVM %p size: %ld\n", (void*)aligned_addr,
aligned_size);
/* Driver does one copy_from_user, with extra attrs size */
@@ -1575,6 +1580,9 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
if (mflags.ui32.Uncached || svm.disable_cache)
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;
if (mflags.ui32.ExtendedCoherent)
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT;
mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset,
ioc_flags, &vm_obj);
@@ -1639,7 +1647,7 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
mflags.Value = 0;
mflags.ui32.NonPaged = 1;
mflags.ui32.HostAccess = 1;
mflags.ui32.Reserved = 0xBe1;
mflags.ui32.Reserved = 0x3e1;
pthread_mutex_lock(&aperture->fmm_mutex);
vm_obj->mflags = mflags;
@@ -3409,8 +3417,11 @@ bool fmm_get_handle(void *address, uint64_t *handle)
return found;
}
static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size,
vm_object_t **obj_ret, bool coarse_grain)
static HSAKMT_STATUS fmm_register_user_memory(void *addr,
HSAuint64 size,
vm_object_t **obj_ret,
bool coarse_grain,
bool ext_coherent)
{
manageable_aperture_t *aperture = svm.dgpu_aperture;
HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1);
@@ -3435,7 +3446,8 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size,
&aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR |
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
(coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT),
(coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) |
/* ext_coherent is an opt-in: request the flag only when it is true. */
(ext_coherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0),
&obj);
if (!svm_addr)
return HSAKMT_STATUS_ERROR;
@@ -3472,7 +3484,8 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size,
HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes,
uint32_t *gpu_id_array,
uint32_t gpu_id_array_size,
bool coarse_grain)
bool coarse_grain,
bool ext_coherent)
{
manageable_aperture_t *aperture = NULL;
vm_object_t *object = NULL;
@@ -3481,6 +3494,9 @@ HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes,
if (gpu_id_array_size > 0 && !gpu_id_array)
return HSAKMT_STATUS_INVALID_PARAMETER;
/* A registration cannot be both coarse-grained and extended-coherent; reject the combination. */
if (coarse_grain && ext_coherent)
return HSAKMT_STATUS_INVALID_PARAMETER;
object = vm_find_object(address, size_in_bytes, &aperture);
if (!object) {
if (!is_dgpu)
@@ -3489,9 +3505,17 @@ HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes,
/* Register a new user ptr */
if (svm.is_svm_api_supported)
return fmm_register_mem_svm_api(address, size_in_bytes, coarse_grain);
return fmm_register_mem_svm_api(address,
size_in_bytes,
coarse_grain,
ext_coherent);
ret = fmm_register_user_memory(address,
size_in_bytes,
&object,
coarse_grain,
ext_coherent);
ret = fmm_register_user_memory(address, size_in_bytes, &object, coarse_grain);
if (ret != HSAKMT_STATUS_SUCCESS)
return ret;
if (gpu_id_array_size == 0)
@@ -77,7 +77,8 @@ HSAKMT_STATUS fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSA
HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes,
uint32_t *gpu_id_array,
uint32_t gpu_id_array_size,
bool coarse_grain);
bool coarse_grain,
bool ext_coherent);
HSAKMT_STATUS fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle,
HsaGraphicsResourceInfo *GraphicsResourceInfo,
uint32_t *gpu_id_array,
@@ -136,6 +136,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
} else
*MemoryAddress = NULL;
/*
 * CoarseGrain, ExtendedCoherent and Uncached are treated as mutually
 * exclusive: reject any request that sets two (or more) of them.
 */
if ((MemFlags.ui32.CoarseGrain && MemFlags.ui32.ExtendedCoherent) ||
(MemFlags.ui32.CoarseGrain && MemFlags.ui32.Uncached) ||
(MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.Uncached))
return HSAKMT_STATUS_INVALID_PARAMETER;
if (MemFlags.ui32.Scratch) {
*MemoryAddress = fmm_allocate_scratch(gpu_id, *MemoryAddress, SizeInBytes);
@@ -242,7 +247,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
return HSAKMT_STATUS_SUCCESS;
return fmm_register_memory(MemoryAddress, MemorySizeInBytes,
NULL, 0, true);
NULL, 0, true, false);
}
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
@@ -268,7 +273,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
ret = fmm_register_memory(MemoryAddress, MemorySizeInBytes,
gpu_id_array,
NumberOfNodes*sizeof(uint32_t),
true);
true, false);
if (ret != HSAKMT_STATUS_SUCCESS)
free(gpu_id_array);
}
@@ -286,6 +291,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress,
pr_debug("[%s] address %p\n",
__func__, MemoryAddress);
/* ExtendedCoherent and CoarseGrain cannot be requested together; reject the combination up front. */
if (MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain)
return HSAKMT_STATUS_INVALID_PARAMETER;
// Registered memory should be ordinary paged host memory.
if ((MemFlags.ui32.HostAccess != 1) || (MemFlags.ui32.NonPaged == 1))
return HSAKMT_STATUS_NOT_SUPPORTED;
@@ -295,7 +303,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress,
return HSAKMT_STATUS_NOT_SUPPORTED;
ret = fmm_register_memory(MemoryAddress, MemorySizeInBytes,
NULL, 0, MemFlags.ui32.CoarseGrain);
NULL, 0, MemFlags.ui32.CoarseGrain, MemFlags.ui32.ExtendedCoherent);
return ret;
}