Add extended coherence memory flag
Add support for a new memory allocation flag that provides
system-scope coherent atomics.
Change-Id: I426d66223e8d2b570f69b4c0e61145ce9b2290d2
[ROCm/ROCR-Runtime commit: 8e06dce573]
This commit is contained in:
@@ -566,7 +566,8 @@ typedef struct _HsaMemFlags
|
||||
unsigned int Uncached: 1; // Caching flag for fine-grained memory on A+A HW platform
|
||||
unsigned int NoAddress: 1; // only do vram allocation, return a handle, not allocate virtual address.
|
||||
unsigned int OnlyAddress: 1; // only do virtual address allocation without vram allocation.
|
||||
unsigned int Reserved: 12;
|
||||
unsigned int ExtendedCoherent: 1; // system-scope coherence on atomic instructions
|
||||
unsigned int Reserved: 11;
|
||||
|
||||
} ui32;
|
||||
HSAuint32 Value;
|
||||
@@ -1372,6 +1373,7 @@ typedef enum _HSA_SVM_FLAGS {
|
||||
HSA_SVM_FLAG_GPU_EXEC = 0x00000010, // Allow execution on GPU
|
||||
HSA_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020, // GPUs mostly read, may allow similar optimizations as RO, but writes fault
|
||||
HSA_SVM_FLAG_GPU_ALWAYS_MAPPED = 0x00000040, // Keep GPU memory mapping always valid as if XNACK is disabled
|
||||
HSA_SVM_FLAG_EXT_COHERENT = 0x00000080, // Fine grained coherency between all devices using device-scope atomics
|
||||
} HSA_SVM_FLAGS;
|
||||
|
||||
typedef enum _HSA_SVM_ATTR_TYPE {
|
||||
|
||||
@@ -1027,6 +1027,7 @@ struct kfd_ioctl_acquire_vm_args {
|
||||
#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27)
|
||||
#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26)
|
||||
#define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25)
|
||||
#define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24)
|
||||
|
||||
/* Allocate memory for later SVM (shared virtual memory) mapping.
|
||||
*
|
||||
@@ -1375,6 +1376,8 @@ struct kfd_ioctl_cross_memory_copy_args {
|
||||
#define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020
|
||||
/* Keep GPU memory mapping always valid as if XNACK is disabled */
|
||||
#define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040
|
||||
/* Fine grained coherency between all devices using device-scope atomics */
|
||||
#define KFD_IOCTL_SVM_FLAG_EXT_COHERENT 0x00000080
|
||||
|
||||
/**
|
||||
* kfd_ioctl_svm_op - SVM ioctl operations
|
||||
|
||||
@@ -291,6 +291,8 @@ static HSAKMT_STATUS get_mem_info_svm_api(uint64_t address, uint32_t gpu_id)
|
||||
pr_err("GPU exec allowed\n");
|
||||
if (args->attrs[i].value & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)
|
||||
pr_err("GPU always mapped\n");
|
||||
if (args->attrs[i].value & KFD_IOCTL_SVM_FLAG_EXT_COHERENT)
|
||||
pr_err("Extended-scope fine grained coherency between devices\n");
|
||||
break;
|
||||
default:
|
||||
pr_debug("get invalid attr type 0x%x\n", args->attrs[i].type);
|
||||
|
||||
@@ -1017,6 +1017,8 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags)
|
||||
mflags.ui32.ReadOnly = 1;
|
||||
if (!(ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT))
|
||||
mflags.ui32.CoarseGrain = 1;
|
||||
if (ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT)
|
||||
mflags.ui32.ExtendedCoherent = 1;
|
||||
if (ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC)
|
||||
mflags.ui32.HostAccess = 1;
|
||||
return mflags;
|
||||
@@ -1024,7 +1026,8 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags)
|
||||
|
||||
static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
|
||||
uint64_t size,
|
||||
bool coarse_grain)
|
||||
bool coarse_grain,
|
||||
bool ext_coherent)
|
||||
{
|
||||
struct kfd_ioctl_svm_args *args;
|
||||
size_t s_attr;
|
||||
@@ -1035,15 +1038,17 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
|
||||
if (!g_first_gpu_mem)
|
||||
return HSAKMT_STATUS_ERROR;
|
||||
|
||||
s_attr = sizeof(struct kfd_ioctl_svm_attribute);
|
||||
s_attr = 2 * sizeof(struct kfd_ioctl_svm_attribute);
|
||||
args = alloca(sizeof(*args) + s_attr);
|
||||
args->start_addr = aligned_addr;
|
||||
args->size = aligned_size;
|
||||
args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
|
||||
args->nattr = 1;
|
||||
args->nattr = 2;
|
||||
args->attrs[0].type = coarse_grain ?
|
||||
HSA_SVM_ATTR_CLR_FLAGS : HSA_SVM_ATTR_SET_FLAGS;
|
||||
args->attrs[0].value = HSA_SVM_FLAG_COHERENT;
|
||||
args->attrs[1].type = ext_coherent ? HSA_SVM_ATTR_CLR_FLAGS : HSA_SVM_ATTR_SET_FLAGS;
|
||||
args->attrs[1].value = HSA_SVM_FLAG_EXT_COHERENT;
|
||||
pr_debug("Registering to SVM %p size: %ld\n", (void*)aligned_addr,
|
||||
aligned_size);
|
||||
/* Driver does one copy_from_user, with extra attrs size */
|
||||
@@ -1575,6 +1580,9 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
|
||||
if (mflags.ui32.Uncached || svm.disable_cache)
|
||||
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;
|
||||
|
||||
if (mflags.ui32.ExtendedCoherent)
|
||||
ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT;
|
||||
|
||||
mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset,
|
||||
ioc_flags, &vm_obj);
|
||||
|
||||
@@ -1639,7 +1647,7 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
|
||||
mflags.Value = 0;
|
||||
mflags.ui32.NonPaged = 1;
|
||||
mflags.ui32.HostAccess = 1;
|
||||
mflags.ui32.Reserved = 0xBe1;
|
||||
mflags.ui32.Reserved = 0x3e1;
|
||||
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
vm_obj->mflags = mflags;
|
||||
@@ -3409,8 +3417,11 @@ bool fmm_get_handle(void *address, uint64_t *handle)
|
||||
return found;
|
||||
}
|
||||
|
||||
static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size,
|
||||
vm_object_t **obj_ret, bool coarse_grain)
|
||||
static HSAKMT_STATUS fmm_register_user_memory(void *addr,
|
||||
HSAuint64 size,
|
||||
vm_object_t **obj_ret,
|
||||
bool coarse_grain,
|
||||
bool ext_coherent)
|
||||
{
|
||||
manageable_aperture_t *aperture = svm.dgpu_aperture;
|
||||
HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1);
|
||||
@@ -3435,7 +3446,8 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size,
|
||||
&aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR |
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
|
||||
(coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT),
|
||||
(coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) |
|
||||
(ext_coherent ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT),
|
||||
&obj);
|
||||
if (!svm_addr)
|
||||
return HSAKMT_STATUS_ERROR;
|
||||
@@ -3472,7 +3484,8 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size,
|
||||
HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes,
|
||||
uint32_t *gpu_id_array,
|
||||
uint32_t gpu_id_array_size,
|
||||
bool coarse_grain)
|
||||
bool coarse_grain,
|
||||
bool ext_coherent)
|
||||
{
|
||||
manageable_aperture_t *aperture = NULL;
|
||||
vm_object_t *object = NULL;
|
||||
@@ -3481,6 +3494,9 @@ HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes,
|
||||
if (gpu_id_array_size > 0 && !gpu_id_array)
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
if (coarse_grain && ext_coherent)
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
object = vm_find_object(address, size_in_bytes, &aperture);
|
||||
if (!object) {
|
||||
if (!is_dgpu)
|
||||
@@ -3489,9 +3505,17 @@ HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes,
|
||||
|
||||
/* Register a new user ptr */
|
||||
if (svm.is_svm_api_supported)
|
||||
return fmm_register_mem_svm_api(address, size_in_bytes, coarse_grain);
|
||||
return fmm_register_mem_svm_api(address,
|
||||
size_in_bytes,
|
||||
coarse_grain,
|
||||
ext_coherent);
|
||||
|
||||
ret = fmm_register_user_memory(address,
|
||||
size_in_bytes,
|
||||
&object,
|
||||
coarse_grain,
|
||||
ext_coherent);
|
||||
|
||||
ret = fmm_register_user_memory(address, size_in_bytes, &object, coarse_grain);
|
||||
if (ret != HSAKMT_STATUS_SUCCESS)
|
||||
return ret;
|
||||
if (gpu_id_array_size == 0)
|
||||
|
||||
@@ -77,7 +77,8 @@ HSAKMT_STATUS fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSA
|
||||
HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes,
|
||||
uint32_t *gpu_id_array,
|
||||
uint32_t gpu_id_array_size,
|
||||
bool coarse_grain);
|
||||
bool coarse_grain,
|
||||
bool ext_coherent);
|
||||
HSAKMT_STATUS fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle,
|
||||
HsaGraphicsResourceInfo *GraphicsResourceInfo,
|
||||
uint32_t *gpu_id_array,
|
||||
|
||||
@@ -136,6 +136,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
|
||||
} else
|
||||
*MemoryAddress = NULL;
|
||||
|
||||
if ((MemFlags.ui32.CoarseGrain && MemFlags.ui32.ExtendedCoherent) ||
|
||||
(MemFlags.ui32.CoarseGrain && MemFlags.ui32.Uncached) ||
|
||||
(MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.Uncached))
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
if (MemFlags.ui32.Scratch) {
|
||||
*MemoryAddress = fmm_allocate_scratch(gpu_id, *MemoryAddress, SizeInBytes);
|
||||
|
||||
@@ -242,7 +247,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
|
||||
return fmm_register_memory(MemoryAddress, MemorySizeInBytes,
|
||||
NULL, 0, true);
|
||||
NULL, 0, true, false);
|
||||
}
|
||||
|
||||
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
|
||||
@@ -268,7 +273,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
|
||||
ret = fmm_register_memory(MemoryAddress, MemorySizeInBytes,
|
||||
gpu_id_array,
|
||||
NumberOfNodes*sizeof(uint32_t),
|
||||
true);
|
||||
true, false);
|
||||
if (ret != HSAKMT_STATUS_SUCCESS)
|
||||
free(gpu_id_array);
|
||||
}
|
||||
@@ -286,6 +291,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress,
|
||||
pr_debug("[%s] address %p\n",
|
||||
__func__, MemoryAddress);
|
||||
|
||||
if (MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain)
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
// Registered memory should be ordinary paged host memory.
|
||||
if ((MemFlags.ui32.HostAccess != 1) || (MemFlags.ui32.NonPaged == 1))
|
||||
return HSAKMT_STATUS_NOT_SUPPORTED;
|
||||
@@ -295,7 +303,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress,
|
||||
return HSAKMT_STATUS_NOT_SUPPORTED;
|
||||
|
||||
ret = fmm_register_memory(MemoryAddress, MemorySizeInBytes,
|
||||
NULL, 0, MemFlags.ui32.CoarseGrain);
|
||||
NULL, 0, MemFlags.ui32.CoarseGrain, MemFlags.ui32.ExtendedCoherent);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
مرجع در شماره جدید
Block a user