diff --git a/projects/rocr-runtime/include/hsakmttypes.h b/projects/rocr-runtime/include/hsakmttypes.h index ef0a4a23a4..2937c35915 100644 --- a/projects/rocr-runtime/include/hsakmttypes.h +++ b/projects/rocr-runtime/include/hsakmttypes.h @@ -566,7 +566,8 @@ typedef struct _HsaMemFlags unsigned int Uncached: 1; // Caching flag for fine-grained memory on A+A HW platform unsigned int NoAddress: 1; // only do vram allocation, return a handle, not allocate virtual address. unsigned int OnlyAddress: 1; // only do virtal address allocation without vram allocation. - unsigned int Reserved: 12; + unsigned int ExtendedCoherent: 1; // system-scope coherence on atomic instructions + unsigned int Reserved: 11; } ui32; HSAuint32 Value; @@ -1372,6 +1373,7 @@ typedef enum _HSA_SVM_FLAGS { HSA_SVM_FLAG_GPU_EXEC = 0x00000010, // Allow execution on GPU HSA_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020, // GPUs mostly read, may allow similar optimizations as RO, but writes fault HSA_SVM_FLAG_GPU_ALWAYS_MAPPED = 0x00000040, // Keep GPU memory mapping always valid as if XNACK is disable + HSA_SVM_FLAG_EXT_COHERENT = 0x00000080, // Fine grained coherency between all devices using device-scope atomics } HSA_SVM_FLAGS; typedef enum _HSA_SVM_ATTR_TYPE { diff --git a/projects/rocr-runtime/include/linux/kfd_ioctl.h b/projects/rocr-runtime/include/linux/kfd_ioctl.h index c56347a7d6..d3e6cee06d 100644 --- a/projects/rocr-runtime/include/linux/kfd_ioctl.h +++ b/projects/rocr-runtime/include/linux/kfd_ioctl.h @@ -1027,6 +1027,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) +#define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) /* Allocate memory for later SVM (shared virtual memory) mapping. 
* @@ -1375,6 +1376,8 @@ struct kfd_ioctl_cross_memory_copy_args { #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020 /* Keep GPU memory mapping always valid as if XNACK is disable */ #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040 +/* Fine grained coherency between all devices using device-scope atomics */ +#define KFD_IOCTL_SVM_FLAG_EXT_COHERENT 0x00000080 /** * kfd_ioctl_svm_op - SVM ioctl operations diff --git a/projects/rocr-runtime/src/events.c b/projects/rocr-runtime/src/events.c index 6d3cd70805..9ec199ac7a 100644 --- a/projects/rocr-runtime/src/events.c +++ b/projects/rocr-runtime/src/events.c @@ -291,6 +291,8 @@ static HSAKMT_STATUS get_mem_info_svm_api(uint64_t address, uint32_t gpu_id) pr_err("GPU exec allowed\n"); if (args->attrs[i].value & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED) pr_err("GPU always mapped\n"); + if (args->attrs[i].value & KFD_IOCTL_SVM_FLAG_EXT_COHERENT) + pr_err("Extended-scope fine grained coherency between devices\n"); break; default: pr_debug("get invalid attr type 0x%x\n", args->attrs[i].type); diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c index d60a9ad647..76ddc40f4f 100644 --- a/projects/rocr-runtime/src/fmm.c +++ b/projects/rocr-runtime/src/fmm.c @@ -1017,6 +1017,8 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags) mflags.ui32.ReadOnly = 1; if (!(ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)) mflags.ui32.CoarseGrain = 1; + if (ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT) + mflags.ui32.ExtendedCoherent = 1; if (ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) mflags.ui32.HostAccess = 1; return mflags; @@ -1024,7 +1026,8 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags) static HSAKMT_STATUS fmm_register_mem_svm_api(void *address, uint64_t size, - bool coarse_grain) + bool coarse_grain, + bool ext_coherent) { struct kfd_ioctl_svm_args *args; size_t s_attr; @@ -1035,15 +1038,17 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address, if 
(!g_first_gpu_mem) return HSAKMT_STATUS_ERROR; - s_attr = sizeof(struct kfd_ioctl_svm_attribute); + s_attr = 2 * sizeof(struct kfd_ioctl_svm_attribute); args = alloca(sizeof(*args) + s_attr); args->start_addr = aligned_addr; args->size = aligned_size; args->op = KFD_IOCTL_SVM_OP_SET_ATTR; - args->nattr = 1; + args->nattr = 2; args->attrs[0].type = coarse_grain ? HSA_SVM_ATTR_CLR_FLAGS : HSA_SVM_ATTR_SET_FLAGS; args->attrs[0].value = HSA_SVM_FLAG_COHERENT; + args->attrs[1].type = ext_coherent ? HSA_SVM_ATTR_SET_FLAGS : HSA_SVM_ATTR_CLR_FLAGS; /* unlike coarse_grain, ext_coherent requests the flag: set it when true, clear it otherwise */ + args->attrs[1].value = HSA_SVM_FLAG_EXT_COHERENT; pr_debug("Registering to SVM %p size: %ld\n", (void*)aligned_addr, aligned_size); /* Driver does one copy_from_user, with extra attrs size */ @@ -1575,6 +1580,9 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address, if (mflags.ui32.Uncached || svm.disable_cache) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; + if (mflags.ui32.ExtendedCoherent) + ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT; + mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset, ioc_flags, &vm_obj); @@ -1639,7 +1647,7 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, mflags.Value = 0; mflags.ui32.NonPaged = 1; mflags.ui32.HostAccess = 1; - mflags.ui32.Reserved = 0xBe1; + mflags.ui32.Reserved = 0x3e1; pthread_mutex_lock(&aperture->fmm_mutex); vm_obj->mflags = mflags; @@ -3409,8 +3417,11 @@ bool fmm_get_handle(void *address, uint64_t *handle) return found; } -static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size, - vm_object_t **obj_ret, bool coarse_grain) +static HSAKMT_STATUS fmm_register_user_memory(void *addr, + HSAuint64 size, + vm_object_t **obj_ret, + bool coarse_grain, + bool ext_coherent) { manageable_aperture_t *aperture = svm.dgpu_aperture; HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1); @@ -3435,7 +3446,8 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size, &aligned_addr, 
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | - (coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT), + (coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) | + (ext_coherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0), /* request EXT_COHERENT only when the caller asked for it */ &obj); if (!svm_addr) return HSAKMT_STATUS_ERROR; @@ -3472,7 +3484,8 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size, HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes, uint32_t *gpu_id_array, uint32_t gpu_id_array_size, - bool coarse_grain) + bool coarse_grain, + bool ext_coherent) { manageable_aperture_t *aperture = NULL; vm_object_t *object = NULL; @@ -3481,6 +3494,9 @@ HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes, if (gpu_id_array_size > 0 && !gpu_id_array) return HSAKMT_STATUS_INVALID_PARAMETER; + if (coarse_grain && ext_coherent) + return HSAKMT_STATUS_INVALID_PARAMETER; + object = vm_find_object(address, size_in_bytes, &aperture); if (!object) { if (!is_dgpu) @@ -3489,9 +3505,17 @@ HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes, /* Register a new user ptr */ if (svm.is_svm_api_supported) - return fmm_register_mem_svm_api(address, size_in_bytes, coarse_grain); + return fmm_register_mem_svm_api(address, + size_in_bytes, + coarse_grain, + ext_coherent); + + ret = fmm_register_user_memory(address, + size_in_bytes, + &object, + coarse_grain, + ext_coherent); - ret = fmm_register_user_memory(address, size_in_bytes, &object, coarse_grain); if (ret != HSAKMT_STATUS_SUCCESS) return ret; if (gpu_id_array_size == 0) diff --git a/projects/rocr-runtime/src/fmm.h b/projects/rocr-runtime/src/fmm.h index 6cd7898cac..b8c9b84bfd 100644 --- a/projects/rocr-runtime/src/fmm.h +++ b/projects/rocr-runtime/src/fmm.h @@ -77,7 +77,8 @@ HSAKMT_STATUS fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSA HSAKMT_STATUS fmm_register_memory(void *address, uint64_t size_in_bytes, uint32_t 
*gpu_id_array, uint32_t gpu_id_array_size, - bool coarse_grain); + bool coarse_grain, + bool ext_coherent); HSAKMT_STATUS fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle, HsaGraphicsResourceInfo *GraphicsResourceInfo, uint32_t *gpu_id_array, diff --git a/projects/rocr-runtime/src/memory.c b/projects/rocr-runtime/src/memory.c index 6b3eaf496b..ba33abda3e 100644 --- a/projects/rocr-runtime/src/memory.c +++ b/projects/rocr-runtime/src/memory.c @@ -136,6 +136,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode, } else *MemoryAddress = NULL; + if ((MemFlags.ui32.CoarseGrain && MemFlags.ui32.ExtendedCoherent) || + (MemFlags.ui32.CoarseGrain && MemFlags.ui32.Uncached) || + (MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.Uncached)) + return HSAKMT_STATUS_INVALID_PARAMETER; + if (MemFlags.ui32.Scratch) { *MemoryAddress = fmm_allocate_scratch(gpu_id, *MemoryAddress, SizeInBytes); @@ -242,7 +247,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress, return HSAKMT_STATUS_SUCCESS; return fmm_register_memory(MemoryAddress, MemorySizeInBytes, - NULL, 0, true); + NULL, 0, true, false); } HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress, @@ -268,7 +273,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress, ret = fmm_register_memory(MemoryAddress, MemorySizeInBytes, gpu_id_array, NumberOfNodes*sizeof(uint32_t), - true); + true, false); if (ret != HSAKMT_STATUS_SUCCESS) free(gpu_id_array); } @@ -286,6 +291,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress, pr_debug("[%s] address %p\n", __func__, MemoryAddress); + if (MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain) + return HSAKMT_STATUS_INVALID_PARAMETER; + // Registered memory should be ordinary paged host memory. 
if ((MemFlags.ui32.HostAccess != 1) || (MemFlags.ui32.NonPaged == 1)) return HSAKMT_STATUS_NOT_SUPPORTED; @@ -295,7 +303,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress, return HSAKMT_STATUS_NOT_SUPPORTED; ret = fmm_register_memory(MemoryAddress, MemorySizeInBytes, - NULL, 0, MemFlags.ui32.CoarseGrain); + NULL, 0, MemFlags.ui32.CoarseGrain, MemFlags.ui32.ExtendedCoherent); return ret; }