Calculate and store the first gpu mem during initializaiton

Previously we used the first dgpu mem, but after careful examination, we
found it only needs to be a GPU, so we modify the code to reflect that as
well.

Change-Id: I069d9b8e247aed55c1f885b79f743ea8e03ddf93
Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>


[ROCm/ROCR-Runtime commit: fe04dd6890]
This commit is contained in:
Yong Zhao
2018-08-03 03:01:22 -04:00
zatwierdzone przez Yong Zhao
rodzic a1348526a3
commit 569a2dc80f
+28 -35
Wyświetl plik
@@ -171,6 +171,8 @@ typedef struct {
*/
static gpu_mem_t *gpu_mem;
static unsigned int gpu_mem_count;
static gpu_mem_t *g_first_gpu_mem;
static bool hsa_debug;
static void *dgpu_shared_aperture_base;
static void *dgpu_shared_aperture_limit;
@@ -238,24 +240,6 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
void *address);
static void print_device_id_array(uint32_t *device_id_array, uint32_t device_id_array_size);
static int32_t find_first_dgpu(HSAuint32 *gpu_id)
{
uint32_t i;
*gpu_id = NON_VALID_GPU_ID;
for (i = 0; i < gpu_mem_count; i++) {
if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
continue;
if (!topology_is_dgpu(gpu_mem[i].device_id))
continue;
*gpu_id = gpu_mem[i].gpu_id;
return i;
}
return -1;
}
static vm_area_t *vm_create_and_init_area(void *start, void *end)
{
vm_area_t *area = (vm_area_t *) malloc(sizeof(vm_area_t));
@@ -1213,14 +1197,16 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
uint64_t mmap_offset;
uint32_t ioc_flags;
uint64_t size;
int32_t gpu_mem_id;
int32_t gpu_drm_fd;
uint32_t gpu_id;
vm_object_t *vm_obj = NULL;
gpu_mem_id = find_first_dgpu(&gpu_id);
if (gpu_mem_id < 0)
if (!g_first_gpu_mem)
return NULL;
gpu_id = g_first_gpu_mem->gpu_id;
gpu_drm_fd = g_first_gpu_mem->drm_render_fd;
size = MemorySizeInBytes;
ioc_flags = 0;
if (flags.ui32.CoarseGrain)
@@ -1296,8 +1282,7 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
ioc_flags, &vm_obj);
if (mem && flags.ui32.HostAccess) {
int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
gpu_mem[gpu_mem_id].drm_render_fd;
int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd : gpu_drm_fd;
void *ret = mmap(mem, MemorySizeInBytes,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
@@ -1709,7 +1694,7 @@ static void fmm_init_rbtree(void)
HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
{
uint32_t i = 0;
uint32_t i;
int32_t gpu_mem_id = 0;
uint32_t gpu_id;
HsaNodeProperties props;
@@ -1748,6 +1733,9 @@ HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
if (!guardPagesStr || sscanf(guardPagesStr, "%u", &guardPages) != 1)
guardPages = 1;
gpu_mem_count = 0;
g_first_gpu_mem = NULL;
/* Trade off - NumNodes includes GPU nodes + CPU Node. So in
* systems with CPU node, slightly more memory is allocated than
* necessary
@@ -1760,10 +1748,10 @@ HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
* set to 0 by calloc. This is necessary because this function
* gets called before hsaKmtAcquireSystemProperties() is called.
*/
gpu_mem_count = 0;
pacc = pci_alloc();
pci_init(pacc);
while (i < NumNodes) {
for (i = 0; i < NumNodes; i++) {
memset(&props, 0, sizeof(props));
ret = topology_sysfs_get_node_props(i, &props, &gpu_id, pacc);
if (ret != HSAKMT_STATUS_SUCCESS)
@@ -1790,9 +1778,12 @@ HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
get_vm_alignment(props.DeviceId);
gpu_mem[gpu_mem_count].gpuvm_aperture.guard_pages = guardPages;
pthread_mutex_init(&gpu_mem[gpu_mem_count].gpuvm_aperture.fmm_mutex, NULL);
if (!g_first_gpu_mem)
g_first_gpu_mem = &gpu_mem[gpu_mem_count];
gpu_mem_count++;
}
i++;
}
pci_cleanup(pacc);
@@ -2592,7 +2583,6 @@ static HSAuint8 fmm_check_user_memory(const void *addr, HSAuint64 size)
static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size, vm_object_t **obj_ret)
{
int32_t i;
HSAuint32 gpu_id;
manageable_aperture_t *aperture;
void *svm_addr = NULL;
@@ -2601,10 +2591,12 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size, vm_obj
HSAuint64 aligned_addr = (HSAuint64)addr - page_offset;
HSAuint64 aligned_size = PAGE_ALIGN_UP(page_offset + size);
/* Find first dGPU for creating the userptr BO */
i = find_first_dgpu(&gpu_id);
if (i < 0)
/* Find first GPU for creating the userptr BO */
if (!g_first_gpu_mem)
return HSAKMT_STATUS_ERROR;
gpu_id = g_first_gpu_mem->gpu_id;
aperture = &svm.dgpu_aperture;
/* Check if this address was already registered */
@@ -2856,12 +2848,13 @@ HSAKMT_STATUS fmm_share_memory(void *MemoryAddress,
if (r != HSAKMT_STATUS_SUCCESS)
return r;
if (!gpu_id && is_dgpu) {
/* Sharing non paged system memory. Use first dgpu which was
/* Sharing non paged system memory. Use first GPU which was
* used during allocation. See fmm_allocate_host_gpu()
*/
r = find_first_dgpu(&gpu_id);
if (r != HSAKMT_STATUS_SUCCESS)
return r;
if (!g_first_gpu_mem)
return HSAKMT_STATUS_ERROR;
gpu_id = g_first_gpu_mem->gpu_id;
}
exportArgs.handle = obj->handle;
exportArgs.gpu_id = gpu_id;