diff --git a/projects/rocr-runtime/src/debug.c b/projects/rocr-runtime/src/debug.c index bb04cfbc2c..ff7ec7fb33 100644 --- a/projects/rocr-runtime/src/debug.c +++ b/projects/rocr-runtime/src/debug.c @@ -28,9 +28,29 @@ #include #include -static bool is_device_debugged[MAX_NODES] = {false}; +static bool *is_device_debugged; int debug_get_reg_status(uint32_t node_id, bool* is_debugged); +HSAKMT_STATUS init_device_debugging_memory(unsigned int NumNodes) +{ + unsigned int i; + + is_device_debugged = malloc(NumNodes * sizeof(bool)); + if (is_device_debugged == NULL) + return HSAKMT_STATUS_NO_MEMORY; + + for (i = 0; i < NumNodes; i++) + is_device_debugged[i] = false; + + return HSAKMT_STATUS_SUCCESS; +} + +void destroy_device_debugging_memory(void) +{ + if (is_device_debugged) + free(is_device_debugged); +} + HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister( @@ -41,6 +61,9 @@ hsaKmtDbgRegister( uint32_t gpu_id; CHECK_KFD_OPEN(); + if (is_device_debugged == NULL) + return HSAKMT_STATUS_NO_MEMORY; + result = validate_nodeid(NodeId, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; @@ -71,6 +94,9 @@ hsaKmtDbgUnregister( uint32_t gpu_id; CHECK_KFD_OPEN(); + if (is_device_debugged == NULL) + return HSAKMT_STATUS_NO_MEMORY; + result = validate_nodeid(NodeId, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; @@ -283,7 +309,8 @@ hsaKmtDbgAddressWatch( /* =============================================================================== */ int debug_get_reg_status(uint32_t node_id, bool* is_debugged) { - if ( node_id >= MAX_NODES) + *is_debugged = NULL; + if (is_device_debugged == NULL) return -1; else { *is_debugged = is_device_debugged[node_id]; diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c index b615311c65..056daba975 100644 --- a/projects/rocr-runtime/src/fmm.c +++ b/projects/rocr-runtime/src/fmm.c @@ -964,25 +964,20 @@ static int fmm_set_memory_policy(uint32_t gpu_id, int default_policy, int alt_po return kmtIoctl(kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args); } -HSAKMT_STATUS fmm_init_process_apertures(void) +HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes) { struct kfd_ioctl_get_process_apertures_new_args args; uint32_t i = 0; int32_t gpu_mem_id =0; uint32_t gpu_id; - HsaSystemProperties sys_props; HsaNodeProperties props; struct kfd_process_device_apertures * process_apertures; HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; - ret = topology_sysfs_get_system_props(&sys_props); - if (ret != HSAKMT_STATUS_SUCCESS) - return ret; - - /* Trade off - sys_props.NumNodes includes GPU nodes + CPU Node. So in + /* Trade off - NumNodes includes GPU nodes + CPU Node. So in * systems with CPU node, slightly more memory is allocated than * necessary*/ - gpu_mem = (gpu_mem_t *)calloc(sys_props.NumNodes * sizeof(gpu_mem_t), 1); + gpu_mem = (gpu_mem_t *)calloc(NumNodes, sizeof(gpu_mem_t)); if (gpu_mem == NULL) return HSAKMT_STATUS_NO_MEMORY; @@ -990,7 +985,7 @@ HSAKMT_STATUS fmm_init_process_apertures(void) * 0 by calloc. This is necessary because this function * gets called before hsaKmtAcquireSystemProperties() is called.*/ gpu_mem_count = 0; - while (i < sys_props.NumNodes) { + while (i < NumNodes) { ret = topology_sysfs_get_node_props(i, &props, &gpu_id); if (ret != HSAKMT_STATUS_SUCCESS) goto sysfs_parse_failed; @@ -1240,9 +1235,9 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageble_aperture_t *apertu VOID_PTR_ADD(address, size -1) > aperture->limit) return -1; - debug_get_reg_status(gpu_mem[gpu_mem_id].node_id, &is_debugger); + ret = debug_get_reg_status(gpu_mem[gpu_mem_id].node_id, &is_debugger); /* allocate object within the scratch backing aperture */ - if (!is_debugger) { + if (!ret && !is_debugger) { offset = VOID_PTRS_SUB(address, aperture->base); mem = __fmm_allocate_device(gpu_id, size, aperture, offset, NULL, KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE); diff --git a/projects/rocr-runtime/src/fmm.h b/projects/rocr-runtime/src/fmm.h index 45fedb4d21..881413bbdf 100644 --- a/projects/rocr-runtime/src/fmm.h +++ b/projects/rocr-runtime/src/fmm.h @@ -43,7 +43,7 @@ typedef struct { void* start_address; } aperture_properties_t; -HSAKMT_STATUS fmm_init_process_apertures(void); +HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes); void fmm_destroy_process_apertures(void); /* diff --git a/projects/rocr-runtime/src/libhsakmt.h b/projects/rocr-runtime/src/libhsakmt.h index b976d5a3e5..e1d52b7efe 100644 --- a/projects/rocr-runtime/src/libhsakmt.h +++ b/projects/rocr-runtime/src/libhsakmt.h @@ -59,12 +59,6 @@ extern bool is_dgpu; #define PAGE_ALIGN_UP(x) ALIGN_UP(x,PAGE_SIZE) #define BITMASK(n) (((n) < sizeof(1ULL) * CHAR_BIT ? (1ULL << (n)) : 0) - 1ULL) -/* - * Even though the toplogy code doesn't limit us to maximum number of nodes, - * the current HSA spec says the maximum is 8 nodes - */ -#define MAX_NODES 8 - HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id); HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id); uint16_t get_device_id_by_node(HSAuint32 node_id); @@ -80,8 +74,12 @@ HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags); void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId, bool peer_to_peer); void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align); -HSAKMT_STATUS init_process_doorbells(void); +HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes); void destroy_process_doorbells(void); +HSAKMT_STATUS init_device_debugging_memory(unsigned int NumNodes); +void destroy_device_debugging_memory(void); +HSAKMT_STATUS init_counter_props(unsigned int NumNodes); +void destroy_counter_props(void); extern int kmtIoctl(int fd, unsigned long request, void *arg); diff --git a/projects/rocr-runtime/src/openclose.c b/projects/rocr-runtime/src/openclose.c index 620db3debf..72baf7c997 100644 --- a/projects/rocr-runtime/src/openclose.c +++ b/projects/rocr-runtime/src/openclose.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "fmm.h" static const char kfd_device_name[] = "/dev/kfd"; @@ -42,6 +43,7 @@ hsaKmtOpenKFD(void) { HSAKMT_STATUS result; int fd; + HsaSystemProperties sys_props; pthread_mutex_lock(&hsakmt_mutex); @@ -57,14 +59,24 @@ hsaKmtOpenKFD(void) goto open_failed; } - result = fmm_init_process_apertures(); + result = topology_sysfs_get_system_props(&sys_props); + if (result != HSAKMT_STATUS_SUCCESS) + goto topology_sysfs_failed; + + result = fmm_init_process_apertures(sys_props.NumNodes); if (result != HSAKMT_STATUS_SUCCESS) goto init_process_aperture_failed; - result = init_process_doorbells(); + result = init_process_doorbells(sys_props.NumNodes); if (result != HSAKMT_STATUS_SUCCESS) goto init_doorbell_failed; + if (init_device_debugging_memory(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS) + printf("Insufficient Memory. Debugging unavailable\n"); + + if (init_counter_props(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS) + printf("Insufficient Memory. Performance Counter information unavailable\n"); + amd_hsa_thunk_lock_fd = open(tmp_file, O_CREAT | //create the file if it's not present. O_RDWR, //only need write access for the internal locking semantics. @@ -82,6 +94,7 @@ hsaKmtOpenKFD(void) init_doorbell_failed: fmm_destroy_process_apertures(); init_process_aperture_failed: +topology_sysfs_failed: close(fd); open_failed: pthread_mutex_unlock(&hsakmt_mutex); @@ -101,6 +114,8 @@ hsaKmtCloseKFD(void) { if (--kfd_open_count == 0) { + destroy_counter_props(); + destroy_device_debugging_memory(); destroy_process_doorbells(); fmm_destroy_process_apertures(); close(kfd_fd); diff --git a/projects/rocr-runtime/src/perfctr.c b/projects/rocr-runtime/src/perfctr.c index 403ff4ad3e..95102a19ed 100644 --- a/projects/rocr-runtime/src/perfctr.c +++ b/projects/rocr-runtime/src/perfctr.c @@ -46,9 +46,34 @@ struct perf_trace { extern int amd_hsa_thunk_lock_fd; -static HsaCounterProperties *counter_props[MAX_NODES] = {NULL}; +static HsaCounterProperties **counter_props; +static unsigned int counter_props_count; -void __attribute__ ((destructor)) perfctr_release_global_resources(void); +HSAKMT_STATUS init_counter_props(unsigned int NumNodes) +{ + counter_props = calloc(NumNodes, sizeof(struct HsaCounterProperties*)); + if (counter_props == NULL) + return HSAKMT_STATUS_NO_MEMORY; + + counter_props_count = NumNodes; + return HSAKMT_STATUS_SUCCESS; +} + +void destroy_counter_props(void) +{ + unsigned int i; + + if (counter_props == NULL) + return; + + for (i = 0; i= MAX_NODES || !node || !_system || _system->NumNodes <= nodeid) + if (!node || !_system || _system->NumNodes <= nodeid) return HSAKMT_STATUS_INVALID_NODE_UNIT; if (gpu_id) *gpu_id = node[nodeid].gpu_id;