2
0

Don't limit number of supported HSA Nodes

Remove #define MAX_NODES 8

Change-Id: I756cadc652543dd17ea48a1c956adc08c3d2631a


[ROCm/ROCR-Runtime commit: 5e53205b9e]
Este cometimento está contido em:
Harish Kasiviswanathan
2016-01-14 17:07:28 -05:00
ascendente 14358ee07f
cometimento b687eaf2c2
8 ficheiros modificados com 95 adições e 45 eliminações
+29 -2
Ver ficheiro
@@ -28,9 +28,29 @@
#include <stdlib.h>
#include <string.h>
static bool is_device_debugged[MAX_NODES] = {false};
static bool *is_device_debugged;
int debug_get_reg_status(uint32_t node_id, bool* is_debugged);
/*
 * Allocate the per-node "debugger registered" flag table.
 *
 * NumNodes: number of entries to allocate. The table is later indexed by
 *           node id (see debug_get_reg_status()).
 *
 * Every entry starts out as false.
 *
 * Returns HSAKMT_STATUS_SUCCESS when the table was allocated, or
 * HSAKMT_STATUS_NO_MEMORY when the allocation failed, in which case
 * is_device_debugged is left NULL.
 */
HSAKMT_STATUS init_device_debugging_memory(unsigned int NumNodes)
{
	/* calloc() zero-fills the table (every flag false) and checks the
	 * count * size multiplication for overflow, so no manual clearing
	 * loop is needed. */
	is_device_debugged = calloc(NumNodes, sizeof(*is_device_debugged));
	if (is_device_debugged == NULL)
		return HSAKMT_STATUS_NO_MEMORY;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Release the per-node "debugger registered" flag table.
 *
 * Safe to call when the table was never allocated and safe to call more
 * than once: free(NULL) is a no-op and the pointer is reset afterwards.
 */
void destroy_device_debugging_memory(void)
{
	free(is_device_debugged);
	/* Reset the pointer so the "is_device_debugged == NULL" guards in the
	 * debug entry points see the table as unavailable instead of
	 * dereferencing freed memory. */
	is_device_debugged = NULL;
}
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDbgRegister(
@@ -41,6 +61,9 @@ hsaKmtDbgRegister(
uint32_t gpu_id;
CHECK_KFD_OPEN();
if (is_device_debugged == NULL)
return HSAKMT_STATUS_NO_MEMORY;
result = validate_nodeid(NodeId, &gpu_id);
if (result != HSAKMT_STATUS_SUCCESS)
return result;
@@ -71,6 +94,9 @@ hsaKmtDbgUnregister(
uint32_t gpu_id;
CHECK_KFD_OPEN();
if (is_device_debugged == NULL)
return HSAKMT_STATUS_NO_MEMORY;
result = validate_nodeid(NodeId, &gpu_id);
if (result != HSAKMT_STATUS_SUCCESS)
return result;
@@ -283,7 +309,8 @@ hsaKmtDbgAddressWatch(
/* =============================================================================== */
int debug_get_reg_status(uint32_t node_id, bool* is_debugged)
{
if ( node_id >= MAX_NODES)
*is_debugged = NULL;
if (is_device_debugged == NULL)
return -1;
else {
*is_debugged = is_device_debugged[node_id];
+6 -11
Ver ficheiro
@@ -964,25 +964,20 @@ static int fmm_set_memory_policy(uint32_t gpu_id, int default_policy, int alt_po
return kmtIoctl(kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args);
}
HSAKMT_STATUS fmm_init_process_apertures(void)
HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
{
struct kfd_ioctl_get_process_apertures_new_args args;
uint32_t i = 0;
int32_t gpu_mem_id =0;
uint32_t gpu_id;
HsaSystemProperties sys_props;
HsaNodeProperties props;
struct kfd_process_device_apertures * process_apertures;
HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
ret = topology_sysfs_get_system_props(&sys_props);
if (ret != HSAKMT_STATUS_SUCCESS)
return ret;
/* Trade off - sys_props.NumNodes includes GPU nodes + CPU Node. So in
/* Trade off - NumNodes includes GPU nodes + CPU Node. So in
* systems with CPU node, slightly more memory is allocated than
* necessary*/
gpu_mem = (gpu_mem_t *)calloc(sys_props.NumNodes * sizeof(gpu_mem_t), 1);
gpu_mem = (gpu_mem_t *)calloc(NumNodes, sizeof(gpu_mem_t));
if (gpu_mem == NULL)
return HSAKMT_STATUS_NO_MEMORY;
@@ -990,7 +985,7 @@ HSAKMT_STATUS fmm_init_process_apertures(void)
* 0 by calloc. This is necessary because this function
* gets called before hsaKmtAcquireSystemProperties() is called.*/
gpu_mem_count = 0;
while (i < sys_props.NumNodes) {
while (i < NumNodes) {
ret = topology_sysfs_get_node_props(i, &props, &gpu_id);
if (ret != HSAKMT_STATUS_SUCCESS)
goto sysfs_parse_failed;
@@ -1240,9 +1235,9 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageble_aperture_t *apertu
VOID_PTR_ADD(address, size -1) > aperture->limit)
return -1;
debug_get_reg_status(gpu_mem[gpu_mem_id].node_id, &is_debugger);
ret = debug_get_reg_status(gpu_mem[gpu_mem_id].node_id, &is_debugger);
/* allocate object within the scratch backing aperture */
if (!is_debugger) {
if (!ret && !is_debugger) {
offset = VOID_PTRS_SUB(address, aperture->base);
mem = __fmm_allocate_device(gpu_id, size, aperture, offset,
NULL, KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE);
+1 -1
Ver ficheiro
@@ -43,7 +43,7 @@ typedef struct {
void* start_address;
} aperture_properties_t;
HSAKMT_STATUS fmm_init_process_apertures(void);
HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes);
void fmm_destroy_process_apertures(void);
/*
+5 -7
Ver ficheiro
@@ -59,12 +59,6 @@ extern bool is_dgpu;
#define PAGE_ALIGN_UP(x) ALIGN_UP(x,PAGE_SIZE)
#define BITMASK(n) (((n) < sizeof(1ULL) * CHAR_BIT ? (1ULL << (n)) : 0) - 1ULL)
/*
* Even though the topology code doesn't limit us to a maximum number of nodes,
* the current HSA spec says the maximum is 8 nodes
*/
#define MAX_NODES 8
HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
uint16_t get_device_id_by_node(HSAuint32 node_id);
@@ -80,8 +74,12 @@ HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
uint32_t NodeId, bool peer_to_peer);
void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
HSAKMT_STATUS init_process_doorbells(void);
HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes);
void destroy_process_doorbells(void);
HSAKMT_STATUS init_device_debugging_memory(unsigned int NumNodes);
void destroy_device_debugging_memory(void);
HSAKMT_STATUS init_counter_props(unsigned int NumNodes);
void destroy_counter_props(void);
extern int kmtIoctl(int fd, unsigned long request, void *arg);
+17 -2
Ver ficheiro
@@ -30,6 +30,7 @@
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include "fmm.h"
static const char kfd_device_name[] = "/dev/kfd";
@@ -42,6 +43,7 @@ hsaKmtOpenKFD(void)
{
HSAKMT_STATUS result;
int fd;
HsaSystemProperties sys_props;
pthread_mutex_lock(&hsakmt_mutex);
@@ -57,14 +59,24 @@ hsaKmtOpenKFD(void)
goto open_failed;
}
result = fmm_init_process_apertures();
result = topology_sysfs_get_system_props(&sys_props);
if (result != HSAKMT_STATUS_SUCCESS)
goto topology_sysfs_failed;
result = fmm_init_process_apertures(sys_props.NumNodes);
if (result != HSAKMT_STATUS_SUCCESS)
goto init_process_aperture_failed;
result = init_process_doorbells();
result = init_process_doorbells(sys_props.NumNodes);
if (result != HSAKMT_STATUS_SUCCESS)
goto init_doorbell_failed;
if (init_device_debugging_memory(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS)
printf("Insufficient Memory. Debugging unavailable\n");
if (init_counter_props(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS)
printf("Insufficient Memory. Performance Counter information unavailable\n");
amd_hsa_thunk_lock_fd = open(tmp_file,
O_CREAT | //create the file if it's not present.
O_RDWR, //only need write access for the internal locking semantics.
@@ -82,6 +94,7 @@ hsaKmtOpenKFD(void)
init_doorbell_failed:
fmm_destroy_process_apertures();
init_process_aperture_failed:
topology_sysfs_failed:
close(fd);
open_failed:
pthread_mutex_unlock(&hsakmt_mutex);
@@ -101,6 +114,8 @@ hsaKmtCloseKFD(void)
{
if (--kfd_open_count == 0)
{
destroy_counter_props();
destroy_device_debugging_memory();
destroy_process_doorbells();
fmm_destroy_process_apertures();
close(kfd_fd);
+33 -13
Ver ficheiro
@@ -46,9 +46,34 @@ struct perf_trace {
extern int amd_hsa_thunk_lock_fd;
static HsaCounterProperties *counter_props[MAX_NODES] = {NULL};
static HsaCounterProperties **counter_props;
static unsigned int counter_props_count;
void __attribute__ ((destructor)) perfctr_release_global_resources(void);
/*
 * Allocate the table of per-node counter property pointers.
 *
 * NumNodes: number of slots to allocate; the same value is recorded in
 *           counter_props_count for destroy_counter_props().
 *
 * Every slot starts out NULL.
 *
 * Returns HSAKMT_STATUS_SUCCESS when the table was allocated, or
 * HSAKMT_STATUS_NO_MEMORY when the allocation failed, in which case
 * counter_props is left NULL and counter_props_count is not updated.
 */
HSAKMT_STATUS init_counter_props(unsigned int NumNodes)
{
	/* Size the elements from the variable itself so the allocation always
	 * matches the declared element type of counter_props. */
	counter_props = calloc(NumNodes, sizeof(*counter_props));
	if (counter_props == NULL)
		return HSAKMT_STATUS_NO_MEMORY;

	counter_props_count = NumNodes;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Release the table of per-node counter property pointers together with
 * every entry stored in it.
 *
 * Does nothing when the table was never allocated. After the call both
 * counter_props and counter_props_count are reset, so calling it again is
 * harmless.
 */
void destroy_counter_props(void)
{
	unsigned int i;

	if (counter_props == NULL)
		return;

	/* free(NULL) is a no-op, so empty slots need no explicit check. */
	for (i = 0; i < counter_props_count; i++)
		free(counter_props[i]);

	free(counter_props);

	/* Reset the globals so the "counter_props == NULL" guards in the PMC
	 * entry points see the table as gone rather than touching freed
	 * memory, and so a repeated call cannot double free. */
	counter_props = NULL;
	counter_props_count = 0;
}
static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid)
{
@@ -80,6 +105,9 @@ hsaKmtPmcGetCounterProperties(
uint32_t total_concurrent = 0;
struct perf_counter_block block = {0};
if (counter_props == NULL)
return HSAKMT_STATUS_NO_MEMORY;
if (CounterProperties == NULL)
return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -157,6 +185,9 @@ hsaKmtPmcRegisterTrace(
uint32_t concurrent_counters[PERFCOUNTER_BLOCKID__MAX] = {0};
struct perf_trace *trace = NULL;
if (counter_props == NULL)
return HSAKMT_STATUS_NO_MEMORY;
if (Counters == NULL || TraceRoot == NULL || NumberOfCounters == 0)
return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -370,14 +401,3 @@ hsaKmtPmcStopTrace(
return HSAKMT_STATUS_SUCCESS;
}
/*
 * Free every populated slot of counter_props, walking the MAX_NODES
 * entries in order and clearing each freed slot back to NULL.
 *
 * The prototype for this function in this file carries
 * __attribute__((destructor)).
 */
void perfctr_release_global_resources(void)
{
	int node = 0;

	while (node < MAX_NODES) {
		if (counter_props[node] != NULL) {
			free(counter_props[node]);
			counter_props[node] = NULL;
		}
		node++;
	}
}
+3 -8
Ver ficheiro
@@ -147,23 +147,18 @@ struct process_doorbells
static struct process_doorbells *doorbells;
HSAKMT_STATUS init_process_doorbells(void)
HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes)
{
HsaSystemProperties sys_props;
unsigned int i;
HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
ret = topology_sysfs_get_system_props(&sys_props);
if (ret != HSAKMT_STATUS_SUCCESS)
return ret;
/* doorbells[] is accessed using Topology NodeId. This means doorbells[0],
* which corresponds to CPU only Node, might not be used */
doorbells = malloc(sys_props.NumNodes * sizeof(struct process_doorbells));
doorbells = malloc(NumNodes * sizeof(struct process_doorbells));
if (doorbells == NULL)
return HSAKMT_STATUS_NO_MEMORY;
for (i = 0; i < sys_props.NumNodes; i++) {
for (i = 0; i < NumNodes; i++) {
doorbells[i].need_mmap = true;
doorbells[i].doorbells = NULL;
pthread_mutex_init(&doorbells[i].doorbells_mutex, NULL);
+1 -1
Ver ficheiro
@@ -1033,7 +1033,7 @@ out:
HSAKMT_STATUS
validate_nodeid(uint32_t nodeid, uint32_t *gpu_id)
{
if (nodeid >= MAX_NODES || !node || !_system || _system->NumNodes <= nodeid)
if (!node || !_system || _system->NumNodes <= nodeid)
return HSAKMT_STATUS_INVALID_NODE_UNIT;
if (gpu_id)
*gpu_id = node[nodeid].gpu_id;