Don't limit number of supported HSA Nodes
Remove #define MAX_NODES 8
Change-Id: I756cadc652543dd17ea48a1c956adc08c3d2631a
[ROCm/ROCR-Runtime commit: 5e53205b9e]
Este cometimento está contido em:
@@ -28,9 +28,29 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
static bool is_device_debugged[MAX_NODES] = {false};
|
||||
static bool *is_device_debugged;
|
||||
int debug_get_reg_status(uint32_t node_id, bool* is_debugged);
|
||||
|
||||
HSAKMT_STATUS init_device_debugging_memory(unsigned int NumNodes)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
is_device_debugged = malloc(NumNodes * sizeof(bool));
|
||||
if (is_device_debugged == NULL)
|
||||
return HSAKMT_STATUS_NO_MEMORY;
|
||||
|
||||
for (i = 0; i < NumNodes; i++)
|
||||
is_device_debugged[i] = false;
|
||||
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void destroy_device_debugging_memory(void)
|
||||
{
|
||||
if (is_device_debugged)
|
||||
free(is_device_debugged);
|
||||
}
|
||||
|
||||
HSAKMT_STATUS
|
||||
HSAKMTAPI
|
||||
hsaKmtDbgRegister(
|
||||
@@ -41,6 +61,9 @@ hsaKmtDbgRegister(
|
||||
uint32_t gpu_id;
|
||||
CHECK_KFD_OPEN();
|
||||
|
||||
if (is_device_debugged == NULL)
|
||||
return HSAKMT_STATUS_NO_MEMORY;
|
||||
|
||||
result = validate_nodeid(NodeId, &gpu_id);
|
||||
if (result != HSAKMT_STATUS_SUCCESS)
|
||||
return result;
|
||||
@@ -71,6 +94,9 @@ hsaKmtDbgUnregister(
|
||||
uint32_t gpu_id;
|
||||
CHECK_KFD_OPEN();
|
||||
|
||||
if (is_device_debugged == NULL)
|
||||
return HSAKMT_STATUS_NO_MEMORY;
|
||||
|
||||
result = validate_nodeid(NodeId, &gpu_id);
|
||||
if (result != HSAKMT_STATUS_SUCCESS)
|
||||
return result;
|
||||
@@ -283,7 +309,8 @@ hsaKmtDbgAddressWatch(
|
||||
/* =============================================================================== */
|
||||
int debug_get_reg_status(uint32_t node_id, bool* is_debugged)
|
||||
{
|
||||
if ( node_id >= MAX_NODES)
|
||||
*is_debugged = NULL;
|
||||
if (is_device_debugged == NULL)
|
||||
return -1;
|
||||
else {
|
||||
*is_debugged = is_device_debugged[node_id];
|
||||
|
||||
@@ -964,25 +964,20 @@ static int fmm_set_memory_policy(uint32_t gpu_id, int default_policy, int alt_po
|
||||
return kmtIoctl(kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args);
|
||||
}
|
||||
|
||||
HSAKMT_STATUS fmm_init_process_apertures(void)
|
||||
HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
|
||||
{
|
||||
struct kfd_ioctl_get_process_apertures_new_args args;
|
||||
uint32_t i = 0;
|
||||
int32_t gpu_mem_id =0;
|
||||
uint32_t gpu_id;
|
||||
HsaSystemProperties sys_props;
|
||||
HsaNodeProperties props;
|
||||
struct kfd_process_device_apertures * process_apertures;
|
||||
HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
|
||||
|
||||
ret = topology_sysfs_get_system_props(&sys_props);
|
||||
if (ret != HSAKMT_STATUS_SUCCESS)
|
||||
return ret;
|
||||
|
||||
/* Trade off - sys_props.NumNodes includes GPU nodes + CPU Node. So in
|
||||
/* Trade off - NumNodes includes GPU nodes + CPU Node. So in
|
||||
* systems with CPU node, slightly more memory is allocated than
|
||||
* necessary*/
|
||||
gpu_mem = (gpu_mem_t *)calloc(sys_props.NumNodes * sizeof(gpu_mem_t), 1);
|
||||
gpu_mem = (gpu_mem_t *)calloc(NumNodes, sizeof(gpu_mem_t));
|
||||
if (gpu_mem == NULL)
|
||||
return HSAKMT_STATUS_NO_MEMORY;
|
||||
|
||||
@@ -990,7 +985,7 @@ HSAKMT_STATUS fmm_init_process_apertures(void)
|
||||
* 0 by calloc. This is necessary because this function
|
||||
* gets called before hsaKmtAcquireSystemProperties() is called.*/
|
||||
gpu_mem_count = 0;
|
||||
while (i < sys_props.NumNodes) {
|
||||
while (i < NumNodes) {
|
||||
ret = topology_sysfs_get_node_props(i, &props, &gpu_id);
|
||||
if (ret != HSAKMT_STATUS_SUCCESS)
|
||||
goto sysfs_parse_failed;
|
||||
@@ -1240,9 +1235,9 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageble_aperture_t *apertu
|
||||
VOID_PTR_ADD(address, size -1) > aperture->limit)
|
||||
return -1;
|
||||
|
||||
debug_get_reg_status(gpu_mem[gpu_mem_id].node_id, &is_debugger);
|
||||
ret = debug_get_reg_status(gpu_mem[gpu_mem_id].node_id, &is_debugger);
|
||||
/* allocate object within the scratch backing aperture */
|
||||
if (!is_debugger) {
|
||||
if (!ret && !is_debugger) {
|
||||
offset = VOID_PTRS_SUB(address, aperture->base);
|
||||
mem = __fmm_allocate_device(gpu_id, size, aperture, offset,
|
||||
NULL, KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE);
|
||||
|
||||
@@ -43,7 +43,7 @@ typedef struct {
|
||||
void* start_address;
|
||||
} aperture_properties_t;
|
||||
|
||||
HSAKMT_STATUS fmm_init_process_apertures(void);
|
||||
HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes);
|
||||
void fmm_destroy_process_apertures(void);
|
||||
|
||||
/*
|
||||
|
||||
@@ -59,12 +59,6 @@ extern bool is_dgpu;
|
||||
#define PAGE_ALIGN_UP(x) ALIGN_UP(x,PAGE_SIZE)
|
||||
#define BITMASK(n) (((n) < sizeof(1ULL) * CHAR_BIT ? (1ULL << (n)) : 0) - 1ULL)
|
||||
|
||||
/*
|
||||
* Even though the topology code doesn't limit us to a maximum number of nodes,
|
||||
* the current HSA spec says the maximum is 8 nodes
|
||||
*/
|
||||
#define MAX_NODES 8
|
||||
|
||||
HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
|
||||
HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
|
||||
uint16_t get_device_id_by_node(HSAuint32 node_id);
|
||||
@@ -80,8 +74,12 @@ HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
|
||||
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
|
||||
uint32_t NodeId, bool peer_to_peer);
|
||||
void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
|
||||
HSAKMT_STATUS init_process_doorbells(void);
|
||||
HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes);
|
||||
void destroy_process_doorbells(void);
|
||||
HSAKMT_STATUS init_device_debugging_memory(unsigned int NumNodes);
|
||||
void destroy_device_debugging_memory(void);
|
||||
HSAKMT_STATUS init_counter_props(unsigned int NumNodes);
|
||||
void destroy_counter_props(void);
|
||||
|
||||
extern int kmtIoctl(int fd, unsigned long request, void *arg);
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include "fmm.h"
|
||||
|
||||
static const char kfd_device_name[] = "/dev/kfd";
|
||||
@@ -42,6 +43,7 @@ hsaKmtOpenKFD(void)
|
||||
{
|
||||
HSAKMT_STATUS result;
|
||||
int fd;
|
||||
HsaSystemProperties sys_props;
|
||||
|
||||
pthread_mutex_lock(&hsakmt_mutex);
|
||||
|
||||
@@ -57,14 +59,24 @@ hsaKmtOpenKFD(void)
|
||||
goto open_failed;
|
||||
}
|
||||
|
||||
result = fmm_init_process_apertures();
|
||||
result = topology_sysfs_get_system_props(&sys_props);
|
||||
if (result != HSAKMT_STATUS_SUCCESS)
|
||||
goto topology_sysfs_failed;
|
||||
|
||||
result = fmm_init_process_apertures(sys_props.NumNodes);
|
||||
if (result != HSAKMT_STATUS_SUCCESS)
|
||||
goto init_process_aperture_failed;
|
||||
|
||||
result = init_process_doorbells();
|
||||
result = init_process_doorbells(sys_props.NumNodes);
|
||||
if (result != HSAKMT_STATUS_SUCCESS)
|
||||
goto init_doorbell_failed;
|
||||
|
||||
if (init_device_debugging_memory(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS)
|
||||
printf("Insufficient Memory. Debugging unavailable\n");
|
||||
|
||||
if (init_counter_props(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS)
|
||||
printf("Insufficient Memory. Performance Counter information unavailable\n");
|
||||
|
||||
amd_hsa_thunk_lock_fd = open(tmp_file,
|
||||
O_CREAT | //create the file if it's not present.
|
||||
O_RDWR, //only need write access for the internal locking semantics.
|
||||
@@ -82,6 +94,7 @@ hsaKmtOpenKFD(void)
|
||||
init_doorbell_failed:
|
||||
fmm_destroy_process_apertures();
|
||||
init_process_aperture_failed:
|
||||
topology_sysfs_failed:
|
||||
close(fd);
|
||||
open_failed:
|
||||
pthread_mutex_unlock(&hsakmt_mutex);
|
||||
@@ -101,6 +114,8 @@ hsaKmtCloseKFD(void)
|
||||
{
|
||||
if (--kfd_open_count == 0)
|
||||
{
|
||||
destroy_counter_props();
|
||||
destroy_device_debugging_memory();
|
||||
destroy_process_doorbells();
|
||||
fmm_destroy_process_apertures();
|
||||
close(kfd_fd);
|
||||
|
||||
@@ -46,9 +46,34 @@ struct perf_trace {
|
||||
|
||||
extern int amd_hsa_thunk_lock_fd;
|
||||
|
||||
static HsaCounterProperties *counter_props[MAX_NODES] = {NULL};
|
||||
static HsaCounterProperties **counter_props;
|
||||
static unsigned int counter_props_count;
|
||||
|
||||
void __attribute__ ((destructor)) perfctr_release_global_resources(void);
|
||||
HSAKMT_STATUS init_counter_props(unsigned int NumNodes)
|
||||
{
|
||||
counter_props = calloc(NumNodes, sizeof(struct HsaCounterProperties*));
|
||||
if (counter_props == NULL)
|
||||
return HSAKMT_STATUS_NO_MEMORY;
|
||||
|
||||
counter_props_count = NumNodes;
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void destroy_counter_props(void)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
if (counter_props == NULL)
|
||||
return;
|
||||
|
||||
for (i = 0; i<counter_props_count; i++)
|
||||
if (counter_props[i] != NULL) {
|
||||
free(counter_props[i]);
|
||||
counter_props[i] = NULL;
|
||||
}
|
||||
|
||||
free(counter_props);
|
||||
}
|
||||
|
||||
static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid)
|
||||
{
|
||||
@@ -80,6 +105,9 @@ hsaKmtPmcGetCounterProperties(
|
||||
uint32_t total_concurrent = 0;
|
||||
struct perf_counter_block block = {0};
|
||||
|
||||
if (counter_props == NULL)
|
||||
return HSAKMT_STATUS_NO_MEMORY;
|
||||
|
||||
if (CounterProperties == NULL)
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
@@ -157,6 +185,9 @@ hsaKmtPmcRegisterTrace(
|
||||
uint32_t concurrent_counters[PERFCOUNTER_BLOCKID__MAX] = {0};
|
||||
struct perf_trace *trace = NULL;
|
||||
|
||||
if (counter_props == NULL)
|
||||
return HSAKMT_STATUS_NO_MEMORY;
|
||||
|
||||
if (Counters == NULL || TraceRoot == NULL || NumberOfCounters == 0)
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
@@ -370,14 +401,3 @@ hsaKmtPmcStopTrace(
|
||||
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void perfctr_release_global_resources(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i=0; i<MAX_NODES; i++)
|
||||
if (counter_props[i] != NULL) {
|
||||
free(counter_props[i]);
|
||||
counter_props[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,23 +147,18 @@ struct process_doorbells
|
||||
|
||||
static struct process_doorbells *doorbells;
|
||||
|
||||
HSAKMT_STATUS init_process_doorbells(void)
|
||||
HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes)
|
||||
{
|
||||
HsaSystemProperties sys_props;
|
||||
unsigned int i;
|
||||
HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
|
||||
|
||||
ret = topology_sysfs_get_system_props(&sys_props);
|
||||
if (ret != HSAKMT_STATUS_SUCCESS)
|
||||
return ret;
|
||||
|
||||
/* doorbells[] is accessed using Topology NodeId. This means doorbells[0],
|
||||
* which corresponds to CPU only Node, might not be used */
|
||||
doorbells = malloc(sys_props.NumNodes * sizeof(struct process_doorbells));
|
||||
doorbells = malloc(NumNodes * sizeof(struct process_doorbells));
|
||||
if (doorbells == NULL)
|
||||
return HSAKMT_STATUS_NO_MEMORY;
|
||||
|
||||
for (i = 0; i < sys_props.NumNodes; i++) {
|
||||
for (i = 0; i < NumNodes; i++) {
|
||||
doorbells[i].need_mmap = true;
|
||||
doorbells[i].doorbells = NULL;
|
||||
pthread_mutex_init(&doorbells[i].doorbells_mutex, NULL);
|
||||
|
||||
@@ -1033,7 +1033,7 @@ out:
|
||||
HSAKMT_STATUS
|
||||
validate_nodeid(uint32_t nodeid, uint32_t *gpu_id)
|
||||
{
|
||||
if (nodeid >= MAX_NODES || !node || !_system || _system->NumNodes <= nodeid)
|
||||
if (!node || !_system || _system->NumNodes <= nodeid)
|
||||
return HSAKMT_STATUS_INVALID_NODE_UNIT;
|
||||
if (gpu_id)
|
||||
*gpu_id = node[nodeid].gpu_id;
|
||||
|
||||
Criar uma nova questão referindo esta
Bloquear um utilizador