diff --git a/src/perfctr.c b/src/perfctr.c index 2b312affd4..0ec59b6ba8 100644 --- a/src/perfctr.c +++ b/src/perfctr.c @@ -56,6 +56,9 @@ HSAKMT_STATUS init_counter_props(unsigned int NumNodes) return HSAKMT_STATUS_NO_MEMORY; counter_props_count = NumNodes; + + alloc_pmc_blocks(); + return HSAKMT_STATUS_SUCCESS; } @@ -73,18 +76,24 @@ void destroy_counter_props(void) } free(counter_props); + free_pmc_blocks(); } static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid) { int rc = 0; + switch (block_id) { case PERFCOUNTER_BLOCKID__SQ: *uuid = HSA_PROFILEBLOCK_AMD_SQ; break; + case PERFCOUNTER_BLOCKID__IOMMUV2: + *uuid = HSA_PROFILEBLOCK_AMD_IOMMUV2; + break; default: /* If we reach this point, it's a bug */ rc = -1; + break; } return rc; @@ -99,11 +108,12 @@ hsaKmtPmcGetCounterProperties( { HSAKMT_STATUS rc = HSAKMT_STATUS_SUCCESS; uint32_t gpu_id, i, block_id; - uint16_t dev_id; uint32_t counter_props_size = 0; uint32_t total_counters = 0; uint32_t total_concurrent = 0; struct perf_counter_block block = {0}; + uint32_t total_blocks = 0; + uint32_t entry; if (counter_props == NULL) return HSAKMT_STATUS_NO_MEMORY; @@ -114,50 +124,63 @@ hsaKmtPmcGetCounterProperties( if (validate_nodeid(NodeId, &gpu_id) != 0) return HSAKMT_STATUS_INVALID_NODE_UNIT; - if (counter_props[NodeId] == NULL) { - dev_id = get_device_id_by_node(NodeId); - for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) { - rc = get_block_properties(dev_id, i, &block); - if (rc != HSAKMT_STATUS_SUCCESS) - return rc; - total_concurrent += block.num_of_slots; - total_counters += block.num_of_counters; + if (counter_props[NodeId] != NULL) { + *CounterProperties = counter_props[NodeId]; + return HSAKMT_STATUS_SUCCESS; + } + + for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) { + rc = get_block_properties(NodeId, i, &block); + if (rc != HSAKMT_STATUS_SUCCESS) + return rc; + total_concurrent += block.num_of_slots; + total_counters += block.num_of_counters; + /* If num_of_slots=0, this block doesn't exist */ + if (block.num_of_slots) + total_blocks++; + } + + counter_props_size = sizeof(HsaCounterProperties) + + sizeof(HsaCounterBlockProperties)*(total_blocks-1) + + sizeof(HsaCounter)*(total_counters-1); + + counter_props[NodeId] = malloc(counter_props_size); + if (counter_props[NodeId] == NULL) + return HSAKMT_STATUS_NO_MEMORY; + + counter_props[NodeId]->NumBlocks = total_blocks; + counter_props[NodeId]->NumConcurrent = total_concurrent; + + entry = 0; + for (block_id = 0; block_id < PERFCOUNTER_BLOCKID__MAX; block_id++) { + rc = get_block_properties(NodeId, block_id, &block); + if (rc != HSAKMT_STATUS_SUCCESS) { + free(counter_props[NodeId]); + return rc; } - counter_props_size = sizeof(HsaCounterProperties) + - sizeof(HsaCounterBlockProperties)*(PERFCOUNTER_BLOCKID__MAX-1) + - sizeof(HsaCounter)*(total_counters-1); + if (!block.num_of_slots) /* not a valid block */ + continue; - counter_props[NodeId] = malloc(counter_props_size); + blockid2uuid(block_id, + &counter_props[NodeId]->Blocks[entry].BlockId); + counter_props[NodeId]->Blocks[entry].NumCounters = + block.num_of_counters; + counter_props[NodeId]->Blocks[entry].NumConcurrent = + block.num_of_slots; - if (counter_props[NodeId] == NULL) - return HSAKMT_STATUS_NO_MEMORY; - - counter_props[NodeId]->NumBlocks = PERFCOUNTER_BLOCKID__MAX; - counter_props[NodeId]->NumConcurrent = total_concurrent; - - for (block_id = 0; block_id < PERFCOUNTER_BLOCKID__MAX; block_id++) - { - rc = get_block_properties(dev_id, block_id, &block); - if (rc != HSAKMT_STATUS_SUCCESS) { - free(counter_props[NodeId]); - return rc; - } - - /* Filling the SQ block */ - blockid2uuid(block_id, &counter_props[NodeId]->Blocks[block_id].BlockId); - counter_props[NodeId]->Blocks[block_id].NumCounters = block.num_of_counters; - counter_props[NodeId]->Blocks[block_id].NumConcurrent = block.num_of_slots; - - for (i = 0; i < block.num_of_counters; i++) { - counter_props[NodeId]->Blocks[block_id].Counters[i].BlockIndex = block_id; - counter_props[NodeId]->Blocks[block_id].Counters[i].CounterId = block.counter_ids[i]; - counter_props[NodeId]->Blocks[block_id].Counters[i].CounterSizeInBits = block.counter_size_in_bits; - counter_props[NodeId]->Blocks[block_id].Counters[i].CounterMask = block.counter_mask; - counter_props[NodeId]->Blocks[block_id].Counters[i].Flags.ui32.Global = 1; - counter_props[NodeId]->Blocks[block_id].Counters[i].Type = HSA_PROFILE_TYPE_NONPRIV_IMMEDIATE; - } + for (i = 0; i < block.num_of_counters; i++) { + counter_props[NodeId]->Blocks[entry].Counters[i].BlockIndex = block_id; + counter_props[NodeId]->Blocks[entry].Counters[i].CounterId = block.counter_ids[i]; + counter_props[NodeId]->Blocks[entry].Counters[i].CounterSizeInBits = block.counter_size_in_bits; + counter_props[NodeId]->Blocks[entry].Counters[i].CounterMask = block.counter_mask; + counter_props[NodeId]->Blocks[entry].Counters[i].Flags.ui32.Global = 1; + if (block_id == PERFCOUNTER_BLOCKID__IOMMUV2) + counter_props[NodeId]->Blocks[entry].Counters[i].Type = HSA_PROFILE_TYPE_PRIVILEGED_IMMEDIATE; + else + counter_props[NodeId]->Blocks[entry].Counters[i].Type = HSA_PROFILE_TYPE_NONPRIV_IMMEDIATE; } + entry++; } *CounterProperties = counter_props[NodeId]; diff --git a/src/pmc_table.c b/src/pmc_table.c index b0db2fea70..6465507c41 100644 --- a/src/pmc_table.c +++ b/src/pmc_table.c @@ -23,6 +23,11 @@ * DEALINGS IN THE SOFTWARE. */ +#include +#include +#include +#include +#include #include "libhsakmt.h" #include "pmc_table.h" @@ -182,15 +187,135 @@ static struct perf_counter_block polaris_blocks[PERFCOUNTER_BLOCKID__MAX] = { }, }; +/* Current APUs only have one IOMMU. If NUMA is introduced to APUs, we'll need + * to expand the struct here to an array. + */ +static struct perf_counter_block iommu_block; + +static HSAKMT_STATUS +alloc_pmc_blocks_iommu(void) +{ + DIR *dir; + struct dirent *dent; + const char sysfs_amdiommu_event_path[] = + "/sys/bus/event_source/devices/amd_iommu/events"; + /* Counter source in IOMMU's Counter Bank Addressing register is 8 bits, + * so the biggest counter number/id possible is 0xff. + */ + const int max_counter_id = 0xff; + char path[256]; + const int len = sizeof(path); + FILE *file; + int num; + char counter_id[max_counter_id + 1]; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + uint32_t *ptr; + struct perf_counter_block *block = &iommu_block; + + memset(block, 0, sizeof(struct perf_counter_block)); + + dir = opendir(sysfs_amdiommu_event_path); + if (!dir) + goto out; + + memset(counter_id, 0, max_counter_id + 1); + while ((dent = readdir(dir))) { + if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, "..")) + continue; + if (snprintf(path, len, "%s/%s", sysfs_amdiommu_event_path, + dent->d_name) >= len) { + fprintf(stderr, "Increase path length.\n"); + ret = HSAKMT_STATUS_NO_MEMORY; + goto out; + } + file = fopen(path, "r"); + if (!file) { + ret = HSAKMT_STATUS_ERROR; + goto out; + } + if (fscanf(file, "csource=0x%x", &num) != 1) { + ret = HSAKMT_STATUS_ERROR; + fclose(file); + goto out; + } + if (num > max_counter_id) + /* This should never happen. If it does, check IOMMU driver. */ + fprintf(stderr, + "Error: max_counter_id %d is set too small.\n", + max_counter_id); + else { + counter_id[num] = 1; + ++block->num_of_counters; + } + fclose(file); + } + + block->counter_ids = malloc(sizeof(uint32_t) * block->num_of_counters); + if (!block->counter_ids) { + ret = HSAKMT_STATUS_NO_MEMORY; + goto out; + } + ptr = block->counter_ids; + for (num = 0; num < (max_counter_id + 1); num++) { + if (counter_id[num]) { + ptr[0] = num; + ++ptr; + } + } + + if (snprintf(path, len, "%s/%d/%s", + "/sys/devices/virtual/kfd/kfd/topology/nodes", + 0, /* IOMMU is in node 0. Change this if NUMA is introduced to APU. */ + "perf/iommu/max_concurrent") >= len) { + fprintf(stderr, "Increase path length\n"); + ret = HSAKMT_STATUS_NO_MEMORY; + goto out; + }; + file = fopen(path, "r"); + if (!file) { + ret = HSAKMT_STATUS_ERROR; + goto out; + } + if (fscanf(file, "%d", &block->num_of_slots) != 1) + ret = HSAKMT_STATUS_ERROR; + fclose(file); + +out: + if (dir) + closedir(dir); + return ret; +} + +HSAKMT_STATUS alloc_pmc_blocks(void) +{ + return alloc_pmc_blocks_iommu(); +} + +void free_pmc_blocks(void) +{ + if (iommu_block.counter_ids) + free(iommu_block.counter_ids); + iommu_block.counter_ids = NULL; + iommu_block.num_of_counters = 0; +} + HSAKMT_STATUS -get_block_properties(uint16_t dev_id, - enum perf_block_id block_id, - struct perf_counter_block *block) +get_block_properties(uint32_t node_id, + enum perf_block_id block_id, + struct perf_counter_block *block) { HSAKMT_STATUS rc = HSAKMT_STATUS_SUCCESS; - if (block_id > PERFCOUNTER_BLOCKID__MAX || block_id < PERFCOUNTER_BLOCKID__FIRST) + uint16_t dev_id = get_device_id_by_node(node_id); + + if (block_id > PERFCOUNTER_BLOCKID__MAX || + block_id < PERFCOUNTER_BLOCKID__FIRST) return HSAKMT_STATUS_INVALID_PARAMETER; + if (block_id == PERFCOUNTER_BLOCKID__IOMMUV2) { + *block = iommu_block; + return HSAKMT_STATUS_SUCCESS; + } + /* To avoid the long list, we read the 12 most significant digits of DID * to identify the GPU instead of listing the complete 16 bits. If one * day 12-bits is not good enough to distinguish the GPU, change the diff --git a/src/pmc_table.h b/src/pmc_table.h index 820edae4e0..adc4af2298 100644 --- a/src/pmc_table.h +++ b/src/pmc_table.h @@ -31,6 +31,7 @@ enum perf_block_id { PERFCOUNTER_BLOCKID__FIRST = 0, PERFCOUNTER_BLOCKID__SQ = PERFCOUNTER_BLOCKID__FIRST, + PERFCOUNTER_BLOCKID__IOMMUV2, PERFCOUNTER_BLOCKID__MAX }; @@ -42,9 +43,12 @@ struct perf_counter_block { uint64_t counter_mask; }; +HSAKMT_STATUS alloc_pmc_blocks(void); +void free_pmc_blocks(void); + HSAKMT_STATUS -get_block_properties(uint16_t dev_id, - enum perf_block_id block_id, - struct perf_counter_block *block); +get_block_properties(uint32_t node_id, + enum perf_block_id block_id, + struct perf_counter_block *block); #endif // PMC_TABLE_H