From 39ad1ba008fb64a65cf5f2bcb3ad870af0a530bc Mon Sep 17 00:00:00 2001 From: Amber Lin Date: Wed, 15 Feb 2017 11:32:49 -0500 Subject: [PATCH] Implement PMC AcquireTrace Existing code uses lockf to ensure exclusive PMC access of one process and one TraceId. However Thunk spec allows hsaKmtPmcAcquireTraceAccess to get exclusive access to the defined set of counters, not exclusive to one process or one TraceId. Multiple counter sets of multiple TraceIds is allowed if they meet the concurrent access limit evaluated by the hardware /driver. Change-Id: I59cacb855a707fe326a4070452fcbbd3c95ac223 [ROCm/ROCR-Runtime commit: 1025579c0bb171cbd109830e65955f6d6aa92989] --- projects/rocr-runtime/src/openclose.c | 13 +- projects/rocr-runtime/src/perfctr.c | 309 +++++++++++++++++++++++--- projects/rocr-runtime/src/pmc_table.c | 10 + projects/rocr-runtime/src/pmc_table.h | 1 + 4 files changed, 300 insertions(+), 33 deletions(-) diff --git a/projects/rocr-runtime/src/openclose.c b/projects/rocr-runtime/src/openclose.c index 8901cffcdc..9983ab098b 100644 --- a/projects/rocr-runtime/src/openclose.c +++ b/projects/rocr-runtime/src/openclose.c @@ -113,13 +113,16 @@ hsaKmtOpenKFD(void) if (init_device_debugging_memory(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS) printf("Insufficient Memory. Debugging unavailable\n"); - if (init_counter_props(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS) - printf("Insufficient Memory. Performance Counter information unavailable\n"); - amd_hsa_thunk_lock_fd = open(tmp_file, O_CREAT | //create the file if it's not present. - O_RDWR, //only need write access for the internal locking semantics. - S_IRUSR | S_IWUSR); //permissions on the file, 600 here. + O_RDWR, + S_IROTH | S_IWOTH); //allow others to read/write + if (amd_hsa_thunk_lock_fd < 0) + fprintf(stderr, + "Profiling of privileged counters is not available\n"); + if (init_counter_props(sys_props.NumNodes) != + HSAKMT_STATUS_SUCCESS) + fprintf(stderr, "Profiling is not available\n"); } else { diff --git a/projects/rocr-runtime/src/perfctr.c b/projects/rocr-runtime/src/perfctr.c index 1dbcd856e4..669f801d7b 100644 --- a/projects/rocr-runtime/src/perfctr.c +++ b/projects/rocr-runtime/src/perfctr.c @@ -25,10 +25,15 @@ #include #include +#include +#include +#include #include "libhsakmt.h" #include "pmc_table.h" #include "linux/kfd_ioctl.h" #include +#include +#include #define BITS_PER_BYTE CHAR_BIT @@ -43,6 +48,7 @@ struct perf_trace_block { enum perf_block_id block_id; uint32_t num_counters; uint64_t *counter_id; + int *perf_event_fd; }; struct perf_trace { @@ -53,11 +59,84 @@ struct perf_trace { struct perf_trace_block blocks[0]; }; +enum perf_trace_action { + PERF_TRACE_ACTION__ACQUIRE = 0, + PERF_TRACE_ACTION__RELEASE +}; + +struct perf_lockf_tbl { + uint32_t magic4cc; + uint32_t iommu_slots_left; +}; + +struct perf_counts_values { + union { + struct { + u64 val; + u64 ena; + u64 run; + }; + u64 values[3]; + }; +}; + extern int amd_hsa_thunk_lock_fd; static HsaCounterProperties **counter_props; static unsigned int counter_props_count; +static ssize_t readn(int fd, void *buf, size_t n) +{ + size_t left = n; + ssize_t bytes; + + while (left) { + bytes = read(fd, buf, left); + if (bytes < 0 && errno == EINTR) /* read got interrupted */ + continue; + if (bytes <= 0) + return (n - left); + left -= bytes; + buf = VOID_PTR_ADD(buf, bytes); + } + return n; +} + +static HSAKMT_STATUS init_lockf_perf_section(void) +{ + struct perf_lockf_tbl lockf_tbl; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + if (amd_hsa_thunk_lock_fd <= 0) + return HSAKMT_STATUS_UNAVAILABLE; + + if (lockf(amd_hsa_thunk_lock_fd, F_TLOCK, sizeof(lockf_tbl))) + return HSAKMT_STATUS_ERROR; + + memset(&lockf_tbl, 0, sizeof(lockf_tbl)); + if (readn(amd_hsa_thunk_lock_fd, &lockf_tbl, sizeof(lockf_tbl)) < 0) { + ret = HSAKMT_STATUS_ERROR; + goto out; + } + /* If the magic number exists, the lock file table has been + * initialized by another process and is in use. Don't overwrite it. + */ + if (lockf_tbl.magic4cc == HSA_PERF_MAGIC4CC) + goto out; + /* write the perf content */ + lockf_tbl.magic4cc = HSA_PERF_MAGIC4CC; + lockf_tbl.iommu_slots_left = + pmc_table_get_max_concurrent(PERFCOUNTER_BLOCKID__IOMMUV2); + if (write(amd_hsa_thunk_lock_fd, &lockf_tbl, sizeof(lockf_tbl)) < 0) + ret = HSAKMT_STATUS_ERROR; +out: + /* unlock the perf section */ + if (lockf(amd_hsa_thunk_lock_fd, F_ULOCK, sizeof(lockf_tbl))) + ret = HSAKMT_STATUS_ERROR; + + return ret; +} + HSAKMT_STATUS init_counter_props(unsigned int NumNodes) { counter_props = calloc(NumNodes, sizeof(struct HsaCounterProperties*)); @@ -67,6 +146,7 @@ HSAKMT_STATUS init_counter_props(unsigned int NumNodes) counter_props_count = NumNodes; alloc_pmc_blocks(); + init_lockf_perf_section(); return HSAKMT_STATUS_SUCCESS; } @@ -121,6 +201,149 @@ static HSAuint32 get_block_concurrent_limit(uint32_t node_id, return 0; } +static HSAKMT_STATUS update_block_slots(enum perf_trace_action action, + uint32_t block_id, uint32_t num_slots) +{ + struct perf_lockf_tbl lockf_tbl; + uint32_t *slots_left; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + if (amd_hsa_thunk_lock_fd <= 0) + return HSAKMT_STATUS_UNAVAILABLE; + + if (lockf(amd_hsa_thunk_lock_fd, F_TLOCK, sizeof(lockf_tbl)) < 0) + return HSAKMT_STATUS_ERROR; + + if (readn(amd_hsa_thunk_lock_fd, &lockf_tbl, sizeof(lockf_tbl)) < 0) { + ret = HSAKMT_STATUS_ERROR; + goto out; + } + + if (block_id == PERFCOUNTER_BLOCKID__IOMMUV2) + slots_left = &lockf_tbl.iommu_slots_left; + else { + ret = HSAKMT_STATUS_UNAVAILABLE; + goto out; + } + + switch (action) { + case PERF_TRACE_ACTION__ACQUIRE: + if (*slots_left >= num_slots) + *slots_left -= num_slots; + else + ret = HSAKMT_STATUS_UNAVAILABLE; + break; + case PERF_TRACE_ACTION__RELEASE: + if ((*slots_left + num_slots) <= + pmc_table_get_max_concurrent(block_id)) + *slots_left += num_slots; + else + ret = HSAKMT_STATUS_ERROR; + break; + default: + ret = HSAKMT_STATUS_INVALID_PARAMETER; + break; + } + + if (ret == HSAKMT_STATUS_SUCCESS) { + if (write(amd_hsa_thunk_lock_fd, &lockf_tbl, sizeof(lockf_tbl)) < 0) + ret = HSAKMT_STATUS_ERROR; + } +out: + /* unlock the perf section */ + if (lockf(amd_hsa_thunk_lock_fd, F_ULOCK, sizeof(lockf_tbl))) + ret = HSAKMT_STATUS_ERROR; + + return ret; +} + +static unsigned int get_perf_event_type(enum perf_block_id block_id) +{ + FILE *file = NULL; + unsigned int type = 0; + + if (block_id == PERFCOUNTER_BLOCKID__IOMMUV2) + file = fopen("/sys/bus/event_source/devices/amd_iommu/type", + "r"); + if (!file) + return 0; + + if (fscanf(file, "%d", &type) != 1) + type = 0; + fclose(file); + + return type; +} + +/* close_perf_event_fd - Close all FDs opened for this block. + * When RT acquires the trace access, RT has no ideas about each + * individual FD opened for this block. We should treat the whole + * block as one and close all of them. + */ +static void close_perf_event_fd(struct perf_trace_block *block) +{ + uint32_t i; + + if (!block || !block->perf_event_fd) + return; + + for (i = 0; i < block->num_counters; i++) + if (block->perf_event_fd[i] > 0) { + close(block->perf_event_fd[i]); + block->perf_event_fd[i] = 0; + } +} + +/* open_perf_event_fd - Open FDs required for this block. + * If one of them fails, we should close all FDs that have been + * opened because RT has no ideas about those FDs successfully + * opened and it won't send anything to close them. + */ +static HSAKMT_STATUS open_perf_event_fd(struct perf_trace_block *block) +{ + struct perf_event_attr attr; + uint32_t i; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + if (!block || !block->perf_event_fd) + return HSAKMT_STATUS_INVALID_HANDLE; + + if (getuid()) { + fprintf(stderr, + "Error. Must be root to open perf_event.\n"); + return HSAKMT_STATUS_ERROR; + } + + memset(&attr, 0, sizeof(struct perf_event_attr)); + attr.type = get_perf_event_type(block->block_id); + if (!attr.type) + return HSAKMT_STATUS_ERROR; + + for (i = 0; i < block->num_counters; i++) { + attr.size = sizeof(struct perf_event_attr); + attr.config = block->counter_id[i]; + attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; + attr.disabled = 1; + attr.inherit = 1; + + /* We are profiling system wide, not per cpu, so no threads, + * no groups -> pid=-1 and group_fd=-1. cpu = 0 + * flags=PERF_FLAG_FD_NO_GROUP + */ + block->perf_event_fd[i] = syscall(__NR_perf_event_open, &attr, + -1, 0, -1, PERF_FLAG_FD_NO_GROUP); + + if (block->perf_event_fd[i] < 0) { + ret = HSAKMT_STATUS_ERROR; + close_perf_event_fd(block); + break; + } + } + + return ret; +} + HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties( @@ -232,6 +455,7 @@ hsaKmtPmcRegisterTrace( uint32_t num_counters[PERFCOUNTER_BLOCKID__MAX] = {0}; uint32_t block, num_blocks = 0, total_counters = 0; uint64_t *counter_id_ptr; + int *fd_ptr; if (counter_props == NULL) return HSAKMT_STATUS_NO_MEMORY; @@ -290,15 +514,34 @@ hsaKmtPmcRegisterTrace( */ trace = (struct perf_trace *)calloc(sizeof(struct perf_trace) + sizeof(struct perf_trace_block) * num_blocks - + sizeof(uint64_t) * total_counters, + + sizeof(uint64_t) * total_counters + + sizeof(int) * total_counters, 1); if (trace == NULL) return HSAKMT_STATUS_NO_MEMORY; + /* Allocated area is partitioned as: + * +---------------------------------+ trace + * | perf_trace | + * |---------------------------------| trace->blocks[0] + * | perf_trace_block 0 | + * | .... | + * | perf_trace_block N-1 | trace->blocks[N-1] + * |---------------------------------| <-- counter_id_ptr starts here + * | block 0's counter IDs(uint64_t) | + * | ...... | + * | block N-1's counter IDs | + * |---------------------------------| <-- perf_event_fd starts here + * | block 0's perf_event_fds(int) | + * | ...... | + * | block N-1's perf_event_fds | + * +---------------------------------+ + */ block = 0; counter_id_ptr = (uint64_t *)((char *) trace + sizeof(struct perf_trace) + sizeof(struct perf_trace_block) * num_blocks); + fd_ptr = (int *)(counter_id_ptr + total_counters); /* Fill in each block's information to the TraceId */ for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) { if (!num_counters[i]) /* not a block to trace */ @@ -311,12 +554,14 @@ hsaKmtPmcRegisterTrace( /* Fill in counter IDs to the counter_id array. */ for (j = 0; j < num_counters[i]; j++) trace->blocks[block].counter_id[j] = counter_id[i][j]; + trace->blocks[block].perf_event_fd = fd_ptr; /* how many counters to trace */ trace->blocks[block].num_counters = num_counters[i]; /* block index in "enum perf_block_id" */ trace->blocks[block].block_id = i; block++; /* move to next */ counter_id_ptr += num_counters[i]; + fd_ptr += num_counters[i]; } trace->magic4cc = HSA_PERF_MAGIC4CC; @@ -371,12 +616,6 @@ hsaKmtPmcUnregisterTrace( return HSAKMT_STATUS_SUCCESS; } - -/** - Allows a user mode process to get exclusive access to the defined set of (HW) counters - used for tracing/profiling -*/ - HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess( @@ -385,6 +624,9 @@ hsaKmtPmcAcquireTraceAccess( ) { struct perf_trace *trace; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + uint32_t gpu_id, i; + int j; if (TraceId == 0) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -394,23 +636,35 @@ hsaKmtPmcAcquireTraceAccess( if (trace->magic4cc != HSA_PERF_MAGIC4CC) return HSAKMT_STATUS_INVALID_HANDLE; - if (amd_hsa_thunk_lock_fd > 0) { - if (lockf( amd_hsa_thunk_lock_fd, F_TLOCK, 0 ) != 0) - return HSAKMT_STATUS_ERROR; - else - return HSAKMT_STATUS_SUCCESS; + if (validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + for (i = 0; i < trace->num_blocks; i++) { + ret = update_block_slots(PERF_TRACE_ACTION__ACQUIRE, + trace->blocks[i].block_id, + trace->blocks[i].num_counters); + if (ret != HSAKMT_STATUS_SUCCESS) + goto out; + ret = open_perf_event_fd(&trace->blocks[i]); + if (ret != HSAKMT_STATUS_SUCCESS) { + i++; /* to release slots just reserved */ + goto out; + } } - else { - return HSAKMT_STATUS_ERROR; + +out: + if (ret != HSAKMT_STATUS_SUCCESS) { + for (j = i-1; j >= 0; j--) { + update_block_slots(PERF_TRACE_ACTION__RELEASE, + trace->blocks[j].block_id, + trace->blocks[j].num_counters); + close_perf_event_fd(&trace->blocks[j]); + } } + + return ret; } - -/** - Allows a user mode process to release exclusive access to the defined set of (HW) counters - used for tracing/profiling -*/ - HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess( @@ -419,6 +673,7 @@ hsaKmtPmcReleaseTraceAccess( ) { struct perf_trace *trace; + uint32_t i; if (TraceId == 0) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -428,16 +683,14 @@ hsaKmtPmcReleaseTraceAccess( if (trace->magic4cc != HSA_PERF_MAGIC4CC) return HSAKMT_STATUS_INVALID_HANDLE; - if (amd_hsa_thunk_lock_fd > 0) { - if (lockf( amd_hsa_thunk_lock_fd, F_ULOCK, 0 ) != 0) - return HSAKMT_STATUS_ERROR; - else - return HSAKMT_STATUS_SUCCESS; - } - else { - return HSAKMT_STATUS_ERROR; + for (i = 0; i < trace->num_blocks; i++) { + update_block_slots(PERF_TRACE_ACTION__RELEASE, + trace->blocks[i].block_id, + trace->blocks[i].num_counters); + close_perf_event_fd(&trace->blocks[i]); } + return HSAKMT_STATUS_SUCCESS; } diff --git a/projects/rocr-runtime/src/pmc_table.c b/projects/rocr-runtime/src/pmc_table.c index 6465507c41..5bd16c44cf 100644 --- a/projects/rocr-runtime/src/pmc_table.c +++ b/projects/rocr-runtime/src/pmc_table.c @@ -192,6 +192,16 @@ static struct perf_counter_block polaris_blocks[PERFCOUNTER_BLOCKID__MAX] = { */ static struct perf_counter_block iommu_block; +uint32_t pmc_table_get_max_concurrent(int block_id) +{ + switch (block_id) { + case PERFCOUNTER_BLOCKID__IOMMUV2: + return iommu_block.num_of_slots; + default: + return 0; + } +} + static HSAKMT_STATUS alloc_pmc_blocks_iommu(void) { diff --git a/projects/rocr-runtime/src/pmc_table.h b/projects/rocr-runtime/src/pmc_table.h index adc4af2298..2aa1002bdb 100644 --- a/projects/rocr-runtime/src/pmc_table.h +++ b/projects/rocr-runtime/src/pmc_table.h @@ -45,6 +45,7 @@ struct perf_counter_block { HSAKMT_STATUS alloc_pmc_blocks(void); void free_pmc_blocks(void); +uint32_t pmc_table_get_max_concurrent(int block_id); HSAKMT_STATUS get_block_properties(uint32_t node_id,