diff --git a/projects/rocm-smi-lib/docs/ROCm_SMI_Manual.pdf b/projects/rocm-smi-lib/docs/ROCm_SMI_Manual.pdf new file mode 100644 index 0000000000..a51f7f90d9 Binary files /dev/null and b/projects/rocm-smi-lib/docs/ROCm_SMI_Manual.pdf differ diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 803417576b..8b434c213c 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -244,7 +244,9 @@ typedef enum { */ typedef enum { RSMI_CNTR_CMD_START = 0, //!< Start the counter - RSMI_CNTR_CMD_STOP, //!< Stop the counter + RSMI_CNTR_CMD_STOP, //!< Stop the counter; note that this should not + //!< be used before reading. It is for temporarily + //!< disabling the counter. } rsmi_counter_command_t; /** @@ -525,11 +527,11 @@ typedef enum { * @brief Types for IO Link */ typedef enum _RSMI_IO_LINK_TYPE { - RSMI_IOLINK_TYPE_UNDEFINED = 0, //!< unknown type. - RSMI_IOLINK_TYPE_PCIEXPRESS = 1, //!< PCI Express - RSMI_IOLINK_TYPE_XGMI = 2, //!< XGMI - RSMI_IOLINK_TYPE_NUMIOLINKTYPES, //!< Number of IO Link types - RSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF //!< Max of IO Link types + RSMI_IOLINK_TYPE_UNDEFINED = 0, //!< unknown type. + RSMI_IOLINK_TYPE_PCIEXPRESS = 1, //!< PCI Express + RSMI_IOLINK_TYPE_XGMI = 2, //!< XGMI + RSMI_IOLINK_TYPE_NUMIOLINKTYPES, //!< Number of IO Link types + RSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF //!< Max of IO Link types } RSMI_IO_LINK_TYPE; /** @@ -2333,6 +2335,98 @@ rsmi_status_string(rsmi_status_t status, const char **status_string); /** @defgroup PerfCntr Performance Counter Functions * These functions are used to configure, query and control performance * counting. + * + * These functions use the same mechanisms as the "perf" command line + * utility. They share the same underlying resources and have some similarities + * in how they are used. 
The events supported by this API should have + * corresponding perf events that can be seen with "perf stat ...". The events + * supported by perf can be seen with "perf list". + * + * The types of events available and the ability to count those + * events are dependent on which device is being targeted and if counters are + * still available for that device, respectively. + * ::rsmi_dev_counter_group_supported() can be used to see which event types + * (::rsmi_event_group_t) are supported for a given device. Assuming a device + * supports a given event type, we can then check to see if there are counters + * available to count a specific event with + * ::rsmi_counter_available_counters_get(). Counters may be occupied by other + * perf-based programs. + * + * Once it is determined that events are supported and counters are available, + * an event counter can be created/destroyed and controlled. + * + * ::rsmi_dev_counter_create() allocates internal data structures that will be + * used to control the event counter, and returns a handle to this data + * structure. + * + * Once an event counter handle is obtained, the event counter can be + * controlled (i.e., started, stopped,...) with ::rsmi_counter_control() by + * passing ::rsmi_counter_command_t commands. ::RSMI_CNTR_CMD_START starts an + * event counter and ::RSMI_CNTR_CMD_STOP stops a counter. + * ::rsmi_counter_read() reads an event counter. + * + * Once the counter is no longer needed, the resources it uses should be freed + * by calling ::rsmi_dev_counter_destroy(). + * + * + * Important Notes about Counter Values + * ==================================== + * - A running "absolute" counter is kept internally.
For the discussion that + * follows, we will call the internal counter value at time \a t + * \f$val_t\f$. + * - Issuing ::RSMI_CNTR_CMD_START or calling ::rsmi_counter_read() causes + * RSMI (in kernel) to internally record the current absolute counter value. + * - ::rsmi_counter_read() returns the number of events that have occurred + * since the previously recorded value (i.e., a relative value, + * \f$val_t - val_{t-1}\f$) from the issuing of + * ::RSMI_CNTR_CMD_START or calling ::rsmi_counter_read(). + * + * Example of event counting sequence: + * + * \latexonly + * \pagebreak + * \endlatexonly + * \code{.cpp} + * + * rsmi_counter_value_t value; + * + * // Determine if RSMI_EVNT_GRP_XGMI is supported for device dv_ind + * ret = rsmi_dev_counter_group_supported(dv_ind, RSMI_EVNT_GRP_XGMI); + * + * // See if there are counters available for device dv_ind for event + * // RSMI_EVNT_GRP_XGMI + * + * ret = rsmi_counter_available_counters_get(dv_ind, + * RSMI_EVNT_GRP_XGMI, &counters_available); + * + * // Assuming RSMI_EVNT_GRP_XGMI is supported and there is at least 1 + * // counter available for RSMI_EVNT_GRP_XGMI on device dv_ind, create + * // an event object and get the handle (rsmi_event_handle_t). + * + * ret = rsmi_dev_counter_create(dv_ind, RSMI_EVNT_GRP_XGMI, &evnt_handle); + * + * // A program that generates the events of interest can be started + * // immediately before or after starting the counters. + * // Start counting: + * ret = rsmi_counter_control(evnt_handle, RSMI_CNTR_CMD_START, NULL); + * + * // Wait... + * + * // Get the number of events since RSMI_CNTR_CMD_START was issued: + * ret = rsmi_counter_read(evnt_handle, &value); + * + * // Wait... + * + * // Get the number of events since rsmi_counter_read() was last called: + * ret = rsmi_counter_read(evnt_handle, &value); + * + * // Stop counting.
+ * ret = rsmi_counter_control(evnt_handle, RSMI_CNTR_CMD_STOP, NULL); + * + * // Release all resources (e.g., counter and memory resources) associated + * // with evnt_handle. + * ret = rsmi_dev_counter_destroy(evnt_handle); + * \endcode * @{ */ diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_io_link.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_io_link.h index cd10b30ad6..124fc5ec02 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_io_link.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_io_link.h @@ -48,6 +48,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi.h" diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h index f59950f15d..844b2e08bb 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h @@ -54,6 +54,7 @@ #include #include #include // NOLINT +#include #include "rocm_smi/rocm_smi_io_link.h" #include "rocm_smi/rocm_smi_kfd.h" diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 905290ea7a..9d595e1696 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -566,7 +566,7 @@ rsmi_init(uint64_t flags) { smi.Initialize(flags); } catch(...)
{ smi.Cleanup(); - throw; + throw amd::smi::rsmi_exception(RSMI_STATUS_INIT_ERROR, __FUNCTION__); } } refGuard.Dismiss(); @@ -2663,14 +2663,17 @@ rsmi_dev_counter_destroy(rsmi_event_handle_t evnt_handle) { return RSMI_STATUS_INVALID_ARGS; } + uint32_t ret = 0; amd::smi::evt::Event *evt = reinterpret_cast(evnt_handle); uint32_t dv_ind = evt->dev_ind(); DEVICE_MUTEX REQUIRE_ROOT_ACCESS + ret = evt->stopCounter(); + delete evt; - return RSMI_STATUS_SUCCESS; + return errno_to_rsmi_status(ret);; CATCH } @@ -2730,6 +2733,12 @@ rsmi_counter_read(rsmi_event_handle_t evt_handle, ret = evt->getValue(value); + // If value > 2^48, then an overflow has occurred. We need to discard this + // value and re-read: + if (ret == 0 && value->value > 0xFFFFFFFFFFFF) { + ret = evt->getValue(value); + } + return errno_to_rsmi_status(ret); CATCH } diff --git a/projects/rocm-smi-lib/src/rocm_smi_counters.cc b/projects/rocm-smi-lib/src/rocm_smi_counters.cc index 6541d956be..45ea210460 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_counters.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_counters.cc @@ -315,6 +315,7 @@ amd::smi::evt::Event::openPerfHandle(void) { attr_.size = sizeof(struct perf_event_attr); attr_.config = get_perf_attr_config(&event_info_); + attr_.sample_type = PERF_SAMPLE_IDENTIFIER; attr_.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; attr_.disabled = 1; diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/hw_topology_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/hw_topology_read.cc index 490adb6f37..532e5fce21 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/hw_topology_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/hw_topology_read.cc @@ -48,6 +48,7 @@ #include #include +#include #include "gtest/gtest.h" #include "rocm_smi/rocm_smi.h" @@ -62,7 +63,8 @@ typedef struct { TestHWTopologyRead::TestHWTopologyRead() : TestBase() { set_title("RSMI Hardware Topology Read Test"); - 
set_description("This test verifies that Hardware Topology can be read properly."); + set_description( + "This test verifies that Hardware Topology can be read properly."); } TestHWTopologyRead::~TestHWTopologyRead(void) { @@ -103,16 +105,20 @@ void TestHWTopologyRead::Run(void) { err = rsmi_num_monitor_devices(&num_devices); CHK_ERR_ASRT(err) - gpu_link_t gpu_links[num_devices][num_devices]; - uint32_t numa_numbers[num_devices]; + // gpu_link_t gpu_links[num_devices][num_devices]; + std::vector> gpu_links(num_devices, + std::vector(num_devices)); + // uint32_t numa_numbers[num_devices]; + std::vector numa_numbers(num_devices); - for (uint32_t dv_ind=0; dv_ind 1 event group -void -TestPerfCntrReadWrite::testEventsIndividually(uint32_t dv_ind) { + +void TestPerfCntrReadWrite::CountEvents(uint32_t dv_ind, + rsmi_event_type_t evnt, rsmi_counter_value_t *val, int32_t sleep_sec) { rsmi_event_handle_t evt_handle; rsmi_status_t ret; + + ret = rsmi_dev_counter_create(dv_ind, + static_cast(evnt), &evt_handle); + CHK_ERR_ASRT(ret) + + // Note that rsmi_dev_counter_create() should never return + // RSMI_STATUS_NOT_SUPPORTED. It will return RSMI_STATUS_OUT_OF_RESOURCES + // if it is unable to create a counter. 
+ ret = rsmi_dev_counter_create(dv_ind, + static_cast(evnt), nullptr); + ASSERT_EQ(ret, RSMI_STATUS_INVALID_ARGS); + + ret = rsmi_counter_control(evt_handle, RSMI_CNTR_CMD_START, nullptr); + CHK_ERR_ASRT(ret) + + sleep(sleep_sec); + + ret = rsmi_counter_read(evt_handle, val); + CHK_ERR_ASRT(ret) + + IF_VERB(STANDARD) { + std::cout << "\t\t\tValue: " << val->value << std::endl; + std::cout << "\t\t\tTime Enabled: " << val->time_enabled << std::endl; + std::cout << "\t\t\tTime Running: " << val->time_running << std::endl; + std::cout << "\t\t\tEvents/Second Running: " << + val->value/(float)val->time_running << std::endl; + } + ret = rsmi_dev_counter_destroy(evt_handle); + CHK_ERR_ASRT(ret) +} + +static const uint64_t kGig = 1000000000; + +static const uint64_t kVg20Level1Bandwidth = 23; // 23 GB/sec + + +void +TestPerfCntrReadWrite::testEventsIndividually(uint32_t dv_ind) { + rsmi_status_t ret; rsmi_counter_value_t val; + uint64_t throughput; + + auto utiliz = [&](rsmi_event_type_t evt, uint32_t chan) { + std::cout << "****************************" << std::endl; + std::cout << "Test XGMI Link Utilization (channel " << + chan << ")" << std::endl; + std::cout << "****************************" << std::endl; + std::cout << "Assumed Level 1 Bandwidth: " << + kVg20Level1Bandwidth << "GB/sec" << std::endl; + + uint32_t tmp_verbosity = verbosity(); + set_verbosity(0); + for (int i = 0; i < 5; ++i) { + std::cout << "\t\tPass " << i << ":" << std::endl; + + CountEvents(dv_ind, evt, &val, 1); + double coll_time_sec = static_cast(val.time_running)/kGig; + throughput = (val.value * 32)/coll_time_sec; + std::cout << "\t\t\tCollected events for " << coll_time_sec << + " seconds" << std::endl; + std::cout << "\t\t\tEvents collected: " << val.value << std::endl; + std::cout << "\t\t\tXGMI throughput: " << throughput << + " bytes/second" << std::endl; + std::cout << "\t\t\tXGMI Channel Utilization: " << + 100*throughput/ (float)(kVg20Level1Bandwidth*kGig) << + "%" << std::endl; + 
std::cout << "\t\t\t****" << std::endl; + } + set_verbosity(tmp_verbosity); + }; + + utiliz(RSMI_EVNT_XGMI_1_BEATS_TX, 1); + utiliz(RSMI_EVNT_XGMI_0_BEATS_TX, 0); std::cout << "****************************" << std::endl; std::cout << "Test each event individually" << std::endl; @@ -131,71 +204,11 @@ TestPerfCntrReadWrite::testEventsIndividually(uint32_t dv_ind) { IF_VERB(STANDARD) { std::cout << "\tTesting Event Type " << evnt << std::endl; } - - IF_VERB(STANDARD) { - std::cout << "\t\tCreating event..." << std::endl; - } - ret = rsmi_dev_counter_create(dv_ind, - static_cast(evnt), &evt_handle); - CHK_ERR_ASRT(ret) - - // Note that rsmi_dev_counter_create() should never return - // RSMI_STATUS_NOT_SUPPORTED. It will return RSMI_STATUS_OUT_OF_RESOURCES - // if it is unable to create a counter. - ret = rsmi_dev_counter_create(dv_ind, - static_cast(evnt), nullptr); - ASSERT_EQ(ret, RSMI_STATUS_INVALID_ARGS); - - IF_VERB(STANDARD) { - std::cout << "\t\tStart Counting..." << std::endl; - } - ret = rsmi_counter_control(evt_handle, RSMI_CNTR_CMD_START, nullptr); - CHK_ERR_ASRT(ret) - - sleep(1); - IF_VERB(STANDARD) { - std::cout << "\t\tStop Counting..." << std::endl; - } - ret = rsmi_counter_control(evt_handle, RSMI_CNTR_CMD_STOP, nullptr); - CHK_ERR_ASRT(ret) - - IF_VERB(STANDARD) { - std::cout << "\t\tRead Counter..." << std::endl; - } - ret = rsmi_counter_read(evt_handle, &val); - CHK_ERR_ASRT(ret) - - IF_VERB(STANDARD) { - std::cout << "\t\tSuccessfully read value: " << std::endl; - std::cout << "\t\t\tValue: " << val.value << std::endl; - std::cout << "\t\t\tTime Enabled: " << val.time_enabled << std::endl; - std::cout << "\t\t\tTime Running: " << val.time_running << std::endl; - } - IF_VERB(STANDARD) { - std::cout << "\t\tRe-start Counting..." << std::endl; - } - ret = rsmi_counter_control(evt_handle, RSMI_CNTR_CMD_START, nullptr); - CHK_ERR_ASRT(ret) - - IF_VERB(STANDARD) { - std::cout << "\t\tRead free-running Counter..." 
<< std::endl; - } - ret = rsmi_counter_read(evt_handle, &val); - CHK_ERR_ASRT(ret) - - IF_VERB(STANDARD) { - std::cout << "\t\tSuccessfully read value: " << std::endl; - std::cout << "\t\t\tValue: " << val.value << std::endl; - std::cout << "\t\t\tTime Enabled: " << val.time_enabled << std::endl; - std::cout << "\t\t\tTime Running: " << val.time_running << std::endl; - } - ret = rsmi_dev_counter_destroy(evt_handle); - CHK_ERR_ASRT(ret) + CountEvents(dv_ind, static_cast(evnt), &val); } } } -// Refactor this to handle different event groups once we have > 1 event group void TestPerfCntrReadWrite::testEventsSimultaneously(uint32_t dv_ind) { rsmi_event_handle_t evt_handle[RSMI_EVNT_XGMI_LAST - @@ -261,19 +274,10 @@ TestPerfCntrReadWrite::testEventsSimultaneously(uint32_t dv_ind) { ret = rsmi_counter_available_counters_get(dv_ind, grp.group(), &tmp_cntrs); CHK_ERR_ASRT(ret) - ASSERT_EQ(tmp_cntrs, (avail_counters - j -1)); + ASSERT_EQ(tmp_cntrs, (avail_counters - j - 1)); } - sleep(5); - IF_VERB(STANDARD) { - std::cout << "\tStop Counters..." << std::endl; - } - for (uint32_t j = 0; j < avail_counters; ++j) { - tmp = static_cast(evnt + j); - ret = rsmi_counter_control(evt_handle[tmp], RSMI_CNTR_CMD_STOP, - nullptr); - CHK_ERR_ASRT(ret) - } + sleep(1); IF_VERB(STANDARD) { std::cout << "\tRead Counters..." 
<< std::endl; diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/perf_cntr_read_write.h b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/perf_cntr_read_write.h index 7762aa521a..337fca3ba6 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/perf_cntr_read_write.h +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/perf_cntr_read_write.h @@ -72,6 +72,9 @@ class TestPerfCntrReadWrite : public TestBase { virtual void DisplayTestInfo(void); private: + void CountEvents(uint32_t dv_ind, + rsmi_event_type_t evnt, rsmi_counter_value_t *val, + int32_t sleep_sec = 1); void testEventsIndividually(uint32_t dv_ind); void testEventsSimultaneously(uint32_t dv_ind); };