Adam/bill cleanup (#209)

Co-authored-by: Bill(Shuzhou) Liu <shuzhou.liu@amd.com>


[ROCm/rdc commit: ca9d8c4bae]
This commit is contained in:
Pryor, Adam
2025-07-07 15:41:22 -05:00
gecommit door GitHub
bovenliggende a03fbdd66a
commit 07346922f5
3 gewijzigde bestanden met 95 toevoegingen en 68 verwijderingen
@@ -81,11 +81,15 @@ class RdcRocpBase {
*/
rdc_status_t map_entity_to_profiler();
// Performs the deferred profiler setup. The definition returns immediately
// when m_is_initialized is already set; otherwise it sets the flag and then
// calls hsa_init().
void init_rocp_if_not();
std::vector<rocprofiler_agent_v0_t> agents = {};
std::vector<std::shared_ptr<CounterSampler>> samplers = {};
// Field id -> metric-name string. The constructor fills this with every
// entry of temp_field_map_k.
std::map<rdc_field_t, const char*> field_to_metric = {};
std::map<uint32_t, uint32_t> entity_to_prof_map = {};
// Set to true by init_rocp_if_not() before it calls hsa_init(), so that
// initialization is attempted only once, even if it fails.
bool m_is_initialized = false;
// these fields must be divided by time passed
std::unordered_set<rdc_field_t> eval_fields = {
RDC_FI_PROF_EVAL_MEM_R_BW, RDC_FI_PROF_EVAL_MEM_W_BW,
@@ -51,6 +51,69 @@ THE SOFTWARE.
namespace amd {
namespace rdc {
// Table of every profiler field id paired with the metric-name string used for
// it. The RdcRocpBase constructor copies all entries into field_to_metric.
// Several field ids intentionally share one metric name (for example
// "GRBM_GUI_ACTIVE" and "SQ_WAVES" each appear more than once).
static const std::map<rdc_field_t, const char*> temp_field_map_k = {
{RDC_FI_PROF_OCCUPANCY_PERCENT, "OccupancyPercent"},
{RDC_FI_PROF_ACTIVE_CYCLES, "GRBM_GUI_ACTIVE"},
{RDC_FI_PROF_ACTIVE_WAVES, "SQ_WAVES"},
{RDC_FI_PROF_ELAPSED_CYCLES, "GRBM_COUNT"},
{RDC_FI_PROF_TENSOR_ACTIVE_PERCENT,
"MfmaUtil"}, // same as TENSOR_ACTIVE but available for more GPUs
{RDC_FI_PROF_GPU_UTIL_PERCENT, "GPU_UTIL"}, // metric is divided by 100 to get percent
// metrics below are divided by time passed
{RDC_FI_PROF_EVAL_MEM_R_BW, "FETCH_SIZE"},
{RDC_FI_PROF_EVAL_MEM_W_BW, "WRITE_SIZE"},
{RDC_FI_PROF_EVAL_FLOPS_16, "TOTAL_16_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_32, "TOTAL_32_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_64, "TOTAL_64_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "RDC_OPS_16_PER_SIMDCYCLE"},
{RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "RDC_OPS_32_PER_SIMDCYCLE"},
{RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "RDC_OPS_64_PER_SIMDCYCLE"},
// metrics below are not divided by time passed
{RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "ValuPipeIssueUtil"},
{RDC_FI_PROF_SM_ACTIVE, "VALUBusy"},
{RDC_FI_PROF_OCC_PER_ACTIVE_CU, "MeanOccupancyPerActiveCU"},
{RDC_FI_PROF_OCC_ELAPSED,
"GRBM_GUI_ACTIVE"}, // this metric is derived from OCC_PER_ACTIVE_CU and ACTIVE_CYCLES
// CPC_* fields: the metric name is the field id without the RDC_FI_PROF_ prefix
{RDC_FI_PROF_CPC_CPC_STAT_BUSY, "CPC_CPC_STAT_BUSY"},
{RDC_FI_PROF_CPC_CPC_STAT_IDLE, "CPC_CPC_STAT_IDLE"},
{RDC_FI_PROF_CPC_CPC_STAT_STALL, "CPC_CPC_STAT_STALL"},
{RDC_FI_PROF_CPC_CPC_TCIU_BUSY, "CPC_CPC_TCIU_BUSY"},
{RDC_FI_PROF_CPC_CPC_TCIU_IDLE, "CPC_CPC_TCIU_IDLE"},
{RDC_FI_PROF_CPC_CPC_UTCL2IU_BUSY, "CPC_CPC_UTCL2IU_BUSY"},
{RDC_FI_PROF_CPC_CPC_UTCL2IU_IDLE, "CPC_CPC_UTCL2IU_IDLE"},
{RDC_FI_PROF_CPC_CPC_UTCL2IU_STALL, "CPC_CPC_UTCL2IU_STALL"},
{RDC_FI_PROF_CPC_ME1_BUSY_FOR_PACKET_DECODE, "CPC_ME1_BUSY_FOR_PACKET_DECODE"},
{RDC_FI_PROF_CPC_ME1_DC0_SPI_BUSY, "CPC_ME1_DC0_SPI_BUSY"},
{RDC_FI_PROF_CPC_UTCL1_STALL_ON_TRANSLATION, "CPC_UTCL1_STALL_ON_TRANSLATION"},
{RDC_FI_PROF_CPC_ALWAYS_COUNT, "CPC_ALWAYS_COUNT"},
{RDC_FI_PROF_CPC_ADC_VALID_CHUNK_NOT_AVAIL, "CPC_ADC_VALID_CHUNK_NOT_AVAIL"},
{RDC_FI_PROF_CPC_ADC_DISPATCH_ALLOC_DONE, "CPC_ADC_DISPATCH_ALLOC_DONE"},
{RDC_FI_PROF_CPC_ADC_VALID_CHUNK_END, "CPC_ADC_VALID_CHUNK_END"},
{RDC_FI_PROF_CPC_SYNC_FIFO_FULL_LEVEL, "CPC_SYNC_FIFO_FULL_LEVEL"},
{RDC_FI_PROF_CPC_SYNC_FIFO_FULL, "CPC_SYNC_FIFO_FULL"},
{RDC_FI_PROF_CPC_GD_BUSY, "CPC_GD_BUSY"},
{RDC_FI_PROF_CPC_TG_SEND, "CPC_TG_SEND"},
{RDC_FI_PROF_CPC_WALK_NEXT_CHUNK, "CPC_WALK_NEXT_CHUNK"},
{RDC_FI_PROF_CPC_STALLED_BY_SE0_SPI, "CPC_STALLED_BY_SE0_SPI"},
{RDC_FI_PROF_CPC_STALLED_BY_SE1_SPI, "CPC_STALLED_BY_SE1_SPI"},
{RDC_FI_PROF_CPC_STALLED_BY_SE2_SPI, "CPC_STALLED_BY_SE2_SPI"},
{RDC_FI_PROF_CPC_STALLED_BY_SE3_SPI, "CPC_STALLED_BY_SE3_SPI"},
{RDC_FI_PROF_CPC_LTE_ALL, "CPC_LTE_ALL"},
{RDC_FI_PROF_CPC_SYNC_WRREQ_FIFO_BUSY, "CPC_SYNC_WRREQ_FIFO_BUSY"},
{RDC_FI_PROF_CPC_CANE_BUSY, "CPC_CANE_BUSY"},
{RDC_FI_PROF_CPC_CANE_STALL, "CPC_CANE_STALL"},
// CPF_* fields: the metric name is the field id without the RDC_FI_PROF_ prefix
{RDC_FI_PROF_CPF_CMP_UTCL1_STALL_ON_TRANSLATION, "CPF_CMP_UTCL1_STALL_ON_TRANSLATION"},
{RDC_FI_PROF_CPF_CPF_STAT_BUSY, "CPF_CPF_STAT_BUSY"},
{RDC_FI_PROF_CPF_CPF_STAT_IDLE, "CPF_CPF_STAT_IDLE"},
{RDC_FI_PROF_CPF_CPF_STAT_STALL, "CPF_CPF_STAT_STALL"},
{RDC_FI_PROF_CPF_CPF_TCIU_BUSY, "CPF_CPF_TCIU_BUSY"},
{RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"},
{RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"},
{RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"},
{RDC_FI_PROF_UUID, "SQ_WAVES"}, // dummy value (placeholder metric name)
{RDC_FI_PROF_KFD_ID, "SQ_WAVES"}, // dummy value (placeholder metric name)
};
double RdcRocpBase::run_profiler(uint32_t agent_index, rdc_field_t field) {
thread_local std::vector<rocprofiler_record_counter_t> records;
@@ -200,70 +263,13 @@ rdc_status_t RdcRocpBase::map_entity_to_profiler() {
return RDC_ST_OK;
}
RdcRocpBase::RdcRocpBase() {
// all fields
static const std::map<rdc_field_t, const char*> temp_field_map_k = {
{RDC_FI_PROF_OCCUPANCY_PERCENT, "OccupancyPercent"},
{RDC_FI_PROF_ACTIVE_CYCLES, "GRBM_GUI_ACTIVE"},
{RDC_FI_PROF_ACTIVE_WAVES, "SQ_WAVES"},
{RDC_FI_PROF_ELAPSED_CYCLES, "GRBM_COUNT"},
{RDC_FI_PROF_TENSOR_ACTIVE_PERCENT,
"MfmaUtil"}, // same as TENSOR_ACTIVE but available for more GPUs
{RDC_FI_PROF_GPU_UTIL_PERCENT, "GPU_UTIL"}, // metric is divided by 100 to get percent
// metrics below are divided by time passed
{RDC_FI_PROF_EVAL_MEM_R_BW, "FETCH_SIZE"},
{RDC_FI_PROF_EVAL_MEM_W_BW, "WRITE_SIZE"},
{RDC_FI_PROF_EVAL_FLOPS_16, "TOTAL_16_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_32, "TOTAL_32_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_64, "TOTAL_64_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "RDC_OPS_16_PER_SIMDCYCLE"},
{RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "RDC_OPS_32_PER_SIMDCYCLE"},
{RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "RDC_OPS_64_PER_SIMDCYCLE"},
// metrics below are not divided by time passed
{RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "ValuPipeIssueUtil"},
{RDC_FI_PROF_SM_ACTIVE, "VALUBusy"},
{RDC_FI_PROF_OCC_PER_ACTIVE_CU, "MeanOccupancyPerActiveCU"},
{RDC_FI_PROF_OCC_ELAPSED, "GRBM_GUI_ACTIVE"}, // this metric is derived from
// OCC_PER_ACTIVE_CU and ACTIVE_CYCLES
{RDC_FI_PROF_CPC_CPC_STAT_BUSY, "CPC_CPC_STAT_BUSY"},
{RDC_FI_PROF_CPC_CPC_STAT_IDLE, "CPC_CPC_STAT_IDLE"},
{RDC_FI_PROF_CPC_CPC_STAT_STALL, "CPC_CPC_STAT_STALL"},
{RDC_FI_PROF_CPC_CPC_TCIU_BUSY, "CPC_CPC_TCIU_BUSY"},
{RDC_FI_PROF_CPC_CPC_TCIU_IDLE, "CPC_CPC_TCIU_IDLE"},
{RDC_FI_PROF_CPC_CPC_UTCL2IU_BUSY, "CPC_CPC_UTCL2IU_BUSY"},
{RDC_FI_PROF_CPC_CPC_UTCL2IU_IDLE, "CPC_CPC_UTCL2IU_IDLE"},
{RDC_FI_PROF_CPC_CPC_UTCL2IU_STALL, "CPC_CPC_UTCL2IU_STALL"},
{RDC_FI_PROF_CPC_ME1_BUSY_FOR_PACKET_DECODE, "CPC_ME1_BUSY_FOR_PACKET_DECODE"},
{RDC_FI_PROF_CPC_ME1_DC0_SPI_BUSY, "CPC_ME1_DC0_SPI_BUSY"},
{RDC_FI_PROF_CPC_UTCL1_STALL_ON_TRANSLATION, "CPC_UTCL1_STALL_ON_TRANSLATION"},
{RDC_FI_PROF_CPC_ALWAYS_COUNT, "CPC_ALWAYS_COUNT"},
{RDC_FI_PROF_CPC_ADC_VALID_CHUNK_NOT_AVAIL, "CPC_ADC_VALID_CHUNK_NOT_AVAIL"},
{RDC_FI_PROF_CPC_ADC_DISPATCH_ALLOC_DONE, "CPC_ADC_DISPATCH_ALLOC_DONE"},
{RDC_FI_PROF_CPC_ADC_VALID_CHUNK_END, "CPC_ADC_VALID_CHUNK_END"},
{RDC_FI_PROF_CPC_SYNC_FIFO_FULL_LEVEL, "CPC_SYNC_FIFO_FULL_LEVEL"},
{RDC_FI_PROF_CPC_SYNC_FIFO_FULL, "CPC_SYNC_FIFO_FULL"},
{RDC_FI_PROF_CPC_GD_BUSY, "CPC_GD_BUSY"},
{RDC_FI_PROF_CPC_TG_SEND, "CPC_TG_SEND"},
{RDC_FI_PROF_CPC_WALK_NEXT_CHUNK, "CPC_WALK_NEXT_CHUNK"},
{RDC_FI_PROF_CPC_STALLED_BY_SE0_SPI, "CPC_STALLED_BY_SE0_SPI"},
{RDC_FI_PROF_CPC_STALLED_BY_SE1_SPI, "CPC_STALLED_BY_SE1_SPI"},
{RDC_FI_PROF_CPC_STALLED_BY_SE2_SPI, "CPC_STALLED_BY_SE2_SPI"},
{RDC_FI_PROF_CPC_STALLED_BY_SE3_SPI, "CPC_STALLED_BY_SE3_SPI"},
{RDC_FI_PROF_CPC_LTE_ALL, "CPC_LTE_ALL"},
{RDC_FI_PROF_CPC_SYNC_WRREQ_FIFO_BUSY, "CPC_SYNC_WRREQ_FIFO_BUSY"},
{RDC_FI_PROF_CPC_CANE_BUSY, "CPC_CANE_BUSY"},
{RDC_FI_PROF_CPC_CANE_STALL, "CPC_CANE_STALL"},
{RDC_FI_PROF_CPF_CMP_UTCL1_STALL_ON_TRANSLATION, "CPF_CMP_UTCL1_STALL_ON_TRANSLATION"},
{RDC_FI_PROF_CPF_CPF_STAT_BUSY, "CPF_CPF_STAT_BUSY"},
{RDC_FI_PROF_CPF_CPF_STAT_IDLE, "CPF_CPF_STAT_IDLE"},
{RDC_FI_PROF_CPF_CPF_STAT_STALL, "CPF_CPF_STAT_STALL"},
{RDC_FI_PROF_CPF_CPF_TCIU_BUSY, "CPF_CPF_TCIU_BUSY"},
{RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"},
{RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"},
{RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"},
{RDC_FI_PROF_UUID, "SQ_WAVES"}, // dummy value,
{RDC_FI_PROF_KFD_ID, "SQ_WAVES"}, // dummy value,
};
void RdcRocpBase::init_rocp_if_not() {
if (m_is_initialized) {
return;
}
// ensure initialization is attempted only once, even if it fails
m_is_initialized = true;
hsa_status_t status = hsa_init();
if (status != HSA_STATUS_SUCCESS) {
@@ -314,6 +320,19 @@ RdcRocpBase::RdcRocpBase() {
RDC_LOG(RDC_DEBUG, "Profiler supports " << field_to_metric.size() << " fields");
}
RdcRocpBase::RdcRocpBase() {
  // Whether rocprofiler really supports a given field can only be confirmed
  // after initialization and agent querying, and that work is postponed until
  // rocp_lookup runs for the first time. Until then, advertise every candidate
  // field from temp_field_map_k so that get_field_ids() has something to
  // return.
  field_to_metric.insert(temp_field_map_k.begin(), temp_field_map_k.end());

  RDC_LOG(RDC_DEBUG, "Rocprofiler by default supports " << field_to_metric.size() << " fields");
}
RdcRocpBase::~RdcRocpBase() {
hsa_status_t status = HSA_STATUS_SUCCESS;
status = hsa_shut_down();
@@ -335,6 +354,8 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
return RDC_ST_BAD_PARAMETER;
}
init_rocp_if_not();
const bool is_eval_field = (eval_fields.find(field) != eval_fields.end());
const auto start_time = std::chrono::high_resolution_clock::now();
@@ -83,10 +83,12 @@ RdcRVSBase::RdcRVSBase() {
RDC_LOG(RDC_DEBUG, "RVS CONFIG PATH: " << config_path);
// populate configs
for (auto& ent : std::filesystem::directory_iterator(config_path)) {
if (ent.is_regular_file()) {
_rvs_config_list.push_back(ent.path().string());
if (std::filesystem::exists(config_path) && std::filesystem::is_directory(config_path)) {
// populate configs
for (auto& ent : std::filesystem::directory_iterator(config_path)) {
if (ent.is_regular_file()) {
_rvs_config_list.push_back(ent.path().string());
}
}
}