diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index bf99e58006..a58d626bce 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -53,6 +53,50 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * sL1D-L2 BW Utilization (section 1401) * Bandwidth Utilization (section 1601) +* Update `System Speed-of-Light` panel to `GPU Speed-of-Light` in TUI with the following metrics: + * Theoretical LDS Bandwidth + * vL1D Cache BW + * L2 Cache BW + * L2-Fabric Read BW + * L2-Fabric Write BW + * Kernel Time + * Kernel Time (Cycles) + * SIMD Utilization + * Clock Rate + +* Add `Compute Throughput` panel to TUI with the following metrics: + * VALU FLOPs + * VALU IOPs + * MFMA FLOPs (F8) + * MFMA FLOPs (BF16) + * MFMA FLOPs (F16) + * MFMA FLOPs (F32) + * MFMA FLOPs (F64) + * MFMA FLOPs (F6F4) (in gfx950) + * MFMA IOPs (Int8) + * SALU Utilization + * VALU Utilization + * MFMA Utilization + * VMEM Utilization + * Branch Utilization + * IPC + +* Add `Memory Throughput` panel to TUI with the following metrics: + * vL1D Cache BW + * vL1D Cache Utilization + * Theoretical LDS Bandwidth + * LDS Utilization + * L2 Cache BW + * L2 Cache Utilization + * L2-Fabric Read BW + * L2-Fabric Write BW + * sL1D Cache BW + * L1I BW + * Address Processing Unit Busy + * Data-Return Busy + * L1I-L2 Bandwidth + * sL1D-L2 BW + ### Resolved issues * Fixed not detecting memory clock issue when using amd-smi diff --git a/projects/rocprofiler-compute/src/config.py b/projects/rocprofiler-compute/src/config.py index 42a599c718..fd3bab67f6 100644 --- a/projects/rocprofiler-compute/src/config.py +++ b/projects/rocprofiler-compute/src/config.py @@ -23,6 +23,7 @@ ############################################################################## + from pathlib import Path # NB: Creating a new module to share global vars across modules diff --git 
a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py index ea7539f42b..0aeab3902e 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py @@ -32,6 +32,7 @@ from pathlib import Path import pandas as pd +import config from utils import file_io, parser, schema from utils.logger import ( console_debug, @@ -76,9 +77,14 @@ class OmniAnalyze_Base: if list_stats: ac.panel_configs = file_io.top_stats_build_in_config else: - arch_panel_config = ( + arch_panel_config = [ config_dir if single_panel_config else config_dir.joinpath(arch) - ) + ] + # Use restructured perf metrics in TUI analyze mode + if self.__args.tui and arch in ["gfx942", "gfx950"]: + arch_panel_config.append( + f"{config.rocprof_compute_home}/rocprof_compute_tui/utils/{arch}" + ) ac.panel_configs = file_io.load_panel_configs(arch_panel_config) # TODO: filter_metrics should/might be one per arch diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942/3200_gpu_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942/3200_gpu_speed_of_light.yaml new file mode 100644 index 0000000000..053b0e039b --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942/3200_gpu_speed_of_light.yaml @@ -0,0 +1,103 @@ +# TUI use only +# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization +Panel Config: + id: 3200 + title: GPU Speed-of-Light + metrics_description: + Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have + been loaded from, stored to, or atomically updated in the LDS per unit time + (see LDS Bandwidth example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator.
+ vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of + VMEM instructions per unit time. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so e.g., if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. This is also presented as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\ + \ interface per unit time. This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." + L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric + interface by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + Kernel Time: The total duration of the executed kernel. + Kernel Time (Cycles): The total duration of the executed kernel in cycles. + SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD + on a CU was actively doing any work, summed over all CUs. Low values (less than + 100%) indicate that the accelerator was not fully saturated by the kernel, or + a potential load-imbalance issue. 
+ Clock Rate: + data source: + - metric_table: + id: 3201 + title: GPU Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - + TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / + $hbmBandwidth) + Kernel Time: + avg: 
AVG((End_Timestamp - Start_Timestamp)) + unit: ns + peak: N/A + pop: N/A + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + peak: N/A + pop: N/A + SIMD Utilization: + value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + peak: 100 + pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + Clock Rate: + value: (GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) / (End_Timestamp - Start_Timestamp) + unit: MHz + peak: N/A # attainable peak? theoretical freq? + pop: N/A \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942/3300_compute_throughput.yaml b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942/3300_compute_throughput.yaml new file mode 100644 index 0000000000..44445dc601 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942/3300_compute_throughput.yaml @@ -0,0 +1,163 @@ +# TUI use only +# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization +Panel Config: + id: 3300 + title: Compute Throughput + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations + executed per second. This does not include any 8-bit floating point operations + from VALU instructions.
This is also presented as a percent of the peak theoretical + F8 MFMA operations achievable on the specific accelerator. It is supported on + AMD Instinct MI300 series and later only. + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. 
Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA was busy over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. This is also presented as a percent of the peak theoretical + IPC achievable on the specific accelerator.
+ data source: + - metric_table: + id: 3301 + title: Compute Throughput + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 
* AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * 
SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942/3400_memory_throughput.yaml b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942/3400_memory_throughput.yaml new file mode 100644 index 0000000000..165ac26d92 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942/3400_memory_throughput.yaml @@ -0,0 +1,162 @@ +# TUI use only +# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization +Panel Config: + id: 3400 + title: Memory Throughput + metrics_description: + vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of + VMEM instructions per unit time. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so e.g., if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + vL1D Cache Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution. 
+ The number of cycles where the vL1D Cache RAM is actively processing any request + divided by the number of cycles where the vL1D is active. + Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have + been loaded from, stored to, or atomically updated in the LDS per unit time + (see LDS Bandwidth example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. + LDS Utilization: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. This is also presented as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. + L2 Cache Utilization: The ratio of the number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator over the total L2 cycles. + L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\ + \ interface per unit time. This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." + L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric + interface by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
+ This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L1I BW: The number of bytes looked up in the L1I cache per unit time. This is also + presented as a percent of the peak theoretical bandwidth achievable on the specific + accelerator. + Address Processing Unit Busy: Percent of the total CU cycles the address processor + was busy. + Data-Return Busy: Percent of the total CU cycles the data-return unit was busy + processing or waiting on data to return to the CU. + L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface + divided by total duration. + sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\ + \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\ + \ writes and atomics are typically unused on current CDNA accelerators, so in\ + \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth." + data source: + - metric_table: + id: 3401 + title: Memory Throughput + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + vL1D Cache Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + peak: 100 + pop: None + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk *
$cu_per_gpu) * 0.00128))) + LDS Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + peak: 100 + pop: None + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2 Cache Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + peak: 100 + pop: None + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - + TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / + $hbmBandwidth) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + 
value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: N/A + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: N/A + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: N/A + pop: N/A + sL1D-L2 BW: + value: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: N/A + pop: N/A diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3200_system_speed_of_light.yaml new file mode 100644 index 0000000000..2010e86763 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3200_system_speed_of_light.yaml @@ -0,0 +1,103 @@ +# TUI use only +# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization +Panel Config: + id: 3200 + title: GPU Speed-of-Light + metrics_description: + Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have + been loaded from, stored to, or atomically updated in the LDS per unit time + (see LDS Bandwidth example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator.
+ vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of + VMEM instructions per unit time. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so e.g., if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. This is also presented as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\ + \ interface per unit time. This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." + L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric + interface by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + Kernel Time: The total duration of the executed kernel. + Kernel Time (Cycles): The total duration of the executed kernel in cycles. + SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD + on a CU was actively doing any work, summed over all CUs. Low values (less than + 100%) indicate that the accelerator was not fully saturated by the kernel, or + a potential load-imbalance issue. 
+ Clock Rate: + data source: + - metric_table: + id: 3201 + title: GPU Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - + TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / + $hbmBandwidth) + Kernel Time: + avg: 
AVG((End_Timestamp - Start_Timestamp)) + unit: ns + peak: N/A + pop: N/A + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + peak: N/A + pop: N/A + SIMD Utilization: + value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + peak: 100 + pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + Clock Rate: + value: (GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) / (End_Timestamp - Start_Timestamp) + unit: ns + peak: N/A + pop: N/A \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3300_compute_throughput.yaml b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3300_compute_throughput.yaml new file mode 100644 index 0000000000..b88c1a852c --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3300_compute_throughput.yaml @@ -0,0 +1,169 @@ +# TUI use only +# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization +Panel Config: + id: 3300 + title: Compute Throughput + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations + executed per second. This does not include any 8-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F8 MFMA operations achievable on the specific accelerator.
It is supported on + AMD Instinct MI300 series and later only. + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. 
+ VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA was busy over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. This is also presented as a percent of the peak theoretical + IPC achievable on the specific accelerator. 
+ data source: + - metric_table: + id: 3301 + title: Compute Throughput + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 
* AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + MFMA FLOPs (F6F4): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * 
SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3400_memory_throughput.yaml b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3400_memory_throughput.yaml new file mode 100644 index 0000000000..2a2ca7fd5f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3400_memory_throughput.yaml @@ -0,0 +1,161 @@ +# TUI use only +# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization +Panel Config: + id: 3400 + title: Memory Throughput + metrics_description: + vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of + VMEM instructions per unit time. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. 
This value does + not consider partial requests, so e.g., if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + vL1D Cache Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution. + Calculated as the number of cycles where the vL1D Cache RAM is actively processing any request + divided by the number of cycles where the vL1D is active. + Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have + been loaded from, stored to, or atomically updated in the LDS per unit time + (see LDS Bandwidth example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. + LDS Utilization: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. This is also presented as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. + L2 Cache Utilization: The ratio of the number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator, over the total L2 cycles. + L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\ + \ interface per unit time. 
This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." + L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric + interface by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L1I BW: The percent of L1I requests that hit on a previously loaded line the cache. + Calculated as the ratio of the number of L1I requests that hit over the number + of all L1I requests. + Address Processing Unit Busy: Percent of the total CU cycles the address processor + was busy. + Data-Return Busy: Percent of the total CU cycles the data-return unit was busy + processing or waiting on data to return to the CU. + L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface + divided by total duration. + sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\ + \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\ + \ writes and atomics are typically unused on current CDNA accelerators, so in\ + \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth." 
+ data source: + - metric_table: + id: 3401 + title: Memory Throughput + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + vL1D Cache Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + peak: 100 + pop: None + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + LDS Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + peak: 100 + pop: None + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2 Cache Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + peak: 100 + pop: None + L2-Fabric Read BW: + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + 
(TCC_EA0_RDREQ_64B_sum * 64) + + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - + TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / + $hbmBandwidth) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: N/A + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: N/A + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: N/A + pop: N/A + sL1D-L2 BW: + value: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / (End_Timestamp - 
Start_Timestamp))) + unit: GB/s + peak: N/A + pop: N/A diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/kernel_view_config.yaml b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/kernel_view_config.yaml index a12e2f846b..c5dbee67f0 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/kernel_view_config.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/kernel_view_config.yaml @@ -7,8 +7,14 @@ sections: collapsed: true class: "sysinfo-section" subsections: - - title: "System Speed-of-Light" - data_path: ["2. System Speed-of-Light", "2.1 System Speed-of-Light"] + - title: "GPU Speed-of-Light" + data_path: ["32. GPU Speed-of-Light", "32.1 GPU Speed-of-Light"] + collapsed: true + - title: "Compute Throughput" + data_path: ["33. Compute Throughput", "33.1 Compute Throughput"] + collapsed: true + - title: "Memory Throughput" + data_path: ["34. Memory Throughput", "34.1 Memory Throughput"] collapsed: true - title: "Memory Chart" data_path: ["3. Memory Chart", "3.1 Memory Chart"] @@ -17,14 +23,16 @@ sections: - title: "Detailed Block Analysis" collapsed: true - class: "kernels-section" - dynamic_sections: true - skip_sections: - - "0. Top Stats" - - "1. System Info" - - "2. System Speed-of-Light" - - "3. Memory Chart" - - "4. Roofline" + class: "block-section" + subsections: + - arch_config_data: true + exclude_keys: + - "0. Top Stats" + - "1. System Info" + - "2. System Speed-of-Light" + - "3. Memory Chart" + - "4. Roofline" + collapsed: true - title: "Source Level Analysis" collapsed: true @@ -32,4 +40,4 @@ sections: subsections: - title: "PC Sampling" data_path: ["21. 
PC Sampling", "21.1 PC Sampling"] - collapsed: true + collapsed: true \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/tui_utils.py b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/tui_utils.py index fe9e34f5b3..085f08426e 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/tui_utils.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/tui_utils.py @@ -99,6 +99,8 @@ def get_top_kernels_and_dispatch_ids(runs): top_kernel_df, dispatch_id_df, on="Kernel_Name", how="outer" ).sort_values("Pct", ascending=False) + # Remove unwanted columns + merged_df = merged_df.drop(columns=["Count", "GPU_ID"]) return merged_df.to_dict("records") diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/views/kernel_view.py b/projects/rocprofiler-compute/src/rocprof_compute_tui/views/kernel_view.py index 89ab15ad96..28a48fe5f6 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_tui/views/kernel_view.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/views/kernel_view.py @@ -55,22 +55,19 @@ class KernelView(Container): def __init__(self, config_path: Optional[str] = None): super().__init__(id="kernel-view") - self.status_label = None - self.dfs = {} - self.top_kernel = [] - - if rocprof_compute_home: - config_path = ( - rocprof_compute_home - / "rocprof_compute_tui" - / "utils" - / "kernel_view_config.yaml" - ) - self.config_path = config_path - - self.keys = None + self.kernel_to_df_dict = {} + self.top_kernel_to_df_list = [] self.current_selection = None + self.config_path = config_path or ( + rocprof_compute_home + / "rocprof_compute_tui" + / "utils" + / "kernel_view_config.yaml" + if rocprof_compute_home + else None + ) + def compose(self): """ Compose the split panel layout with two scrollable containers. 
@@ -88,94 +85,85 @@ class KernelView(Container): # empty on init pass - def update_results(self, per_kernel_dfs, top_kernels) -> None: - self.dfs = per_kernel_dfs - self.top_kernel = top_kernels + def update_results(self, kernel_to_df_dict, top_kernel_to_df_list) -> None: + self.kernel_to_df_dict = kernel_to_df_dict + self.top_kernel_to_df_list = top_kernel_to_df_list top_container = self.query_one("#top-container", VerticalScroll) top_container.remove_children() - if self.top_kernel: - try: - header = self.build_header() - top_container.mount(header) - selector = self.build_selector() - top_container.mount(selector) - except Exception as e: - top_container.mount( - Label(f"Error displaying kernel list: {str(e)}", classes="error") - ) - else: + if not self.top_kernel_to_df_list: top_container.mount(Label("No kernels available", classes="placeholder")) + return - self.current_selection = self.top_kernel[0]["Kernel_Name"] - self._update_bottom_content() + # Build and mount components + self.new_perf_metric() + # build header section + keys = self.top_kernel_to_df_list[0].keys() + header_text = " | ".join(f"{key:25}" for key in keys) + top_container.mount(Label(header_text, classes="kernel-table-header")) + + # build selector section + radio_buttons = [] + for i, kernel in enumerate(self.top_kernel_to_df_list): + row_text = " | ".join( + f"{str(kernel.get(key, 'N/A'))[:18]:25}" for key in keys + ) + button = RadioButton(row_text, id=f"kernel-{i}") + button.kernel_data = kernel + radio_buttons.append(button) + top_container.mount(RadioSet(*radio_buttons)) + + # build analysis section + self.current_selection = self.top_kernel_to_df_list[0]["Kernel_Name"] + self.update_bottom_content() def update_view(self, message: str, log_level: str) -> None: - """ - Update the view with a status message. 
- """ - if self.status_label is None: - self.status_label = Label(f"{message}", classes=log_level) + if not hasattr(self, "status_label") or self.status_label is None: + self.status_label = Label(message, classes=log_level) self.mount(self.status_label) else: - self.status_label.update(f"{message}") + self.status_label.update(message) self.status_label.set_classes(log_level) - def reload_config(self, config_path: str = None) -> None: - if config_path: - self.config_path = config_path + def new_perf_metric(self): + new_metrics = ["VGPRs", "Grid Size", "Workgroup Size"] + for new_metric in new_metrics: + for i, kernel in enumerate(self.top_kernel_to_df_list): + df_path = self.kernel_to_df_dict[kernel["Kernel_Name"]]["7. Wavefront"][ + "7.1 Wavefront Launch Stats" + ]["df"] + metric_avg = ( + df_path[df_path["Metric"] == new_metric]["Avg"].iloc[0].item() + ) + self.top_kernel_to_df_list[i][new_metric] = metric_avg - if self.dfs and self.top_kernel: - self.update_results() - - def build_header(self): - all_keys = set() - - for kernel in self.top_kernel: - all_keys.update(kernel.keys()) - - self.keys = sorted(all_keys) - - if "Kernel_Name" in self.keys: - self.keys.remove("Kernel_Name") - self.keys.insert(0, "Kernel_Name") - - header_text = " | ".join(f"{key:25}" for key in self.keys) - header_label = Label(header_text, classes="kernel-table-header") - - return header_label - - def build_selector(self): - radio_buttons = [] - - for i, kernel in enumerate(self.top_kernel): - row_data = [] - for key in self.keys: - value = str(kernel.get(key, "N/A")) - if len(value) > 18: - value = value[:15] + "..." 
- row_data.append(f"{value:25}") - - row_text = " | ".join(row_data) - radio_button = RadioButton(row_text, id=f"kernel-{i}") - radio_button.kernel_data = kernel - radio_buttons.append(radio_button) - - selector = RadioSet(*radio_buttons) - - return selector + """ + header_order = [ + "Dispatch_ID", + "Kernel_Name", + "Mean(ns)", + "Median(ns)", + "Sum(ns)", + "Compute Throughput", + "Memory Throughput", + "VGPRs", + "Grid Size", + "Workgroup Size", + ] + """ @on(RadioSet.Changed) def on_radio_changed(self, event: RadioSet.Changed) -> None: - if event.pressed: - kernel_data = getattr(event.pressed, "kernel_data", None) - if kernel_data and "Kernel_Name" in kernel_data: - selected_kernel = kernel_data["Kernel_Name"] - self.current_selection = selected_kernel - self._update_bottom_content() + if not event.pressed: + return - def _update_bottom_content(self): + kernel_data = getattr(event.pressed, "kernel_data", None) + if kernel_data and "Kernel_Name" in kernel_data: + self.current_selection = kernel_data["Kernel_Name"] + self.update_bottom_content() + + def update_bottom_content(self): bottom_container = self.query_one("#bottom-container", VerticalScroll) bottom_container.remove_children() @@ -183,24 +171,28 @@ class KernelView(Container): Label("Toggle kernel selection to view detailed analysis.") ) - if self.current_selection and self.current_selection in self.dfs: - bottom_container.mount( - Label(f"Current kernel selection: {self.current_selection}") - ) - filtered_dfs = self.dfs[self.current_selection] - - try: - sections = build_all_sections(filtered_dfs, self.config_path) - for section in sections: - bottom_container.mount(section) - except Exception as e: - bottom_container.mount( - Label(f"Error displaying results: {str(e)}", classes="error") - ) - else: + if not ( + self.current_selection and self.current_selection in self.kernel_to_df_dict + ): bottom_container.mount( Label( f"No data available for kernel: {self.current_selection}", classes="error", ) ) + 
return + + bottom_container.mount( + Label(f"Current kernel selection: {self.current_selection}") + ) + + try: + sections = build_all_sections( + self.kernel_to_df_dict[self.current_selection], self.config_path + ) + for section in sections: + bottom_container.mount(section) + except Exception as e: + bottom_container.mount( + Label(f"Error displaying results: {str(e)}", classes="error") + ) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/views/main_view.py b/projects/rocprofiler-compute/src/rocprof_compute_tui/views/main_view.py index 4e38dad557..765b5919a2 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_tui/views/main_view.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/views/main_view.py @@ -50,18 +50,12 @@ class MainView(Horizontal): """Main view layout for the application.""" selected_path = reactive(None) - per_kernel_dfs = reactive({}) - top_kernels = reactive([]) + kernel_to_df_dict = reactive({}) + top_kernel_to_df_list = reactive([]) def __init__(self): super().__init__(id="main-container") - self.start_path = ( - # NOTE: is cwd the best choice? 
- Path.cwd() - if DEFAULT_START_PATH is None - else Path(DEFAULT_START_PATH) - ) - + self.start_path = Path(DEFAULT_START_PATH) if DEFAULT_START_PATH else Path.cwd() self.logger = Logger() self.logger.info("MainView initialized", update_ui=False) @@ -77,9 +71,7 @@ class MainView(Horizontal): with Horizontal(id="center-container"): with Vertical(id="activity-container"): # Center Panel - Analysis results display - center_panel = CenterPanel() - yield center_panel - self.center = center_panel + yield CenterPanel() # Bottom Panel - Output, terminal, and metric description tabs = TabsArea() @@ -97,215 +89,91 @@ class MainView(Horizontal): @on(DataTable.CellSelected) def on_data_table_cell_selected(self, event: DataTable.CellSelected) -> None: - table = event.data_table - row_idx = event.coordinate.row - - self.logger.info(f"Cell selected at row {row_idx}") - try: - row_data = table.get_row_at(row_idx) - content = f"Selected Metric ID: {row_data[0]}\n" - content += f"Selected Metric: {row_data[1]}\n" - # content += f"Metric Description:\n\t{row_data[-1]}" - - self.metric_description.text = content - self.logger.info(f"Row {row_idx} data displayed in metric_description") - + row_data = event.data_table.get_row_at(event.coordinate.row) + self.metric_description.text = ( + f"Selected Metric ID: {row_data[0]}\nSelected Metric: {row_data[1]}\n" + ) + self.logger.info(f"Row {event.coordinate.row} data displayed") except Exception as e: - error_msg = f"Error displaying row {row_idx}: {str(e)}" - table.add_column("Error") - table.add_row(str(e)) + error_msg = f"Error displaying row {event.coordinate.row}: {str(e)}" self.metric_description.text = error_msg self.logger.error(error_msg) @work(thread=True) def run_analysis(self) -> None: - self.per_kernel_dfs = {} - self.top_kernels = [] + self.kernel_to_df_dict = {} + self.top_kernel_to_df_list = [] if not self.selected_path: - error_msg = "No directory selected for analysis" - self._update_view(error_msg, LogLevel.ERROR) - 
self.logger.error(error_msg) + self.app.call_from_thread( + lambda: self.query_one("#kernel-view").update_view( + "No directory selected for analysis", LogLevel.ERROR + ) + ) return try: self.logger.info(f"Starting analysis on: {self.selected_path}") - self._update_view( - f"Running analysis on: {self.selected_path}", LogLevel.SUCCESS + + self.app.call_from_thread( + lambda: self.query_one("#kernel-view").update_view( + f"Running analysis on: {self.selected_path}", LogLevel.SUCCESS + ) ) - # Step 1: Create analyzer - try: - self.logger.info("Step 1: Creating analyzer") - self.logger.info(f"Step 1: args {self.app.args}") - self.logger.info(f"Step 1: arch {self.app.supported_archs}") - self.logger.info("Step 1: Creating analyzer") - analyzer = tui_analysis( - self.app.args, self.app.supported_archs, self.selected_path - ) - self.logger.info("Step 1: Analyzer created successfully") - except Exception as e: - self.logger.error(f"Step 1 failed - Error creating analyzer: {str(e)}") - raise + # 1. Create and TUI analyzer + analyzer = tui_analysis( + self.app.args, self.app.supported_archs, self.selected_path + ) + analyzer.sanitize() - # Step 2: Sanitize analyzer - try: - self.logger.info("Step 2: Sanitizing analyzer") - analyzer.sanitize() - self.logger.info("Step 2: Analyzer sanitized successfully") - except Exception as e: - self.logger.error( - f"Step 2 failed - Error sanitizing analyzer: {str(e)}" - ) - raise + # 2. Load and process system info and Configure SoC + sysinfo_path = Path(self.selected_path) / "sysinfo.csv" + if not sysinfo_path.exists(): + raise FileNotFoundError(f"sysinfo.csv not found at {sysinfo_path}") + sys_info = file_io.load_sys_info(sysinfo_path).iloc[0].to_dict() + self.app.load_soc_specs(sys_info) - # Step 3: Load sys_info - try: - self.logger.info("Step 3: Loading sys_info") - sysinfo_path = Path(self.selected_path).joinpath("sysinfo.csv") - self.logger.info(f"Step 3: sysinfo_path = {sysinfo_path}") + # 3. 
run analysis + analyzer.set_soc(self.app.soc) + analyzer.pre_processing() + self.kernel_to_df_dict = analyzer.run_kernel_analysis() + self.top_kernel_to_df_list = analyzer.run_top_kernel() - if not sysinfo_path.exists(): - raise FileNotFoundError(f"sysinfo.csv not found at {sysinfo_path}") - - sys_info_df = file_io.load_sys_info(sysinfo_path) - self.logger.info(f"Step 3: sys_info_df type = {type(sys_info_df)}") - shape_info = ( - sys_info_df.shape - if hasattr(sys_info_df, "shape") - else "No shape attribute" - ) - self.logger.info(f"Step 3: sys_info_df shape = {shape_info}") - - except Exception as e: - self.logger.error(f"Step 3 failed - Error loading sys_info: {str(e)}") - raise - - # Step 4: Convert sys_info to dict - try: - self.logger.info("Step 4: Converting sys_info to dict") - - # Check if it's actually a DataFrame - if hasattr(sys_info_df, "iloc"): - sys_info = sys_info_df.iloc[0].to_dict() - elif hasattr(sys_info_df, "to_dict"): - # If it's already a Series - sys_info = sys_info_df.to_dict() - elif isinstance(sys_info_df, dict): - # If it's already a dict - sys_info = sys_info_df - else: - raise TypeError( - f"Unexpected type for sys_info: {type(sys_info_df)}" + if not self.kernel_to_df_dict or not self.top_kernel_to_df_list: + self.app.call_from_thread( + lambda: self.query_one("#kernel-view").update_view( + "Analysis completed but not all data was returned", + LogLevel.WARNING, ) - - self.logger.info(f"Step 4: sys_info converted = {sys_info}") - - except Exception as e: - self.logger.error( - f"Step 4 failed - Error converting sys_info: {str(e)}" ) - raise - - # Step 5: Load SoC specs - try: - self.logger.info("Step 5: Loading SoC specs") - self.app.load_soc_specs(sys_info) - self.logger.info(f"Step 5: SoC loaded = {self.app.soc}") - except Exception as e: - self.logger.error(f"Step 5 failed - Error loading SoC specs: {str(e)}") - raise - - # Step 6: Set SoC in analyzer - try: - self.logger.info("Step 6: Setting SoC in analyzer") - 
analyzer.set_soc(self.app.soc) - self.logger.info("Step 6: SoC set successfully") - except Exception as e: - self.logger.error(f"Step 6 failed - Error setting SoC: {str(e)}") - raise - - # Step 7: Pre-processing - try: - self.logger.info("Step 7: Running pre-processing") - analyzer.pre_processing() - self.logger.info("Step 7: Pre-processing completed") - except Exception as e: - self.logger.error(f"Step 7 failed - Error in pre-processing: {str(e)}") - raise - # Step 8: Run analysis - try: - self.logger.info("Step 8: Running analysis") - self.per_kernel_dfs = analyzer.run_kernel_analysis() - self.top_kernels = analyzer.run_top_kernel() - - # TODO: add per kernel Roofline support when available - - if not self.per_kernel_dfs or not self.top_kernels: - warning_msg = ( - "Step 8: Per Kernel Analysis completed but not all data " - "was returned" - ) - self._update_view(warning_msg, LogLevel.WARNING) - self.logger.warning(warning_msg) - else: - self.app.call_from_thread(self.refresh_results) - self.logger.info("Step 8: Kernel Analysis completed successfully") - # self.logger.info(f"{self.per_kernel_dfs}") - except Exception as e: - self.logger.error(f"Step 8 failed - Error running analysis: {str(e)}") - raise + else: + self.app.call_from_thread(self.refresh_results) + self.logger.info("Kernel Analysis completed successfully") + # self.logger.info(f"{self.kernel_to_df_dict}") except Exception as e: import traceback - error_msg = f"Unexpected error during analysis: {str(e)}" - self.logger.error(error_msg) - self.logger.error(f"Full traceback:\n{traceback.format_exc()}") - self._update_view(error_msg, LogLevel.ERROR) - - def _update_view(self, message: str, log_level: LogLevel) -> None: - try: - self.app.call_from_thread(self._safe_update_view, message, log_level) - except Exception as e: - self.logger.error(f"View update scheduling error: {str(e)}") - - def _safe_update_view(self, message: str, log_level: LogLevel) -> None: - try: - kernel_view = 
self.query_one("#kernel-view") - if kernel_view: - kernel_view.update_view(message, log_level) - else: - self.logger.warning("Analysis view not found when updating log") - except Exception as e: - self.logger.error(f"Log update error: {str(e)}") + error_msg = f"Analysis failed: {str(e)}" + self.logger.error(f"{error_msg}\n{traceback.format_exc()}") + self.app.call_from_thread( + lambda: self.query_one("#kernel-view").update_view( + error_msg, LogLevel.ERROR + ) + ) def refresh_results(self) -> None: - try: - self.logger.info("Refreshing kernel results") - kernel_view = self.query_one("#kernel-view") - if not kernel_view: - self.logger.error("Kernel view not found") - return - - if ( - not hasattr(self, "per_kernel_dfs") - or self.per_kernel_dfs is None - or not hasattr(self, "top_kernels") - or self.top_kernels is None - ): - self.logger.error("No kernel analysis data available to display") - return - - kernel_view.update_results(self.per_kernel_dfs, self.top_kernels) + kernel_view = self.query_one("#kernel-view") + if kernel_view: + kernel_view.update_results(self.kernel_to_df_dict, self.top_kernel_to_df_list) self.logger.success("Results displayed successfully.") - except Exception as e: - self.logger.error(f"Error refreshing results: {str(e)}") + else: + self.logger.error("Kernel view not found or no data available") def refresh_view(self) -> None: - self.logger.info("Refreshing view...") - if self.top_kernels: + if self.kernel_to_df_dict and self.top_kernel_to_df_list: self.refresh_results() else: self.logger.warning("No data available for refresh") diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/widgets/collapsibles.py b/projects/rocprofiler-compute/src/rocprof_compute_tui/widgets/collapsibles.py index 9796c3244a..3c3fbd8caa 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_tui/widgets/collapsibles.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/widgets/collapsibles.py @@ -23,7 +23,8 @@ 
############################################################################## -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List import pandas as pd import yaml @@ -31,7 +32,6 @@ from textual.widgets import Collapsible, DataTable, Label from rocprof_compute_tui.widgets.charts import ( MemoryChart, - RooflinePlot, SimpleBar, SimpleBox, SimpleMultiBar, @@ -40,17 +40,38 @@ from rocprof_compute_tui.widgets.charts import ( def create_table(df: pd.DataFrame) -> DataTable: table = DataTable(zebra_stripes=True) - df = df.reset_index() df = df[~df.apply(lambda row: row.astype(str).str.strip().eq("").any(), axis=1)] - str_columns = [str(col) for col in df.columns] table.add_columns(*str_columns) table.add_rows([tuple(str(x) for x in row) for row in df.itertuples(index=False)]) - return table +def create_widget_from_data(df: pd.DataFrame, tui_style: str = None, context: str = ""): + if df is None or df.empty: + return Label( + f"Data not available{f' for {context}' if context else ''}", classes="warning" + ) + + match tui_style: + # TODO: implement tui_style == "roofline" + # case "roofline": + # return Roofline(df) + case None: + return create_table(df) + case "mem_chart": + return MemoryChart(df) + case "simple_bar": + return SimpleBar(df) + case "simple_box": + return SimpleBox(df) + case "simple_multiple_bar": + return SimpleMultiBar(df) + case _: + return Label(f"Unknown display type: {tui_style}") + + def load_config(config_path) -> Dict[str, Any]: try: with open(config_path, "r") as file: @@ -66,257 +87,60 @@ def load_config(config_path) -> Dict[str, Any]: raise ValueError(f"Error parsing YAML configuration: {e}") -def get_data_from_path(dfs: Dict[str, Any], path: List[str]) -> Optional[pd.DataFrame]: - try: - current = dfs - for key in path: - current = current[key] - return current["df"] - except (KeyError, TypeError): - return None - - -def get_tui_style_from_path(dfs: Dict[str, Any], path: List[str]) -> Optional[str]: - try: - 
current = dfs - for key in path: - current = current[key] - return current.get("tui_style") - except (KeyError, TypeError): - return None - - -def create_widget_from_data(df: pd.DataFrame, tui_style: Optional[str] = None) -> Any: - if df is not None and not df.empty: - match tui_style: # noqa - case None: - return create_table(df) - - case "mem_chart": - return MemoryChart(df) - - case "simple_bar": - return SimpleBar(df) - - case "simple_box": - return SimpleBox(df) - - case "simple_multiple_bar": - return SimpleMultiBar(df) - - case _: - return Label(f"Unknown display type: {tui_style}") - else: - return Label(f"Data not available for display in {tui_style}.") - - -def build_subsection( - subsection_config: Dict[str, Any], dfs: Dict[str, Any] -) -> Collapsible: - title = subsection_config["title"] - collapsed = subsection_config.get("collapsed", True) - tui_style = subsection_config.get("tui_style") - - # Handle data-driven widgets - if "data_path" in subsection_config: - data_path = subsection_config["data_path"] - - if tui_style is None: - tui_style = ( - get_tui_style_from_path(dfs, data_path) if dfs is not None else None - ) - - df = get_data_from_path(dfs, data_path) - - if df is None and tui_style is None: - error_msg = ( - f"{title} data not available: Path {' -> '.join(data_path)} not found" - ) - return Collapsible( - Label(error_msg, classes="warning"), title=title, collapsed=collapsed - ) - - # Create main widget - widget = create_widget_from_data(df, tui_style) - - # Add header label if specified - widgets = [] - if "header_label" in subsection_config: - header_class = subsection_config.get("header_class", "") - widgets.append( - Label(subsection_config["header_label"], classes=header_class) - ) - - widgets.append(widget) - - collapsible = Collapsible(*widgets, title=title, collapsed=collapsed) - elif tui_style == "roofline": - if dfs["4. 
Roofline"]: - widget = RooflinePlot(dfs) - collapsible = Collapsible(widget, title=title, collapsed=collapsed) - else: - return None - # Fallback for subsections without data or style - else: - collapsible = Collapsible( - Label(f"No data or style configuration for {title}"), - title=title, - collapsed=collapsed, - ) - - # Add ID if specified - if "widget_id" in subsection_config: - collapsible.id = subsection_config["widget_id"] - - return collapsible - - -def build_kernel_sections( - dfs: Dict[str, Any], skip_sections: List[str] -) -> List[Collapsible]: - children = [] - - def add_warning(message: str): - children.append(Label(message, classes="warning")) - - def validate_data_structure(data, name: str, parent_name: str = None) -> bool: - if data is None: - location = f"'{parent_name}' > '{name}'" if parent_name else f"'{name}'" - add_warning(f"Analysis result for {location} is not available") - return False - - if not isinstance(data, dict): - location = f"'{parent_name}' > '{name}'" if parent_name else f"'{name}'" - add_warning( - f"Analysis result for {location} is not a dictionary type: {type(data)}" - ) - return False - - return True - - def create_safe_widget(subsection_name: str, data: dict, section_name: str): - if not (isinstance(data, dict) and "df" in data): - add_warning( - ( - f"Invalid data structure for '{subsection_name}' " - f"in section '{section_name}'" - ) - ) - return None - - try: - if data["df"] is None or data["df"].empty: - return None - tui_style = data.get("tui_style") - widget = create_widget_from_data(data["df"], tui_style) - - if widget is None: - add_warning(f"Widget creation returned None for '{subsection_name}'") - return None - - return widget - except Exception as e: - add_warning(f"Failed to create widget for '{subsection_name}': {str(e)}") - return None - - def create_safe_collapsible(widget, title): - try: - return Collapsible(widget, title=title, collapsed=True) - except Exception as e: - add_warning(f"Failed to create 
collapsible for '{title}': {str(e)}") - return None - - try: - if not validate_data_structure(dfs, "analysis result"): - return children - - for section_name, subsections in dfs.items(): - if section_name in skip_sections: - continue - - if not validate_data_structure(subsections, section_name): - continue - - kernel_children = [] - for subsection_name, data in subsections.items(): - try: - widget = create_safe_widget(subsection_name, data, section_name) - if widget: - collapsible = create_safe_collapsible(widget, subsection_name) - if collapsible: - kernel_children.append(collapsible) - except Exception as e: - add_warning( - ( - f"Error processing subsection '{subsection_name}' " - f"in section '{section_name}': {str(e)}" - ) - ) - - if kernel_children: - try: - section_collapsible = Collapsible( - *kernel_children, title=section_name, collapsed=True - ) - children.append(section_collapsible) - except Exception as e: - add_warning( - ( - "Failed to create collapsible for section " - f"'{section_name}': {str(e)}" - ) - ) - - except Exception as e: - add_warning(f"Unexpected error in Kernel Section processing: {str(e)}") - - return children - - def build_section_from_config( - section_config: Dict[str, Any], dfs: Dict[str, Any] + dfs: Dict[str, Any], section_config: Dict[str, Any] ) -> Collapsible: title = section_config["title"] collapsed = section_config.get("collapsed", True) - css_class = section_config.get("class") - # Handle under construction sections - if section_config.get("under_construction", False): - construction_label = section_config.get( - "construction_label", "Under Construction" - ) - construction_class = section_config.get("construction_class", "") - children = [Label(construction_label, classes=construction_class)] + children = [] + for subsection_config in section_config["subsections"]: + # Handle arch_config_data + if subsection_config.get("arch_config_data", False): + if isinstance(dfs, dict): + exclude_keys = 
subsection_config.get("exclude_keys", []) + for section_name, subsections in dfs.items(): + if section_name not in exclude_keys and isinstance(subsections, dict): + kernel_children = [] + for subsection_name, data in subsections.items(): + if isinstance(data, dict) and "df" in data: + widget = create_widget_from_data( + data["df"], data.get("tui_style"), subsection_name + ) + kernel_children.append( + Collapsible( + widget, title=subsection_name, collapsed=True + ) + ) - # Handle dynamic sections (like kernel sections) - elif section_config.get("dynamic_sections", False): - skip_sections = section_config.get("skip_sections", []) - children = build_kernel_sections(dfs, skip_sections) + if kernel_children: + children.append( + Collapsible( + *kernel_children, title=section_name, collapsed=True + ) + ) + else: + # Handle data_path + tui_style = subsection_config.get("tui_style") + data_path = subsection_config["data_path"] - # Handle regular sections with subsections - elif "subsections" in section_config: - children = [] - for subsection_config in section_config["subsections"]: - try: - subsection = build_subsection(subsection_config, dfs) - if subsection: - children.append(subsection) - except Exception as e: - error_msg = ( - f"{subsection_config.get('title', 'Unknown')} error: {str(e)}" + df = dfs.get(data_path[0], {}).get(data_path[1], {}) + df = df.get("df") if isinstance(df, dict) else None + if df is not None and isinstance(df, dict) and tui_style is None: + tui_style = df.get("tui_style") + + widgets = [ + create_widget_from_data(df, tui_style, f"path {' -> '.join(data_path)}") + ] + + children.append( + Collapsible( + *widgets, + title=subsection_config.get("title", "Untitled"), + collapsed=subsection_config.get("collapsed", True), ) - children.append(Label(error_msg, classes="warning")) - else: - children = [Label("No configuration provided for this section")] - - # Create the main collapsible - collapsible = Collapsible(*children, title=title, 
collapsed=collapsed) - - # Add CSS class if specified - if css_class: - collapsible.add_class(css_class) - - return collapsible + ) + return Collapsible(*children, title=title, collapsed=collapsed) def build_all_sections(dfs: Dict[str, Any], config_path) -> List[Collapsible]: @@ -324,17 +148,7 @@ def build_all_sections(dfs: Dict[str, Any], config_path) -> List[Collapsible]: sections = [] for section_config in config["sections"]: - try: - section = build_section_from_config(section_config, dfs) - sections.append(section) - except Exception as e: - # Create error section if something goes wrong - error_title = section_config.get("title", "Unknown Section") - error_section = Collapsible( - Label(f"Error building section: {str(e)}", classes="error"), - title=f"❌ {error_title}", - collapsed=True, - ) - sections.append(error_section) + section = build_section_from_config(dfs, section_config) + sections.append(section) return sections diff --git a/projects/rocprofiler-compute/src/utils/file_io.py b/projects/rocprofiler-compute/src/utils/file_io.py index fdab58ca49..756af30b82 100644 --- a/projects/rocprofiler-compute/src/utils/file_io.py +++ b/projects/rocprofiler-compute/src/utils/file_io.py @@ -63,24 +63,24 @@ def load_sys_info(f): return pd.read_csv(f) -def load_panel_configs(dir): +def load_panel_configs(dirs): """ Load all panel configs from yaml file. 
""" d = {} - for root, dirs, files in os.walk(dir): - for f in files: - if f.endswith(".yaml"): - with open(str(Path(root).joinpath(f))) as file: - config = yaml.safe_load(file) - # metric key can be None due to some metric tables - # not having any metrics - # metric key should be empty dict instead of None - for data_source in config["Panel Config"]["data source"]: - metric_table = data_source.get("metric_table") - if metric_table and metric_table["metric"] is None: - metric_table["metric"] = {} - d[config["Panel Config"]["id"]] = config["Panel Config"] + for dir in dirs: + for root, _, files in os.walk(dir): + for f in files: + if f.endswith(".yaml"): + with open(Path(root) / f) as file: + config_yml = yaml.safe_load(file) + # metric key can be None due to some metric tables not having any metrics + # metric key should be empty dict instead of None + for data_source in config_yml["Panel Config"]["data source"]: + metric_table = data_source.get("metric_table") + if metric_table and metric_table["metric"] is None: + metric_table["metric"] = {} + d[config_yml["Panel Config"]["id"]] = config_yml["Panel Config"] # TODO: sort metrics as the header order in case they- # are not defined in the same order @@ -160,9 +160,9 @@ def create_df_kernel_top_stats( axis=1, ) - grouped = time_stats.groupby(by=["Kernel_Name"]).agg({ - "ExeTime": ["count", "sum", "mean", "median"] - }) + grouped = time_stats.groupby(by=["Kernel_Name"]).agg( + {"ExeTime": ["count", "sum", "mean", "median"]} + ) time_unit_str = "(" + time_unit + ")" grouped.columns = [