MI350 Fix L2 cache to HBM read counters/metrics (#2501)

* Fix rocprofiler-sdk metrics definition

* Use TCC_EA0_RDREQ_128B instead of TCC_BUBBLE counter for L2 cache to
  HBM counters and metrics

* Update MI350 counter definitions
    * FETCH_SIZE
    * BANDWIDTH_EA

* Update MI350 metrics definitions
    * System Speed of Light, L2-Fabric Read BW
    * Roofline Plot Points, AI (Arithmetic Intensity) HBM
    * Roofline Performance Rates, HBM Bandwidth

* Remove redundant definition for gfx950 and fix BANDWIDTH_EA definition

Test HBM bandwidth metric for memcopy workload

* Add memcopy.cpp workload

* Add metric validation test suite to validate HBM Bandwidth metric for
  memcopy workload

* Move gpu_soc() to test_utils.py for better re-usability

* Update TUI analysis config

* Fix hbm bandwidth formula for mi350 in calc_ai_profile

Co-authored-by: Alysa Liu <Alysa.Liu@amd.com>
Esse commit está contido em:
vedithal-amd
2026-01-23 15:56:24 -05:00
commit de GitHub
commit aa5dfb98f9
24 arquivos alterados com 528 adições e 223 exclusões
+2
Ver Arquivo
@@ -58,6 +58,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
* Fix redundant warnings for compute/memory partition not found for < MI 300 series GPUs by skipping partition checks
* Fixed formula for metrics related to reads from L2 cache to HBM for MI350
### Removed
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
@@ -388,6 +388,13 @@ add_test(
tests/test_profile_general.py ${WORKING_DIR_OPTION}
)
add_test(
NAME test_metric_validation
COMMAND
${PYTHON_TEST_COMMAND} -m pytest --junitxml=tests/test_metric_validation.xml
${COV_OPTION} tests/test_metric_validation.py ${WORKING_DIR_OPTION}
)
set_tests_properties(
test_profile_kernel_execution
test_profile_dispatch
@@ -748,6 +755,7 @@ if(INSTALL_TESTS)
tests/laplace_eqn
tests/mat_mul_max
tests/rocflop
tests/memcopy
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}/tests
COMPONENT tests
)
+2 -2
Ver Arquivo
@@ -38,8 +38,8 @@ python3 -m pip install -r requirements.txt
## Testing
Populate the <username> variable in `docker/docker-compose.customrocmtest.yml`.
Populate the <tarball_name> variable in `docker/Dockerfile.customrocmtest` based on latest TheRock nightly build information.
Populate the <usename> variable in `docker/docker-compose.customrocmtest.yml`.
Populate the <rocm_build_image> variable in `docker/Dockerfile.customrocmtest` based on latest ROCm CI build information.
To quickly get the environment (bash shell) for building and testing, run the following commands:
* `cd docker`
@@ -1,5 +1,5 @@
# Use a base image
FROM ubuntu:22.04
FROM <rocm_build_image>
# Install curl first (needed for ROCm download)
RUN apt-get update && apt-get install -y curl
@@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y curl
# Define the tarball name as a variable
# Check https://therock-nightly-tarball.s3.amazonaws.com/index.html for latest builds
# Use therock-dist-linux-gfx<arch>-dcgpu-<rocm-version>.tar.gz naming convention
ARG TARBALL_NAME=<tarball_name>
ARG TARBALL_NAME=therock-dist-linux-gfx94X-dcgpu-7.11.0a20260116.tar.gz
# Install ROCm from TheRock Nightly build
RUN mkdir -p /rocm && \
@@ -24,29 +24,27 @@ ENV PATH="/rocm/bin:${PATH}" \
# Update package list and install prerequisites
RUN apt-get update && apt-get install -y \
software-properties-common cmake locales git \
software-properties-common cmake locales git curl \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update
# Allows running git commands in /app
RUN git config --global --add safe.directory /app
# Generate the desired locale
RUN locale-gen en_US.UTF-8
# Install Python 3.10 and pip
RUN apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip libsqlite3-dev
RUN python3.10 -m venv /venv
ENV PATH="/venv/bin:$PATH"
RUN python3.10 -m venv venv
ENV PATH="venv/bin:$PATH"
RUN python -m pip install --upgrade pip
# Install any rocprofiler-compute dependencies specified in requirements.txt
# Install any dependencies specified in requirements.txt
WORKDIR /app/projects/rocprofiler-compute
COPY projects/rocprofiler-compute/requirements.txt /app/projects/rocprofiler-compute/requirements.txt
COPY projects/rocprofiler-compute/requirements-test.txt /app/projects/rocprofiler-compute/requirements-test.txt
RUN python -m pip install -r /app/projects/rocprofiler-compute/requirements.txt -r /app/projects/rocprofiler-compute/requirements-test.txt
# Set the working directory
WORKDIR /app
# Allows running git commands in /app
RUN git config --global --add safe.directory /app
RUN python -m pip install -r requirements.txt -r requirements-test.txt
# Run interactive bash shell
CMD ["/bin/bash"]
@@ -0,0 +1,125 @@
#include <hip/hip_runtime.h>
#include <iostream>
#include <iomanip>
// Simple memory copy kernel - one thread per element
__global__ void memoryCopyKernel(const double* __restrict__ X,
double* __restrict__ Y,
size_t N) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) {
Y[idx] = X[idx];
}
}
int main() {
const size_t N = 268435456; // Number of elements
const size_t bytes = N * sizeof(double);
// Print array size in GiB
double sizeGiB = bytes / (1024.0 * 1024.0 * 1024.0);
std::cout << "Array size: " << std::fixed << std::setprecision(2)
<< sizeGiB << " GiB (" << bytes << " bytes)" << std::endl;
std::cout << "Number of elements: " << N << std::endl;
std::cout << std::string(60, '-') << std::endl;
// Allocate device memory
double *d_X, *d_Y;
hipMalloc(&d_X, bytes);
hipMalloc(&d_Y, bytes);
// Allocate and initialize host memory
double *h_X = new double[N];
for (size_t i = 0; i < N; i++) {
h_X[i] = static_cast<double>(i);
}
// Copy data to device
hipMemcpy(d_X, h_X, bytes, hipMemcpyHostToDevice);
// Launch configuration
int blockSize = 256;
int gridSize = (N + blockSize - 1) / blockSize;
std::cout << "Launch configuration:" << std::endl;
std::cout << " Block size: " << blockSize << std::endl;
std::cout << " Grid size: " << gridSize << std::endl;
std::cout << std::string(60, '-') << std::endl;
// Warmup run
memoryCopyKernel<<<gridSize, blockSize>>>(d_X, d_Y, N);
hipDeviceSynchronize();
// Timed runs
const int numRuns = 1000;
hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);
float totalTime = 0.0f;
for (int run = 0; run < numRuns; run++) {
hipEventRecord(start);
memoryCopyKernel<<<gridSize, blockSize>>>(d_X, d_Y, N);
hipEventRecord(stop);
hipEventSynchronize(stop);
float milliseconds = 0;
hipEventElapsedTime(&milliseconds, start, stop);
totalTime += milliseconds;
}
float avgTime = totalTime / numRuns;
// Calculate bandwidth
// Memory copy reads N elements and writes N elements = 2*N*sizeof(double) bytes
double totalBytesTransferred = 2.0 * N * sizeof(double);
double bandwidthGBps = (totalBytesTransferred / (avgTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0);
std::cout << "Performance Results (averaged over " << numRuns << " runs):" << std::endl;
std::cout << " Kernel execution time: " << std::fixed << std::setprecision(3)
<< avgTime << " ms" << std::endl;
std::cout << " Data transferred: " << std::setprecision(2)
<< (totalBytesTransferred / (1024.0 * 1024.0 * 1024.0)) << " GiB" << std::endl;
std::cout << " Achieved bandwidth: " << std::setprecision(2)
<< bandwidthGBps << " GiB/s" << std::endl;
std::cout << std::string(60, '-') << std::endl;
// Verify correctness
double *h_Y = new double[N];
hipMemcpy(h_Y, d_Y, bytes, hipMemcpyDeviceToHost);
bool correct = true;
for (size_t i = 0; i < N; i++) {
if (h_Y[i] != h_X[i]) {
correct = false;
std::cout << "Mismatch at index " << i << ": "
<< h_Y[i] << " != " << h_X[i] << std::endl;
break;
}
}
if (correct) {
std::cout << "✓ Verification PASSED - all elements copied correctly" << std::endl;
} else {
std::cout << "✗ Verification FAILED" << std::endl;
}
std::cout << "\nFirst 5 elements:" << std::endl;
for (int i = 0; i < 5; i++) {
std::cout << " Y[" << i << "] = " << h_Y[i] << std::endl;
}
// Cleanup
hipEventDestroy(start);
hipEventDestroy(stop);
delete[] h_X;
delete[] h_Y;
hipFree(d_X);
hipFree(d_Y);
return 0;
}
@@ -1203,10 +1203,10 @@ Modification:
pop: |
((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
- L2-Fabric Read BW:
value: |
AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
pop: |
((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
value: |
AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
- MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
@@ -1275,13 +1275,20 @@ Modification:
id: 400
title: Roofline
metric_tables:
- metric_table:
id: 401
title: Roofline Performance Rates
metrics:
- HBM Bandwidth:
value: |
AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
- metric_table:
id: 402
title: Roofline Plot Points
metrics:
- AI HBM:
value: |
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
- AI L1:
value: |
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
@@ -1474,9 +1481,9 @@ Modification:
title: L2-Fabric Read Stall (Cycles per normUnit)
metrics:
- ::_1:
ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
- metric_table:
id: 1810
title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
@@ -781,14 +781,14 @@ Modification:
metrics:
- HBM Bandwidth:
value: |
AVG((( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
- metric_table:
id: 402
title: Roofline Plot Points
metrics:
- AI HBM:
value: |
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
- AI L1:
value: |
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
@@ -807,37 +807,37 @@ Modification:
title: Workgroup manager utilizations
metrics:
- Dispatched Wavefronts:
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
- Dispatched Workgroups:
min: |
MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
avg: |
AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
max: |
MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
min: |
MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
- SGPR Writes:
min: |
MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
avg: |
AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
max: |
MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
min: |
MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
- Scheduler-Pipe Utilization:
min: |
MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
avg: |
AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
max: |
MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
min: |
MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
- VGPR Writes:
min: |
MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
avg: |
AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
max: |
MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
min: |
MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
- Panel Config:
id: 700
title: Wavefront
@@ -847,9 +847,9 @@ Modification:
title: Wavefront Launch Stats
metrics:
- Total Wavefronts:
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
- Panel Config:
id: 1000
title: Compute Units - Instruction Mix
@@ -859,9 +859,9 @@ Modification:
title: Overall Instruction Mix
metrics:
- VMEM:
min: MIN(((SQ_INSTS_VMEM) / $denom))
avg: AVG(((SQ_INSTS_VMEM) / $denom))
max: MAX(((SQ_INSTS_VMEM) / $denom))
min: MIN(((SQ_INSTS_VMEM) / $denom))
- Panel Config:
id: 1100
title: Compute Units - Compute Pipeline
@@ -891,12 +891,12 @@ Modification:
title: Arithmetic Operations
metrics:
- FLOPs (Total):
min: |
MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
avg: |
AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
max: |
MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
min: |
MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
- Panel Config:
id: 1500
title: Address Processing Unit and Data Return Path (TA/TD)
@@ -923,21 +923,21 @@ Modification:
title: vL1D cache access metrics
metrics:
- Cache BW:
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
- L1 Access Latency:
unit: (Cycles + $normUnit)
avg: AVG((TCP_TCP_LATENCY_sum / $denom))
max: MAX((TCP_TCP_LATENCY_sum / $denom))
min: MIN((TCP_TCP_LATENCY_sum / $denom))
- L1-L2 BW:
min: |
MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
avg: |
AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
max: |
MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
min: |
MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
- L1-L2 Read Latency:
unit: (Cycles + $normUnit)
avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
@@ -967,48 +967,50 @@ Modification:
title: L2-Fabric interface metrics
metrics:
- Atomic Latency:
min: |
MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
avg: |
AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
max: |
MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
min: |
MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
- Atomic Traffic:
min: |
MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
avg: |
AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
max: |
MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
min: |
MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
- HBM Read Traffic:
min: |
MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
avg: |
AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
max: |
MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
min: |
MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
- HBM Write and Atomic Traffic:
min: |
MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
avg: |
AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
max: |
MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
min: |
MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
- Read BW:
min: |
MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
avg: |
AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
max: |
MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
min: |
MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
- Read Latency:
min: |
MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
avg: |
AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
max: |
MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
min: |
MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
- Remote Read Traffic:
min: |
MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
avg: |
AVG((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
max: |
@@ -1016,6 +1018,8 @@ Modification:
min: |
MIN((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
- Remote Write and Atomic Traffic:
min: |
MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
avg: |
AVG((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
max: |
@@ -1023,19 +1027,19 @@ Modification:
min: |
MIN((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
- Uncached Read Traffic:
min: |
MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
avg: |
AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
max: |
MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
min: |
MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
- Uncached Write and Atomic Traffic:
min: |
MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
avg: |
AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
max: |
MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
min: |
MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
- Write and Atomic BW:
unit: Gbps
avg: |
@@ -1045,40 +1049,40 @@ Modification:
min: |
MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))
- Write and Atomic Latency:
min: |
MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
avg: |
AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
max: |
MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
min: |
MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
- metric_table:
id: 1706
title: L2 - Fabric interface detailed metrics
metrics:
- Atomic:
min: MIN((TCC_EA0_ATOMIC_sum / $denom))
avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
max: MAX((TCC_EA0_ATOMIC_sum / $denom))
min: MIN((TCC_EA0_ATOMIC_sum / $denom))
- HBM Read:
min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
- HBM Write and Atomic:
min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
- Read (32B):
min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
- Read (64B):
min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
- Read (Uncached):
min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
- Remote Read:
avg: |
AVG((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom))
@@ -1101,13 +1105,13 @@ Modification:
min: |
MIN(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum))
- Write and Atomic (64B):
min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
- Write and Atomic (Uncached):
min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
- Panel Config:
id: 1800
title: L2 Cache (per Channel)
@@ -1125,14 +1129,20 @@ Modification:
MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
min: |
MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
std dev: |
STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
avg: |
AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
max: |
MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
- metric_table:
id: 1805
title: L2-Fabric Requests (per normUnit)
metrics:
- ::_1:
read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
- metric_table:
id: 1806
title: L2-Fabric Read Latency (Cycles)
@@ -714,6 +714,11 @@ Modification:
id: 201
title: System Speed-of-Light
metrics:
- L2-Fabric Read BW:
pop: |
((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
value: |
AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
- MFMA FLOPs (BF16):
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: |
@@ -751,13 +756,20 @@ Modification:
id: 400
title: Roofline
metric_tables:
- metric_table:
id: 401
title: Roofline Performance Rates
metrics:
- HBM Bandwidth:
value: |
AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
- metric_table:
id: 402
title: Roofline Plot Points
metrics:
- AI HBM:
value: |
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
- AI L1:
value: |
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
@@ -901,9 +913,9 @@ Modification:
title: L2-Fabric Read Stall (Cycles per normUnit)
metrics:
- ::_1:
ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
- metric_table:
id: 1810
title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
@@ -714,27 +714,32 @@ Modification:
id: 201
title: System Speed-of-Light
metrics:
- L2-Fabric Read BW:
value: |
AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
pop: |
((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
- MFMA FLOPs (BF16):
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
- MFMA FLOPs (F16):
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
- MFMA FLOPs (F16):
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
- MFMA FLOPs (F64):
peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
- MFMA FLOPs (F8):
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
unit: GFLOP/s
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
- MFMA IOPs (Int8):
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
- MFMA IOPs (Int8):
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
- Panel Config:
id: 300
title: Memory Chart
@@ -752,13 +757,20 @@ Modification:
id: 400
title: Roofline
metric_tables:
- metric_table:
id: 401
title: Roofline Performance Rates
metrics:
- HBM Bandwidth:
value: |
AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
- metric_table:
id: 402
title: Roofline Plot Points
metrics:
- AI HBM:
value: |
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
- AI L1:
value: |
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
@@ -780,6 +792,7 @@ Modification:
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
- Dispatched Workgroups:
min: |
MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
@@ -787,6 +800,8 @@ Modification:
MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
avg: |
AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
min: |
MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
- SGPR Writes:
min: |
MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
@@ -794,6 +809,8 @@ Modification:
MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
avg: |
AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
min: |
MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
- Scheduler-Pipe Utilization:
min: |
MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
@@ -801,6 +818,8 @@ Modification:
MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
avg: |
AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
min: |
MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
- VGPR Writes:
min: |
MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
@@ -808,6 +827,8 @@ Modification:
MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
avg: |
AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
min: |
MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
- Panel Config:
id: 700
title: Wavefront
@@ -820,6 +841,7 @@ Modification:
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
- Panel Config:
id: 1100
title: Compute Units - Compute Pipeline
@@ -829,25 +851,25 @@ Modification:
title: Compute Speed-of-Light
metrics:
- MFMA FLOPs (BF16):
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
- MFMA FLOPs (F16):
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
- MFMA FLOPs (F16):
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
- MFMA FLOPs (F64):
peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
- MFMA FLOPs (F8):
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
- MFMA IOPs (INT8):
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
- MFMA IOPs (INT8):
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
- metric_table:
id: 1103
title: Arithmetic Operations
@@ -859,6 +881,8 @@ Modification:
MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
avg: |
AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
min: |
MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
- Panel Config:
id: 1700
title: L2 Cache
@@ -889,10 +913,12 @@ Modification:
min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
- Read (64B):
min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
- Panel Config:
id: 1800
title: L2 Cache (per Channel)
@@ -709,6 +709,11 @@ Modification:
id: 201
title: System Speed-of-Light
metrics:
- L2-Fabric Read BW:
value: |
AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
pop: |
((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
- MFMA FLOPs (BF16):
pop: |
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
@@ -746,13 +751,20 @@ Modification:
id: 400
title: Roofline
metric_tables:
- metric_table:
id: 401
title: Roofline Performance Rates
metrics:
- HBM Bandwidth:
value: |
AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
- metric_table:
id: 402
title: Roofline Plot Points
metrics:
- AI HBM:
value: |
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
- AI L1:
value: |
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
@@ -174,13 +174,13 @@ Panel Config:
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
L2-Fabric Read BW:
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
value: AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum
+ 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
- Start_Timestamp))
unit: GB/s
peak: $hbmBandwidth
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
pop: ((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum
+ 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
- Start_Timestamp)))) / $hbmBandwidth)
L2-Fabric Write BW:
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
@@ -68,8 +68,8 @@ Panel Config:
unit: GIOP/s
peak: $MFMAI8Ops_empirical_peak
HBM Bandwidth:
value: AVG((( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum
- TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum
value: AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32
+ TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum
- TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp
- Start_Timestamp) / 1e9)) / 1e9)
unit: GB/s
@@ -108,9 +108,9 @@ Panel Config:
+ SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
* 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum
* 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
- TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
* 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum
* 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum
* 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
* 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
unit: FLOPs/Byte
AI L2:
@@ -587,8 +587,10 @@ rocprofiler-sdk:
- gfx940
- gfx941
- gfx942
- gfx950
expression: (TCC_BUBBLE_sum*128 + (TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum-TCC_EA0_RDREQ_32B_sum)*64 + TCC_EA0_RDREQ_32B_sum*32)/1024
- architectures:
- gfx950
expression: (TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64+TCC_EA0_RDREQ_32B_sum*32)/1024
- architectures:
- gfx10
- gfx1010
@@ -612,12 +614,16 @@ rocprofiler-sdk:
- architectures:
- gfx90a
expression: 1024*(WRITE_SIZE+FETCH_SIZE)/reduce(GRBM_GUI_ACTIVE,max)
# Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected.
- architectures:
- gfx940
- gfx941
- gfx942
expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
# Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected.
- architectures:
- gfx950
expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
expression: (WRITE_SIZE*1024+TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64)/reduce(GRBM_GUI_ACTIVE,max)
- name: FetchSize
description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache
or memory effects taken into account.
@@ -65,13 +65,13 @@ Panel Config:
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
L2-Fabric Read BW:
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
value: AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum
+ 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
- Start_Timestamp))
unit: GB/s
peak: $hbmBandwidth
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
pop: ((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum
+ 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
- Start_Timestamp)))) / $hbmBandwidth)
L2-Fabric Write BW:
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
@@ -100,4 +100,4 @@ Panel Config:
value: (GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) / (End_Timestamp - Start_Timestamp)
unit: ns
peak: N/A
pop: N/A
pop: N/A
@@ -1,7 +1,7 @@
{
"archs": {
"gfx908": {
"delta_hash": "bae93343a258d4b3f5e64c2f2ce91d1a",
"delta_hash": "1417ef1cfc1b47b7907608f2425f9f11",
"files": {
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
@@ -24,7 +24,7 @@
}
},
"gfx90a": {
"delta_hash": "d64f63cee8ace777ae46002c13756ab8",
"delta_hash": "c3ff3aa253d044f895d776eff72279ad",
"files": {
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
@@ -47,7 +47,7 @@
}
},
"gfx940": {
"delta_hash": "7b5e4d755c2c4e1654d0d01576284df2",
"delta_hash": "35aa77b45341e2a215b8251ad73f472f",
"files": {
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
@@ -70,7 +70,7 @@
}
},
"gfx941": {
"delta_hash": "9e62e4734bcc6a318ecdbb6d5432ce98",
"delta_hash": "8057ce09c13a4fa3e4863085c18a158c",
"files": {
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
@@ -93,7 +93,7 @@
}
},
"gfx942": {
"delta_hash": "4738dc07f7d7f08cfde42ece633e648f",
"delta_hash": "76bacc3fd8b2d4da43c3b8d670a097d9",
"files": {
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
@@ -120,9 +120,9 @@
"files": {
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
"0200_system_speed_of_light.yaml": "bf2ca00d4b255dbbe191a7641b81dc4c",
"0200_system_speed_of_light.yaml": "8e2a370cba884f1e4f695ee70b3bd7b6",
"0300_memory_chart.yaml": "2c82fa6f81a0dda679706d36b99e7913",
"0400_roofline.yaml": "2bd3b630b72d6d165c0d30cf481136a9",
"0400_roofline.yaml": "e39c92773e1499158fd47ce416e7cee5",
"0500_command_processor_cpc_cpf.yaml": "3f7dab1663ad7a6fae3801aec2b1e8d0",
"0600_workgroup_manager_spi.yaml": "e6546a92d283fed5a5dc6df203efb670",
"0700_wavefront.yaml": "330468fd711057b422de9b952c5cfe69",
@@ -34,7 +34,13 @@ import yaml
import config
from utils import rocpd_data, schema
from utils.kernel_name_shortener import kernel_name_shortener
from utils.logger import console_debug, console_error, console_log, demarcate
from utils.logger import (
console_debug,
console_error,
console_log,
console_warning,
demarcate,
)
# TODO: use pandas chunksize or dask to read really large csv file
# from dask import dataframe as dd
@@ -376,7 +382,9 @@ def is_single_panel_config(
elif arch_count == len(arch_names):
return False
else:
console_error("Found multiple panel config sets but incomplete for all archs.")
console_warning(
"Found multiple panel config sets but incomplete for all archs."
)
def find_1st_sub_dir(directory: str) -> Optional[str]:
@@ -1048,7 +1048,7 @@ def create_empirical_peaks_dict(empirical_peaks_df: pd.DataFrame) -> dict[str, f
]
# initialize peaks to 0
for peak_name in peak_names:
empirical_peaks[f"ammolite__{peak_name}_empirical_peak"] = 0
empirical_peaks[f"ammolite__{peak_name}_empirical_peak"] = np.nan
return empirical_peaks
@@ -656,7 +656,21 @@ def calc_ai_profile(
* 32
)
)
elif mspec.gpu_series == "MI350":
# Use TCC_EA0_RDREQ_128B_sum TCC_EA0_RDREQ_64B_sum to calculate hbm_data
hbm_data += (
(df["TCC_EA0_RDREQ_128B_sum"][idx] * 128)
+ (df["TCC_EA0_RDREQ_64B_sum"][idx] * 64)
+ (df["TCC_EA0_RDREQ_32B_sum"][idx] * 32)
+ (
(
df["TCC_EA0_WRREQ_sum"][idx]
- df["TCC_EA0_WRREQ_64B_sum"][idx]
)
* 32
)
+ (df["TCC_EA0_WRREQ_64B_sum"][idx] * 64)
)
else:
# Use TCC_BUBBLE_sum to calculate hbm_data
hbm_data += (
@@ -75,3 +75,11 @@ set_target_properties(
rocflop
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
)
set(MEMCOPY_SOURCES ../sample/memcopy.cpp)
set_source_files_properties(${MEMCOPY_SOURCES} PROPERTIES LANGUAGE HIP)
add_executable(memcopy ${MEMCOPY_SOURCES})
set_target_properties(
memcopy
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
)
@@ -24,8 +24,6 @@
##############################################################################
import csv
import re
import subprocess
from pathlib import Path
import pytest
@@ -39,52 +37,6 @@ config["COUNTER_LOGGING"] = False
config["METRIC_COMPARE"] = False
config["METRIC_LOGGING"] = False
SUPPORTED_ARCHS = {
"gfx940": {"mi300": ["MI300A_A0"]},
"gfx941": {"mi300": ["MI300X_A0"]},
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
}
MI300_CHIP_IDS = {
"29856": "MI300A_A1",
"29857": "MI300X_A1",
"29858": "MI308X",
}
def gpu_soc():
## 1) Parse arch details from rocminfo
rocminfo = str(
# decode with utf-8 to account for rocm-smi changes in latest rocm
subprocess.run(
["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
).stdout.decode("utf-8")
)
rocminfo = rocminfo.split("\n")
soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE)
devices = list(filter(soc_regex.match, rocminfo))
gpu_arch = devices[0].split()[1]
if not gpu_arch in SUPPORTED_ARCHS.keys():
return None
## 2) Parse chip id from rocminfo
chip_id = re.compile(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", re.MULTILINE)
ids = list(filter(chip_id.match, rocminfo))
for id in ids:
chip_id = re.match(r"^[^()]+", id.split()[2]).group(0)
## 3) Deduce gpu model name from arch
gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper()
# For testing purposes we only care about gpu model series not the specific model
# if gpu_model == "MI300":
# if chip_id in MI300_CHIP_IDS:
# gpu_model = MI300_CHIP_IDS[chip_id]
# else:
# return None
return gpu_model
def load_metrics(csv_file_path):
"""
@@ -115,7 +67,7 @@ def load_metrics(csv_file_path):
return metrics_data
soc = gpu_soc()
soc = test_utils.gpu_soc()
@pytest.mark.L1_cache
@@ -0,0 +1,144 @@
##############################################################################
# MIT License
#
# Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
##############################################################################
import pandas as pd
import pytest
import test_utils
config = {}
config["memcopy"] = ["tests/memcopy"]
config["cleanup"] = True
soc = test_utils.gpu_soc()
# workload -> gfx -> metric definition
VALIDATE_METRICS = {
"memcopy": {
"MI100": [
{
"name": "HBM Bandwidth",
"metric_id": "4.1.8",
"csv_file": "4.1_Roofline_Performance_Rates.csv",
"column": "Value",
"expected_value": 1044.48,
},
],
"MI200": [
{
"name": "HBM Bandwidth",
"metric_id": "4.1.8",
"csv_file": "4.1_Roofline_Performance_Rates.csv",
"column": "Value",
"expected_value": 1389.17,
},
],
"MI300": [
{
"name": "HBM Bandwidth",
"metric_id": "4.1.9",
"csv_file": "4.1_Roofline_Performance_Rates.csv",
"column": "Value",
"expected_value": 3910.62,
},
],
"MI350": [
{
"name": "HBM Bandwidth",
"metric_id": "4.1.10",
"csv_file": "4.1_Roofline_Performance_Rates.csv",
"column": "Value",
"expected_value": 5690.42,
},
],
# Ignore warmup dispatch
# Collect roofline block
"profile_options": ["-d", "2-1001", "-b", "4"],
"roof": True,
}
}
@pytest.mark.path
def test_validate_metrics(
binary_handler_profile_rocprof_compute, binary_handler_analyze_rocprof_compute
):
for workload in VALIDATE_METRICS.keys():
metrics = VALIDATE_METRICS[workload].get(soc, [])
metric_ids = [metric["metric_id"] for metric in metrics]
if not metric_ids:
print(
f"Skipping metric validation for {workload} on {soc}. "
"No metrics to validate."
)
continue
profile_workload_dir = test_utils.get_output_dir(param_id=f"{workload}_profile")
analysis_workload_dir = test_utils.get_output_dir(
param_id=f"{workload}_analysis"
)
try:
# Ensure non zero length of profile df
options = VALIDATE_METRICS[workload].get("profile_options", [])
_ = binary_handler_profile_rocprof_compute(
config,
profile_workload_dir,
options,
check_success=True,
roof=VALIDATE_METRICS[workload].get("roof", False),
app_name=workload,
)
_ = test_utils.check_csv_files(
profile_workload_dir, num_devices=1, num_kernels=1
)
# Check whether metric values are correct
code = binary_handler_analyze_rocprof_compute([
"analyze",
"--output-name",
f"{analysis_workload_dir}",
"--output-format",
"csv",
"-b",
*metric_ids,
"--path",
profile_workload_dir,
])
assert code == 0
for metric in metrics:
actual = pd.read_csv(f"{analysis_workload_dir}/{metric['csv_file']}")[
metric["column"]
].values[0]
expected = metric["expected_value"]
# 5% tolerance in checking
assert abs(actual - expected) / expected <= 0.05, (
f"{metric['name']} ({metric['metric_id']}): "
f"actual={actual}, expected={expected}, "
f"diff={(abs(actual - expected) / expected * 100):.2f}% "
f"(tolerance: 5%)"
)
finally:
test_utils.clean_output_dir(config["cleanup"], analysis_workload_dir)
test_utils.clean_output_dir(config["cleanup"], profile_workload_dir)
@@ -38,29 +38,7 @@ import pytest
import test_utils
from scipy.stats import zscore
# Globals
# TODO: MI350 What are the gpu models in MI 350 series
SUPPORTED_ARCHS = {
"gfx908": {"mi100": ["MI100"]},
"gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]},
"gfx940": {"mi300": ["MI300A_A0"]},
"gfx941": {"mi300": ["MI300X_A0"]},
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
"gfx950": {"mi350": ["MI350"]},
}
CHIP_IDS = {
"29856": "MI300A_A1",
"29857": "MI300X_A1",
"29858": "MI308X",
"30112": "MI350",
}
# --
# Runtime config options
# --
config = {}
config["kernel_name_1"] = "vecCopy"
config["app_1"] = ["./tests/vcopy", "-n", "1048576", "-b", "256", "-i", "3"]
@@ -248,47 +226,7 @@ def counter_compare(test_name, errors_pd, baseline_df, run_df, threshold=5):
return errors_pd
def gpu_soc():
global num_devices
## 1) Parse arch details from rocminfo
rocminfo = str(
# decode with utf-8 to account for rocm-smi changes in latest rocm
subprocess.run(
["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
).stdout.decode("utf-8")
)
rocminfo = rocminfo.split("\n")
soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE)
devices = list(filter(soc_regex.match, rocminfo))
gpu_arch = devices[0].split()[1]
if not gpu_arch in SUPPORTED_ARCHS.keys():
print("Cannot find a supported arch in rocminfo")
assert 0
else:
num_devices = (
len(devices)
if not "CI_VISIBLE_DEVICES" in os.environ
else os.environ["CI_VISIBLE_DEVICES"]
)
## 2) Parse chip id from rocminfo
chip_id = re.compile(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", re.MULTILINE)
ids = list(filter(chip_id.match, rocminfo))
for id in ids:
chip_id = re.match(r"^[^()]+", id.split()[2]).group(0)
## 3) Deduce gpu model name from arch
gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper()
# For testing purposes we only care about gpu model series not the specific model
# if gpu_model not in ("MI50", "MI100", "MI200"):
# if chip_id in CHIP_IDS:
# gpu_model = CHIP_IDS[chip_id]
return gpu_model
soc = gpu_soc()
soc = test_utils.gpu_soc()
os.environ["ROCPROF"] = "rocprofiler-sdk"
@@ -645,9 +583,7 @@ def test_path(binary_handler_profile_rocprof_compute):
@pytest.mark.path
def test_path_rocflop(
binary_handler_profile_rocprof_compute,
):
def test_path_rocflop(binary_handler_profile_rocprof_compute):
# Test whether multiprocess workloads like rocflop are handled correctly
workload_dir = test_utils.get_output_dir()
options = ["--block", "2.1.1"]
@@ -32,6 +32,7 @@ import logging
import os
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
from unittest import mock
@@ -41,6 +42,15 @@ import pytest
import utils.utils as utils
SUPPORTED_ARCHS = {
"gfx908": {"mi100": ["MI100"]},
"gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]},
"gfx940": {"mi300": ["MI300A_A0"]},
"gfx941": {"mi300": ["MI300X_A0"]},
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
"gfx950": {"mi350": ["MI350"]},
}
class MockMSpec:
def __init__(
@@ -228,6 +238,27 @@ def get_num_pmc_file(output_dir):
])
def gpu_soc():
# Parse arch details from rocminfo
rocminfo = str(
# decode with utf-8 to account for rocm-smi changes in latest rocm
subprocess.run(
["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
).stdout.decode("utf-8")
)
rocminfo = rocminfo.split("\n")
soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE)
devices = list(filter(soc_regex.match, rocminfo))
gpu_arch = devices[0].split()[1]
if not gpu_arch in SUPPORTED_ARCHS.keys():
return None
gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper()
return gpu_model
# =============================================================================
# VERSION UTILITIES TESTS
# =============================================================================
@@ -588,8 +588,10 @@ rocprofiler-sdk:
- gfx940
- gfx941
- gfx942
- gfx950
expression: (TCC_BUBBLE_sum*128 + (TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum-TCC_EA0_RDREQ_32B_sum)*64 + TCC_EA0_RDREQ_32B_sum*32)/1024
- architectures:
- gfx950
expression: (TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64+TCC_EA0_RDREQ_32B_sum*32)/1024
- architectures:
- gfx10
- gfx1010
@@ -615,12 +617,16 @@ rocprofiler-sdk:
- architectures:
- gfx90a
expression: 1024*(WRITE_SIZE+FETCH_SIZE)/reduce(GRBM_GUI_ACTIVE,max)
# Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected.
- architectures:
- gfx940
- gfx941
- gfx942
expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
# Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected.
- architectures:
- gfx950
expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
expression: (WRITE_SIZE*1024+TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64)/reduce(GRBM_GUI_ACTIVE,max)
- name: FetchSize
description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache
or memory effects taken into account.