MI350 Fix L2 cache to HBM read counters/metrics (#2501)
* Fix rocprofiler-sdk metrics definition
* Use TCC_EA0_RDREQ_128B instead of TCC_BUBBLE counter for L2 cache to
HBM counters and metrics
* Update MI350 counter definitions
* FETCH_SIZE
* BANDWIDTH_EA
* Update MI350 metrics definitions
* System Speed of Light, L2-Fabric Read BW
* Roofline Plot Points, AI (Arithmetic Intensity) HBM
* Roofline Performance Rates, HBM Bandwidth
* Remove redundant definition for gfx950 and fix BANDWIDTH_EA definition
Test HBM bandwidth metric for memcopy workload
* Add memcopy.cpp workload
* Add metric validation test suite to validate HBM Bandwidth metric for
memcopy workload
* Move gpu_soc() to test_utils.py for better re-usability
* Update TUI analysis config
* Fix hbm bandwidth formula for mi350 in calc_ai_profile
Co-authored-by: Alysa Liu <Alysa.Liu@amd.com>
Цей коміт міститься в:
@@ -58,6 +58,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
|
||||
|
||||
* Fix redundant warnings for compute/memory partition not found for < MI 300 series GPUs by skipping partition checks
|
||||
|
||||
* Fixed formula for metrics related to reads from L2 cache to HBM for MI350
|
||||
|
||||
### Removed
|
||||
|
||||
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
|
||||
|
||||
@@ -388,6 +388,13 @@ add_test(
|
||||
tests/test_profile_general.py ${WORKING_DIR_OPTION}
|
||||
)
|
||||
|
||||
add_test(
|
||||
NAME test_metric_validation
|
||||
COMMAND
|
||||
${PYTHON_TEST_COMMAND} -m pytest --junitxml=tests/test_metric_validation.xml
|
||||
${COV_OPTION} tests/test_metric_validation.py ${WORKING_DIR_OPTION}
|
||||
)
|
||||
|
||||
set_tests_properties(
|
||||
test_profile_kernel_execution
|
||||
test_profile_dispatch
|
||||
@@ -748,6 +755,7 @@ if(INSTALL_TESTS)
|
||||
tests/laplace_eqn
|
||||
tests/mat_mul_max
|
||||
tests/rocflop
|
||||
tests/memcopy
|
||||
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}/tests
|
||||
COMPONENT tests
|
||||
)
|
||||
|
||||
@@ -38,8 +38,8 @@ python3 -m pip install -r requirements.txt
|
||||
|
||||
## Testing
|
||||
|
||||
Populate the <username> variable in `docker/docker-compose.customrocmtest.yml`.
|
||||
Populate the <tarball_name> variable in `docker/Dockerfile.customrocmtest` based on latest TheRock nightly build information.
|
||||
Populate the <usename> variable in `docker/docker-compose.customrocmtest.yml`.
|
||||
Populate the <rocm_build_image> variable in `docker/Dockerfile.customrocmtest` based on latest ROCm CI build information.
|
||||
|
||||
To quickly get the environment (bash shell) for building and testing, run the following commands:
|
||||
* `cd docker`
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Use a base image
|
||||
FROM ubuntu:22.04
|
||||
FROM <rocm_build_image>
|
||||
|
||||
# Install curl first (needed for ROCm download)
|
||||
RUN apt-get update && apt-get install -y curl
|
||||
@@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y curl
|
||||
# Define the tarball name as a variable
|
||||
# Check https://therock-nightly-tarball.s3.amazonaws.com/index.html for latest builds
|
||||
# Use therock-dist-linux-gfx<arch>-dcgpu-<rocm-version>.tar.gz naming convention
|
||||
ARG TARBALL_NAME=<tarball_name>
|
||||
ARG TARBALL_NAME=therock-dist-linux-gfx94X-dcgpu-7.11.0a20260116.tar.gz
|
||||
|
||||
# Install ROCm from TheRock Nightly build
|
||||
RUN mkdir -p /rocm && \
|
||||
@@ -24,29 +24,27 @@ ENV PATH="/rocm/bin:${PATH}" \
|
||||
|
||||
# Update package list and install prerequisites
|
||||
RUN apt-get update && apt-get install -y \
|
||||
software-properties-common cmake locales git \
|
||||
software-properties-common cmake locales git curl \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update
|
||||
|
||||
# Allows running git commands in /app
|
||||
RUN git config --global --add safe.directory /app
|
||||
|
||||
# Generate the desired locale
|
||||
RUN locale-gen en_US.UTF-8
|
||||
|
||||
# Install Python 3.10 and pip
|
||||
RUN apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip libsqlite3-dev
|
||||
RUN python3.10 -m venv /venv
|
||||
ENV PATH="/venv/bin:$PATH"
|
||||
RUN python3.10 -m venv venv
|
||||
ENV PATH="venv/bin:$PATH"
|
||||
RUN python -m pip install --upgrade pip
|
||||
|
||||
# Install any rocprofiler-compute dependencies specified in requirements.txt
|
||||
# Install any dependencies specified in requirements.txt
|
||||
WORKDIR /app/projects/rocprofiler-compute
|
||||
COPY projects/rocprofiler-compute/requirements.txt /app/projects/rocprofiler-compute/requirements.txt
|
||||
COPY projects/rocprofiler-compute/requirements-test.txt /app/projects/rocprofiler-compute/requirements-test.txt
|
||||
RUN python -m pip install -r /app/projects/rocprofiler-compute/requirements.txt -r /app/projects/rocprofiler-compute/requirements-test.txt
|
||||
|
||||
# Set the working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Allows running git commands in /app
|
||||
RUN git config --global --add safe.directory /app
|
||||
RUN python -m pip install -r requirements.txt -r requirements-test.txt
|
||||
|
||||
# Run interactive bash shell
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -0,0 +1,125 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
// Simple memory copy kernel - one thread per element
|
||||
__global__ void memoryCopyKernel(const double* __restrict__ X,
|
||||
double* __restrict__ Y,
|
||||
size_t N) {
|
||||
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (idx < N) {
|
||||
Y[idx] = X[idx];
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
const size_t N = 268435456; // Number of elements
|
||||
const size_t bytes = N * sizeof(double);
|
||||
|
||||
// Print array size in GiB
|
||||
double sizeGiB = bytes / (1024.0 * 1024.0 * 1024.0);
|
||||
std::cout << "Array size: " << std::fixed << std::setprecision(2)
|
||||
<< sizeGiB << " GiB (" << bytes << " bytes)" << std::endl;
|
||||
std::cout << "Number of elements: " << N << std::endl;
|
||||
std::cout << std::string(60, '-') << std::endl;
|
||||
|
||||
// Allocate device memory
|
||||
double *d_X, *d_Y;
|
||||
hipMalloc(&d_X, bytes);
|
||||
hipMalloc(&d_Y, bytes);
|
||||
|
||||
// Allocate and initialize host memory
|
||||
double *h_X = new double[N];
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
h_X[i] = static_cast<double>(i);
|
||||
}
|
||||
|
||||
// Copy data to device
|
||||
hipMemcpy(d_X, h_X, bytes, hipMemcpyHostToDevice);
|
||||
|
||||
// Launch configuration
|
||||
int blockSize = 256;
|
||||
int gridSize = (N + blockSize - 1) / blockSize;
|
||||
|
||||
std::cout << "Launch configuration:" << std::endl;
|
||||
std::cout << " Block size: " << blockSize << std::endl;
|
||||
std::cout << " Grid size: " << gridSize << std::endl;
|
||||
std::cout << std::string(60, '-') << std::endl;
|
||||
|
||||
// Warmup run
|
||||
memoryCopyKernel<<<gridSize, blockSize>>>(d_X, d_Y, N);
|
||||
hipDeviceSynchronize();
|
||||
|
||||
// Timed runs
|
||||
const int numRuns = 1000;
|
||||
hipEvent_t start, stop;
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
|
||||
float totalTime = 0.0f;
|
||||
|
||||
for (int run = 0; run < numRuns; run++) {
|
||||
hipEventRecord(start);
|
||||
|
||||
memoryCopyKernel<<<gridSize, blockSize>>>(d_X, d_Y, N);
|
||||
|
||||
hipEventRecord(stop);
|
||||
hipEventSynchronize(stop);
|
||||
|
||||
float milliseconds = 0;
|
||||
hipEventElapsedTime(&milliseconds, start, stop);
|
||||
totalTime += milliseconds;
|
||||
}
|
||||
|
||||
float avgTime = totalTime / numRuns;
|
||||
|
||||
// Calculate bandwidth
|
||||
// Memory copy reads N elements and writes N elements = 2*N*sizeof(double) bytes
|
||||
double totalBytesTransferred = 2.0 * N * sizeof(double);
|
||||
double bandwidthGBps = (totalBytesTransferred / (avgTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0);
|
||||
|
||||
std::cout << "Performance Results (averaged over " << numRuns << " runs):" << std::endl;
|
||||
std::cout << " Kernel execution time: " << std::fixed << std::setprecision(3)
|
||||
<< avgTime << " ms" << std::endl;
|
||||
std::cout << " Data transferred: " << std::setprecision(2)
|
||||
<< (totalBytesTransferred / (1024.0 * 1024.0 * 1024.0)) << " GiB" << std::endl;
|
||||
std::cout << " Achieved bandwidth: " << std::setprecision(2)
|
||||
<< bandwidthGBps << " GiB/s" << std::endl;
|
||||
std::cout << std::string(60, '-') << std::endl;
|
||||
|
||||
// Verify correctness
|
||||
double *h_Y = new double[N];
|
||||
hipMemcpy(h_Y, d_Y, bytes, hipMemcpyDeviceToHost);
|
||||
|
||||
bool correct = true;
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
if (h_Y[i] != h_X[i]) {
|
||||
correct = false;
|
||||
std::cout << "Mismatch at index " << i << ": "
|
||||
<< h_Y[i] << " != " << h_X[i] << std::endl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (correct) {
|
||||
std::cout << "✓ Verification PASSED - all elements copied correctly" << std::endl;
|
||||
} else {
|
||||
std::cout << "✗ Verification FAILED" << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "\nFirst 5 elements:" << std::endl;
|
||||
for (int i = 0; i < 5; i++) {
|
||||
std::cout << " Y[" << i << "] = " << h_Y[i] << std::endl;
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
delete[] h_X;
|
||||
delete[] h_Y;
|
||||
hipFree(d_X);
|
||||
hipFree(d_Y);
|
||||
|
||||
return 0;
|
||||
}
|
||||
+12
-5
@@ -1203,10 +1203,10 @@ Modification:
|
||||
pop: |
|
||||
((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
||||
- L2-Fabric Read BW:
|
||||
value: |
|
||||
AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
|
||||
pop: |
|
||||
((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
|
||||
((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
|
||||
value: |
|
||||
AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
|
||||
- MFMA FLOPs (BF16):
|
||||
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
||||
@@ -1275,13 +1275,20 @@ Modification:
|
||||
id: 400
|
||||
title: Roofline
|
||||
metric_tables:
|
||||
- metric_table:
|
||||
id: 401
|
||||
title: Roofline Performance Rates
|
||||
metrics:
|
||||
- HBM Bandwidth:
|
||||
value: |
|
||||
AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
||||
- metric_table:
|
||||
id: 402
|
||||
title: Roofline Plot Points
|
||||
metrics:
|
||||
- AI HBM:
|
||||
value: |
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
- AI L1:
|
||||
value: |
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
|
||||
@@ -1474,9 +1481,9 @@ Modification:
|
||||
title: L2-Fabric Read Stall (Cycles per normUnit)
|
||||
metrics:
|
||||
- ::_1:
|
||||
ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
|
||||
ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
|
||||
ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
|
||||
ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
|
||||
- metric_table:
|
||||
id: 1810
|
||||
title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
|
||||
|
||||
+55
-45
@@ -781,14 +781,14 @@ Modification:
|
||||
metrics:
|
||||
- HBM Bandwidth:
|
||||
value: |
|
||||
AVG((( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
||||
AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
||||
- metric_table:
|
||||
id: 402
|
||||
title: Roofline Plot Points
|
||||
metrics:
|
||||
- AI HBM:
|
||||
value: |
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
- AI L1:
|
||||
value: |
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
|
||||
@@ -807,37 +807,37 @@ Modification:
|
||||
title: Workgroup manager utilizations
|
||||
metrics:
|
||||
- Dispatched Wavefronts:
|
||||
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
- Dispatched Workgroups:
|
||||
min: |
|
||||
MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
|
||||
avg: |
|
||||
AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
|
||||
max: |
|
||||
MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
|
||||
min: |
|
||||
MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
|
||||
- SGPR Writes:
|
||||
min: |
|
||||
MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
avg: |
|
||||
AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
max: |
|
||||
MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
min: |
|
||||
MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
- Scheduler-Pipe Utilization:
|
||||
min: |
|
||||
MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
||||
avg: |
|
||||
AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
||||
max: |
|
||||
MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
||||
min: |
|
||||
MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
||||
- VGPR Writes:
|
||||
min: |
|
||||
MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
avg: |
|
||||
AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
max: |
|
||||
MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
min: |
|
||||
MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
- Panel Config:
|
||||
id: 700
|
||||
title: Wavefront
|
||||
@@ -847,9 +847,9 @@ Modification:
|
||||
title: Wavefront Launch Stats
|
||||
metrics:
|
||||
- Total Wavefronts:
|
||||
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
- Panel Config:
|
||||
id: 1000
|
||||
title: Compute Units - Instruction Mix
|
||||
@@ -859,9 +859,9 @@ Modification:
|
||||
title: Overall Instruction Mix
|
||||
metrics:
|
||||
- VMEM:
|
||||
min: MIN(((SQ_INSTS_VMEM) / $denom))
|
||||
avg: AVG(((SQ_INSTS_VMEM) / $denom))
|
||||
max: MAX(((SQ_INSTS_VMEM) / $denom))
|
||||
min: MIN(((SQ_INSTS_VMEM) / $denom))
|
||||
- Panel Config:
|
||||
id: 1100
|
||||
title: Compute Units - Compute Pipeline
|
||||
@@ -891,12 +891,12 @@ Modification:
|
||||
title: Arithmetic Operations
|
||||
metrics:
|
||||
- FLOPs (Total):
|
||||
min: |
|
||||
MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
|
||||
avg: |
|
||||
AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
|
||||
max: |
|
||||
MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
|
||||
min: |
|
||||
MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
|
||||
- Panel Config:
|
||||
id: 1500
|
||||
title: Address Processing Unit and Data Return Path (TA/TD)
|
||||
@@ -923,21 +923,21 @@ Modification:
|
||||
title: vL1D cache access metrics
|
||||
metrics:
|
||||
- Cache BW:
|
||||
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
||||
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
||||
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
||||
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
||||
- L1 Access Latency:
|
||||
unit: (Cycles + $normUnit)
|
||||
avg: AVG((TCP_TCP_LATENCY_sum / $denom))
|
||||
max: MAX((TCP_TCP_LATENCY_sum / $denom))
|
||||
min: MIN((TCP_TCP_LATENCY_sum / $denom))
|
||||
- L1-L2 BW:
|
||||
min: |
|
||||
MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
|
||||
avg: |
|
||||
AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
|
||||
max: |
|
||||
MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
|
||||
min: |
|
||||
MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
|
||||
- L1-L2 Read Latency:
|
||||
unit: (Cycles + $normUnit)
|
||||
avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
|
||||
@@ -967,48 +967,50 @@ Modification:
|
||||
title: L2-Fabric interface metrics
|
||||
metrics:
|
||||
- Atomic Latency:
|
||||
min: |
|
||||
MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
|
||||
avg: |
|
||||
AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
|
||||
max: |
|
||||
MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
|
||||
min: |
|
||||
MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
|
||||
- Atomic Traffic:
|
||||
min: |
|
||||
MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
avg: |
|
||||
AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
max: |
|
||||
MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
min: |
|
||||
MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
- HBM Read Traffic:
|
||||
min: |
|
||||
MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
avg: |
|
||||
AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
max: |
|
||||
MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
min: |
|
||||
MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
- HBM Write and Atomic Traffic:
|
||||
min: |
|
||||
MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
avg: |
|
||||
AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
max: |
|
||||
MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
min: |
|
||||
MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
- Read BW:
|
||||
min: |
|
||||
MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
|
||||
avg: |
|
||||
AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
|
||||
max: |
|
||||
MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
|
||||
min: |
|
||||
MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
|
||||
- Read Latency:
|
||||
min: |
|
||||
MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
avg: |
|
||||
AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
max: |
|
||||
MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
min: |
|
||||
MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
- Remote Read Traffic:
|
||||
min: |
|
||||
MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
avg: |
|
||||
AVG((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
max: |
|
||||
@@ -1016,6 +1018,8 @@ Modification:
|
||||
min: |
|
||||
MIN((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
- Remote Write and Atomic Traffic:
|
||||
min: |
|
||||
MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
avg: |
|
||||
AVG((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
max: |
|
||||
@@ -1023,19 +1027,19 @@ Modification:
|
||||
min: |
|
||||
MIN((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
- Uncached Read Traffic:
|
||||
min: |
|
||||
MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
avg: |
|
||||
AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
max: |
|
||||
MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
min: |
|
||||
MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
||||
- Uncached Write and Atomic Traffic:
|
||||
min: |
|
||||
MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
avg: |
|
||||
AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
max: |
|
||||
MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
min: |
|
||||
MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
- Write and Atomic BW:
|
||||
unit: Gbps
|
||||
avg: |
|
||||
@@ -1045,40 +1049,40 @@ Modification:
|
||||
min: |
|
||||
MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))
|
||||
- Write and Atomic Latency:
|
||||
min: |
|
||||
MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
avg: |
|
||||
AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
max: |
|
||||
MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
min: |
|
||||
MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
||||
- metric_table:
|
||||
id: 1706
|
||||
title: L2 - Fabric interface detailed metrics
|
||||
metrics:
|
||||
- Atomic:
|
||||
min: MIN((TCC_EA0_ATOMIC_sum / $denom))
|
||||
avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
|
||||
max: MAX((TCC_EA0_ATOMIC_sum / $denom))
|
||||
min: MIN((TCC_EA0_ATOMIC_sum / $denom))
|
||||
- HBM Read:
|
||||
min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
||||
avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
||||
max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
||||
min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
||||
- HBM Write and Atomic:
|
||||
min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
||||
avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
||||
max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
||||
min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
||||
- Read (32B):
|
||||
min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
|
||||
avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
|
||||
max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
|
||||
min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
|
||||
- Read (64B):
|
||||
min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
|
||||
avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
|
||||
max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
|
||||
min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
|
||||
- Read (Uncached):
|
||||
min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
||||
avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
||||
max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
||||
min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
||||
- Remote Read:
|
||||
avg: |
|
||||
AVG((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom))
|
||||
@@ -1101,13 +1105,13 @@ Modification:
|
||||
min: |
|
||||
MIN(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum))
|
||||
- Write and Atomic (64B):
|
||||
min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
|
||||
avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
|
||||
max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
|
||||
min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
|
||||
- Write and Atomic (Uncached):
|
||||
min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
||||
avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
||||
max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
||||
min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
||||
- Panel Config:
|
||||
id: 1800
|
||||
title: L2 Cache (per Channel)
|
||||
@@ -1125,14 +1129,20 @@ Modification:
|
||||
MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
||||
min: |
|
||||
MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
||||
std dev: |
|
||||
STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
||||
avg: |
|
||||
AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
||||
max: |
|
||||
MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
||||
- metric_table:
|
||||
id: 1805
|
||||
title: L2-Fabric Requests (per normUnit)
|
||||
metrics:
|
||||
- ::_1:
|
||||
read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
|
||||
atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
|
||||
write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
|
||||
atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
|
||||
- metric_table:
|
||||
id: 1806
|
||||
title: L2-Fabric Read Latency (Cycles)
|
||||
|
||||
+14
-2
@@ -714,6 +714,11 @@ Modification:
|
||||
id: 201
|
||||
title: System Speed-of-Light
|
||||
metrics:
|
||||
- L2-Fabric Read BW:
|
||||
pop: |
|
||||
((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
|
||||
value: |
|
||||
AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
|
||||
- MFMA FLOPs (BF16):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
||||
pop: |
|
||||
@@ -751,13 +756,20 @@ Modification:
|
||||
id: 400
|
||||
title: Roofline
|
||||
metric_tables:
|
||||
- metric_table:
|
||||
id: 401
|
||||
title: Roofline Performance Rates
|
||||
metrics:
|
||||
- HBM Bandwidth:
|
||||
value: |
|
||||
AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
||||
- metric_table:
|
||||
id: 402
|
||||
title: Roofline Plot Points
|
||||
metrics:
|
||||
- AI HBM:
|
||||
value: |
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
- AI L1:
|
||||
value: |
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
|
||||
@@ -901,9 +913,9 @@ Modification:
|
||||
title: L2-Fabric Read Stall (Cycles per normUnit)
|
||||
metrics:
|
||||
- ::_1:
|
||||
ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
|
||||
ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
|
||||
ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
|
||||
ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
|
||||
- metric_table:
|
||||
id: 1810
|
||||
title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
|
||||
|
||||
+37
-11
@@ -714,27 +714,32 @@ Modification:
|
||||
id: 201
|
||||
title: System Speed-of-Light
|
||||
metrics:
|
||||
- L2-Fabric Read BW:
|
||||
value: |
|
||||
AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
|
||||
pop: |
|
||||
((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
|
||||
- MFMA FLOPs (BF16):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
||||
- MFMA FLOPs (F16):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
||||
- MFMA FLOPs (F16):
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
||||
- MFMA FLOPs (F64):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
|
||||
- MFMA FLOPs (F8):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
|
||||
unit: GFLOP/s
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
|
||||
- MFMA IOPs (Int8):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
|
||||
- MFMA IOPs (Int8):
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
|
||||
- Panel Config:
|
||||
id: 300
|
||||
title: Memory Chart
|
||||
@@ -752,13 +757,20 @@ Modification:
|
||||
id: 400
|
||||
title: Roofline
|
||||
metric_tables:
|
||||
- metric_table:
|
||||
id: 401
|
||||
title: Roofline Performance Rates
|
||||
metrics:
|
||||
- HBM Bandwidth:
|
||||
value: |
|
||||
AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
||||
- metric_table:
|
||||
id: 402
|
||||
title: Roofline Plot Points
|
||||
metrics:
|
||||
- AI HBM:
|
||||
value: |
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
- AI L1:
|
||||
value: |
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
|
||||
@@ -780,6 +792,7 @@ Modification:
|
||||
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
- Dispatched Workgroups:
|
||||
min: |
|
||||
MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
|
||||
@@ -787,6 +800,8 @@ Modification:
|
||||
MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
|
||||
avg: |
|
||||
AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
|
||||
min: |
|
||||
MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
|
||||
- SGPR Writes:
|
||||
min: |
|
||||
MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
@@ -794,6 +809,8 @@ Modification:
|
||||
MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
avg: |
|
||||
AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
min: |
|
||||
MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
- Scheduler-Pipe Utilization:
|
||||
min: |
|
||||
MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
||||
@@ -801,6 +818,8 @@ Modification:
|
||||
MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
||||
avg: |
|
||||
AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
||||
min: |
|
||||
MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
||||
- VGPR Writes:
|
||||
min: |
|
||||
MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
@@ -808,6 +827,8 @@ Modification:
|
||||
MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
avg: |
|
||||
AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
min: |
|
||||
MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
||||
- Panel Config:
|
||||
id: 700
|
||||
title: Wavefront
|
||||
@@ -820,6 +841,7 @@ Modification:
|
||||
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
||||
- Panel Config:
|
||||
id: 1100
|
||||
title: Compute Units - Compute Pipeline
|
||||
@@ -829,25 +851,25 @@ Modification:
|
||||
title: Compute Speed-of-Light
|
||||
metrics:
|
||||
- MFMA FLOPs (BF16):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
||||
- MFMA FLOPs (F16):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
||||
- MFMA FLOPs (F16):
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
||||
- MFMA FLOPs (F64):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
|
||||
- MFMA FLOPs (F8):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
|
||||
- MFMA IOPs (INT8):
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
|
||||
- MFMA IOPs (INT8):
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
|
||||
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
|
||||
- metric_table:
|
||||
id: 1103
|
||||
title: Arithmetic Operations
|
||||
@@ -859,6 +881,8 @@ Modification:
|
||||
MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
|
||||
avg: |
|
||||
AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
|
||||
min: |
|
||||
MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
|
||||
- Panel Config:
|
||||
id: 1700
|
||||
title: L2 Cache
|
||||
@@ -889,10 +913,12 @@ Modification:
|
||||
min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
||||
max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
||||
avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
||||
min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
||||
- Read (64B):
|
||||
min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
|
||||
max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
|
||||
avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
|
||||
min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
|
||||
- Panel Config:
|
||||
id: 1800
|
||||
title: L2 Cache (per Channel)
|
||||
|
||||
+13
-1
@@ -709,6 +709,11 @@ Modification:
|
||||
id: 201
|
||||
title: System Speed-of-Light
|
||||
metrics:
|
||||
- L2-Fabric Read BW:
|
||||
value: |
|
||||
AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
|
||||
pop: |
|
||||
((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
|
||||
- MFMA FLOPs (BF16):
|
||||
pop: |
|
||||
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
||||
@@ -746,13 +751,20 @@ Modification:
|
||||
id: 400
|
||||
title: Roofline
|
||||
metric_tables:
|
||||
- metric_table:
|
||||
id: 401
|
||||
title: Roofline Performance Rates
|
||||
metrics:
|
||||
- HBM Bandwidth:
|
||||
value: |
|
||||
AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
||||
- metric_table:
|
||||
id: 402
|
||||
title: Roofline Plot Points
|
||||
metrics:
|
||||
- AI HBM:
|
||||
value: |
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
- AI L1:
|
||||
value: |
|
||||
( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
|
||||
|
||||
+4
-4
@@ -174,13 +174,13 @@ Panel Config:
|
||||
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
||||
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
||||
L2-Fabric Read BW:
|
||||
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
||||
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
||||
value: AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum
|
||||
+ 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
||||
- Start_Timestamp))
|
||||
unit: GB/s
|
||||
peak: $hbmBandwidth
|
||||
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
||||
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
||||
pop: ((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum
|
||||
+ 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
||||
- Start_Timestamp)))) / $hbmBandwidth)
|
||||
L2-Fabric Write BW:
|
||||
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
||||
|
||||
+5
-5
@@ -68,8 +68,8 @@ Panel Config:
|
||||
unit: GIOP/s
|
||||
peak: $MFMAI8Ops_empirical_peak
|
||||
HBM Bandwidth:
|
||||
value: AVG((( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum
|
||||
- TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum
|
||||
value: AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32
|
||||
+ TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum
|
||||
- TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp
|
||||
- Start_Timestamp) / 1e9)) / 1e9)
|
||||
unit: GB/s
|
||||
@@ -108,9 +108,9 @@ Panel Config:
|
||||
+ SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
||||
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
|
||||
512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
|
||||
* 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum
|
||||
* 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
||||
- TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
||||
* 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum
|
||||
* 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum
|
||||
* 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
||||
* 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
|
||||
unit: FLOPs/Byte
|
||||
AI L2:
|
||||
|
||||
+8
-2
@@ -587,8 +587,10 @@ rocprofiler-sdk:
|
||||
- gfx940
|
||||
- gfx941
|
||||
- gfx942
|
||||
- gfx950
|
||||
expression: (TCC_BUBBLE_sum*128 + (TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum-TCC_EA0_RDREQ_32B_sum)*64 + TCC_EA0_RDREQ_32B_sum*32)/1024
|
||||
- architectures:
|
||||
- gfx950
|
||||
expression: (TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64+TCC_EA0_RDREQ_32B_sum*32)/1024
|
||||
- architectures:
|
||||
- gfx10
|
||||
- gfx1010
|
||||
@@ -612,12 +614,16 @@ rocprofiler-sdk:
|
||||
- architectures:
|
||||
- gfx90a
|
||||
expression: 1024*(WRITE_SIZE+FETCH_SIZE)/reduce(GRBM_GUI_ACTIVE,max)
|
||||
# Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected.
|
||||
- architectures:
|
||||
- gfx940
|
||||
- gfx941
|
||||
- gfx942
|
||||
expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
|
||||
# Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected.
|
||||
- architectures:
|
||||
- gfx950
|
||||
expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
|
||||
expression: (WRITE_SIZE*1024+TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64)/reduce(GRBM_GUI_ACTIVE,max)
|
||||
- name: FetchSize
|
||||
description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache
|
||||
or memory effects taken into account.
|
||||
|
||||
+5
-5
@@ -65,13 +65,13 @@ Panel Config:
|
||||
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
||||
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
||||
L2-Fabric Read BW:
|
||||
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
||||
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
||||
value: AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum
|
||||
+ 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
||||
- Start_Timestamp))
|
||||
unit: GB/s
|
||||
peak: $hbmBandwidth
|
||||
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
||||
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
||||
pop: ((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum
|
||||
+ 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
||||
- Start_Timestamp)))) / $hbmBandwidth)
|
||||
L2-Fabric Write BW:
|
||||
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
||||
@@ -100,4 +100,4 @@ Panel Config:
|
||||
value: (GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) / (End_Timestamp - Start_Timestamp)
|
||||
unit: ns
|
||||
peak: N/A
|
||||
pop: N/A
|
||||
pop: N/A
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"archs": {
|
||||
"gfx908": {
|
||||
"delta_hash": "bae93343a258d4b3f5e64c2f2ce91d1a",
|
||||
"delta_hash": "1417ef1cfc1b47b7907608f2425f9f11",
|
||||
"files": {
|
||||
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
|
||||
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
|
||||
@@ -24,7 +24,7 @@
|
||||
}
|
||||
},
|
||||
"gfx90a": {
|
||||
"delta_hash": "d64f63cee8ace777ae46002c13756ab8",
|
||||
"delta_hash": "c3ff3aa253d044f895d776eff72279ad",
|
||||
"files": {
|
||||
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
|
||||
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
|
||||
@@ -47,7 +47,7 @@
|
||||
}
|
||||
},
|
||||
"gfx940": {
|
||||
"delta_hash": "7b5e4d755c2c4e1654d0d01576284df2",
|
||||
"delta_hash": "35aa77b45341e2a215b8251ad73f472f",
|
||||
"files": {
|
||||
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
|
||||
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
|
||||
@@ -70,7 +70,7 @@
|
||||
}
|
||||
},
|
||||
"gfx941": {
|
||||
"delta_hash": "9e62e4734bcc6a318ecdbb6d5432ce98",
|
||||
"delta_hash": "8057ce09c13a4fa3e4863085c18a158c",
|
||||
"files": {
|
||||
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
|
||||
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
|
||||
@@ -93,7 +93,7 @@
|
||||
}
|
||||
},
|
||||
"gfx942": {
|
||||
"delta_hash": "4738dc07f7d7f08cfde42ece633e648f",
|
||||
"delta_hash": "76bacc3fd8b2d4da43c3b8d670a097d9",
|
||||
"files": {
|
||||
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
|
||||
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
|
||||
@@ -120,9 +120,9 @@
|
||||
"files": {
|
||||
"0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
|
||||
"0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
|
||||
"0200_system_speed_of_light.yaml": "bf2ca00d4b255dbbe191a7641b81dc4c",
|
||||
"0200_system_speed_of_light.yaml": "8e2a370cba884f1e4f695ee70b3bd7b6",
|
||||
"0300_memory_chart.yaml": "2c82fa6f81a0dda679706d36b99e7913",
|
||||
"0400_roofline.yaml": "2bd3b630b72d6d165c0d30cf481136a9",
|
||||
"0400_roofline.yaml": "e39c92773e1499158fd47ce416e7cee5",
|
||||
"0500_command_processor_cpc_cpf.yaml": "3f7dab1663ad7a6fae3801aec2b1e8d0",
|
||||
"0600_workgroup_manager_spi.yaml": "e6546a92d283fed5a5dc6df203efb670",
|
||||
"0700_wavefront.yaml": "330468fd711057b422de9b952c5cfe69",
|
||||
|
||||
@@ -34,7 +34,13 @@ import yaml
|
||||
import config
|
||||
from utils import rocpd_data, schema
|
||||
from utils.kernel_name_shortener import kernel_name_shortener
|
||||
from utils.logger import console_debug, console_error, console_log, demarcate
|
||||
from utils.logger import (
|
||||
console_debug,
|
||||
console_error,
|
||||
console_log,
|
||||
console_warning,
|
||||
demarcate,
|
||||
)
|
||||
|
||||
# TODO: use pandas chunksize or dask to read really large csv file
|
||||
# from dask import dataframe as dd
|
||||
@@ -376,7 +382,9 @@ def is_single_panel_config(
|
||||
elif arch_count == len(arch_names):
|
||||
return False
|
||||
else:
|
||||
console_error("Found multiple panel config sets but incomplete for all archs.")
|
||||
console_warning(
|
||||
"Found multiple panel config sets but incomplete for all archs."
|
||||
)
|
||||
|
||||
|
||||
def find_1st_sub_dir(directory: str) -> Optional[str]:
|
||||
|
||||
@@ -1048,7 +1048,7 @@ def create_empirical_peaks_dict(empirical_peaks_df: pd.DataFrame) -> dict[str, f
|
||||
]
|
||||
# initialize peaks to 0
|
||||
for peak_name in peak_names:
|
||||
empirical_peaks[f"ammolite__{peak_name}_empirical_peak"] = 0
|
||||
empirical_peaks[f"ammolite__{peak_name}_empirical_peak"] = np.nan
|
||||
|
||||
return empirical_peaks
|
||||
|
||||
|
||||
@@ -656,7 +656,21 @@ def calc_ai_profile(
|
||||
* 32
|
||||
)
|
||||
)
|
||||
|
||||
elif mspec.gpu_series == "MI350":
|
||||
# Use TCC_EA0_RDREQ_128B_sum TCC_EA0_RDREQ_64B_sum to calculate hbm_data
|
||||
hbm_data += (
|
||||
(df["TCC_EA0_RDREQ_128B_sum"][idx] * 128)
|
||||
+ (df["TCC_EA0_RDREQ_64B_sum"][idx] * 64)
|
||||
+ (df["TCC_EA0_RDREQ_32B_sum"][idx] * 32)
|
||||
+ (
|
||||
(
|
||||
df["TCC_EA0_WRREQ_sum"][idx]
|
||||
- df["TCC_EA0_WRREQ_64B_sum"][idx]
|
||||
)
|
||||
* 32
|
||||
)
|
||||
+ (df["TCC_EA0_WRREQ_64B_sum"][idx] * 64)
|
||||
)
|
||||
else:
|
||||
# Use TCC_BUBBLE_sum to calculate hbm_data
|
||||
hbm_data += (
|
||||
|
||||
@@ -75,3 +75,11 @@ set_target_properties(
|
||||
rocflop
|
||||
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
|
||||
)
|
||||
|
||||
set(MEMCOPY_SOURCES ../sample/memcopy.cpp)
|
||||
set_source_files_properties(${MEMCOPY_SOURCES} PROPERTIES LANGUAGE HIP)
|
||||
add_executable(memcopy ${MEMCOPY_SOURCES})
|
||||
set_target_properties(
|
||||
memcopy
|
||||
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
|
||||
)
|
||||
|
||||
@@ -24,8 +24,6 @@
|
||||
##############################################################################
|
||||
|
||||
import csv
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
@@ -39,52 +37,6 @@ config["COUNTER_LOGGING"] = False
|
||||
config["METRIC_COMPARE"] = False
|
||||
config["METRIC_LOGGING"] = False
|
||||
|
||||
SUPPORTED_ARCHS = {
|
||||
"gfx940": {"mi300": ["MI300A_A0"]},
|
||||
"gfx941": {"mi300": ["MI300X_A0"]},
|
||||
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
|
||||
}
|
||||
|
||||
MI300_CHIP_IDS = {
|
||||
"29856": "MI300A_A1",
|
||||
"29857": "MI300X_A1",
|
||||
"29858": "MI308X",
|
||||
}
|
||||
|
||||
|
||||
def gpu_soc():
|
||||
## 1) Parse arch details from rocminfo
|
||||
rocminfo = str(
|
||||
# decode with utf-8 to account for rocm-smi changes in latest rocm
|
||||
subprocess.run(
|
||||
["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
).stdout.decode("utf-8")
|
||||
)
|
||||
rocminfo = rocminfo.split("\n")
|
||||
soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE)
|
||||
devices = list(filter(soc_regex.match, rocminfo))
|
||||
gpu_arch = devices[0].split()[1]
|
||||
|
||||
if not gpu_arch in SUPPORTED_ARCHS.keys():
|
||||
return None
|
||||
|
||||
## 2) Parse chip id from rocminfo
|
||||
chip_id = re.compile(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", re.MULTILINE)
|
||||
ids = list(filter(chip_id.match, rocminfo))
|
||||
for id in ids:
|
||||
chip_id = re.match(r"^[^()]+", id.split()[2]).group(0)
|
||||
|
||||
## 3) Deduce gpu model name from arch
|
||||
gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper()
|
||||
# For testing purposes we only care about gpu model series not the specific model
|
||||
# if gpu_model == "MI300":
|
||||
# if chip_id in MI300_CHIP_IDS:
|
||||
# gpu_model = MI300_CHIP_IDS[chip_id]
|
||||
# else:
|
||||
# return None
|
||||
|
||||
return gpu_model
|
||||
|
||||
|
||||
def load_metrics(csv_file_path):
|
||||
"""
|
||||
@@ -115,7 +67,7 @@ def load_metrics(csv_file_path):
|
||||
return metrics_data
|
||||
|
||||
|
||||
soc = gpu_soc()
|
||||
soc = test_utils.gpu_soc()
|
||||
|
||||
|
||||
@pytest.mark.L1_cache
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
##############################################################################
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
##############################################################################
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
import test_utils
|
||||
|
||||
config = {}
|
||||
config["memcopy"] = ["tests/memcopy"]
|
||||
config["cleanup"] = True
|
||||
|
||||
soc = test_utils.gpu_soc()
|
||||
|
||||
# workload -> gfx -> metric definition
|
||||
VALIDATE_METRICS = {
|
||||
"memcopy": {
|
||||
"MI100": [
|
||||
{
|
||||
"name": "HBM Bandwidth",
|
||||
"metric_id": "4.1.8",
|
||||
"csv_file": "4.1_Roofline_Performance_Rates.csv",
|
||||
"column": "Value",
|
||||
"expected_value": 1044.48,
|
||||
},
|
||||
],
|
||||
"MI200": [
|
||||
{
|
||||
"name": "HBM Bandwidth",
|
||||
"metric_id": "4.1.8",
|
||||
"csv_file": "4.1_Roofline_Performance_Rates.csv",
|
||||
"column": "Value",
|
||||
"expected_value": 1389.17,
|
||||
},
|
||||
],
|
||||
"MI300": [
|
||||
{
|
||||
"name": "HBM Bandwidth",
|
||||
"metric_id": "4.1.9",
|
||||
"csv_file": "4.1_Roofline_Performance_Rates.csv",
|
||||
"column": "Value",
|
||||
"expected_value": 3910.62,
|
||||
},
|
||||
],
|
||||
"MI350": [
|
||||
{
|
||||
"name": "HBM Bandwidth",
|
||||
"metric_id": "4.1.10",
|
||||
"csv_file": "4.1_Roofline_Performance_Rates.csv",
|
||||
"column": "Value",
|
||||
"expected_value": 5690.42,
|
||||
},
|
||||
],
|
||||
# Ignore warmup dispatch
|
||||
# Collect roofline block
|
||||
"profile_options": ["-d", "2-1001", "-b", "4"],
|
||||
"roof": True,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.path
|
||||
def test_validate_metrics(
|
||||
binary_handler_profile_rocprof_compute, binary_handler_analyze_rocprof_compute
|
||||
):
|
||||
for workload in VALIDATE_METRICS.keys():
|
||||
metrics = VALIDATE_METRICS[workload].get(soc, [])
|
||||
metric_ids = [metric["metric_id"] for metric in metrics]
|
||||
if not metric_ids:
|
||||
print(
|
||||
f"Skipping metric validation for {workload} on {soc}. "
|
||||
"No metrics to validate."
|
||||
)
|
||||
continue
|
||||
|
||||
profile_workload_dir = test_utils.get_output_dir(param_id=f"{workload}_profile")
|
||||
analysis_workload_dir = test_utils.get_output_dir(
|
||||
param_id=f"{workload}_analysis"
|
||||
)
|
||||
try:
|
||||
# Ensure non zero length of profile df
|
||||
options = VALIDATE_METRICS[workload].get("profile_options", [])
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
profile_workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=VALIDATE_METRICS[workload].get("roof", False),
|
||||
app_name=workload,
|
||||
)
|
||||
_ = test_utils.check_csv_files(
|
||||
profile_workload_dir, num_devices=1, num_kernels=1
|
||||
)
|
||||
|
||||
# Check whether metric values are correct
|
||||
code = binary_handler_analyze_rocprof_compute([
|
||||
"analyze",
|
||||
"--output-name",
|
||||
f"{analysis_workload_dir}",
|
||||
"--output-format",
|
||||
"csv",
|
||||
"-b",
|
||||
*metric_ids,
|
||||
"--path",
|
||||
profile_workload_dir,
|
||||
])
|
||||
assert code == 0
|
||||
|
||||
for metric in metrics:
|
||||
actual = pd.read_csv(f"{analysis_workload_dir}/{metric['csv_file']}")[
|
||||
metric["column"]
|
||||
].values[0]
|
||||
expected = metric["expected_value"]
|
||||
# 5% tolerance in checking
|
||||
assert abs(actual - expected) / expected <= 0.05, (
|
||||
f"{metric['name']} ({metric['metric_id']}): "
|
||||
f"actual={actual}, expected={expected}, "
|
||||
f"diff={(abs(actual - expected) / expected * 100):.2f}% "
|
||||
f"(tolerance: 5%)"
|
||||
)
|
||||
finally:
|
||||
test_utils.clean_output_dir(config["cleanup"], analysis_workload_dir)
|
||||
test_utils.clean_output_dir(config["cleanup"], profile_workload_dir)
|
||||
@@ -38,29 +38,7 @@ import pytest
|
||||
import test_utils
|
||||
from scipy.stats import zscore
|
||||
|
||||
# Globals
|
||||
|
||||
# TODO: MI350 What are the gpu models in MI 350 series
|
||||
SUPPORTED_ARCHS = {
|
||||
"gfx908": {"mi100": ["MI100"]},
|
||||
"gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]},
|
||||
"gfx940": {"mi300": ["MI300A_A0"]},
|
||||
"gfx941": {"mi300": ["MI300X_A0"]},
|
||||
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
|
||||
"gfx950": {"mi350": ["MI350"]},
|
||||
}
|
||||
|
||||
CHIP_IDS = {
|
||||
"29856": "MI300A_A1",
|
||||
"29857": "MI300X_A1",
|
||||
"29858": "MI308X",
|
||||
"30112": "MI350",
|
||||
}
|
||||
|
||||
# --
|
||||
# Runtime config options
|
||||
# --
|
||||
|
||||
config = {}
|
||||
config["kernel_name_1"] = "vecCopy"
|
||||
config["app_1"] = ["./tests/vcopy", "-n", "1048576", "-b", "256", "-i", "3"]
|
||||
@@ -248,47 +226,7 @@ def counter_compare(test_name, errors_pd, baseline_df, run_df, threshold=5):
|
||||
return errors_pd
|
||||
|
||||
|
||||
def gpu_soc():
|
||||
global num_devices
|
||||
## 1) Parse arch details from rocminfo
|
||||
rocminfo = str(
|
||||
# decode with utf-8 to account for rocm-smi changes in latest rocm
|
||||
subprocess.run(
|
||||
["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
).stdout.decode("utf-8")
|
||||
)
|
||||
rocminfo = rocminfo.split("\n")
|
||||
soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE)
|
||||
devices = list(filter(soc_regex.match, rocminfo))
|
||||
gpu_arch = devices[0].split()[1]
|
||||
|
||||
if not gpu_arch in SUPPORTED_ARCHS.keys():
|
||||
print("Cannot find a supported arch in rocminfo")
|
||||
assert 0
|
||||
else:
|
||||
num_devices = (
|
||||
len(devices)
|
||||
if not "CI_VISIBLE_DEVICES" in os.environ
|
||||
else os.environ["CI_VISIBLE_DEVICES"]
|
||||
)
|
||||
|
||||
## 2) Parse chip id from rocminfo
|
||||
chip_id = re.compile(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", re.MULTILINE)
|
||||
ids = list(filter(chip_id.match, rocminfo))
|
||||
for id in ids:
|
||||
chip_id = re.match(r"^[^()]+", id.split()[2]).group(0)
|
||||
|
||||
## 3) Deduce gpu model name from arch
|
||||
gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper()
|
||||
# For testing purposes we only care about gpu model series not the specific model
|
||||
# if gpu_model not in ("MI50", "MI100", "MI200"):
|
||||
# if chip_id in CHIP_IDS:
|
||||
# gpu_model = CHIP_IDS[chip_id]
|
||||
|
||||
return gpu_model
|
||||
|
||||
|
||||
soc = gpu_soc()
|
||||
soc = test_utils.gpu_soc()
|
||||
|
||||
os.environ["ROCPROF"] = "rocprofiler-sdk"
|
||||
|
||||
@@ -645,9 +583,7 @@ def test_path(binary_handler_profile_rocprof_compute):
|
||||
|
||||
|
||||
@pytest.mark.path
|
||||
def test_path_rocflop(
|
||||
binary_handler_profile_rocprof_compute,
|
||||
):
|
||||
def test_path_rocflop(binary_handler_profile_rocprof_compute):
|
||||
# Test whether multiprocess workloads like rocflop are handled correctly
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
options = ["--block", "2.1.1"]
|
||||
|
||||
@@ -32,6 +32,7 @@ import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
@@ -41,6 +42,15 @@ import pytest
|
||||
|
||||
import utils.utils as utils
|
||||
|
||||
SUPPORTED_ARCHS = {
|
||||
"gfx908": {"mi100": ["MI100"]},
|
||||
"gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]},
|
||||
"gfx940": {"mi300": ["MI300A_A0"]},
|
||||
"gfx941": {"mi300": ["MI300X_A0"]},
|
||||
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
|
||||
"gfx950": {"mi350": ["MI350"]},
|
||||
}
|
||||
|
||||
|
||||
class MockMSpec:
|
||||
def __init__(
|
||||
@@ -228,6 +238,27 @@ def get_num_pmc_file(output_dir):
|
||||
])
|
||||
|
||||
|
||||
def gpu_soc():
|
||||
# Parse arch details from rocminfo
|
||||
rocminfo = str(
|
||||
# decode with utf-8 to account for rocm-smi changes in latest rocm
|
||||
subprocess.run(
|
||||
["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
).stdout.decode("utf-8")
|
||||
)
|
||||
rocminfo = rocminfo.split("\n")
|
||||
soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE)
|
||||
devices = list(filter(soc_regex.match, rocminfo))
|
||||
gpu_arch = devices[0].split()[1]
|
||||
|
||||
if not gpu_arch in SUPPORTED_ARCHS.keys():
|
||||
return None
|
||||
|
||||
gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper()
|
||||
|
||||
return gpu_model
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# VERSION UTILITIES TESTS
|
||||
# =============================================================================
|
||||
|
||||
@@ -588,8 +588,10 @@ rocprofiler-sdk:
|
||||
- gfx940
|
||||
- gfx941
|
||||
- gfx942
|
||||
- gfx950
|
||||
expression: (TCC_BUBBLE_sum*128 + (TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum-TCC_EA0_RDREQ_32B_sum)*64 + TCC_EA0_RDREQ_32B_sum*32)/1024
|
||||
- architectures:
|
||||
- gfx950
|
||||
expression: (TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64+TCC_EA0_RDREQ_32B_sum*32)/1024
|
||||
- architectures:
|
||||
- gfx10
|
||||
- gfx1010
|
||||
@@ -615,12 +617,16 @@ rocprofiler-sdk:
|
||||
- architectures:
|
||||
- gfx90a
|
||||
expression: 1024*(WRITE_SIZE+FETCH_SIZE)/reduce(GRBM_GUI_ACTIVE,max)
|
||||
# Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected.
|
||||
- architectures:
|
||||
- gfx940
|
||||
- gfx941
|
||||
- gfx942
|
||||
expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
|
||||
# Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected.
|
||||
- architectures:
|
||||
- gfx950
|
||||
expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
|
||||
expression: (WRITE_SIZE*1024+TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64)/reduce(GRBM_GUI_ACTIVE,max)
|
||||
- name: FetchSize
|
||||
description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache
|
||||
or memory effects taken into account.
|
||||
|
||||
Посилання в новій задачі
Заблокувати користувача