diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index ba8b65a1e7..cc8d1cf725 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -58,6 +58,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Fix redundant warnings for compute/memory partition not found for < MI 300 series GPUs by skipping partition checks +* Fixed formula for metrics related to reads from L2 cache to HBM for MI350 + ### Removed * Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter. diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt index e9daca6786..95e81c0916 100644 --- a/projects/rocprofiler-compute/CMakeLists.txt +++ b/projects/rocprofiler-compute/CMakeLists.txt @@ -388,6 +388,13 @@ add_test( tests/test_profile_general.py ${WORKING_DIR_OPTION} ) +add_test( + NAME test_metric_validation + COMMAND + ${PYTHON_TEST_COMMAND} -m pytest --junitxml=tests/test_metric_validation.xml + ${COV_OPTION} tests/test_metric_validation.py ${WORKING_DIR_OPTION} +) + set_tests_properties( test_profile_kernel_execution test_profile_dispatch @@ -748,6 +755,7 @@ if(INSTALL_TESTS) tests/laplace_eqn tests/mat_mul_max tests/rocflop + tests/memcopy DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}/tests COMPONENT tests ) diff --git a/projects/rocprofiler-compute/README.md b/projects/rocprofiler-compute/README.md index f913e5ad47..b7ed73b217 100644 --- a/projects/rocprofiler-compute/README.md +++ b/projects/rocprofiler-compute/README.md @@ -38,8 +38,8 @@ python3 -m pip install -r requirements.txt ## Testing -Populate the variable in `docker/docker-compose.customrocmtest.yml`. -Populate the variable in `docker/Dockerfile.customrocmtest` based on latest TheRock nightly build information. +Populate the variable in `docker/docker-compose.customrocmtest.yml`. +Populate the variable in `docker/Dockerfile.customrocmtest` based on latest ROCm CI build information. To quickly get the environment (bash shell) for building and testing, run the following commands: * `cd docker` diff --git a/projects/rocprofiler-compute/docker/Dockerfile.customrocmtest b/projects/rocprofiler-compute/docker/Dockerfile.customrocmtest index 020f8b0e4c..b1b11e38af 100644 --- a/projects/rocprofiler-compute/docker/Dockerfile.customrocmtest +++ b/projects/rocprofiler-compute/docker/Dockerfile.customrocmtest @@ -1,5 +1,5 @@ # Use a base image -FROM ubuntu:22.04 +FROM # Install curl first (needed for ROCm download) RUN apt-get update && apt-get install -y curl @@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y curl # Define the tarball name as a variable # Check https://therock-nightly-tarball.s3.amazonaws.com/index.html for latest builds # Use therock-dist-linux-gfx-dcgpu-.tar.gz naming convention -ARG TARBALL_NAME= +ARG TARBALL_NAME=therock-dist-linux-gfx94X-dcgpu-7.11.0a20260116.tar.gz # Install ROCm from TheRock Nightly build RUN mkdir -p /rocm && \ @@ -24,29 +24,27 @@ ENV PATH="/rocm/bin:${PATH}" \ # Update package list and install prerequisites RUN apt-get update && apt-get install -y \ - software-properties-common cmake locales git \ + software-properties-common cmake locales git curl \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update +# Allows running git commands in /app +RUN git config --global --add safe.directory /app + # Generate the desired locale RUN locale-gen en_US.UTF-8 # Install Python 3.10 and pip RUN apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip libsqlite3-dev -RUN python3.10 -m venv /venv -ENV PATH="/venv/bin:$PATH" +RUN python3.10 -m venv venv +ENV PATH="venv/bin:$PATH" RUN python -m pip install --upgrade pip -# Install any rocprofiler-compute dependencies specified in requirements.txt +# Install any dependencies specified in requirements.txt +WORKDIR /app/projects/rocprofiler-compute COPY projects/rocprofiler-compute/requirements.txt /app/projects/rocprofiler-compute/requirements.txt COPY projects/rocprofiler-compute/requirements-test.txt /app/projects/rocprofiler-compute/requirements-test.txt -RUN python -m pip install -r /app/projects/rocprofiler-compute/requirements.txt -r /app/projects/rocprofiler-compute/requirements-test.txt - -# Set the working directory -WORKDIR /app - -# Allows running git commands in /app -RUN git config --global --add safe.directory /app +RUN python -m pip install -r requirements.txt -r requirements-test.txt # Run interactive bash shell CMD ["/bin/bash"] diff --git a/projects/rocprofiler-compute/sample/memcopy.cpp b/projects/rocprofiler-compute/sample/memcopy.cpp new file mode 100644 index 0000000000..2ccd5de4c5 --- /dev/null +++ b/projects/rocprofiler-compute/sample/memcopy.cpp @@ -0,0 +1,125 @@ +#include +#include +#include + +// Simple memory copy kernel - one thread per element +__global__ void memoryCopyKernel(const double* __restrict__ X, + double* __restrict__ Y, + size_t N) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < N) { + Y[idx] = X[idx]; + } +} + +int main() { + const size_t N = 268435456; // Number of elements + const size_t bytes = N * sizeof(double); + + // Print array size in GiB + double sizeGiB = bytes / (1024.0 * 1024.0 * 1024.0); + std::cout << "Array size: " << std::fixed << std::setprecision(2) + << sizeGiB << " GiB (" << bytes << " bytes)" << std::endl; + std::cout << "Number of elements: " << N << std::endl; + std::cout << std::string(60, '-') << std::endl; + + // Allocate device memory + double *d_X, *d_Y; + hipMalloc(&d_X, bytes); + hipMalloc(&d_Y, bytes); + + // Allocate and initialize host memory + double *h_X = new double[N]; + for (size_t i = 0; i < N; i++) { + h_X[i] = static_cast(i); + } + + // Copy data to device + hipMemcpy(d_X, h_X, bytes, hipMemcpyHostToDevice); + + // Launch configuration + int blockSize = 256; + int gridSize = (N + blockSize - 1) / blockSize; + + std::cout << "Launch configuration:" << std::endl; + std::cout << " Block size: " << blockSize << std::endl; + std::cout << " Grid size: " << gridSize << std::endl; + std::cout << std::string(60, '-') << std::endl; + + // Warmup run + memoryCopyKernel<<>>(d_X, d_Y, N); + hipDeviceSynchronize(); + + // Timed runs + const int numRuns = 1000; + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + + float totalTime = 0.0f; + + for (int run = 0; run < numRuns; run++) { + hipEventRecord(start); + + memoryCopyKernel<<>>(d_X, d_Y, N); + + hipEventRecord(stop); + hipEventSynchronize(stop); + + float milliseconds = 0; + hipEventElapsedTime(&milliseconds, start, stop); + totalTime += milliseconds; + } + + float avgTime = totalTime / numRuns; + + // Calculate bandwidth + // Memory copy reads N elements and writes N elements = 2*N*sizeof(double) bytes + double totalBytesTransferred = 2.0 * N * sizeof(double); + double bandwidthGBps = (totalBytesTransferred / (avgTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0); + + std::cout << "Performance Results (averaged over " << numRuns << " runs):" << std::endl; + std::cout << " Kernel execution time: " << std::fixed << std::setprecision(3) + << avgTime << " ms" << std::endl; + std::cout << " Data transferred: " << std::setprecision(2) + << (totalBytesTransferred / (1024.0 * 1024.0 * 1024.0)) << " GiB" << std::endl; + std::cout << " Achieved bandwidth: " << std::setprecision(2) + << bandwidthGBps << " GiB/s" << std::endl; + std::cout << std::string(60, '-') << std::endl; + + // Verify correctness + double *h_Y = new double[N]; + hipMemcpy(h_Y, d_Y, bytes, hipMemcpyDeviceToHost); + + bool correct = true; + for (size_t i = 0; i < N; i++) { + if (h_Y[i] != h_X[i]) { + correct = false; + std::cout << "Mismatch at index " << i << ": " + << h_Y[i] << " != " << h_X[i] << std::endl; + break; + } + } + + if (correct) { + std::cout << "✓ Verification PASSED - all elements copied correctly" << std::endl; + } else { + std::cout << "✗ Verification FAILED" << std::endl; + } + + std::cout << "\nFirst 5 elements:" << std::endl; + for (int i = 0; i < 5; i++) { + std::cout << " Y[" << i << "] = " << h_Y[i] << std::endl; + } + + // Cleanup + hipEventDestroy(start); + hipEventDestroy(stop); + delete[] h_X; + delete[] h_Y; + hipFree(d_X); + hipFree(d_Y); + + return 0; +} diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml index a5ee25bb53..b3dab9216e 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml @@ -1203,10 +1203,10 @@ Modification: pop: | ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - L2-Fabric Read BW: - value: | - AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) pop: | - ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + ((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + value: | + AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) - MFMA FLOPs (BF16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) @@ -1275,13 +1275,20 @@ Modification: id: 400 title: Roofline metric_tables: + - metric_table: + id: 401 + title: Roofline Performance Rates + metrics: + - HBM Bandwidth: + value: | + AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - metric_table: id: 402 title: Roofline Plot Points metrics: - AI HBM: value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) - AI L1: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) @@ -1474,9 +1481,9 @@ Modification: title: L2-Fabric Read Stall (Cycles per normUnit) metrics: - ::_1: - ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) - metric_table: id: 1810 title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml index d2a834f642..f1d147bf1d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml @@ -781,14 +781,14 @@ Modification: metrics: - HBM Bandwidth: value: | - AVG((( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - metric_table: id: 402 title: Roofline Plot Points metrics: - AI HBM: value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) - AI L1: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) @@ -807,37 +807,37 @@ Modification: title: Workgroup manager utilizations metrics: - Dispatched Wavefronts: + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Dispatched Workgroups: + min: | + MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) avg: | AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) max: | MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - min: | - MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - SGPR Writes: + min: | + MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) max: | MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - min: | - MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Scheduler-Pipe Utilization: + min: | + MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) avg: | AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) max: | MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: | - MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - VGPR Writes: + min: | + MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) max: | MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - min: | - MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Panel Config: id: 700 title: Wavefront @@ -847,9 +847,9 @@ Modification: title: Wavefront Launch Stats metrics: - Total Wavefronts: + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Panel Config: id: 1000 title: Compute Units - Instruction Mix @@ -859,9 +859,9 @@ Modification: title: Overall Instruction Mix metrics: - VMEM: + min: MIN(((SQ_INSTS_VMEM) / $denom)) avg: AVG(((SQ_INSTS_VMEM) / $denom)) max: MAX(((SQ_INSTS_VMEM) / $denom)) - min: MIN(((SQ_INSTS_VMEM) / $denom)) - Panel Config: id: 1100 title: Compute Units - Compute Pipeline @@ -891,12 +891,12 @@ Modification: title: Arithmetic Operations metrics: - FLOPs (Total): + min: | + MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) avg: | AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) max: | MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - min: | - MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - Panel Config: id: 1500 title: Address Processing Unit and Data Return Path (TA/TD) @@ -923,21 +923,21 @@ Modification: title: vL1D cache access metrics metrics: - Cache BW: + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - L1 Access Latency: unit: (Cycles + $normUnit) avg: AVG((TCP_TCP_LATENCY_sum / $denom)) max: MAX((TCP_TCP_LATENCY_sum / $denom)) min: MIN((TCP_TCP_LATENCY_sum / $denom)) - L1-L2 BW: + min: | + MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) avg: | AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) max: | MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - min: | - MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - L1-L2 Read Latency: unit: (Cycles + $normUnit) avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) @@ -967,48 +967,50 @@ Modification: title: L2-Fabric interface metrics metrics: - Atomic Latency: + min: | + MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) avg: | AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) max: | MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) - min: | - MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) - Atomic Traffic: + min: | + MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) avg: | AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) max: | MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: | - MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - HBM Read Traffic: + min: | + MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) avg: | AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) max: | MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: | - MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - HBM Write and Atomic Traffic: + min: | + MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) avg: | AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) max: | MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: | - MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - Read BW: + min: | + MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) avg: | AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) max: | MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - min: | - MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - Read Latency: + min: | + MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) avg: | AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) max: | MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: | - MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - Remote Read Traffic: + min: | + MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) avg: | AVG((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) max: | @@ -1016,6 +1018,8 @@ Modification: min: | MIN((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - Remote Write and Atomic Traffic: + min: | + MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) avg: | AVG((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) max: | @@ -1023,19 +1027,19 @@ Modification: min: | MIN((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - Uncached Read Traffic: + min: | + MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) avg: | AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) max: | MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: | - MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - Uncached Write and Atomic Traffic: + min: | + MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) avg: | AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) max: | MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: | - MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - Write and Atomic BW: unit: Gbps avg: | @@ -1045,40 +1049,40 @@ Modification: min: | MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) - Write and Atomic Latency: + min: | + MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) avg: | AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) max: | MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: | - MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics metrics: - Atomic: + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) - HBM Read: + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - HBM Write and Atomic: + min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - Read (32B): + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) - Read (64B): + min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) - Read (Uncached): + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - Remote Read: avg: | AVG((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) @@ -1101,13 +1105,13 @@ Modification: min: | MIN(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) - Write and Atomic (64B): + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) - Write and Atomic (Uncached): + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - Panel Config: id: 1800 title: L2 Cache (per Channel) @@ -1125,14 +1129,20 @@ Modification: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) min: | MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + std dev: | + STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + avg: | + AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + max: | + MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - metric_table: id: 1805 title: L2-Fabric Requests (per normUnit) metrics: - ::_1: read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) - metric_table: id: 1806 title: L2-Fabric Read Latency (Cycles) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml index 2b3b9c6cbd..7a9f4ea38d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml @@ -714,6 +714,11 @@ Modification: id: 201 title: System Speed-of-Light metrics: + - L2-Fabric Read BW: + pop: | + ((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + value: | + AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) - MFMA FLOPs (BF16): peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | @@ -751,13 +756,20 @@ Modification: id: 400 title: Roofline metric_tables: + - metric_table: + id: 401 + title: Roofline Performance Rates + metrics: + - HBM Bandwidth: + value: | + AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - metric_table: id: 402 title: Roofline Plot Points metrics: - AI HBM: value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) - AI L1: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) @@ -901,9 +913,9 @@ Modification: title: L2-Fabric Read Stall (Cycles per normUnit) metrics: - ::_1: - ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) - metric_table: id: 1810 title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml index 1e95051077..f5555c8721 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml @@ -714,27 +714,32 @@ Modification: id: 201 title: System Speed-of-Light metrics: + - L2-Fabric Read BW: + value: | + AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) + pop: | + ((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - MFMA FLOPs (BF16): - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - - MFMA FLOPs (F16): peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + - MFMA FLOPs (F16): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F64): - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - MFMA FLOPs (F8): - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) unit: GFLOP/s pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - - MFMA IOPs (Int8): peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + - MFMA IOPs (Int8): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - Panel Config: id: 300 title: Memory Chart @@ -752,13 +757,20 @@ Modification: id: 400 title: Roofline metric_tables: + - metric_table: + id: 401 + title: Roofline Performance Rates + metrics: + - HBM Bandwidth: + value: | + AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - metric_table: id: 402 title: Roofline Plot Points metrics: - AI HBM: value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) - AI L1: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) @@ -780,6 +792,7 @@ Modification: min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Dispatched Workgroups: min: | MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) @@ -787,6 +800,8 @@ Modification: MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) avg: | AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) + min: | + MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - SGPR Writes: min: | MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) @@ -794,6 +809,8 @@ Modification: MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + min: | + MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Scheduler-Pipe Utilization: min: | MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) @@ -801,6 +818,8 @@ Modification: MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) avg: | AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + min: | + MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - VGPR Writes: min: | MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) @@ -808,6 +827,8 @@ Modification: MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + min: | + MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Panel Config: id: 700 title: Wavefront @@ -820,6 +841,7 @@ Modification: min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Panel Config: id: 1100 title: Compute Units - Compute Pipeline @@ -829,25 +851,25 @@ Modification: title: Compute Speed-of-Light metrics: - MFMA FLOPs (BF16): - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - - MFMA FLOPs (F16): peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + - MFMA FLOPs (F16): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F64): - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - MFMA FLOPs (F8): - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - - MFMA IOPs (INT8): peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + - MFMA IOPs (INT8): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - metric_table: id: 1103 title: Arithmetic Operations @@ -859,6 +881,8 @@ Modification: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) avg: | AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) + min: | + MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - Panel Config: id: 1700 title: L2 Cache @@ -889,10 +913,12 @@ Modification: min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - Read (64B): min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) - Panel Config: id: 1800 title: L2 Cache (per Channel) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml index cad25779ea..b4c8e0ed09 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml @@ -709,6 +709,11 @@ Modification: id: 201 title: System Speed-of-Light metrics: + - L2-Fabric Read BW: + value: | + AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) + pop: | + ((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - MFMA FLOPs (BF16): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) @@ -746,13 +751,20 @@ Modification: id: 400 title: Roofline metric_tables: + - metric_table: + id: 401 + title: Roofline Performance Rates + metrics: + - HBM Bandwidth: + value: | + AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - metric_table: id: 402 title: Roofline Plot Points metrics: - AI HBM: value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) - AI L1: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml index 2a5c4b2183..c69d376711 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml @@ -174,13 +174,13 @@ Panel Config: pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + value: AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) unit: GB/s peak: $hbmBandwidth - pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + pop: ((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) L2-Fabric Write BW: value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml index 83240870e8..c6777d26f2 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml @@ -68,8 +68,8 @@ Panel Config: unit: GIOP/s peak: $MFMAI8Ops_empirical_peak HBM Bandwidth: - value: AVG((( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum + value: AVG(((TCC_EA0_RDREQ_128B_sum * 128 + TCC_EA0_RDREQ_32B_sum * 32 + + TCC_EA0_RDREQ_64B_sum * 64 + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GB/s @@ -108,9 +108,9 @@ Panel Config: + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 - * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum - * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_EA0_RDREQ_128B_sum + * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum + * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) unit: FLOPs/Byte AI L2: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/counter_defs.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/counter_defs.yaml index b955e9b28f..986295b29c 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/counter_defs.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/counter_defs.yaml @@ -587,8 +587,10 @@ rocprofiler-sdk: - gfx940 - gfx941 - gfx942 - - gfx950 expression: (TCC_BUBBLE_sum*128 + (TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum-TCC_EA0_RDREQ_32B_sum)*64 + TCC_EA0_RDREQ_32B_sum*32)/1024 + - architectures: + - gfx950 + expression: (TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64+TCC_EA0_RDREQ_32B_sum*32)/1024 - architectures: - gfx10 - gfx1010 @@ -612,12 +614,16 @@ rocprofiler-sdk: - architectures: - gfx90a expression: 1024*(WRITE_SIZE+FETCH_SIZE)/reduce(GRBM_GUI_ACTIVE,max) + # Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected. - architectures: - gfx940 - gfx941 - gfx942 + expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum)*64)/reduce(GRBM_GUI_ACTIVE,max) + # Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected. + - architectures: - gfx950 - expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max) + expression: (WRITE_SIZE*1024+TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64)/reduce(GRBM_GUI_ACTIVE,max) - name: FetchSize description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3200_system_speed_of_light.yaml index 2010e86763..490aeaf46a 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx950/3200_system_speed_of_light.yaml @@ -65,13 +65,13 @@ Panel Config: pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + value: AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) unit: GB/s peak: $hbmBandwidth - pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + pop: ((100 * (AVG((128 * TCC_EA0_RDREQ_128B_sum + 64 * TCC_EA0_RDREQ_64B_sum + + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) L2-Fabric Write BW: value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) @@ -100,4 +100,4 @@ Panel Config: value: (GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) / (End_Timestamp - Start_Timestamp) unit: ns peak: N/A - pop: N/A \ No newline at end of file + pop: N/A diff --git a/projects/rocprofiler-compute/src/utils/.config_hashes.json b/projects/rocprofiler-compute/src/utils/.config_hashes.json index ef22bef8e5..42245f51fc 100644 --- a/projects/rocprofiler-compute/src/utils/.config_hashes.json +++ b/projects/rocprofiler-compute/src/utils/.config_hashes.json @@ -1,7 +1,7 @@ { "archs": { "gfx908": { - "delta_hash": "bae93343a258d4b3f5e64c2f2ce91d1a", + "delta_hash": "1417ef1cfc1b47b7907608f2425f9f11", "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", @@ -24,7 +24,7 @@ } }, "gfx90a": { - "delta_hash": "d64f63cee8ace777ae46002c13756ab8", + "delta_hash": "c3ff3aa253d044f895d776eff72279ad", "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", @@ -47,7 +47,7 @@ } }, "gfx940": { - "delta_hash": "7b5e4d755c2c4e1654d0d01576284df2", + "delta_hash": "35aa77b45341e2a215b8251ad73f472f", "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", @@ -70,7 +70,7 @@ } }, "gfx941": { - "delta_hash": "9e62e4734bcc6a318ecdbb6d5432ce98", + "delta_hash": "8057ce09c13a4fa3e4863085c18a158c", "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", @@ -93,7 +93,7 @@ } }, "gfx942": { - "delta_hash": "4738dc07f7d7f08cfde42ece633e648f", + "delta_hash": "76bacc3fd8b2d4da43c3b8d670a097d9", "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", @@ -120,9 +120,9 @@ "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", - "0200_system_speed_of_light.yaml": "bf2ca00d4b255dbbe191a7641b81dc4c", + "0200_system_speed_of_light.yaml": "8e2a370cba884f1e4f695ee70b3bd7b6", "0300_memory_chart.yaml": "2c82fa6f81a0dda679706d36b99e7913", - "0400_roofline.yaml": "2bd3b630b72d6d165c0d30cf481136a9", + "0400_roofline.yaml": "e39c92773e1499158fd47ce416e7cee5", "0500_command_processor_cpc_cpf.yaml": "3f7dab1663ad7a6fae3801aec2b1e8d0", "0600_workgroup_manager_spi.yaml": "e6546a92d283fed5a5dc6df203efb670", "0700_wavefront.yaml": "330468fd711057b422de9b952c5cfe69", diff --git a/projects/rocprofiler-compute/src/utils/file_io.py b/projects/rocprofiler-compute/src/utils/file_io.py index 172ddb66e1..1755b65eed 100644 --- a/projects/rocprofiler-compute/src/utils/file_io.py +++ b/projects/rocprofiler-compute/src/utils/file_io.py @@ -34,7 +34,13 @@ import yaml import config from utils import rocpd_data, schema from utils.kernel_name_shortener import kernel_name_shortener -from utils.logger import console_debug, console_error, console_log, demarcate +from utils.logger import ( + console_debug, + console_error, + console_log, + console_warning, + demarcate, +) # TODO: use pandas chunksize or dask to read really large csv file # from dask import dataframe as dd @@ -376,7 +382,9 @@ def is_single_panel_config( elif arch_count == len(arch_names): return False else: - console_error("Found multiple panel config sets but incomplete for all archs.") + console_warning( + "Found multiple panel config sets but incomplete for all archs." + ) def find_1st_sub_dir(directory: str) -> Optional[str]: diff --git a/projects/rocprofiler-compute/src/utils/parser.py b/projects/rocprofiler-compute/src/utils/parser.py index 1df0947de1..fe5185884f 100755 --- a/projects/rocprofiler-compute/src/utils/parser.py +++ b/projects/rocprofiler-compute/src/utils/parser.py @@ -1048,7 +1048,7 @@ def create_empirical_peaks_dict(empirical_peaks_df: pd.DataFrame) -> dict[str, f ] # initialize peaks to 0 for peak_name in peak_names: - empirical_peaks[f"ammolite__{peak_name}_empirical_peak"] = 0 + empirical_peaks[f"ammolite__{peak_name}_empirical_peak"] = np.nan return empirical_peaks diff --git a/projects/rocprofiler-compute/src/utils/roofline_calc.py b/projects/rocprofiler-compute/src/utils/roofline_calc.py index 277777f00b..fded40a77f 100644 --- a/projects/rocprofiler-compute/src/utils/roofline_calc.py +++ b/projects/rocprofiler-compute/src/utils/roofline_calc.py @@ -656,7 +656,21 @@ def calc_ai_profile( * 32 ) ) - + elif mspec.gpu_series == "MI350": + # Use TCC_EA0_RDREQ_128B_sum TCC_EA0_RDREQ_64B_sum to calculate hbm_data + hbm_data += ( + (df["TCC_EA0_RDREQ_128B_sum"][idx] * 128) + + (df["TCC_EA0_RDREQ_64B_sum"][idx] * 64) + + (df["TCC_EA0_RDREQ_32B_sum"][idx] * 32) + + ( + ( + df["TCC_EA0_WRREQ_sum"][idx] + - df["TCC_EA0_WRREQ_64B_sum"][idx] + ) + * 32 + ) + + (df["TCC_EA0_WRREQ_64B_sum"][idx] * 64) + ) else: # Use TCC_BUBBLE_sum to calculate hbm_data hbm_data += ( diff --git a/projects/rocprofiler-compute/tests/CMakeLists.txt b/projects/rocprofiler-compute/tests/CMakeLists.txt index 24bce39d50..c8e6e4ac6b 100644 --- a/projects/rocprofiler-compute/tests/CMakeLists.txt +++ b/projects/rocprofiler-compute/tests/CMakeLists.txt @@ -75,3 +75,11 @@ set_target_properties( rocflop PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests ) + +set(MEMCOPY_SOURCES ../sample/memcopy.cpp) +set_source_files_properties(${MEMCOPY_SOURCES} PROPERTIES LANGUAGE HIP) +add_executable(memcopy ${MEMCOPY_SOURCES}) +set_target_properties( + memcopy + PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests +) diff --git a/projects/rocprofiler-compute/tests/test_TCP_counters.py b/projects/rocprofiler-compute/tests/test_TCP_counters.py index 5d35b40a32..38fdff562d 100644 --- a/projects/rocprofiler-compute/tests/test_TCP_counters.py +++ b/projects/rocprofiler-compute/tests/test_TCP_counters.py @@ -24,8 +24,6 @@ ############################################################################## import csv -import re -import subprocess from pathlib import Path import pytest @@ -39,52 +37,6 @@ config["COUNTER_LOGGING"] = False config["METRIC_COMPARE"] = False config["METRIC_LOGGING"] = False -SUPPORTED_ARCHS = { - "gfx940": {"mi300": ["MI300A_A0"]}, - "gfx941": {"mi300": ["MI300X_A0"]}, - "gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]}, -} - -MI300_CHIP_IDS = { - "29856": "MI300A_A1", - "29857": "MI300X_A1", - "29858": "MI308X", -} - - -def gpu_soc(): - ## 1) Parse arch details from rocminfo - rocminfo = str( - # decode with utf-8 to account for rocm-smi changes in latest rocm - subprocess.run( - ["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE - ).stdout.decode("utf-8") - ) - rocminfo = rocminfo.split("\n") - soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE) - devices = list(filter(soc_regex.match, rocminfo)) - gpu_arch = devices[0].split()[1] - - if not gpu_arch in SUPPORTED_ARCHS.keys(): - return None - - ## 2) Parse chip id from rocminfo - chip_id = re.compile(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", re.MULTILINE) - ids = list(filter(chip_id.match, rocminfo)) - for id in ids: - chip_id = re.match(r"^[^()]+", id.split()[2]).group(0) - - ## 3) Deduce gpu model name from arch - gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper() - # For testing purposes we only care about gpu model series not the specific model - # if gpu_model == "MI300": - # if chip_id in MI300_CHIP_IDS: - # gpu_model = MI300_CHIP_IDS[chip_id] - # else: - # return None - - return gpu_model - def load_metrics(csv_file_path): """ @@ -115,7 +67,7 @@ def load_metrics(csv_file_path): return metrics_data -soc = gpu_soc() +soc = test_utils.gpu_soc() @pytest.mark.L1_cache diff --git a/projects/rocprofiler-compute/tests/test_metric_validation.py b/projects/rocprofiler-compute/tests/test_metric_validation.py new file mode 100644 index 0000000000..8117447e8c --- /dev/null +++ b/projects/rocprofiler-compute/tests/test_metric_validation.py @@ -0,0 +1,144 @@ +############################################################################## +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +############################################################################## + +import pandas as pd +import pytest +import test_utils + +config = {} +config["memcopy"] = ["tests/memcopy"] +config["cleanup"] = True + +soc = test_utils.gpu_soc() + +# workload -> gfx -> metric definition +VALIDATE_METRICS = { + "memcopy": { + "MI100": [ + { + "name": "HBM Bandwidth", + "metric_id": "4.1.8", + "csv_file": "4.1_Roofline_Performance_Rates.csv", + "column": "Value", + "expected_value": 1044.48, + }, + ], + "MI200": [ + { + "name": "HBM Bandwidth", + "metric_id": "4.1.8", + "csv_file": "4.1_Roofline_Performance_Rates.csv", + "column": "Value", + "expected_value": 1389.17, + }, + ], + "MI300": [ + { + "name": "HBM Bandwidth", + "metric_id": "4.1.9", + "csv_file": "4.1_Roofline_Performance_Rates.csv", + "column": "Value", + "expected_value": 3910.62, + }, + ], + "MI350": [ + { + "name": "HBM Bandwidth", + "metric_id": "4.1.10", + "csv_file": "4.1_Roofline_Performance_Rates.csv", + "column": "Value", + "expected_value": 5690.42, + }, + ], + # Ignore warmup dispatch + # Collect roofline block + "profile_options": ["-d", "2-1001", "-b", "4"], + "roof": True, + } +} + + +@pytest.mark.path +def test_validate_metrics( + binary_handler_profile_rocprof_compute, binary_handler_analyze_rocprof_compute +): + for workload in VALIDATE_METRICS.keys(): + metrics = VALIDATE_METRICS[workload].get(soc, []) + metric_ids = [metric["metric_id"] for metric in metrics] + if not metric_ids: + print( + f"Skipping metric validation for {workload} on {soc}. " + "No metrics to validate." + ) + continue + + profile_workload_dir = test_utils.get_output_dir(param_id=f"{workload}_profile") + analysis_workload_dir = test_utils.get_output_dir( + param_id=f"{workload}_analysis" + ) + try: + # Ensure non zero length of profile df + options = VALIDATE_METRICS[workload].get("profile_options", []) + _ = binary_handler_profile_rocprof_compute( + config, + profile_workload_dir, + options, + check_success=True, + roof=VALIDATE_METRICS[workload].get("roof", False), + app_name=workload, + ) + _ = test_utils.check_csv_files( + profile_workload_dir, num_devices=1, num_kernels=1 + ) + + # Check whether metric values are correct + code = binary_handler_analyze_rocprof_compute([ + "analyze", + "--output-name", + f"{analysis_workload_dir}", + "--output-format", + "csv", + "-b", + *metric_ids, + "--path", + profile_workload_dir, + ]) + assert code == 0 + + for metric in metrics: + actual = pd.read_csv(f"{analysis_workload_dir}/{metric['csv_file']}")[ + metric["column"] + ].values[0] + expected = metric["expected_value"] + # 5% tolerance in checking + assert abs(actual - expected) / expected <= 0.05, ( + f"{metric['name']} ({metric['metric_id']}): " + f"actual={actual}, expected={expected}, " + f"diff={(abs(actual - expected) / expected * 100):.2f}% " + f"(tolerance: 5%)" + ) + finally: + test_utils.clean_output_dir(config["cleanup"], analysis_workload_dir) + test_utils.clean_output_dir(config["cleanup"], profile_workload_dir) diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py index 1ea0d62759..4c4a397718 100644 --- a/projects/rocprofiler-compute/tests/test_profile_general.py +++ b/projects/rocprofiler-compute/tests/test_profile_general.py @@ -38,29 +38,7 @@ import pytest import test_utils from scipy.stats import zscore -# Globals - -# TODO: MI350 What are the gpu models in MI 350 series -SUPPORTED_ARCHS = { - "gfx908": {"mi100": ["MI100"]}, - "gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]}, - "gfx940": {"mi300": ["MI300A_A0"]}, - "gfx941": {"mi300": ["MI300X_A0"]}, - "gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]}, - "gfx950": {"mi350": ["MI350"]}, -} - -CHIP_IDS = { - "29856": "MI300A_A1", - "29857": "MI300X_A1", - "29858": "MI308X", - "30112": "MI350", -} - -# -- # Runtime config options -# -- - config = {} config["kernel_name_1"] = "vecCopy" config["app_1"] = ["./tests/vcopy", "-n", "1048576", "-b", "256", "-i", "3"] @@ -248,47 +226,7 @@ def counter_compare(test_name, errors_pd, baseline_df, run_df, threshold=5): return errors_pd -def gpu_soc(): - global num_devices - ## 1) Parse arch details from rocminfo - rocminfo = str( - # decode with utf-8 to account for rocm-smi changes in latest rocm - subprocess.run( - ["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE - ).stdout.decode("utf-8") - ) - rocminfo = rocminfo.split("\n") - soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE) - devices = list(filter(soc_regex.match, rocminfo)) - gpu_arch = devices[0].split()[1] - - if not gpu_arch in SUPPORTED_ARCHS.keys(): - print("Cannot find a supported arch in rocminfo") - assert 0 - else: - num_devices = ( - len(devices) - if not "CI_VISIBLE_DEVICES" in os.environ - else os.environ["CI_VISIBLE_DEVICES"] - ) - - ## 2) Parse chip id from rocminfo - chip_id = re.compile(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", re.MULTILINE) - ids = list(filter(chip_id.match, rocminfo)) - for id in ids: - chip_id = re.match(r"^[^()]+", id.split()[2]).group(0) - - ## 3) Deduce gpu model name from arch - gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper() - # For testing purposes we only care about gpu model series not the specific model - # if gpu_model not in ("MI50", "MI100", "MI200"): - # if chip_id in CHIP_IDS: - # gpu_model = CHIP_IDS[chip_id] - - return gpu_model - - -soc = gpu_soc() +soc = test_utils.gpu_soc() os.environ["ROCPROF"] = "rocprofiler-sdk" @@ -645,9 +583,7 @@ def test_path(binary_handler_profile_rocprof_compute): @pytest.mark.path -def test_path_rocflop( - binary_handler_profile_rocprof_compute, -): +def test_path_rocflop(binary_handler_profile_rocprof_compute): # Test whether multiprocess workloads like rocflop are handled correctly workload_dir = test_utils.get_output_dir() options = ["--block", "2.1.1"] diff --git a/projects/rocprofiler-compute/tests/test_utils.py b/projects/rocprofiler-compute/tests/test_utils.py index cf17a15756..7e2886bfed 100644 --- a/projects/rocprofiler-compute/tests/test_utils.py +++ b/projects/rocprofiler-compute/tests/test_utils.py @@ -32,6 +32,7 @@ import logging import os import re import shutil +import subprocess import tempfile from pathlib import Path from unittest import mock @@ -41,6 +42,15 @@ import pytest import utils.utils as utils +SUPPORTED_ARCHS = { + "gfx908": {"mi100": ["MI100"]}, + "gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]}, + "gfx940": {"mi300": ["MI300A_A0"]}, + "gfx941": {"mi300": ["MI300X_A0"]}, + "gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]}, + "gfx950": {"mi350": ["MI350"]}, +} + class MockMSpec: def __init__( @@ -228,6 +238,27 @@ def get_num_pmc_file(output_dir): ]) +def gpu_soc(): + # Parse arch details from rocminfo + rocminfo = str( + # decode with utf-8 to account for rocm-smi changes in latest rocm + subprocess.run( + ["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ).stdout.decode("utf-8") + ) + rocminfo = rocminfo.split("\n") + soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE) + devices = list(filter(soc_regex.match, rocminfo)) + gpu_arch = devices[0].split()[1] + + if not gpu_arch in SUPPORTED_ARCHS.keys(): + return None + + gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper() + + return gpu_model + + # ============================================================================= # VERSION UTILITIES TESTS # ============================================================================= diff --git a/projects/rocprofiler-sdk/source/share/rocprofiler-sdk/counter_defs.yaml b/projects/rocprofiler-sdk/source/share/rocprofiler-sdk/counter_defs.yaml index 61fe279cf4..dce5e9b0d9 100644 --- a/projects/rocprofiler-sdk/source/share/rocprofiler-sdk/counter_defs.yaml +++ b/projects/rocprofiler-sdk/source/share/rocprofiler-sdk/counter_defs.yaml @@ -588,8 +588,10 @@ rocprofiler-sdk: - gfx940 - gfx941 - gfx942 - - gfx950 expression: (TCC_BUBBLE_sum*128 + (TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum-TCC_EA0_RDREQ_32B_sum)*64 + TCC_EA0_RDREQ_32B_sum*32)/1024 + - architectures: + - gfx950 + expression: (TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64+TCC_EA0_RDREQ_32B_sum*32)/1024 - architectures: - gfx10 - gfx1010 @@ -615,12 +617,16 @@ rocprofiler-sdk: - architectures: - gfx90a expression: 1024*(WRITE_SIZE+FETCH_SIZE)/reduce(GRBM_GUI_ACTIVE,max) + # Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected. - architectures: - gfx940 - gfx941 - gfx942 + expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum)*64)/reduce(GRBM_GUI_ACTIVE,max) + # Approximation which excludes TCC_EA0_RDREQ_32B due to the hardware limitation on the number of counters that can be collected. + - architectures: - gfx950 - expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max) + expression: (WRITE_SIZE*1024+TCC_EA0_RDREQ_128B_sum*128+TCC_EA0_RDREQ_64B_sum*64)/reduce(GRBM_GUI_ACTIVE,max) - name: FetchSize description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.