diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt index 7382f795f7..93dc58e11e 100644 --- a/projects/rocprofiler-compute/CMakeLists.txt +++ b/projects/rocprofiler-compute/CMakeLists.txt @@ -285,6 +285,18 @@ add_test( ${PROJECT_SOURCE_DIR}/tests/test_analyze_workloads.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) +# --------------------------- +# TCP counter tests +# --------------------------- + +add_test( + NAME test_L1_cache_counters + COMMAND + ${Python3_EXECUTABLE} -m pytest -m L1_cache + --junitxml=tests/test_TCP_counters.xml ${COV_OPTION} + ${PROJECT_SOURCE_DIR}/tests/test_TCP_counters.py + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) + # --------- # Install # --------- diff --git a/projects/rocprofiler-compute/pyproject.toml b/projects/rocprofiler-compute/pyproject.toml index a5cfb0c168..b954c336d5 100644 --- a/projects/rocprofiler-compute/pyproject.toml +++ b/projects/rocprofiler-compute/pyproject.toml @@ -70,4 +70,5 @@ markers = [ "col", "kernel_verbose", "serial", + "L1_cache", ] diff --git a/projects/rocprofiler-compute/sample/vrandom_access.cpp b/projects/rocprofiler-compute/sample/vrandom_access.cpp new file mode 100644 index 0000000000..3fddb549fa --- /dev/null +++ b/projects/rocprofiler-compute/sample/vrandom_access.cpp @@ -0,0 +1,103 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example code to execute random access to explore cache hits/misses in L2 Cache. +*/ + + +#include +#include +#include +#include +#include +#include + +#define HIP_ASSERT(x) (assert((x) == hipSuccess)) + +// Kernel: random access, each thread picks a random index +__global__ void randomAccessKernel(int *d_data, int N, unsigned int *d_seeds) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < N) + { + unsigned int seed = d_seeds[tid]; + // Simple XORShift + seed ^= (seed << 13); + seed ^= (seed >> 17); + seed ^= (seed << 5); + int idx = seed % N; + d_data[idx] += 1; + } +} + +int main() +{ + hipError_t hip_status; + + const int N = 1 << 24; // Try 16M elements to exceed cache + size_t size = N * sizeof(int); + + // Host memory + std::vector h_data(N, 0); + std::vector h_seeds(N); + + // Generate seeds + srand(time(nullptr)); + for (int i = 0; i < N; ++i) + { + // Keep them diverse. Could be random or based on i + h_seeds[i] = rand(); + } + + // Allocate device memory + int *d_data; + unsigned int *d_seeds; + HIP_ASSERT(hipMalloc(&d_data, size)); + HIP_ASSERT(hipMalloc(&d_seeds, N * sizeof(unsigned int))); + + // Copy h_data to device + HIP_ASSERT(hipMemcpy(d_data, h_data.data(), size, hipMemcpyHostToDevice)); + HIP_ASSERT(hipMemcpy(d_seeds, h_seeds.data(), N * sizeof(unsigned int), hipMemcpyHostToDevice)); + + // Configure kernel + dim3 blockSize(64); + dim3 gridSize((N + blockSize.x - 1) / blockSize.x); + + // Launch kernel + hipLaunchKernelGGL(randomAccessKernel, gridSize, blockSize, 0, 0, d_data, N, d_seeds); + hip_status = hipDeviceSynchronize(); + + HIP_ASSERT(hipMemcpy(h_data.data(), d_data, size, hipMemcpyDeviceToHost)); + + // Cleanup + HIP_ASSERT(hipFree(d_data)); + HIP_ASSERT(hipFree(d_seeds)); + ; + + std::cout << "RandomAccess HIP test completed.\n"; + return 0; +} diff --git a/projects/rocprofiler-compute/sample/vsequential_access.cpp b/projects/rocprofiler-compute/sample/vsequential_access.cpp new file mode 100644 index 0000000000..3d1d3f66a9 --- /dev/null +++ b/projects/rocprofiler-compute/sample/vsequential_access.cpp @@ -0,0 +1,81 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example code to execute sequential access to explore cache hits/misses in L2 Cache. +*/ + + +#include +#include + +#define HIP_ASSERT(x) (assert((x)==hipSuccess)) + +// Kernel: sequential access, each thread reads/writes an element in order +__global__ void sequentialAccessKernel(int *d_data, int N) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < N) + { + d_data[tid] += 1; + } +} + +int main() +{ + hipError_t hip_status; + + const int N = 1 << 20; // 1M elements + size_t size = N * sizeof(int); + // Allocate host memory + int *h_data = (int *)malloc(size); + std::fill_n(h_data, N, 0); + + // Allocate device memory + int *d_data; + HIP_ASSERT(hipMalloc(&d_data, size)); + + // Copy h_data to device + HIP_ASSERT(hipMemcpy(d_data, h_data, size, hipMemcpyHostToDevice)); + + // Configure kernel + dim3 blockSize(64); + dim3 gridSize((N + blockSize.x - 1) / blockSize.x); + + // Launch kernel + hipLaunchKernelGGL(sequentialAccessKernel, gridSize, blockSize, 0, 0, d_data, N); + hip_status = hipDeviceSynchronize(); + + // Copy back to host + HIP_ASSERT(hipMemcpy(h_data, d_data, size, hipMemcpyDeviceToHost)); + + // Cleanup + HIP_ASSERT(hipFree(d_data)); + free(h_data); + + std::cout << "SequentialAccess HIP test completed.\n"; + return 0; +} diff --git a/projects/rocprofiler-compute/tests/CMakeLists.txt b/projects/rocprofiler-compute/tests/CMakeLists.txt index 01b3be1cd8..6914a18bd3 100644 --- a/projects/rocprofiler-compute/tests/CMakeLists.txt +++ b/projects/rocprofiler-compute/tests/CMakeLists.txt @@ -18,3 +18,15 @@ set(VMEM_SOURCES ../sample/vmem.hip) set_source_files_properties(${VMEM_SOURCES} PROPERTIES LANGUAGE HIP) add_executable(vmem ${VMEM_SOURCES}) set_target_properties(vmem PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests) + +set(VSEQ_SOURCES ../sample/vsequential_access.cpp) +set_source_files_properties(${VSEQ_SOURCES} PROPERTIES LANGUAGE HIP) +add_executable(vsequential_access ${VSEQ_SOURCES}) +set_target_properties(vsequential_access PROPERTIES RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_SOURCE_DIR}/tests) + +set(VRAND_SOURCES ../sample/vrandom_access.cpp) +set_source_files_properties(${VRAND_SOURCES} PROPERTIES LANGUAGE HIP) +add_executable(vrandom_access ${VRAND_SOURCES}) +set_target_properties(vrandom_access PROPERTIES RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_SOURCE_DIR}/tests) diff --git a/projects/rocprofiler-compute/tests/conftest.py b/projects/rocprofiler-compute/tests/conftest.py index bfd9772930..5a97d0b479 100644 --- a/projects/rocprofiler-compute/tests/conftest.py +++ b/projects/rocprofiler-compute/tests/conftest.py @@ -18,13 +18,15 @@ def pytest_addoption(parser): @pytest.fixture def binary_handler_profile_rocprof_compute(request): - def _handler(config, workload_dir, options=[], check_success=True, roof=False): + def _handler( + config, workload_dir, options=[], check_success=True, roof=False, app_name="app_1" + ): if request.config.getoption("--call-binary"): baseline_opts = [ "build/rocprof-compute.bin", "profile", "-n", - "app_1", + app_name, "-VVV", ] if not roof: @@ -33,7 +35,7 @@ def binary_handler_profile_rocprof_compute(request): baseline_opts + options + ["--path", workload_dir, "--"] - + config["app_1"], + + config[app_name], text=True, ) # verify run status @@ -41,7 +43,7 @@ def binary_handler_profile_rocprof_compute(request): assert process.returncode == 0 return process.returncode else: - baseline_opts = ["rocprof-compute", "profile", "-n", "app_1", "-VVV"] + baseline_opts = ["rocprof-compute", "profile", "-n", app_name, "-VVV"] if not roof: baseline_opts.append("--no-roof") with pytest.raises(SystemExit) as e: @@ -50,7 +52,7 @@ def binary_handler_profile_rocprof_compute(request): baseline_opts + options + ["--path", workload_dir, "--"] - + config["app_1"], + + config[app_name], ): rocprof_compute.main() # verify run status diff --git a/projects/rocprofiler-compute/tests/test_TCP_counters.py b/projects/rocprofiler-compute/tests/test_TCP_counters.py new file mode 100644 index 0000000000..826a383c11 --- /dev/null +++ b/projects/rocprofiler-compute/tests/test_TCP_counters.py @@ -0,0 +1,169 @@ +import csv +import inspect +import os +import re +import shutil +import subprocess +import sys +from importlib.machinery import SourceFileLoader +from pathlib import Path +from unittest.mock import patch + +import pandas as pd +import pytest +import test_utils + +rocprof_compute = SourceFileLoader("rocprof-compute", "src/rocprof-compute").load_module() + +config = {} +config["vseq"] = ["./tests/vsequential_access"] +config["vrand"] = ["./tests/vrandom_access"] +config["cleanup"] = True +config["COUNTER_LOGGING"] = False +config["METRIC_COMPARE"] = False +config["METRIC_LOGGING"] = False + + +SUPPORTED_ARCHS = { + "gfx940": {"mi300": ["MI300A_A0"]}, + "gfx941": {"mi300": ["MI300X_A0"]}, + "gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]}, +} + +MI300_CHIP_IDS = { + "29856": "MI300A_A1", + "29857": "MI300X_A1", + "29858": "MI308X", +} + + +def run(cmd): + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if cmd[0] == "amd-smi" and p.returncode == 8: + print("ERROR: No GPU detected. Unable to load amd-smi") + assert 0 + return p.stdout.decode("ascii") + + +def gpu_soc(): + ## 1) Parse arch details from rocminfo + rocminfo = str( + # decode with utf-8 to account for rocm-smi changes in latest rocm + subprocess.run( + ["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ).stdout.decode("utf-8") + ) + rocminfo = rocminfo.split("\n") + soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE) + devices = list(filter(soc_regex.match, rocminfo)) + gpu_arch = devices[0].split()[1] + + if not gpu_arch in SUPPORTED_ARCHS.keys(): + return None + + ## 2) Parse chip id from rocminfo + chip_id = re.compile(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", re.MULTILINE) + ids = list(filter(chip_id.match, rocminfo)) + for id in ids: + chip_id = re.match(r"^[^()]+", id.split()[2]).group(0) + + ## 3) Deduce gpu model name from arch + gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper() + if gpu_model == "MI300": + if chip_id in MI300_CHIP_IDS: + gpu_model = MI300_CHIP_IDS[chip_id] + else: + return None + + return gpu_model + + +def load_metrics(csv_file_path): + """ + Reads the CSV file into a dictionary of dictionaries: + { + "Metric_1": { + "Avg": value, + "Min": value, + "Max": value, + "Unit": "unit" + }, + "Metric_2": { ... }, + ... + } + """ + metrics_data = {} + with open(csv_file_path, newline="") as csvfile: + reader = csv.DictReader(csvfile) # reads header from first line + + for row in reader: + metric_name = row["Metric"].strip() + metrics_data[metric_name] = { + "Avg": float(row["Avg"]) if row["Avg"] else None, + "Min": float(row["Min"]) if row["Min"] else None, + "Max": float(row["Max"]) if row["Max"] else None, + "Unit": row["Unit"].strip() if row["Unit"] else None, + } + return metrics_data + + +soc = gpu_soc() + + +@pytest.mark.L1_cache +def test_L1_cache_counters( + binary_handler_profile_rocprof_compute, binary_handler_analyze_rocprof_compute +): + if not soc or "MI300" not in soc: + pytest.skip("Skipping L1 cache test for non-mi300 socs.") + + # set up two apps: sequential and random access + app_names = ["vseq", "vrand"] + options = ["-b", "TCP"] + + result = {} + metrics = ["Read Req", "Write Req", "Cache Hit Rate"] + base = Path(test_utils.get_output_dir()) + + for app_name in app_names: + + workload_dir = str(base / app_name) + + # 1. profile the app + return_code = binary_handler_profile_rocprof_compute( + config, + workload_dir, + options, + check_success=False, + roof=False, + app_name=app_name + ) + assert return_code == 0 + + # 2. analyze the results + return_code = binary_handler_analyze_rocprof_compute( + ["analyze", "--path", workload_dir, "-b", "16.3", "--save-dfs", workload_dir] + ) + assert return_code == 0 + + # 3. save results in local + + # FIXME: customize file name to avoid hardcode + csv_path = workload_dir + "/16.3_L1D_Cache_Accesses.csv" + data = load_metrics(csv_path) + + for metric in metrics: + if app_name not in result or not isinstance(result[app_name], dict): + result[app_name] = {} + result[app_name][metric] = data[metric]["Avg"] + + # 4. clean local output + test_utils.clean_output_dir(config["cleanup"], workload_dir) + test_utils.clean_output_dir(config["cleanup"], base) + + # 5. check results are expected + + # FIXME: use a range for comparison to account for different results + assert result["vseq"]["Cache Hit Rate"] >= result["vrand"]["Cache Hit Rate"] + assert result["vseq"]["Read Req"] <= result["vrand"]["Read Req"] + assert result["vseq"]["Write Req"] <= result["vrand"]["Write Req"]