Add mi300 TCP counter tests (#644)

* Add new sample applications.

* Generalize py test launcher for additional apps.

* Add TCP pytest, and add to ctest.

* Update licensing.

* Disable for non-mi300 machines.

[ROCm/rocprofiler-compute commit: 591632dd69]
Cette révision appartient à :
xuchen-amd
2025-04-02 20:32:13 -04:00
révisé par GitHub
Parent 35acf4c410
révision 08e083cc25
7 fichiers modifiés avec 385 ajouts et 5 suppressions
+12
Voir le fichier
@@ -285,6 +285,18 @@ add_test(
${PROJECT_SOURCE_DIR}/tests/test_analyze_workloads.py
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
# ---------------------------
# TCP counter tests
# ---------------------------
add_test(
NAME test_L1_cache_counters
COMMAND
${Python3_EXECUTABLE} -m pytest -m L1_cache
--junitxml=tests/test_TCP_counters.xml ${COV_OPTION}
${PROJECT_SOURCE_DIR}/tests/test_TCP_counters.py
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
# ---------
# Install
# ---------
+1
Voir le fichier
@@ -70,4 +70,5 @@ markers = [
"col",
"kernel_verbose",
"serial",
"L1_cache",
]
+103
Voir le fichier
@@ -0,0 +1,103 @@
/*
##############################################################################bl
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
##############################################################################el
An example code to execute random access to explore cache hits/misses in L2 Cache.
*/
#include <hip/hip_runtime.h>
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#define HIP_ASSERT(x) (assert((x) == hipSuccess))
// Kernel: random access, each thread picks a random index
__global__ void randomAccessKernel(int *d_data, int N, unsigned int *d_seeds)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < N)
{
unsigned int seed = d_seeds[tid];
// Simple XORShift
seed ^= (seed << 13);
seed ^= (seed >> 17);
seed ^= (seed << 5);
int idx = seed % N;
d_data[idx] += 1;
}
}
int main()
{
hipError_t hip_status;
const int N = 1 << 24; // Try 16M elements to exceed cache
size_t size = N * sizeof(int);
// Host memory
std::vector<int> h_data(N, 0);
std::vector<unsigned int> h_seeds(N);
// Generate seeds
srand(time(nullptr));
for (int i = 0; i < N; ++i)
{
// Keep them diverse. Could be random or based on i
h_seeds[i] = rand();
}
// Allocate device memory
int *d_data;
unsigned int *d_seeds;
HIP_ASSERT(hipMalloc(&d_data, size));
HIP_ASSERT(hipMalloc(&d_seeds, N * sizeof(unsigned int)));
// Copy h_data to device
HIP_ASSERT(hipMemcpy(d_data, h_data.data(), size, hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(d_seeds, h_seeds.data(), N * sizeof(unsigned int), hipMemcpyHostToDevice));
// Configure kernel
dim3 blockSize(64);
dim3 gridSize((N + blockSize.x - 1) / blockSize.x);
// Launch kernel
hipLaunchKernelGGL(randomAccessKernel, gridSize, blockSize, 0, 0, d_data, N, d_seeds);
hip_status = hipDeviceSynchronize();
HIP_ASSERT(hipMemcpy(h_data.data(), d_data, size, hipMemcpyDeviceToHost));
// Cleanup
HIP_ASSERT(hipFree(d_data));
HIP_ASSERT(hipFree(d_seeds));
;
std::cout << "RandomAccess HIP test completed.\n";
return 0;
}
+81
Voir le fichier
@@ -0,0 +1,81 @@
/*
##############################################################################bl
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
##############################################################################el
An example code to execute sequential access to explore cache hits/misses in L2 Cache.
*/
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
// Kernel: sequential access, each thread reads/writes an element in order
__global__ void sequentialAccessKernel(int *d_data, int N)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < N)
{
d_data[tid] += 1;
}
}
int main()
{
hipError_t hip_status;
const int N = 1 << 20; // 1M elements
size_t size = N * sizeof(int);
// Allocate host memory
int *h_data = (int *)malloc(size);
std::fill_n(h_data, N, 0);
// Allocate device memory
int *d_data;
HIP_ASSERT(hipMalloc(&d_data, size));
// Copy h_data to device
HIP_ASSERT(hipMemcpy(d_data, h_data, size, hipMemcpyHostToDevice));
// Configure kernel
dim3 blockSize(64);
dim3 gridSize((N + blockSize.x - 1) / blockSize.x);
// Launch kernel
hipLaunchKernelGGL(sequentialAccessKernel, gridSize, blockSize, 0, 0, d_data, N);
hip_status = hipDeviceSynchronize();
// Copy back to host
HIP_ASSERT(hipMemcpy(h_data, d_data, size, hipMemcpyDeviceToHost));
// Cleanup
HIP_ASSERT(hipFree(d_data));
free(h_data);
std::cout << "SequentialAccess HIP test completed.\n";
return 0;
}
+12
Voir le fichier
@@ -18,3 +18,15 @@ set(VMEM_SOURCES ../sample/vmem.hip)
set_source_files_properties(${VMEM_SOURCES} PROPERTIES LANGUAGE HIP)
add_executable(vmem ${VMEM_SOURCES})
set_target_properties(vmem PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests)
set(VSEQ_SOURCES ../sample/vsequential_access.cpp)
set_source_files_properties(${VSEQ_SOURCES} PROPERTIES LANGUAGE HIP)
add_executable(vsequential_access ${VSEQ_SOURCES})
set_target_properties(vsequential_access PROPERTIES RUNTIME_OUTPUT_DIRECTORY
${CMAKE_SOURCE_DIR}/tests)
set(VRAND_SOURCES ../sample/vrandom_access.cpp)
set_source_files_properties(${VRAND_SOURCES} PROPERTIES LANGUAGE HIP)
add_executable(vrandom_access ${VRAND_SOURCES})
set_target_properties(vrandom_access PROPERTIES RUNTIME_OUTPUT_DIRECTORY
${CMAKE_SOURCE_DIR}/tests)
+7 -5
Voir le fichier
@@ -18,13 +18,15 @@ def pytest_addoption(parser):
@pytest.fixture
def binary_handler_profile_rocprof_compute(request):
def _handler(config, workload_dir, options=[], check_success=True, roof=False):
def _handler(
config, workload_dir, options=[], check_success=True, roof=False, app_name="app_1"
):
if request.config.getoption("--call-binary"):
baseline_opts = [
"build/rocprof-compute.bin",
"profile",
"-n",
"app_1",
app_name,
"-VVV",
]
if not roof:
@@ -33,7 +35,7 @@ def binary_handler_profile_rocprof_compute(request):
baseline_opts
+ options
+ ["--path", workload_dir, "--"]
+ config["app_1"],
+ config[app_name],
text=True,
)
# verify run status
@@ -41,7 +43,7 @@ def binary_handler_profile_rocprof_compute(request):
assert process.returncode == 0
return process.returncode
else:
baseline_opts = ["rocprof-compute", "profile", "-n", "app_1", "-VVV"]
baseline_opts = ["rocprof-compute", "profile", "-n", app_name, "-VVV"]
if not roof:
baseline_opts.append("--no-roof")
with pytest.raises(SystemExit) as e:
@@ -50,7 +52,7 @@ def binary_handler_profile_rocprof_compute(request):
baseline_opts
+ options
+ ["--path", workload_dir, "--"]
+ config["app_1"],
+ config[app_name],
):
rocprof_compute.main()
# verify run status
+169
Voir le fichier
@@ -0,0 +1,169 @@
import csv
import inspect
import os
import re
import shutil
import subprocess
import sys
from importlib.machinery import SourceFileLoader
from pathlib import Path
from unittest.mock import patch
import pandas as pd
import pytest
import test_utils
rocprof_compute = SourceFileLoader("rocprof-compute", "src/rocprof-compute").load_module()
config = {}
config["vseq"] = ["./tests/vsequential_access"]
config["vrand"] = ["./tests/vrandom_access"]
config["cleanup"] = True
config["COUNTER_LOGGING"] = False
config["METRIC_COMPARE"] = False
config["METRIC_LOGGING"] = False
SUPPORTED_ARCHS = {
"gfx940": {"mi300": ["MI300A_A0"]},
"gfx941": {"mi300": ["MI300X_A0"]},
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
}
MI300_CHIP_IDS = {
"29856": "MI300A_A1",
"29857": "MI300X_A1",
"29858": "MI308X",
}
def run(cmd):
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if cmd[0] == "amd-smi" and p.returncode == 8:
print("ERROR: No GPU detected. Unable to load amd-smi")
assert 0
return p.stdout.decode("ascii")
def gpu_soc():
## 1) Parse arch details from rocminfo
rocminfo = str(
# decode with utf-8 to account for rocm-smi changes in latest rocm
subprocess.run(
["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
).stdout.decode("utf-8")
)
rocminfo = rocminfo.split("\n")
soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE)
devices = list(filter(soc_regex.match, rocminfo))
gpu_arch = devices[0].split()[1]
if not gpu_arch in SUPPORTED_ARCHS.keys():
return None
## 2) Parse chip id from rocminfo
chip_id = re.compile(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", re.MULTILINE)
ids = list(filter(chip_id.match, rocminfo))
for id in ids:
chip_id = re.match(r"^[^()]+", id.split()[2]).group(0)
## 3) Deduce gpu model name from arch
gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper()
if gpu_model == "MI300":
if chip_id in MI300_CHIP_IDS:
gpu_model = MI300_CHIP_IDS[chip_id]
else:
return None
return gpu_model
def load_metrics(csv_file_path):
"""
Reads the CSV file into a dictionary of dictionaries:
{
"Metric_1": {
"Avg": value,
"Min": value,
"Max": value,
"Unit": "unit"
},
"Metric_2": { ... },
...
}
"""
metrics_data = {}
with open(csv_file_path, newline="") as csvfile:
reader = csv.DictReader(csvfile) # reads header from first line
for row in reader:
metric_name = row["Metric"].strip()
metrics_data[metric_name] = {
"Avg": float(row["Avg"]) if row["Avg"] else None,
"Min": float(row["Min"]) if row["Min"] else None,
"Max": float(row["Max"]) if row["Max"] else None,
"Unit": row["Unit"].strip() if row["Unit"] else None,
}
return metrics_data
soc = gpu_soc()
@pytest.mark.L1_cache
def test_L1_cache_counters(
binary_handler_profile_rocprof_compute, binary_handler_analyze_rocprof_compute
):
if not soc or "MI300" not in soc:
pytest.skip("Skipping L1 cache test for non-mi300 socs.")
# set up two apps: sequential and random access
app_names = ["vseq", "vrand"]
options = ["-b", "TCP"]
result = {}
metrics = ["Read Req", "Write Req", "Cache Hit Rate"]
base = Path(test_utils.get_output_dir())
for app_name in app_names:
workload_dir = str(base / app_name)
# 1. profile the app
return_code = binary_handler_profile_rocprof_compute(
config,
workload_dir,
options,
check_success=False,
roof=False,
app_name=app_name
)
assert return_code == 0
# 2. analyze the results
return_code = binary_handler_analyze_rocprof_compute(
["analyze", "--path", workload_dir, "-b", "16.3", "--save-dfs", workload_dir]
)
assert return_code == 0
# 3. save results in local
# FIXME: customize file name to avoid hardcode
csv_path = workload_dir + "/16.3_L1D_Cache_Accesses.csv"
data = load_metrics(csv_path)
for metric in metrics:
if app_name not in result or not isinstance(result[app_name], dict):
result[app_name] = {}
result[app_name][metric] = data[metric]["Avg"]
# 4. clean local output
test_utils.clean_output_dir(config["cleanup"], workload_dir)
test_utils.clean_output_dir(config["cleanup"], base)
# 5. check results are expected
# FIXME: use a range for comparison to account for different results
assert result["vseq"]["Cache Hit Rate"] >= result["vrand"]["Cache Hit Rate"]
assert result["vseq"]["Read Req"] <= result["vrand"]["Read Req"]
assert result["vseq"]["Write Req"] <= result["vrand"]["Write Req"]