Add mi300 TCP counter tests (#644)
* Add new sample applications. * Generalize py test launcher for additional apps. * Add TCP pytest, and add to ctest. * Update licensing. * Disable for non-mi300 machines.
Этот коммит содержится в:
@@ -285,6 +285,18 @@ add_test(
|
||||
${PROJECT_SOURCE_DIR}/tests/test_analyze_workloads.py
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
|
||||
|
||||
# ---------------------------
|
||||
# TCP counter tests
|
||||
# ---------------------------
|
||||
|
||||
add_test(
|
||||
NAME test_L1_cache_counters
|
||||
COMMAND
|
||||
${Python3_EXECUTABLE} -m pytest -m L1_cache
|
||||
--junitxml=tests/test_TCP_counters.xml ${COV_OPTION}
|
||||
${PROJECT_SOURCE_DIR}/tests/test_TCP_counters.py
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
|
||||
|
||||
# ---------
|
||||
# Install
|
||||
# ---------
|
||||
|
||||
@@ -70,4 +70,5 @@ markers = [
|
||||
"col",
|
||||
"kernel_verbose",
|
||||
"serial",
|
||||
"L1_cache",
|
||||
]
|
||||
|
||||
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
##############################################################################bl
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
##############################################################################el
|
||||
|
||||
|
||||
|
||||
An example code to execute random access to explore cache hits/misses in L2 Cache.
|
||||
*/
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#define HIP_ASSERT(x) (assert((x) == hipSuccess))
|
||||
|
||||
// Kernel: random access, each thread picks a random index
|
||||
__global__ void randomAccessKernel(int *d_data, int N, unsigned int *d_seeds)
|
||||
{
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < N)
|
||||
{
|
||||
unsigned int seed = d_seeds[tid];
|
||||
// Simple XORShift
|
||||
seed ^= (seed << 13);
|
||||
seed ^= (seed >> 17);
|
||||
seed ^= (seed << 5);
|
||||
int idx = seed % N;
|
||||
d_data[idx] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
hipError_t hip_status;
|
||||
|
||||
const int N = 1 << 24; // Try 16M elements to exceed cache
|
||||
size_t size = N * sizeof(int);
|
||||
|
||||
// Host memory
|
||||
std::vector<int> h_data(N, 0);
|
||||
std::vector<unsigned int> h_seeds(N);
|
||||
|
||||
// Generate seeds
|
||||
srand(time(nullptr));
|
||||
for (int i = 0; i < N; ++i)
|
||||
{
|
||||
// Keep them diverse. Could be random or based on i
|
||||
h_seeds[i] = rand();
|
||||
}
|
||||
|
||||
// Allocate device memory
|
||||
int *d_data;
|
||||
unsigned int *d_seeds;
|
||||
HIP_ASSERT(hipMalloc(&d_data, size));
|
||||
HIP_ASSERT(hipMalloc(&d_seeds, N * sizeof(unsigned int)));
|
||||
|
||||
// Copy h_data to device
|
||||
HIP_ASSERT(hipMemcpy(d_data, h_data.data(), size, hipMemcpyHostToDevice));
|
||||
HIP_ASSERT(hipMemcpy(d_seeds, h_seeds.data(), N * sizeof(unsigned int), hipMemcpyHostToDevice));
|
||||
|
||||
// Configure kernel
|
||||
dim3 blockSize(64);
|
||||
dim3 gridSize((N + blockSize.x - 1) / blockSize.x);
|
||||
|
||||
// Launch kernel
|
||||
hipLaunchKernelGGL(randomAccessKernel, gridSize, blockSize, 0, 0, d_data, N, d_seeds);
|
||||
hip_status = hipDeviceSynchronize();
|
||||
|
||||
HIP_ASSERT(hipMemcpy(h_data.data(), d_data, size, hipMemcpyDeviceToHost));
|
||||
|
||||
// Cleanup
|
||||
HIP_ASSERT(hipFree(d_data));
|
||||
HIP_ASSERT(hipFree(d_seeds));
|
||||
;
|
||||
|
||||
std::cout << "RandomAccess HIP test completed.\n";
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
##############################################################################bl
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
##############################################################################el
|
||||
|
||||
|
||||
|
||||
An example code to execute sequential access to explore cache hits/misses in L2 Cache.
|
||||
*/
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <iostream>
|
||||
|
||||
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
|
||||
|
||||
// Kernel: sequential access, each thread reads/writes an element in order
|
||||
__global__ void sequentialAccessKernel(int *d_data, int N)
|
||||
{
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < N)
|
||||
{
|
||||
d_data[tid] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
hipError_t hip_status;
|
||||
|
||||
const int N = 1 << 20; // 1M elements
|
||||
size_t size = N * sizeof(int);
|
||||
// Allocate host memory
|
||||
int *h_data = (int *)malloc(size);
|
||||
std::fill_n(h_data, N, 0);
|
||||
|
||||
// Allocate device memory
|
||||
int *d_data;
|
||||
HIP_ASSERT(hipMalloc(&d_data, size));
|
||||
|
||||
// Copy h_data to device
|
||||
HIP_ASSERT(hipMemcpy(d_data, h_data, size, hipMemcpyHostToDevice));
|
||||
|
||||
// Configure kernel
|
||||
dim3 blockSize(64);
|
||||
dim3 gridSize((N + blockSize.x - 1) / blockSize.x);
|
||||
|
||||
// Launch kernel
|
||||
hipLaunchKernelGGL(sequentialAccessKernel, gridSize, blockSize, 0, 0, d_data, N);
|
||||
hip_status = hipDeviceSynchronize();
|
||||
|
||||
// Copy back to host
|
||||
HIP_ASSERT(hipMemcpy(h_data, d_data, size, hipMemcpyDeviceToHost));
|
||||
|
||||
// Cleanup
|
||||
HIP_ASSERT(hipFree(d_data));
|
||||
free(h_data);
|
||||
|
||||
std::cout << "SequentialAccess HIP test completed.\n";
|
||||
return 0;
|
||||
}
|
||||
@@ -18,3 +18,15 @@ set(VMEM_SOURCES ../sample/vmem.hip)
|
||||
set_source_files_properties(${VMEM_SOURCES} PROPERTIES LANGUAGE HIP)
|
||||
add_executable(vmem ${VMEM_SOURCES})
|
||||
set_target_properties(vmem PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests)
|
||||
|
||||
set(VSEQ_SOURCES ../sample/vsequential_access.cpp)
|
||||
set_source_files_properties(${VSEQ_SOURCES} PROPERTIES LANGUAGE HIP)
|
||||
add_executable(vsequential_access ${VSEQ_SOURCES})
|
||||
set_target_properties(vsequential_access PROPERTIES RUNTIME_OUTPUT_DIRECTORY
|
||||
${CMAKE_SOURCE_DIR}/tests)
|
||||
|
||||
set(VRAND_SOURCES ../sample/vrandom_access.cpp)
|
||||
set_source_files_properties(${VRAND_SOURCES} PROPERTIES LANGUAGE HIP)
|
||||
add_executable(vrandom_access ${VRAND_SOURCES})
|
||||
set_target_properties(vrandom_access PROPERTIES RUNTIME_OUTPUT_DIRECTORY
|
||||
${CMAKE_SOURCE_DIR}/tests)
|
||||
|
||||
@@ -18,13 +18,15 @@ def pytest_addoption(parser):
|
||||
|
||||
@pytest.fixture
|
||||
def binary_handler_profile_rocprof_compute(request):
|
||||
def _handler(config, workload_dir, options=[], check_success=True, roof=False):
|
||||
def _handler(
|
||||
config, workload_dir, options=[], check_success=True, roof=False, app_name="app_1"
|
||||
):
|
||||
if request.config.getoption("--call-binary"):
|
||||
baseline_opts = [
|
||||
"build/rocprof-compute.bin",
|
||||
"profile",
|
||||
"-n",
|
||||
"app_1",
|
||||
app_name,
|
||||
"-VVV",
|
||||
]
|
||||
if not roof:
|
||||
@@ -33,7 +35,7 @@ def binary_handler_profile_rocprof_compute(request):
|
||||
baseline_opts
|
||||
+ options
|
||||
+ ["--path", workload_dir, "--"]
|
||||
+ config["app_1"],
|
||||
+ config[app_name],
|
||||
text=True,
|
||||
)
|
||||
# verify run status
|
||||
@@ -41,7 +43,7 @@ def binary_handler_profile_rocprof_compute(request):
|
||||
assert process.returncode == 0
|
||||
return process.returncode
|
||||
else:
|
||||
baseline_opts = ["rocprof-compute", "profile", "-n", "app_1", "-VVV"]
|
||||
baseline_opts = ["rocprof-compute", "profile", "-n", app_name, "-VVV"]
|
||||
if not roof:
|
||||
baseline_opts.append("--no-roof")
|
||||
with pytest.raises(SystemExit) as e:
|
||||
@@ -50,7 +52,7 @@ def binary_handler_profile_rocprof_compute(request):
|
||||
baseline_opts
|
||||
+ options
|
||||
+ ["--path", workload_dir, "--"]
|
||||
+ config["app_1"],
|
||||
+ config[app_name],
|
||||
):
|
||||
rocprof_compute.main()
|
||||
# verify run status
|
||||
|
||||
@@ -0,0 +1,169 @@
|
||||
import csv
|
||||
import inspect
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from importlib.machinery import SourceFileLoader
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
import test_utils
|
||||
|
||||
rocprof_compute = SourceFileLoader("rocprof-compute", "src/rocprof-compute").load_module()
|
||||
|
||||
config = {}
|
||||
config["vseq"] = ["./tests/vsequential_access"]
|
||||
config["vrand"] = ["./tests/vrandom_access"]
|
||||
config["cleanup"] = True
|
||||
config["COUNTER_LOGGING"] = False
|
||||
config["METRIC_COMPARE"] = False
|
||||
config["METRIC_LOGGING"] = False
|
||||
|
||||
|
||||
SUPPORTED_ARCHS = {
|
||||
"gfx940": {"mi300": ["MI300A_A0"]},
|
||||
"gfx941": {"mi300": ["MI300X_A0"]},
|
||||
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
|
||||
}
|
||||
|
||||
MI300_CHIP_IDS = {
|
||||
"29856": "MI300A_A1",
|
||||
"29857": "MI300X_A1",
|
||||
"29858": "MI308X",
|
||||
}
|
||||
|
||||
|
||||
def run(cmd):
|
||||
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
if cmd[0] == "amd-smi" and p.returncode == 8:
|
||||
print("ERROR: No GPU detected. Unable to load amd-smi")
|
||||
assert 0
|
||||
return p.stdout.decode("ascii")
|
||||
|
||||
|
||||
def gpu_soc():
|
||||
## 1) Parse arch details from rocminfo
|
||||
rocminfo = str(
|
||||
# decode with utf-8 to account for rocm-smi changes in latest rocm
|
||||
subprocess.run(
|
||||
["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
).stdout.decode("utf-8")
|
||||
)
|
||||
rocminfo = rocminfo.split("\n")
|
||||
soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE)
|
||||
devices = list(filter(soc_regex.match, rocminfo))
|
||||
gpu_arch = devices[0].split()[1]
|
||||
|
||||
if not gpu_arch in SUPPORTED_ARCHS.keys():
|
||||
return None
|
||||
|
||||
## 2) Parse chip id from rocminfo
|
||||
chip_id = re.compile(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", re.MULTILINE)
|
||||
ids = list(filter(chip_id.match, rocminfo))
|
||||
for id in ids:
|
||||
chip_id = re.match(r"^[^()]+", id.split()[2]).group(0)
|
||||
|
||||
## 3) Deduce gpu model name from arch
|
||||
gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper()
|
||||
if gpu_model == "MI300":
|
||||
if chip_id in MI300_CHIP_IDS:
|
||||
gpu_model = MI300_CHIP_IDS[chip_id]
|
||||
else:
|
||||
return None
|
||||
|
||||
return gpu_model
|
||||
|
||||
|
||||
def load_metrics(csv_file_path):
|
||||
"""
|
||||
Reads the CSV file into a dictionary of dictionaries:
|
||||
{
|
||||
"Metric_1": {
|
||||
"Avg": value,
|
||||
"Min": value,
|
||||
"Max": value,
|
||||
"Unit": "unit"
|
||||
},
|
||||
"Metric_2": { ... },
|
||||
...
|
||||
}
|
||||
"""
|
||||
metrics_data = {}
|
||||
with open(csv_file_path, newline="") as csvfile:
|
||||
reader = csv.DictReader(csvfile) # reads header from first line
|
||||
|
||||
for row in reader:
|
||||
metric_name = row["Metric"].strip()
|
||||
metrics_data[metric_name] = {
|
||||
"Avg": float(row["Avg"]) if row["Avg"] else None,
|
||||
"Min": float(row["Min"]) if row["Min"] else None,
|
||||
"Max": float(row["Max"]) if row["Max"] else None,
|
||||
"Unit": row["Unit"].strip() if row["Unit"] else None,
|
||||
}
|
||||
return metrics_data
|
||||
|
||||
|
||||
soc = gpu_soc()
|
||||
|
||||
|
||||
@pytest.mark.L1_cache
|
||||
def test_L1_cache_counters(
|
||||
binary_handler_profile_rocprof_compute, binary_handler_analyze_rocprof_compute
|
||||
):
|
||||
if not soc or "MI300" not in soc:
|
||||
pytest.skip("Skipping L1 cache test for non-mi300 socs.")
|
||||
|
||||
# set up two apps: sequential and random access
|
||||
app_names = ["vseq", "vrand"]
|
||||
options = ["-b", "TCP"]
|
||||
|
||||
result = {}
|
||||
metrics = ["Read Req", "Write Req", "Cache Hit Rate"]
|
||||
base = Path(test_utils.get_output_dir())
|
||||
|
||||
for app_name in app_names:
|
||||
|
||||
workload_dir = str(base / app_name)
|
||||
|
||||
# 1. profile the app
|
||||
return_code = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=False,
|
||||
roof=False,
|
||||
app_name=app_name
|
||||
)
|
||||
assert return_code == 0
|
||||
|
||||
# 2. analyze the results
|
||||
return_code = binary_handler_analyze_rocprof_compute(
|
||||
["analyze", "--path", workload_dir, "-b", "16.3", "--save-dfs", workload_dir]
|
||||
)
|
||||
assert return_code == 0
|
||||
|
||||
# 3. save results in local
|
||||
|
||||
# FIXME: customize file name to avoid hardcode
|
||||
csv_path = workload_dir + "/16.3_L1D_Cache_Accesses.csv"
|
||||
data = load_metrics(csv_path)
|
||||
|
||||
for metric in metrics:
|
||||
if app_name not in result or not isinstance(result[app_name], dict):
|
||||
result[app_name] = {}
|
||||
result[app_name][metric] = data[metric]["Avg"]
|
||||
|
||||
# 4. clean local output
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
test_utils.clean_output_dir(config["cleanup"], base)
|
||||
|
||||
# 5. check results are expected
|
||||
|
||||
# FIXME: use a range for comparison to account for different results
|
||||
assert result["vseq"]["Cache Hit Rate"] >= result["vrand"]["Cache Hit Rate"]
|
||||
assert result["vseq"]["Read Req"] <= result["vrand"]["Read Req"]
|
||||
assert result["vseq"]["Write Req"] <= result["vrand"]["Write Req"]
|
||||
Ссылка в новой задаче
Block a user