Files
rocm-systems/tests/test_TCP_counters.py
T
vedithal-amd f9aa7be97c Support MI 350 profiling (#632)
* Add MI 350 hardware information

* Refactor MI GPU YAML file and corresponding interface

* Add SoC file for gfx950 architecture

* Add analysis report configs for MI 350 containing existing metrics

* Add placeholder None valued metrics for previous architectures to make
  baseline comparison work

* Enable testing on MI 350

* Analysis config metric changes
    - SPI changes
        - Update metric formula for default SPI pipe counter
             - Use efficiently collected pipe wise SPI counters
        - Add SPI Wave Occupancy
        - Add Scheduler-Pipe Wave Utilization
        - Update formula for VGPR Writes
        - Add Scheduler-Pipe FIFO Full Rate
   - CPC changes
	- Add CPC SYNC FIFO Full Rate
	- Add CPC CANE Stall Rate
        - Add CPC ADC Utilization
   - SQ changes
        - Add VALU co-issue efficiency
        - Add F6F4 datatype metrics
        - Update formula for total FLOPs by adding F6F4 counters
        - Add LDS STORE / LOAD / ATOMIC metrics
        - Add LDS STORE / LOAD / ATOMIC bandwidth
        - Add LDS FIFO and TA ADDR / CMD / DATA FIFO full rates

* Collect TCP_TCP_LATENCY_sum only for gfx950 (MI 350)

* Do not inject SQ_ACCUM_PREV_HIRES unnecesarily

* Do not hardcode memory and shader clock speeds

* Write num_hbm_channels to sysinfo.csv instead of hbm_bw while profiling

* Move generate sysinfo.csv to pre processing step of profiling

* Add warnings to use --specs-correction for missing sysinfo.csv values during analysis phase

* Update CHANGELOG

* Analysis phase warning to use --specs-correction when needed
2025-04-03 02:21:18 -04:00

170 строки
5.0 KiB
Python

import csv
import inspect
import os
import re
import shutil
import subprocess
import sys
from importlib.machinery import SourceFileLoader
from pathlib import Path
from unittest.mock import patch
import pandas as pd
import pytest
import test_utils
rocprof_compute = SourceFileLoader("rocprof-compute", "src/rocprof-compute").load_module()
config = {}
config["vseq"] = ["./tests/vsequential_access"]
config["vrand"] = ["./tests/vrandom_access"]
config["cleanup"] = True
config["COUNTER_LOGGING"] = False
config["METRIC_COMPARE"] = False
config["METRIC_LOGGING"] = False
SUPPORTED_ARCHS = {
"gfx940": {"mi300": ["MI300A_A0"]},
"gfx941": {"mi300": ["MI300X_A0"]},
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
}
MI300_CHIP_IDS = {
"29856": "MI300A_A1",
"29857": "MI300X_A1",
"29858": "MI308X",
}
def run(cmd):
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if cmd[0] == "amd-smi" and p.returncode == 8:
print("ERROR: No GPU detected. Unable to load amd-smi")
assert 0
return p.stdout.decode("ascii")
def gpu_soc():
## 1) Parse arch details from rocminfo
rocminfo = str(
# decode with utf-8 to account for rocm-smi changes in latest rocm
subprocess.run(
["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
).stdout.decode("utf-8")
)
rocminfo = rocminfo.split("\n")
soc_regex = re.compile(r"^\s*Name\s*:\s+ ([a-zA-Z0-9]+)\s*$", re.MULTILINE)
devices = list(filter(soc_regex.match, rocminfo))
gpu_arch = devices[0].split()[1]
if not gpu_arch in SUPPORTED_ARCHS.keys():
return None
## 2) Parse chip id from rocminfo
chip_id = re.compile(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", re.MULTILINE)
ids = list(filter(chip_id.match, rocminfo))
for id in ids:
chip_id = re.match(r"^[^()]+", id.split()[2]).group(0)
## 3) Deduce gpu model name from arch
gpu_model = list(SUPPORTED_ARCHS[gpu_arch].keys())[0].upper()
if gpu_model == "MI300":
if chip_id in MI300_CHIP_IDS:
gpu_model = MI300_CHIP_IDS[chip_id]
else:
return None
return gpu_model
def load_metrics(csv_file_path):
"""
Reads the CSV file into a dictionary of dictionaries:
{
"Metric_1": {
"Avg": value,
"Min": value,
"Max": value,
"Unit": "unit"
},
"Metric_2": { ... },
...
}
"""
metrics_data = {}
with open(csv_file_path, newline="") as csvfile:
reader = csv.DictReader(csvfile) # reads header from first line
for row in reader:
metric_name = row["Metric"].strip()
metrics_data[metric_name] = {
"Avg": float(row["Avg"]) if row["Avg"] else None,
"Min": float(row["Min"]) if row["Min"] else None,
"Max": float(row["Max"]) if row["Max"] else None,
"Unit": row["Unit"].strip() if row["Unit"] else None,
}
return metrics_data
soc = gpu_soc()
@pytest.mark.L1_cache
def test_L1_cache_counters(
binary_handler_profile_rocprof_compute, binary_handler_analyze_rocprof_compute
):
if not soc or "MI300" not in soc:
pytest.skip("Skipping L1 cache test for non-mi300 socs.")
# set up two apps: sequential and random access
app_names = ["vseq", "vrand"]
options = ["-b", "TCP"]
result = {}
metrics = ["Read Req", "Write Req", "Cache Hit Rate"]
base = Path(test_utils.get_output_dir())
for app_name in app_names:
workload_dir = str(base / app_name)
# 1. profile the app
return_code = binary_handler_profile_rocprof_compute(
config,
workload_dir,
options,
check_success=False,
roof=False,
app_name=app_name,
)
assert return_code == 0
# 2. analyze the results
return_code = binary_handler_analyze_rocprof_compute(
["analyze", "--path", workload_dir, "-b", "16.3", "--save-dfs", workload_dir]
)
assert return_code == 0
# 3. save results in local
# FIXME: customize file name to avoid hardcode
csv_path = workload_dir + "/16.3_L1D_Cache_Accesses.csv"
data = load_metrics(csv_path)
for metric in metrics:
if app_name not in result or not isinstance(result[app_name], dict):
result[app_name] = {}
result[app_name][metric] = data[metric]["Avg"]
# 4. clean local output
test_utils.clean_output_dir(config["cleanup"], workload_dir)
test_utils.clean_output_dir(config["cleanup"], base)
# 5. check results are expected
# FIXME: use a range for comparison to account for different results
assert result["vseq"]["Cache Hit Rate"] >= result["vrand"]["Cache Hit Rate"]
assert result["vseq"]["Read Req"] <= result["vrand"]["Read Req"]
assert result["vseq"]["Write Req"] <= result["vrand"]["Write Req"]