Files
rocm-systems/tests/pytest-packages/pc_sampling/exec_mask_manipulation/csv.py
T
Indic, Vladimir 49ce79a5b5 [SDK][rocprofv3] MI300 Stochastic PC sampling (#92)
* MI300 Stochastic PC sampling SDK API implementation

* ROCProfV3: Stochastic PC sampling Support (#94)

* ROCProfV3: MI300 Stochastic PC sampling initial draft

* ROCProfV3: Initial Stochastic PC sampling Tests (#95)

ROCProfV3: Initial Stochastic PC sampling tests

* Update rocprofiler_pc_sampling_record_stochastic_v0_t

- update doxygen docs for members
- replace rocprofiler_correlation_id_t with rocprofiler_async_correlation_id_t

* Relax the check in JSON tests

* drain PC sampling buffer during finalize_rocprofv3

* Increase timeout for "Test Install Build" step

- 10 minutes -> 20 minutes
- "Test Installed Packages" has 20 minutes so "Test Install Build" should also

---------

Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
2025-03-21 14:40:45 -05:00

211 baris
8.4 KiB
Python

# MIT License
#
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from __future__ import absolute_import
import numpy as np
import pandas as pd
def stochastic_assert(df, df_condition_selection, max_failing_samples=10):
    """Assert that at most ``max_failing_samples`` rows of ``df`` violate a condition.

    Args:
        df: DataFrame holding the samples under test.
        df_condition_selection: Boolean mask over the rows of ``df``; True marks
            a row that satisfies the condition.
        max_failing_samples: Largest number of violating rows that is tolerated.

    Raises:
        AssertionError: If more than ``max_failing_samples`` rows violate the
            condition. The message reports the number of violating rows, the
            total number of rows and the tolerated limit.
    """
    # TODO: When asserting certain conditions related to exec_masks for all samples,
    # we observe some failures.
    # This usually happens because some small number of samples (e.g., 1-10 out of 100k)
    # do not satisfy the condition. This is either a regression in the ROCr 2nd level trap
    # handler (as sometimes execution mask or correlation ID mismatches), or
    # just stochastic nature of the sampling (meaning our checks are too strict).
    # To relax checks, we introduce an assertion that will allow some small number
    # of samples to disobey the condition.
    # This is a temporary solution until we find the root cause of the issue.

    # extract the failing samples
    failing_samples = df[~df_condition_selection]
    num_failing = len(failing_samples)
    assert (
        num_failing <= max_failing_samples
    ), "Too many failing samples: {} of {} failed the condition (at most {} allowed)".format(
        num_failing, len(df), max_failing_samples
    )
# Keep this in case we decide to revert workgroup_id information
def validate_workgoup_id_x_y_z(df, max_x, max_y, max_z):
    """Check that each ``Workgroup_Size_*`` column lies within ``[0, max]``.

    Args:
        df: DataFrame with ``Workgroup_Size_X``, ``Workgroup_Size_Y`` and
            ``Workgroup_Size_Z`` columns whose values are convertible to int.
        max_x: Inclusive upper bound for the X column.
        max_y: Inclusive upper bound for the Y column.
        max_z: Inclusive upper bound for the Z column.

    Raises:
        AssertionError: If any value is negative or exceeds its bound.
    """
    column_limits = (
        ("Workgroup_Size_X", max_x),
        ("Workgroup_Size_Y", max_y),
        ("Workgroup_Size_Z", max_z),
    )
    # Columns are checked in X, Y, Z order; lower bound first, then upper bound.
    for column, upper_bound in column_limits:
        values = df[column].astype(int)
        assert (values >= 0).all()
        assert (values <= upper_bound).all()
# Keep this in case we decide to revert wave_id information
def validate_wave_id(df, max_wave_id):
    """Check that no value in the ``Wave_Id`` column exceeds ``max_wave_id``.

    Raises:
        AssertionError: If any ``Wave_Id`` (converted to int) is greater than
            ``max_wave_id``.
    """
    wave_ids = df["Wave_Id"].astype(int)
    above_limit = wave_ids > max_wave_id
    assert not above_limit.any()
# Keep this in case we decide to revert chiplet information
def validate_chiplet(df, max_chiplet):
    """Check that no value in the ``Chiplet`` column exceeds ``max_chiplet``.

    Raises:
        AssertionError: If any ``Chiplet`` (converted to int) is greater than
            ``max_chiplet``.
    """
    chiplets = df["Chiplet"].astype(int)
    above_limit = chiplets > max_chiplet
    assert not above_limit.any()
def validate_instruction_decoding(
    df,
    inst_str,
    exec_mask_uint64: np.uint64 = None,
    source_code_lines_range: (int, int) = None,
    all_source_lines_samples=False,
):
    """Validate the samples whose instruction text starts with ``inst_str``.

    Args:
        df: DataFrame of samples. The ``Instruction`` column is always read;
            ``Exec_Mask`` and ``Instruction_Comment`` are read only by the
            optional checks below. ``df`` itself is not modified.
        inst_str: Instruction prefix selecting the samples to validate.
        exec_mask_uint64: When not None, the exec mask expected for the selected
            samples. The comparison goes through ``stochastic_assert``, so a
            small number of mismatching samples is tolerated.
        source_code_lines_range: When not None, an inclusive
            ``(first_line, last_line)`` pair that the source line of every
            selected sample must fall into.
        all_source_lines_samples: When True (and a range is given), every line
            of the range must appear in at least one selected sample.

    Raises:
        AssertionError: If no sample matches ``inst_str`` or a requested check
            fails.
    """
    # Vectorized prefix match. ``na=False`` makes rows with a missing
    # instruction count as "no match" instead of raising.
    matches_prefix = df["Instruction"].str.startswith(inst_str, na=False)
    # Make a copy, so that we don't work on (modify) a view.
    df_inst = df[matches_prefix].copy()
    assert not df_inst.empty, "No samples found for instruction prefix {!r}".format(
        inst_str
    )

    # assert the exec mask if requested
    if exec_mask_uint64 is not None:
        stochastic_assert(
            df_inst, df_inst["Exec_Mask"].astype(np.uint64) == exec_mask_uint64
        )

    # assert whether the samples' source code lines belong to the provided range
    if source_code_lines_range is not None:
        start_range, end_range = source_code_lines_range
        # The instruction comment is usually in the following format:
        # /path/to/source/file.cpp:line_num
        df_inst["source_line_num"] = (
            df_inst["Instruction_Comment"].str.rsplit(":", n=1).str[-1].astype(int)
        )
        line_nums = df_inst["source_line_num"]
        assert (
            line_nums >= start_range
        ).all(), "Sample found before source line {}".format(start_range)
        assert (
            line_nums <= end_range
        ).all(), "Sample found after source line {}".format(end_range)

        # if requested, check if all lines from the range are sampled
        if all_source_lines_samples:
            expected_lines = end_range - start_range + 1
            sampled_lines = line_nums.nunique()
            assert (
                sampled_lines == expected_lines
            ), "Only {} of {} source lines were sampled".format(
                sampled_lines, expected_lines
            )
def validate_instruction_comment(df):
    """Check that every sample carries a non-empty instruction comment.

    Raises:
        AssertionError: If any ``Instruction_Comment`` value equals ``""`` or
            ``"nullptr"``.
    """
    # Instruction comment must always be present, since the testing application
    # is built with debug symbols.
    comments = df["Instruction_Comment"]
    is_missing = (comments == "") | (comments == "nullptr")
    assert not is_missing.any()
def validate_instruction_correlation_id_relation(df):
    """Check that ``Correlation_Id`` is 0 exactly for undecoded instructions.

    A sample counts as undecoded when its ``Instruction`` value is ``""`` or
    ``"nullptr"``. The function also checks that ``Correlation_Id`` equals
    ``Dispatch_Id`` on every row.

    Raises:
        AssertionError: If any of the checks fails.
    """
    # Samples with no decoded instruction originate from either blit kernels
    # or self-modifying code, and their correlation ID should always be zero.
    # Thus, Correlation_Id is 0 if and only if the instruction is not decoded.
    instructions = df["Instruction"]
    is_undecoded = (instructions == "") | (instructions == "nullptr")
    has_zero_cid = df["Correlation_Id"] == 0

    # Implication 1: If the instruction is not decoded, then correlation id is 0.
    assert has_zero_cid[is_undecoded].all()

    # Implication 2: If the correlation id is 0, then the instruction is not decoded.
    assert is_undecoded[has_zero_cid].all()

    # Both selections must contain the same number of samples.
    assert is_undecoded.sum() == has_zero_cid.sum()

    # Since we're not enabling any kind of API tracing,
    # internal correlation id should match the dispatch id
    ids_match = df["Correlation_Id"] == df["Dispatch_Id"]
    assert ids_match.all()
def validate_exec_mask_based_on_correlation_id(df):
    """Check that each sample's exec mask matches its correlation ID.

    Two checks are run through ``stochastic_assert``: the number of set bits
    in ``Exec_Mask`` equals ``Correlation_Id``, and ``Exec_Mask`` equals the
    value whose lowest ``Correlation_Id`` bits are set.

    Note:
        Adds the ``active_SIMD_threads`` and ``Exec_Mask2`` columns to ``df``
        in place.
    """

    def count_set_bits(exec_mask):
        return bin(exec_mask).count("1")

    def lowest_bits_set(cid):
        return int("0b" + (cid * "1"), 2)

    # The function assumes that each kernel launches 1024 blocks.
    # Each block contains a number of threads that matches the correlation ID
    # of the kernel. The exec mask of a sample should contain a number of ones
    # equal to the correlation ID of the kernel during whose execution the
    # sample was generated.
    df["active_SIMD_threads"] = df["Exec_Mask"].apply(count_set_bits)
    popcount_matches = df["active_SIMD_threads"] == df["Correlation_Id"]
    stochastic_assert(df, popcount_matches)

    # TODO: Comment out the following code if it causes spurious fails.
    # The more conservative constraint based on the experience follows.
    # The exec mask of sampled instructions of the kernels respect the following pattern:
    # cid -> exec
    # 1 -> 0b1
    # 2 -> 0b11
    # 3 -> 0b111
    # ...
    # 64 -> 0xffffffffffffffff
    df["Exec_Mask2"] = df["Correlation_Id"].astype(int).apply(lowest_bits_set)

    # TODO: exec should be in hex and that will ease the comparison
    observed_mask = df["Exec_Mask"].astype(np.uint64)
    expected_mask = df["Exec_Mask2"].astype(np.uint64)
    stochastic_assert(df, observed_mask == expected_mask)
def exec_mask_manipulation_validate_csv(df, all_sampled=False):
    """Run all CSV checks for the exec-mask-manipulation test.

    Args:
        df: DataFrame with the samples; must not be empty.
        all_sampled: When True, additionally require that all 65 correlation
            IDs appear and that every source line of the checked ranges is
            sampled.

    Raises:
        AssertionError: If ``df`` is empty or any check fails.
    """
    assert not df.empty

    validate_instruction_comment(df)
    validate_instruction_correlation_id_relation(df)

    # Validate samples with non-zero correlation IDs (and with decoded instructions)
    nonzero_cid_df = df[df["Correlation_Id"] != 0]
    nonzero_cids = nonzero_cid_df["Correlation_Id"].astype(int)

    # exactly 65 kernels and 65 correlation id
    assert (nonzero_cids >= 1).all()
    assert (nonzero_cids <= 65).all()

    if all_sampled:
        # all correlation IDs must be sampled
        assert nonzero_cids.nunique() == 65

    in_first_64_kernels = nonzero_cid_df["Correlation_Id"] <= 64
    # Pass a copy, so that the callee does not work on (modify) a view.
    validate_exec_mask_based_on_correlation_id(
        nonzero_cid_df[in_first_64_kernels].copy()
    )

    # validate the last kernel
    kernel_65_df = df[df["Correlation_Id"] == 65]
    last_kernel_checks = (
        # v_rcp_f64 is executed by even SIMD threads
        ("v_rcp_f64", "5555555555555555", (288, 387)),
        # v_rcp_f32 is executed by odd SIMD threads
        ("v_rcp_f32", "AAAAAAAAAAAAAAAA", (391, 490)),
    )
    # assert that both instruction kinds are properly decoded
    for inst_prefix, hex_exec_mask, source_lines in last_kernel_checks:
        validate_instruction_decoding(
            kernel_65_df,
            inst_prefix,
            exec_mask_uint64=np.uint64(int(hex_exec_mask, 16)),
            source_code_lines_range=source_lines,
            all_source_lines_samples=all_sampled,
        )