attach/detach: change workload of unit test to accommodate SDK's current limitation (#1169)

* add dual modes to the dynamic_shared workload, one of which removes the
sleeping, and set ROCP_TOOL_ATTACH=1 when running the workload

* add comment in dynamic_shared.hip to explain how to use argv

* refactor the attach/detach profiling time in unit tests
此提交包含在:
ywang103-amd
2025-09-30 16:16:43 -04:00
提交者 GitHub
父節點 f45c8d5f6b
當前提交 eeeaa06159
共有 2 個檔案被更改,包括 134 行新增51 行删除
+59 -43
查看文件
@@ -77,82 +77,87 @@ std::vector<float> matrix_transpose_reference(const std::vector<float>& input,
return output;
}
int main()
// argv: Array of command-line arguments. Run with "--enable-sleep" to enable
// the mode with delay implemented by thread sleep
int main(int argc, char* argv[])
{
// Number of rows and columns in the transposed square matrix.
bool enable_sleep = false;
// Check command-line arguments
for(int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
if(arg == "--enable-sleep")
{
enable_sleep = true;
}
}
constexpr unsigned int width = 4;
// Number of threads in each kernel block along the X dimension.
// Because each thread will process exactly one element, this value
// is equal to the width of the matrix.
constexpr unsigned int threads_per_block_x = width;
// Number of threads in each kernel block along the Y dimension.
// Because each thread will process exactly one element, this value
// is equal to the width of the matrix.
constexpr unsigned int threads_per_block_y = width;
// Total element count of the transposed matrix.
constexpr unsigned int size = width * width;
// Total size (in bytes) of the transposed matrix.
constexpr size_t size_bytes = sizeof(float) * size;
// Total amount of shared memory that each block is going to use.
// Exactly one matrix will be stored in shared memory.
constexpr size_t shared_memory_bytes = size_bytes;
std::cout << "Run transpose continuously" << std::endl;
// Set a timer to 30 seconds for rocprofv3 preparation
std::this_thread::sleep_for(std::chrono::seconds(30));
if(enable_sleep)
std::this_thread::sleep_for(std::chrono::seconds(30));
unsigned int pass_count = 0;
unsigned int fail_count = 0;
unsigned int cycle_count = 0;
constexpr float eps = 1.0E-6f;
while (true)
{
std::this_thread::sleep_for(std::chrono::seconds(5));
// Allocate host vectors.
if(enable_sleep)
std::this_thread::sleep_for(std::chrono::seconds(5));
// Allocate host vectors
std::vector<float> h_matrix(size);
std::vector<float> h_transposed_matrix(size);
// Set up input data.
// Set up input data
for(unsigned int i = 0; i < size; i++)
{
h_matrix[i] = i * 10.0f;
}
// Allocate device memory for the input and output matrices.
// Allocate device memory
float* d_matrix{};
float* d_transposed_matrix{};
HIP_CHECK(hipMalloc(&d_matrix, size_bytes));
HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes));
// Transfer the input matrix to the device memory.
// Copy input to device
HIP_CHECK(hipMemcpy(d_matrix, h_matrix.data(), size_bytes, hipMemcpyHostToDevice));
// Launching kernel from host.
// Launch kernel
matrix_transpose_kernel<<<dim3(width / threads_per_block_x, width / threads_per_block_y),
dim3(threads_per_block_x, threads_per_block_y),
shared_memory_bytes,
hipStreamDefault>>>(d_transposed_matrix, d_matrix, width);
dim3(threads_per_block_x, threads_per_block_y),
shared_memory_bytes,
hipStreamDefault>>>(d_transposed_matrix, d_matrix, width);
// Check if the kernel launch was successful.
HIP_CHECK(hipGetLastError());
// Transfer the result back to the host.
// Copy result back
HIP_CHECK(hipMemcpy(h_transposed_matrix.data(),
d_transposed_matrix,
size_bytes,
hipMemcpyDeviceToHost));
// Free the resources on the device.
// Free device memory
HIP_CHECK(hipFree(d_matrix));
HIP_CHECK(hipFree(d_transposed_matrix));
// Perform the reference (CPU) calculation.
// CPU reference transpose
std::vector<float> ref_transposed_matrix = matrix_transpose_reference(h_matrix, width);
// Check the results' validity.
constexpr float eps = 1.0E-6f;
unsigned int errors{};
// Validate
unsigned int errors = 0;
for(unsigned int i = 0; i < size; i++)
{
if(std::fabs(h_transposed_matrix[i] - ref_transposed_matrix[i]) > eps)
@@ -161,14 +166,25 @@ int main()
}
}
if(errors != 0)
{
std::cout << "Validation failed. Errors: " << errors << std::endl;
return error_exit_code;
}
// Update pass/fail counters
if(errors == 0)
pass_count++;
else
fail_count++;
cycle_count++;
// Every 10000 cycles, print summary and reset counters
if(cycle_count == 10000)
{
std::cout << "Validation passed." << std::endl;
std::cout << "10000 Validation cycles completed: "
<< "Passes = " << pass_count
<< ", Failures = " << fail_count << std::endl;
// Reset counters
cycle_count = 0;
pass_count = 0;
fail_count = 0;
}
}
}
}
+75 -8
查看文件
@@ -71,6 +71,9 @@ config["METRIC_LOGGING"] = False
num_kernels = 3
num_devices = 1
attach_detach_interval_msec_no_delay = 10000
attach_detach_interval_msec_with_delay = 60000
DEFAULT_ABS_DIFF = 15
DEFAULT_REL_DIFF = 50
MAX_REOCCURING_COUNT = 28
@@ -1778,14 +1781,75 @@ def test_pc_sampling_stochastic(binary_handler_profile_rocprof_compute):
def test_live_attach_detach_block(binary_handler_profile_rocprof_compute):
options = ["--block", "3.1.1", "4.1.1", "5.1.1"]
workload_dir = test_utils.get_output_dir()
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"])
# TODO: temporary fix because the SDK disables attach/detach by default;
# remove once the SDK enables it by default
env = os.environ.copy()
env["ROCP_TOOL_ATTACH"] = "1"
# set the time to detach here to 1 mins, which is 60000 msec
time_to_detach = "60000"
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
attach_detach = dict()
attach_detach["attach_pid"] = process_workload.pid
attach_detach["attach-duration-msec"] = time_to_detach
attach_detach["attach-duration-msec"] = attach_detach_interval_msec_no_delay
_ = binary_handler_profile_rocprof_compute(
config,
workload_dir,
options,
check_success=True,
roof=False,
app_name="app_hip_dynamic_shared",
attach_detach_para=attach_detach,
)
# kill the process of the workload at this point if it's still running
if process_workload.poll() is None:
print(
f"rocprof-compute has detached and finished, "
f"killing workload process (pid={process_workload.pid})..."
)
process_workload.kill()
process_workload.wait()
file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)
validate(
inspect.stack()[0][3],
workload_dir,
file_dict,
)
assert test_utils.check_file_pattern(
"- 3.1.1", f"{workload_dir}/profiling_config.yaml"
)
assert test_utils.check_file_pattern(
"- 4.1.1", f"{workload_dir}/profiling_config.yaml"
)
assert test_utils.check_file_pattern(
"- 5.1.1", f"{workload_dir}/profiling_config.yaml"
)
test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.skip(
reason="Temporarily disabled: \
waiting for SDK fix for no outputfile with thread sleeping"
)
@pytest.mark.live_attach_detach
def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_compute):
options = ["--block", "3.1.1", "4.1.1", "5.1.1"]
workload_dir = test_utils.get_output_dir()
# TODO: temporary fix because the SDK disables attach/detach by default;
# remove once the SDK enables it by default
env = os.environ.copy()
env["ROCP_TOOL_ATTACH"] = "1"
process_workload = subprocess.Popen(
[config["app_hip_dynamic_shared"], "--enable-sleep"], env=env
)
attach_detach = dict()
attach_detach["attach_pid"] = process_workload.pid
attach_detach["attach-duration-msec"] = attach_detach_interval_msec_with_delay
_ = binary_handler_profile_rocprof_compute(
config,
@@ -1831,14 +1895,17 @@ def test_live_attach_detach_singlepath_launch_stats(
):
options = ["--set", "launch_stats"]
workload_dir = test_utils.get_output_dir()
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"])
# set the time to detach here to 1 mins, which is 60000 msec
time_to_detach = "60000"
# TODO: temporary fix because the SDK disables attach/detach by default;
# remove once the SDK enables it by default
env = os.environ.copy()
env["ROCP_TOOL_ATTACH"] = "1"
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
attach_detach = dict()
attach_detach["attach_pid"] = process_workload.pid
attach_detach["attach-duration-msec"] = time_to_detach
attach_detach["attach-duration-msec"] = attach_detach_interval_msec_no_delay
_ = binary_handler_profile_rocprof_compute(
config,