From eeeaa061597718fa4deae4e048f100277a3e3920 Mon Sep 17 00:00:00 2001 From: ywang103-amd Date: Tue, 30 Sep 2025 16:16:43 -0400 Subject: [PATCH] attach/detach: change workload of unit test to accommodate SDK's current limitation (#1169) * add a dual-mode dynamic_shared workload (sleeps only when run with --enable-sleep) and set ROCP_TOOL_ATTACH=1 when running the workload * add comment in dynamic_shared.hip to explain how to use argv * refactor the attach/detach profiling time in unit tests --- .../sample/dynamic_shared/dynamic_shared.hip | 102 ++++++++++-------- .../tests/test_profile_general.py | 83 ++++++++++++-- 2 files changed, 134 insertions(+), 51 deletions(-) diff --git a/projects/rocprofiler-compute/sample/dynamic_shared/dynamic_shared.hip b/projects/rocprofiler-compute/sample/dynamic_shared/dynamic_shared.hip index 1b0611fdce..ee24d05d1b 100644 --- a/projects/rocprofiler-compute/sample/dynamic_shared/dynamic_shared.hip +++ b/projects/rocprofiler-compute/sample/dynamic_shared/dynamic_shared.hip @@ -77,82 +77,87 @@ std::vector matrix_transpose_reference(const std::vector& input, return output; } -int main() +// argv: Array of command-line arguments. Run with "--enable-sleep" to enable +// the mode with delay implemented by thread sleep +int main(int argc, char* argv[]) { - // Number of rows and columns in the transposed square matrix. + bool enable_sleep = false; + + // Check command-line arguments + for(int i = 1; i < argc; ++i) + { + std::string arg = argv[i]; + if(arg == "--enable-sleep") + { + enable_sleep = true; + } + } + constexpr unsigned int width = 4; - - // Number of threads in each kernel block along the X dimension. - // Because each thread will process exactly one element, this value - // is equal to the width of the matrix. constexpr unsigned int threads_per_block_x = width; - - // Number of threads in each kernel block along the Y dimension. - // Because each thread will process exactly one element, this value - // is equal to the width of the matrix. 
constexpr unsigned int threads_per_block_y = width; - - // Total element count of the transposed matrix. constexpr unsigned int size = width * width; - - // Total size (in bytes) of the transposed matrix. constexpr size_t size_bytes = sizeof(float) * size; - - // Total amount of shared memory that each block is going to use. - // Exactly one matrix will be stored in shared memory. constexpr size_t shared_memory_bytes = size_bytes; + std::cout << "Run transpose continuously" << std::endl; - // Set a timer to 30 seconds for rocprofv3 preparation - std::this_thread::sleep_for(std::chrono::seconds(30)); - + if(enable_sleep) + std::this_thread::sleep_for(std::chrono::seconds(30)); + + unsigned int pass_count = 0; + unsigned int fail_count = 0; + unsigned int cycle_count = 0; + + constexpr float eps = 1.0E-6f; + while (true) { - std::this_thread::sleep_for(std::chrono::seconds(5)); - // Allocate host vectors. + if(enable_sleep) + std::this_thread::sleep_for(std::chrono::seconds(5)); + + // Allocate host vectors std::vector h_matrix(size); std::vector h_transposed_matrix(size); - // Set up input data. + // Set up input data for(unsigned int i = 0; i < size; i++) { h_matrix[i] = i * 10.0f; } - // Allocate device memory for the input and output matrices. + // Allocate device memory float* d_matrix{}; float* d_transposed_matrix{}; HIP_CHECK(hipMalloc(&d_matrix, size_bytes)); HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes)); - // Transfer the input matrix to the device memory. + // Copy input to device HIP_CHECK(hipMemcpy(d_matrix, h_matrix.data(), size_bytes, hipMemcpyHostToDevice)); - // Lauching kernel from host. + // Launch kernel matrix_transpose_kernel<<>>(d_transposed_matrix, d_matrix, width); + dim3(threads_per_block_x, threads_per_block_y), + shared_memory_bytes, + hipStreamDefault>>>(d_transposed_matrix, d_matrix, width); - // Check if the kernel launch was successful. HIP_CHECK(hipGetLastError()); - // Transfer the result back to the host. 
+ // Copy result back HIP_CHECK(hipMemcpy(h_transposed_matrix.data(), d_transposed_matrix, size_bytes, hipMemcpyDeviceToHost)); - // Free the resources on the device. + // Free device memory HIP_CHECK(hipFree(d_matrix)); HIP_CHECK(hipFree(d_transposed_matrix)); - // Perform the reference (CPU) calculation. + // CPU reference transpose std::vector ref_transposed_matrix = matrix_transpose_reference(h_matrix, width); - // Check the results' validity. - constexpr float eps = 1.0E-6f; - unsigned int errors{}; + // Validate + unsigned int errors = 0; for(unsigned int i = 0; i < size; i++) { if(std::fabs(h_transposed_matrix[i] - ref_transposed_matrix[i]) > eps) @@ -161,14 +166,25 @@ int main() } } - if(errors != 0) - { - std::cout << "Validation failed. Errors: " << errors << std::endl; - return error_exit_code; - } + // Update pass/fail counters + if(errors == 0) + pass_count++; else + fail_count++; + + cycle_count++; + + // Every 10000 cycles, print summary and reset counters + if(cycle_count == 10000) { - std::cout << "Validation passed." 
<< std::endl; + std::cout << "10000 Validation cycles completed: " + << "Passes = " << pass_count + << ", Failures = " << fail_count << std::endl; + + // Reset counters + cycle_count = 0; + pass_count = 0; + fail_count = 0; } } -} +} \ No newline at end of file diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py index 2b318465c6..2b50bf97bf 100644 --- a/projects/rocprofiler-compute/tests/test_profile_general.py +++ b/projects/rocprofiler-compute/tests/test_profile_general.py @@ -71,6 +71,9 @@ config["METRIC_LOGGING"] = False num_kernels = 3 num_devices = 1 +attach_detach_interval_msec_no_delay = 10000 +attach_detach_interval_msec_with_delay = 60000 + DEFAULT_ABS_DIFF = 15 DEFAULT_REL_DIFF = 50 MAX_REOCCURING_COUNT = 28 @@ -1778,14 +1781,75 @@ def test_pc_sampling_stochastic(binary_handler_profile_rocprof_compute): def test_live_attach_detach_block(binary_handler_profile_rocprof_compute): options = ["--block", "3.1.1", "4.1.1", "5.1.1"] workload_dir = test_utils.get_output_dir() - process_workload = subprocess.Popen(config["app_hip_dynamic_shared"]) + # TODO: temp fix for sdk defautly disable attach/detach, + # remove after it sets default to enable + env = os.environ.copy() + env["ROCP_TOOL_ATTACH"] = "1" - # set the time to detach here to 1 mins, which is 60000 msec - time_to_detach = "60000" + process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env) attach_detach = dict() attach_detach["attach_pid"] = process_workload.pid - attach_detach["attach-duration-msec"] = time_to_detach + attach_detach["attach-duration-msec"] = attach_detach_interval_msec_no_delay + + _ = binary_handler_profile_rocprof_compute( + config, + workload_dir, + options, + check_success=True, + roof=False, + app_name="app_hip_dynamic_shared", + attach_detach_para=attach_detach, + ) + + # kill the process of the workload at thsi point if it's still running + if process_workload.poll() is None: + 
print( + f"rocprof-compute has detached and finished, " + f"killing workload process (pid={process_workload.pid})..." + ) + process_workload.kill() + process_workload.wait() + + file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels) + validate( + inspect.stack()[0][3], + workload_dir, + file_dict, + ) + + assert test_utils.check_file_pattern( + "- 3.1.1", f"{workload_dir}/profiling_config.yaml" + ) + assert test_utils.check_file_pattern( + "- 4.1.1", f"{workload_dir}/profiling_config.yaml" + ) + assert test_utils.check_file_pattern( + "- 5.1.1", f"{workload_dir}/profiling_config.yaml" + ) + test_utils.clean_output_dir(config["cleanup"], workload_dir) + + +@pytest.mark.skip( + reason="Temporarily disabled: \ + waiting for SDK fix for no outputfile with thread sleeping" +) +@pytest.mark.live_attach_detach +def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_compute): + options = ["--block", "3.1.1", "4.1.1", "5.1.1"] + workload_dir = test_utils.get_output_dir() + # TODO: temp fix for sdk defautly disable attach/detach, + # remove after it sets default to enable + env = os.environ.copy() + env["ROCP_TOOL_ATTACH"] = "1" + + process_workload = subprocess.Popen( + [config["app_hip_dynamic_shared"], "--enable-sleep"], env=env + ) + + attach_detach = dict() + attach_detach["attach_pid"] = process_workload.pid + attach_detach["attach-duration-msec"] = attach_detach_interval_msec_with_delay _ = binary_handler_profile_rocprof_compute( config, @@ -1831,14 +1895,17 @@ def test_live_attach_detach_singlepath_launch_stats( ): options = ["--set", "launch_stats"] workload_dir = test_utils.get_output_dir() - process_workload = subprocess.Popen(config["app_hip_dynamic_shared"]) - # set the time to detach here to 1 mins, which is 60000 msec - time_to_detach = "60000" + # TODO: temp fix for sdk defautly disable attach/detach, + # remove after it sets default to enable + env = os.environ.copy() + env["ROCP_TOOL_ATTACH"] = "1" + + process_workload = 
subprocess.Popen(config["app_hip_dynamic_shared"], env=env) attach_detach = dict() attach_detach["attach_pid"] = process_workload.pid - attach_detach["attach-duration-msec"] = time_to_detach + attach_detach["attach-duration-msec"] = attach_detach_interval_msec_no_delay _ = binary_handler_profile_rocprof_compute( config,