attach/detach: change workload of unit test to accommodate SDK's current limitation (#1169)
* add dual mode to workload dynamic_shared, with one mode removing the sleep, and set ROCP_TOOL_ATTACH=1 for running the workload * add comment in dynamic_shared.hip to explain how to use argv * refactor the attach/detach profiling time in unit tests
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
f45c8d5f6b
Коммит
eeeaa06159
@@ -77,82 +77,87 @@ std::vector<float> matrix_transpose_reference(const std::vector<float>& input,
|
||||
return output;
|
||||
}
|
||||
|
||||
int main()
|
||||
// argv: Array of command-line arguments. Run with "--enable-sleep" to enable
|
||||
// the mode with delay implemented by thread sleep
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// Number of rows and columns in the transposed square matrix.
|
||||
bool enable_sleep = false;
|
||||
|
||||
// Check command-line arguments
|
||||
for(int i = 1; i < argc; ++i)
|
||||
{
|
||||
std::string arg = argv[i];
|
||||
if(arg == "--enable-sleep")
|
||||
{
|
||||
enable_sleep = true;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr unsigned int width = 4;
|
||||
|
||||
// Number of threads in each kernel block along the X dimension.
|
||||
// Because each thread will process exactly one element, this value
|
||||
// is equal to the width of the matrix.
|
||||
constexpr unsigned int threads_per_block_x = width;
|
||||
|
||||
// Number of threads in each kernel block along the Y dimension.
|
||||
// Because each thread will process exactly one element, this value
|
||||
// is equal to the width of the matrix.
|
||||
constexpr unsigned int threads_per_block_y = width;
|
||||
|
||||
// Total element count of the transposed matrix.
|
||||
constexpr unsigned int size = width * width;
|
||||
|
||||
// Total size (in bytes) of the transposed matrix.
|
||||
constexpr size_t size_bytes = sizeof(float) * size;
|
||||
|
||||
// Total amount of shared memory that each block is going to use.
|
||||
// Exactly one matrix will be stored in shared memory.
|
||||
constexpr size_t shared_memory_bytes = size_bytes;
|
||||
|
||||
std::cout << "Run transpose continuously" << std::endl;
|
||||
|
||||
// Set a timer to 30 seconds for rocprofv3 preparation
|
||||
std::this_thread::sleep_for(std::chrono::seconds(30));
|
||||
|
||||
if(enable_sleep)
|
||||
std::this_thread::sleep_for(std::chrono::seconds(30));
|
||||
|
||||
unsigned int pass_count = 0;
|
||||
unsigned int fail_count = 0;
|
||||
unsigned int cycle_count = 0;
|
||||
|
||||
constexpr float eps = 1.0E-6f;
|
||||
|
||||
while (true)
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::seconds(5));
|
||||
// Allocate host vectors.
|
||||
if(enable_sleep)
|
||||
std::this_thread::sleep_for(std::chrono::seconds(5));
|
||||
|
||||
// Allocate host vectors
|
||||
std::vector<float> h_matrix(size);
|
||||
std::vector<float> h_transposed_matrix(size);
|
||||
|
||||
// Set up input data.
|
||||
// Set up input data
|
||||
for(unsigned int i = 0; i < size; i++)
|
||||
{
|
||||
h_matrix[i] = i * 10.0f;
|
||||
}
|
||||
|
||||
// Allocate device memory for the input and output matrices.
|
||||
// Allocate device memory
|
||||
float* d_matrix{};
|
||||
float* d_transposed_matrix{};
|
||||
HIP_CHECK(hipMalloc(&d_matrix, size_bytes));
|
||||
HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes));
|
||||
|
||||
// Transfer the input matrix to the device memory.
|
||||
// Copy input to device
|
||||
HIP_CHECK(hipMemcpy(d_matrix, h_matrix.data(), size_bytes, hipMemcpyHostToDevice));
|
||||
|
||||
// Lauching kernel from host.
|
||||
// Launch kernel
|
||||
matrix_transpose_kernel<<<dim3(width / threads_per_block_x, width / threads_per_block_y),
|
||||
dim3(threads_per_block_x, threads_per_block_y),
|
||||
shared_memory_bytes,
|
||||
hipStreamDefault>>>(d_transposed_matrix, d_matrix, width);
|
||||
dim3(threads_per_block_x, threads_per_block_y),
|
||||
shared_memory_bytes,
|
||||
hipStreamDefault>>>(d_transposed_matrix, d_matrix, width);
|
||||
|
||||
// Check if the kernel launch was successful.
|
||||
HIP_CHECK(hipGetLastError());
|
||||
|
||||
// Transfer the result back to the host.
|
||||
// Copy result back
|
||||
HIP_CHECK(hipMemcpy(h_transposed_matrix.data(),
|
||||
d_transposed_matrix,
|
||||
size_bytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
// Free the resources on the device.
|
||||
// Free device memory
|
||||
HIP_CHECK(hipFree(d_matrix));
|
||||
HIP_CHECK(hipFree(d_transposed_matrix));
|
||||
|
||||
// Perform the reference (CPU) calculation.
|
||||
// CPU reference transpose
|
||||
std::vector<float> ref_transposed_matrix = matrix_transpose_reference(h_matrix, width);
|
||||
|
||||
// Check the results' validity.
|
||||
constexpr float eps = 1.0E-6f;
|
||||
unsigned int errors{};
|
||||
// Validate
|
||||
unsigned int errors = 0;
|
||||
for(unsigned int i = 0; i < size; i++)
|
||||
{
|
||||
if(std::fabs(h_transposed_matrix[i] - ref_transposed_matrix[i]) > eps)
|
||||
@@ -161,14 +166,25 @@ int main()
|
||||
}
|
||||
}
|
||||
|
||||
if(errors != 0)
|
||||
{
|
||||
std::cout << "Validation failed. Errors: " << errors << std::endl;
|
||||
return error_exit_code;
|
||||
}
|
||||
// Update pass/fail counters
|
||||
if(errors == 0)
|
||||
pass_count++;
|
||||
else
|
||||
fail_count++;
|
||||
|
||||
cycle_count++;
|
||||
|
||||
// Every 10000 cycles, print summary and reset counters
|
||||
if(cycle_count == 10000)
|
||||
{
|
||||
std::cout << "Validation passed." << std::endl;
|
||||
std::cout << "10000 Validation cycles completed: "
|
||||
<< "Passes = " << pass_count
|
||||
<< ", Failures = " << fail_count << std::endl;
|
||||
|
||||
// Reset counters
|
||||
cycle_count = 0;
|
||||
pass_count = 0;
|
||||
fail_count = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -71,6 +71,9 @@ config["METRIC_LOGGING"] = False
|
||||
num_kernels = 3
|
||||
num_devices = 1
|
||||
|
||||
attach_detach_interval_msec_no_delay = 10000
|
||||
attach_detach_interval_msec_with_delay = 60000
|
||||
|
||||
DEFAULT_ABS_DIFF = 15
|
||||
DEFAULT_REL_DIFF = 50
|
||||
MAX_REOCCURING_COUNT = 28
|
||||
@@ -1778,14 +1781,75 @@ def test_pc_sampling_stochastic(binary_handler_profile_rocprof_compute):
|
||||
def test_live_attach_detach_block(binary_handler_profile_rocprof_compute):
|
||||
options = ["--block", "3.1.1", "4.1.1", "5.1.1"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"])
|
||||
# TODO: temp fix for sdk defautly disable attach/detach,
|
||||
# remove after it sets default to enable
|
||||
env = os.environ.copy()
|
||||
env["ROCP_TOOL_ATTACH"] = "1"
|
||||
|
||||
# set the time to detach here to 1 mins, which is 60000 msec
|
||||
time_to_detach = "60000"
|
||||
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
|
||||
|
||||
attach_detach = dict()
|
||||
attach_detach["attach_pid"] = process_workload.pid
|
||||
attach_detach["attach-duration-msec"] = time_to_detach
|
||||
attach_detach["attach-duration-msec"] = attach_detach_interval_msec_no_delay
|
||||
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_hip_dynamic_shared",
|
||||
attach_detach_para=attach_detach,
|
||||
)
|
||||
|
||||
# kill the process of the workload at thsi point if it's still running
|
||||
if process_workload.poll() is None:
|
||||
print(
|
||||
f"rocprof-compute has detached and finished, "
|
||||
f"killing workload process (pid={process_workload.pid})..."
|
||||
)
|
||||
process_workload.kill()
|
||||
process_workload.wait()
|
||||
|
||||
file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)
|
||||
validate(
|
||||
inspect.stack()[0][3],
|
||||
workload_dir,
|
||||
file_dict,
|
||||
)
|
||||
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 3.1.1", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 4.1.1", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 5.1.1", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="Temporarily disabled: \
|
||||
waiting for SDK fix for no outputfile with thread sleeping"
|
||||
)
|
||||
@pytest.mark.live_attach_detach
|
||||
def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_compute):
|
||||
options = ["--block", "3.1.1", "4.1.1", "5.1.1"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
# TODO: temp fix for sdk defautly disable attach/detach,
|
||||
# remove after it sets default to enable
|
||||
env = os.environ.copy()
|
||||
env["ROCP_TOOL_ATTACH"] = "1"
|
||||
|
||||
process_workload = subprocess.Popen(
|
||||
[config["app_hip_dynamic_shared"], "--enable-sleep"], env=env
|
||||
)
|
||||
|
||||
attach_detach = dict()
|
||||
attach_detach["attach_pid"] = process_workload.pid
|
||||
attach_detach["attach-duration-msec"] = attach_detach_interval_msec_with_delay
|
||||
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
@@ -1831,14 +1895,17 @@ def test_live_attach_detach_singlepath_launch_stats(
|
||||
):
|
||||
options = ["--set", "launch_stats"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"])
|
||||
|
||||
# set the time to detach here to 1 mins, which is 60000 msec
|
||||
time_to_detach = "60000"
|
||||
# TODO: temp fix for sdk defautly disable attach/detach,
|
||||
# remove after it sets default to enable
|
||||
env = os.environ.copy()
|
||||
env["ROCP_TOOL_ATTACH"] = "1"
|
||||
|
||||
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
|
||||
|
||||
attach_detach = dict()
|
||||
attach_detach["attach_pid"] = process_workload.pid
|
||||
attach_detach["attach-duration-msec"] = time_to_detach
|
||||
attach_detach["attach-duration-msec"] = attach_detach_interval_msec_no_delay
|
||||
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
|
||||
Ссылка в новой задаче
Block a user