[rocprofiler-compute] add try catch to ensure subprocess killed if test of attach/detach fails (#2139)
* add try catch to ensure subprocess killed if test of attach/detach fails * remove unnecessary comments * remove duplicated cleanup
Esse commit está contido em:
@@ -2072,52 +2072,43 @@ def test_pc_sampling_stochastic(binary_handler_profile_rocprof_compute):
|
||||
def test_live_attach_detach_block(binary_handler_profile_rocprof_compute):
|
||||
options = ["--block", "3.1.1", "4.1.1", "5.1.1"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
|
||||
# TODO: temp fix for sdk defautly disable attach/detach,
|
||||
# remove after it sets default to enable
|
||||
env = os.environ.copy()
|
||||
env["ROCP_TOOL_ATTACH"] = "1"
|
||||
|
||||
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
|
||||
process_workload = None
|
||||
|
||||
attach_detach = dict()
|
||||
attach_detach["attach_pid"] = process_workload.pid
|
||||
attach_detach["attach-duration-msec"] = attach_detach_interval_msec_no_delay
|
||||
try:
|
||||
# Start workload
|
||||
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
|
||||
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_hip_dynamic_shared",
|
||||
attach_detach_para=attach_detach,
|
||||
)
|
||||
attach_detach = {
|
||||
"attach_pid": process_workload.pid,
|
||||
"attach-duration-msec": attach_detach_interval_msec_no_delay,
|
||||
}
|
||||
|
||||
# kill the process of the workload at thsi point if it's still running
|
||||
if process_workload.poll() is None:
|
||||
print(
|
||||
f"rocprof-compute has detached and finished, "
|
||||
f"killing workload process (pid={process_workload.pid})..."
|
||||
# Run profiler (might fail / timeout / throw)
|
||||
binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_hip_dynamic_shared",
|
||||
attach_detach_para=attach_detach,
|
||||
)
|
||||
process_workload.kill()
|
||||
process_workload.wait()
|
||||
|
||||
finally:
|
||||
if process_workload and process_workload.poll() is None:
|
||||
print(f"[finally] killing workload pid={process_workload.pid}")
|
||||
process_workload.kill()
|
||||
process_workload.wait()
|
||||
|
||||
# Validate results
|
||||
file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)
|
||||
validate(
|
||||
inspect.stack()[0][3],
|
||||
workload_dir,
|
||||
file_dict,
|
||||
)
|
||||
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 3.1.1", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 4.1.1", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 5.1.1", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
validate(inspect.stack()[0][3], workload_dir, file_dict)
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
@@ -2129,38 +2120,43 @@ def test_live_attach_detach_block(binary_handler_profile_rocprof_compute):
|
||||
def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_compute):
|
||||
options = ["--block", "3.1.1", "4.1.1", "5.1.1"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
|
||||
# TODO: temp fix for sdk defautly disable attach/detach,
|
||||
# remove after it sets default to enable
|
||||
env = os.environ.copy()
|
||||
env["ROCP_TOOL_ATTACH"] = "1"
|
||||
|
||||
process_workload = subprocess.Popen(
|
||||
[config["app_hip_dynamic_shared"], "--enable-sleep"], env=env
|
||||
)
|
||||
process_workload = None
|
||||
|
||||
attach_detach = dict()
|
||||
attach_detach["attach_pid"] = process_workload.pid
|
||||
attach_detach["attach-duration-msec"] = attach_detach_interval_msec_with_delay
|
||||
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_hip_dynamic_shared",
|
||||
attach_detach_para=attach_detach,
|
||||
)
|
||||
|
||||
# kill the process of the workload at thsi point if it's still running
|
||||
if process_workload.poll() is None:
|
||||
print(
|
||||
f"rocprof-compute has detached and finished, "
|
||||
f"killing workload process (pid={process_workload.pid})..."
|
||||
try:
|
||||
# Start workload with sleep mode enabled
|
||||
process_workload = subprocess.Popen(
|
||||
[config["app_hip_dynamic_shared"], "--enable-sleep"], env=env
|
||||
)
|
||||
process_workload.kill()
|
||||
process_workload.wait()
|
||||
|
||||
attach_detach = {
|
||||
"attach_pid": process_workload.pid,
|
||||
"attach-duration-msec": attach_detach_interval_msec_with_delay,
|
||||
}
|
||||
|
||||
# Main profiling call (can fail or hang)
|
||||
binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_hip_dynamic_shared",
|
||||
attach_detach_para=attach_detach,
|
||||
)
|
||||
|
||||
finally:
|
||||
if process_workload and process_workload.poll() is None:
|
||||
print(f"[finally] killing workload pid={process_workload.pid}")
|
||||
process_workload.kill()
|
||||
process_workload.wait()
|
||||
|
||||
# Validate output
|
||||
file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)
|
||||
validate(
|
||||
inspect.stack()[0][3],
|
||||
@@ -2168,15 +2164,11 @@ def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_co
|
||||
file_dict,
|
||||
)
|
||||
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 3.1.1", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 4.1.1", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 5.1.1", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
# Check profiling_config.yaml block entries
|
||||
config_file = f"{workload_dir}/profiling_config.yaml"
|
||||
assert test_utils.check_file_pattern("- 3.1.1", config_file)
|
||||
assert test_utils.check_file_pattern("- 4.1.1", config_file)
|
||||
assert test_utils.check_file_pattern("- 5.1.1", config_file)
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
@@ -2192,31 +2184,35 @@ def test_live_attach_detach_singlepath_launch_stats(
|
||||
env = os.environ.copy()
|
||||
env["ROCP_TOOL_ATTACH"] = "1"
|
||||
|
||||
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
|
||||
process_workload = None
|
||||
|
||||
attach_detach = dict()
|
||||
attach_detach["attach_pid"] = process_workload.pid
|
||||
attach_detach["attach-duration-msec"] = attach_detach_interval_msec_no_delay
|
||||
try:
|
||||
# Start workload
|
||||
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
|
||||
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_hip_dynamic_shared",
|
||||
attach_detach_para=attach_detach,
|
||||
)
|
||||
attach_detach = {
|
||||
"attach_pid": process_workload.pid,
|
||||
"attach-duration-msec": attach_detach_interval_msec_no_delay,
|
||||
}
|
||||
|
||||
# kill the process of the workload at thsi point if it's still running
|
||||
if process_workload.poll() is None:
|
||||
print(
|
||||
f"rocprof-compute has detached and finished, "
|
||||
f"killing workload process (pid={process_workload.pid})..."
|
||||
# Profiling step (may fail)
|
||||
binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_hip_dynamic_shared",
|
||||
attach_detach_para=attach_detach,
|
||||
)
|
||||
process_workload.kill()
|
||||
process_workload.wait()
|
||||
|
||||
finally:
|
||||
if process_workload and process_workload.poll() is None:
|
||||
print(f"[finally] killing workload pid={process_workload.pid}")
|
||||
process_workload.kill()
|
||||
process_workload.wait()
|
||||
|
||||
# Validate CSVs & output correctness
|
||||
file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)
|
||||
validate(
|
||||
inspect.stack()[0][3],
|
||||
@@ -2224,30 +2220,20 @@ def test_live_attach_detach_singlepath_launch_stats(
|
||||
file_dict,
|
||||
)
|
||||
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 7.1.0", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 7.1.1", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 7.1.2", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 7.1.5", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 7.1.6", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 7.1.7", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 7.1.8", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
assert test_utils.check_file_pattern(
|
||||
"- 7.1.9", f"{workload_dir}/profiling_config.yaml"
|
||||
)
|
||||
# Check that launch-stat sets were applied
|
||||
config_file = f"{workload_dir}/profiling_config.yaml"
|
||||
for tag in [
|
||||
"7.1.0",
|
||||
"7.1.1",
|
||||
"7.1.2",
|
||||
"7.1.5",
|
||||
"7.1.6",
|
||||
"7.1.7",
|
||||
"7.1.8",
|
||||
"7.1.9",
|
||||
]:
|
||||
assert test_utils.check_file_pattern(f"- {tag}", config_file)
|
||||
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário