09b8342e22
* Add XGMI and PCIe metrics to the profiling data Add support for AMD XGMI (GPU-to-GPU interconnect) and PCIe metrics: * XGMI link width in bits * XGMI link speed in GT/s * Per-link read bandwidth (KB) * Per-link write bandwidth (KB) - Add new categories for PCIe metrics: * PCIe link width * PCIe link speed in GT/s * Accumulated bandwidth (MB) * Instantaneous bandwidth (MB/s) * Fix VCN/JPEG insert logic * Modify the gpu_metrics struct to accommodate XCP structure * Add ctest automation for gpu interconnect metrics * Refactor to move gpu_metrics struct and serialization to another file * Possible fix for timeout in CI Fix redundant skip check in ctest Add xgmi and pcie option in rocprof-sys-avail. * Change2: Address review comments Change ctest sampling to avoid timeout Change variable name and code structuring * Add option in ctest to run rocprof-sys-run without rewrite Run transferbench with rocprof-sys-run without sampling * Change3: Fix sample insert bug and address review comments xgmi and pci support check renaming variables additional hip_api validation in rocpd * Reduce the load from the TransferBench sample The CI builds were timing out when flushing a big temporary file to the DB: (2720824.23 KB / 2720.82 MB / 2.72 GB)...
94 строки
3.7 KiB
JSON
94 строки
3.7 KiB
JSON
{
|
|
"required_tables": [
|
|
{
|
|
"commit": "Validation rules for hip_api",
|
|
"name": "events_args",
|
|
"required_columns": [
|
|
"event_id",
|
|
"category",
|
|
"stack_id",
|
|
"parent_stack_id",
|
|
"correlation_id"
|
|
],
|
|
"validation_queries": [
|
|
{
|
|
"comparison": "greater_than",
|
|
"description": "Verify that 'rocm_hip_api' appears in category more than 100 times in table events_args",
|
|
"error_message": "'rocm_hip_api' category entries are fewer than expected in events_args",
|
|
"expected_result": 100,
|
|
"query": "SELECT COUNT(*) FROM events_args WHERE category = 'rocm_hip_api';"
|
|
},
|
|
{
|
|
"comparison": "equals",
|
|
"description": "Check for missing category entries",
|
|
"error_message": "Empty or NULL category entries found in events_args",
|
|
"expected_result": 0,
|
|
"query": "SELECT COUNT(*) FROM events_args WHERE category IS NULL OR TRIM(category) = '';"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"commit": "Validation rules for hip_api",
|
|
"name": "regions",
|
|
"required_columns": [
|
|
"id",
|
|
"guid",
|
|
"category",
|
|
"name"
|
|
],
|
|
"validation_queries": [
|
|
{
|
|
"comparison": "greater_than",
|
|
"description": "Verify that 'rocm_hip_api' appears in category more than 50 times in table regions",
|
|
"error_message": "'rocm_hip_api' category entries are fewer than expected in regions",
|
|
"expected_result": 50,
|
|
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_hip_api';"
|
|
},
|
|
{
|
|
"comparison": "equals",
|
|
"description": "Ensure there are no HIP API calls with a duration of 0",
|
|
"error_message": "Found HIP API captures where duration is 0",
|
|
"expected_result": 0,
|
|
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_hip_api' AND duration = 0;"
|
|
},
|
|
{
|
|
"comparison": "equals",
|
|
"description": "Check for any NULL values in the 'name' column of regions",
|
|
"error_message": "NULL entries found in the name column of regions",
|
|
"expected_result": 0,
|
|
"query": "SELECT COUNT(*) FROM regions WHERE name IS NULL;"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "rocpd_info_agent",
|
|
"required_columns": [
|
|
"id",
|
|
"guid",
|
|
"nid",
|
|
"pid",
|
|
"type",
|
|
"name"
|
|
],
|
|
"validation_queries": [
|
|
{
|
|
"comparison": "greater_than",
|
|
"description": "Check that we have GPU agents detected",
|
|
"error_message": "No GPU agents found",
|
|
"expected_result": 0,
|
|
"query": "SELECT COUNT(*) as count FROM rocpd_info_agent WHERE type = 'GPU'"
|
|
},
|
|
{
|
|
"comparison": "equals",
|
|
"description": "Check for NULL agent names",
|
|
"error_message": "Found agents with NULL names",
|
|
"expected_result": 0,
|
|
"query": "SELECT COUNT(*) as count FROM rocpd_info_agent WHERE name IS NULL"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
|
|
|