diff --git a/projects/rocprofiler-sdk/CHANGELOG.md b/projects/rocprofiler-sdk/CHANGELOG.md index 9363388c98..0f52f8235c 100644 --- a/projects/rocprofiler-sdk/CHANGELOG.md +++ b/projects/rocprofiler-sdk/CHANGELOG.md @@ -157,6 +157,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec - Added usage documentation for MPI applications - SDK: `rocprofiler_agent_v0_t` support for agent UUIDs - SDK: `rocprofiler_agent_v0_t` support for agent visibility based on gpu isolation environment variables (`ROCR_VISIBLE_DEVICES`, etc.) +- Accumulation VGPR support for rocprofv3. ### Changed diff --git a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst index 30db6e8a72..a79c594777 100644 --- a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst +++ b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst @@ -917,11 +917,11 @@ To collect counters for the kernels matching the filters specified in the preced rocprofv3 -i input.yml -- $ cat pass_1/312_counter_collection.csv - "Correlation_Id","Dispatch_Id","Agent_Id","Queue_Id","Process_Id","Thread_Id","Grid_Size","Kernel_Name","Workgroup_Size","LDS_Block_Size","Scratch_Size","VGPR_Count","SGPR_Count","Counter_Name","Counter_Value","Start_Timestamp","End_Timestamp" - 4,4,1,1,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384,2228955885095594,2228955885119754 - 8,8,1,2,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384,2228955885095594,2228955885119754 - 12,12,1,3,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384,2228955892986914,2228955893006114 - 16,16,1,4,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384,2228955892986914,2228955893006114 + "Correlation_Id","Dispatch_Id","Agent_Id","Queue_Id","Process_Id","Thread_Id","Grid_Size","Kernel_Id","Kernel_Name","Workgroup_Size","LDS_Block_Size","Scratch_Size","VGPR_Count","Accum_VGPR_Count","SGPR_Count","Counter_Name","Counter_Value","Start_Timestamp","End_Timestamp" + 1,1,4,1,225049,225049,1048576,10,"void addition_kernel(float*, float const*, float const*, int, int)",64,0,0,8,0,16,"SQ_WAVES",16384.000000,317095766765717,317095766775957 + 2,2,4,1,225049,225049,1048576,13,"subtract_kernel(float*, float const*, float const*, int, int)",64,0,0,8,0,16,"SQ_WAVES",16384.000000,317095767013157,317095767022957 + 3,3,4,1,225049,225049,1048576,11,"multiply_kernel(float*, float const*, float const*, int, int)",64,0,0,8,0,16,"SQ_WAVES",16384.000000,317095767176998,317095767186678 + 4,4,4,1,225049,225049,1048576,12,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,4,16,"SQ_WAVES",16384.000000,317095767380718,317095767390878 I/O control options @@ -1088,7 +1088,10 @@ The following table lists the various fields or the columns in the output CSV fi - Kernel's Scalar General Purpose Register (SGPR) count. * - VGPR_Count - - Kernel's Vector General Purpose Register (VGPR) count. + - Kernel's Architected Vector General Purpose Register (VGPR) count. + + * - Accum_VGPR_Count + - Kernel's Accumulation Vector General Purpose Register (Accum_VGPR/AGPR) count. Output formats ---------------- @@ -1313,7 +1316,8 @@ Here are the properties of the JSON output schema: - **`handle`** *(integer, required)*: Handle of the counter. - **`value`** *(number, required)*: Value of the counter. - **`thread_id`** *(integer, required)*: Thread ID. - - **`arch_vgpr_count`** *(integer, required)*: Count of VGPRs. + - **`arch_vgpr_count`** *(integer, required)*: Count of Architected VGPRs. + - **`accum_vgpr_count`** *(integer, required)*: Count of Accumulation VGPRs. - **`sgpr_count`** *(integer, required)*: Count of SGPRs. - **`lds_block_size_v`** *(integer, required)*: Size of LDS block. - **``pc_sample_host_trap``** *(array)*: Host Trap PC Sampling records. diff --git a/projects/rocprofiler-sdk/source/lib/output/csv.hpp b/projects/rocprofiler-sdk/source/lib/output/csv.hpp index c2307762f7..a89f6b6f89 100644 --- a/projects/rocprofiler-sdk/source/lib/output/csv.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/csv.hpp @@ -102,7 +102,7 @@ struct csv_encoder using api_csv_encoder = csv_encoder<7>; using agent_info_csv_encoder = csv_encoder<53>; using kernel_trace_csv_encoder = csv_encoder<18>; -using counter_collection_csv_encoder = csv_encoder<18>; +using counter_collection_csv_encoder = csv_encoder<19>; using memory_copy_csv_encoder = csv_encoder<7>; using memory_allocation_csv_encoder = csv_encoder<8>; using marker_csv_encoder = csv_encoder<7>; diff --git a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp index 386a0e28cc..243ed12dd5 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp @@ -573,6 +573,7 @@ generate_csv(const output_config& cfg, "LDS_Block_Size", "Scratch_Size", "VGPR_Count", + "Accum_VGPR_Count", "SGPR_Count", "Counter_Name", "Counter_Value", @@ -621,6 +622,7 @@ generate_csv(const output_config& cfg, lds_block_size_v, record.dispatch_data.dispatch_info.private_segment_size, kernel_info->arch_vgpr_count, + kernel_info->accum_vgpr_count, kernel_info->sgpr_count, counter_id_to_name.at(counter_id), counter_value, diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp index 9016fc3503..332e3a15f4 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp @@ -244,10 +244,11 @@ accum_vgpr_count(std::string_view name, kernel_descriptor_t kernel_code) if(name == "gfx908") return arch_vgpr_count(name, kernel_code); else if(name == "gfx90a" || name.find("gfx94") == 0) - return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, - AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) + - 1) * - (8 - arch_vgpr_count(name, kernel_code)); + return ((AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) + + 1) * + 8) - + arch_vgpr_count(name, kernel_code); bool emplaced = false; {