diff --git a/projects/rocprofiler-compute/src/omniperf_analyze/utils/roofline_calc.py b/projects/rocprofiler-compute/src/omniperf_analyze/utils/roofline_calc.py index 7ba4725f9e..1fbca73e02 100644 --- a/projects/rocprofiler-compute/src/omniperf_analyze/utils/roofline_calc.py +++ b/projects/rocprofiler-compute/src/omniperf_analyze/utils/roofline_calc.py @@ -210,17 +210,144 @@ def plot_application(sortType, ret_df, verbose): kernelName = "" myList = [] - for index, row in df.iterrows(): + at_end = False + next_kernelName = "" + + for idx in df.index: # CASE: Top kernels # Calculate + append AI data if # a) current KernelName is different than previous OR # b) We've reached the end of list - if sortType == "kernels" and ( - (row["KernelName"] != kernelName and kernelName != "") - or index == df.shape[0] - 1 - ): - if df.shape[0] - 1 == index: - calls += 1 + if(idx + 1 == df.shape[0]): + at_end = True + else: + next_kernelName = df["KernelName"][idx+1] + + kernelName = df["KernelName"][idx] + try: + total_flops += ( + ( + 64 + * ( + df["SQ_INSTS_VALU_ADD_F16"][idx] + + df["SQ_INSTS_VALU_MUL_F16"][idx] + + (2 * df["SQ_INSTS_VALU_FMA_F16"][idx]) + + df["SQ_INSTS_VALU_TRANS_F16"][idx] + ) + ) + + ( + 64 + * ( + df["SQ_INSTS_VALU_ADD_F32"][idx] + + df["SQ_INSTS_VALU_MUL_F32"][idx] + + (2 * df["SQ_INSTS_VALU_FMA_F32"][idx]) + + df["SQ_INSTS_VALU_TRANS_F32"][idx] + ) + ) + + ( + 64 + * ( + df["SQ_INSTS_VALU_ADD_F64"][idx] + + df["SQ_INSTS_VALU_MUL_F64"][idx] + + (2 * df["SQ_INSTS_VALU_FMA_F64"][idx]) + + df["SQ_INSTS_VALU_TRANS_F64"][idx] + ) + ) + + (df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512) + + (df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512) + + (df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512) + + (df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512) + ) + except KeyError: + if verbose >= 3: + print("{}: Skipped total_flops at index {}".format(kernelName[:35], idx)) + pass + try: + valu_flops += ( + 64 + * ( + df["SQ_INSTS_VALU_ADD_F16"][idx] + + df["SQ_INSTS_VALU_MUL_F16"][idx] + + (2 * df["SQ_INSTS_VALU_FMA_F16"][idx]) + + df["SQ_INSTS_VALU_TRANS_F16"][idx] + ) + + 64 + * ( + df["SQ_INSTS_VALU_ADD_F32"][idx] + + df["SQ_INSTS_VALU_MUL_F32"][idx] + + (2 * df["SQ_INSTS_VALU_FMA_F32"][idx]) + + df["SQ_INSTS_VALU_TRANS_F32"][idx] + ) + + 64 + * ( + df["SQ_INSTS_VALU_ADD_F64"][idx] + + df["SQ_INSTS_VALU_MUL_F64"][idx] + + (2 * df["SQ_INSTS_VALU_FMA_F64"][idx]) + + df["SQ_INSTS_VALU_TRANS_F64"][idx] + ) + ) + except KeyError: + if verbose >= 3: + print("{}: Skipped valu_flops at index {}".format(kernelName[:35], idx)) + pass + + try: + mfma_flops_f16 += df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512 + mfma_flops_bf16 += df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512 + mfma_flops_f32 += df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512 + mfma_flops_f64 += df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512 + mfma_iops_i8 += df["SQ_INSTS_VALU_MFMA_MOPS_I8"][idx] * 512 + except KeyError: + if verbose >= 3: + print("{}: Skipped mfma ops at index {}".format(kernelName[:35], idx)) + pass + + try: + lds_data += ( + (df["SQ_LDS_IDX_ACTIVE"][idx] - df["SQ_LDS_BANK_CONFLICT"][idx]) * 4 * L2_BANKS + ) # L2_BANKS = 32 (since assuming mi200) + except KeyError: + if verbose >= 3: + print("{}: Skipped lds_data at index {}".format(kernelName[:35], idx)) + pass + + try: + L1cache_data += df["TCP_TOTAL_CACHE_ACCESSES_sum"][idx] * 64 + except KeyError: + if verbose >= 3: + print("{}: Skipped L1cache_data at index {}".format(kernelName[:35], idx)) + pass + + try: + L2cache_data += ( + df["TCP_TCC_WRITE_REQ_sum"][idx] * 64 + + df["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"][idx] * 64 + + df["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"][idx] * 64 + + df["TCP_TCC_READ_REQ_sum"][idx] * 64 + ) + except KeyError: + if verbose >= 3: + print("{}: Skipped L2cache_data at index {}".format(kernelName[:35], idx)) + pass + try: + hbm_data += ( + (df["TCC_EA_RDREQ_32B_sum"][idx] * 32) + + ((df["TCC_EA_RDREQ_sum"][idx] - df["TCC_EA_RDREQ_32B_sum"][idx]) * 64) + + (df["TCC_EA_WRREQ_64B_sum"][idx] * 64) + + ((df["TCC_EA_WRREQ_sum"][idx] - df["TCC_EA_WRREQ_64B_sum"][idx]) * 32) + ) + except KeyError: + if verbose >= 3: + print("{}: Skipped hbm_data at index {}".format(kernelName[:35], idx)) + pass + + totalDuration += df["EndNs"][idx] - df["BeginNs"][idx] + + avgDuration += df["EndNs"][idx] - df["BeginNs"][idx] + + calls += 1 + + if sortType == "kernels" and (at_end == True or (kernelName != next_kernelName)): myList.append( AI_Data( kernelName, @@ -243,7 +370,7 @@ def plot_application(sortType, ret_df, verbose): if verbose >= 2: print( "Just added {} to AI_Data at index {}. # of calls: {}".format( - kernelName, index, calls + kernelName, idx, calls ) ) total_flops = ( @@ -264,129 +391,6 @@ def plot_application(sortType, ret_df, verbose): L1cache_data ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0 - kernelName = row["KernelName"] - try: - total_flops += ( - ( - 64 - * ( - row["SQ_INSTS_VALU_ADD_F16"] - + row["SQ_INSTS_VALU_MUL_F16"] - + (2 * row["SQ_INSTS_VALU_FMA_F16"]) - + row["SQ_INSTS_VALU_TRANS_F16"] - ) - ) - + ( - 64 - * ( - row["SQ_INSTS_VALU_ADD_F32"] - + row["SQ_INSTS_VALU_MUL_F32"] - + (2 * row["SQ_INSTS_VALU_FMA_F32"]) - + row["SQ_INSTS_VALU_TRANS_F32"] - ) - ) - + ( - 64 - * ( - row["SQ_INSTS_VALU_ADD_F64"] - + row["SQ_INSTS_VALU_MUL_F64"] - + (2 * row["SQ_INSTS_VALU_FMA_F64"]) - + row["SQ_INSTS_VALU_TRANS_F64"] - ) - ) - + (row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512) - + (row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512) - + (row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512) - + (row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512) - ) - except KeyError: - if verbose >= 2: - print("Skipped total_flops at index {}".format(index)) - pass - try: - valu_flops += ( - 64 - * ( - row["SQ_INSTS_VALU_ADD_F16"] - + row["SQ_INSTS_VALU_MUL_F16"] - + (2 * row["SQ_INSTS_VALU_FMA_F16"]) - + row["SQ_INSTS_VALU_TRANS_F16"] - ) - + 64 - * ( - row["SQ_INSTS_VALU_ADD_F32"] - + row["SQ_INSTS_VALU_MUL_F32"] - + (2 * row["SQ_INSTS_VALU_FMA_F32"]) - + row["SQ_INSTS_VALU_TRANS_F32"] - ) - + 64 - * ( - row["SQ_INSTS_VALU_ADD_F64"] - + row["SQ_INSTS_VALU_MUL_F64"] - + (2 * row["SQ_INSTS_VALU_FMA_F64"]) - + row["SQ_INSTS_VALU_TRANS_F64"] - ) - ) - except KeyError: - if verbose >= 2: - print("Skipped valu_flops at index {}".format(index)) - pass - - try: - mfma_flops_f16 += row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512 - mfma_flops_bf16 += row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512 - mfma_flops_f32 += row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512 - mfma_flops_f64 += row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512 - mfma_iops_i8 += row["SQ_INSTS_VALU_MFMA_MOPS_I8"] * 512 - except KeyError: - if verbose >= 2: - print("Skipped mfma ops at index {}".format(index)) - pass - - try: - lds_data += ( - (row["SQ_LDS_IDX_ACTIVE"] - row["SQ_LDS_BANK_CONFLICT"]) * 4 * L2_BANKS - ) # L2_BANKS = 32 (since assuming mi200) - except KeyError: - if verbose >= 2: - print("Skipped lds_data at index {}".format(index)) - pass - - try: - L1cache_data += row["TCP_TOTAL_CACHE_ACCESSES_sum"] * 64 - except KeyError: - if verbose >= 2: - print("Skipped L1cache_data at index {}".format(index)) - pass - - try: - L2cache_data += ( - row["TCP_TCC_WRITE_REQ_sum"] * 64 - + row["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"] * 64 - + row["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"] * 64 - + row["TCP_TCC_READ_REQ_sum"] * 64 - ) - except KeyError: - if verbose >= 2: - print("Skipped L2cache_data at index {}".format(index)) - pass - try: - hbm_data += ( - (row["TCC_EA_RDREQ_32B_sum"] * 32) - + ((row["TCC_EA_RDREQ_sum"] - row["TCC_EA_RDREQ_32B_sum"]) * 64) - + (row["TCC_EA_WRREQ_64B_sum"] * 64) - + ((row["TCC_EA_WRREQ_sum"] - row["TCC_EA_WRREQ_64B_sum"]) * 32) - ) - except KeyError: - if verbose >= 2: - print("Skipped hbm_data at index {}".format(index)) - pass - - totalDuration += row["EndNs"] - row["BeginNs"] - - avgDuration += row["EndNs"] - row["BeginNs"] - - calls += 1 if sortType == "dispatches": myList.append( AI_Data(