Fix roofline calculation for single dispatch

Signed-off-by: coleramos425 <colramos@amd.com>


[ROCm/rocprofiler-compute commit: f32c192abe]
This commit is contained in:
coleramos425
2023-03-13 13:38:41 -05:00
parent ac0b36f86b
commit b76f0ce558
@@ -210,17 +210,144 @@ def plot_application(sortType, ret_df, verbose):
kernelName = ""
myList = []
for index, row in df.iterrows():
at_end = False
next_kernelName = ""
for idx in df.index:
# CASE: Top kernels
# Calculate + append AI data if
# a) current KernelName is different than previous OR
# b) We've reached the end of list
if sortType == "kernels" and (
(row["KernelName"] != kernelName and kernelName != "")
or index == df.shape[0] - 1
):
if df.shape[0] - 1 == index:
calls += 1
if(idx + 1 == df.shape[0]):
at_end = True
else:
next_kernelName = df["KernelName"][idx+1]
kernelName = df["KernelName"][idx]
try:
total_flops += (
(
64
* (
df["SQ_INSTS_VALU_ADD_F16"][idx]
+ df["SQ_INSTS_VALU_MUL_F16"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F16"][idx])
+ df["SQ_INSTS_VALU_TRANS_F16"][idx]
)
)
+ (
64
* (
df["SQ_INSTS_VALU_ADD_F32"][idx]
+ df["SQ_INSTS_VALU_MUL_F32"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F32"][idx])
+ df["SQ_INSTS_VALU_TRANS_F32"][idx]
)
)
+ (
64
* (
df["SQ_INSTS_VALU_ADD_F64"][idx]
+ df["SQ_INSTS_VALU_MUL_F64"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F64"][idx])
+ df["SQ_INSTS_VALU_TRANS_F64"][idx]
)
)
+ (df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512)
+ (df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512)
+ (df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512)
+ (df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512)
)
except KeyError:
if verbose >= 3:
print("{}: Skipped total_flops at index {}".format(kernelName[:35], idx))
pass
try:
valu_flops += (
64
* (
df["SQ_INSTS_VALU_ADD_F16"][idx]
+ df["SQ_INSTS_VALU_MUL_F16"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F16"][idx])
+ df["SQ_INSTS_VALU_TRANS_F16"][idx]
)
+ 64
* (
df["SQ_INSTS_VALU_ADD_F32"][idx]
+ df["SQ_INSTS_VALU_MUL_F32"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F32"][idx])
+ df["SQ_INSTS_VALU_TRANS_F32"][idx]
)
+ 64
* (
df["SQ_INSTS_VALU_ADD_F64"][idx]
+ df["SQ_INSTS_VALU_MUL_F64"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F64"][idx])
+ df["SQ_INSTS_VALU_TRANS_F64"][idx]
)
)
except KeyError:
if verbose >= 3:
print("{}: Skipped valu_flops at index {}".format(kernelName[:35], idx))
pass
try:
mfma_flops_f16 += df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512
mfma_flops_bf16 += df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512
mfma_flops_f32 += df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512
mfma_flops_f64 += df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512
mfma_iops_i8 += df["SQ_INSTS_VALU_MFMA_MOPS_I8"][idx] * 512
except KeyError:
if verbose >= 3:
print("{}: Skipped mfma ops at index {}".format(kernelName[:35], idx))
pass
try:
lds_data += (
(df["SQ_LDS_IDX_ACTIVE"][idx] - df["SQ_LDS_BANK_CONFLICT"][idx]) * 4 * L2_BANKS
) # L2_BANKS = 32 (since assuming mi200)
except KeyError:
if verbose >= 3:
print("{}: Skipped lds_data at index {}".format(kernelName[:35], idx))
pass
try:
L1cache_data += df["TCP_TOTAL_CACHE_ACCESSES_sum"][idx] * 64
except KeyError:
if verbose >= 3:
print("{}: Skipped L1cache_data at index {}".format(kernelName[:35], idx))
pass
try:
L2cache_data += (
df["TCP_TCC_WRITE_REQ_sum"][idx] * 64
+ df["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"][idx] * 64
+ df["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"][idx] * 64
+ df["TCP_TCC_READ_REQ_sum"][idx] * 64
)
except KeyError:
if verbose >= 3:
print("{}: Skipped L2cache_data at index {}".format(kernelName[:35], idx))
pass
try:
hbm_data += (
(df["TCC_EA_RDREQ_32B_sum"][idx] * 32)
+ ((df["TCC_EA_RDREQ_sum"][idx] - df["TCC_EA_RDREQ_32B_sum"][idx]) * 64)
+ (df["TCC_EA_WRREQ_64B_sum"][idx] * 64)
+ ((df["TCC_EA_WRREQ_sum"][idx] - df["TCC_EA_WRREQ_64B_sum"][idx]) * 32)
)
except KeyError:
if verbose >= 3:
print("{}: Skipped hbm_data at index {}".format(kernelName[:35], idx))
pass
totalDuration += df["EndNs"][idx] - df["BeginNs"][idx]
avgDuration += df["EndNs"][idx] - df["BeginNs"][idx]
calls += 1
if sortType == "kernels" and (at_end == True or (kernelName != next_kernelName)):
myList.append(
AI_Data(
kernelName,
@@ -243,7 +370,7 @@ def plot_application(sortType, ret_df, verbose):
if verbose >= 2:
print(
"Just added {} to AI_Data at index {}. # of calls: {}".format(
kernelName, index, calls
kernelName, idx, calls
)
)
total_flops = (
@@ -264,129 +391,6 @@ def plot_application(sortType, ret_df, verbose):
L1cache_data
) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
kernelName = row["KernelName"]
try:
total_flops += (
(
64
* (
row["SQ_INSTS_VALU_ADD_F16"]
+ row["SQ_INSTS_VALU_MUL_F16"]
+ (2 * row["SQ_INSTS_VALU_FMA_F16"])
+ row["SQ_INSTS_VALU_TRANS_F16"]
)
)
+ (
64
* (
row["SQ_INSTS_VALU_ADD_F32"]
+ row["SQ_INSTS_VALU_MUL_F32"]
+ (2 * row["SQ_INSTS_VALU_FMA_F32"])
+ row["SQ_INSTS_VALU_TRANS_F32"]
)
)
+ (
64
* (
row["SQ_INSTS_VALU_ADD_F64"]
+ row["SQ_INSTS_VALU_MUL_F64"]
+ (2 * row["SQ_INSTS_VALU_FMA_F64"])
+ row["SQ_INSTS_VALU_TRANS_F64"]
)
)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512)
)
except KeyError:
if verbose >= 2:
print("Skipped total_flops at index {}".format(index))
pass
try:
valu_flops += (
64
* (
row["SQ_INSTS_VALU_ADD_F16"]
+ row["SQ_INSTS_VALU_MUL_F16"]
+ (2 * row["SQ_INSTS_VALU_FMA_F16"])
+ row["SQ_INSTS_VALU_TRANS_F16"]
)
+ 64
* (
row["SQ_INSTS_VALU_ADD_F32"]
+ row["SQ_INSTS_VALU_MUL_F32"]
+ (2 * row["SQ_INSTS_VALU_FMA_F32"])
+ row["SQ_INSTS_VALU_TRANS_F32"]
)
+ 64
* (
row["SQ_INSTS_VALU_ADD_F64"]
+ row["SQ_INSTS_VALU_MUL_F64"]
+ (2 * row["SQ_INSTS_VALU_FMA_F64"])
+ row["SQ_INSTS_VALU_TRANS_F64"]
)
)
except KeyError:
if verbose >= 2:
print("Skipped valu_flops at index {}".format(index))
pass
try:
mfma_flops_f16 += row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512
mfma_flops_bf16 += row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512
mfma_flops_f32 += row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512
mfma_flops_f64 += row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512
mfma_iops_i8 += row["SQ_INSTS_VALU_MFMA_MOPS_I8"] * 512
except KeyError:
if verbose >= 2:
print("Skipped mfma ops at index {}".format(index))
pass
try:
lds_data += (
(row["SQ_LDS_IDX_ACTIVE"] - row["SQ_LDS_BANK_CONFLICT"]) * 4 * L2_BANKS
) # L2_BANKS = 32 (since assuming mi200)
except KeyError:
if verbose >= 2:
print("Skipped lds_data at index {}".format(index))
pass
try:
L1cache_data += row["TCP_TOTAL_CACHE_ACCESSES_sum"] * 64
except KeyError:
if verbose >= 2:
print("Skipped L1cache_data at index {}".format(index))
pass
try:
L2cache_data += (
row["TCP_TCC_WRITE_REQ_sum"] * 64
+ row["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"] * 64
+ row["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"] * 64
+ row["TCP_TCC_READ_REQ_sum"] * 64
)
except KeyError:
if verbose >= 2:
print("Skipped L2cache_data at index {}".format(index))
pass
try:
hbm_data += (
(row["TCC_EA_RDREQ_32B_sum"] * 32)
+ ((row["TCC_EA_RDREQ_sum"] - row["TCC_EA_RDREQ_32B_sum"]) * 64)
+ (row["TCC_EA_WRREQ_64B_sum"] * 64)
+ ((row["TCC_EA_WRREQ_sum"] - row["TCC_EA_WRREQ_64B_sum"]) * 32)
)
except KeyError:
if verbose >= 2:
print("Skipped hbm_data at index {}".format(index))
pass
totalDuration += row["EndNs"] - row["BeginNs"]
avgDuration += row["EndNs"] - row["BeginNs"]
calls += 1
if sortType == "dispatches":
myList.append(
AI_Data(