aa5dfb98f9
* Fix rocprofiler-sdk metrics definition
* Use TCC_EA0_RDREQ_128B instead of TCC_BUBBLE counter for L2 cache to
HBM counters and metrics
* Update MI350 counter definitions
* FETCH_SIZE
* BANDWIDTH_EA
* Update MI350 metrics definitions
* System Speed of Light, L2-Fabric Read BW
* Roofline Plot Points, AI (Arithmetic Intensity) HBM
* Roofline Performance Rates, HBM Bandwidth
* Remove redundant definition for gfx950 and fix BANDWIDTH_EA definition
Test HBM bandwidth metric for memcopy workload
* Add memcopy.cpp workload
* Add metric validation test suite to validate HBM Bandwidth metric for
memcopy workload
* Move gpu_soc() to test_utils.py for better re-usability
* Update TUI analysis config
* Fix hbm bandwidth formula for mi350 in calc_ai_profile
Co-authored-by: Alysa Liu <Alysa.Liu@amd.com>
126 satır
3.8 KiB
C++
126 satır
3.8 KiB
C++
#include <hip/hip_runtime.h>
|
|
#include <iostream>
|
|
#include <iomanip>
|
|
|
|
// Simple memory copy kernel - one thread per element
|
|
__global__ void memoryCopyKernel(const double* __restrict__ X,
|
|
double* __restrict__ Y,
|
|
size_t N) {
|
|
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if (idx < N) {
|
|
Y[idx] = X[idx];
|
|
}
|
|
}
|
|
|
|
int main() {
|
|
const size_t N = 268435456; // Number of elements
|
|
const size_t bytes = N * sizeof(double);
|
|
|
|
// Print array size in GiB
|
|
double sizeGiB = bytes / (1024.0 * 1024.0 * 1024.0);
|
|
std::cout << "Array size: " << std::fixed << std::setprecision(2)
|
|
<< sizeGiB << " GiB (" << bytes << " bytes)" << std::endl;
|
|
std::cout << "Number of elements: " << N << std::endl;
|
|
std::cout << std::string(60, '-') << std::endl;
|
|
|
|
// Allocate device memory
|
|
double *d_X, *d_Y;
|
|
hipMalloc(&d_X, bytes);
|
|
hipMalloc(&d_Y, bytes);
|
|
|
|
// Allocate and initialize host memory
|
|
double *h_X = new double[N];
|
|
for (size_t i = 0; i < N; i++) {
|
|
h_X[i] = static_cast<double>(i);
|
|
}
|
|
|
|
// Copy data to device
|
|
hipMemcpy(d_X, h_X, bytes, hipMemcpyHostToDevice);
|
|
|
|
// Launch configuration
|
|
int blockSize = 256;
|
|
int gridSize = (N + blockSize - 1) / blockSize;
|
|
|
|
std::cout << "Launch configuration:" << std::endl;
|
|
std::cout << " Block size: " << blockSize << std::endl;
|
|
std::cout << " Grid size: " << gridSize << std::endl;
|
|
std::cout << std::string(60, '-') << std::endl;
|
|
|
|
// Warmup run
|
|
memoryCopyKernel<<<gridSize, blockSize>>>(d_X, d_Y, N);
|
|
hipDeviceSynchronize();
|
|
|
|
// Timed runs
|
|
const int numRuns = 1000;
|
|
hipEvent_t start, stop;
|
|
hipEventCreate(&start);
|
|
hipEventCreate(&stop);
|
|
|
|
float totalTime = 0.0f;
|
|
|
|
for (int run = 0; run < numRuns; run++) {
|
|
hipEventRecord(start);
|
|
|
|
memoryCopyKernel<<<gridSize, blockSize>>>(d_X, d_Y, N);
|
|
|
|
hipEventRecord(stop);
|
|
hipEventSynchronize(stop);
|
|
|
|
float milliseconds = 0;
|
|
hipEventElapsedTime(&milliseconds, start, stop);
|
|
totalTime += milliseconds;
|
|
}
|
|
|
|
float avgTime = totalTime / numRuns;
|
|
|
|
// Calculate bandwidth
|
|
// Memory copy reads N elements and writes N elements = 2*N*sizeof(double) bytes
|
|
double totalBytesTransferred = 2.0 * N * sizeof(double);
|
|
double bandwidthGBps = (totalBytesTransferred / (avgTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0);
|
|
|
|
std::cout << "Performance Results (averaged over " << numRuns << " runs):" << std::endl;
|
|
std::cout << " Kernel execution time: " << std::fixed << std::setprecision(3)
|
|
<< avgTime << " ms" << std::endl;
|
|
std::cout << " Data transferred: " << std::setprecision(2)
|
|
<< (totalBytesTransferred / (1024.0 * 1024.0 * 1024.0)) << " GiB" << std::endl;
|
|
std::cout << " Achieved bandwidth: " << std::setprecision(2)
|
|
<< bandwidthGBps << " GiB/s" << std::endl;
|
|
std::cout << std::string(60, '-') << std::endl;
|
|
|
|
// Verify correctness
|
|
double *h_Y = new double[N];
|
|
hipMemcpy(h_Y, d_Y, bytes, hipMemcpyDeviceToHost);
|
|
|
|
bool correct = true;
|
|
for (size_t i = 0; i < N; i++) {
|
|
if (h_Y[i] != h_X[i]) {
|
|
correct = false;
|
|
std::cout << "Mismatch at index " << i << ": "
|
|
<< h_Y[i] << " != " << h_X[i] << std::endl;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (correct) {
|
|
std::cout << "✓ Verification PASSED - all elements copied correctly" << std::endl;
|
|
} else {
|
|
std::cout << "✗ Verification FAILED" << std::endl;
|
|
}
|
|
|
|
std::cout << "\nFirst 5 elements:" << std::endl;
|
|
for (int i = 0; i < 5; i++) {
|
|
std::cout << " Y[" << i << "] = " << h_Y[i] << std::endl;
|
|
}
|
|
|
|
// Cleanup
|
|
hipEventDestroy(start);
|
|
hipEventDestroy(stop);
|
|
delete[] h_X;
|
|
delete[] h_Y;
|
|
hipFree(d_X);
|
|
hipFree(d_Y);
|
|
|
|
return 0;
|
|
}
|