Add example apps to compliment docs
Co-authored-by: Nick Curtis <nicholas.curtis@amd.com>
Signed-off-by: colramos-amd <colramos@amd.com>
[ROCm/rocprofiler-compute commit: 54f0fa8c81]
This commit is contained in:
gecommit door
Cole Ramos
bovenliggende
3b0dce88ca
commit
265c0f4856
@@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <iostream>
|
||||
|
||||
#define hipCheck(stmt) \
|
||||
do { \
|
||||
hipError_t err = stmt; \
|
||||
if (err != hipSuccess) { \
|
||||
char msg[256]; \
|
||||
sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \
|
||||
__FUNCTION__, __LINE__); \
|
||||
std::string errstring = hipGetErrorString(err); \
|
||||
std::cerr << msg << "\t" << errstring << std::endl; \
|
||||
throw std::runtime_error(msg); \
|
||||
} \
|
||||
} while (0)
|
||||
@@ -0,0 +1,315 @@
|
||||
/*
|
||||
##############################################################################bl
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
##############################################################################el
|
||||
|
||||
|
||||
|
||||
A data-fabric exerciser example, written by Nicholas Curtis [AMD]
|
||||
|
||||
The test allows the user to control the:
|
||||
- The granularity of an allocation (Coarse vs Fine-grained),
|
||||
- The owner of an allocation (local HBM, CPU DRAM or remote HBM),
|
||||
- The size of an allocation (the default is ~4GiB), and
|
||||
- The type of operation we are executing (read, write, atomics of various flavors)
|
||||
|
||||
This lets the user explore the impact of these choices on the generated
|
||||
data-fabric traffic.
|
||||
*/
|
||||
|
||||
|
||||
#include <getopt.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
enum class mtype : int { FineGrained = 0, CoarseGrained = 1, Undef = 3 };
|
||||
enum class mowner : int { Device = 0, Host = 1, Remote = 2, Undef = 3 };
|
||||
enum class mspace : int { Global = 0, Undef = 1 };
|
||||
enum class mop : int {
|
||||
Read = 0,
|
||||
Write = 1,
|
||||
AtomicAdd = 2,
|
||||
AtomicCas = 3,
|
||||
AtomicOr = 4,
|
||||
AtomicMax = 5,
|
||||
Undef = 6
|
||||
};
|
||||
enum class mdata : int { Unsigned = 0, UnsignedLong = 1, Float = 2, Double = 3, Undef = 4 };
|
||||
|
||||
template<typename T>
|
||||
T parse(const char* value) {
|
||||
int ivalue = std::atoi(value);
|
||||
if (ivalue < 0 || ivalue >= int(T::Undef)) {
|
||||
throw std::runtime_error("bad enum value!");
|
||||
}
|
||||
return T(ivalue);
|
||||
}
|
||||
|
||||
void parse(int argc, char** argv, mtype& mytype, mowner& myowner,
|
||||
mspace& myspace, size_t& size, mop& myop, mdata& mydata,
|
||||
int& remoteId) {
|
||||
while (1) {
|
||||
static struct option long_options[] = {
|
||||
/* These options set a flag. */
|
||||
{"type", required_argument, 0, 't'},
|
||||
{"owner", required_argument, 0, 'o'},
|
||||
{"size", required_argument, 0, 'z'},
|
||||
{"op", required_argument, 0, 'p'},
|
||||
{"remote", required_argument, 0, 'r'},
|
||||
{"data", required_argument, 0, 'd'},
|
||||
{0, 0, 0, 0}};
|
||||
/* getopt_long stores the option index here. */
|
||||
int option_index = 0;
|
||||
|
||||
int c =
|
||||
getopt_long(argc, argv, "t:o:z:p:r:d:", long_options, &option_index);
|
||||
|
||||
/* Detect the end of the options. */
|
||||
if (c == -1) break;
|
||||
|
||||
switch (c) {
|
||||
case 't':
|
||||
mytype = parse<mtype>(optarg);
|
||||
break;
|
||||
|
||||
case 'o':
|
||||
myowner = parse<mowner>(optarg);
|
||||
break;
|
||||
|
||||
case 'z':
|
||||
size = std::atoll(optarg);
|
||||
break;
|
||||
|
||||
case 'p':
|
||||
myop = parse<mop>(optarg);
|
||||
break;
|
||||
|
||||
case 'r':
|
||||
remoteId = std::atoi(optarg);
|
||||
break;
|
||||
|
||||
case 'd':
|
||||
mydata = parse<mdata>(optarg);
|
||||
break;
|
||||
|
||||
case '?':
|
||||
/* getopt_long already printed an error message. */
|
||||
break;
|
||||
|
||||
default:
|
||||
abort();
|
||||
}
|
||||
}
|
||||
std::cout << "Using: " << std::endl;
|
||||
std::cout << "\tmtype:"
|
||||
<< ((mytype == mtype::FineGrained) ? "FineGrained"
|
||||
: "CoarseGrained")
|
||||
<< std::endl;
|
||||
std::cout << "\tmowner:"
|
||||
<< ((myowner == mowner::Device)
|
||||
? "Device"
|
||||
: ((myowner == mowner::Host) ? "Host" : "Remote"))
|
||||
<< std::endl;
|
||||
std::cout << "\tmspace:Global" << std::endl;
|
||||
std::cout << "\tmop:" << ((myop == mop::Read) ? "Read" : (myop == mop::Write ? "Write" : (myop == mop::AtomicAdd ? "Add" : (myop == mop::AtomicCas ? "CAS" : (myop == mop::AtomicOr ? "Or" : "Max"))))) << std::endl;
|
||||
std::cout << "\tmdata:" << (mydata == mdata::Unsigned ? "Unsigned" : (mydata == mdata::UnsignedLong ? "Unsigned Long" : (mydata == mdata::Float ? "Float" : "Double"))) << std::endl;
|
||||
std::cout << "\tremoteId:" << remoteId << std::endl;
|
||||
}
|
||||
|
||||
// dummy intialization kernel
|
||||
__global__ void init() {}
|
||||
|
||||
template <typename T>
|
||||
void alloc(mtype memory, mowner owner, T** ptr, size_t Nbytes, int devId,
|
||||
int remoteId) {
|
||||
bool is_device = (owner == mowner::Device) || (owner == mowner::Remote);
|
||||
if (owner == mowner::Remote) {
|
||||
// enable remote access
|
||||
hipCheck(hipDeviceEnablePeerAccess(remoteId, 0));
|
||||
// set id for alloc
|
||||
hipCheck(hipSetDevice(remoteId));
|
||||
}
|
||||
init<<<1, 1>>>();
|
||||
|
||||
if (memory == mtype::FineGrained && is_device) {
|
||||
hipCheck(
|
||||
hipExtMallocWithFlags((void**)ptr, Nbytes, hipDeviceMallocFinegrained));
|
||||
} else if (memory == mtype::CoarseGrained && is_device) {
|
||||
hipCheck(hipMalloc(ptr, Nbytes));
|
||||
} else if (memory == mtype::FineGrained && owner == mowner::Host) {
|
||||
hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocCoherent));
|
||||
} else if (memory == mtype::CoarseGrained && owner == mowner::Host) {
|
||||
hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocNonCoherent));
|
||||
} else {
|
||||
assert(false && "unknown combo");
|
||||
}
|
||||
|
||||
// set to random
|
||||
std::vector<T> host(Nbytes / sizeof(T), T(0));
|
||||
hipCheck(hipMemcpy(*ptr, &host[0], Nbytes,
|
||||
(is_device ? hipMemcpyHostToDevice : hipMemcpyHostToHost)));
|
||||
|
||||
if (owner == mowner::Remote) {
|
||||
// reset id for execution
|
||||
hipCheck(hipSetDevice(devId));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void release(mtype memory, mowner owner, T* ptr) {
|
||||
bool is_device = (owner == mowner::Device) || (owner == mowner::Remote);
|
||||
if (memory == mtype::FineGrained && is_device) {
|
||||
hipCheck(hipFree(ptr));
|
||||
} else if (memory == mtype::CoarseGrained && is_device) {
|
||||
hipCheck(hipFree(ptr));
|
||||
} else if (memory == mtype::FineGrained && owner == mowner::Host) {
|
||||
hipCheck(hipHostFree(ptr));
|
||||
} else if (memory == mtype::CoarseGrained && owner == mowner::Host) {
|
||||
hipCheck(hipHostFree(ptr));
|
||||
} else {
|
||||
assert(false && "unknown combo");
|
||||
}
|
||||
}
|
||||
|
||||
// the main streaming kernel
|
||||
template <mop op, typename T, int repeats = 10>
|
||||
__global__ void kernel(T* x, size_t N, T zero, T foo) {
|
||||
int sum = 0;
|
||||
const size_t offset_start = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
for (int i = 0; i < repeats; ++i) {
|
||||
for (size_t offset = offset_start; offset < N;
|
||||
offset += blockDim.x * gridDim.x) {
|
||||
T uniq = (foo + offset) + i;
|
||||
if constexpr (op == mop::Read) {
|
||||
sum += x[offset];
|
||||
} else if constexpr (op == mop::Write) {
|
||||
x[offset] = (T)offset;
|
||||
} else if constexpr (op == mop::AtomicAdd) {
|
||||
atomicAdd(&x[offset], uniq);
|
||||
} else if constexpr (op == mop::AtomicCas) {
|
||||
atomicCAS(&x[offset], uniq, uniq);
|
||||
} else if constexpr (op == mop::AtomicOr) {
|
||||
atomicOr(&x[offset], uniq);
|
||||
} else if constexpr (op == mop::AtomicMax) {
|
||||
atomicMax(&x[offset], uniq);
|
||||
}
|
||||
}
|
||||
}
|
||||
if constexpr (op == mop::Read) {
|
||||
if (sum != 0) {
|
||||
x[offset_start] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <mop op, typename T, int nrepeats = 10>
|
||||
void run_kernel(T* x, size_t size) {
|
||||
if constexpr (op == mop::AtomicOr && std::is_floating_point_v<T>) {
|
||||
throw std::runtime_error("bad");
|
||||
} else {
|
||||
kernel<op, T, nrepeats><<<4096, 1024>>>(x, size, 0, T(23456789));
|
||||
// then run once for data collection
|
||||
kernel<op, T, nrepeats><<<4096, 1024>>>(x, size, 0, T(23456789));
|
||||
}
|
||||
}
|
||||
|
||||
template <mop op, typename T>
|
||||
void run_atomic(mowner myowner, T* x, size_t size) {
|
||||
if (myowner == mowner::Host) {
|
||||
// speed it up
|
||||
run_kernel<op, T, 1>(x, size / 10);
|
||||
} else {
|
||||
run_kernel<op, T>(x, size);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void run(mtype mytype, mspace myspace, mowner myowner, mop myop, int remoteId,
|
||||
size_t size) {
|
||||
int devId = 0;
|
||||
if (myowner == mowner::Remote && remoteId == -1) {
|
||||
// need to find a remote GPU
|
||||
int ndevices;
|
||||
hipCheck(hipGetDeviceCount(&ndevices));
|
||||
if (ndevices <= 1) {
|
||||
throw std::runtime_error(
|
||||
"Need >=2 devices available for mowner = Remote");
|
||||
}
|
||||
for (int i = 0; i < ndevices; ++i) {
|
||||
if (i != devId) {
|
||||
remoteId = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
T* x;
|
||||
alloc(mytype, myowner, &x, size * sizeof(T), devId, remoteId);
|
||||
|
||||
// run the kernel once for warmup
|
||||
assert(4096 * 1024 < size);
|
||||
if (myop == mop::Read) {
|
||||
run_kernel<mop::Read>(x, size);
|
||||
} else if (myop == mop::Write) {
|
||||
run_kernel<mop::Write>(x, size);
|
||||
} else if (myop == mop::AtomicAdd) {
|
||||
run_atomic<mop::AtomicAdd>(myowner, x, size);
|
||||
} else if (myop == mop::AtomicCas) {
|
||||
run_atomic<mop::AtomicCas>(myowner, x, size);
|
||||
} else if (myop == mop::AtomicOr) {
|
||||
run_atomic<mop::AtomicOr>(myowner, x, size);
|
||||
} else if (myop == mop::AtomicMax) {
|
||||
run_atomic<mop::AtomicMax>(myowner, x, size);
|
||||
} else {
|
||||
throw std::runtime_error("bad");
|
||||
}
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
release(mytype, myowner, x);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
mtype mytype = (mtype)0;
|
||||
mspace myspace = (mspace)0;
|
||||
mowner myowner = (mowner)0;
|
||||
mop myop = (mop)0;
|
||||
mdata mydata = (mdata)0;
|
||||
int remoteId = -1;
|
||||
size_t size = 1024ull * 1024ull *
|
||||
1024ull; // 4 GiB, purposefully much larger than caches.
|
||||
parse(argc, argv, mytype, myowner, myspace, size, myop, mydata, remoteId);
|
||||
if (mydata == mdata::Unsigned)
|
||||
run<unsigned>(mytype, myspace, myowner, myop, remoteId, size);
|
||||
else if (mydata == mdata::UnsignedLong)
|
||||
run<unsigned long>(mytype, myspace, myowner, myop, remoteId, size);
|
||||
else if (mydata == mdata::Float)
|
||||
run<float>(mytype, myspace, myowner, myop, remoteId, size);
|
||||
else if (mydata == mdata::Double)
|
||||
run<double>(mytype, myspace, myowner, myop, remoteId, size);
|
||||
else {
|
||||
throw std::runtime_error("bad");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
##############################################################################bl
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
##############################################################################el
|
||||
|
||||
|
||||
|
||||
A instruction mix exerciser example, written by Gina Sitaraman and Nicholas Curtis [AMD].
|
||||
Although inline assembly is inherently unportable, this is expected to work on all CDNA accelerators.
|
||||
*/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
__global__ void kernelasm() {
|
||||
// int32
|
||||
int i, j;
|
||||
asm volatile("v_add_u32_e32 %0, %1, %0\n" : "=v"(j) : "v"(i));
|
||||
|
||||
// int 64
|
||||
long int l1, l2;
|
||||
asm volatile("v_cmp_eq_i64 %0, %1\n" : "=v"(l2) : "v"(l1), "v"(i));
|
||||
|
||||
// fp32: add, mul, transcendental and fma
|
||||
float f1, f2;
|
||||
asm volatile(
|
||||
"v_add_f32_e32 %0, %1, %0\n"
|
||||
"v_mul_f32_e32 %0, %1, %0\n"
|
||||
"v_sqrt_f32 %0, %1\n"
|
||||
"v_fma_f32 %0, %1, %0, %1\n"
|
||||
: "=v"(f1)
|
||||
: "v"(f2));
|
||||
|
||||
// fp64: add, mul, transcendental and fma
|
||||
double d1, d2, d3, d4;
|
||||
asm volatile(
|
||||
"v_add_f64 %0, %1, %0\n"
|
||||
"v_mul_f64 %0, %1, %0\n"
|
||||
"v_fma_f64 %0, %1, %0, %1\n"
|
||||
"v_sqrt_f64 %0, %1\n"
|
||||
"v_min_f64 %0, %1, %0\n"
|
||||
: "+v"(d1)
|
||||
: "v"(d2));
|
||||
|
||||
// fp16: add, mul, transcendental and fma
|
||||
_Float16 h1, h2;
|
||||
asm volatile(
|
||||
"v_add_f16_e32 %0, %1, %0\n"
|
||||
"v_mul_f16_e32 %0, %1, %0\n"
|
||||
"v_sqrt_f16 %0, %1\n"
|
||||
"v_cvt_f16_f32 %0 %2\n"
|
||||
"v_fma_f16 %0, %1, %0, %0\n"
|
||||
: "=v"(h2)
|
||||
: "v"(h1), "v"(f1));
|
||||
|
||||
// MFMA ops
|
||||
double2 dd;
|
||||
unsigned short us;
|
||||
long2 ll;
|
||||
#if defined(__gfx90a__)
|
||||
asm volatile("v_mfma_f64_4x4x4f64 %0 %1 %2 %3\n"
|
||||
: "=v"(d4)
|
||||
: "v"(d1), "v"(d2), "v"(d3));
|
||||
asm volatile("v_mfma_f32_16x16x4f32 %0 %1 %2 1\n"
|
||||
: "=v"(dd)
|
||||
: "v"(f1), "v"(f2));
|
||||
asm volatile("v_mfma_f32_16x16x16f16 %0 %1 %2 1\n"
|
||||
: "=v"(dd)
|
||||
: "v"(d1), "v"(d2));
|
||||
asm volatile("v_mfma_f32_16x16x8bf16 %0 %1 %2 1\n"
|
||||
: "=v"(dd)
|
||||
: "v"(f1), "v"(f2));
|
||||
asm volatile("v_mfma_i32_16x16x16i8 %0 %1 %2 1\n"
|
||||
: "=v"(ll)
|
||||
: "v"(i), "v"(j));
|
||||
#endif
|
||||
|
||||
// Scalar op
|
||||
asm volatile("s_add_i32 %0 %1 %0\n" : "=s"(j) : "s"(i));
|
||||
|
||||
// LDS
|
||||
asm volatile("ds_read_b32 %0 %0\n" : "=v"(i) : "v"(j));
|
||||
|
||||
// Branch
|
||||
asm volatile(
|
||||
"s_branch .LDUMMY\n"
|
||||
".LDUMMY:\n"
|
||||
"s_endpgm\n");
|
||||
}
|
||||
int main() {
|
||||
kernelasm<<<1, 64>>>();
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
}
|
||||
@@ -0,0 +1,127 @@
|
||||
/*
|
||||
##############################################################################bl
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
##############################################################################el
|
||||
|
||||
|
||||
|
||||
An example to explore IPC and divergence, written by Nicholas Curtis [AMD].
|
||||
This example may not work on all CDNA accelerators, but has been verified on MI2XX.
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
template<int N=1000>
|
||||
__device__ void vrcp_op() {
|
||||
int dummy;
|
||||
if constexpr (N >= 1) {
|
||||
asm volatile("v_rcp_f64 v[0:1], v[0:1]\n" : : "{v31}"(dummy));
|
||||
vrcp_op<N - 1>();
|
||||
}
|
||||
}
|
||||
|
||||
template<int N=1000>
|
||||
__global__ void vrcp() {
|
||||
vrcp_op<N>();
|
||||
}
|
||||
|
||||
template<int N=1000>
|
||||
__device__ void vmov_op() {
|
||||
int dummy;
|
||||
if constexpr (N >= 1) {
|
||||
asm volatile("v_mov_b32 v0, v1\n" : : "{v31}"(dummy));
|
||||
vmov_op<N - 1>();
|
||||
}
|
||||
}
|
||||
|
||||
template<int N=1000>
|
||||
__global__ void vmov() {
|
||||
vmov_op<N>();
|
||||
}
|
||||
|
||||
template<int N=1000>
|
||||
__device__ void mfma_op() {
|
||||
int dummy;
|
||||
if constexpr (N >= 1) {
|
||||
asm volatile("v_mfma_f32_32x32x8bf16_1k v[0:15], v[16:17], v[18:19], v[0:15]\n" : : "{v31}"(dummy));
|
||||
mfma_op<N - 1>();
|
||||
}
|
||||
}
|
||||
|
||||
template<int N=1000>
|
||||
__global__ void mfma() {
|
||||
mfma_op<N>();
|
||||
}
|
||||
|
||||
template<int N=1000>
|
||||
__device__ void snop_op() {
|
||||
int dummy;
|
||||
if constexpr (N >= 1) {
|
||||
asm volatile("s_nop 0x0\n" : : "{v31}"(dummy));
|
||||
snop_op<N - 1>();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<int N=1000>
|
||||
__global__ void snop() {
|
||||
snop_op<N>();
|
||||
}
|
||||
|
||||
template<int N=1000>
|
||||
__device__ void smov_op() {
|
||||
int dummy;
|
||||
if constexpr (N >= 1) {
|
||||
asm volatile("s_mov_b32 s0, s1\n" : : "{s31}"(dummy));
|
||||
smov_op<N - 1>();
|
||||
}
|
||||
}
|
||||
|
||||
template<int N=1000>
|
||||
__global__ void smov() {
|
||||
smov_op<N>();
|
||||
}
|
||||
|
||||
template<int N=1000>
|
||||
__global__ void vmov_with_divergence() {
|
||||
if (threadIdx.x % 64 == 0)
|
||||
vmov_op<N>();
|
||||
}
|
||||
|
||||
int main() {
|
||||
// warmups, spam to all CUs
|
||||
vrcp<<<1024 * 1024, 1024>>>();
|
||||
vmov<<<1024 * 1024, 1024>>>();
|
||||
mfma<<<1024 * 1024, 1024>>>();
|
||||
snop<<<1024 * 1024, 1024>>>();
|
||||
smov<<<1024 * 1024, 1024>>>();
|
||||
vmov_with_divergence<<<1024 * 1024, 1024>>>();
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
vrcp<<<1024 * 1024, 1024>>>();
|
||||
vmov<<<1024 * 1024, 1024>>>();
|
||||
mfma<<<1024 * 1024, 1024>>>();
|
||||
snop<<<1024 * 1024, 1024>>>();
|
||||
smov<<<1024 * 1024, 1024>>>();
|
||||
vmov_with_divergence<<<1024 * 1024, 1024>>>();
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
}
|
||||
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
##############################################################################bl
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
##############################################################################el
|
||||
|
||||
|
||||
|
||||
An example to explore LDS bandwidth and bank conflicts, written by Nicholas Curtis [AMD].
|
||||
*/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
constexpr unsigned max_threads = 256;
|
||||
constexpr unsigned nbanks = 32;
|
||||
|
||||
__global__ void load(int* out, int flag) {
|
||||
__shared__ int array[max_threads];
|
||||
int index = threadIdx.x;
|
||||
// fake a store to the LDS array to avoid unwanted behavior
|
||||
if (flag)
|
||||
array[max_threads - index] = index;
|
||||
__syncthreads();
|
||||
int x = array[index];
|
||||
if (x == int(-1234567))
|
||||
out[threadIdx.x] = x;
|
||||
}
|
||||
|
||||
__global__ void conflicts(int* out, int flag) {
|
||||
constexpr unsigned nelements = nbanks * max_threads;
|
||||
__shared__ int array[nelements];
|
||||
// each thread reads from the same bank
|
||||
int index = threadIdx.x * nbanks;
|
||||
// fake a store to the LDS array to avoid unwanted behavior
|
||||
if (flag)
|
||||
array[max_threads - index] = index;
|
||||
__syncthreads();
|
||||
int x = array[index];
|
||||
if (x == int(-1234567))
|
||||
out[threadIdx.x] = x;
|
||||
}
|
||||
|
||||
void bandwidth_demo(int N) {
|
||||
for (int i = 1; i <= N; ++i)
|
||||
load<<<1,i>>>(nullptr, 0);
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
void conflicts_demo(int N) {
|
||||
for (int i = 1; i <= N; ++i)
|
||||
conflicts<<<1,i>>>(nullptr, 0);
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
int main() {
|
||||
bandwidth_demo(max_threads);
|
||||
conflicts_demo(max_threads);
|
||||
}
|
||||
@@ -0,0 +1,109 @@
|
||||
/*
|
||||
##############################################################################bl
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
##############################################################################el
|
||||
|
||||
|
||||
|
||||
An example to explore achieved occupancy, and various occupancy limiters.
|
||||
Written by Nicholas Curtis [AMD].
|
||||
*/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
__global__ void empty(int N, double* ptr) {
|
||||
|
||||
}
|
||||
|
||||
constexpr int bound = 16;
|
||||
__launch_bounds__(256)
|
||||
__global__ void vgprbound(int N, double* ptr) {
|
||||
double intermediates[bound];
|
||||
for (int i = 0 ; i < bound; ++i) intermediates[i] = N * threadIdx.x;
|
||||
double x = ptr[threadIdx.x];
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % bound], intermediates[i % bound]));
|
||||
intermediates[i % bound] = x;
|
||||
}
|
||||
if (x == N) ptr[threadIdx.x] = x;
|
||||
}
|
||||
|
||||
constexpr size_t fully_allocate_lds = 64ul * 1024ul / sizeof(double);
|
||||
__launch_bounds__(256)
|
||||
__global__ void ldsbound(int N, double* ptr) {
|
||||
__shared__ double intermediates[fully_allocate_lds];
|
||||
for (int i = threadIdx.x ; i < fully_allocate_lds; i += blockDim.x) intermediates[i] = N * threadIdx.x;
|
||||
__syncthreads();
|
||||
double x = ptr[threadIdx.x];
|
||||
for (int i = threadIdx.x; i < fully_allocate_lds; i += blockDim.x) {
|
||||
x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % fully_allocate_lds], intermediates[i % fully_allocate_lds]));
|
||||
__syncthreads();
|
||||
intermediates[i % fully_allocate_lds] = x;
|
||||
}
|
||||
if (x == N) ptr[threadIdx.x] = x;
|
||||
}
|
||||
|
||||
constexpr int sgprlim = 1;
|
||||
__launch_bounds__(1024, 8)
|
||||
__global__ void sgprbound(int N, double* ptr) {
|
||||
double intermediates[sgprlim];
|
||||
for (int i = 0 ; i < sgprlim; ++i) intermediates[i] = i;
|
||||
double x = ptr[0];
|
||||
#pragma unroll 1
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
x += sin(pow(intermediates[(i - 1) % sgprlim], intermediates[i % sgprlim]));
|
||||
intermediates[i % sgprlim] = x;
|
||||
}
|
||||
if (x == N) ptr[0] = x;
|
||||
}
|
||||
|
||||
int main() {
|
||||
double* ptr;
|
||||
hipCheck(hipMalloc(&ptr, 1024 * sizeof(double)));
|
||||
vgprbound<<<1024 * 1024, 256>>>(0, ptr);
|
||||
hipCheck(hipGetLastError());
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
vgprbound<<<1024 * 1024, 256>>>(0, ptr);
|
||||
hipCheck(hipGetLastError());
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
ldsbound<<<1024 * 1024, 256>>>(0, ptr);
|
||||
hipCheck(hipGetLastError());
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
ldsbound<<<1024 * 1024, 256>>>(0, ptr);
|
||||
hipCheck(hipGetLastError());
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
sgprbound<<<1024 * 1024, 256>>>(0, ptr);
|
||||
hipCheck(hipGetLastError());
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
sgprbound<<<1024 * 1024, 256>>>(0, ptr);
|
||||
hipCheck(hipGetLastError());
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
empty<<<1024 * 1024, 256>>>(0, ptr);
|
||||
hipCheck(hipGetLastError());
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
empty<<<1024 * 1024, 256>>>(0, ptr);
|
||||
hipCheck(hipGetLastError());
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
hipCheck(hipFree(ptr));
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
##############################################################################bl
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
##############################################################################el
|
||||
|
||||
|
||||
|
||||
An example to explore spill/stack instructions.
|
||||
Written by Nicholas Curtis [AMD].
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
__global__ void knl(int* out, int filter) {
|
||||
int x[1024];
|
||||
x[filter] = 0;
|
||||
if (threadIdx.x < filter) out[threadIdx.x] = x[threadIdx.x];
|
||||
}
|
||||
|
||||
int main() {
|
||||
knl<<<1, 1>>>(nullptr, 0);
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
}
|
||||
@@ -1,3 +1,29 @@
|
||||
/*
|
||||
##############################################################################bl
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
##############################################################################el
|
||||
*/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
/*
|
||||
##############################################################################bl
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
##############################################################################el
|
||||
|
||||
|
||||
|
||||
An example to explore global/generic instructions.
|
||||
Written by Nicholas Curtis [AMD].
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
typedef int __attribute__((address_space(0)))* generic_ptr;
|
||||
|
||||
__attribute__((noinline)) __device__ void generic_store(generic_ptr ptr, int zero) { *ptr = zero; }
|
||||
__attribute__((noinline)) __device__ int generic_load(generic_ptr ptr) { return *ptr; }
|
||||
__attribute__((noinline)) __device__ void generic_atomic(generic_ptr ptr, int zero) { atomicAdd((int*)ptr, zero); }
|
||||
|
||||
__global__ void global_write(int* ptr, int zero) {
|
||||
ptr[threadIdx.x] = zero;
|
||||
}
|
||||
|
||||
__global__ void generic_write(int* ptr, int zero, int filter) {
|
||||
__shared__ int lds[1024];
|
||||
int* generic = (threadIdx.x < filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x];
|
||||
generic_store((generic_ptr)generic, zero);
|
||||
}
|
||||
|
||||
__global__ void global_read(int* ptr, int zero) {
|
||||
int x = ptr[threadIdx.x];
|
||||
if (x != zero) {
|
||||
ptr[threadIdx.x] = x + 1;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void generic_read(int* ptr, int zero, int filter) {
|
||||
__shared__ int lds[1024];
|
||||
if (static_cast<int>(filter - 1) == zero) {
|
||||
lds[threadIdx.x] = 0; // initialize to zero to avoid conditional, but hide behind _another_ conditional
|
||||
}
|
||||
int* generic;
|
||||
if (static_cast<int>(threadIdx.x) > filter - 1) {
|
||||
generic = &ptr[threadIdx.x];
|
||||
} else {
|
||||
generic = &lds[threadIdx.x];
|
||||
abort();
|
||||
}
|
||||
int x = generic_load((generic_ptr)generic);
|
||||
if (x != zero) {
|
||||
ptr[threadIdx.x] = x + 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void global_atomic(int* ptr, int zero) {
|
||||
atomicAdd(ptr, zero);
|
||||
}
|
||||
|
||||
__global__ void generic_atomic(int* ptr, int filter, int zero) {
|
||||
__shared__ int lds[1024];
|
||||
int* generic = (threadIdx.x % 2 == filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x];
|
||||
generic_atomic((generic_ptr)generic, zero);
|
||||
}
|
||||
|
||||
int main() {
|
||||
int* ptr;
|
||||
hipCheck(hipMalloc(&ptr, sizeof(int)));
|
||||
hipCheck(hipMemset(ptr, 0, sizeof(int)));
|
||||
global_write<<<1,1>>>(ptr, 0);
|
||||
generic_write<<<1,1>>>(ptr, 0, 0);
|
||||
global_read<<<1,1>>>(ptr, 0);
|
||||
generic_read<<<1,1>>>(ptr, 0, 0);
|
||||
global_atomic<<<1,1>>>(ptr, 0);
|
||||
generic_atomic<<<1,1>>>(ptr, 0, 0);
|
||||
hipCheck(hipDeviceSynchronize());
|
||||
hipCheck(hipFree(ptr));
|
||||
}
|
||||
Verwijs in nieuw issue
Block a user