diff --git a/projects/rocprofiler-compute/sample/common.h b/projects/rocprofiler-compute/sample/common.h new file mode 100644 index 0000000000..b6edfeab06 --- /dev/null +++ b/projects/rocprofiler-compute/sample/common.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +#define hipCheck(stmt) \ + do { \ + hipError_t err = stmt; \ + if (err != hipSuccess) { \ + char msg[256]; \ + sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ + __FUNCTION__, __LINE__); \ + std::string errstring = hipGetErrorString(err); \ + std::cerr << msg << "\t" << errstring << std::endl; \ + throw std::runtime_error(msg); \ + } \ + } while (0) diff --git a/projects/rocprofiler-compute/sample/fabric.hip b/projects/rocprofiler-compute/sample/fabric.hip new file mode 100644 index 0000000000..2c1f6b5ffd --- /dev/null +++ b/projects/rocprofiler-compute/sample/fabric.hip @@ -0,0 +1,315 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +A data-fabric exerciser example, written by Nicholas Curtis [AMD] + +The test allows the user to control the: + - The granularity of an allocation (Coarse vs Fine-grained), + - The owner of an allocation (local HBM, CPU DRAM or remote HBM), + - The size of an allocation (the default is ~4GiB), and + - The type of operation we are executing (read, write, atomics of various flavors) + +This lets the user explore the impact of these choices on the generated +data-fabric traffic. +*/ + + +#include +#include + +#include +#include + +#include "common.h" + +enum class mtype : int { FineGrained = 0, CoarseGrained = 1, Undef = 3 }; +enum class mowner : int { Device = 0, Host = 1, Remote = 2, Undef = 3 }; +enum class mspace : int { Global = 0, Undef = 1 }; +enum class mop : int { + Read = 0, + Write = 1, + AtomicAdd = 2, + AtomicCas = 3, + AtomicOr = 4, + AtomicMax = 5, + Undef = 6 +}; +enum class mdata : int { Unsigned = 0, UnsignedLong = 1, Float = 2, Double = 3, Undef = 4 }; + +template +T parse(const char* value) { + int ivalue = std::atoi(value); + if (ivalue < 0 || ivalue >= int(T::Undef)) { + throw std::runtime_error("bad enum value!"); + } + return T(ivalue); +} + +void parse(int argc, char** argv, mtype& mytype, mowner& myowner, + mspace& myspace, size_t& size, mop& myop, mdata& mydata, + int& remoteId) { + while (1) { + static struct option long_options[] = { + /* These options set a flag. */ + {"type", required_argument, 0, 't'}, + {"owner", required_argument, 0, 'o'}, + {"size", required_argument, 0, 'z'}, + {"op", required_argument, 0, 'p'}, + {"remote", required_argument, 0, 'r'}, + {"data", required_argument, 0, 'd'}, + {0, 0, 0, 0}}; + /* getopt_long stores the option index here. */ + int option_index = 0; + + int c = + getopt_long(argc, argv, "t:o:z:p:r:d:", long_options, &option_index); + + /* Detect the end of the options. */ + if (c == -1) break; + + switch (c) { + case 't': + mytype = parse(optarg); + break; + + case 'o': + myowner = parse(optarg); + break; + + case 'z': + size = std::atoll(optarg); + break; + + case 'p': + myop = parse(optarg); + break; + + case 'r': + remoteId = std::atoi(optarg); + break; + + case 'd': + mydata = parse(optarg); + break; + + case '?': + /* getopt_long already printed an error message. */ + break; + + default: + abort(); + } + } + std::cout << "Using: " << std::endl; + std::cout << "\tmtype:" + << ((mytype == mtype::FineGrained) ? "FineGrained" + : "CoarseGrained") + << std::endl; + std::cout << "\tmowner:" + << ((myowner == mowner::Device) + ? "Device" + : ((myowner == mowner::Host) ? "Host" : "Remote")) + << std::endl; + std::cout << "\tmspace:Global" << std::endl; + std::cout << "\tmop:" << ((myop == mop::Read) ? "Read" : (myop == mop::Write ? "Write" : (myop == mop::AtomicAdd ? "Add" : (myop == mop::AtomicCas ? "CAS" : (myop == mop::AtomicOr ? "Or" : "Max"))))) << std::endl; + std::cout << "\tmdata:" << (mydata == mdata::Unsigned ? "Unsigned" : (mydata == mdata::UnsignedLong ? "Unsigned Long" : (mydata == mdata::Float ? "Float" : "Double"))) << std::endl; + std::cout << "\tremoteId:" << remoteId << std::endl; +} + +// dummy intialization kernel +__global__ void init() {} + +template +void alloc(mtype memory, mowner owner, T** ptr, size_t Nbytes, int devId, + int remoteId) { + bool is_device = (owner == mowner::Device) || (owner == mowner::Remote); + if (owner == mowner::Remote) { + // enable remote access + hipCheck(hipDeviceEnablePeerAccess(remoteId, 0)); + // set id for alloc + hipCheck(hipSetDevice(remoteId)); + } + init<<<1, 1>>>(); + + if (memory == mtype::FineGrained && is_device) { + hipCheck( + hipExtMallocWithFlags((void**)ptr, Nbytes, hipDeviceMallocFinegrained)); + } else if (memory == mtype::CoarseGrained && is_device) { + hipCheck(hipMalloc(ptr, Nbytes)); + } else if (memory == mtype::FineGrained && owner == mowner::Host) { + hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocCoherent)); + } else if (memory == mtype::CoarseGrained && owner == mowner::Host) { + hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocNonCoherent)); + } else { + assert(false && "unknown combo"); + } + + // set to random + std::vector host(Nbytes / sizeof(T), T(0)); + hipCheck(hipMemcpy(*ptr, &host[0], Nbytes, + (is_device ? hipMemcpyHostToDevice : hipMemcpyHostToHost))); + + if (owner == mowner::Remote) { + // reset id for execution + hipCheck(hipSetDevice(devId)); + } +} + +template +void release(mtype memory, mowner owner, T* ptr) { + bool is_device = (owner == mowner::Device) || (owner == mowner::Remote); + if (memory == mtype::FineGrained && is_device) { + hipCheck(hipFree(ptr)); + } else if (memory == mtype::CoarseGrained && is_device) { + hipCheck(hipFree(ptr)); + } else if (memory == mtype::FineGrained && owner == mowner::Host) { + hipCheck(hipHostFree(ptr)); + } else if (memory == mtype::CoarseGrained && owner == mowner::Host) { + hipCheck(hipHostFree(ptr)); + } else { + assert(false && "unknown combo"); + } +} + +// the main streaming kernel +template +__global__ void kernel(T* x, size_t N, T zero, T foo) { + int sum = 0; + const size_t offset_start = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = 0; i < repeats; ++i) { + for (size_t offset = offset_start; offset < N; + offset += blockDim.x * gridDim.x) { + T uniq = (foo + offset) + i; + if constexpr (op == mop::Read) { + sum += x[offset]; + } else if constexpr (op == mop::Write) { + x[offset] = (T)offset; + } else if constexpr (op == mop::AtomicAdd) { + atomicAdd(&x[offset], uniq); + } else if constexpr (op == mop::AtomicCas) { + atomicCAS(&x[offset], uniq, uniq); + } else if constexpr (op == mop::AtomicOr) { + atomicOr(&x[offset], uniq); + } else if constexpr (op == mop::AtomicMax) { + atomicMax(&x[offset], uniq); + } + } + } + if constexpr (op == mop::Read) { + if (sum != 0) { + x[offset_start] = sum; + } + } +} + +template +void run_kernel(T* x, size_t size) { + if constexpr (op == mop::AtomicOr && std::is_floating_point_v) { + throw std::runtime_error("bad"); + } else { + kernel<<<4096, 1024>>>(x, size, 0, T(23456789)); + // then run once for data collection + kernel<<<4096, 1024>>>(x, size, 0, T(23456789)); + } +} + +template +void run_atomic(mowner myowner, T* x, size_t size) { + if (myowner == mowner::Host) { + // speed it up + run_kernel(x, size / 10); + } else { + run_kernel(x, size); + } +} + +template +void run(mtype mytype, mspace myspace, mowner myowner, mop myop, int remoteId, + size_t size) { + int devId = 0; + if (myowner == mowner::Remote && remoteId == -1) { + // need to find a remote GPU + int ndevices; + hipCheck(hipGetDeviceCount(&ndevices)); + if (ndevices <= 1) { + throw std::runtime_error( + "Need >=2 devices available for mowner = Remote"); + } + for (int i = 0; i < ndevices; ++i) { + if (i != devId) { + remoteId = i; + break; + } + } + } + + T* x; + alloc(mytype, myowner, &x, size * sizeof(T), devId, remoteId); + + // run the kernel once for warmup + assert(4096 * 1024 < size); + if (myop == mop::Read) { + run_kernel(x, size); + } else if (myop == mop::Write) { + run_kernel(x, size); + } else if (myop == mop::AtomicAdd) { + run_atomic(myowner, x, size); + } else if (myop == mop::AtomicCas) { + run_atomic(myowner, x, size); + } else if (myop == mop::AtomicOr) { + run_atomic(myowner, x, size); + } else if (myop == mop::AtomicMax) { + run_atomic(myowner, x, size); + } else { + throw std::runtime_error("bad"); + } + hipCheck(hipDeviceSynchronize()); + release(mytype, myowner, x); +} + +int main(int argc, char** argv) { + mtype mytype = (mtype)0; + mspace myspace = (mspace)0; + mowner myowner = (mowner)0; + mop myop = (mop)0; + mdata mydata = (mdata)0; + int remoteId = -1; + size_t size = 1024ull * 1024ull * + 1024ull; // 4 GiB, purposefully much larger than caches. + parse(argc, argv, mytype, myowner, myspace, size, myop, mydata, remoteId); + if (mydata == mdata::Unsigned) + run(mytype, myspace, myowner, myop, remoteId, size); + else if (mydata == mdata::UnsignedLong) + run(mytype, myspace, myowner, myop, remoteId, size); + else if (mydata == mdata::Float) + run(mytype, myspace, myowner, myop, remoteId, size); + else if (mydata == mdata::Double) + run(mytype, myspace, myowner, myop, remoteId, size); + else { + throw std::runtime_error("bad"); + } +} \ No newline at end of file diff --git a/projects/rocprofiler-compute/sample/instmix.hip b/projects/rocprofiler-compute/sample/instmix.hip new file mode 100644 index 0000000000..a409db4b02 --- /dev/null +++ b/projects/rocprofiler-compute/sample/instmix.hip @@ -0,0 +1,113 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +A instruction mix exerciser example, written by Gina Sitaraman and Nicholas Curtis [AMD]. +Although inline assembly is inherently unportable, this is expected to work on all CDNA accelerators. +*/ + + +#include "common.h" + +__global__ void kernelasm() { + // int32 + int i, j; + asm volatile("v_add_u32_e32 %0, %1, %0\n" : "=v"(j) : "v"(i)); + + // int 64 + long int l1, l2; + asm volatile("v_cmp_eq_i64 %0, %1\n" : "=v"(l2) : "v"(l1), "v"(i)); + + // fp32: add, mul, transcendental and fma + float f1, f2; + asm volatile( + "v_add_f32_e32 %0, %1, %0\n" + "v_mul_f32_e32 %0, %1, %0\n" + "v_sqrt_f32 %0, %1\n" + "v_fma_f32 %0, %1, %0, %1\n" + : "=v"(f1) + : "v"(f2)); + + // fp64: add, mul, transcendental and fma + double d1, d2, d3, d4; + asm volatile( + "v_add_f64 %0, %1, %0\n" + "v_mul_f64 %0, %1, %0\n" + "v_fma_f64 %0, %1, %0, %1\n" + "v_sqrt_f64 %0, %1\n" + "v_min_f64 %0, %1, %0\n" + : "+v"(d1) + : "v"(d2)); + + // fp16: add, mul, transcendental and fma + _Float16 h1, h2; + asm volatile( + "v_add_f16_e32 %0, %1, %0\n" + "v_mul_f16_e32 %0, %1, %0\n" + "v_sqrt_f16 %0, %1\n" + "v_cvt_f16_f32 %0 %2\n" + "v_fma_f16 %0, %1, %0, %0\n" + : "=v"(h2) + : "v"(h1), "v"(f1)); + + // MFMA ops + double2 dd; + unsigned short us; + long2 ll; +#if defined(__gfx90a__) + asm volatile("v_mfma_f64_4x4x4f64 %0 %1 %2 %3\n" + : "=v"(d4) + : "v"(d1), "v"(d2), "v"(d3)); + asm volatile("v_mfma_f32_16x16x4f32 %0 %1 %2 1\n" + : "=v"(dd) + : "v"(f1), "v"(f2)); + asm volatile("v_mfma_f32_16x16x16f16 %0 %1 %2 1\n" + : "=v"(dd) + : "v"(d1), "v"(d2)); + asm volatile("v_mfma_f32_16x16x8bf16 %0 %1 %2 1\n" + : "=v"(dd) + : "v"(f1), "v"(f2)); + asm volatile("v_mfma_i32_16x16x16i8 %0 %1 %2 1\n" + : "=v"(ll) + : "v"(i), "v"(j)); +#endif + + // Scalar op + asm volatile("s_add_i32 %0 %1 %0\n" : "=s"(j) : "s"(i)); + + // LDS + asm volatile("ds_read_b32 %0 %0\n" : "=v"(i) : "v"(j)); + + // Branch + asm volatile( + "s_branch .LDUMMY\n" + ".LDUMMY:\n" + "s_endpgm\n"); +} +int main() { + kernelasm<<<1, 64>>>(); + hipCheck(hipDeviceSynchronize()); +} diff --git a/projects/rocprofiler-compute/sample/ipc.hip b/projects/rocprofiler-compute/sample/ipc.hip new file mode 100644 index 0000000000..9fcdf462d8 --- /dev/null +++ b/projects/rocprofiler-compute/sample/ipc.hip @@ -0,0 +1,127 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore IPC and divergence, written by Nicholas Curtis [AMD]. +This example may not work on all CDNA accelerators, but has been verified on MI2XX. +*/ + +#include "common.h" + +template +__device__ void vrcp_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("v_rcp_f64 v[0:1], v[0:1]\n" : : "{v31}"(dummy)); + vrcp_op(); + } +} + +template +__global__ void vrcp() { + vrcp_op(); +} + +template +__device__ void vmov_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("v_mov_b32 v0, v1\n" : : "{v31}"(dummy)); + vmov_op(); + } +} + +template +__global__ void vmov() { + vmov_op(); +} + +template +__device__ void mfma_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("v_mfma_f32_32x32x8bf16_1k v[0:15], v[16:17], v[18:19], v[0:15]\n" : : "{v31}"(dummy)); + mfma_op(); + } +} + +template +__global__ void mfma() { + mfma_op(); +} + +template +__device__ void snop_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("s_nop 0x0\n" : : "{v31}"(dummy)); + snop_op(); + } +} + + +template +__global__ void snop() { + snop_op(); +} + +template +__device__ void smov_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("s_mov_b32 s0, s1\n" : : "{s31}"(dummy)); + smov_op(); + } +} + +template +__global__ void smov() { + smov_op(); +} + +template +__global__ void vmov_with_divergence() { + if (threadIdx.x % 64 == 0) + vmov_op(); +} + +int main() { + // warmups, spam to all CUs + vrcp<<<1024 * 1024, 1024>>>(); + vmov<<<1024 * 1024, 1024>>>(); + mfma<<<1024 * 1024, 1024>>>(); + snop<<<1024 * 1024, 1024>>>(); + smov<<<1024 * 1024, 1024>>>(); + vmov_with_divergence<<<1024 * 1024, 1024>>>(); + hipCheck(hipDeviceSynchronize()); + vrcp<<<1024 * 1024, 1024>>>(); + vmov<<<1024 * 1024, 1024>>>(); + mfma<<<1024 * 1024, 1024>>>(); + snop<<<1024 * 1024, 1024>>>(); + smov<<<1024 * 1024, 1024>>>(); + vmov_with_divergence<<<1024 * 1024, 1024>>>(); + hipCheck(hipDeviceSynchronize()); +} \ No newline at end of file diff --git a/projects/rocprofiler-compute/sample/lds.hip b/projects/rocprofiler-compute/sample/lds.hip new file mode 100644 index 0000000000..2018ad8da4 --- /dev/null +++ b/projects/rocprofiler-compute/sample/lds.hip @@ -0,0 +1,78 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore LDS bandwidth and bank conflicts, written by Nicholas Curtis [AMD]. +*/ + + +#include "common.h" + +constexpr unsigned max_threads = 256; +constexpr unsigned nbanks = 32; + +__global__ void load(int* out, int flag) { + __shared__ int array[max_threads]; + int index = threadIdx.x; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; +} + +__global__ void conflicts(int* out, int flag) { + constexpr unsigned nelements = nbanks * max_threads; + __shared__ int array[nelements]; + // each thread reads from the same bank + int index = threadIdx.x * nbanks; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; +} + +void bandwidth_demo(int N) { + for (int i = 1; i <= N; ++i) + load<<<1,i>>>(nullptr, 0); + hipCheck(hipDeviceSynchronize()); +} + +void conflicts_demo(int N) { + for (int i = 1; i <= N; ++i) + conflicts<<<1,i>>>(nullptr, 0); + hipCheck(hipDeviceSynchronize()); +} + +int main() { + bandwidth_demo(max_threads); + conflicts_demo(max_threads); +} diff --git a/projects/rocprofiler-compute/sample/occupancy.hip b/projects/rocprofiler-compute/sample/occupancy.hip new file mode 100644 index 0000000000..7c7099e30b --- /dev/null +++ b/projects/rocprofiler-compute/sample/occupancy.hip @@ -0,0 +1,109 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore achieved occupancy, and various occupancy limiters. +Written by Nicholas Curtis [AMD]. +*/ + + +#include "common.h" + +__global__ void empty(int N, double* ptr) { + +} + +constexpr int bound = 16; +__launch_bounds__(256) +__global__ void vgprbound(int N, double* ptr) { + double intermediates[bound]; + for (int i = 0 ; i < bound; ++i) intermediates[i] = N * threadIdx.x; + double x = ptr[threadIdx.x]; + for (int i = 0; i < 100; ++i) { + x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % bound], intermediates[i % bound])); + intermediates[i % bound] = x; + } + if (x == N) ptr[threadIdx.x] = x; +} + +constexpr size_t fully_allocate_lds = 64ul * 1024ul / sizeof(double); +__launch_bounds__(256) +__global__ void ldsbound(int N, double* ptr) { + __shared__ double intermediates[fully_allocate_lds]; + for (int i = threadIdx.x ; i < fully_allocate_lds; i += blockDim.x) intermediates[i] = N * threadIdx.x; + __syncthreads(); + double x = ptr[threadIdx.x]; + for (int i = threadIdx.x; i < fully_allocate_lds; i += blockDim.x) { + x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % fully_allocate_lds], intermediates[i % fully_allocate_lds])); + __syncthreads(); + intermediates[i % fully_allocate_lds] = x; + } + if (x == N) ptr[threadIdx.x] = x; +} + +constexpr int sgprlim = 1; +__launch_bounds__(1024, 8) +__global__ void sgprbound(int N, double* ptr) { + double intermediates[sgprlim]; + for (int i = 0 ; i < sgprlim; ++i) intermediates[i] = i; + double x = ptr[0]; + #pragma unroll 1 + for (int i = 0; i < 100; ++i) { + x += sin(pow(intermediates[(i - 1) % sgprlim], intermediates[i % sgprlim])); + intermediates[i % sgprlim] = x; + } + if (x == N) ptr[0] = x; +} + +int main() { + double* ptr; + hipCheck(hipMalloc(&ptr, 1024 * sizeof(double))); + vgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + vgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + ldsbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + ldsbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + sgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + sgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + empty<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + empty<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + hipCheck(hipFree(ptr)); +} diff --git a/projects/rocprofiler-compute/sample/stack.hip b/projects/rocprofiler-compute/sample/stack.hip new file mode 100644 index 0000000000..9f030309a0 --- /dev/null +++ b/projects/rocprofiler-compute/sample/stack.hip @@ -0,0 +1,43 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore spill/stack instructions. +Written by Nicholas Curtis [AMD]. +*/ + +#include "common.h" + +__global__ void knl(int* out, int filter) { + int x[1024]; + x[filter] = 0; + if (threadIdx.x < filter) out[threadIdx.x] = x[threadIdx.x]; +} + +int main() { + knl<<<1, 1>>>(nullptr, 0); + hipCheck(hipDeviceSynchronize()); +} \ No newline at end of file diff --git a/projects/rocprofiler-compute/sample/vcopy.cpp b/projects/rocprofiler-compute/sample/vcopy.cpp index 7f4620eacf..dc0e21808d 100644 --- a/projects/rocprofiler-compute/sample/vcopy.cpp +++ b/projects/rocprofiler-compute/sample/vcopy.cpp @@ -1,3 +1,29 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el +*/ + #include "hip/hip_runtime.h" #include #include diff --git a/projects/rocprofiler-compute/sample/vmem.hip b/projects/rocprofiler-compute/sample/vmem.hip new file mode 100644 index 0000000000..e85d1baa53 --- /dev/null +++ b/projects/rocprofiler-compute/sample/vmem.hip @@ -0,0 +1,98 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore global/generic instructions. +Written by Nicholas Curtis [AMD]. +*/ + +#include "common.h" + +typedef int __attribute__((address_space(0)))* generic_ptr; + +__attribute__((noinline)) __device__ void generic_store(generic_ptr ptr, int zero) { *ptr = zero; } +__attribute__((noinline)) __device__ int generic_load(generic_ptr ptr) { return *ptr; } +__attribute__((noinline)) __device__ void generic_atomic(generic_ptr ptr, int zero) { atomicAdd((int*)ptr, zero); } + +__global__ void global_write(int* ptr, int zero) { + ptr[threadIdx.x] = zero; +} + +__global__ void generic_write(int* ptr, int zero, int filter) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x < filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_store((generic_ptr)generic, zero); +} + +__global__ void global_read(int* ptr, int zero) { + int x = ptr[threadIdx.x]; + if (x != zero) { + ptr[threadIdx.x] = x + 1; + } +} + +__global__ void generic_read(int* ptr, int zero, int filter) { + __shared__ int lds[1024]; + if (static_cast(filter - 1) == zero) { + lds[threadIdx.x] = 0; // initialize to zero to avoid conditional, but hide behind _another_ conditional + } + int* generic; + if (static_cast(threadIdx.x) > filter - 1) { + generic = &ptr[threadIdx.x]; + } else { + generic = &lds[threadIdx.x]; + abort(); + } + int x = generic_load((generic_ptr)generic); + if (x != zero) { + ptr[threadIdx.x] = x + 1; + } +} + + +__global__ void global_atomic(int* ptr, int zero) { + atomicAdd(ptr, zero); +} + +__global__ void generic_atomic(int* ptr, int filter, int zero) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x % 2 == filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_atomic((generic_ptr)generic, zero); +} + +int main() { + int* ptr; + hipCheck(hipMalloc(&ptr, sizeof(int))); + hipCheck(hipMemset(ptr, 0, sizeof(int))); + global_write<<<1,1>>>(ptr, 0); + generic_write<<<1,1>>>(ptr, 0, 0); + global_read<<<1,1>>>(ptr, 0); + generic_read<<<1,1>>>(ptr, 0, 0); + global_atomic<<<1,1>>>(ptr, 0); + generic_atomic<<<1,1>>>(ptr, 0, 0); + hipCheck(hipDeviceSynchronize()); + hipCheck(hipFree(ptr)); +}