Files
rocm-systems/src/util.hpp
T
2024-12-12 10:21:08 -06:00

275 line
8.2 KiB
C++

/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_UTIL_HPP_
#define LIBRARY_SRC_UTIL_HPP_
#include <hip/hip_runtime.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <cstdio>
#include "assembly.hpp"
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "constants.hpp"
namespace rocshmem {
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
#define CHECK_HIP(cmd) \
{ \
hipError_t error = cmd; \
if (error != hipSuccess) { \
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), \
error, __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
}
#ifdef DEBUG
#define DPRINTF(...) \
do { \
printf(__VA_ARGS__); \
} while (0);
#else
#define DPRINTF(...) \
do { \
} while (0);
#endif
#ifdef DEBUG
#define GPU_DPRINTF(...) \
do { \
gpu_dprintf(__VA_ARGS__); \
} while (0);
#else
#define GPU_DPRINTF(...) \
do { \
} while (0);
#endif
extern const int gpu_clock_freq_mhz;
/* Device-side internal functions */
__device__ __forceinline__ uint32_t lowerID() {
return __ffsll(__ballot(1)) - 1;
}
__device__ __forceinline__ int wave_SZ() { return __popcll(__ballot(1)); }
/*
* Returns true if the caller's thread index is (0, 0, 0) in its block.
*/
__device__ __forceinline__ bool is_thread_zero_in_block() {
return hipThreadIdx_x == 0 && hipThreadIdx_y == 0 && hipThreadIdx_z == 0;
}
/*
* Returns true if the caller's block index is (0, 0, 0) in its grid. All
* threads in the same block will return the same answer.
*/
__device__ __forceinline__ bool is_block_zero_in_grid() {
return hipBlockIdx_x == 0 && hipBlockIdx_y == 0 && hipBlockIdx_z == 0;
}
/*
* Returns the number of threads in the caller's flattened thread block.
*/
__device__ __forceinline__ int get_flat_block_size() {
return hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
}
/*
* Returns the number of threads in the caller's flattened grid.
*/
__device__ __forceinline__ int get_flat_grid_size() {
return get_flat_block_size() * hipGridDim_x * hipGridDim_y * hipGridDim_z;
}
/*
* Returns the flattened thread index of the calling thread within its
* thread block.
*/
__device__ __forceinline__ int get_flat_block_id() {
return hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x +
hipThreadIdx_z * hipBlockDim_x * hipBlockDim_y;
}
/*
* Returns the flattened block index that the calling thread is a member of in
* in the grid. Callers from the same block will have the same index.
*/
__device__ __forceinline__ int get_flat_grid_id() {
return hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x +
hipBlockIdx_z * hipGridDim_x * hipGridDim_y;
}
/*
* Returns the flattened thread index of the calling thread within the grid.
*/
__device__ __forceinline__ int get_flat_id() {
return get_flat_grid_id() * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z) + get_flat_block_id();
}
/*
* Returns true if the caller's thread flad_id is 0 in its wave.
*/
__device__ __forceinline__ bool is_thread_zero_in_wave() {
return (get_flat_block_id() % WF_SIZE) == 0;
}
extern __constant__ int* print_lock;
template <typename... Args>
__device__ void gpu_dprintf(const char* fmt, const Args&... args) {
for (int i{0}; i < WF_SIZE; i++) {
if ((get_flat_block_id() % WF_SIZE) == i) {
/*
* GPU-wide global lock that ensures that both prints are executed
* by a single thread atomically. We deliberately break control
* flow so that only a single thread in a WF accesses the lock at a
* time. If multiple threads in the same WF attempt to gain the
* lock at the same time, you have a classic GPU control flow
* deadlock caused by threads in the same WF waiting on each other.
*/
while (atomicCAS(print_lock, 0, 1) == 1) {
}
printf("WG (%u, %u, %u) TH (%u, %u, %u) ", hipBlockIdx_x,
hipBlockIdx_y, hipBlockIdx_z, hipThreadIdx_x, hipThreadIdx_y,
hipThreadIdx_z);
printf(fmt, args...);
*print_lock = 0;
}
}
}
__device__ __forceinline__ void memcpy(void* dst, void* src, size_t size) {
uint8_t* dst_bytes{static_cast<uint8_t*>(dst)};
uint8_t* src_bytes{static_cast<uint8_t*>(src)};
for (size_t i = 8; i > 1; i >>= 1) {
while (size >= i) {
store_asm(src_bytes, dst_bytes, i);
src_bytes += i;
dst_bytes += i;
size -= i;
}
}
if (size == 1) {
*dst_bytes = *src_bytes;
}
}
__device__ __forceinline__ void memcpy_wg(void* dst, void* src, size_t size) {
int thread_id{get_flat_block_id()};
int block_size{get_flat_block_size()};
int cpy_size{};
uint8_t* dst_bytes{nullptr};
uint8_t* dst_def{nullptr};
uint8_t* src_bytes{nullptr};
uint8_t* src_def{nullptr};
dst_def = reinterpret_cast<uint8_t*>(dst);
src_def = reinterpret_cast<uint8_t*>(src);
dst_bytes = dst_def;
src_bytes = src_def;
for (int j{8}; j > 1; j >>= 1) {
cpy_size = size / j;
for (int i{thread_id}; i < cpy_size; i += block_size) {
dst_bytes = dst_def;
src_bytes = src_def;
src_bytes += i * j;
dst_bytes += i * j;
store_asm(src_bytes, dst_bytes, j);
}
size -= cpy_size * j;
dst_def += cpy_size * j;
src_def += cpy_size * j;
}
if (size == 1) {
if (is_thread_zero_in_block()) {
*dst_bytes = *src_bytes;
}
}
}
__device__ __forceinline__ void memcpy_wave(void* dst, void* src, size_t size) {
int wave_tid = get_flat_block_id() % WF_SIZE;
int wave_size{wave_SZ()};
int cpy_size{};
uint8_t* dst_bytes{nullptr};
uint8_t* dst_def{nullptr};
uint8_t* src_bytes{nullptr};
uint8_t* src_def{nullptr};
dst_def = reinterpret_cast<uint8_t*>(dst);
src_def = reinterpret_cast<uint8_t*>(src);
dst_bytes = dst_def;
src_bytes = src_def;
for (int j{8}; j > 1; j >>= 1) {
cpy_size = size / j;
for (int i{wave_tid}; i < cpy_size; i += wave_size) {
dst_bytes = dst_def;
src_bytes = src_def;
src_bytes += i * j;
dst_bytes += i * j;
store_asm(src_bytes, dst_bytes, j);
}
size -= cpy_size * j;
dst_def += cpy_size * j;
src_def += cpy_size * j;
}
if (size == 1) {
if (is_thread_zero_in_wave()) {
*dst_bytes = *src_bytes;
}
}
}
int rocm_init();
void rocm_memory_lock_to_fine_grain(void* ptr, size_t size, void** gpu_ptr,
int gpu_id);
// Returns clock frequency used by s_memrealtime() in Mhz
uint64_t wallClk_freq_mhz();
} // namespace rocshmem
#endif // LIBRARY_SRC_UTIL_HPP_