275 line
8.2 KiB
C++
275 line
8.2 KiB
C++
/******************************************************************************
|
|
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to
|
|
* deal in the Software without restriction, including without limitation the
|
|
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
|
* sell copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
* IN THE SOFTWARE.
|
|
*****************************************************************************/
|
|
|
|
#ifndef LIBRARY_SRC_UTIL_HPP_
|
|
#define LIBRARY_SRC_UTIL_HPP_
|
|
|
|
#include <hip/hip_runtime.h>
|
|
#include <hsa/hsa.h>
|
|
#include <hsa/hsa_ext_amd.h>
|
|
|
|
#include <cstdio>
|
|
|
|
#include "assembly.hpp"
|
|
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
|
#include "constants.hpp"
|
|
|
|
namespace rocshmem {
|
|
|
|
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
|
|
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
|
|
|
|
#define CHECK_HIP(cmd) \
|
|
{ \
|
|
hipError_t error = cmd; \
|
|
if (error != hipSuccess) { \
|
|
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), \
|
|
error, __FILE__, __LINE__); \
|
|
exit(EXIT_FAILURE); \
|
|
} \
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
#define DPRINTF(...) \
|
|
do { \
|
|
printf(__VA_ARGS__); \
|
|
} while (0);
|
|
#else
|
|
#define DPRINTF(...) \
|
|
do { \
|
|
} while (0);
|
|
#endif
|
|
|
|
#ifdef DEBUG
|
|
#define GPU_DPRINTF(...) \
|
|
do { \
|
|
gpu_dprintf(__VA_ARGS__); \
|
|
} while (0);
|
|
#else
|
|
#define GPU_DPRINTF(...) \
|
|
do { \
|
|
} while (0);
|
|
#endif
|
|
|
|
extern const int gpu_clock_freq_mhz;
|
|
|
|
/* Device-side internal functions */
|
|
__device__ __forceinline__ uint32_t lowerID() {
|
|
return __ffsll(__ballot(1)) - 1;
|
|
}
|
|
|
|
__device__ __forceinline__ int wave_SZ() { return __popcll(__ballot(1)); }
|
|
|
|
/*
|
|
* Returns true if the caller's thread index is (0, 0, 0) in its block.
|
|
*/
|
|
__device__ __forceinline__ bool is_thread_zero_in_block() {
|
|
return hipThreadIdx_x == 0 && hipThreadIdx_y == 0 && hipThreadIdx_z == 0;
|
|
}
|
|
|
|
/*
|
|
* Returns true if the caller's block index is (0, 0, 0) in its grid. All
|
|
* threads in the same block will return the same answer.
|
|
*/
|
|
__device__ __forceinline__ bool is_block_zero_in_grid() {
|
|
return hipBlockIdx_x == 0 && hipBlockIdx_y == 0 && hipBlockIdx_z == 0;
|
|
}
|
|
|
|
/*
|
|
* Returns the number of threads in the caller's flattened thread block.
|
|
*/
|
|
__device__ __forceinline__ int get_flat_block_size() {
|
|
return hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
|
|
}
|
|
|
|
/*
|
|
* Returns the number of threads in the caller's flattened grid.
|
|
*/
|
|
__device__ __forceinline__ int get_flat_grid_size() {
|
|
return get_flat_block_size() * hipGridDim_x * hipGridDim_y * hipGridDim_z;
|
|
}
|
|
|
|
/*
|
|
* Returns the flattened thread index of the calling thread within its
|
|
* thread block.
|
|
*/
|
|
__device__ __forceinline__ int get_flat_block_id() {
|
|
return hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x +
|
|
hipThreadIdx_z * hipBlockDim_x * hipBlockDim_y;
|
|
}
|
|
|
|
/*
|
|
* Returns the flattened block index that the calling thread is a member of in
|
|
* in the grid. Callers from the same block will have the same index.
|
|
*/
|
|
__device__ __forceinline__ int get_flat_grid_id() {
|
|
return hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x +
|
|
hipBlockIdx_z * hipGridDim_x * hipGridDim_y;
|
|
}
|
|
|
|
/*
|
|
* Returns the flattened thread index of the calling thread within the grid.
|
|
*/
|
|
__device__ __forceinline__ int get_flat_id() {
|
|
return get_flat_grid_id() * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z) + get_flat_block_id();
|
|
}
|
|
|
|
/*
|
|
* Returns true if the caller's thread flad_id is 0 in its wave.
|
|
*/
|
|
__device__ __forceinline__ bool is_thread_zero_in_wave() {
|
|
return (get_flat_block_id() % WF_SIZE) == 0;
|
|
}
|
|
|
|
extern __constant__ int* print_lock;
|
|
|
|
template <typename... Args>
|
|
__device__ void gpu_dprintf(const char* fmt, const Args&... args) {
|
|
for (int i{0}; i < WF_SIZE; i++) {
|
|
if ((get_flat_block_id() % WF_SIZE) == i) {
|
|
/*
|
|
* GPU-wide global lock that ensures that both prints are executed
|
|
* by a single thread atomically. We deliberately break control
|
|
* flow so that only a single thread in a WF accesses the lock at a
|
|
* time. If multiple threads in the same WF attempt to gain the
|
|
* lock at the same time, you have a classic GPU control flow
|
|
* deadlock caused by threads in the same WF waiting on each other.
|
|
*/
|
|
while (atomicCAS(print_lock, 0, 1) == 1) {
|
|
}
|
|
|
|
printf("WG (%u, %u, %u) TH (%u, %u, %u) ", hipBlockIdx_x,
|
|
hipBlockIdx_y, hipBlockIdx_z, hipThreadIdx_x, hipThreadIdx_y,
|
|
hipThreadIdx_z);
|
|
printf(fmt, args...);
|
|
|
|
*print_lock = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
__device__ __forceinline__ void memcpy(void* dst, void* src, size_t size) {
|
|
uint8_t* dst_bytes{static_cast<uint8_t*>(dst)};
|
|
uint8_t* src_bytes{static_cast<uint8_t*>(src)};
|
|
|
|
for (size_t i = 8; i > 1; i >>= 1) {
|
|
while (size >= i) {
|
|
store_asm(src_bytes, dst_bytes, i);
|
|
src_bytes += i;
|
|
dst_bytes += i;
|
|
size -= i;
|
|
}
|
|
}
|
|
|
|
if (size == 1) {
|
|
*dst_bytes = *src_bytes;
|
|
}
|
|
}
|
|
|
|
__device__ __forceinline__ void memcpy_wg(void* dst, void* src, size_t size) {
|
|
int thread_id{get_flat_block_id()};
|
|
int block_size{get_flat_block_size()};
|
|
|
|
int cpy_size{};
|
|
uint8_t* dst_bytes{nullptr};
|
|
uint8_t* dst_def{nullptr};
|
|
uint8_t* src_bytes{nullptr};
|
|
uint8_t* src_def{nullptr};
|
|
|
|
dst_def = reinterpret_cast<uint8_t*>(dst);
|
|
src_def = reinterpret_cast<uint8_t*>(src);
|
|
dst_bytes = dst_def;
|
|
src_bytes = src_def;
|
|
|
|
for (int j{8}; j > 1; j >>= 1) {
|
|
cpy_size = size / j;
|
|
for (int i{thread_id}; i < cpy_size; i += block_size) {
|
|
dst_bytes = dst_def;
|
|
src_bytes = src_def;
|
|
|
|
src_bytes += i * j;
|
|
dst_bytes += i * j;
|
|
|
|
store_asm(src_bytes, dst_bytes, j);
|
|
}
|
|
size -= cpy_size * j;
|
|
dst_def += cpy_size * j;
|
|
src_def += cpy_size * j;
|
|
}
|
|
|
|
if (size == 1) {
|
|
if (is_thread_zero_in_block()) {
|
|
*dst_bytes = *src_bytes;
|
|
}
|
|
}
|
|
}
|
|
|
|
__device__ __forceinline__ void memcpy_wave(void* dst, void* src, size_t size) {
|
|
int wave_tid = get_flat_block_id() % WF_SIZE;
|
|
int wave_size{wave_SZ()};
|
|
|
|
int cpy_size{};
|
|
uint8_t* dst_bytes{nullptr};
|
|
uint8_t* dst_def{nullptr};
|
|
uint8_t* src_bytes{nullptr};
|
|
uint8_t* src_def{nullptr};
|
|
|
|
dst_def = reinterpret_cast<uint8_t*>(dst);
|
|
src_def = reinterpret_cast<uint8_t*>(src);
|
|
dst_bytes = dst_def;
|
|
src_bytes = src_def;
|
|
|
|
for (int j{8}; j > 1; j >>= 1) {
|
|
cpy_size = size / j;
|
|
for (int i{wave_tid}; i < cpy_size; i += wave_size) {
|
|
dst_bytes = dst_def;
|
|
src_bytes = src_def;
|
|
|
|
src_bytes += i * j;
|
|
dst_bytes += i * j;
|
|
|
|
store_asm(src_bytes, dst_bytes, j);
|
|
}
|
|
size -= cpy_size * j;
|
|
dst_def += cpy_size * j;
|
|
src_def += cpy_size * j;
|
|
}
|
|
|
|
if (size == 1) {
|
|
if (is_thread_zero_in_wave()) {
|
|
*dst_bytes = *src_bytes;
|
|
}
|
|
}
|
|
}
|
|
|
|
int rocm_init();
|
|
|
|
void rocm_memory_lock_to_fine_grain(void* ptr, size_t size, void** gpu_ptr,
|
|
int gpu_id);
|
|
|
|
// Returns clock frequency used by s_memrealtime() in Mhz
|
|
uint64_t wallClk_freq_mhz();
|
|
|
|
} // namespace rocshmem
|
|
|
|
#endif // LIBRARY_SRC_UTIL_HPP_
|