/************************************************************************* * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef COPY_KERNEL_H_ #define COPY_KERNEL_H_ #include #include // Define min for ssize_t static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; } typedef uint64_t PackType; template struct MULTI { __device__ PackType operator()(const PackType x, const PackType y) const { return FUNC()(x, y); } }; #define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a)) template __device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) { size_t ptrval = reinterpret_cast(ptr); return reinterpret_cast(ALIGNUP(ptrval, align)); } template inline __device__ T vFetch(const volatile T* ptr) { return *ptr; } template inline __device__ void vStore(volatile T* ptr, const T val) { *ptr = val; } template __attribute__((noinline)) __device__ inline void ReduceCopy( const int tid, const int nthreads, const volatile T * __restrict__ const src0, const volatile T * __restrict__ const src1, volatile T * __restrict__ const dest0, volatile T * __restrict__ const dest1, const int N) { for (int idx = tid; idx < N; idx += nthreads) { T val = vFetch(src0+idx); if (TWO_INPUTS) { val = FUNC()(val, vFetch(src1+idx)); } vStore(dest0+idx, val); if (TWO_OUTPUTS) { vStore(dest1+idx, val); } } } template struct FuncPassA { __device__ T operator()(const T x, const T y) const { return x; } }; template struct FuncSum { __device__ T operator()(const T x, const T y) const { return x + y; } }; template struct MULTI { static_assert(sizeof(PackType) == 2 * sizeof(float), "PackType must be twice the size of float."); union converter { PackType storage; struct { float a, b; }; }; __device__ PackType operator()(const PackType x, const PackType y) const { converter cx, cy, cr; cx.storage = x; cy.storage = y; cr.a = FUNC()(cx.a, cy.a); cr.b = FUNC()(cx.b, cy.b); return cr.storage; } }; typedef ulong2 Pack128; template struct MULTI128 { __device__ void operator()(Pack128& x, Pack128& y) { x.x = MULTI()(x.x, y.x); x.y = MULTI()(x.y, y.y); } }; inline __device__ void Fetch128(Pack128& v, const Pack128* p) { v.x = p->x; v.y = p->y; } inline __device__ void Store128(Pack128* p, Pack128& v) { p->x = v.x; p->y = v.y; } template __device__ void ReduceCopyMulti(const int tid, const int nthreads, int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], const int offset, const int N) { for (int idx = offset+tid; idx < offset+N; idx += nthreads) { T val = vFetch(srcs[0]+idx); #pragma unroll for (int i=1; i __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t, int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS], const int elemOffset, const int Npack) { const int inc = nw * UNROLL * WARP_SIZE; int offset = w * UNROLL * WARP_SIZE + t; const Pack128* srcs[MAXSRCS]; for (int i=0; i()(vals[u], vals2[u]); } #pragma unroll 1 for (int i=MINSRCS; i()(vals[u], vals2[u]); } // Store for (int i = 0; i < MINDSTS; i++) { for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]); } #pragma unroll 1 for (int i=MINDSTS; i __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); } // Try to limit consecutive load/stores to 8. // Use UNROLL 8 when we have a single source and a single destination, 4 otherwise #define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS))) template __device__ void ReduceOrCopyMulti(const int tid, const int nthreads, int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], int N) { int Nrem = N; if (Nrem <= 0) return; int alignDiff = 0; int align = ptrAlign128(srcs[0]); #pragma unroll for (int i=1; i(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble); Nrem -= Npreamble; if (Nrem == 0) return; } int offset = Npreamble; // stage 2: fast path: use 128b loads/stores to do the bulk of the work, // assuming the pointers we have are all 128-bit alignable. int w = tid / WARP_SIZE; // Warp number int nw = nthreads / WARP_SIZE; // Number of warps int t = tid % WARP_SIZE; // Thread (inside the warp) const int packFactor = sizeof(Pack128) / sizeof(T); // stage 2a: main loop int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE)) * (AUTOUNROLL * WARP_SIZE); // round down int Nelem2a = Npack2a * packFactor; ReduceCopy128bMulti(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a); Nrem -= Nelem2a; if (Nrem == 0) return; offset += Nelem2a; // stage 2b: slightly less optimized for section when we don't have full // unrolling int Npack2b = Nrem / packFactor; int Nelem2b = Npack2b * packFactor; ReduceCopy128bMulti(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b); Nrem -= Nelem2b; if (Nrem == 0) return; offset += Nelem2b; // stage 2c: tail ReduceCopyMulti(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem); } // Assumptions: // - there is exactly 1 block // - THREADS is the number of producer threads // - this function is called by all producer threads template __device__ void Copy(volatile T * __restrict__ const dest, const volatile T * __restrict__ const src, const int N) { const T* srcs[2]; T* dsts[2]; srcs[0] = (const T*)src; dsts[0] = (T*)dest; ReduceOrCopyMulti, T, 1, 2, 1, 2>(threadIdx.x, THREADS, 1, srcs, 1, dsts, N); } template __device__ void DoubleCopy(volatile T * __restrict__ const dest0, volatile T * __restrict__ const dest1, const volatile T * __restrict__ const src, const int N) { const T* srcs[2]; T* dsts[2]; srcs[0] = (const T*)src; dsts[0] = (T*)dest0; dsts[1] = (T*)dest1; ReduceOrCopyMulti, T, 1, 2, 1, 2>(threadIdx.x, THREADS, 1, srcs, 2, dsts, N); } template __device__ void Reduce(volatile T * __restrict__ const dest, const volatile T * __restrict__ const src0, const volatile T * __restrict__ const src1, const int N) { const T* srcs[2]; T* dsts[2]; srcs[0] = (const T*)src0; srcs[1] = (const T*)src1; dsts[0] = (T*)dest; ReduceOrCopyMulti, T, 1, 2, 1, 2>(threadIdx.x, THREADS, 2, srcs, 1, dsts, N); } template __device__ void ReduceCopy(volatile T * __restrict__ const dest0, volatile T * __restrict__ const dest1, const volatile T * __restrict__ const src0, const volatile T * __restrict__ const src1, const int N) { const T* srcs[2]; T* dsts[2]; srcs[0] = (const T*)src0; srcs[1] = (const T*)src1; dsts[0] = (T*)dest0; dsts[1] = (T*)dest1; ReduceOrCopyMulti, T, 1, 2, 1, 2>(threadIdx.x, THREADS, 2, srcs, 2, dsts, N); } #endif // COPY_KERNEL_H_