Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -37,9 +37,8 @@ THE SOFTWARE.
|
||||
#include <string.h>
|
||||
#include <stddef.h>
|
||||
|
||||
|
||||
|
||||
|
||||
// Define NVCC_COMPAT for CUDA compatibility
|
||||
#define NVCC_COMPAT
|
||||
#define CUDA_SUCCESS hipSuccess
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
|
||||
@@ -836,7 +836,7 @@ __device__ float erfcf(float x)
|
||||
}
|
||||
__device__ float erfcinvf(float y)
|
||||
{
|
||||
return __hip_erfinvf(1 - y);
|
||||
return __hip_erfinvf(1 - y);
|
||||
}
|
||||
__device__ float erfcxf(float x)
|
||||
{
|
||||
@@ -1697,75 +1697,111 @@ __device__ unsigned int test__popc(unsigned int input);
|
||||
|
||||
__device__ unsigned int __popcll( unsigned long long int input)
|
||||
{
|
||||
return hc::__popcount_u32_b64(input);
|
||||
return hc::__popcount_u32_b64(input);
|
||||
}
|
||||
|
||||
__device__ unsigned int __clz(unsigned int input)
|
||||
{
|
||||
return hc::__firstbit_u32_u32( input);
|
||||
#ifdef NVCC_COMPAT
|
||||
return input == 0 ? 32 : hc::__firstbit_u32_u32( input);
|
||||
#else
|
||||
return hc::__firstbit_u32_u32( input);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ unsigned int __clzll(unsigned long long int input)
|
||||
{
|
||||
return hc::__firstbit_u32_u64( input);
|
||||
#ifdef NVCC_COMPAT
|
||||
return input == 0 ? 64 : hc::__firstbit_u32_u64( input);
|
||||
#else
|
||||
return hc::__firstbit_u32_u64( input);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ unsigned int __clz(int input)
|
||||
__device__ unsigned int __clz( int input)
|
||||
{
|
||||
return hc::__firstbit_u32_s32( input);
|
||||
#ifdef NVCC_COMPAT
|
||||
return input == 0 ? 32 : hc::__firstbit_u32_s32( input);
|
||||
#else
|
||||
return hc::__firstbit_u32_s32( input);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ unsigned int __clzll(long long int input)
|
||||
__device__ unsigned int __clzll( long long int input)
|
||||
{
|
||||
return hc::__firstbit_u32_s64( input);
|
||||
#ifdef NVCC_COMPAT
|
||||
return input == 0 ? 64 : hc::__firstbit_u32_s64( input);
|
||||
#else
|
||||
return hc::__firstbit_u32_s64( input);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ unsigned int __ffs(unsigned int input)
|
||||
{
|
||||
return hc::__lastbit_u32_u32( input)+1;
|
||||
#ifdef NVCC_COMPAT
|
||||
return hc::__lastbit_u32_u32( input)+1;
|
||||
#else
|
||||
return hc::__lastbit_u32_u32( input);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ unsigned int __ffsll(unsigned long long int input)
|
||||
{
|
||||
return hc::__lastbit_u32_u64( input)+1;
|
||||
#ifdef NVCC_COMPAT
|
||||
return hc::__lastbit_u32_u64( input)+1;
|
||||
#else
|
||||
return hc::__lastbit_u32_u64( input);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ unsigned int __ffs(int input)
|
||||
__device__ unsigned int __ffs( int input)
|
||||
{
|
||||
return hc::__lastbit_u32_s32( input)+1;
|
||||
#ifdef NVCC_COMPAT
|
||||
return hc::__lastbit_u32_s32( input)+1;
|
||||
#else
|
||||
return hc::__lastbit_u32_s32( input);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ unsigned int __ffsll(long long int input)
|
||||
__device__ unsigned int __ffsll( long long int input)
|
||||
{
|
||||
return hc::__lastbit_u32_s64( input)+1;
|
||||
#ifdef NVCC_COMPAT
|
||||
return hc::__lastbit_u32_s64( input)+1;
|
||||
#else
|
||||
return hc::__lastbit_u32_s64( input);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ unsigned int __brev( unsigned int input)
|
||||
{
|
||||
return hc::__bitrev_b32( input);
|
||||
return hc::__bitrev_b32( input);
|
||||
}
|
||||
|
||||
__device__ unsigned long long int __brevll( unsigned long long int input)
|
||||
{
|
||||
return hc::__bitrev_b64( input);
|
||||
return hc::__bitrev_b64( input);
|
||||
}
|
||||
|
||||
// warp vote function __all __any __ballot
|
||||
__device__ int __all( int input)
|
||||
{
|
||||
return hc::__all( input);
|
||||
return hc::__all( input);
|
||||
}
|
||||
|
||||
|
||||
__device__ int __any( int input)
|
||||
{
|
||||
if( hc::__any( input)!=0) return 1;
|
||||
else return 0;
|
||||
#ifdef NVCC_COMPAT
|
||||
if( hc::__any( input)!=0) return 1;
|
||||
else return 0;
|
||||
#else
|
||||
return hc::__any( input);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ unsigned long long int __ballot( int input)
|
||||
{
|
||||
return hc::__ballot( input);
|
||||
return hc::__ballot( input);
|
||||
}
|
||||
|
||||
// warp shuffle functions
|
||||
@@ -1809,11 +1845,11 @@ __device__ float __shfl_xor(float input, int lane_mask, int width)
|
||||
return hc::__shfl_xor(input,lane_mask,width);
|
||||
}
|
||||
|
||||
__host__ __device__ int min(int arg1, int arg2)
|
||||
{
|
||||
__host__ __device__ int min(int arg1, int arg2)
|
||||
{
|
||||
return (int)(hc::precise_math::fmin((float)arg1, (float)arg2));
|
||||
}
|
||||
__host__ __device__ int max(int arg1, int arg2)
|
||||
__host__ __device__ int max(int arg1, int arg2)
|
||||
{
|
||||
return (int)(hc::precise_math::fmax((float)arg1, (float)arg2));
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ THE SOFTWARE.
|
||||
#include <hip_runtime.h>
|
||||
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
|
||||
|
||||
__global__ void
|
||||
__global__ void
|
||||
warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
|
||||
{
|
||||
|
||||
@@ -36,13 +36,11 @@ __global__ void
|
||||
device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{ int warpSize, pshift;
|
||||
hipDeviceProp_t devProp;
|
||||
hipGetDeviceProperties(&devProp, 0);
|
||||
if(strncmp(devProp.name,"Fiji",1)==0)
|
||||
if(strncmp(devProp.name,"Fiji",1)==0)
|
||||
{ warpSize =64;
|
||||
pshift =6;
|
||||
}
|
||||
@@ -53,14 +51,14 @@ int main(int argc, char *argv[])
|
||||
int Num_Blocks_per_Grid = 1;
|
||||
int Num_Warps_per_Block = Num_Threads_per_Block/warpSize;
|
||||
int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize;
|
||||
|
||||
|
||||
int * host_any = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
|
||||
int * host_all = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
|
||||
int *device_any;
|
||||
int *device_any;
|
||||
int *device_all;
|
||||
HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int)));
|
||||
HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int)));
|
||||
for (int i=0; i<Num_Warps_per_Grid; i++)
|
||||
for (int i=0; i<Num_Warps_per_Grid; i++)
|
||||
{
|
||||
host_any[i] = 0;
|
||||
host_all[i] = 0;
|
||||
@@ -77,11 +75,20 @@ for (int i=0; i<Num_Warps_per_Grid; i++)
|
||||
|
||||
printf("warp no. %d __any = %d \n",i,host_any[i]);
|
||||
printf("warp no. %d __all = %d \n",i,host_all[i]);
|
||||
if (host_any[i]!=1) ++anycount;
|
||||
if (host_all[i]!=1) ++allcount;
|
||||
|
||||
if (host_all[i]!=1) ++allcount;
|
||||
#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT )
|
||||
if (host_any[i]!=64) ++anycount;
|
||||
#else
|
||||
if (host_any[i]!=1) ++anycount;
|
||||
#endif
|
||||
}
|
||||
if (anycount == 0 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n");
|
||||
|
||||
#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT )
|
||||
if (anycount == 1 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n");
|
||||
#else
|
||||
if (anycount == 0 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n");
|
||||
#endif
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
|
||||
|
||||
@@ -27,13 +27,9 @@ THE SOFTWARE.
|
||||
#include <iostream>
|
||||
#include "hip_runtime.h"
|
||||
|
||||
|
||||
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
|
||||
|
||||
|
||||
#define WIDTH 32
|
||||
#define HEIGHT 32
|
||||
|
||||
#define WIDTH 8
|
||||
#define HEIGHT 8
|
||||
#define NUM (WIDTH*HEIGHT)
|
||||
|
||||
#define THREADS_PER_BLOCK_X 8
|
||||
@@ -43,41 +39,41 @@ THE SOFTWARE.
|
||||
unsigned int firstbit_u32(unsigned int a)
|
||||
{
|
||||
if (a == 0)
|
||||
{
|
||||
#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT )
|
||||
|
||||
return -1;
|
||||
#else
|
||||
return 32;
|
||||
#endif
|
||||
}
|
||||
unsigned int pos = 0;
|
||||
while ((int )a > 0) {
|
||||
a <<= 1; pos++;
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
unsigned int firstbit_s32(int a)
|
||||
{
|
||||
unsigned int u = a >= 0? a: ~a; // complement negative numbers
|
||||
return firstbit_u32(u);
|
||||
}
|
||||
|
||||
unsigned int firstbit_u64(unsigned long long int a)
|
||||
{
|
||||
if (a == 0)
|
||||
{
|
||||
#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT )
|
||||
return -1;
|
||||
#else
|
||||
return 64;
|
||||
#endif
|
||||
}
|
||||
unsigned int pos = 0;
|
||||
while ((long long int)a > 0) {
|
||||
a <<= 1; pos++;
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
unsigned int firstbit_s64(long long int a)
|
||||
{
|
||||
unsigned long long int u = a >= 0? a: ~a; // complement negative numbers
|
||||
return firstbit_u64(u);
|
||||
}
|
||||
|
||||
|
||||
|
||||
__global__ void
|
||||
HIP_kernel(hipLaunchParm lp,
|
||||
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d,
|
||||
unsigned int* e, int* f,unsigned int* g, long long int* h, int width, int height)
|
||||
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
|
||||
{
|
||||
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
@@ -87,14 +83,9 @@ HIP_kernel(hipLaunchParm lp,
|
||||
if ( i < (width * height)) {
|
||||
a[i] = __clz(b[i]);
|
||||
c[i] = __clzll(d[i]);
|
||||
e[i] = __clz(f[i]);
|
||||
g[i] = __clzll(h[i]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main() {
|
||||
@@ -103,19 +94,11 @@ int main() {
|
||||
unsigned int* hostB;
|
||||
unsigned int* hostC;
|
||||
unsigned long long int* hostD;
|
||||
unsigned int* hostE;
|
||||
int* hostF;
|
||||
unsigned int* hostG;
|
||||
long long int* hostH;
|
||||
|
||||
unsigned int* deviceA;
|
||||
unsigned int* deviceB;
|
||||
unsigned int* deviceC;
|
||||
unsigned long long int* deviceD;
|
||||
unsigned int* deviceE;
|
||||
int* deviceF;
|
||||
unsigned int* deviceG;
|
||||
long long int* deviceH;
|
||||
|
||||
hipDeviceProp_t devProp;
|
||||
hipGetDeviceProperties(&devProp, 0);
|
||||
@@ -125,57 +108,56 @@ int main() {
|
||||
|
||||
cout << "hip Device prop succeeded " << endl ;
|
||||
|
||||
|
||||
int i;
|
||||
unsigned int i;
|
||||
int errors;
|
||||
|
||||
hostA = (unsigned int*)malloc(NUM * sizeof(unsigned int));
|
||||
hostB = (unsigned int*)malloc(NUM * sizeof(unsigned int));
|
||||
hostC = (unsigned int*)malloc(NUM * sizeof(unsigned int));
|
||||
hostD = (unsigned long long int*)malloc(NUM * sizeof(unsigned long long int));
|
||||
hostE = (unsigned int*)malloc(NUM * sizeof(unsigned int));
|
||||
hostF = (int*)malloc(NUM * sizeof(int));
|
||||
hostG = (unsigned int*)malloc(NUM * sizeof(unsigned int));
|
||||
hostH = (long long int*)malloc(NUM * sizeof(long long int));
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
hostB[i] = i;
|
||||
hostD[i] = 1099511627776+i;
|
||||
hostF[i] = -2100+i;
|
||||
hostH[i] = 1099511627776+i;
|
||||
hostB[i] = 419430*i;
|
||||
hostD[i] = i;
|
||||
}
|
||||
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(unsigned int)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(unsigned int)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(unsigned int)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceD, NUM * sizeof(unsigned long long int)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceE, NUM * sizeof(unsigned int)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceF, NUM * sizeof(int)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceG, NUM * sizeof(unsigned int)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceH, NUM * sizeof(long long int)));
|
||||
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(unsigned int), hipMemcpyHostToDevice));
|
||||
HIP_ASSERT(hipMemcpy(deviceD, hostD, NUM*sizeof(unsigned long long int), hipMemcpyHostToDevice));
|
||||
HIP_ASSERT(hipMemcpy(deviceF, hostF, NUM*sizeof(int), hipMemcpyHostToDevice));
|
||||
HIP_ASSERT(hipMemcpy(deviceH, hostD, NUM*sizeof(long long int), hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernel(HIP_kernel,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
deviceA ,deviceB, deviceC,deviceD ,deviceE ,deviceF, deviceG,deviceH, WIDTH ,HEIGHT);
|
||||
deviceA ,deviceB, deviceC ,deviceD , WIDTH ,HEIGHT);
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(unsigned int), hipMemcpyDeviceToHost));
|
||||
HIP_ASSERT(hipMemcpy(hostC, deviceC, NUM*sizeof(unsigned int), hipMemcpyDeviceToHost));
|
||||
HIP_ASSERT(hipMemcpy(hostE, deviceE, NUM*sizeof(unsigned int), hipMemcpyDeviceToHost));
|
||||
HIP_ASSERT(hipMemcpy(hostG, deviceG, NUM*sizeof(unsigned int), hipMemcpyDeviceToHost));
|
||||
|
||||
// verify the results
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
if (hostA[i] != firstbit_u32(hostB[i])) {
|
||||
printf("gpu_clz =%d, cpu_clz =%d \n",hostA[i],firstbit_u32(hostB[i]));
|
||||
if (hostA[i] != firstbit_u32(hostB[i])) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
if (errors!=0) {
|
||||
cout << "FAILED clz" << endl;
|
||||
return -1;
|
||||
} else {
|
||||
cout << "__clz() checked!" << endl;
|
||||
}
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
printf("gpu_clzll =%d, cpu_clzll =%d \n",hostC[i],firstbit_u64(hostD[i]));
|
||||
if (hostC[i] != firstbit_u64(hostD[i])) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
@@ -183,43 +165,7 @@ int main() {
|
||||
cout << "FAILED clz" << endl;
|
||||
return -1;
|
||||
} else {
|
||||
cout << "__clz_u() for unsigned checked!" << endl;
|
||||
}
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
if (hostC[i] != firstbit_u64(hostD[i])) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
if (errors!=0) {
|
||||
cout << "FAILED clz" << endl;
|
||||
return -1;
|
||||
} else {
|
||||
cout << "__clzll_u() for unsigned checked!" << endl;
|
||||
}
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
if (hostE[i] != firstbit_s32(hostF[i])) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
if (errors!=0) {
|
||||
cout << "FAILED clz\n" << endl;
|
||||
return -1;
|
||||
} else {
|
||||
cout << "__clz_s() checked!" << endl;
|
||||
}
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
if (hostG[i] != firstbit_s64(hostH[i])) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
if (errors!=0) {
|
||||
cout << "FAILED clz" << endl;
|
||||
return -1;
|
||||
} else {
|
||||
cout << "__clzll_s() checked!" << endl;
|
||||
cout << "__clzll() checked!" << endl;
|
||||
}
|
||||
|
||||
cout << "clz test PASSED!" << endl;
|
||||
@@ -228,19 +174,11 @@ int main() {
|
||||
HIP_ASSERT(hipFree(deviceB));
|
||||
HIP_ASSERT(hipFree(deviceC));
|
||||
HIP_ASSERT(hipFree(deviceD));
|
||||
HIP_ASSERT(hipFree(deviceE));
|
||||
HIP_ASSERT(hipFree(deviceF));
|
||||
HIP_ASSERT(hipFree(deviceG));
|
||||
HIP_ASSERT(hipFree(deviceH));
|
||||
|
||||
free(hostA);
|
||||
free(hostB);
|
||||
free(hostC);
|
||||
free(hostD);
|
||||
free(hostE);
|
||||
free(hostF);
|
||||
free(hostG);
|
||||
free(hostH);
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
@@ -31,8 +31,8 @@ THE SOFTWARE.
|
||||
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
|
||||
|
||||
|
||||
#define WIDTH 32
|
||||
#define HEIGHT 32
|
||||
#define WIDTH 8
|
||||
#define HEIGHT 8
|
||||
|
||||
#define NUM (WIDTH*HEIGHT)
|
||||
|
||||
@@ -44,12 +44,20 @@ template<typename T>
|
||||
int lastbit( T a)
|
||||
{
|
||||
if (a == 0)
|
||||
#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT )
|
||||
return -1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
int pos = 1;
|
||||
while ((a&1) != 1) {
|
||||
a >>= 1; pos++;
|
||||
}
|
||||
return pos;
|
||||
#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT )
|
||||
return pos-1;
|
||||
#else
|
||||
return pos;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -130,6 +138,7 @@ int main() {
|
||||
// verify the results
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
printf("gpu_ffs =%d, cpu_ffs =%d \n",hostA[i],lastbit(hostB[i]));
|
||||
if (hostA[i] != lastbit(hostB[i])) {
|
||||
errors++;
|
||||
}
|
||||
@@ -142,6 +151,7 @@ int main() {
|
||||
}
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
printf("gpu_ffsll =%d, cpu_ffsll =%d \n",hostC[i],lastbit(hostD[i]));
|
||||
if (hostC[i] != lastbit(hostD[i])) {
|
||||
errors++;
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user