added fma for double and float

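1. Added fma intrinsic support for double (`__fma_rd/rn/ru/rz`) and float (`__fmaf_rd/rn/ru/rz`)
2. Enabled the previously commented-out `__fma_*` calls in the double-precision intrinsics compile test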

Change-Id: I909fdbec34a3d12c03ba6eff3a39376a7128ee43


[ROCm/hip commit: 30674382a4]
This commit is contained in:
Aditya Atluri
2016-11-23 18:22:05 -06:00
parent d4d76b7018
commit a04dad833b
2 changed files with 39 additions and 4 deletions
@@ -509,7 +509,10 @@ __device__ float __hip_fast_powf(float, float);
// Device-side sine/cosine helper taking one float and two float pointers.
// Only the declaration is visible here.
__device__ void __hip_fast_sincosf(float,float*,float*);
// The asm label gives this symbol the name "llvm.sin.f32"; it is declared
// with the `const` attribute.
extern __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32");
// Device-side tangent helper; only the declaration is visible here.
__device__ float __hip_fast_tanf(float);
// Three-operand single-precision symbol whose asm label is "llvm.fma.f32".
// All four __fmaf_r* wrappers in this file forward to it.
extern __attribute__((const)) float __hip_fast_fmaf(float,float,float) __asm("llvm.fma.f32");
// Double-precision square-root symbol whose asm label is "llvm.sqrt.f64".
extern __attribute__((const)) double __hip_fast_dsqrt(double) __asm("llvm.sqrt.f64");
// Three-operand double-precision symbol whose asm label is "llvm.fma.f64".
// All four __fma_r* wrappers in this file forward to it.
extern __attribute__((const)) double __hip_fast_fma(double,double,double) __asm("llvm.fma.f64");
#ifdef HIP_FAST_MATH
// Single Precision Precise Math when enabled
@@ -629,6 +632,22 @@ __device__ inline float __tanf(float x) {
return __hip_fast_tanf(x);
}
// Single-precision fused multiply-add wrapper: passes (x, y, z) straight to
// __hip_fast_fmaf and returns its result.
// NOTE(review): every __fmaf_r* variant calls the same __hip_fast_fmaf, so
// the "_rd" suffix does not select a distinct rounding mode here - confirm
// this is intended.
__device__ inline float __fmaf_rd(float x, float y, float z) {
    const float fused = __hip_fast_fmaf(x, y, z);
    return fused;
}
// Single-precision fused multiply-add wrapper: passes (x, y, z) straight to
// __hip_fast_fmaf (asm label "llvm.fma.f32") and returns its result.
__device__ inline float __fmaf_rn(float x, float y, float z) {
    const float fused = __hip_fast_fmaf(x, y, z);
    return fused;
}
// Single-precision fused multiply-add wrapper: passes (x, y, z) straight to
// __hip_fast_fmaf and returns its result.
// NOTE(review): every __fmaf_r* variant calls the same __hip_fast_fmaf, so
// the "_ru" suffix does not select a distinct rounding mode here - confirm
// this is intended.
__device__ inline float __fmaf_ru(float x, float y, float z) {
    const float fused = __hip_fast_fmaf(x, y, z);
    return fused;
}
// Single-precision fused multiply-add wrapper: passes (x, y, z) straight to
// __hip_fast_fmaf and returns its result.
// NOTE(review): every __fmaf_r* variant calls the same __hip_fast_fmaf, so
// the "_rz" suffix does not select a distinct rounding mode here - confirm
// this is intended.
__device__ inline float __fmaf_rz(float x, float y, float z) {
    const float fused = __hip_fast_fmaf(x, y, z);
    return fused;
}
// Double-precision square-root wrapper: hands x to __hip_fast_dsqrt
// (asm label "llvm.sqrt.f64") and returns what it produces.
// NOTE(review): no rounding-mode selection is visible in this wrapper despite
// the "_rd" suffix - confirm this is intended.
__device__ inline double __dsqrt_rd(double x) {
    const double root = __hip_fast_dsqrt(x);
    return root;
}
@@ -645,6 +664,22 @@ __device__ inline double __dsqrt_rz(double x) {
return __hip_fast_dsqrt(x);
}
// Double-precision fused multiply-add wrapper: passes (x, y, z) straight to
// __hip_fast_fma and returns its result.
// NOTE(review): every __fma_r* variant calls the same __hip_fast_fma, so the
// "_rd" suffix does not select a distinct rounding mode here - confirm this
// is intended.
__device__ inline double __fma_rd(double x, double y, double z) {
    const double fused = __hip_fast_fma(x, y, z);
    return fused;
}
// Double-precision fused multiply-add wrapper: passes (x, y, z) straight to
// __hip_fast_fma (asm label "llvm.fma.f64") and returns its result.
__device__ inline double __fma_rn(double x, double y, double z) {
    const double fused = __hip_fast_fma(x, y, z);
    return fused;
}
// Double-precision fused multiply-add wrapper: passes (x, y, z) straight to
// __hip_fast_fma and returns its result.
// NOTE(review): every __fma_r* variant calls the same __hip_fast_fma, so the
// "_ru" suffix does not select a distinct rounding mode here - confirm this
// is intended.
__device__ inline double __fma_ru(double x, double y, double z) {
    const double fused = __hip_fast_fma(x, y, z);
    return fused;
}
// Double-precision fused multiply-add wrapper: passes (x, y, z) straight to
// __hip_fast_fma and returns its result.
// NOTE(review): every __fma_r* variant calls the same __hip_fast_fma, so the
// "_rz" suffix does not select a distinct rounding mode here - confirm this
// is intended.
__device__ inline double __fma_rz(double x, double y, double z) {
    const double fused = __hip_fast_fma(x, y, z);
    return fused;
}
/**
* CUDA 8 device function features
@@ -51,10 +51,10 @@ __device__ void double_precision_intrinsics()
//__dsub_rn(2.0, 1.0);
//__dsub_ru(2.0, 1.0);
//__dsub_rz(2.0, 1.0);
//__fma_rd(1.0, 2.0, 3.0);
//__fma_rn(1.0, 2.0, 3.0);
//__fma_ru(1.0, 2.0, 3.0);
//__fma_rz(1.0, 2.0, 3.0);
__fma_rd(1.0, 2.0, 3.0);
__fma_rn(1.0, 2.0, 3.0);
__fma_ru(1.0, 2.0, 3.0);
__fma_rz(1.0, 2.0, 3.0);
}
__global__ void compileDoublePrecisionIntrinsics(hipLaunchParm lp, int ignored)