Added fma for double and float

1. Added fma intrinsic support for double and float.
2. Added test for fma.
Change-Id: I909fdbec34a3d12c03ba6eff3a39376a7128ee43
[ROCm/hip commit: 30674382a4]
2016-11-23 18:22:05 -06:00
parent d4d76b7018
commit a04dad833b
2 changed files with 39 additions and 4 deletions
@@ -509,7 +509,10 @@ __device__ float __hip_fast_powf(float, float);
 __device__ void __hip_fast_sincosf(float,float*,float*);
 extern __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32");
 __device__ float __hip_fast_tanf(float);
+extern __attribute__((const)) float __hip_fast_fmaf(float,float,float) __asm("llvm.fma.f32");  // Single-precision 3-operand helper; the asm label names the symbol "llvm.fma.f32" (presumably resolved as the LLVM fma intrinsic -- confirm).
+
 extern __attribute__((const)) double __hip_fast_dsqrt(double) __asm("llvm.sqrt.f64");
+extern __attribute__((const)) double __hip_fast_fma(double,double,double) __asm("llvm.fma.f64");  // Double-precision 3-operand helper; the asm label names the symbol "llvm.fma.f64" (presumably resolved as the LLVM fma intrinsic -- confirm).

 #ifdef HIP_FAST_MATH
 // Single Precision Precise Math when enabled
@@ -629,6 +632,22 @@ __device__ inline float __tanf(float x) {
  return __hip_fast_tanf(x);
 }

+__device__ inline float __fmaf_rd(float x, float y, float z) {  // Returns __hip_fast_fmaf(x, y, z) for float operands.
+  return __hip_fast_fmaf(x, y, z);  // NOTE(review): same call as __fmaf_rn/_ru/_rz; no rounding mode is selected here -- confirm the round-down semantics implied by the _rd suffix are actually honored.
+}
+
+__device__ inline float __fmaf_rn(float x, float y, float z) {  // Returns __hip_fast_fmaf(x, y, z) for float operands.
+  return __hip_fast_fmaf(x, y, z);  // NOTE(review): presumably round-to-nearest is the default mode of the underlying fma -- confirm against the target's floating-point environment.
+}
+
+__device__ inline float __fmaf_ru(float x, float y, float z) {  // Returns __hip_fast_fmaf(x, y, z) for float operands.
+  return __hip_fast_fmaf(x, y, z);  // NOTE(review): same call as __fmaf_rd/_rn/_rz; no rounding mode is selected here -- confirm the round-up semantics implied by the _ru suffix are actually honored.
+}
+
+__device__ inline float __fmaf_rz(float x, float y, float z) {  // Returns __hip_fast_fmaf(x, y, z) for float operands.
+  return __hip_fast_fmaf(x, y, z);  // NOTE(review): same call as __fmaf_rd/_rn/_ru; no rounding mode is selected here -- confirm the round-toward-zero semantics implied by the _rz suffix are actually honored.
+}
+
 __device__ inline double __dsqrt_rd(double x) {  // Returns __hip_fast_dsqrt(x) for a double operand.
  return __hip_fast_dsqrt(x);  // NOTE(review): no rounding mode is selected here -- confirm the round-down semantics implied by the _rd suffix are actually honored.
 }
@@ -645,6 +664,22 @@ __device__ inline double __dsqrt_rz(double x) {
  return __hip_fast_dsqrt(x);
 }

+__device__ inline double __fma_rd(double x, double y, double z) {  // Returns __hip_fast_fma(x, y, z) for double operands.
+  return __hip_fast_fma(x, y, z);  // NOTE(review): same call as __fma_rn/_ru/_rz; no rounding mode is selected here -- confirm the round-down semantics implied by the _rd suffix are actually honored.
+}
+
+__device__ inline double __fma_rn(double x, double y, double z) {  // Returns __hip_fast_fma(x, y, z) for double operands.
+  return __hip_fast_fma(x, y, z);  // NOTE(review): presumably round-to-nearest is the default mode of the underlying fma -- confirm against the target's floating-point environment.
+}
+
+__device__ inline double __fma_ru(double x, double y, double z) {  // Returns __hip_fast_fma(x, y, z) for double operands.
+  return __hip_fast_fma(x, y, z);  // NOTE(review): same call as __fma_rd/_rn/_rz; no rounding mode is selected here -- confirm the round-up semantics implied by the _ru suffix are actually honored.
+}
+
+__device__ inline double __fma_rz(double x, double y, double z) {  // Returns __hip_fast_fma(x, y, z) for double operands.
+  return __hip_fast_fma(x, y, z);  // NOTE(review): same call as __fma_rd/_rn/_ru; no rounding mode is selected here -- confirm the round-toward-zero semantics implied by the _rz suffix are actually honored.
+}
+
 /**
 * CUDA 8 device function features

@@ -51,10 +51,10 @@ __device__ void double_precision_intrinsics()
    //__dsub_rn(2.0, 1.0);
    //__dsub_ru(2.0, 1.0);
    //__dsub_rz(2.0, 1.0);
-    //__fma_rd(1.0, 2.0, 3.0);
-    //__fma_rn(1.0, 2.0, 3.0);
-    //__fma_ru(1.0, 2.0, 3.0);
-    //__fma_rz(1.0, 2.0, 3.0);
+    __fma_rd(1.0, 2.0, 3.0);
+    __fma_rn(1.0, 2.0, 3.0);
+    __fma_ru(1.0, 2.0, 3.0);
+    __fma_rz(1.0, 2.0, 3.0);
 }

 __global__ void compileDoublePrecisionIntrinsics(hipLaunchParm lp, int ignored)