From a04dad833b46ee9ecd5670fa89dc49730eab4c6d Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 23 Nov 2016 18:22:05 -0600
Subject: [PATCH] Add fma intrinsics for double and float

1. Added fma intrinsic support for double and float
2. Added test for fma

Change-Id: I909fdbec34a3d12c03ba6eff3a39376a7128ee43


[ROCm/hip commit: 30674382a434d9820f7b35f88f2c1fc81e850d11]
---
 .../hip/include/hip/hcc_detail/hip_runtime.h  | 35 +++++++++++++++++++
 .../hipDoublePrecisionIntrinsics.cpp          |  8 ++---
 2 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h
index 45dbeff5a4..63cfb2ea3c 100644
--- a/projects/hip/include/hip/hcc_detail/hip_runtime.h
+++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h
@@ -509,7 +509,10 @@ __device__ float __hip_fast_powf(float, float);
 __device__ void __hip_fast_sincosf(float,float*,float*);
 extern __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32");
 __device__ float __hip_fast_tanf(float);
+extern __attribute__((const)) float __hip_fast_fmaf(float,float,float) __asm("llvm.fma.f32");  // single-precision fma; the asm label gives this declaration the symbol name llvm.fma.f32
+
 extern __attribute__((const)) double __hip_fast_dsqrt(double) __asm("llvm.sqrt.f64");
+extern __attribute__((const)) double __hip_fast_fma(double,double,double) __asm("llvm.fma.f64");  // double-precision fma; the asm label gives this declaration the symbol name llvm.fma.f64
 
 #ifdef HIP_FAST_MATH
 // Single Precision Precise Math when enabled
@@ -629,6 +632,22 @@ __device__ inline float __tanf(float x) {
   return __hip_fast_tanf(x);
 }
 
+__device__ inline float __fmaf_rd(float x, float y, float z) {  // float fma wrapper (name suggests round-down); forwards x, y, z unchanged
+  return __hip_fast_fmaf(x, y, z);  // NOTE(review): identical call to the _rn/_ru/_rz variants, so no round-down mode is selected here -- confirm intended
+}
+
+__device__ inline float __fmaf_rn(float x, float y, float z) {  // float fma wrapper (name suggests round-to-nearest); forwards x, y, z unchanged
+  return __hip_fast_fmaf(x, y, z);  // presumably the intrinsic's default rounding is round-to-nearest -- TODO confirm
+}
+
+__device__ inline float __fmaf_ru(float x, float y, float z) {  // float fma wrapper (name suggests round-up); forwards x, y, z unchanged
+  return __hip_fast_fmaf(x, y, z);  // NOTE(review): identical call to the _rd/_rn/_rz variants, so no round-up mode is selected here -- confirm intended
+}
+
+__device__ inline float __fmaf_rz(float x, float y, float z) {  // float fma wrapper (name suggests round-toward-zero); forwards x, y, z unchanged
+  return __hip_fast_fmaf(x, y, z);  // NOTE(review): identical call to the _rd/_rn/_ru variants, so no round-toward-zero mode is selected here -- confirm intended
+}
+
 __device__ inline double __dsqrt_rd(double x) {
   return __hip_fast_dsqrt(x);
 }
@@ -645,6 +664,22 @@ __device__ inline double __dsqrt_rz(double x) {
   return __hip_fast_dsqrt(x);
 }
 
+__device__ inline double __fma_rd(double x, double y, double z) {  // double fma wrapper (name suggests round-down); forwards x, y, z unchanged
+  return __hip_fast_fma(x, y, z);  // NOTE(review): identical call to the _rn/_ru/_rz variants, so no round-down mode is selected here -- confirm intended
+}
+
+__device__ inline double __fma_rn(double x, double y, double z) {  // double fma wrapper (name suggests round-to-nearest); forwards x, y, z unchanged
+  return __hip_fast_fma(x, y, z);  // presumably the intrinsic's default rounding is round-to-nearest -- TODO confirm
+}
+
+__device__ inline double __fma_ru(double x, double y, double z) {  // double fma wrapper (name suggests round-up); forwards x, y, z unchanged
+  return __hip_fast_fma(x, y, z);  // NOTE(review): identical call to the _rd/_rn/_rz variants, so no round-up mode is selected here -- confirm intended
+}
+
+__device__ inline double __fma_rz(double x, double y, double z) {  // double fma wrapper (name suggests round-toward-zero); forwards x, y, z unchanged
+  return __hip_fast_fma(x, y, z);  // NOTE(review): identical call to the _rd/_rn/_ru variants, so no round-toward-zero mode is selected here -- confirm intended
+}
+
 /**
  * CUDA 8 device function features
 
diff --git a/projects/hip/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp b/projects/hip/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
index 0dab2d7106..0b4e0840a4 100644
--- a/projects/hip/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
+++ b/projects/hip/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
@@ -51,10 +51,10 @@ __device__ void double_precision_intrinsics()
     //__dsub_rn(2.0, 1.0);
     //__dsub_ru(2.0, 1.0);
     //__dsub_rz(2.0, 1.0);
-    //__fma_rd(1.0, 2.0, 3.0);
-    //__fma_rn(1.0, 2.0, 3.0);
-    //__fma_ru(1.0, 2.0, 3.0);
-    //__fma_rz(1.0, 2.0, 3.0);
+    __fma_rd(1.0, 2.0, 3.0);
+    __fma_rn(1.0, 2.0, 3.0);
+    __fma_ru(1.0, 2.0, 3.0);
+    __fma_rz(1.0, 2.0, 3.0);
 }
 
 __global__ void compileDoublePrecisionIntrinsics(hipLaunchParm lp, int ignored)