From a2ecbc2d6ef347988b70399b7e2799bb0c6cbca4 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 23 Nov 2016 11:19:15 -0600
Subject: [PATCH] Added fast math flag 1. Use -DHIP_FAST_MATH to make precise
 math functions compiled to fast math 2. Added double fast math functions for
 sqrt 3. Changed hipcc to parse -use_fast_math (not working) 4. Added passed
 tag to hipFloatMath test

Change-Id: I72884b2436b4efe61e9a9297346c1358fee38a2d


[ROCm/clr commit: c2f6ecf2640474c6d00449e5f4195abc1172326d]
---
 projects/clr/hipamd/bin/hipcc                 |   5 +
 .../include/hip/hcc_detail/hip_runtime.h      | 119 +++++++----------
 .../tests/src/deviceLib/hipFloatMath.cpp      |   1 +
 .../src/deviceLib/hipFloatMathPrecise.cpp     | 122 ++++++++++++++++++
 4 files changed, 177 insertions(+), 70 deletions(-)
 create mode 100644 projects/clr/hipamd/tests/src/deviceLib/hipFloatMathPrecise.cpp

diff --git a/projects/clr/hipamd/bin/hipcc b/projects/clr/hipamd/bin/hipcc
index 641f70b065..09c4d813d0 100755
--- a/projects/clr/hipamd/bin/hipcc
+++ b/projects/clr/hipamd/bin/hipcc
@@ -274,6 +274,11 @@ foreach $arg (@ARGV)
         $buildDeps = 1;
     }
 
+    if($arg eq '-use_fast_math') {
+        print "In fast Math";
+        $HIPCXXFLAGS .= " -DHIP_FAST_MATH ";
+    }
+
     if ($arg =~ m/^-/) {
         # options start with -
 
diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h
index 4b781b44ae..45dbeff5a4 100644
--- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h
+++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h
@@ -130,7 +130,6 @@ __device__ float atanhf(float x);
 __device__ float cbrtf(float x);
 __device__ float ceilf(float x);
 __device__ float copysignf(float x, float y);
-__device__ float cosf(float x);
 __device__ float coshf(float x);
 __device__ float cyl_bessel_i0f(float x);
 __device__ float cyl_bessel_i1f(float x);
@@ -142,9 +141,7 @@ __host__ float erfcxf(float x);
 __device__ float erff(float x);
 __device__ float erfinvf(float y);
 __host__ float erfinvf(float y);
-__device__ float exp10f(float x);
 __device__ float exp2f(float x);
-__device__ float expf(float x);
 __device__ float expm1f(float x);
 __device__ float fabsf(float x);
 __device__ float fdimf(float x, float y);
@@ -167,11 +164,8 @@ __device__ float ldexpf(float x, int exp);
 __device__ float lgammaf(float x);
 __device__ long long int llrintf(float x);
 __device__ long long int llroundf(float x);
-__device__ float log10f(float x);
 __device__ float log1pf(float x);
-__device__ float log2f(float x);
 __device__ float logbf(float x);
-__device__ float logf(float x);
 __device__ long int lrintf(float x);
 __device__ long int lroundf(float x);
 __device__ float modff(float x, float *iptr);
@@ -187,7 +181,6 @@ __host__ float normcdff(float y);
 __device__ float normcdfinvf(float y);
 __host__ float normcdfinvf(float y);
 __device__ float normf(int dim, const float *a);
-__device__ float powf(float x, float y);
 __device__ float rcbrtf(float x);
 __host__ float rcbrtf(float x);
 __device__ float remainderf(float x, float y);
@@ -206,14 +199,11 @@ __device__ float rsqrtf(float x);
 __device__ float scalblnf(float x, long int n);
 __device__ float scalbnf(float x, int n);
 __host__ __device__ unsigned signbit(float a);
-__device__ void sincosf(float x, float *sptr, float *cptr);
 __device__ void sincospif(float x, float *sptr, float *cptr);
 __host__ void sincospif(float x, float *sptr, float *cptr);
-__device__ float sinf(float x);
 __device__ float sinhf(float x);
 __device__ float sinpif(float x);
 __device__ float sqrtf(float x);
-__device__ float tanf(float x);
 __device__ float tanhf(float x);
 __device__ float tgammaf(float x);
 __device__ float truncf(float x);
@@ -519,90 +509,65 @@ __device__ float __hip_fast_powf(float, float);
 __device__ void __hip_fast_sincosf(float,float*,float*);
 extern __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32");
 __device__ float __hip_fast_tanf(float);
+extern __attribute__((const)) double __hip_fast_dsqrt(double) __asm("llvm.sqrt.f64");
 
-#ifdef HIP_PRECISE_MATH
+#ifdef HIP_FAST_MATH
 // Single Precision Precise Math when enabled
 
-__device__ inline float __cosf(float x) {
-  return __hip_precise_cosf(x);
+__device__ inline float cosf(float x) {
+  return __hip_fast_cosf(x);
 }
 
-__device__ inline float __exp10f(float x) {
-  return __hip_precise_exp10f(x);
+__device__ inline float exp10f(float x) {
+  return __hip_fast_exp10f(x);
 }
 
-__device__ inline float __expf(float x) {
-  return __hip_precise_expf(x);
+__device__ inline float expf(float x) {
+  return __hip_fast_expf(x);
 }
 
-__device__ inline float __frsqrt_rn(float x) {
-  return __hip_precise_frsqrt_rn(x);
+__device__ inline float log10f(float x) {
+  return __hip_fast_log10f(x);
 }
 
-__device__ inline float __fsqrt_rd(float x) {
-  return __hip_precise_fsqrt_rd(x);
+__device__ inline float log2f(float x) {
+  return __hip_fast_log2f(x);
 }
 
-__device__ inline float __fsqrt_rn(float x) {
-  return __hip_precise_fsqrt_rn(x);
+__device__ inline float logf(float x) {
+  return __hip_fast_logf(x);
 }
 
-__device__ inline float __fsqrt_ru(float x) {
-  return __hip_precise_fsqrt_ru(x);
+__device__ inline float powf(float base, float exponent) {
+  return __hip_fast_powf(base, exponent);
 }
 
-__device__ inline float __fsqrt_rz(float x) {
-  return __hip_precise_fsqrt_rz(x);
+__device__ inline void sincosf(float x, float *s, float *c) {
+  return __hip_fast_sincosf(x, s, c);
 }
 
-__device__ inline float __log10f(float x) {
-  return __hip_precise_log10f(x);
+__device__ inline float sinf(float x) {
+  return __hip_fast_sinf(x);
 }
 
-__device__ inline float __log2f(float x) {
-  return __hip_precise_log2f(x);
-}
-
-__device__ inline float __logf(float x) {
-  return __hip_precise_logf(x);
-}
-
-__device__ inline float __powf(float base, float exponent) {
-  return __hip_precise_powf(base, exponent);
-}
-
-__device__ inline void __sincosf(float x, float *s, float *c) {
-  return __hip_precise_sincosf(x, s, c);
-}
-
-__device__ inline float __sinf(float x) {
-  return __hip_precise_sinf(x);
-}
-
-__device__ inline float __tanf(float x) {
-  return __hip_precise_tanf(x);
-}
-
-// Double Precision
-
-__device__ double __dsqrt_rd(double x) {
-  return __hip_precise_dsqrt_rd(x);
-}
-
-__device__ double __dsqrt_rn(double x) {
-  return __hip_precise_dsqrt_rn(x);
-}
-
-__device__ double __dsqrt_ru(double x) {
-  return __hip_precise_dsqrt_ru(x);
-}
-
-__device__ double __dsqrt_rz(double x) {
-  return __hip_precise_dsqrt_rz(x);
+__device__ inline float tanf(float x) {
+  return __hip_fast_tanf(x);
 }
 
 #else
 
+__device__ float sinf(float);
+__device__ float cosf(float);
+__device__ float tanf(float);
+__device__ void sincosf(float, float*, float*);
+__device__ float logf(float);
+__device__ float log2f(float);
+__device__ float log10f(float);
+__device__ float expf(float);
+__device__ float exp10f(float);
+__device__ float powf(float, float);
+
+#endif
 // Single Precision Fast Math
 __device__ inline float __cosf(float x) {
   return __hip_fast_cosf(x);
@@ -664,8 +629,22 @@ __device__ inline float __tanf(float x) {
   return __hip_fast_tanf(x);
 }
 
+__device__ inline double __dsqrt_rd(double x) {
+  return __hip_fast_dsqrt(x);
+}
+
+__device__ inline double __dsqrt_rn(double x) {
+  return __hip_fast_dsqrt(x);
+}
+
+__device__ inline double __dsqrt_ru(double x) {
+  return __hip_fast_dsqrt(x);
+}
+
+__device__ inline double __dsqrt_rz(double x) {
+  return __hip_fast_dsqrt(x);
+}
 
-#endif
 /**
  * CUDA 8 device function features
 
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipFloatMath.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipFloatMath.cpp
index eb70eb6b0b..f137ca2602 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipFloatMath.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipFloatMath.cpp
@@ -58,4 +58,5 @@ int main(){
   hipMalloc((void**)&Ind, SIZE);
   hipMalloc((void**)&Outd, SIZE);
   hipLaunchKernel(floatMath, dim3(LEN,1,1), dim3(1,1,1), 0, 0, Ind, Outd);
+  passed();
 }
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipFloatMathPrecise.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipFloatMathPrecise.cpp
new file mode 100644
index 0000000000..4f6c2cd44a
--- /dev/null
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipFloatMathPrecise.cpp
@@ -0,0 +1,122 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+
+__global__ void FloatMathPrecise(hipLaunchParm lp)
+{
+    int iX;
+    float fX, fY;
+
+    acosf(1.0f);
+    acoshf(1.0f);
+    asinf(0.0f);
+    asinhf(0.0f);
+    atan2f(0.0f, 1.0f);
+    atanf(0.0f);
+    atanhf(0.0f);
+    cbrtf(0.0f);
+    fX = ceilf(0.0f);
+    fX = copysignf(1.0f, -2.0f);
+    cosf(0.0f);
+    coshf(0.0f);
+    cospif(0.0f);
+    //cyl_bessel_i0f(0.0f);
+    //cyl_bessel_i1f(0.0f);
+    erfcf(0.0f);
+    erfcinvf(2.0f);
+    erfcxf(0.0f);
+    erff(0.0f);
+    erfinvf(1.0f);
+    exp10f(0.0f);
+    exp2f(0.0f);
+    expf(0.0f);
+    expm1f(0.0f);
+    fX = fabsf(1.0f);
+    fdimf(1.0f, 0.0f);
+    fdividef(0.0f, 1.0f);
+    fX = floorf(0.0f);
+    fmaf(1.0f, 2.0f, 3.0f);
+    fX = fmaxf(0.0f, 0.0f);
+    fX = fminf(0.0f, 0.0f);
+    fmodf(0.0f, 1.0f);
+    //frexpf(0.0f, &iX);
+    hypotf(1.0f, 0.0f);
+    ilogbf(1.0f);
+    isfinite(0.0f);
+    fX = isinf(0.0f);
+    fX = isnan(0.0f);
+    j0f(0.0f);
+    j1f(0.0f);
+    jnf(-1.0f, 1.0f);
+    ldexpf(0.0f, 0);
+    //lgammaf(1.0f);
+    llrintf(0.0f);
+    llroundf(0.0f);
+    log10f(1.0f);
+    log1pf(-1.0f);
+    log2f(1.0f);
+    logbf(1.0f);
+    logf(1.0f);
+    lrintf(0.0f);
+    lroundf(0.0f);
+    //modff(0.0f, &fX);
+    fX = nanf("1");
+    fX = nearbyintf(0.0f);
+    //nextafterf(0.0f);
+    norm3df(1.0f, 0.0f, 0.0f);
+    norm4df(1.0f, 0.0f, 0.0f, 0.0f);
+    normcdff(0.0f);
+    normcdfinvf(1.0f);
+    fX = 1.0f; normf(1, &fX);
+    powf(1.0f, 0.0f);
+    rcbrtf(1.0f);
+    remainderf(2.0f, 1.0f);
+    //remquof(1.0f, 2.0f, &iX);
+    rhypotf(0.0f, 1.0f);
+    fY = rintf(1.0f);
+    rnorm3df(0.0f, 0.0f, 1.0f);
+    rnorm4df(0.0f, 0.0f, 0.0f, 1.0f);
+    fX = 1.0f; rnormf(1, &fX);
+    fY = roundf(0.0f);
+    rsqrtf(1.0f);
+    scalblnf(0.0f, 1);
+    scalbnf(0.0f, 1);
+    signbit(1.0f);
+    sincosf(0.0f, &fX, &fY);
+    sincospif(0.0f, &fX, &fY);
+    sinf(0.0f);
+    sinhf(0.0f);
+    sinpif(0.0f);
+    sqrtf(0.0f);
+    tanf(0.0f);
+    tanhf(0.0f);
+    tgammaf(2.0f);
+    fY = truncf(0.0f);
+    y0f(1.0f);
+    y1f(1.0f);
+    ynf(1, 1.0f);
+}
+
+int main() {
+    hipLaunchKernel(FloatMathPrecise, dim3(1,1,1), dim3(1,1,1), 0, 0);
+}