From 16d2b986aaaf59739cb46529cf3add8b289e1ece Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Tue, 24 Apr 2018 18:10:07 +0000
Subject: [PATCH 1/3] Checkin to fix bugs in math functions.

This change fixes the following bugs that were discovered while debuggnig TF unit test failures (cwise_ops_test)

1. __hisinf and __hisnan routines
   Both had incorrect implementations.

2. abs
   A "long long" (64bit int) version was missing, resulting in the 32bit version being used for 64bit ints (which resulted in incorrect results, when the value passed in was outside the 32bit int range)

3. lgamma
  We seemed to have a custom version for the 'double' datatype (which was giving incorrect results). Replaced it with a call to the 'double' version of the underlying 'hc::precision_math::lgamma'


[ROCm/hip commit: af586bbbf292d01dd6d305bc637aeefcaf2e0c86]
---
 .../include/hip/hcc_detail/math_functions.h   |  1 +
 projects/hip/src/hip_fp16.cpp                 | 24 ++++++++++++-------
 projects/hip/src/math_functions.cpp           | 12 ++++------
 3 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/projects/hip/include/hip/hcc_detail/math_functions.h b/projects/hip/include/hip/hcc_detail/math_functions.h
index 5482f34093..e717df07c1 100644
--- a/projects/hip/include/hip/hcc_detail/math_functions.h
+++ b/projects/hip/include/hip/hcc_detail/math_functions.h
@@ -57,6 +57,7 @@ __device__ float exp2f(float x);
 __device__ float expf(float x);
 __device__ float expm1f(float x);
 __device__ int abs(int x);
+__device__ long long abs(long long x);
 __device__ float fabsf(float x);
 __device__ float fdimf(float x, float y);
 __device__ float fdividef(float x, float y);
diff --git a/projects/hip/src/hip_fp16.cpp b/projects/hip/src/hip_fp16.cpp
index e4c1b43786..3004b7805d 100644
--- a/projects/hip/src/hip_fp16.cpp
+++ b/projects/hip/src/hip_fp16.cpp
@@ -29,10 +29,6 @@ struct hipHalfHolder {
     };
 };
 
-#define HINF 65504
-
-__device__ static struct hipHalfHolder __hInfValue = {HINF};
-
 __device__ __half __hadd(__half a, __half b) { return a + b; }
 
 __device__ __half __hadd_sat(__half a, __half b) { return a + b; }
@@ -63,9 +59,21 @@ __device__ bool __hge(__half a, __half b) { return a >= b ? true : false; }
 
 __device__ bool __hgt(__half a, __half b) { return a > b ? true : false; }
 
-__device__ bool __hisinf(__half a) { return a == HINF ? true : false; }
+__device__ bool __hisinf(__half a) {
+  hipHalfHolder hH;
+  hH.h = a;
+  // mask with 0x7fff to drop the sign bit
+  // 0x7c00 is bit pattern for inf (exp = 11111, significand = 0)
+  return ((hH.s & 0x7fff) == 0x7c00) ? true : false;
+}
 
-__device__ bool __hisnan(__half a) { return a > HINF ? true : false; }
+__device__ bool __hisnan(__half a) {
+  hipHalfHolder hH;
+  hH.h = a;
+  // mask with 0x7fff to drop the sign bit
+  // 0x7cXX is bit pattern for inf (exp = 11111, significand = 0)
+  return ((hH.s & 0x7fff) > 0x7c00) ? true : false;
+}
 
 __device__ bool __hle(__half a, __half b) { return a <= b ? true : false; }
 
@@ -124,8 +132,8 @@ __device__ __half2 __hgt2(__half2 a, __half2 b) {
 
 __device__ __half2 __hisnan2(__half2 a) {
     __half2 c;
-    c.x = (a.x > HINF) ? (__half)1 : (__half)0;
-    c.y = (a.y > HINF) ? (__half)1 : (__half)0;
+    c.x = (__hisnan(a.x)) ? (__half)1 : (__half)0;
+    c.y = (__hisnan(a.y)) ? (__half)1 : (__half)0;
     return c;
 }
 
diff --git a/projects/hip/src/math_functions.cpp b/projects/hip/src/math_functions.cpp
index 3c0a7f6541..dedc40f2ae 100644
--- a/projects/hip/src/math_functions.cpp
+++ b/projects/hip/src/math_functions.cpp
@@ -56,6 +56,9 @@ __device__ float expm1f(float x) { return hc::precise_math::expm1f(x); }
 __device__ int abs(int x) {
     return x >= 0 ? x : -x;  // TODO - optimize with OCML
 }
+__device__ long long abs(long long x) {
+    return x >= 0 ? x : -x;  
+}
 __device__ float fabsf(float x) { return hc::precise_math::fabsf(x); }
 __device__ float fdimf(float x, float y) { return hc::precise_math::fdimf(x, y); }
 __device__ float fdividef(float x, float y) { return x / y; }
@@ -220,14 +223,7 @@ __device__ double j0(double x) { return __hip_j0(x); }
 __device__ double j1(double x) { return __hip_j1(x); }
 __device__ double jn(int n, double x) { return __hip_jn(n, x); }
 __device__ double ldexp(double x, int exp) { return hc::precise_math::ldexp(x, exp); }
-__device__ double lgamma(double x) {
-    double val = 0.0;
-    double y = x - 1;
-    while (y > 0) {
-        val += log(y--);
-    }
-    return val;
-}
+__device__ double lgamma(double x) { return hc::precise_math::lgamma(x); }
 __device__ long long int llrint(double x) {
     long long int y = hc::precise_math::round(x);
     return y;

From 2b3332ac726c620abf16ddbb77a4d48dbdebfa0e Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Wed, 25 Apr 2018 17:53:21 +0000
Subject: [PATCH 2/3] Checkin to add unit tests for fixes in my previous commit

This change adds unit tests for

1. __hisinf
2. __hisnan
2. abs(long long)
3. lgamma(double)


[ROCm/hip commit: 4be4cf644f576aa3b5f0455b7e38add755dbecb7]
---
 .../tests/src/deviceLib/hipMathFunctions.cpp  | 138 +++++++++++++++++
 .../hip/tests/src/deviceLib/hipTestHalf.cpp   | 145 ++++++++++++++++++
 2 files changed, 283 insertions(+)

diff --git a/projects/hip/tests/src/deviceLib/hipMathFunctions.cpp b/projects/hip/tests/src/deviceLib/hipMathFunctions.cpp
index ba3707bc59..ff69087f82 100644
--- a/projects/hip/tests/src/deviceLib/hipMathFunctions.cpp
+++ b/projects/hip/tests/src/deviceLib/hipMathFunctions.cpp
@@ -20,12 +20,150 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
 
 #include "hip/hip_runtime.h"
 #include "test_common.h"
 
+#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__
+
+__global__ void kernel_abs_int64(hipLaunchParm lp, long long *input, long long *output) {
+    int tx = threadIdx.x;
+    output[tx] = abs(input[tx]);
+}
+
+__global__ void kernel_lgamma_double(hipLaunchParm lp, double *input, double *output) {
+    int tx = threadIdx.x;
+    output[tx] = lgamma(input[tx]);
+}
+
+#endif
+
+#define CHECK_LGAMMA_DOUBLE(IN, OUT, EXP)		 \
+  {						 \
+    if (OUT != EXP)  {				 \
+      failed("check_abs_int64 failed on %f (output = %f, expected = %fd)\n", IN, OUT, EXP); \
+    }						 \
+  }
+
+#define CHECK_ABS_INT64(IN, OUT, EXP)		 \
+  {						 \
+    if (OUT != EXP)  {				 \
+      failed("check_abs_int64 failed on %lld (output = %lld, expected = %lld)\n", IN, OUT, EXP); \
+    }						 \
+  }
+
+void check_lgamma_double() {
+
+  using datatype_t = double;
+  
+  const int NUM_INPUTS = 8;
+  auto memsize = NUM_INPUTS * sizeof(datatype_t);
+  
+  // allocate memories
+  datatype_t *inputCPU = (datatype_t *) malloc(memsize);
+  datatype_t *outputCPU = (datatype_t *) malloc(memsize);
+  datatype_t *inputGPU = nullptr; hipMalloc((void**)&inputGPU, memsize);
+  datatype_t *outputGPU = nullptr; hipMalloc((void**)&outputGPU, memsize);
+  
+  // populate input
+  inputCPU[0] = -3.5;
+  inputCPU[0] = -2.5;
+  inputCPU[0] = -1.5;
+  inputCPU[0] = -0.5;
+  inputCPU[0] =  0.5;
+  inputCPU[0] =  1.5;
+  inputCPU[0] =  2.5;
+  inputCPU[0] =  3.5;
+  
+  // copy inputs to device
+  hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
+
+  // launch kernel
+  hipLaunchKernel(kernel_lgamma_double, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
+
+  // copy outputs from device
+  hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
+
+  // check outputs
+  for (int i=0; i<NUM_INPUTS; i++) {
+    CHECK_LGAMMA_DOUBLE(inputCPU[i], outputCPU[i], lgamma(inputCPU[i]));
+  }
+  
+  // free memories
+  hipFree(inputGPU);
+  hipFree(outputGPU);
+  free(inputCPU);
+  free(outputCPU);
+  
+  // done
+  return;
+}
+
+
+void check_abs_int64() {
+
+  using datatype_t = long long;
+
+  const int NUM_INPUTS = 8;
+  auto memsize = NUM_INPUTS * sizeof(datatype_t);
+  
+  // allocate memories
+  datatype_t *inputCPU = (datatype_t *) malloc(memsize);
+  datatype_t *outputCPU = (datatype_t *) malloc(memsize);
+  datatype_t *inputGPU = nullptr; hipMalloc((void**)&inputGPU, memsize);
+  datatype_t *outputGPU = nullptr; hipMalloc((void**)&outputGPU, memsize);
+  
+  // populate input
+  inputCPU[0] = -81985529216486895ll; 
+  inputCPU[1] =  81985529216486895ll;
+  inputCPU[2] = -1250999896491ll;
+  inputCPU[3] =  1250999896491ll;
+  inputCPU[4] = -19088743ll;
+  inputCPU[5] =  19088743ll;
+  inputCPU[6] = -291ll;
+  inputCPU[7] =  291ll;
+  
+  // copy inputs to device
+  hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
+
+  // launch kernel
+  hipLaunchKernel(kernel_abs_int64, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
+
+  // copy outputs from device
+  hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
+
+  // check outputs
+  CHECK_ABS_INT64(inputCPU[0], outputCPU[0], outputCPU[1]);
+  CHECK_ABS_INT64(inputCPU[1], outputCPU[1], outputCPU[1]);
+  CHECK_ABS_INT64(inputCPU[2], outputCPU[2], outputCPU[3]);
+  CHECK_ABS_INT64(inputCPU[3], outputCPU[3], outputCPU[3]);
+  CHECK_ABS_INT64(inputCPU[4], outputCPU[4], outputCPU[5]);
+  CHECK_ABS_INT64(inputCPU[5], outputCPU[5], outputCPU[5]);
+  CHECK_ABS_INT64(inputCPU[6], outputCPU[6], outputCPU[7]);
+  CHECK_ABS_INT64(inputCPU[7], outputCPU[7], outputCPU[7]);
+  
+  // free memories
+  hipFree(inputGPU);
+  hipFree(outputGPU);
+  free(inputCPU);
+  free(outputCPU);
+  
+  // done
+  return;
+}
+  
+
 int main(int argc, char* argv[]) {
     HipTest::parseStandardArguments(argc, argv, true);
 
+    check_abs_int64();
+
+    // check_lgamma_double();
+    
     passed();
 }
diff --git a/projects/hip/tests/src/deviceLib/hipTestHalf.cpp b/projects/hip/tests/src/deviceLib/hipTestHalf.cpp
index 9e5962f38f..4a63260bf7 100644
--- a/projects/hip/tests/src/deviceLib/hipTestHalf.cpp
+++ b/projects/hip/tests/src/deviceLib/hipTestHalf.cpp
@@ -17,6 +17,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
+
 #include <iostream>
 #include <hip/hip_fp16.h>
 #include "hip/hip_runtime.h"
@@ -59,8 +65,143 @@ __global__ void __half2Math(hipLaunchParm lp, __half2* A, __half2* B, __half2* C
     c = __hmul2_sat(b, c);
 }
 
+__global__ void kernel_hisnan(hipLaunchParm lp, __half* input, int* output) {
+    int tx = threadIdx.x;
+    output[tx] = __hisnan(input[tx]);
+}
+
+__global__ void kernel_hisinf(hipLaunchParm lp, __half* input, int* output) {
+    int tx = threadIdx.x;
+    output[tx] = __hisinf(input[tx]);
+}
+
 #endif
 
+
+__half host_ushort_as_half(unsigned short s) {
+  union {__half h; unsigned short s; } converter;
+  converter.s = s;
+  return converter.h;
+}
+
+
+void check_hisnan(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) {
+
+  // allocate memory
+  auto memsize = NUM_INPUTS * sizeof(int);
+  int* outputGPU = nullptr;
+  hipMalloc((void**)&outputGPU, memsize);
+
+  // launch the kernel
+  hipLaunchKernel(kernel_hisnan, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
+
+  // copy output from device
+  int* outputCPU = (int*) malloc(memsize);
+  hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
+
+  // check output
+  for (int i=0; i<NUM_INPUTS; i++) {
+    if ((2 <= i) && (i <= 5)) { // inputs are nan, output should be true
+      if (outputCPU[i] == 0) {
+	failed("__hisnan() returned false for %f (input idx = %d)\n", inputCPU[i], i);
+      }
+    }
+    else { // inputs are NOT nan, output should be false
+      if (outputCPU[i] != 0) {
+	failed("__hisnan() returned true for %f (input idx = %d)\n", inputCPU[i], i);
+      }
+    }
+  }
+
+  // free memory
+  free(outputCPU);
+  hipFree(outputGPU);
+
+  // done
+  return;
+}
+
+
+void check_hisinf(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) {
+  // allocate memory
+  auto memsize = NUM_INPUTS * sizeof(int);
+  int* outputGPU = nullptr;
+  hipMalloc((void**)&outputGPU, memsize);
+
+  // launch the kernel
+  hipLaunchKernel(kernel_hisinf, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
+
+  // copy output from device
+  int* outputCPU = (int*) malloc(memsize);
+  hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
+
+  // check output
+  for (int i=0; i<NUM_INPUTS; i++) {
+    if ((0 <= i) && (i <= 1)) { // inputs are inf, output should be true
+      if (outputCPU[i] == 0) {
+	failed("__hisinf() returned false for %f (input idx = %d)\n", inputCPU[i], i);
+      }
+    }
+    else { // inputs are NOT inf, output should be false
+      if (outputCPU[i] != 0) {
+	failed("__hisinf() returned true for %f (input idx = %d)\n", inputCPU[i], i);
+      }
+    }
+  }
+
+  // free memory
+  free(outputCPU);
+  hipFree(outputGPU);
+
+  // done
+  return;
+}
+
+
+void checkFunctional() {
+
+  // allocate memory 
+  const int NUM_INPUTS = 16;
+  auto memsize = NUM_INPUTS * sizeof(__half);
+  __half* inputCPU = (__half*) malloc(memsize);
+  
+  // populate inputs
+  inputCPU[0] = host_ushort_as_half(0x7c00);  //  inf
+  inputCPU[1] = host_ushort_as_half(0xfc00);  // -inf
+  inputCPU[2] = host_ushort_as_half(0x7c01);  //  nan
+  inputCPU[3] = host_ushort_as_half(0x7e00);  //  nan
+  inputCPU[4] = host_ushort_as_half(0xfc01);  //  nan
+  inputCPU[5] = host_ushort_as_half(0xfe00);  //  nan
+  inputCPU[6] = host_ushort_as_half(0x0000);  //  0
+  inputCPU[7] = host_ushort_as_half(0x8000);  // -0
+  inputCPU[8] = host_ushort_as_half(0x7bff);  // max +ve normal
+  inputCPU[9] = host_ushort_as_half(0xfbff);  // max -ve normal
+  inputCPU[10] = host_ushort_as_half(0x0400); // min +ve normal
+  inputCPU[11] = host_ushort_as_half(0x8400); // min -ve normal
+  inputCPU[12] = host_ushort_as_half(0x03ff); // max +ve sub-normal
+  inputCPU[13] = host_ushort_as_half(0x83ff); // max -ve sub-normal
+  inputCPU[14] = host_ushort_as_half(0x0001); // min +ve sub-normal
+  inputCPU[15] = host_ushort_as_half(0x8001); // min -ve sub-normal
+
+  // copy inputs to the GPU
+  __half* inputGPU = nullptr;
+  hipMalloc((void**)&inputGPU, memsize);
+  hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
+
+  // run checks
+
+  check_hisnan(NUM_INPUTS, inputCPU, inputGPU);
+
+  check_hisinf(NUM_INPUTS, inputCPU, inputGPU);
+
+  // free memory
+  hipFree(inputGPU);
+  free(inputCPU);
+
+  // all done
+  return;
+}
+
 int main() {
     __half *A, *B, *C;
     hipMalloc(&A, HALF_SIZE);
@@ -78,5 +219,9 @@ int main() {
     hipFree(A2);
     hipFree(B2);
     hipFree(C2);
+
+    // run some functional checks
+    checkFunctional();
+    
     passed();
 }

From 43c2724907b0c0f74836f886f41fb23a884cb4c9 Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Fri, 27 Apr 2018 12:59:51 +0000
Subject: [PATCH 3/3] Fixing a copy/paste error in my previous checkin

[ROCm/hip commit: 76a7d7e374ff193dfa73dcdc0504895f23aaada6]
---
 projects/hip/tests/src/deviceLib/hipMathFunctions.cpp | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/projects/hip/tests/src/deviceLib/hipMathFunctions.cpp b/projects/hip/tests/src/deviceLib/hipMathFunctions.cpp
index ff69087f82..7fe0003672 100644
--- a/projects/hip/tests/src/deviceLib/hipMathFunctions.cpp
+++ b/projects/hip/tests/src/deviceLib/hipMathFunctions.cpp
@@ -71,14 +71,9 @@ void check_lgamma_double() {
   datatype_t *outputGPU = nullptr; hipMalloc((void**)&outputGPU, memsize);
   
   // populate input
-  inputCPU[0] = -3.5;
-  inputCPU[0] = -2.5;
-  inputCPU[0] = -1.5;
-  inputCPU[0] = -0.5;
-  inputCPU[0] =  0.5;
-  inputCPU[0] =  1.5;
-  inputCPU[0] =  2.5;
-  inputCPU[0] =  3.5;
+  for (int i=0; i<NUM_INPUTS; i++) {
+    inputCPU[i] = -3.5 + i;
+  }
   
   // copy inputs to device
   hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);