From 5c1dc7a0718d5fd863f46949483104cb946c071e Mon Sep 17 00:00:00 2001
From: Aaron Enye Shi <enye.shi@gmail.com>
Date: Mon, 5 Nov 2018 22:34:16 +0000
Subject: [PATCH 1/4] Remove non-working non-default-rounded math apis

In ROCm-Device-Libs, they have dropped the non-default-rounded versions of add, sub, mul, div, sqrt and fma. Therefore, ocml has removed the rte, rtp, rtn, and rtz counterparts. This will remove the same math APIs in HIP for _ru, _rd, _rn, and _rz.


[ROCm/clr commit: cef6e8ef1fdda3378245a935f03949b0f0d606d6]
---
 .../include/hip/hcc_detail/math_functions.h   | 336 +++++++++---------
 .../hipamd/include/hip/hcc_detail/math_fwd.h  |  50 ++-
 .../hipDoublePrecisionIntrinsics.cpp          |  48 +--
 .../hipSinglePrecisionIntrinsics.cpp          |  48 +--
 4 files changed, 265 insertions(+), 217 deletions(-)

diff --git a/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h b/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
index b12e7aca89..8a6091858b 100644
--- a/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
+++ b/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
@@ -514,69 +514,69 @@ float __exp10f(float x) { return __ocml_exp10_f32(x); }
 __DEVICE__
 inline
 float __expf(float x) { return __ocml_exp_f32(x); }
-__DEVICE__
-inline
-float __fadd_rd(float x, float y) { return __ocml_add_rtp_f32(x, y); }
-__DEVICE__
-inline
-float __fadd_rn(float x, float y) { return __ocml_add_rte_f32(x, y); }
-__DEVICE__
-inline
-float __fadd_ru(float x, float y) { return __ocml_add_rtn_f32(x, y); }
-__DEVICE__
-inline
-float __fadd_rz(float x, float y) { return __ocml_add_rtz_f32(x, y); }
-__DEVICE__
-inline
-float __fdiv_rd(float x, float y) { return x / y; }
-__DEVICE__
-inline
-float __fdiv_rn(float x, float y) { return x / y; }
-__DEVICE__
-inline
-float __fdiv_ru(float x, float y) { return x / y; }
-__DEVICE__
-inline
-float __fdiv_rz(float x, float y) { return x / y; }
+// __DEVICE__
+// inline
+// float __fadd_rd(float x, float y) { return __ocml_add_rtn_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fadd_rn(float x, float y) { return __ocml_add_rte_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fadd_ru(float x, float y) { return __ocml_add_rtp_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fadd_rz(float x, float y) { return __ocml_add_rtz_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fdiv_rd(float x, float y) { return __ocml_div_rtn_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fdiv_rn(float x, float y) { return __ocml_div_rte_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fdiv_ru(float x, float y) { return __ocml_div_rtp_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fdiv_rz(float x, float y) { return __ocml_div_rtz_f32(x, y); }
 __DEVICE__
 inline
 float __fdividef(float x, float y) { return x / y; }
-__DEVICE__
-inline
-float __fmaf_rd(float x, float y, float z)
-{
-    return __ocml_fma_rtp_f32(x, y, z);
-}
-__DEVICE__
-inline
-float __fmaf_rn(float x, float y, float z)
-{
-    return __ocml_fma_rte_f32(x, y, z);
-}
-__DEVICE__
-inline
-float __fmaf_ru(float x, float y, float z)
-{
-    return __ocml_fma_rtn_f32(x, y, z);
-}
-__DEVICE__
-inline
-float __fmaf_rz(float x, float y, float z)
-{
-   return __ocml_fma_rtz_f32(x, y, z);
-}
-__DEVICE__
-inline
-float __fmul_rd(float x, float y) { return __ocml_mul_rtp_f32(x, y); }
-__DEVICE__
-inline
-float __fmul_rn(float x, float y) { return __ocml_mul_rte_f32(x, y); }
-__DEVICE__
-inline
-float __fmul_ru(float x, float y)  { return __ocml_mul_rtn_f32(x, y); }
-__DEVICE__
-inline
-float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fmaf_rd(float x, float y, float z)
+// {
+//     return __ocml_fma_rtn_f32(x, y, z);
+// }
+// __DEVICE__
+// inline
+// float __fmaf_rn(float x, float y, float z)
+// {
+//     return __ocml_fma_rte_f32(x, y, z);
+// }
+// __DEVICE__
+// inline
+// float __fmaf_ru(float x, float y, float z)
+// {
+//     return __ocml_fma_rtp_f32(x, y, z);
+// }
+// __DEVICE__
+// inline
+// float __fmaf_rz(float x, float y, float z)
+// {
+//    return __ocml_fma_rtz_f32(x, y, z);
+// }
+// __DEVICE__
+// inline
+// float __fmul_rd(float x, float y) { return __ocml_mul_rtn_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fmul_rn(float x, float y) { return __ocml_mul_rte_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fmul_ru(float x, float y)  { return __ocml_mul_rtp_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); }
 __DEVICE__
 inline
 float __frcp_rd(float x) { return __llvm_amdgcn_rcp_f32(x); }
@@ -592,30 +592,30 @@ float __frcp_rz(float x) { return __llvm_amdgcn_rcp_f32(x); }
 __DEVICE__
 inline
 float __frsqrt_rn(float x) { return __llvm_amdgcn_rsq_f32(x); }
-__DEVICE__
-inline
-float __fsqrt_rd(float x) { return __ocml_sqrt_f32(x); }
-__DEVICE__
-inline
-float __fsqrt_rn(float x) { return __ocml_sqrt_f32(x); }
-__DEVICE__
-inline
-float __fsqrt_ru(float x) { return __ocml_sqrt_f32(x); }
-__DEVICE__
-inline
-float __fsqrt_rz(float x) { return __ocml_sqrt_f32(x); }
-__DEVICE__
-inline
-float __fsub_rd(float x, float y) { return __ocml_sub_rtp_f32(x, y); }
-__DEVICE__
-inline
-float __fsub_rn(float x, float y) { return __ocml_sub_rte_f32(x, y); }
-__DEVICE__
-inline
-float __fsub_ru(float x, float y) { return __ocml_sub_rtn_f32(x, y); }
-__DEVICE__
-inline
-float __fsub_rz(float x, float y) { return __ocml_sub_rtz_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fsqrt_rd(float x) { return __ocml_sqrt_rtn_f32(x); }
+// __DEVICE__
+// inline
+// float __fsqrt_rn(float x) { return __ocml_sqrt_rte_f32(x); }
+// __DEVICE__
+// inline
+// float __fsqrt_ru(float x) { return __ocml_sqrt_rtp_f32(x); }
+// __DEVICE__
+// inline
+// float __fsqrt_rz(float x) { return __ocml_sqrt_rtz_f32(x); }
+// __DEVICE__
+// inline
+// float __fsub_rd(float x, float y) { return __ocml_sub_rtn_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fsub_rn(float x, float y) { return __ocml_sub_rte_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fsub_ru(float x, float y) { return __ocml_sub_rtp_f32(x, y); }
+// __DEVICE__
+// inline
+// float __fsub_rz(float x, float y) { return __ocml_sub_rtz_f32(x, y); }
 __DEVICE__
 inline
 float __log10f(float x) { return __ocml_log10_f32(x); }
@@ -1034,42 +1034,42 @@ double yn(int n, double x)
 }
 
 // BEGIN INTRINSICS
-__DEVICE__
-inline
-double __dadd_rd(double x, double y) { return __ocml_add_rtp_f64(x, y); }
-__DEVICE__
-inline
-double __dadd_rn(double x, double y) { return __ocml_add_rte_f64(x, y); }
-__DEVICE__
-inline
-double __dadd_ru(double x, double y) { return __ocml_add_rtn_f64(x, y); }
-__DEVICE__
-inline
-double __dadd_rz(double x, double y) { return __ocml_add_rtz_f64(x, y); }
-__DEVICE__
-inline
-double __ddiv_rd(double x, double y) { return x / y; }
-__DEVICE__
-inline
-double __ddiv_rn(double x, double y) { return x / y; }
-__DEVICE__
-inline
-double __ddiv_ru(double x, double y) { return x / y; }
-__DEVICE__
-inline
-double __ddiv_rz(double x, double y) { return x / y; }
-__DEVICE__
-inline
-double __dmul_rd(double x, double y) { return __ocml_mul_rtp_f64(x, y); }
-__DEVICE__
-inline
-double __dmul_rn(double x, double y) { return __ocml_mul_rte_f64(x, y); }
-__DEVICE__
-inline
-double __dmul_ru(double x, double y) { return __ocml_mul_rtn_f64(x, y); }
-__DEVICE__
-inline
-double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dadd_rd(double x, double y) { return __ocml_add_rtn_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dadd_rn(double x, double y) { return __ocml_add_rte_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dadd_ru(double x, double y) { return __ocml_add_rtp_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dadd_rz(double x, double y) { return __ocml_add_rtz_f64(x, y); }
+// __DEVICE__
+// inline
+// double __ddiv_rd(double x, double y) { return __ocml_div_rtn_f64(x, y); }
+// __DEVICE__
+// inline
+// double __ddiv_rn(double x, double y) { return __ocml_div_rte_f64(x, y); }
+// __DEVICE__
+// inline
+// double __ddiv_ru(double x, double y) { return __ocml_div_rtp_f64(x, y); }
+// __DEVICE__
+// inline
+// double __ddiv_rz(double x, double y) { return __ocml_div_rtz_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dmul_rd(double x, double y) { return __ocml_mul_rtn_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dmul_rn(double x, double y) { return __ocml_mul_rte_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dmul_ru(double x, double y) { return __ocml_mul_rtp_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); }
 __DEVICE__
 inline
 double __drcp_rd(double x) { return __llvm_amdgcn_rcp_f64(x); }
@@ -1082,54 +1082,54 @@ double __drcp_ru(double x) { return __llvm_amdgcn_rcp_f64(x); }
 __DEVICE__
 inline
 double __drcp_rz(double x) { return __llvm_amdgcn_rcp_f64(x); }
-__DEVICE__
-inline
-double __dsqrt_rd(double x) { return __ocml_sqrt_f64(x); }
-__DEVICE__
-inline
-double __dsqrt_rn(double x) { return __ocml_sqrt_f64(x); }
-__DEVICE__
-inline
-double __dsqrt_ru(double x) { return __ocml_sqrt_f64(x); }
-__DEVICE__
-inline
-double __dsqrt_rz(double x) { return __ocml_sqrt_f64(x); }
-__DEVICE__
-inline
-double __dsub_rd(double x, double y) { return __ocml_sub_rtp_f64(x, y); }
-__DEVICE__
-inline
-double __dsub_rn(double x, double y) { return __ocml_sub_rte_f64(x, y); }
-__DEVICE__
-inline
-double __dsub_ru(double x, double y) { return __ocml_sub_rtn_f64(x, y); }
-__DEVICE__
-inline
-double __dsub_rz(double x, double y) { return __ocml_sub_rtz_f64(x, y); }
-__DEVICE__
-inline
-double __fma_rd(double x, double y, double z)
-{
-    return __ocml_fma_rtp_f64(x, y, z);
-}
-__DEVICE__
-inline
-double __fma_rn(double x, double y, double z)
-{
-    return __ocml_fma_rte_f64(x, y, z);
-}
-__DEVICE__
-inline
-double __fma_ru(double x, double y, double z)
-{
-    return __ocml_fma_rtn_f64(x, y, z);
-}
-__DEVICE__
-inline
-double __fma_rz(double x, double y, double z)
-{
-    return __ocml_fma_rtz_f64(x, y, z);
-}
+// __DEVICE__
+// inline
+// double __dsqrt_rd(double x) { return __ocml_sqrt_rtn_f64(x); }
+// __DEVICE__
+// inline
+// double __dsqrt_rn(double x) { return __ocml_sqrt_rte_f64(x); }
+// __DEVICE__
+// inline
+// double __dsqrt_ru(double x) { return __ocml_sqrt_rtp_f64(x); }
+// __DEVICE__
+// inline
+// double __dsqrt_rz(double x) { return __ocml_sqrt_rtz_f64(x); }
+// __DEVICE__
+// inline
+// double __dsub_rd(double x, double y) { return __ocml_sub_rtn_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dsub_rn(double x, double y) { return __ocml_sub_rte_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dsub_ru(double x, double y) { return __ocml_sub_rtp_f64(x, y); }
+// __DEVICE__
+// inline
+// double __dsub_rz(double x, double y) { return __ocml_sub_rtz_f64(x, y); }
+// __DEVICE__
+// inline
+// double __fma_rd(double x, double y, double z)
+// {
+//     return __ocml_fma_rtn_f64(x, y, z);
+// }
+// __DEVICE__
+// inline
+// double __fma_rn(double x, double y, double z)
+// {
+//     return __ocml_fma_rte_f64(x, y, z);
+// }
+// __DEVICE__
+// inline
+// double __fma_ru(double x, double y, double z)
+// {
+//     return __ocml_fma_rtp_f64(x, y, z);
+// }
+// __DEVICE__
+// inline
+// double __fma_rz(double x, double y, double z)
+// {
+//     return __ocml_fma_rtz_f64(x, y, z);
+// }
 // END INTRINSICS
 // END DOUBLE
 
diff --git a/projects/clr/hipamd/include/hip/hcc_detail/math_fwd.h b/projects/clr/hipamd/include/hip/hcc_detail/math_fwd.h
index 404c2f81d5..e5594924ba 100644
--- a/projects/clr/hipamd/include/hip/hcc_detail/math_fwd.h
+++ b/projects/clr/hipamd/include/hip/hcc_detail/math_fwd.h
@@ -288,6 +288,30 @@ __attribute__((const))
 float __ocml_mul_rtz_f32(float, float);
 __device__
 __attribute__((const))
+float __ocml_div_rte_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rtn_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rtp_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rtz_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rte_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rtn_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rtp_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rtz_f32(float, float);
+__device__
+__attribute__((const))
 float __ocml_fma_rte_f32(float, float, float);
 __device__
 __attribute__((const))
@@ -572,6 +596,30 @@ __attribute__((const))
 double __ocml_mul_rtz_f64(double, double);
 __device__
 __attribute__((const))
+double __ocml_div_rte_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rtn_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rtp_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rtz_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rte_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rtn_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rtp_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rtz_f64(double, double);
+__device__
+__attribute__((const))
 double __ocml_fma_rte_f64(double, double, double);
 __device__
 __attribute__((const))
@@ -594,4 +642,4 @@ double __llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
 
 #if defined(__cplusplus)
     } // extern "C"
-#endif
\ No newline at end of file
+#endif
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
index 17cd82c9ab..f6c515c03a 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
@@ -34,34 +34,34 @@ THE SOFTWARE.
 #pragma clang diagnostic ignored "-Wunused-variable"
 
 __device__ void double_precision_intrinsics() {
-    __dadd_rd(0.0, 1.0);
-    __dadd_rn(0.0, 1.0);
-    __dadd_ru(0.0, 1.0);
-    __dadd_rz(0.0, 1.0);
-    __ddiv_rd(0.0, 1.0);
-    __ddiv_rn(0.0, 1.0);
-    __ddiv_ru(0.0, 1.0);
-    __ddiv_rz(0.0, 1.0);
-    __dmul_rd(1.0, 2.0);
-    __dmul_rn(1.0, 2.0);
-    __dmul_ru(1.0, 2.0);
-    __dmul_rz(1.0, 2.0);
+    // __dadd_rd(0.0, 1.0);
+    // __dadd_rn(0.0, 1.0);
+    // __dadd_ru(0.0, 1.0);
+    // __dadd_rz(0.0, 1.0);
+    // __ddiv_rd(0.0, 1.0);
+    // __ddiv_rn(0.0, 1.0);
+    // __ddiv_ru(0.0, 1.0);
+    // __ddiv_rz(0.0, 1.0);
+    // __dmul_rd(1.0, 2.0);
+    // __dmul_rn(1.0, 2.0);
+    // __dmul_ru(1.0, 2.0);
+    // __dmul_rz(1.0, 2.0);
     __drcp_rd(2.0);
     __drcp_rn(2.0);
     __drcp_ru(2.0);
     __drcp_rz(2.0);
-    __dsqrt_rd(4.0);
-    __dsqrt_rn(4.0);
-    __dsqrt_ru(4.0);
-    __dsqrt_rz(4.0);
-    __dsub_rd(2.0, 1.0);
-    __dsub_rn(2.0, 1.0);
-    __dsub_ru(2.0, 1.0);
-    __dsub_rz(2.0, 1.0);
-    __fma_rd(1.0, 2.0, 3.0);
-    __fma_rn(1.0, 2.0, 3.0);
-    __fma_ru(1.0, 2.0, 3.0);
-    __fma_rz(1.0, 2.0, 3.0);
+    // __dsqrt_rd(4.0);
+    // __dsqrt_rn(4.0);
+    // __dsqrt_ru(4.0);
+    // __dsqrt_rz(4.0);
+    // __dsub_rd(2.0, 1.0);
+    // __dsub_rn(2.0, 1.0);
+    // __dsub_ru(2.0, 1.0);
+    // __dsub_rz(2.0, 1.0);
+    // __fma_rd(1.0, 2.0, 3.0);
+    // __fma_rn(1.0, 2.0, 3.0);
+    // __fma_ru(1.0, 2.0, 3.0);
+    // __fma_rz(1.0, 2.0, 3.0);
 }
 
 __global__ void compileDoublePrecisionIntrinsics(int ignored) {
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
index f3d2a36931..623ea08a94 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
@@ -39,36 +39,36 @@ __device__ void single_precision_intrinsics() {
     __cosf(0.0f);
     __exp10f(0.0f);
     __expf(0.0f);
-    __fadd_rd(0.0f, 1.0f);
-    __fadd_rn(0.0f, 1.0f);
-    __fadd_ru(0.0f, 1.0f);
-    __fadd_rz(0.0f, 1.0f);
-    __fdiv_rd(4.0f, 2.0f);
-    __fdiv_rn(4.0f, 2.0f);
-    __fdiv_ru(4.0f, 2.0f);
-    __fdiv_rz(4.0f, 2.0f);
+    // __fadd_rd(0.0f, 1.0f);
+    // __fadd_rn(0.0f, 1.0f);
+    // __fadd_ru(0.0f, 1.0f);
+    // __fadd_rz(0.0f, 1.0f);
+    // __fdiv_rd(4.0f, 2.0f);
+    // __fdiv_rn(4.0f, 2.0f);
+    // __fdiv_ru(4.0f, 2.0f);
+    // __fdiv_rz(4.0f, 2.0f);
     __fdividef(4.0f, 2.0f);
-    __fmaf_rd(1.0f, 2.0f, 3.0f);
-    __fmaf_rn(1.0f, 2.0f, 3.0f);
-    __fmaf_ru(1.0f, 2.0f, 3.0f);
-    __fmaf_rz(1.0f, 2.0f, 3.0f);
-    __fmul_rd(1.0f, 2.0f);
-    __fmul_rn(1.0f, 2.0f);
-    __fmul_ru(1.0f, 2.0f);
-    __fmul_rz(1.0f, 2.0f);
+    // __fmaf_rd(1.0f, 2.0f, 3.0f);
+    // __fmaf_rn(1.0f, 2.0f, 3.0f);
+    // __fmaf_ru(1.0f, 2.0f, 3.0f);
+    // __fmaf_rz(1.0f, 2.0f, 3.0f);
+    // __fmul_rd(1.0f, 2.0f);
+    // __fmul_rn(1.0f, 2.0f);
+    // __fmul_ru(1.0f, 2.0f);
+    // __fmul_rz(1.0f, 2.0f);
     __frcp_rd(2.0f);
     __frcp_rn(2.0f);
     __frcp_ru(2.0f);
     __frcp_rz(2.0f);
     __frsqrt_rn(4.0f);
-    __fsqrt_rd(4.0f);
-    __fsqrt_rn(4.0f);
-    __fsqrt_ru(4.0f);
-    __fsqrt_rz(4.0f);
-    __fsub_rd(2.0f, 1.0f);
-    __fsub_rn(2.0f, 1.0f);
-    __fsub_ru(2.0f, 1.0f);
-    __fsub_rz(2.0f, 1.0f);
+    // __fsqrt_rd(4.0f);
+    // __fsqrt_rn(4.0f);
+    // __fsqrt_ru(4.0f);
+    // __fsqrt_rz(4.0f);
+    // __fsub_rd(2.0f, 1.0f);
+    // __fsub_rn(2.0f, 1.0f);
+    // __fsub_ru(2.0f, 1.0f);
+    // __fsub_rz(2.0f, 1.0f);
     __log10f(1.0f);
     __log2f(1.0f);
     __logf(1.0f);

From 4587e32e46ce8344e42e71b8e4b32fdf5a13d74d Mon Sep 17 00:00:00 2001
From: Aaron Enye Shi <enye.shi@gmail.com>
Date: Mon, 5 Nov 2018 22:54:55 +0000
Subject: [PATCH 2/4] Update hip-math-api doc to remove non-default-rounded

[ROCm/clr commit: 789814ab3055ae21d5452877a5c7831962e5549b]
---
 .../clr/hipamd/docs/markdown/hip-math-api.md  | 102 ++++++++++--------
 1 file changed, 55 insertions(+), 47 deletions(-)

diff --git a/projects/clr/hipamd/docs/markdown/hip-math-api.md b/projects/clr/hipamd/docs/markdown/hip-math-api.md
index 37efafbbbf..b3698ff2b3 100644
--- a/projects/clr/hipamd/docs/markdown/hip-math-api.md
+++ b/projects/clr/hipamd/docs/markdown/hip-math-api.md
@@ -1433,7 +1433,7 @@ __device__  float __expf(float x);
 __device__ static  float __fadd_rd(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fadd_rn
@@ -1441,7 +1441,7 @@ __device__ static  float __fadd_rd(float x, float y);
 __device__ static  float __fadd_rn(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fadd_ru
@@ -1449,7 +1449,7 @@ __device__ static  float __fadd_rn(float x, float y);
 __device__ static  float __fadd_ru(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fadd_rz
@@ -1457,7 +1457,7 @@ __device__ static  float __fadd_ru(float x, float y);
 __device__ static  float __fadd_rz(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fdiv_rd
@@ -1465,7 +1465,7 @@ __device__ static  float __fadd_rz(float x, float y);
 __device__ static  float __fdiv_rd(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fdiv_rn
@@ -1473,7 +1473,7 @@ __device__ static  float __fdiv_rd(float x, float y);
 __device__ static  float __fdiv_rn(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fdiv_ru
@@ -1481,7 +1481,7 @@ __device__ static  float __fdiv_rn(float x, float y);
 __device__ static  float __fdiv_ru(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fdiv_rz
@@ -1489,7 +1489,7 @@ __device__ static  float __fdiv_ru(float x, float y);
 __device__ static  float __fdiv_rz(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fdividef
@@ -1505,7 +1505,7 @@ __device__ static  float __fdividef(float x, float y);
 __device__  float __fmaf_rd(float x, float y, float z);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fmaf_rn
@@ -1513,7 +1513,7 @@ __device__  float __fmaf_rd(float x, float y, float z);
 __device__  float __fmaf_rn(float x, float y, float z);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fmaf_ru
@@ -1521,7 +1521,7 @@ __device__  float __fmaf_rn(float x, float y, float z);
 __device__  float __fmaf_ru(float x, float y, float z);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fmaf_rz
@@ -1529,7 +1529,7 @@ __device__  float __fmaf_ru(float x, float y, float z);
 __device__  float __fmaf_rz(float x, float y, float z);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fmul_rd
@@ -1537,7 +1537,7 @@ __device__  float __fmaf_rz(float x, float y, float z);
 __device__ static  float __fmul_rd(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fmul_rn
@@ -1545,7 +1545,7 @@ __device__ static  float __fmul_rd(float x, float y);
 __device__ static  float __fmul_rn(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fmul_ru
@@ -1553,7 +1553,7 @@ __device__ static  float __fmul_rn(float x, float y);
 __device__ static  float __fmul_ru(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fmul_rz
@@ -1561,7 +1561,7 @@ __device__ static  float __fmul_ru(float x, float y);
 __device__ static  float __fmul_rz(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __frcp_rd
@@ -1609,7 +1609,7 @@ __device__  float __frsqrt_rn(float x);
 __device__  float __fsqrt_rd(float x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fsqrt_rn
@@ -1617,7 +1617,7 @@ __device__  float __fsqrt_rd(float x);
 __device__  float __fsqrt_rn(float x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fsqrt_ru
@@ -1625,7 +1625,7 @@ __device__  float __fsqrt_rn(float x);
 __device__  float __fsqrt_ru(float x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fsqrt_rz
@@ -1633,7 +1633,7 @@ __device__  float __fsqrt_ru(float x);
 __device__  float __fsqrt_rz(float x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fsub_rd
@@ -1641,7 +1641,7 @@ __device__  float __fsqrt_rz(float x);
 __device__ static  float __fsub_rd(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fsub_rn
@@ -1649,7 +1649,7 @@ __device__ static  float __fsub_rd(float x, float y);
 __device__ static  float __fsub_rn(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fsub_ru
@@ -1657,7 +1657,15 @@ __device__ static  float __fsub_rn(float x, float y);
 __device__ static  float __fsub_ru(float x, float y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
+
+
+### __fsub_rz
+```cpp 
+__device__ static  float __fsub_rz(float x, float y);
+
+```
+**Description:**  Unsupported
 
 
 ### __log10f
@@ -1729,7 +1737,7 @@ __device__  float __tanf(float x);
 __device__ static  double __dadd_rd(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dadd_rn
@@ -1737,7 +1745,7 @@ __device__ static  double __dadd_rd(double x, double y);
 __device__ static  double __dadd_rn(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dadd_ru
@@ -1745,7 +1753,7 @@ __device__ static  double __dadd_rn(double x, double y);
 __device__ static  double __dadd_ru(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dadd_rz
@@ -1753,7 +1761,7 @@ __device__ static  double __dadd_ru(double x, double y);
 __device__ static  double __dadd_rz(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __ddiv_rd
@@ -1761,7 +1769,7 @@ __device__ static  double __dadd_rz(double x, double y);
 __device__ static  double __ddiv_rd(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __ddiv_rn
@@ -1769,7 +1777,7 @@ __device__ static  double __ddiv_rd(double x, double y);
 __device__ static  double __ddiv_rn(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __ddiv_ru
@@ -1777,7 +1785,7 @@ __device__ static  double __ddiv_rn(double x, double y);
 __device__ static  double __ddiv_ru(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __ddiv_rz
@@ -1785,7 +1793,7 @@ __device__ static  double __ddiv_ru(double x, double y);
 __device__ static  double __ddiv_rz(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dmul_rd
@@ -1793,7 +1801,7 @@ __device__ static  double __ddiv_rz(double x, double y);
 __device__ static  double __dmul_rd(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dmul_rn
@@ -1801,7 +1809,7 @@ __device__ static  double __dmul_rd(double x, double y);
 __device__ static  double __dmul_rn(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dmul_ru
@@ -1809,7 +1817,7 @@ __device__ static  double __dmul_rn(double x, double y);
 __device__ static  double __dmul_ru(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dmul_rz
@@ -1817,7 +1825,7 @@ __device__ static  double __dmul_ru(double x, double y);
 __device__ static  double __dmul_rz(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __drcp_rd
@@ -1857,7 +1865,7 @@ __device__  double __drcp_rz(double x);
 __device__  double __dsqrt_rd(double x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dsqrt_rn
@@ -1865,7 +1873,7 @@ __device__  double __dsqrt_rd(double x);
 __device__  double __dsqrt_rn(double x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dsqrt_ru
@@ -1873,7 +1881,7 @@ __device__  double __dsqrt_rn(double x);
 __device__  double __dsqrt_ru(double x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dsqrt_rz
@@ -1881,7 +1889,7 @@ __device__  double __dsqrt_ru(double x);
 __device__  double __dsqrt_rz(double x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dsub_rd
@@ -1889,7 +1897,7 @@ __device__  double __dsqrt_rz(double x);
 __device__ static  double __dsub_rd(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dsub_rn
@@ -1897,7 +1905,7 @@ __device__ static  double __dsub_rd(double x, double y);
 __device__ static  double __dsub_rn(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dsub_ru
@@ -1905,7 +1913,7 @@ __device__ static  double __dsub_rn(double x, double y);
 __device__ static  double __dsub_ru(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dsub_rz
@@ -1913,7 +1921,7 @@ __device__ static  double __dsub_ru(double x, double y);
 __device__ static  double __dsub_rz(double x, double y);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fma_rd
@@ -1921,7 +1929,7 @@ __device__ static  double __dsub_rz(double x, double y);
 __device__  double __fma_rd(double x, double y, double z);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fma_rn
@@ -1929,7 +1937,7 @@ __device__  double __fma_rd(double x, double y, double z);
 __device__  double __fma_rn(double x, double y, double z);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fma_ru
@@ -1937,7 +1945,7 @@ __device__  double __fma_rn(double x, double y, double z);
 __device__  double __fma_ru(double x, double y, double z);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fma_rz
@@ -1945,7 +1953,7 @@ __device__  double __fma_ru(double x, double y, double z);
 __device__  double __fma_rz(double x, double y, double z);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __brev

From 4480bb6d065a6724db1a889f6197cc10b5940d6c Mon Sep 17 00:00:00 2001
From: Aaron Enye Shi <enye.shi@gmail.com>
Date: Tue, 6 Nov 2018 16:32:14 +0000
Subject: [PATCH 3/4] Guard the OCML rounded operations instead

Instead of commenting all these functions out, guard the functions with a macro OCML_BASIC_ROUNDED_OPERATIONS.


[ROCm/clr commit: 9aa92238ab6cb42df6bcee70e7f54008ec0bd370]
---
 .../include/hip/hcc_detail/math_functions.h   | 346 +++++++++---------
 .../hipDoublePrecisionIntrinsics.cpp          |  52 +--
 .../hipSinglePrecisionIntrinsics.cpp          |  54 +--
 3 files changed, 236 insertions(+), 216 deletions(-)

diff --git a/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h b/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
index 8a6091858b..08be321d68 100644
--- a/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
+++ b/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
@@ -514,69 +514,73 @@ float __exp10f(float x) { return __ocml_exp10_f32(x); }
 __DEVICE__
 inline
 float __expf(float x) { return __ocml_exp_f32(x); }
-// __DEVICE__
-// inline
-// float __fadd_rd(float x, float y) { return __ocml_add_rtn_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fadd_rn(float x, float y) { return __ocml_add_rte_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fadd_ru(float x, float y) { return __ocml_add_rtp_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fadd_rz(float x, float y) { return __ocml_add_rtz_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fdiv_rd(float x, float y) { return __ocml_div_rtn_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fdiv_rn(float x, float y) { return __ocml_div_rte_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fdiv_ru(float x, float y) { return __ocml_div_rtp_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fdiv_rz(float x, float y) { return __ocml_div_rtz_f32(x, y); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fadd_rd(float x, float y) { return __ocml_add_rtn_f32(x, y); }
+__DEVICE__
+inline
+float __fadd_rn(float x, float y) { return __ocml_add_rte_f32(x, y); }
+__DEVICE__
+inline
+float __fadd_ru(float x, float y) { return __ocml_add_rtp_f32(x, y); }
+__DEVICE__
+inline
+float __fadd_rz(float x, float y) { return __ocml_add_rtz_f32(x, y); }
+__DEVICE__
+inline
+float __fdiv_rd(float x, float y) { return __ocml_div_rtn_f32(x, y); }
+__DEVICE__
+inline
+float __fdiv_rn(float x, float y) { return __ocml_div_rte_f32(x, y); }
+__DEVICE__
+inline
+float __fdiv_ru(float x, float y) { return __ocml_div_rtp_f32(x, y); }
+__DEVICE__
+inline
+float __fdiv_rz(float x, float y) { return __ocml_div_rtz_f32(x, y); }
+#endif
 __DEVICE__
 inline
 float __fdividef(float x, float y) { return x / y; }
-// __DEVICE__
-// inline
-// float __fmaf_rd(float x, float y, float z)
-// {
-//     return __ocml_fma_rtn_f32(x, y, z);
-// }
-// __DEVICE__
-// inline
-// float __fmaf_rn(float x, float y, float z)
-// {
-//     return __ocml_fma_rte_f32(x, y, z);
-// }
-// __DEVICE__
-// inline
-// float __fmaf_ru(float x, float y, float z)
-// {
-//     return __ocml_fma_rtp_f32(x, y, z);
-// }
-// __DEVICE__
-// inline
-// float __fmaf_rz(float x, float y, float z)
-// {
-//    return __ocml_fma_rtz_f32(x, y, z);
-// }
-// __DEVICE__
-// inline
-// float __fmul_rd(float x, float y) { return __ocml_mul_rtn_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fmul_rn(float x, float y) { return __ocml_mul_rte_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fmul_ru(float x, float y)  { return __ocml_mul_rtp_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fmaf_rd(float x, float y, float z)
+{
+    return __ocml_fma_rtn_f32(x, y, z);
+}
+__DEVICE__
+inline
+float __fmaf_rn(float x, float y, float z)
+{
+    return __ocml_fma_rte_f32(x, y, z);
+}
+__DEVICE__
+inline
+float __fmaf_ru(float x, float y, float z)
+{
+    return __ocml_fma_rtp_f32(x, y, z);
+}
+__DEVICE__
+inline
+float __fmaf_rz(float x, float y, float z)
+{
+   return __ocml_fma_rtz_f32(x, y, z);
+}
+__DEVICE__
+inline
+float __fmul_rd(float x, float y) { return __ocml_mul_rtn_f32(x, y); }
+__DEVICE__
+inline
+float __fmul_rn(float x, float y) { return __ocml_mul_rte_f32(x, y); }
+__DEVICE__
+inline
+float __fmul_ru(float x, float y)  { return __ocml_mul_rtp_f32(x, y); }
+__DEVICE__
+inline
+float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); }
+#endif
 __DEVICE__
 inline
 float __frcp_rd(float x) { return __llvm_amdgcn_rcp_f32(x); }
@@ -592,30 +596,32 @@ float __frcp_rz(float x) { return __llvm_amdgcn_rcp_f32(x); }
 __DEVICE__
 inline
 float __frsqrt_rn(float x) { return __llvm_amdgcn_rsq_f32(x); }
-// __DEVICE__
-// inline
-// float __fsqrt_rd(float x) { return __ocml_sqrt_rtn_f32(x); }
-// __DEVICE__
-// inline
-// float __fsqrt_rn(float x) { return __ocml_sqrt_rte_f32(x); }
-// __DEVICE__
-// inline
-// float __fsqrt_ru(float x) { return __ocml_sqrt_rtp_f32(x); }
-// __DEVICE__
-// inline
-// float __fsqrt_rz(float x) { return __ocml_sqrt_rtz_f32(x); }
-// __DEVICE__
-// inline
-// float __fsub_rd(float x, float y) { return __ocml_sub_rtn_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fsub_rn(float x, float y) { return __ocml_sub_rte_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fsub_ru(float x, float y) { return __ocml_sub_rtp_f32(x, y); }
-// __DEVICE__
-// inline
-// float __fsub_rz(float x, float y) { return __ocml_sub_rtz_f32(x, y); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fsqrt_rd(float x) { return __ocml_sqrt_rtn_f32(x); }
+__DEVICE__
+inline
+float __fsqrt_rn(float x) { return __ocml_sqrt_rte_f32(x); }
+__DEVICE__
+inline
+float __fsqrt_ru(float x) { return __ocml_sqrt_rtp_f32(x); }
+__DEVICE__
+inline
+float __fsqrt_rz(float x) { return __ocml_sqrt_rtz_f32(x); }
+__DEVICE__
+inline
+float __fsub_rd(float x, float y) { return __ocml_sub_rtn_f32(x, y); }
+__DEVICE__
+inline
+float __fsub_rn(float x, float y) { return __ocml_sub_rte_f32(x, y); }
+__DEVICE__
+inline
+float __fsub_ru(float x, float y) { return __ocml_sub_rtp_f32(x, y); }
+__DEVICE__
+inline
+float __fsub_rz(float x, float y) { return __ocml_sub_rtz_f32(x, y); }
+#endif
 __DEVICE__
 inline
 float __log10f(float x) { return __ocml_log10_f32(x); }
@@ -1034,42 +1040,44 @@ double yn(int n, double x)
 }
 
 // BEGIN INTRINSICS
-// __DEVICE__
-// inline
-// double __dadd_rd(double x, double y) { return __ocml_add_rtn_f64(x, y); }
-// __DEVICE__
-// inline
-// double __dadd_rn(double x, double y) { return __ocml_add_rte_f64(x, y); }
-// __DEVICE__
-// inline
-// double __dadd_ru(double x, double y) { return __ocml_add_rtp_f64(x, y); }
-// __DEVICE__
-// inline
-// double __dadd_rz(double x, double y) { return __ocml_add_rtz_f64(x, y); }
-// __DEVICE__
-// inline
-// double __ddiv_rd(double x, double y) { return __ocml_div_rtn_f64(x, y); }
-// __DEVICE__
-// inline
-// double __ddiv_rn(double x, double y) { return __ocml_div_rte_f64(x, y); }
-// __DEVICE__
-// inline
-// double __ddiv_ru(double x, double y) { return __ocml_div_rtp_f64(x, y); }
-// __DEVICE__
-// inline
-// double __ddiv_rz(double x, double y) { return __ocml_div_rtz_f64(x, y); }
-// __DEVICE__
-// inline
-// double __dmul_rd(double x, double y) { return __ocml_mul_rtn_f64(x, y); }
-// __DEVICE__
-// inline
-// double __dmul_rn(double x, double y) { return __ocml_mul_rte_f64(x, y); }
-// __DEVICE__
-// inline
-// double __dmul_ru(double x, double y) { return __ocml_mul_rtp_f64(x, y); }
-// __DEVICE__
-// inline
-// double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dadd_rd(double x, double y) { return __ocml_add_rtn_f64(x, y); }
+__DEVICE__
+inline
+double __dadd_rn(double x, double y) { return __ocml_add_rte_f64(x, y); }
+__DEVICE__
+inline
+double __dadd_ru(double x, double y) { return __ocml_add_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __dadd_rz(double x, double y) { return __ocml_add_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __ddiv_rd(double x, double y) { return __ocml_div_rtn_f64(x, y); }
+__DEVICE__
+inline
+double __ddiv_rn(double x, double y) { return __ocml_div_rte_f64(x, y); }
+__DEVICE__
+inline
+double __ddiv_ru(double x, double y) { return __ocml_div_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __ddiv_rz(double x, double y) { return __ocml_div_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __dmul_rd(double x, double y) { return __ocml_mul_rtn_f64(x, y); }
+__DEVICE__
+inline
+double __dmul_rn(double x, double y) { return __ocml_mul_rte_f64(x, y); }
+__DEVICE__
+inline
+double __dmul_ru(double x, double y) { return __ocml_mul_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); }
+#endif
 __DEVICE__
 inline
 double __drcp_rd(double x) { return __llvm_amdgcn_rcp_f64(x); }
@@ -1082,54 +1090,56 @@ double __drcp_ru(double x) { return __llvm_amdgcn_rcp_f64(x); }
 __DEVICE__
 inline
 double __drcp_rz(double x) { return __llvm_amdgcn_rcp_f64(x); }
-// __DEVICE__
-// inline
-// double __dsqrt_rd(double x) { return __ocml_sqrt_rtn_f64(x); }
-// __DEVICE__
-// inline
-// double __dsqrt_rn(double x) { return __ocml_sqrt_rte_f64(x); }
-// __DEVICE__
-// inline
-// double __dsqrt_ru(double x) { return __ocml_sqrt_rtp_f64(x); }
-// __DEVICE__
-// inline
-// double __dsqrt_rz(double x) { return __ocml_sqrt_rtz_f64(x); }
-// __DEVICE__
-// inline
-// double __dsub_rd(double x, double y) { return __ocml_sub_rtn_f64(x, y); }
-// __DEVICE__
-// inline
-// double __dsub_rn(double x, double y) { return __ocml_sub_rte_f64(x, y); }
-// __DEVICE__
-// inline
-// double __dsub_ru(double x, double y) { return __ocml_sub_rtp_f64(x, y); }
-// __DEVICE__
-// inline
-// double __dsub_rz(double x, double y) { return __ocml_sub_rtz_f64(x, y); }
-// __DEVICE__
-// inline
-// double __fma_rd(double x, double y, double z)
-// {
-//     return __ocml_fma_rtn_f64(x, y, z);
-// }
-// __DEVICE__
-// inline
-// double __fma_rn(double x, double y, double z)
-// {
-//     return __ocml_fma_rte_f64(x, y, z);
-// }
-// __DEVICE__
-// inline
-// double __fma_ru(double x, double y, double z)
-// {
-//     return __ocml_fma_rtp_f64(x, y, z);
-// }
-// __DEVICE__
-// inline
-// double __fma_rz(double x, double y, double z)
-// {
-//     return __ocml_fma_rtz_f64(x, y, z);
-// }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dsqrt_rd(double x) { return __ocml_sqrt_rtn_f64(x); }
+__DEVICE__
+inline
+double __dsqrt_rn(double x) { return __ocml_sqrt_rte_f64(x); }
+__DEVICE__
+inline
+double __dsqrt_ru(double x) { return __ocml_sqrt_rtp_f64(x); }
+__DEVICE__
+inline
+double __dsqrt_rz(double x) { return __ocml_sqrt_rtz_f64(x); }
+__DEVICE__
+inline
+double __dsub_rd(double x, double y) { return __ocml_sub_rtn_f64(x, y); }
+__DEVICE__
+inline
+double __dsub_rn(double x, double y) { return __ocml_sub_rte_f64(x, y); }
+__DEVICE__
+inline
+double __dsub_ru(double x, double y) { return __ocml_sub_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __dsub_rz(double x, double y) { return __ocml_sub_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __fma_rd(double x, double y, double z)
+{
+    return __ocml_fma_rtn_f64(x, y, z);
+}
+__DEVICE__
+inline
+double __fma_rn(double x, double y, double z)
+{
+    return __ocml_fma_rte_f64(x, y, z);
+}
+__DEVICE__
+inline
+double __fma_ru(double x, double y, double z)
+{
+    return __ocml_fma_rtp_f64(x, y, z);
+}
+__DEVICE__
+inline
+double __fma_rz(double x, double y, double z)
+{
+    return __ocml_fma_rtz_f64(x, y, z);
+}
+#endif
 // END INTRINSICS
 // END DOUBLE
 
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
index f6c515c03a..295fd83708 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
@@ -34,34 +34,38 @@ THE SOFTWARE.
 #pragma clang diagnostic ignored "-Wunused-variable"
 
 __device__ void double_precision_intrinsics() {
-    // __dadd_rd(0.0, 1.0);
-    // __dadd_rn(0.0, 1.0);
-    // __dadd_ru(0.0, 1.0);
-    // __dadd_rz(0.0, 1.0);
-    // __ddiv_rd(0.0, 1.0);
-    // __ddiv_rn(0.0, 1.0);
-    // __ddiv_ru(0.0, 1.0);
-    // __ddiv_rz(0.0, 1.0);
-    // __dmul_rd(1.0, 2.0);
-    // __dmul_rn(1.0, 2.0);
-    // __dmul_ru(1.0, 2.0);
-    // __dmul_rz(1.0, 2.0);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __dadd_rd(0.0, 1.0);
+    __dadd_rn(0.0, 1.0);
+    __dadd_ru(0.0, 1.0);
+    __dadd_rz(0.0, 1.0);
+    __ddiv_rd(0.0, 1.0);
+    __ddiv_rn(0.0, 1.0);
+    __ddiv_ru(0.0, 1.0);
+    __ddiv_rz(0.0, 1.0);
+    __dmul_rd(1.0, 2.0);
+    __dmul_rn(1.0, 2.0);
+    __dmul_ru(1.0, 2.0);
+    __dmul_rz(1.0, 2.0);
+#endif
     __drcp_rd(2.0);
     __drcp_rn(2.0);
     __drcp_ru(2.0);
     __drcp_rz(2.0);
-    // __dsqrt_rd(4.0);
-    // __dsqrt_rn(4.0);
-    // __dsqrt_ru(4.0);
-    // __dsqrt_rz(4.0);
-    // __dsub_rd(2.0, 1.0);
-    // __dsub_rn(2.0, 1.0);
-    // __dsub_ru(2.0, 1.0);
-    // __dsub_rz(2.0, 1.0);
-    // __fma_rd(1.0, 2.0, 3.0);
-    // __fma_rn(1.0, 2.0, 3.0);
-    // __fma_ru(1.0, 2.0, 3.0);
-    // __fma_rz(1.0, 2.0, 3.0);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __dsqrt_rd(4.0);
+    __dsqrt_rn(4.0);
+    __dsqrt_ru(4.0);
+    __dsqrt_rz(4.0);
+    __dsub_rd(2.0, 1.0);
+    __dsub_rn(2.0, 1.0);
+    __dsub_ru(2.0, 1.0);
+    __dsub_rz(2.0, 1.0);
+    __fma_rd(1.0, 2.0, 3.0);
+    __fma_rn(1.0, 2.0, 3.0);
+    __fma_ru(1.0, 2.0, 3.0);
+    __fma_rz(1.0, 2.0, 3.0);
+#endif
 }
 
 __global__ void compileDoublePrecisionIntrinsics(int ignored) {
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
index 623ea08a94..db60099558 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
@@ -39,36 +39,42 @@ __device__ void single_precision_intrinsics() {
     __cosf(0.0f);
     __exp10f(0.0f);
     __expf(0.0f);
-    // __fadd_rd(0.0f, 1.0f);
-    // __fadd_rn(0.0f, 1.0f);
-    // __fadd_ru(0.0f, 1.0f);
-    // __fadd_rz(0.0f, 1.0f);
-    // __fdiv_rd(4.0f, 2.0f);
-    // __fdiv_rn(4.0f, 2.0f);
-    // __fdiv_ru(4.0f, 2.0f);
-    // __fdiv_rz(4.0f, 2.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fadd_rd(0.0f, 1.0f);
+    __fadd_rn(0.0f, 1.0f);
+    __fadd_ru(0.0f, 1.0f);
+    __fadd_rz(0.0f, 1.0f);
+    __fdiv_rd(4.0f, 2.0f);
+    __fdiv_rn(4.0f, 2.0f);
+    __fdiv_ru(4.0f, 2.0f);
+    __fdiv_rz(4.0f, 2.0f);
+#endif
     __fdividef(4.0f, 2.0f);
-    // __fmaf_rd(1.0f, 2.0f, 3.0f);
-    // __fmaf_rn(1.0f, 2.0f, 3.0f);
-    // __fmaf_ru(1.0f, 2.0f, 3.0f);
-    // __fmaf_rz(1.0f, 2.0f, 3.0f);
-    // __fmul_rd(1.0f, 2.0f);
-    // __fmul_rn(1.0f, 2.0f);
-    // __fmul_ru(1.0f, 2.0f);
-    // __fmul_rz(1.0f, 2.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fmaf_rd(1.0f, 2.0f, 3.0f);
+    __fmaf_rn(1.0f, 2.0f, 3.0f);
+    __fmaf_ru(1.0f, 2.0f, 3.0f);
+    __fmaf_rz(1.0f, 2.0f, 3.0f);
+    __fmul_rd(1.0f, 2.0f);
+    __fmul_rn(1.0f, 2.0f);
+    __fmul_ru(1.0f, 2.0f);
+    __fmul_rz(1.0f, 2.0f);
+#endif
     __frcp_rd(2.0f);
     __frcp_rn(2.0f);
     __frcp_ru(2.0f);
     __frcp_rz(2.0f);
     __frsqrt_rn(4.0f);
-    // __fsqrt_rd(4.0f);
-    // __fsqrt_rn(4.0f);
-    // __fsqrt_ru(4.0f);
-    // __fsqrt_rz(4.0f);
-    // __fsub_rd(2.0f, 1.0f);
-    // __fsub_rn(2.0f, 1.0f);
-    // __fsub_ru(2.0f, 1.0f);
-    // __fsub_rz(2.0f, 1.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fsqrt_rd(4.0f);
+    __fsqrt_rn(4.0f);
+    __fsqrt_ru(4.0f);
+    __fsqrt_rz(4.0f);
+    __fsub_rd(2.0f, 1.0f);
+    __fsub_rn(2.0f, 1.0f);
+    __fsub_ru(2.0f, 1.0f);
+    __fsub_rz(2.0f, 1.0f);
+#endif
     __log10f(1.0f);
     __log2f(1.0f);
     __logf(1.0f);

From 890beb81d618dff8fa71406665b8582eb6771999 Mon Sep 17 00:00:00 2001
From: Aaron Enye Shi <enye.shi@gmail.com>
Date: Tue, 6 Nov 2018 19:53:28 +0000
Subject: [PATCH 4/4] Guard rcp rounded implementation as well

Since rcp implementations of non-default rounded versions are not correct or supported in OCML, guard them using the same macro OCML_BASIC_ROUNDED_OPERATIONS. Also update the docs and tests.


[ROCm/clr commit: 7b3bbc85c5575f5f61569f1e3678a0b0fe030552]
---
 .../clr/hipamd/docs/markdown/hip-math-api.md   | 18 +++++++++---------
 .../include/hip/hcc_detail/math_functions.h    |  4 ----
 .../deviceLib/hipDoublePrecisionIntrinsics.cpp |  2 --
 .../tests/src/deviceLib/hipFloatMath.cpp       | 10 ++++++----
 .../deviceLib/hipSinglePrecisionIntrinsics.cpp |  2 --
 5 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/projects/clr/hipamd/docs/markdown/hip-math-api.md b/projects/clr/hipamd/docs/markdown/hip-math-api.md
index b3698ff2b3..9b8a3f2f11 100644
--- a/projects/clr/hipamd/docs/markdown/hip-math-api.md
+++ b/projects/clr/hipamd/docs/markdown/hip-math-api.md
@@ -1569,7 +1569,7 @@ __device__ static  float __fmul_rz(float x, float y);
 __device__  float __frcp_rd(float x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __frcp_rn
@@ -1577,7 +1577,7 @@ __device__  float __frcp_rd(float x);
 __device__  float __frcp_rn(float x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __frcp_ru
@@ -1585,7 +1585,7 @@ __device__  float __frcp_rn(float x);
 __device__  float __frcp_ru(float x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __frcp_rz
@@ -1593,7 +1593,7 @@ __device__  float __frcp_ru(float x);
 __device__  float __frcp_rz(float x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __frsqrt_rn
@@ -1601,7 +1601,7 @@ __device__  float __frcp_rz(float x);
 __device__  float __frsqrt_rn(float x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __fsqrt_rd
@@ -1833,7 +1833,7 @@ __device__ static  double __dmul_rz(double x, double y);
 __device__  double __drcp_rd(double x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __drcp_rn
@@ -1841,7 +1841,7 @@ __device__  double __drcp_rd(double x);
 __device__  double __drcp_rn(double x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __drcp_ru
@@ -1849,7 +1849,7 @@ __device__  double __drcp_rn(double x);
 __device__  double __drcp_ru(double x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __drcp_rz
@@ -1857,7 +1857,7 @@ __device__  double __drcp_ru(double x);
 __device__  double __drcp_rz(double x);
 
 ```
-**Description:**  Supported
+**Description:**  Unsupported
 
 
 ### __dsqrt_rd
diff --git a/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h b/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
index 08be321d68..557257b2b0 100644
--- a/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
+++ b/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
@@ -580,7 +580,6 @@ float __fmul_ru(float x, float y)  { return __ocml_mul_rtp_f32(x, y); }
 __DEVICE__
 inline
 float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); }
-#endif
 __DEVICE__
 inline
 float __frcp_rd(float x) { return __llvm_amdgcn_rcp_f32(x); }
@@ -596,7 +595,6 @@ float __frcp_rz(float x) { return __llvm_amdgcn_rcp_f32(x); }
 __DEVICE__
 inline
 float __frsqrt_rn(float x) { return __llvm_amdgcn_rsq_f32(x); }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
 __DEVICE__
 inline
 float __fsqrt_rd(float x) { return __ocml_sqrt_rtn_f32(x); }
@@ -1077,7 +1075,6 @@ double __dmul_ru(double x, double y) { return __ocml_mul_rtp_f64(x, y); }
 __DEVICE__
 inline
 double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); }
-#endif
 __DEVICE__
 inline
 double __drcp_rd(double x) { return __llvm_amdgcn_rcp_f64(x); }
@@ -1090,7 +1087,6 @@ double __drcp_ru(double x) { return __llvm_amdgcn_rcp_f64(x); }
 __DEVICE__
 inline
 double __drcp_rz(double x) { return __llvm_amdgcn_rcp_f64(x); }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
 __DEVICE__
 inline
 double __dsqrt_rd(double x) { return __ocml_sqrt_rtn_f64(x); }
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
index 295fd83708..939bdae743 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
@@ -47,12 +47,10 @@ __device__ void double_precision_intrinsics() {
     __dmul_rn(1.0, 2.0);
     __dmul_ru(1.0, 2.0);
     __dmul_rz(1.0, 2.0);
-#endif
     __drcp_rd(2.0);
     __drcp_rn(2.0);
     __drcp_ru(2.0);
     __drcp_rz(2.0);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
     __dsqrt_rd(4.0);
     __dsqrt_rn(4.0);
     __dsqrt_ru(4.0);
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipFloatMath.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipFloatMath.cpp
index ee83309f28..c6a07e26a9 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipFloatMath.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipFloatMath.cpp
@@ -38,11 +38,13 @@ __global__ void floatMath(float* In, float* Out) {
     Out[tid] = __cosf(In[tid]);
     Out[tid] = __exp10f(Out[tid]);
     Out[tid] = __expf(Out[tid]);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
     Out[tid] = __frsqrt_rn(Out[tid]);
-    //Out[tid] = __fsqrt_rd(Out[tid]);
-    //Out[tid] = __fsqrt_rn(Out[tid]);
-    //Out[tid] = __fsqrt_ru(Out[tid]);
-    //Out[tid] = __fsqrt_rz(Out[tid]);
+    Out[tid] = __fsqrt_rd(Out[tid]);
+    Out[tid] = __fsqrt_rn(Out[tid]);
+    Out[tid] = __fsqrt_ru(Out[tid]);
+    Out[tid] = __fsqrt_rz(Out[tid]);
+#endif
     Out[tid] = __log10f(Out[tid]);
     Out[tid] = __log2f(Out[tid]);
     Out[tid] = __logf(Out[tid]);
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
index db60099558..b216b3cb54 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
@@ -59,13 +59,11 @@ __device__ void single_precision_intrinsics() {
     __fmul_rn(1.0f, 2.0f);
     __fmul_ru(1.0f, 2.0f);
     __fmul_rz(1.0f, 2.0f);
-#endif
     __frcp_rd(2.0f);
     __frcp_rn(2.0f);
     __frcp_ru(2.0f);
     __frcp_rz(2.0f);
     __frsqrt_rn(4.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
     __fsqrt_rd(4.0f);
     __fsqrt_rn(4.0f);
     __fsqrt_ru(4.0f);