diff --git a/projects/clr/hipamd/CMakeLists.txt b/projects/clr/hipamd/CMakeLists.txt
index 075b916a4d..46355b5a85 100644
--- a/projects/clr/hipamd/CMakeLists.txt
+++ b/projects/clr/hipamd/CMakeLists.txt
@@ -196,8 +196,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
         src/device_util.cpp
         src/hip_ldg.cpp
         src/hip_fp16.cpp
-        src/device_functions.cpp
-        src/math_functions.cpp)
+        src/device_functions.cpp)
 
     execute_process(COMMAND ${HCC_HOME}/bin/hcc-config --ldflags OUTPUT_VARIABLE HCC_LD_FLAGS)
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HCC_LD_FLAGS} -Wl,-Bsymbolic")
diff --git a/projects/clr/hipamd/include/hip/hcc_detail/device_functions.h b/projects/clr/hipamd/include/hip/hcc_detail/device_functions.h
index 28d874b27a..ca236a1125 100644
--- a/projects/clr/hipamd/include/hip/hcc_detail/device_functions.h
+++ b/projects/clr/hipamd/include/hip/hcc_detail/device_functions.h
@@ -26,263 +26,6 @@ THE SOFTWARE.
 #include <hip/hip_runtime.h>
 #include <hip/hip_vector_types.h>
 
-
-// Single Precision Fast Math
-__device__ float __cosf(float x);
-__device__ float __exp10f(float x);
-__device__ float __expf(float x);
-__device__ static float __fadd_rd(float x, float y);
-__device__ static float __fadd_rn(float x, float y);
-__device__ static float __fadd_ru(float x, float y);
-__device__ static float __fadd_rz(float x, float y);
-__device__ static float __fdiv_rd(float x, float y);
-__device__ static float __fdiv_rn(float x, float y);
-__device__ static float __fdiv_ru(float x, float y);
-__device__ static float __fdiv_rz(float x, float y);
-__device__ static float __fdividef(float x, float y);
-__device__ float __fmaf_rd(float x, float y, float z);
-__device__ float __fmaf_rn(float x, float y, float z);
-__device__ float __fmaf_ru(float x, float y, float z);
-__device__ float __fmaf_rz(float x, float y, float z);
-__device__ static float __fmul_rd(float x, float y);
-__device__ static float __fmul_rn(float x, float y);
-__device__ static float __fmul_ru(float x, float y);
-__device__ static float __fmul_rz(float x, float y);
-__device__ float __frcp_rd(float x);
-__device__ float __frcp_rn(float x);
-__device__ float __frcp_ru(float x);
-__device__ float __frcp_rz(float x);
-__device__ float __frsqrt_rn(float x);
-__device__ float __fsqrt_rd(float x);
-__device__ float __fsqrt_rn(float x);
-__device__ float __fsqrt_ru(float x);
-__device__ float __fsqrt_rz(float x);
-__device__ static float __fsub_rd(float x, float y);
-__device__ static float __fsub_rn(float x, float y);
-__device__ static float __fsub_ru(float x, float y);
-__device__ float __log10f(float x);
-__device__ float __log2f(float x);
-__device__ float __logf(float x);
-__device__ float __powf(float base, float exponent);
-__device__ static float __saturatef(float x);
-__device__ void __sincosf(float x, float* s, float* c);
-__device__ float __sinf(float x);
-__device__ float __tanf(float x);
-
-
-/*
-Double Precision Intrinsics
-*/
-
-__device__ static double __dadd_rd(double x, double y);
-__device__ static double __dadd_rn(double x, double y);
-__device__ static double __dadd_ru(double x, double y);
-__device__ static double __dadd_rz(double x, double y);
-__device__ static double __ddiv_rd(double x, double y);
-__device__ static double __ddiv_rn(double x, double y);
-__device__ static double __ddiv_ru(double x, double y);
-__device__ static double __ddiv_rz(double x, double y);
-__device__ static double __dmul_rd(double x, double y);
-__device__ static double __dmul_rn(double x, double y);
-__device__ static double __dmul_ru(double x, double y);
-__device__ static double __dmul_rz(double x, double y);
-__device__ double __drcp_rd(double x);
-__device__ double __drcp_rn(double x);
-__device__ double __drcp_ru(double x);
-__device__ double __drcp_rz(double x);
-__device__ double __dsqrt_rd(double x);
-__device__ double __dsqrt_rn(double x);
-__device__ double __dsqrt_ru(double x);
-__device__ double __dsqrt_rz(double x);
-__device__ static double __dsub_rd(double x, double y);
-__device__ static double __dsub_rn(double x, double y);
-__device__ static double __dsub_ru(double x, double y);
-__device__ static double __dsub_rz(double x, double y);
-__device__ double __fma_rd(double x, double y, double z);
-__device__ double __fma_rn(double x, double y, double z);
-__device__ double __fma_ru(double x, double y, double z);
-__device__ double __fma_rz(double x, double y, double z);
-
-// Single Precision Fast Math
-extern __device__ __attribute__((const)) float __hip_fast_cosf(float) __asm("llvm.cos.f32");
-extern __device__ __attribute__((const)) float __hip_fast_exp2f(float) __asm("llvm.exp2.f32");
-__device__ float __hip_fast_exp10f(float);
-__device__ float __hip_fast_expf(float);
-__device__ float __hip_fast_frsqrt_rn(float);
-extern __device__ __attribute__((const)) float __hip_fast_fsqrt_rd(float) __asm("llvm.sqrt.f32");
-__device__ float __hip_fast_fsqrt_rn(float);
-__device__ float __hip_fast_fsqrt_ru(float);
-__device__ float __hip_fast_fsqrt_rz(float);
-__device__ float __hip_fast_log10f(float);
-extern __device__ __attribute__((const)) float __hip_fast_log2f(float) __asm("llvm.log2.f32");
-__device__ float __hip_fast_logf(float);
-__device__ float __hip_fast_powf(float, float);
-__device__ void __hip_fast_sincosf(float, float*, float*);
-extern __device__ __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32");
-__device__ float __hip_fast_tanf(float);
-extern __device__ __attribute__((const)) float __hip_fast_fmaf(float, float, float) __asm("llvm.fma.f32");
-extern __device__ __attribute__((const)) float __hip_fast_frcp(float) __asm("llvm.amdgcn.rcp.f32");
-
-extern __device__ __attribute__((const)) double __hip_fast_dsqrt(double) __asm("llvm.sqrt.f64");
-extern __device__ __attribute__((const)) double __hip_fast_fma(double, double, double) __asm("llvm.fma.f64");
-extern __device__ __attribute__((const)) double __hip_fast_drcp(double) __asm("llvm.amdgcn.rcp.f64");
-
-
-// Single Precision Fast Math
-__device__ inline float __cosf(float x) { return __hip_fast_cosf(x); }
-
-__device__ inline float __exp10f(float x) { return __hip_fast_exp10f(x); }
-
-__device__ inline float __expf(float x) { return __hip_fast_expf(x); }
-
-__device__ static inline float __fadd_rd(float x, float y) { return x + y; }
-
-__device__ static inline float __fadd_rn(float x, float y) { return x + y; }
-
-__device__ static inline float __fadd_ru(float x, float y) { return x + y; }
-
-__device__ static inline float __fadd_rz(float x, float y) { return x + y; }
-
-__device__ static inline float __fdiv_rd(float x, float y) { return x / y; }
-
-__device__ static inline float __fdiv_rn(float x, float y) { return x / y; }
-
-__device__ static inline float __fdiv_ru(float x, float y) { return x / y; }
-
-__device__ static inline float __fdiv_rz(float x, float y) { return x / y; }
-
-__device__ static inline float __fdividef(float x, float y) { return x / y; }
-
-__device__ inline float __fmaf_rd(float x, float y, float z) { return __hip_fast_fmaf(x, y, z); }
-
-__device__ inline float __fmaf_rn(float x, float y, float z) { return __hip_fast_fmaf(x, y, z); }
-
-__device__ inline float __fmaf_ru(float x, float y, float z) { return __hip_fast_fmaf(x, y, z); }
-
-__device__ inline float __fmaf_rz(float x, float y, float z) { return __hip_fast_fmaf(x, y, z); }
-
-__device__ static inline float __fmul_rd(float x, float y) { return x * y; }
-
-__device__ static inline float __fmul_rn(float x, float y) { return x * y; }
-
-__device__ static inline float __fmul_ru(float x, float y) { return x * y; }
-
-__device__ static inline float __fmul_rz(float x, float y) { return x * y; }
-
-__device__ inline float __frcp_rd(float x) { return __hip_fast_frcp(x); }
-
-__device__ inline float __frcp_rn(float x) { return __hip_fast_frcp(x); }
-
-__device__ inline float __frcp_ru(float x) { return __hip_fast_frcp(x); }
-
-__device__ inline float __frcp_rz(float x) { return __hip_fast_frcp(x); }
-
-__device__ inline float __frsqrt_rn(float x) { return __hip_fast_frsqrt_rn(x); }
-
-__device__ inline float __fsqrt_rd(float x) { return __hip_fast_fsqrt_rd(x); }
-
-__device__ inline float __fsqrt_rn(float x) { return __hip_fast_fsqrt_rn(x); }
-
-__device__ inline float __fsqrt_ru(float x) { return __hip_fast_fsqrt_ru(x); }
-
-__device__ inline float __fsqrt_rz(float x) { return __hip_fast_fsqrt_rz(x); }
-
-__device__ static inline float __fsub_rd(float x, float y) { return x - y; }
-
-__device__ static inline float __fsub_rn(float x, float y) { return x - y; }
-
-__device__ static inline float __fsub_ru(float x, float y) { return x - y; }
-
-__device__ static inline float __fsub_rz(float x, float y) { return x - y; }
-
-
-__device__ inline float __log10f(float x) { return __hip_fast_log10f(x); }
-
-__device__ inline float __log2f(float x) { return __hip_fast_log2f(x); }
-
-__device__ inline float __logf(float x) { return __hip_fast_logf(x); }
-
-__device__ inline float __powf(float base, float exponent) {
-    return __hip_fast_powf(base, exponent);
-}
-
-__device__ static inline float __saturatef(float x) {
-    x = x > 1.0f ? 1.0f : x;
-    x = x < 0.0f ? 0.0f : x;
-    return x;
-}
-
-__device__ inline void __sincosf(float x, float* s, float* c) {
-    return __hip_fast_sincosf(x, s, c);
-}
-
-__device__ inline float __sinf(float x) { return __hip_fast_sinf(x); }
-
-__device__ inline float __tanf(float x) { return __hip_fast_tanf(x); }
-
-
-/*
-Double Precision Intrinsics
-*/
-
-__device__ static inline double __dadd_rd(double x, double y) { return x + y; }
-
-__device__ static inline double __dadd_rn(double x, double y) { return x + y; }
-
-__device__ static inline double __dadd_ru(double x, double y) { return x + y; }
-
-__device__ static inline double __dadd_rz(double x, double y) { return x + y; }
-
-__device__ static inline double __ddiv_rd(double x, double y) { return x / y; }
-
-__device__ static inline double __ddiv_rn(double x, double y) { return x / y; }
-
-__device__ static inline double __ddiv_ru(double x, double y) { return x / y; }
-
-__device__ static inline double __ddiv_rz(double x, double y) { return x / y; }
-
-__device__ static inline double __dmul_rd(double x, double y) { return x * y; }
-
-__device__ static inline double __dmul_rn(double x, double y) { return x * y; }
-
-__device__ static inline double __dmul_ru(double x, double y) { return x * y; }
-
-__device__ static inline double __dmul_rz(double x, double y) { return x * y; }
-
-__device__ inline double __drcp_rd(double x) { return __hip_fast_drcp(x); }
-
-__device__ inline double __drcp_rn(double x) { return __hip_fast_drcp(x); }
-
-__device__ inline double __drcp_ru(double x) { return __hip_fast_drcp(x); }
-
-__device__ inline double __drcp_rz(double x) { return __hip_fast_drcp(x); }
-
-
-__device__ inline double __dsqrt_rd(double x) { return __hip_fast_dsqrt(x); }
-
-__device__ inline double __dsqrt_rn(double x) { return __hip_fast_dsqrt(x); }
-
-__device__ inline double __dsqrt_ru(double x) { return __hip_fast_dsqrt(x); }
-
-__device__ inline double __dsqrt_rz(double x) { return __hip_fast_dsqrt(x); }
-
-__device__ static inline double __dsub_rd(double x, double y) { return x - y; }
-
-__device__ static inline double __dsub_rn(double x, double y) { return x - y; }
-
-__device__ static inline double __dsub_ru(double x, double y) { return x - y; }
-
-__device__ static inline double __dsub_rz(double x, double y) { return x - y; }
-
-__device__ inline double __fma_rd(double x, double y, double z) { return __hip_fast_fma(x, y, z); }
-
-__device__ inline double __fma_rn(double x, double y, double z) { return __hip_fast_fma(x, y, z); }
-
-__device__ inline double __fma_ru(double x, double y, double z) { return __hip_fast_fma(x, y, z); }
-
-__device__ inline double __fma_rz(double x, double y, double z) { return __hip_fast_fma(x, y, z); }
-
-
 extern "C" __device__ unsigned int __hip_hc_ir_umul24_int(unsigned int, unsigned int);
 extern "C" __device__ signed int __hip_hc_ir_mul24_int(signed int, signed int);
 extern "C" __device__ signed int __hip_hc_ir_mulhi_int(signed int, signed int);
diff --git a/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h b/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
index e717df07c1..c11112b1dc 100644
--- a/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
+++ b/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h
@@ -20,234 +20,1085 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_MATH_FUNCTIONS_H
-#define HIP_INCLUDE_HIP_HCC_DETAIL_MATH_FUNCTIONS_H
-
-#if defined(__HCC__)
-#include <kalmar_math.h>
-#endif
+#pragma once
 
+#include "ocml_math_fwd.h"
 
 #include <hip/hip_runtime.h>
-#include <hip/hip_vector_types.h>
-#include <hip/hcc_detail/device_functions.h>
 
-__device__ float acosf(float x);
-__device__ float acoshf(float x);
-__device__ float asinf(float x);
-__device__ float asinhf(float x);
-__device__ float atan2f(float y, float x);
-__device__ float atanf(float x);
-__device__ float atanhf(float x);
-__device__ float cbrtf(float x);
-__device__ float ceilf(float x);
-__device__ float copysignf(float x, float y);
-__device__ float cosf(float x);
-__device__ float coshf(float x);
-__device__ float cospif(float x);
-//__device__ float cyl_bessel_i0f(float x);
-//__device__ float cyl_bessel_i1f(float x);
-__device__ float erfcf(float x);
-__device__ float erfcinvf(float y);
-__device__ float erfcxf(float x);
-__device__ float erff(float x);
-__device__ float erfinvf(float y);
-__device__ float exp10f(float x);
-__device__ float exp2f(float x);
-__device__ float expf(float x);
-__device__ float expm1f(float x);
-__device__ int abs(int x);
-__device__ long long abs(long long x);
-__device__ float fabsf(float x);
-__device__ float fdimf(float x, float y);
-__device__ float fdividef(float x, float y);
-__device__ float floorf(float x);
-__device__ float fmaf(float x, float y, float z);
-__device__ float fmaxf(float x, float y);
-__device__ float fminf(float x, float y);
-__device__ float fmodf(float x, float y);
-__device__ float frexpf(float x, int* nptr);
-__device__ float hypotf(float x, float y);
-__device__ int ilogbf(float x);
-__device__ int isfinite(float a);
-__device__ unsigned isinf(float a);
-__device__ unsigned isnan(float a);
-__device__ float j0f(float x);
-__device__ float j1f(float x);
-__device__ float jnf(int n, float x);
-__device__ float ldexpf(float x, int exp);
-__device__ float lgammaf(float x);
-__device__ long long int llrintf(float x);
-__device__ long long int llroundf(float x);
-__device__ float log10f(float x);
-__device__ float log1pf(float x);
-__device__ float logbf(float x);
-__device__ long int lrintf(float x);
-__device__ long int lroundf(float x);
-//__device__ float modff(float x, float *iptr);
-__device__ float nanf(const char* tagp);
-__device__ float nearbyintf(float x);
-//__device__ float nextafterf(float x, float y);
-__device__ float norm3df(float a, float b, float c);
-__device__ float norm4df(float a, float b, float c, float d);
-__device__ float normcdff(float y);
-__device__ float normcdfinvf(float y);
-__device__ float normf(int dim, const float* a);
-__device__ float powf(float x, float y);
-__device__ float rcbrtf(float x);
-__device__ float remainderf(float x, float y);
-__device__ float remquof(float x, float y, int* quo);
-__device__ float rhypotf(float x, float y);
-__device__ float rintf(float x);
-__device__ float rnorm3df(float a, float b, float c);
-__device__ float rnorm4df(float a, float b, float c, float d);
-__device__ float rnormf(int dim, const float* a);
-__device__ float roundf(float x);
-__device__ float rsqrtf(float x);
-__device__ float scalblnf(float x, long int n);
-__device__ float scalbnf(float x, int n);
-__device__ int signbit(float a);
-__device__ void sincosf(float x, float* sptr, float* cptr);
-__device__ void sincospif(float x, float* sptr, float* cptr);
-__device__ float sinf(float x);
-__device__ float sinhf(float x);
-__device__ float sinpif(float x);
-__device__ float sqrtf(float x);
-__device__ float tanf(float x);
-__device__ float tanhf(float x);
-__device__ float tgammaf(float x);
-__device__ float truncf(float x);
-__device__ float y0f(float x);
-__device__ float y1f(float x);
-__device__ float ynf(int n, float x);
+#include <assert.h>
+#include <stdint.h>
 
-__device__ double acos(double x);
-__device__ double acosh(double x);
-__device__ double asin(double x);
-__device__ double asinh(double x);
-__device__ double atan(double x);
-__device__ double atan2(double y, double x);
-__device__ double atanh(double x);
-__device__ double cbrt(double x);
-__device__ double ceil(double x);
-__device__ double copysign(double x, double y);
-__device__ double cos(double x);
-__device__ double cosh(double x);
-__device__ double cospi(double x);
-//__device__ double cyl_bessel_i0(double x);
-//__device__ double cyl_bessel_i1(double x);
-__device__ double erf(double x);
-__device__ double erfc(double x);
-__device__ double erfcinv(double y);
-__device__ double erfcx(double x);
-__device__ double erfinv(double x);
-__device__ double exp(double x);
-__device__ double exp10(double x);
-__device__ double exp2(double x);
-__device__ double expm1(double x);
-__device__ double fabs(double x);
-__device__ double fdim(double x, double y);
-__device__ double floor(double x);
-__device__ double fma(double x, double y, double z);
-__device__ double fmax(double x, double y);
-__device__ double fmin(double x, double y);
-__device__ double fmod(double x, double y);
-__device__ double frexp(double x, int* nptr);
-__device__ double hypot(double x, double y);
-__device__ int ilogb(double x);
-__device__ int isfinite(double x);
-__device__ unsigned isinf(double x);
-__device__ unsigned isnan(double x);
-__device__ double j0(double x);
-__device__ double j1(double x);
-__device__ double jn(int n, double x);
-__device__ double ldexp(double x, int exp);
-__device__ double lgamma(double x);
-__device__ long long llrint(double x);
-__device__ long long llround(double x);
-__device__ double log(double x);
-__device__ double log10(double x);
-__device__ double log1p(double x);
-__device__ double log2(double x);
-__device__ double logb(double x);
-__device__ long int lrint(double x);
-__device__ long int lround(double x);
-//__device__ double modf(double x, double *iptr);
-__device__ double nan(const char* tagp);
-__device__ double nearbyint(double x);
-__device__ double nextafter(double x, double y);
-__device__ double norm(int dim, const double* t);
-__device__ double norm3d(double a, double b, double c);
-__device__ double norm4d(double a, double b, double c, double d);
-__device__ double normcdf(double y);
-__device__ double normcdfinv(double y);
-__device__ double pow(double x, double y);
-__device__ double rcbrt(double x);
-__device__ double remainder(double x, double y);
-//__device__ double remquo(double x, double y, int *quo);
-__device__ double rhypot(double x, double y);
-__device__ double rint(double x);
-__device__ double rnorm(int dim, const double* t);
-__device__ double rnorm3d(double a, double b, double c);
-__device__ double rnorm4d(double a, double b, double c, double d);
-__device__ double round(double x);
-__device__ double rsqrt(double x);
-__device__ double scalbln(double x, long int n);
-__device__ double scalbn(double x, int n);
-__device__ int signbit(double a);
-__device__ double sin(double a);
-__device__ void sincos(double x, double* sptr, double* cptr);
-__device__ void sincospi(double x, double* sptr, double* cptr);
-__device__ double sinh(double x);
-__device__ double sinpi(double x);
-__device__ double sqrt(double x);
-__device__ double tan(double x);
-__device__ double tanh(double x);
-__device__ double tgamma(double x);
-__device__ double trunc(double x);
-__device__ double y0(double x);
-__device__ double y1(double y);
-__device__ double yn(int n, double x);
+__device__
+inline
+uint64_t __make_mantissa_base8(const char* tagp)
+{
+    uint64_t r = 0;
+    while (tagp) {
+        char tmp = *tagp;
 
-// ENDPARSER
+        if (tmp >= '0' && tmp <= '7') r = (r * 8u) + tmp - '0';
+        else return 0;
 
-#ifdef HIP_FAST_MATH
-// Single Precision Precise Math when enabled
+        ++tagp;
+    }
 
-__device__ inline float cosf(float x) { return __hip_fast_cosf(x); }
+    return r;
+}
 
-__device__ inline float exp10f(float x) { return __hip_fast_exp10f(x); }
+__device__
+inline
+uint64_t __make_mantissa_base10(const char* tagp)
+{
+    uint64_t r = 0;
+    while (tagp) {
+        char tmp = *tagp;
 
-__device__ inline float expf(float x) { return __hip_fast_expf(x); }
+        if (tmp >= '0' && tmp <= '9') r = (r * 10u) + tmp - '0';
+        else return 0;
 
-__device__ inline float log10f(float x) { return __hip_fast_log10f(x); }
+        ++tagp;
+    }
 
-__device__ inline float log2f(float x) { return __hip_fast_log2f(x); }
+    return r;
+}
 
-__device__ inline float logf(float x) { return __hip_fast_logf(x); }
+__device__
+inline
+uint64_t __make_mantissa_base16(const char* tagp)
+{
+    uint64_t r = 0;
+    while (tagp) {
+        char tmp = *tagp;
 
-__device__ inline float powf(float base, float exponent) { return __hip_fast_powf(base, exponent); }
+        if (tmp >= '0' && tmp <= '9') r = (r * 16u) + tmp - '0';
+        else if (tmp >= 'a' && tmp <= 'f') r = (r * 16u) + tmp - 'a' + 10;
+        else if (tmp >= 'A' && tmp <= 'F') r = (r * 16u) + tmp - 'A' + 10;
+        else return 0;
 
-__device__ inline void sincosf(float x, float* s, float* c) { return __hip_fast_sincosf(x, s, c); }
+        ++tagp;
+    }
 
-__device__ inline float sinf(float x) { return __hip_fast_sinf(x); }
+    return r;
+}
 
-__device__ inline float tanf(float x) { return __hip_fast_tanf(x); }
+__device__
+inline
+uint64_t __make_mantissa(const char* tagp)
+{
+    if (!tagp) return 0u;
 
-#else
+    if (*tagp == '0') {
+        ++tagp;
 
-__device__ float sinf(float);
-__device__ float cosf(float);
-__device__ float tanf(float);
-__device__ void sincosf(float, float*, float*);
-__device__ float logf(float);
-__device__ float log2f(float);
-__device__ float log10f(float);
-__device__ float expf(float);
-__device__ float exp10f(float);
-__device__ float powf(float, float);
+        if (*tagp == 'x' || *tagp == 'X') return __make_mantissa_base16(tagp);
+        else return __make_mantissa_base8(tagp);
+    }
 
-#endif
+    return __make_mantissa_base10(tagp);
+}
 
+// BEGIN FLOAT
+__device__
+inline
+float acosf(float x) { return __ocml_acos_f32(x); }
+__device__
+inline
+float acoshf(float x) { return __ocml_acosh_f32(x); }
+__device__
+inline
+float asinf(float x) { return __ocml_asin_f32(x); }
+__device__
+inline
+float asinhf(float x) { return __ocml_asinh_f32(x); }
+__device__
+inline
+float atan2f(float x, float y) { return __ocml_atan2_f32(x, y); }
+__device__
+inline
+float atanf(float x) { return __ocml_atan_f32(x); }
+__device__
+inline
+float atanhf(float x) { return __ocml_atanh_f32(x); }
+__device__
+inline
+float cbrtf(float x) { return __ocml_cbrt_f32(x); }
+__device__
+inline
+float ceilf(float x) { return __ocml_ceil_f32(x); }
+__device__
+inline
+float copysignf(float x, float y) { return __ocml_copysign_f32(x, y); }
+__device__
+inline
+float cosf(float x) { return __ocml_cos_f32(x); }
+__device__
+inline
+float coshf(float x) { return __ocml_cosh_f32(x); }
+__device__
+inline
+float cospif(float x) { return __ocml_cospi_f32(x); }
+__device__
+inline
+float cyl_bessel_i0f(float x) { return __ocml_i0_f32(x); }
+__device__
+inline
+float cyl_bessel_i1f(float x) { return __ocml_i1_f32(x); }
+__device__
+inline
+float erfcf(float x) { return __ocml_erfc_f32(x); }
+__device__
+inline
+float erfcinvf(float x) { return __ocml_erfcinv_f32(x); }
+__device__
+inline
+float erfcxf(float x) { return __ocml_erfcx_f32(x); }
+__device__
+inline
+float erff(float x) { return __ocml_erf_f32(x); }
+__device__
+inline
+float erfinvf(float x) { return __ocml_erfinv_f32(x); }
+__device__
+inline
+float exp10f(float x) { return __ocml_exp10_f32(x); }
+__device__
+inline
+float exp2f(float x) { return __ocml_exp2_f32(x); }
+__device__
+inline
+float expf(float x) { return __ocml_exp_f32(x); }
+__device__
+inline
+float expm1f(float x) { return __ocml_expm1_f32(x); }
+__device__
+inline
+float fabsf(float x) { return __ocml_fabs_f32(x); }
+__device__
+inline
+float fdimf(float x, float y) { return __ocml_fdim_f32(x, y); }
+__device__
+inline
+float fdividef(float x, float y) { return x / y; }
+__device__
+inline
+float floorf(float x) { return __ocml_floor_f32(x); }
+__device__
+inline
+float fmaf(float x, float y, float z) { return __ocml_fma_f32(x, y, z); }
+__device__
+inline
+float fmaxf(float x, float y) { return __ocml_fmax_f32(x, y); }
+__device__
+inline
+float fminf(float x, float y) { return __ocml_fmin_f32(x, y); }
+__device__
+inline
+float fmodf(float x, float y) { return __ocml_fmod_f32(x, y); }
+__device__
+inline
+float frexpf(float x, int* nptr)
+{
+    int tmp;
+    float r =
+        __ocml_frexp_f32(x, (__attribute__((address_space(5))) int*) &tmp);
+    *nptr = tmp;
 
-#endif
+    return r;
+}
+__device__
+inline
+float hypotf(float x, float y) { return __ocml_hypot_f32(x, y); }
+__device__
+inline
+int ilogbf(float x) { return __ocml_ilogb_f32(x); }
+__device__
+inline
+int isfinite(float x) { return __ocml_isfinite_f32(x); }
+__device__
+inline
+int isinf(float x) { return __ocml_isinf_f32(x); }
+__device__
+inline
+int isnan(float x) { return __ocml_isnan_f32(x); }
+__device__
+inline
+float j0f(float x) { return __ocml_j0_f32(x); }
+__device__
+inline
+float j1f(float x) { return __ocml_j1_f32(x); }
+__device__
+inline
+float jnf(int n, float x)
+{   // TODO: we could use Ahmes multiplication and the Miller & Brown algorithm
+    //       for linear recurrences to get O(log n) steps, but it's unclear if
+    //       it'd be beneficial in this case.
+    if (n == 0) return j0f(x);
+    if (n == 1) return j1f(x);
+
+    float x0 = j0f(x);
+    float x1 = j1f(x);
+    for (int i = 1; i < n; ++i) {
+        float x2 = (2 * i) / x * x1 - x0;
+        x0 = x1;
+        x1 = x2;
+    }
+
+    return x1;
+}
+__device__
+inline
+float ldexpf(float x, int e) { return __ocml_ldexp_f32(x, e); }
+__device__
+inline
+float lgammaf(float x) { return __ocml_lgamma_f32(x); }
+__device__
+inline
+long long int llrintf(float x) { return __ocml_rint_f32(x); }
+__device__
+inline
+long long int llroundf(float x) { return __ocml_round_f32(x); }
+__device__
+inline
+float log10f(float x) { return __ocml_log10_f32(x); }
+__device__
+inline
+float log1pf(float x) { return __ocml_log1p_f32(x); }
+__device__
+inline
+float log2f(float x) { return __ocml_log2_f32(x); }
+__device__
+inline
+float logbf(float x) { return __ocml_logb_f32(x); }
+__device__
+inline
+float logf(float x) { return __ocml_log_f32(x); }
+__device__
+inline
+long int lrintf(float x) { return __ocml_rint_f32(x); }
+__device__
+inline
+long int lroundf(float x) { return __ocml_round_f32(x); }
+__device__
+inline
+float modff(float x, float* iptr)
+{
+    float tmp;
+    float r =
+        __ocml_modf_f32(x, (__attribute__((address_space(5))) float*) &tmp);
+    *iptr = tmp;
+
+    return r;
+}
+__device__
+inline
+float nanf(const char* tagp)
+{
+    union {
+        float val;
+        struct ieee_float {
+            uint32_t mantissa : 22;
+            uint32_t quiet : 1;
+            uint32_t exponent : 8;
+            uint32_t sign : 1;
+        } bits;
+
+        static_assert(sizeof(float) == sizeof(ieee_float), "");
+    } tmp;
+
+    tmp.bits.sign = 0u;
+    tmp.bits.exponent = ~0u;
+    tmp.bits.quiet = 1u;
+    tmp.bits.mantissa = __make_mantissa(tagp);
+
+    return tmp.val;
+}
+__device__
+inline
+float nearbyintf(float x) { return __ocml_nearbyint_f32(x); }
+__device__
+inline
+float nextafterf(float x, float y) { return __ocml_nextafter_f32(x, y); }
+__device__
+inline
+float norm3df(float x, float y, float z) { return __ocml_len3_f32(x, y, z); }
+__device__
+inline
+float norm4df(float x, float y, float z, float w)
+{
+    return __ocml_len4_f32(x, y, z, w);
+}
+__device__
+inline
+float normcdff(float x) { return __ocml_ncdf_f32(x); }
+__device__
+inline
+float normcdfinvf(float x) { return __ocml_ncdfinv_f32(x); }
+__device__
+inline
+float normf(int dim, const float* a)
+{
+    float r = 0;
+    while (dim--) { r += a[0] * a[0]; ++a; }
+
+    return __ocml_sqrt_f32(r);
+}
+__device__
+inline
+float powf(float x, float y) { return __ocml_pow_f32(x, y); }
+__device__
+inline
+float rcbrtf(float x) { return __ocml_rcbrt_f32(x); }
+__device__
+inline
+float remainderf(float x, float y) { return __ocml_remainder_f32(x, y); }
+__device__
+inline
+float remquof(float x, float y, int* quo)
+{
+    int tmp;
+    float r =
+        __ocml_remquo_f32(x, y, (__attribute__((address_space(5))) int*) &tmp);
+    *quo = tmp;
+
+    return r;
+}
+__device__
+inline
+float rhypotf(float x, float y) { return __ocml_rhypot_f32(x, y); }
+__device__
+inline
+float rintf(float x) { return __ocml_rint_f32(x); }
+__device__
+inline
+float rnorm3df(float x, float y, float z)
+{
+    return __ocml_rlen3_f32(x, y, z);
+}
+
+__device__
+inline
+float rnorm4df(float x, float y, float z, float w)
+{
+    return __ocml_rlen4_f32(x, y, z, w);
+}
+__device__
+inline
+float rnormf(int dim, const float* a)
+{
+    float r = 0;
+    while (dim--) { r += a[0] * a[0]; ++a; }
+
+    return __ocml_rsqrt_f32(r);
+}
+__device__
+inline
+float roundf(float x) { return __ocml_round_f32(x); }
+__device__
+inline
+float rsqrtf(float x) { return __ocml_rsqrt_f32(x); }
+__device__
+inline
+float scalblnf(float x, long int n)
+{
+    return (n < INT_MAX) ? __ocml_scalbn_f32(x, n) : __ocml_scalb_f32(x, n);
+}
+__device__
+inline
+float scalbnf(float x, int n) { return __ocml_scalbn_f32(x, n); }
+__device__
+inline
+int signbit(float x) { return __ocml_signbit_f32(x); }
+__device__
+inline
+void sincosf(float x, float* sptr, float* cptr)
+{
+    float tmp;
+
+    *sptr =
+        __ocml_sincos_f32(x, (__attribute__((address_space(5))) float*) &tmp);
+    *cptr = tmp;
+}
+__device__
+inline
+void sincospif(float x, float* sptr, float* cptr)
+{
+    float tmp;
+
+    *sptr =
+        __ocml_sincospi_f32(x, (__attribute__((address_space(5))) float*) &tmp);
+    *cptr = tmp;
+}
+__device__
+inline
+float sinf(float x) { return __ocml_sin_f32(x); }
+__device__
+inline
+float sinhf(float x) { return __ocml_sinh_f32(x); }
+__device__
+inline
+float sinpif(float x) { return __ocml_sinpi_f32(x); }
+__device__
+inline
+float sqrtf(float x) { return __ocml_sqrt_f32(x); }
+__device__
+inline
+float tanf(float x) { return __ocml_tan_f32(x); }
+__device__
+inline
+float tanhf(float x) { return __ocml_tanh_f32(x); }
+__device__
+inline
+float tgammaf(float x) { return __ocml_tgamma_f32(x); }
+__device__
+inline
+float truncf(float x) { return __ocml_trunc_f32(x); }
+__device__
+inline
+float y0f(float x) { return __ocml_y0_f32(x); }
+__device__
+inline
+float y1f(float x) { return __ocml_y1_f32(x); }
+__device__
+inline
+float ynf(int n, float x)
+{   // TODO: we could use Ahmes multiplication and the Miller & Brown algorithm
+    //       for linear recurrences to get O(log n) steps, but it's unclear if
+    //       it'd be beneficial in this case.
+    if (n == 0) return y0f(x);
+    if (n == 1) return y1f(x);
+
+    float x0 = y0f(x);
+    float x1 = y1f(x);
+    for (int i = 1; i < n; ++i) {
+        float x2 = (2 * i) / x * x1 - x0;
+        x0 = x1;
+        x1 = x2;
+    }
+
+    return x1;
+}
+
+// BEGIN INTRINSICS
+__device__
+inline
+float __cosf(float x) { return __llvm_amdgcn_cos_f32(x); }
+__device__
+inline
+float __exp10f(float x) { return __ocml_exp10_f32(x); }
+__device__
+inline
+float __expf(float x) { return __ocml_exp_f32(x); }
+__device__
+inline
+float __fadd_rd(float x, float y) { return __llvm_add_rte_f32(x, y); }
+__device__
+inline
+float __fadd_rn(float x, float y) { return __llvm_add_rtn_f32(x, y); }
+__device__
+inline
+float __fadd_ru(float x, float y) { return __llvm_add_rtp_f32(x, y); }
+__device__
+inline
+float __fadd_rz(float x, float y) { return __llvm_add_rtz_f32(x, y); }
+__device__
+inline
+float __fdiv_rd(float x, float y) { return __llvm_div_rte_f32(x, y); }
+__device__
+inline
+float __fdiv_rn(float x, float y) { return __llvm_div_rtn_f32(x, y); }
+__device__
+inline
+float __fdiv_ru(float x, float y) { return __llvm_div_rtp_f32(x, y); }
+__device__
+inline
+float __fdiv_rz(float x, float y) { return __llvm_div_rtz_f32(x, y); }
+__device__
+inline
+float __fdividef(float x, float y) { return __llvm_div_rte_f32(x, y); }
+__device__
+inline
+float __fmaf_rd(float x, float y, float z)
+{
+    return __llvm_fma_rte_f32(x, y, z);
+}
+__device__
+inline
+float __fmaf_rn(float x, float y, float z)
+{
+    return __llvm_fma_rtn_f32(x, y, z);
+}
+__device__
+inline
+float __fmaf_ru(float x, float y, float z)
+{
+    return __llvm_fma_rtp_f32(x, y, z);
+}
+__device__
+inline
+float __fmaf_rz(float x, float y, float z)
+{
+   return __llvm_fma_rtz_f32(x, y, z);
+}
+__device__
+inline
+float __fmul_rd(float x, float y) { return __llvm_mul_rte_f32(x, y); }
+__device__
+inline
+float __fmul_rn(float x, float y) { return __llvm_mul_rtn_f32(x, y); }
+__device__
+inline
+float __fmul_ru(float x, float y)  { return __llvm_mul_rtp_f32(x, y); }
+__device__
+inline
+float __fmul_rz(float x, float y) { return __llvm_mul_rtz_f32(x, y); }
+__device__
+inline
+float __frcp_rd(float x) { return __llvm_amdgcn_rcp_f32(x); }
+__device__
+inline
+float __frcp_rn(float x) { return __llvm_amdgcn_rcp_f32(x); }
+__device__
+inline
+float __frcp_ru(float x) { return __llvm_amdgcn_rcp_f32(x); }
+__device__
+inline
+float __frcp_rz(float x) { return __llvm_amdgcn_rcp_f32(x); }
+__device__
+inline
+float __frsqrt_rn(float x) { return __llvm_amdgcn_rsq_f32(x); }
+__device__
+inline
+float __fsqrt_rd(float x) { return __llvm_sqrt_rte_f32(x); }
+__device__
+inline
+float __fsqrt_rn(float x) { return __llvm_sqrt_rtn_f32(x); }
+__device__
+inline
+float __fsqrt_ru(float x) { return __llvm_sqrt_rtp_f32(x); }
+__device__
+inline
+float __fsqrt_rz(float x) { return __llvm_sqrt_rtz_f32(x); }
+__device__
+inline
+float __fsub_rd(float x, float y) { return __llvm_sub_rte_f32(x, y); }
+__device__
+inline
+float __fsub_rn(float x, float y) { return __llvm_sub_rtn_f32(x, y); }
+__device__
+inline
+float __fsub_ru(float x, float y) { return __llvm_sub_rtp_f32(x, y); }
+__device__
+inline
+float __fsub_rz(float x, float y) { return __llvm_sub_rtz_f32(x, y); }
+__device__
+inline
+float __log10f(float x) { return __ocml_log10_f32(x); }
+__device__
+inline
+float __log2f(float x) { return __ocml_log2_f32(x); }
+__device__
+inline
+float __logf(float x) { return __ocml_log_f32(x); }
+__device__
+inline
+float __powf(float x, float y) { return __ocml_pow_f32(x, y); }
+__device__
+inline
+float __saturatef(float x) { return (x < 0) ? 0 : ((x > 1) ? 1 : x); }
+__device__
+inline
+void __sincosf(float x, float* sptr, float* cptr)
+{
+    float tmp;
+
+    *sptr =
+    __ocml_sincos_f32(x, (__attribute__((address_space(5))) float*) &tmp);
+    *cptr = tmp;
+}
+__device__
+inline
+float __sinf(float x) { return __llvm_amdgcn_sin_f32(x); }
+__device__
+inline
+float __tanf(float x) { return __ocml_tan_f32(x); }
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE
+__device__
+inline
+double acos(double x) { return __ocml_acos_f64(x); }
+__device__
+inline
+double acosh(double x) { return __ocml_acosh_f64(x); }
+__device__
+inline
+double asin(double x) { return __ocml_asin_f64(x); }
+__device__
+inline
+double asinh(double x) { return __ocml_asinh_f64(x); }
+__device__
+inline
+double atan(double x) { return __ocml_atan_f64(x); }
+__device__
+inline
+double atan2(double x, double y) { return __ocml_atan2_f64(x, y); }
+__device__
+inline
+double atanh(double x) { return __ocml_atanh_f64(x); }
+__device__
+inline
+double cbrt(double x) { return __ocml_cbrt_f64(x); }
+__device__
+inline
+double ceil(double x) { return __ocml_ceil_f64(x); }
+__device__
+inline
+double copysign(double x, double y) { return __ocml_copysign_f64(x, y); }
+__device__
+inline
+double cos(double x)  { return __ocml_cos_f64(x); }
+__device__
+inline
+double cosh(double x) { return __ocml_cosh_f64(x); }
+__device__
+inline
+double cospi(double x) { return __ocml_cospi_f64(x); }
+__device__
+inline
+double cyl_bessel_i0(double x) { return __ocml_i0_f64(x); }
+__device__
+inline
+double cyl_bessel_i1(double x) { return __ocml_i1_f64(x); }
+__device__
+inline
+double erf(double x) { return __ocml_erf_f64(x); }
+__device__
+inline
+double erfc(double x) { return __ocml_erfc_f64(x); }
+__device__
+inline
+double erfcinv(double x) { return __ocml_erfcinv_f64(x); }
+__device__
+inline
+double erfcx(double x) { return __ocml_erfcx_f64(x); }
+__device__
+inline
+double erfinv(double x) { return __ocml_erfinv_f64(x); }
+__device__
+inline
+double exp(double x) { return __ocml_exp_f64(x); }
+__device__
+inline
+double exp10(double x) { return __ocml_exp10_f64(x); }
+__device__
+inline
+double exp2(double x) { return __ocml_exp2_f64(x); }
+__device__
+inline
+double expm1(double x) { return __ocml_expm1_f64(x); }
+__device__
+inline
+double fabs(double x) { return __ocml_fabs_f64(x); }
+__device__
+inline
+double fdim(double x, double y) { return __ocml_fdim_f64(x, y); }
+__device__
+inline
+double floor(double x) { return __ocml_floor_f64(x); }
+__device__
+inline
+double fma(double x, double y, double z) { return __ocml_fma_f64(x, y, z); }
+__device__
+inline
+double fmax(double x, double y) { return __ocml_fmax_f64(x, y); }
+__device__
+inline
+double fmin(double x, double y) { return __ocml_fmin_f64(x, y); }
+__device__
+inline
+double fmod(double x, double y) { return __ocml_fmod_f64(x, y); }
+__device__
+inline
+double frexp(double x, int* nptr)
+{
+    int tmp;
+    double r =
+        __ocml_frexp_f64(x, (__attribute__((address_space(5))) int*) &tmp);
+    *nptr = tmp;
+
+    return r;
+}
+__device__
+inline
+double hypot(double x, double y) { return __ocml_hypot_f64(x, y); }
+__device__
+inline
+int ilogb(double x) { return __ocml_ilogb_f64(x); }
+__device__
+inline
+int isfinite(double x) { return __ocml_isfinite_f64(x); }
+__device__
+inline
+int isinf(double x) { return __ocml_isinf_f64(x); }
+__device__
+inline
+int isnan(double x) { return __ocml_isnan_f64(x); }
+__device__
+inline
+double j0(double x) { return __ocml_j0_f64(x); }
+__device__
+inline
+double j1(double x) { return __ocml_j1_f64(x); }
+__device__
+inline
+double jn(int n, double x)
+{   // TODO: we could use Ahmes multiplication and the Miller & Brown algorithm
+    //       for linear recurrences to get O(log n) steps, but it's unclear if
+    //       it'd be beneficial in this case.
+    if (n == 0) return j0f(x);
+    if (n == 1) return j1f(x);
+
+    double x0 = j0f(x);
+    double x1 = j1f(x);
+    for (int i = 1; i < n; ++i) {
+        double x2 = (2 * i) / x * x1 - x0;
+        x0 = x1;
+        x1 = x2;
+    }
+
+    return x1;
+}
+__device__
+inline
+double ldexp(double x, int e) { return __ocml_ldexp_f64(x, e); }
+__device__
+inline
+double lgamma(double x) { return __ocml_lgamma_f64(x); }
+__device__
+inline
+long long int llrint(double x) { return __ocml_rint_f64(x); }
+__device__
+inline
+long long int llround(double x) { return __ocml_round_f64(x); }
+__device__
+inline
+double log(double x) { return __ocml_log_f64(x); }
+__device__
+inline
+double log10(double x) { return __ocml_log10_f64(x); }
+__device__
+inline
+double log1p(double x) { return __ocml_log1p_f64(x); }
+__device__
+inline
+double log2(double x) { return __ocml_log2_f64(x); }
+__device__
+inline
+double logb(double x) { return __ocml_logb_f64(x); }
+__device__
+inline
+long int lrint(double x) { return __ocml_rint_f64(x); }
+__device__
+inline
+long int lround(double x) { return __ocml_round_f64(x); }
+__device__
+inline
+double modf(double x, double* iptr)
+{
+    double tmp;
+    double r =
+        __ocml_modf_f64(x, (__attribute__((address_space(5))) double*) &tmp);
+    *iptr = tmp;
+
+    return r;
+}
+__device__
+inline
+double nan(const char* tagp)
+{
+    union {
+        double val;
+        struct ieee_double {
+            uint64_t mantissa : 51;
+            uint32_t quiet : 1;
+            uint32_t exponent : 11;
+            uint32_t sign : 1;
+        } bits;
+
+        static_assert(sizeof(double) == sizeof(ieee_double), "");
+    } tmp;
+
+    tmp.bits.sign = 0u;
+    tmp.bits.exponent = ~0u;
+    tmp.bits.quiet = 1u;
+    tmp.bits.mantissa = __make_mantissa(tagp);
+
+    return tmp.val;
+}
+__device__
+inline
+double nearbyint(double x) { return __ocml_nearbyint_f64(x); }
+__device__
+inline
+double nextafter(double x, double y) { return __ocml_nextafter_f64(x, y); }
+__device__
+inline
+double norm(int dim, const double* a)
+{
+    double r = 0;
+    while (dim--) { r += a[0] * a[0]; ++a; }
+
+    return __ocml_sqrt_f64(r);
+}
+__device__
+inline
+double norm3d(double x, double y, double z)
+{
+    return __ocml_len3_f64(x, y, z);
+}
+__device__
+inline
+double norm4d(double x, double y, double z, double w)
+{
+    return __ocml_len4_f64(x, y, z, w);
+}
+__device__
+inline
+double normcdf(double x) { return __ocml_ncdf_f64(x); }
+__device__
+inline
+double normcdfinv(double x) { return __ocml_ncdfinv_f64(x); }
+__device__
+inline
+double pow(double x, double y) { return __ocml_pow_f64(x, y); }
+__device__
+inline
+double rcbrt(double x) { return __ocml_rcbrt_f64(x); }
+__device__
+inline
+double remainder(double x, double y) { return __ocml_remainder_f64(x, y); }
+__device__
+inline
+double remquo(double x, double y, int* quo)
+{
+    int tmp;
+    double r =
+        __ocml_remquo_f64(x, y, (__attribute__((address_space(5))) int*) &tmp);
+    *quo = tmp;
+
+    return r;
+}
+__device__
+inline
+double rhypot(double x, double y) { return __ocml_rhypot_f64(x, y); }
+__device__
+inline
+double rint(double x) { return __ocml_rint_f64(x); }
+__device__
+inline
+double rnorm(int dim, const double* a)
+{
+    double r = 0;
+    while (dim--) { r += a[0] * a[0]; ++a; }
+
+    return __ocml_rsqrt_f64(r);
+}
+__device__
+inline
+double rnorm3d(double x, double y, double z)
+{
+    return __ocml_rlen3_f64(x, y, z);
+}
+__device__
+inline
+double rnorm4d(double x, double y, double z, double w)
+{
+    return __ocml_rlen4_f64(x, y, z, w);
+}
+__device__
+inline
+double round(double x) { return __ocml_round_f64(x); }
+__device__
+inline
+double rsqrt(double x) { return __ocml_rsqrt_f64(x); }
+__device__
+inline
+double scalbln(double x, long int n)
+{
+    return (n < INT_MAX) ? __ocml_scalbn_f64(x, n) : __ocml_scalb_f64(x, n);
+}
+__device__
+inline
+double scalbn(double x, int n) { return __ocml_scalbn_f64(x, n); }
+__device__
+inline
+int signbit(double x) { return __ocml_signbit_f64(x); }
+__device__
+inline
+double sin(double x) { return __ocml_sin_f64(x); }
+__device__
+inline
+void sincos(double x, double* sptr, double* cptr)
+{
+    double tmp;
+    *sptr =
+        __ocml_sincos_f64(x, (__attribute__((address_space(5))) double*) &tmp);
+    *cptr = tmp;
+}
+__device__
+inline
+void sincospi(double x, double* sptr, double* cptr)
+{
+    double tmp;
+    *sptr = __ocml_sincospi_f64(
+        x, (__attribute__((address_space(5))) double*) &tmp);
+    *cptr = tmp;
+}
+__device__
+inline
+double sinh(double x) { return __ocml_sinh_f64(x); }
+__device__
+inline
+double sinpi(double x) { return __ocml_sinpi_f64(x); }
+__device__
+inline
+double sqrt(double x) { return __ocml_sqrt_f64(x); }
+__device__
+inline
+double tan(double x) { return __ocml_tan_f64(x); }
+__device__
+inline
+double tanh(double x) { return __ocml_tanh_f64(x); }
+__device__
+inline
+double tgamma(double x) { return __ocml_tgamma_f64(x); }
+__device__
+inline
+double trunc(double x) { return __ocml_trunc_f64(x); }
+__device__
+inline
+double y0(double x) { return __ocml_y0_f64(x); }
+__device__
+inline
+double y1(double x) { return __ocml_y1_f64(x); }
+__device__
+inline
+double yn(int n, double x)
+{   // TODO: we could use Ahmes multiplication and the Miller & Brown algorithm
+    //       for linear recurrences to get O(log n) steps, but it's unclear if
+    //       it'd be beneficial in this case.
+    if (n == 0) return j0f(x);
+    if (n == 1) return j1f(x);
+
+    double x0 = j0f(x);
+    double x1 = j1f(x);
+    for (int i = 1; i < n; ++i) {
+        double x2 = (2 * i) / x * x1 - x0;
+        x0 = x1;
+        x1 = x2;
+    }
+
+    return x1;
+}
+
+// BEGIN INTRINSICS
+__device__
+inline
+double __dadd_rd(double x, double y) { return __llvm_add_rtp_f64(x, y); }
+__device__
+inline
+double __dadd_rn(double x, double y) { return __llvm_add_rte_f64(x, y); }
+__device__
+inline
+double __dadd_ru(double x, double y) { return __llvm_add_rtn_f64(x, y); }
+__device__
+inline
+double __dadd_rz(double x, double y) { return __llvm_add_rtz_f64(x, y); }
+__device__
+inline
+double __ddiv_rd(double x, double y) { return __llvm_div_rtp_f64(x, y); }
+__device__
+inline
+double __ddiv_rn(double x, double y) { return __llvm_div_rte_f64(x, y); }
+__device__
+inline
+double __ddiv_ru(double x, double y) { return __llvm_div_rtn_f64(x, y); }
+__device__
+inline
+double __ddiv_rz(double x, double y) { return __llvm_div_rtz_f64(x, y); }
+__device__
+inline
+double __dmul_rd(double x, double y) { return __llvm_mul_rtp_f64(x, y); }
+__device__
+inline
+double __dmul_rn(double x, double y) { return __llvm_mul_rte_f64(x, y); }
+__device__
+inline
+double __dmul_ru(double x, double y) { return __llvm_mul_rtn_f64(x, y); }
+__device__
+inline
+double __dmul_rz(double x, double y) { return __llvm_mul_rtz_f64(x, y); }
+__device__
+inline
+double __drcp_rd(double x) { return __llvm_amdgcn_rcp_f64(x); }
+__device__
+inline
+double __drcp_rn(double x) { return __llvm_amdgcn_rcp_f64(x); }
+__device__
+inline
+double __drcp_ru(double x) { return __llvm_amdgcn_rcp_f64(x); }
+__device__
+inline
+double __drcp_rz(double x) { return __llvm_amdgcn_rcp_f64(x); }
+__device__
+inline
+double __dsqrt_rd(double x) { return __llvm_sqrt_rtp_f64(x); }
+__device__
+inline
+double __dsqrt_rn(double x) { return __llvm_sqrt_rte_f64(x); }
+__device__
+inline
+double __dsqrt_ru(double x) { return __llvm_sqrt_rtn_f64(x); }
+__device__
+inline
+double __dsqrt_rz(double x) { return __llvm_sqrt_rtz_f64(x); }
+__device__
+inline
+double __dsub_rd(double x, double y) { return __llvm_sub_rtp_f64(x, y); }
+__device__
+inline
+double __dsub_rn(double x, double y) { return __llvm_sub_rte_f64(x, y); }
+__device__
+inline
+double __dsub_ru(double x, double y) { return __llvm_sub_rtn_f64(x, y); }
+__device__
+inline
+double __dsub_rz(double x, double y) { return __llvm_sub_rtz_f64(x, y); }
+__device__
+inline
+double __fma_rd(double x, double y, double z)
+{
+    return __llvm_fma_rtp_f64(x, y, z);
+}
+__device__
+inline
+double __fma_rn(double x, double y, double z)
+{
+    return __llvm_fma_rte_f64(x, y, z);
+}
+__device__
+inline
+double __fma_ru(double x, double y, double z)
+{
+    return __llvm_fma_rtn_f64(x, y, z);
+}
+__device__
+inline
+double __fma_rz(double x, double y, double z)
+{
+    return __llvm_fma_rtz_f64(x, y, z);
+}
+// END INTRINSICS
+// END DOUBLE
\ No newline at end of file
diff --git a/projects/clr/hipamd/include/hip/hcc_detail/ocml_math_fwd.h b/projects/clr/hipamd/include/hip/hcc_detail/ocml_math_fwd.h
new file mode 100644
index 0000000000..773fe0bf9b
--- /dev/null
+++ b/projects/clr/hipamd/include/hip/hcc_detail/ocml_math_fwd.h
@@ -0,0 +1,431 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+    extern "C" {
+#endif
+
+// BEGIN FLOAT
+__attribute__((const))
+float __ocml_acos_f32(float);
+__attribute__((pure))
+float __ocml_acosh_f32(float);
+__attribute__((const))
+float __ocml_asin_f32(float);
+__attribute__((pure))
+float __ocml_asinh_f32(float);
+__attribute__((const))
+float __ocml_atan2_f32(float, float);
+__attribute__((const))
+float __ocml_atan_f32(float);
+__attribute__((pure))
+float __ocml_atanh_f32(float);
+__attribute__((pure))
+float __ocml_cbrt_f32(float);
+__attribute__((const))
+float __ocml_ceil_f32(float);
+__attribute__((const))
+float __ocml_copysign_f32(float, float);
+float __ocml_cos_f32(float);
+__attribute__((pure))
+float __ocml_cosh_f32(float);
+float __ocml_cospi_f32(float);
+float __ocml_div_rtz_f32(float, float);
+float __ocml_i0_f32(float);
+float __ocml_i1_f32(float);
+__attribute__((pure))
+float __ocml_erfc_f32(float);
+__attribute__((pure))
+float __ocml_erfcinv_f32(float);
+__attribute__((pure))
+float __ocml_erfcx_f32(float);
+__attribute__((pure))
+float __ocml_erf_f32(float);
+__attribute__((pure))
+float __ocml_erfinv_f32(float);
+__attribute__((pure))
+float __ocml_exp10_f32(float);
+__attribute__((pure))
+float __ocml_exp2_f32(float);
+__attribute__((pure))
+float __ocml_exp_f32(float);
+__attribute__((pure))
+float __ocml_expm1_f32(float);
+__attribute__((const))
+float __ocml_fabs_f32(float);
+__attribute__((const))
+float __ocml_fdim_f32(float, float);
+__attribute__((const))
+float __ocml_floor_f32(float);
+__attribute__((const))
+float __ocml_fma_f32(float, float, float);
+__attribute__((const))
+float __ocml_fmax_f32(float, float);
+__attribute__((const))
+float __ocml_fmin_f32(float, float);
+__attribute__((const))
+float __ocml_fmod_f32(float, float);
+float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
+__attribute__((const))
+float __ocml_hypot_f32(float, float);
+__attribute__((const))
+int __ocml_ilogb_f32(float);
+__attribute__((const))
+int __ocml_isfinite_f32(float);
+__attribute__((const))
+int __ocml_isinf_f32(float);
+__attribute__((const))
+int __ocml_isnan_f32(float);
+float __ocml_j0_f32(float);
+float __ocml_j1_f32(float);
+__attribute__((const))
+float __ocml_ldexp_f32(float, int);
+float __ocml_lgamma_f32(float);
+__attribute__((pure))
+float __ocml_log10_f32(float);
+__attribute__((pure))
+float __ocml_log1p_f32(float);
+__attribute__((pure))
+float __ocml_log2_f32(float);
+__attribute__((const))
+float __ocml_logb_f32(float);
+__attribute__((pure))
+float __ocml_log_f32(float);
+float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
+__attribute__((const))
+float __ocml_nearbyint_f32(float);
+__attribute__((const))
+float __ocml_nextafter_f32(float, float);
+__attribute__((const))
+float __ocml_len3_f32(float, float, float);
+__attribute__((const))
+float __ocml_len4_f32(float, float, float, float);
+__attribute__((pure))
+float __ocml_ncdf_f32(float);
+__attribute__((pure))
+float __ocml_ncdfinv_f32(float);
+__attribute__((pure))
+float __ocml_pow_f32(float, float);
+__attribute__((pure))
+float __ocml_rcbrt_f32(float);
+__attribute__((const))
+float __ocml_remainder_f32(float, float);
+float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
+__attribute__((const))
+float __ocml_rhypot_f32(float, float);
+__attribute__((const))
+float __ocml_rint_f32(float);
+__attribute__((const))
+float __ocml_rlen3_f32(float, float, float);
+__attribute__((const))
+float __ocml_rlen4_f32(float, float, float, float);
+__attribute__((const))
+float __ocml_round_f32(float);
+__attribute__((pure))
+float __ocml_rsqrt_f32(float);
+__attribute__((const))
+float __ocml_scalb_f32(float, float);
+__attribute__((const))
+float __ocml_scalbn_f32(float, int);
+__attribute__((const))
+int __ocml_signbit_f32(float);
+float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
+float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
+float __ocml_sin_f32(float);
+__attribute__((pure))
+float __ocml_sinh_f32(float);
+float __ocml_sinpi_f32(float);
+__attribute__((const))
+float __ocml_sqrt_f32(float);
+float __ocml_tan_f32(float);
+__attribute__((pure))
+float __ocml_tanh_f32(float);
+float __ocml_tgamma_f32(float);
+__attribute__((const))
+float __ocml_trunc_f32(float);
+float __ocml_y0_f32(float);
+float __ocml_y1_f32(float);
+
+// BEGIN INTRINSICS
+__attribute__((const))
+float __llvm_add_rte_f32(float, float);
+__attribute__((const))
+float __llvm_add_rtn_f32(float, float);
+__attribute__((const))
+float __llvm_add_rtp_f32(float, float);
+__attribute__((const))
+float __llvm_add_rtz_f32(float, float);
+__attribute__((const))
+float __llvm_sub_rte_f32(float, float);
+__attribute__((const))
+float __llvm_sub_rtn_f32(float, float);
+__attribute__((const))
+float __llvm_sub_rtp_f32(float, float);
+__attribute__((const))
+float __llvm_sub_rtz_f32(float, float);
+__attribute__((const))
+float __llvm_mul_rte_f32(float, float);
+__attribute__((const))
+float __llvm_mul_rtn_f32(float, float);
+__attribute__((const))
+float __llvm_mul_rtp_f32(float, float);
+__attribute__((const))
+float __llvm_mul_rtz_f32(float, float);
+__attribute__((const))
+float __llvm_div_rte_f32(float, float);
+__attribute__((const))
+float __llvm_div_rtn_f32(float, float);
+__attribute__((const))
+float __llvm_div_rtp_f32(float, float);
+__attribute__((const))
+float __llvm_div_rtz_f32(float, float);
+__attribute__((const))
+float __llvm_sqrt_rte_f32(float);
+__attribute__((const))
+float __llvm_sqrt_rtn_f32(float);
+__attribute__((const))
+float __llvm_sqrt_rtp_f32(float);
+__attribute__((const))
+float __llvm_sqrt_rtz_f32(float);
+__attribute__((const))
+float __llvm_fma_rte_f32(float, float, float);
+__attribute__((const))
+float __llvm_fma_rtn_f32(float, float, float);
+__attribute__((const))
+float __llvm_fma_rtp_f32(float, float, float);
+__attribute__((const))
+float __llvm_fma_rtz_f32(float, float, float);
+
+__attribute__((const))
+float __llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
+__attribute__((const))
+float __llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
+__attribute__((const))
+float __llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
+__attribute__((const))
+float __llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE
+__attribute__((const))
+double __ocml_acos_f64(double);
+__attribute__((pure))
+double __ocml_acosh_f64(double);
+__attribute__((const))
+double __ocml_asin_f64(double);
+__attribute__((pure))
+double __ocml_asinh_f64(double);
+__attribute__((const))
+double __ocml_atan2_f64(double, double);
+__attribute__((const))
+double __ocml_atan_f64(double);
+__attribute__((pure))
+double __ocml_atanh_f64(double);
+__attribute__((pure))
+double __ocml_cbrt_f64(double);
+__attribute__((const))
+double __ocml_ceil_f64(double);
+__attribute__((const))
+double __ocml_copysign_f64(double, double);
+double __ocml_cos_f64(double);
+__attribute__((pure))
+double __ocml_cosh_f64(double);
+double __ocml_cospi_f64(double);
+double __ocml_i0_f64(double);
+double __ocml_i1_f64(double);
+__attribute__((pure))
+double __ocml_erfc_f64(double);
+__attribute__((pure))
+double __ocml_erfcinv_f64(double);
+__attribute__((pure))
+double __ocml_erfcx_f64(double);
+__attribute__((pure))
+double __ocml_erf_f64(double);
+__attribute__((pure))
+double __ocml_erfinv_f64(double);
+__attribute__((pure))
+double __ocml_exp10_f64(double);
+__attribute__((pure))
+double __ocml_exp2_f64(double);
+__attribute__((pure))
+double __ocml_exp_f64(double);
+__attribute__((pure))
+double __ocml_expm1_f64(double);
+__attribute__((const))
+double __ocml_fabs_f64(double);
+__attribute__((const))
+double __ocml_fdim_f64(double, double);
+__attribute__((const))
+double __ocml_floor_f64(double);
+__attribute__((const))
+double __ocml_fma_f64(double, double, double);
+__attribute__((const))
+double __ocml_fmax_f64(double, double);
+__attribute__((const))
+double __ocml_fmin_f64(double, double);
+__attribute__((const))
+double __ocml_fmod_f64(double, double);
+double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
+__attribute__((const))
+double __ocml_hypot_f64(double, double);
+__attribute__((const))
+int __ocml_ilogb_f64(double);
+__attribute__((const))
+int __ocml_isfinite_f64(double);
+__attribute__((const))
+int __ocml_isinf_f64(double);
+__attribute__((const))
+int __ocml_isnan_f64(double);
+double __ocml_j0_f64(double);
+double __ocml_j1_f64(double);
+__attribute__((const))
+double __ocml_ldexp_f64(double, int);
+double __ocml_lgamma_f64(double);
+__attribute__((pure))
+double __ocml_log10_f64(double);
+__attribute__((pure))
+double __ocml_log1p_f64(double);
+__attribute__((pure))
+double __ocml_log2_f64(double);
+__attribute__((const))
+double __ocml_logb_f64(double);
+__attribute__((pure))
+double __ocml_log_f64(double);
+double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
+__attribute__((const))
+double __ocml_nearbyint_f64(double);
+__attribute__((const))
+double __ocml_nextafter_f64(double, double);
+__attribute__((const))
+double __ocml_len3_f64(double, double, double);
+__attribute__((const))
+double __ocml_len4_f64(double, double, double, double);
+__attribute__((pure))
+double __ocml_ncdf_f64(double);
+__attribute__((pure))
+double __ocml_ncdfinv_f64(double);
+__attribute__((pure))
+double __ocml_pow_f64(double, double);
+__attribute__((pure))
+double __ocml_rcbrt_f64(double);
+__attribute__((const))
+double __ocml_remainder_f64(double, double);
+double __ocml_remquo_f64(
+    double, double, __attribute__((address_space(5))) int*);
+__attribute__((const))
+double __ocml_rhypot_f64(double, double);
+__attribute__((const))
+double __ocml_rint_f64(double);
+__attribute__((const))
+double __ocml_rlen3_f64(double, double, double);
+__attribute__((const))
+double __ocml_rlen4_f64(double, double, double, double);
+__attribute__((const))
+double __ocml_round_f64(double);
+__attribute__((pure))
+double __ocml_rsqrt_f64(double);
+__attribute__((const))
+double __ocml_scalb_f64(double, double);
+__attribute__((const))
+double __ocml_scalbn_f64(double, int);
+__attribute__((const))
+int __ocml_signbit_f64(double);
+double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
+double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
+double __ocml_sin_f64(double);
+__attribute__((pure))
+double __ocml_sinh_f64(double);
+double __ocml_sinpi_f64(double);
+__attribute__((const))
+double __ocml_sqrt_f64(double);
+double __ocml_tan_f64(double);
+__attribute__((pure))
+double __ocml_tanh_f64(double);
+double __ocml_tgamma_f64(double);
+__attribute__((const))
+double __ocml_trunc_f64(double);
+double __ocml_y0_f64(double);
+double __ocml_y1_f64(double);
+
+// BEGIN INTRINSICS
+__attribute__((const))
+double __llvm_add_rte_f64(double, double);
+__attribute__((const))
+double __llvm_add_rtn_f64(double, double);
+__attribute__((const))
+double __llvm_add_rtp_f64(double, double);
+__attribute__((const))
+double __llvm_add_rtz_f64(double, double);
+__attribute__((const))
+double __llvm_sub_rte_f64(double, double);
+__attribute__((const))
+double __llvm_sub_rtn_f64(double, double);
+__attribute__((const))
+double __llvm_sub_rtp_f64(double, double);
+__attribute__((const))
+double __llvm_sub_rtz_f64(double, double);
+__attribute__((const))
+double __llvm_mul_rte_f64(double, double);
+__attribute__((const))
+double __llvm_mul_rtn_f64(double, double);
+__attribute__((const))
+double __llvm_mul_rtp_f64(double, double);
+__attribute__((const))
+double __llvm_mul_rtz_f64(double, double);
+__attribute__((const))
+double __llvm_div_rte_f64(double, double);
+__attribute__((const))
+double __llvm_div_rtn_f64(double, double);
+__attribute__((const))
+double __llvm_div_rtp_f64(double, double);
+__attribute__((const))
+double __llvm_div_rtz_f64(double, double);
+__attribute__((const))
+double __llvm_sqrt_rte_f64(double);
+__attribute__((const))
+double __llvm_sqrt_rtn_f64(double);
+__attribute__((const))
+double __llvm_sqrt_rtp_f64(double);
+__attribute__((const))
+double __llvm_sqrt_rtz_f64(double);
+__attribute__((const))
+double __llvm_fma_rte_f64(float, float, float);
+__attribute__((const))
+double __llvm_fma_rtn_f64(float, float, float);
+__attribute__((const))
+double __llvm_fma_rtp_f64(float, float, float);
+__attribute__((const))
+double __llvm_fma_rtz_f64(float, float, float);
+
+__attribute__((const))
+double __llvm_amdgcn_rcp_f64(double);
+__attribute__((const))
+double __llvm_amdgcn_rsq_f64(double);
+// END INTRINSICS
+// END DOUBLE
+
+#if defined(__cplusplus)
+    } // extern "C"
+#endif
\ No newline at end of file
diff --git a/projects/clr/hipamd/src/device_functions.cpp b/projects/clr/hipamd/src/device_functions.cpp
index 86d0530817..1af66f97e5 100644
--- a/projects/clr/hipamd/src/device_functions.cpp
+++ b/projects/clr/hipamd/src/device_functions.cpp
@@ -23,11 +23,6 @@ THE SOFTWARE.
 #include <hc_math.hpp>
 #include "device_util.h"
 
-extern "C" float __ocml_floor_f32(float);
-extern "C" float __ocml_rint_f32(float);
-extern "C" float __ocml_ceil_f32(float);
-extern "C" float __ocml_trunc_f32(float);
-
 __device__ float __double2float_rd(double x) { return (double)x; }
 __device__ float __double2float_rn(double x) { return (double)x; }
 __device__ float __double2float_ru(double x) { return (double)x; }
diff --git a/projects/clr/hipamd/src/device_util.cpp b/projects/clr/hipamd/src/device_util.cpp
index 5ce014b2b9..50dfb48c8a 100644
--- a/projects/clr/hipamd/src/device_util.cpp
+++ b/projects/clr/hipamd/src/device_util.cpp
@@ -943,97 +943,6 @@ __device__ void* __get_dynamicgroupbaseptr() {
 
 __host__ void* __get_dynamicgroupbaseptr() { return nullptr; }
 
-// Precise Math Functions
-__device__ float __hip_precise_cosf(float x) { return hc::precise_math::cosf(x); }
-
-__device__ float __hip_precise_exp10f(float x) { return hc::precise_math::exp10f(x); }
-
-__device__ float __hip_precise_expf(float x) { return hc::precise_math::expf(x); }
-
-__device__ float __hip_precise_frsqrt_rn(float x) { return hc::precise_math::rsqrt(x); }
-
-__device__ float __hip_precise_fsqrt_rd(float x) { return hc::precise_math::sqrt(x); }
-
-__device__ float __hip_precise_fsqrt_rn(float x) { return hc::precise_math::sqrt(x); }
-
-__device__ float __hip_precise_fsqrt_ru(float x) { return hc::precise_math::sqrt(x); }
-
-__device__ float __hip_precise_fsqrt_rz(float x) { return hc::precise_math::sqrt(x); }
-
-__device__ float __hip_precise_log10f(float x) { return hc::precise_math::log10(x); }
-
-__device__ float __hip_precise_log2f(float x) { return hc::precise_math::log2(x); }
-
-__device__ float __hip_precise_logf(float x) { return hc::precise_math::logf(x); }
-
-__device__ float __hip_precise_powf(float base, float exponent) {
-    return hc::precise_math::powf(base, exponent);
-}
-
-__device__ void __hip_precise_sincosf(float x, float* s, float* c) {
-    hc::precise_math::sincosf(x, s, c);
-}
-
-__device__ float __hip_precise_sinf(float x) { return hc::precise_math::sinf(x); }
-
-__device__ float __hip_precise_tanf(float x) { return hc::precise_math::tanf(x); }
-
-// Double Precision Math
-__device__ double __hip_precise_dsqrt_rd(double x) { return hc::precise_math::sqrt(x); }
-
-__device__ double __hip_precise_dsqrt_rn(double x) { return hc::precise_math::sqrt(x); }
-
-__device__ double __hip_precise_dsqrt_ru(double x) { return hc::precise_math::sqrt(x); }
-
-__device__ double __hip_precise_dsqrt_rz(double x) { return hc::precise_math::sqrt(x); }
-
-#define LOG_BASE2_E 1.4426950408889634
-#define LOG_BASE2_10 3.32192809488736
-#define ONE_DIV_LOG_BASE2_E 0.69314718056
-#define ONE_DIV_LOG_BASE2_10 0.30102999566
-
-// Fast Math Intrinsics
-__device__ float __hip_fast_exp10f(float x) { return __hip_fast_exp2f(x * LOG_BASE2_E); }
-
-__device__ float __hip_fast_expf(float x) { return __hip_fast_exp2f(x * LOG_BASE2_10); }
-
-__device__ float __hip_fast_frsqrt_rn(float x) {
-    return 1 / __hip_fast_fsqrt_rd(x);
-    ;
-}
-
-__device__ float __hip_fast_fsqrt_rn(float x) { return __hip_fast_fsqrt_rd(x); }
-
-__device__ float __hip_fast_fsqrt_ru(float x) { return __hip_fast_fsqrt_rd(x); }
-
-__device__ float __hip_fast_fsqrt_rz(float x) { return __hip_fast_fsqrt_rd(x); }
-
-__device__ float __hip_fast_log10f(float x) { return ONE_DIV_LOG_BASE2_E * __hip_fast_log2f(x); }
-
-__device__ float __hip_fast_logf(float x) { return ONE_DIV_LOG_BASE2_10 * __hip_fast_log2f(x); }
-
-__device__ float __hip_fast_powf(float base, float exponent) {
-    return hc::fast_math::powf(base, exponent);
-}
-
-__device__ void __hip_fast_sincosf(float x, float* s, float* c) {
-    *s = __hip_fast_sinf(x);
-    *c = __hip_fast_cosf(x);
-}
-
-__device__ float __hip_fast_tanf(float x) { return hc::fast_math::tanf(x); }
-
-// Double Precision Math
-// FIXME - HCC doesn't have a fast_math version double FP sqrt
-// Another issue is that these intrinsics call for a specific rounding mode;
-// however, their implementation all map to the same sqrt builtin
-__device__ double __hip_fast_dsqrt_rd(double x) { return hc::precise_math::sqrt(x); }
-
-__device__ double __hip_fast_dsqrt_rn(double x) { return hc::precise_math::sqrt(x); }
-
-__device__ double __hip_fast_dsqrt_ru(double x) { return hc::precise_math::sqrt(x); }
-
-__device__ double __hip_fast_dsqrt_rz(double x) { return hc::precise_math::sqrt(x); }
 
 __device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); }
 
diff --git a/projects/clr/hipamd/src/math_functions.cpp b/projects/clr/hipamd/src/math_functions.cpp
deleted file mode 100644
index dedc40f2ae..0000000000
--- a/projects/clr/hipamd/src/math_functions.cpp
+++ /dev/null
@@ -1,429 +0,0 @@
-
-/*
-Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hc.hpp>
-#include <grid_launch.h>
-#include <hc_math.hpp>
-#include "device_util.h"
-#include "hip/hcc_detail/device_functions.h"
-#include "hip/hip_runtime.h"
-
-__device__ float acosf(float x) { return hc::precise_math::acosf(x); }
-__device__ float acoshf(float x) { return hc::precise_math::acoshf(x); }
-__device__ float asinf(float x) { return hc::precise_math::asinf(x); }
-__device__ float asinhf(float x) { return hc::precise_math::asinhf(x); }
-__device__ float atan2f(float y, float x) { return hc::precise_math::atan2f(y, x); }
-__device__ float atanf(float x) { return hc::precise_math::atanf(x); }
-__device__ float atanhf(float x) { return hc::precise_math::atanhf(x); }
-__device__ float cbrtf(float x) { return hc::precise_math::cbrtf(x); }
-__device__ float ceilf(float x) { return hc::precise_math::ceilf(x); }
-__device__ float copysignf(float x, float y) { return hc::precise_math::copysignf(x, y); }
-__device__ float cosf(float x) { return hc::precise_math::cosf(x); }
-__device__ float coshf(float x) { return hc::precise_math::coshf(x); }
-__device__ float cyl_bessel_i0f(float x);
-__device__ float cyl_bessel_i1f(float x);
-__device__ float erfcf(float x) { return hc::precise_math::erfcf(x); }
-__device__ float erfcinvf(float y) { return hc::precise_math::erfcinvf(y); }
-__device__ float erfcxf(float x) {
-    return hc::precise_math::expf(x * x) * hc::precise_math::erfcf(x);
-}
-__device__ float erff(float x) { return hc::precise_math::erff(x); }
-__device__ float erfinvf(float y) { return hc::precise_math::erfinvf(y); }
-__device__ float exp10f(float x) { return hc::precise_math::exp10f(x); }
-__device__ float exp2f(float x) { return hc::precise_math::exp2f(x); }
-__device__ float expf(float x) { return hc::precise_math::expf(x); }
-__device__ float expm1f(float x) { return hc::precise_math::expm1f(x); }
-__device__ int abs(int x) {
-    return x >= 0 ? x : -x;  // TODO - optimize with OCML
-}
-__device__ long long abs(long long x) {
-    return x >= 0 ? x : -x;  
-}
-__device__ float fabsf(float x) { return hc::precise_math::fabsf(x); }
-__device__ float fdimf(float x, float y) { return hc::precise_math::fdimf(x, y); }
-__device__ float fdividef(float x, float y) { return x / y; }
-__device__ float floorf(float x) { return hc::precise_math::floorf(x); }
-__device__ float fmaf(float x, float y, float z) { return hc::precise_math::fmaf(x, y, z); }
-__device__ float fmaxf(float x, float y) { return hc::precise_math::fmaxf(x, y); }
-__device__ float fminf(float x, float y) { return hc::precise_math::fminf(x, y); }
-__device__ float fmodf(float x, float y) { return hc::precise_math::fmodf(x, y); }
-__device__ float frexpf(float x, int* nptr) { return hc::precise_math::frexpf(x, nptr); }
-__device__ float hypotf(float x, float y) { return hc::precise_math::hypotf(x, y); }
-__device__ int ilogbf(float x) { return hc::precise_math::ilogbf(x); }
-__device__ int isfinite(float a) { return hc::precise_math::isfinite(a); }
-__device__ unsigned isinf(float a) { return hc::precise_math::isinf(a); }
-__device__ unsigned isnan(float a) { return hc::precise_math::isnan(a); }
-__device__ float j0f(float x) { return __hip_j0f(x); }
-__device__ float j1f(float x) { return __hip_j1f(x); }
-__device__ float jnf(int n, float x) { return __hip_jnf(n, x); }
-__device__ float ldexpf(float x, int exp) { return hc::precise_math::ldexpf(x, exp); }
-__device__ float lgammaf(float x) { return hc::precise_math::lgammaf(x); }
-__device__ long long int llrintf(float x) {
-    int y = hc::precise_math::roundf(x);
-    long long int z = y;
-    return z;
-}
-__device__ long long int llroundf(float x) {
-    int y = hc::precise_math::roundf(x);
-    long long int z = y;
-    return z;
-}
-__device__ float log10f(float x) { return hc::precise_math::log10f(x); }
-__device__ float log1pf(float x) { return hc::precise_math::log1pf(x); }
-__device__ float log2f(float x) { return hc::precise_math::log2f(x); }
-__device__ float logbf(float x) { return hc::precise_math::logbf(x); }
-__device__ float logf(float x) { return hc::precise_math::logf(x); }
-__device__ long int lrintf(float x) {
-    int y = hc::precise_math::roundf(x);
-    long int z = y;
-    return z;
-}
-__device__ long int lroundf(float x) {
-    long int y = hc::precise_math::roundf(x);
-    return y;
-}
-__device__ float modff(float x, float* iptr) { return hc::precise_math::modff(x, iptr); }
-__device__ float nanf(const char* tagp) { return hc::precise_math::nanf((int)*tagp); }
-__device__ float nearbyintf(float x) { return hc::precise_math::nearbyintf(x); }
-__device__ float nextafterf(float x, float y) { return hc::precise_math::nextafter(x, y); }
-__device__ float norm3df(float a, float b, float c) {
-    float x = a * a + b * b + c * c;
-    return hc::precise_math::sqrtf(x);
-}
-__device__ float norm4df(float a, float b, float c, float d) {
-    float x = a * a + b * b;
-    float y = c * c + d * d;
-    return hc::precise_math::sqrtf(x + y);
-}
-
-__device__ float normcdff(float y) { return ((hc::precise_math::erff(y) / 1.41421356237) + 1) / 2; }
-__device__ float normcdfinvf(float y) { return HIP_SQRT_2 * __hip_erfinvf(2 * y - 1); }
-__device__ float normf(int dim, const float* a) {
-    float x = 0.0f;
-    for (int i = 0; i < dim; i++) {
-        x = hc::precise_math::fmaf(a[i], a[i], x);
-    }
-    return hc::precise_math::sqrtf(x);
-}
-__device__ float powf(float x, float y) { return hc::precise_math::powf(x, y); }
-__device__ float rcbrtf(float x) { return hc::precise_math::rcbrtf(x); }
-__device__ float remainderf(float x, float y) { return hc::precise_math::remainderf(x, y); }
-__device__ float remquof(float x, float y, int* quo) {
-    return hc::precise_math::remquof(x, y, quo);
-}
-__device__ float rhypotf(float x, float y) { return 1 / hc::precise_math::hypotf(x, y); }
-__device__ float rintf(float x) { return hc::precise_math::roundf(x); }
-__device__ float rnorm3df(float a, float b, float c) {
-    float x = a * a + b * b + c * c;
-    return 1 / hc::precise_math::sqrtf(x);
-}
-__device__ float rnorm4df(float a, float b, float c, float d) {
-    float x = a * a + b * b;
-    float y = c * c + d * d;
-    return 1 / hc::precise_math::sqrtf(x + y);
-}
-__device__ float rnormf(int dim, const float* a) {
-    float x = 0.0f;
-    for (int i = 0; i < dim; i++) {
-        x = hc::precise_math::fmaf(a[i], a[i], x);
-    }
-    return 1 / hc::precise_math::sqrtf(x);
-}
-__device__ float roundf(float x) { return hc::precise_math::roundf(x); }
-__device__ float scalblnf(float x, long int n) { return hc::precise_math::scalb(x, n); }
-__device__ float scalbnf(float x, int n) { return hc::precise_math::scalbnf(x, n); }
-__device__ int signbit(float a) { return hc::precise_math::signbit(a); }
-__device__ void sincosf(float x, float* sptr, float* cptr) {
-    *sptr = hc::precise_math::sinf(x);
-    *cptr = hc::precise_math::cosf(x);
-}
-__device__ void sincospif(float x, float* sptr, float* cptr) {
-    *sptr = hc::precise_math::sinpif(x);
-    *cptr = hc::precise_math::cospif(x);
-}
-__device__ float sinf(float x) { return hc::precise_math::sinf(x); }
-__device__ float sinhf(float x) { return hc::precise_math::sinhf(x); }
-__device__ float tanf(float x) { return hc::precise_math::tanf(x); }
-__device__ float tanhf(float x) { return hc::precise_math::tanhf(x); }
-__device__ float tgammaf(float x) { return hc::precise_math::tgammaf(x); }
-__device__ float truncf(float x) { return hc::precise_math::truncf(x); }
-__device__ float y0f(float x) { return __hip_y0f(x); }
-__device__ float y1f(float x) { return __hip_y1f(x); }
-__device__ float ynf(int n, float x) { return __hip_ynf(n, x); }
-__device__ float cospif(float x) { return hc::precise_math::cospif(x); }
-__device__ float sinpif(float x) { return hc::precise_math::sinpif(x); }
-__device__ float sqrtf(float x) { return hc::precise_math::sqrtf(x); }
-__device__ float rsqrtf(float x) { return hc::precise_math::rsqrtf(x); }
-
-/*
- * Double precision device math functions
- */
-
-__device__ double acos(double x) { return hc::precise_math::acos(x); }
-__device__ double acosh(double x) { return hc::precise_math::acosh(x); }
-__device__ double asin(double x) { return hc::precise_math::asin(x); }
-__device__ double asinh(double x) { return hc::precise_math::asinh(x); }
-__device__ double atan(double x) { return hc::precise_math::atan(x); }
-__device__ double atan2(double y, double x) { return hc::precise_math::atan2(y, x); }
-__device__ double atanh(double x) { return hc::precise_math::atanh(x); }
-__device__ double cbrt(double x) { return hc::precise_math::cbrt(x); }
-__device__ double ceil(double x) { return hc::precise_math::ceil(x); }
-__device__ double copysign(double x, double y) { return hc::precise_math::copysign(x, y); }
-__device__ double cos(double x) { return hc::precise_math::cos(x); }
-__device__ double cosh(double x) { return hc::precise_math::cosh(x); }
-__device__ double cospi(double x) { return hc::precise_math::cospi(x); }
-__device__ double cyl_bessel_i0(double x);
-__device__ double cyl_bessel_i1(double x);
-__device__ double erf(double x) { return hc::precise_math::erf(x); }
-__device__ double erfc(double x) { return hc::precise_math::erfc(x); }
-__device__ double erfcinv(double x) { return __hip_erfinv(1 - x); }
-__device__ double erfcx(double x) {
-    return hc::precise_math::exp(x * x) * hc::precise_math::erf(x);
-}
-__device__ double erfinv(double x) { return __hip_erfinv(x); }
-__device__ double exp(double x) { return hc::precise_math::exp(x); }
-__device__ double exp10(double x) { return hc::precise_math::exp10(x); }
-__device__ double exp2(double x) { return hc::precise_math::exp2(x); }
-__device__ double expm1(double x) { return hc::precise_math::expm1(x); }
-__device__ double fabs(double x) { return hc::precise_math::fabs(x); }
-__device__ double fdim(double x, double y) { return hc::precise_math::fdim(x, y); }
-__device__ double fdivide(double x, double y) { return x / y; }
-__device__ double floor(double x) { return hc::precise_math::floor(x); }
-__device__ double fma(double x, double y, double z) { return hc::precise_math::fma(x, y, z); }
-__device__ double fmax(double x, double y) { return hc::precise_math::fmax(x, y); }
-__device__ double fmin(double x, double y) { return hc::precise_math::fmin(x, y); }
-__device__ double fmod(double x, double y) { return hc::precise_math::fmod(x, y); }
-__device__ double frexp(double x, int* y) { return hc::precise_math::frexp(x, y); }
-__device__ double hypot(double x, double y) { return hc::precise_math::hypot(x, y); }
-__device__ int ilogb(double x) { return hc::precise_math::ilogb(x); }
-__device__ int isfinite(double x) { return hc::precise_math::isfinite(x); }
-__device__ unsigned isinf(double x) { return hc::precise_math::isinf(x); }
-__device__ unsigned isnan(double x) { return hc::precise_math::isnan(x); }
-__device__ double j0(double x) { return __hip_j0(x); }
-__device__ double j1(double x) { return __hip_j1(x); }
-__device__ double jn(int n, double x) { return __hip_jn(n, x); }
-__device__ double ldexp(double x, int exp) { return hc::precise_math::ldexp(x, exp); }
-__device__ double lgamma(double x) { return hc::precise_math::lgamma(x); }
-__device__ long long int llrint(double x) {
-    long long int y = hc::precise_math::round(x);
-    return y;
-}
-__device__ long long int llround(double x) {
-    long long int y = hc::precise_math::round(x);
-    return y;
-}
-__device__ double log(double x) { return hc::precise_math::log(x); }
-__device__ double log10(double x) { return hc::precise_math::log10(x); }
-__device__ double log1p(double x) { return hc::precise_math::log1p(x); }
-__device__ double log2(double x) { return hc::precise_math::log2(x); }
-__device__ double logb(double x) { return hc::precise_math::logb(x); }
-__device__ long int lrint(double x) {
-    long int y = hc::precise_math::round(x);
-    return y;
-}
-__device__ long int lround(double x) {
-    long int y = hc::precise_math::round(x);
-    return y;
-}
-__device__ double modf(double x, double* iptr) { return hc::precise_math::modf(x, iptr); }
-__device__ double nan(const char* tagp) { return hc::precise_math::nan((int)*tagp); }
-__device__ double nearbyint(double x) { return hc::precise_math::nearbyint(x); }
-__device__ double nextafter(double x, double y) { return hc::precise_math::nextafter(x, y); }
-__device__ double norm(int x, const double* d) {
-    double val = 0;
-    for (int i = 0; i < x; i++) {
-        val += d[i] * d[i];
-    }
-    return hc::precise_math::sqrt(val);
-}
-__device__ double norm3d(double a, double b, double c) {
-    double x = a * a + b * b + c * c;
-    return hc::precise_math::sqrt(x);
-}
-__device__ double norm4d(double a, double b, double c, double d) {
-    double x = a * a + b * b;
-    double y = c * c + d * d;
-    return hc::precise_math::sqrt(x + y);
-}
-__device__ double normcdf(double y) { return ((hc::precise_math::erf(y) / HIP_SQRT_2) + 1) / 2; }
-__device__ double normcdfinv(double y) { return HIP_SQRT_2 * __hip_erfinv(2 * y - 1); }
-__device__ double pow(double x, double y) { return hc::precise_math::pow(x, y); }
-__device__ double rcbrt(double x) { return hc::precise_math::rcbrt(x); }
-__device__ double remainder(double x, double y) { return hc::precise_math::remainder(x, y); }
-__device__ double remquo(double x, double y, int* quo) {
-    return hc::precise_math::remquo(x, y, quo);
-}
-__device__ double rhypot(double x, double y) { return 1 / hc::precise_math::sqrt(x * x + y * y); }
-__device__ double rint(double x) { return hc::precise_math::round(x); }
-__device__ double rnorm3d(double a, double b, double c) {
-    return hc::precise_math::rsqrt(a * a + b * b + c * c);
-}
-__device__ double rnorm4d(double a, double b, double c, double d) {
-    return hc::precise_math::rsqrt(a * a + b * b + c * c + d * d);
-}
-__device__ double rnorm(int dim, const double* t) {
-    double x = 0.0;
-    for (int i = 0; i < dim; i++) {
-        x = hc::precise_math::fma(t[i], t[i], x);
-    }
-    return 1 / x;
-}
-__device__ double round(double x) { return hc::precise_math::round(x); }
-__device__ double rsqrt(double x) { return hc::precise_math::rsqrt(x); }
-__device__ double scalbln(double x, long int n) { return hc::precise_math::scalb(x, n); }
-__device__ double scalbn(double x, int n) { return hc::precise_math::scalbn(x, n); }
-__device__ int signbit(double x) { return hc::precise_math::signbit(x); }
-__device__ double sin(double x) { return hc::precise_math::sin(x); }
-__device__ void sincos(double x, double* sptr, double* cptr) {
-    *sptr = hc::precise_math::sin(x);
-    *cptr = hc::precise_math::cos(x);
-}
-__device__ void sincospi(double x, double* sptr, double* cptr) {
-    *sptr = hc::precise_math::sinpi(x);
-    *cptr = hc::precise_math::cospi(x);
-}
-__device__ double sinh(double x) { return hc::precise_math::sinh(x); }
-__device__ double sinpi(double x) { return hc::precise_math::sinpi(x); }
-__device__ double sqrt(double x) { return hc::precise_math::sqrt(x); }
-__device__ double tan(double x) { return hc::precise_math::tan(x); }
-__device__ double tanh(double x) { return hc::precise_math::tanh(x); }
-__device__ double tgamma(double x) { return hc::precise_math::tgamma(x); }
-__device__ double trunc(double x) { return hc::precise_math::trunc(x); }
-__device__ double y0(double x) { return __hip_y0(x); }
-__device__ double y1(double x) { return __hip_y1(x); }
-__device__ double yn(int n, double x) { return __hip_yn(n, x); }
-
-
-__host__ float cospif(float x) { return std::cos(x * HIP_PI); }
-
-__host__ float fdividef(float x, float y) { return x / y; }
-
-__host__ int isfinite(float x) { return std::isfinite(x); }
-
-__host__ int signbit(float x) { return std::signbit(x); }
-
-__host__ float sinpif(float x) { return std::sin(x * HIP_PI); }
-
-__host__ float rsqrtf(float x) { return 1 / std::sqrt(x); }
-
-__host__ float modff(float x, float* iptr) { return std::modf(x, iptr); }
-
-__host__ double fdivide(double x, double y) { return x / y; }
-
-__host__ float normcdff(float t) { return (1 - std::erf(-t / std::sqrt(2))) / 2; }
-
-__host__ double normcdf(double x) { return (1 - std::erf(-x / std::sqrt(2))) / 2; }
-
-__host__ float erfcxf(float x) { return std::exp(x * x) * std::erfc(x); }
-
-__host__ double erfcx(double x) { return std::exp(x * x) * std::erfc(x); }
-
-__host__ float rhypotf(float x, float y) { return 1 / std::sqrt(x * x + y * y); }
-
-__host__ double rhypot(double x, double y) { return 1 / std::sqrt(x * x + y * y); }
-
-__host__ float rcbrtf(float a) { return 1 / std::cbrt(a); }
-
-__host__ double rcbrt(double a) { return 1 / std::cbrt(a); }
-
-__host__ float normf(int dim, const float* a) {
-    float val = 0.0f;
-    for (int i = 0; i < dim; i++) {
-        val = val + a[i] * a[i];
-    }
-    return val;
-}
-
-__host__ float rnormf(int dim, const float* t) {
-    float val = 0.0f;
-    for (int i = 0; i < dim; i++) {
-        val = val + t[i] * t[i];
-    }
-    return 1 / std::sqrt(val);
-}
-
-__host__ double rnorm(int dim, const double* t) {
-    double val = 0.0;
-    for (int i = 0; i < dim; i++) {
-        val = val + t[i] * t[i];
-    }
-    return 1 / std::sqrt(val);
-}
-
-__host__ float rnorm4df(float a, float b, float c, float d) {
-    return 1 / std::sqrt(a * a + b * b + c * c + d * d);
-}
-
-__host__ double rnorm4d(double a, double b, double c, double d) {
-    return 1 / std::sqrt(a * a + b * b + c * c + d * d);
-}
-
-__host__ float rnorm3df(float a, float b, float c) { return 1 / std::sqrt(a * a + b * b + c * c); }
-
-__host__ double rnorm3d(double a, double b, double c) {
-    return 1 / std::sqrt(a * a + b * b + c * c);
-}
-
-__host__ void sincospif(float x, float* sptr, float* cptr) {
-    *sptr = std::sin(HIP_PI * x);
-    *cptr = std::cos(HIP_PI * x);
-}
-
-__host__ void sincospi(double x, double* sptr, double* cptr) {
-    *sptr = std::sin(HIP_PI * x);
-    *cptr = std::cos(HIP_PI * x);
-}
-
-__host__ float nextafterf(float x, float y) { return std::nextafter(x, y); }
-
-__host__ double nextafter(double x, double y) { return std::nextafter(x, y); }
-
-__host__ float norm3df(float a, float b, float c) { return std::sqrt(a * a + b * b + c * c); }
-
-__host__ float norm4df(float a, float b, float c, float d) {
-    return std::sqrt(a * a + b * b + c * c + d * d);
-}
-
-__host__ double norm3d(double a, double b, double c) { return std::sqrt(a * a + b * b + c * c); }
-
-__host__ double norm4d(double a, double b, double c, double d) {
-    return std::sqrt(a * a + b * b + c * c + d * d);
-}
-
-__host__ double sinpi(double a) { return std::sin(HIP_PI * a); }
-
-__host__ double cospi(double a) { return std::cos(HIP_PI * a); }
-
-__host__ int isfinite(double a) { return std::isfinite(a); }
-
-__host__ double norm(int dim, const double* t) {
-    double val = 0;
-    for (int i = 0; i < dim; i++) {
-        val += t[i] * t[i];
-    }
-    return std::sqrt(val);
-}
-
-__host__ double rsqrt(double x) { return 1 / std::sqrt(x); }
-
-__host__ int signbit(double x) { return std::signbit(x); }
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp
index 739660c04d..2620a3cbde 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp
@@ -50,8 +50,8 @@ __device__ void double_precision_math_functions() {
     cos(0.0);
     cosh(0.0);
     cospi(0.0);
-    //    cyl_bessel_i0(0.0);
-    //    cyl_bessel_i1(0.0);
+    cyl_bessel_i0(0.0);
+    cyl_bessel_i1(0.0);
     erf(0.0);
     erfc(0.0);
     erfcinv(2.0);
@@ -78,7 +78,7 @@ __device__ void double_precision_math_functions() {
     j1(0.0);
     jn(-1.0, 1.0);
     ldexp(0.0, 0);
-    //    lgamma(1.0);
+    lgamma(1.0);
     llrint(0.0);
     llround(0.0);
     log(1.0);
@@ -88,7 +88,7 @@ __device__ void double_precision_math_functions() {
     logb(1.0);
     lrint(0.0);
     lround(0.0);
-    //    modf(0.0, &fX);
+    modf(0.0, &fX);
     nan("1");
     nearbyint(0.0);
     nextafter(0.0, 0.0);
@@ -99,9 +99,9 @@ __device__ void double_precision_math_functions() {
     normcdf(0.0);
     normcdfinv(1.0);
     pow(1.0, 0.0);
-    // rcbrt(1.0);
+    rcbrt(1.0);
     remainder(2.0, 1.0);
-    //    remquo(1.0, 2.0, &iX);
+    remquo(1.0, 2.0, &iX);
     rhypot(0.0, 1.0);
     rint(1.0);
     fX = 1.0;
@@ -110,8 +110,8 @@ __device__ void double_precision_math_functions() {
     rnorm4d(0.0, 0.0, 0.0, 1.0);
     round(0.0);
     rsqrt(1.0);
-    // scalbln(0.0, 1);
-    // scalbn(0.0, 1);
+    scalbln(0.0, 1);
+    scalbn(0.0, 1);
     signbit(1.0);
     sin(0.0);
     sincos(0.0, &fX, &fY);
@@ -128,11 +128,17 @@ __device__ void double_precision_math_functions() {
     yn(1, 1.0);
 }
 
-__global__ void compileDoublePrecisionMathOnDevice(hipLaunchParm lp, int ignored) {
+__global__ void compileDoublePrecisionMathOnDevice(int) {
     double_precision_math_functions();
 }
 
 int main() {
-    hipLaunchKernel(compileDoublePrecisionMathOnDevice, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(
+        compileDoublePrecisionMathOnDevice,
+        dim3(1, 1, 1),
+        dim3(1, 1, 1),
+        0,
+        0,
+        1);
     passed();
 }
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipFloatMathPrecise.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipFloatMathPrecise.cpp
index f9f38efa96..6a0c91e890 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipFloatMathPrecise.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipFloatMathPrecise.cpp
@@ -30,8 +30,8 @@ THE SOFTWARE.
 #include <hip/math_functions.h>
 #include "test_common.h"
 
-__global__ void FloatMathPrecise(hipLaunchParm lp) {
-    // int iX; //uncomment this when remqouf() is enabled again
+__global__ void FloatMathPrecise() {
+    int iX;
     float fX, fY;
 
     acosf(1.0f);
@@ -47,8 +47,8 @@ __global__ void FloatMathPrecise(hipLaunchParm lp) {
     cosf(0.0f);
     coshf(0.0f);
     cospif(0.0f);
-    // cyl_bessel_i0f(0.0f);
-    // cyl_bessel_i1f(0.0f);
+    cyl_bessel_i0f(0.0f);
+    cyl_bessel_i1f(0.0f);
     erfcf(0.0f);
     erfcinvf(2.0f);
     erfcxf(0.0f);
@@ -66,7 +66,7 @@ __global__ void FloatMathPrecise(hipLaunchParm lp) {
     fX = fmaxf(0.0f, 0.0f);
     fX = fminf(0.0f, 0.0f);
     fmodf(0.0f, 1.0f);
-    // frexpf(0.0f, &iX);
+    frexpf(0.0f, &iX);
     hypotf(1.0f, 0.0f);
     ilogbf(1.0f);
     isfinite(0.0f);
@@ -76,7 +76,7 @@ __global__ void FloatMathPrecise(hipLaunchParm lp) {
     j1f(0.0f);
     jnf(-1.0f, 1.0f);
     ldexpf(0.0f, 0);
-    // lgammaf(1.0f);
+    lgammaf(1.0f);
     llrintf(0.0f);
     llroundf(0.0f);
     log10f(1.0f);
@@ -86,10 +86,10 @@ __global__ void FloatMathPrecise(hipLaunchParm lp) {
     logf(1.0f);
     lrintf(0.0f);
     lroundf(0.0f);
-    // modff(0.0f, &fX);
+    modff(0.0f, &fX);
     fX = nanf("1");
     fX = nearbyintf(0.0f);
-    // nextafterf(0.0f);
+    nextafterf(0.0f, 0.0f);
     norm3df(1.0f, 0.0f, 0.0f);
     norm4df(1.0f, 0.0f, 0.0f, 0.0f);
     normcdff(0.0f);
@@ -99,7 +99,7 @@ __global__ void FloatMathPrecise(hipLaunchParm lp) {
     powf(1.0f, 0.0f);
     rcbrtf(1.0f);
     remainderf(2.0f, 1.0f);
-    // remquof(1.0f, 2.0f, &iX);
+    remquof(1.0f, 2.0f, &iX);
     rhypotf(0.0f, 1.0f);
     fY = rintf(1.0f);
     rnorm3df(0.0f, 0.0f, 1.0f);
@@ -127,6 +127,6 @@ __global__ void FloatMathPrecise(hipLaunchParm lp) {
 }
 
 int main() {
-    hipLaunchKernel(FloatMathPrecise, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0);
+    hipLaunchKernelGGL(FloatMathPrecise, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0);
     passed();
 }
diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipTestDevice.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipTestDevice.cpp
index 395e63a330..2bb5163dc9 100644
--- a/projects/clr/hipamd/tests/src/deviceLib/hipTestDevice.cpp
+++ b/projects/clr/hipamd/tests/src/deviceLib/hipTestDevice.cpp
@@ -31,82 +31,82 @@ THE SOFTWARE.
 #define N 512
 #define SIZE N * sizeof(float)
 
-__global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float* c) {
+__global__ void test_sincosf(float* a, float* b, float* c) {
     int tid = threadIdx.x;
     sincosf(a[tid], b + tid, c + tid);
 }
 
-__global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float* c) {
+__global__ void test_sincospif(float* a, float* b, float* c) {
     int tid = threadIdx.x;
     sincospif(a[tid], b + tid, c + tid);
 }
 
-__global__ void test_fdividef(hipLaunchParm lp, float* a, float* b, float* c) {
+__global__ void test_fdividef(float* a, float* b, float* c) {
     int tid = threadIdx.x;
     c[tid] = fdividef(a[tid], b[tid]);
 }
 
-__global__ void test_llrintf(hipLaunchParm lp, float* a, long long int* b) {
+__global__ void test_llrintf(float* a, long long int* b) {
     int tid = threadIdx.x;
     b[tid] = llrintf(a[tid]);
 }
 
-__global__ void test_lrintf(hipLaunchParm lp, float* a, long int* b) {
+__global__ void test_lrintf(float* a, long int* b) {
     int tid = threadIdx.x;
     b[tid] = lrintf(a[tid]);
 }
 
-__global__ void test_rintf(hipLaunchParm lp, float* a, float* b) {
+__global__ void test_rintf(float* a, float* b) {
     int tid = threadIdx.x;
     b[tid] = rintf(a[tid]);
 }
 
-__global__ void test_llroundf(hipLaunchParm lp, float* a, long long int* b) {
+__global__ void test_llroundf(float* a, long long int* b) {
     int tid = threadIdx.x;
     b[tid] = llroundf(a[tid]);
 }
 
-__global__ void test_lroundf(hipLaunchParm lp, float* a, long int* b) {
+__global__ void test_lroundf(float* a, long int* b) {
     int tid = threadIdx.x;
     b[tid] = lroundf(a[tid]);
 }
 
-__global__ void test_rhypotf(hipLaunchParm lp, float* a, float* b, float* c) {
+__global__ void test_rhypotf(float* a, float* b, float* c) {
     int tid = threadIdx.x;
     c[tid] = rhypotf(a[tid], b[tid]);
 }
 
-__global__ void test_norm3df(hipLaunchParm lp, float* a, float* b, float* c, float* d) {
+__global__ void test_norm3df(float* a, float* b, float* c, float* d) {
     int tid = threadIdx.x;
     d[tid] = norm3df(a[tid], b[tid], c[tid]);
 }
 
-__global__ void test_norm4df(hipLaunchParm lp, float* a, float* b, float* c, float* d, float* e) {
+__global__ void test_norm4df(float* a, float* b, float* c, float* d, float* e) {
     int tid = threadIdx.x;
     e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
 }
 
-__global__ void test_normf(hipLaunchParm lp, float* a, float* b) {
+__global__ void test_normf(float* a, float* b) {
     int tid = threadIdx.x;
     b[tid] = normf(N, a);
 }
 
-__global__ void test_rnorm3df(hipLaunchParm lp, float* a, float* b, float* c, float* d) {
+__global__ void test_rnorm3df(float* a, float* b, float* c, float* d) {
     int tid = threadIdx.x;
     d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
 }
 
-__global__ void test_rnorm4df(hipLaunchParm lp, float* a, float* b, float* c, float* d, float* e) {
+__global__ void test_rnorm4df(float* a, float* b, float* c, float* d, float* e) {
     int tid = threadIdx.x;
     e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
 }
 
-__global__ void test_rnormf(hipLaunchParm lp, float* a, float* b) {
+__global__ void test_rnormf(float* a, float* b) {
     int tid = threadIdx.x;
     b[tid] = rnormf(N, a);
 }
 
-__global__ void test_erfinvf(hipLaunchParm lp, float* a, float* b) {
+__global__ void test_erfinvf(float* a, float* b) {
     int tid = threadIdx.x;
     b[tid] = erff(erfinvf(a[tid]));
 }
@@ -124,7 +124,7 @@ bool run_sincosf() {
     hipMalloc((void**)&Bd, SIZE);
     hipMalloc((void**)&Cd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_sincosf, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_sincosf, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
@@ -166,7 +166,7 @@ bool run_sincospif() {
     hipMalloc((void**)&Bd, SIZE);
     hipMalloc((void**)&Cd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_sincospif, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_sincospif, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
@@ -210,7 +210,7 @@ bool run_fdividef() {
     hipMalloc((void**)&Cd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_fdividef, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_fdividef, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
     hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -244,7 +244,7 @@ bool run_llrintf() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, N * sizeof(long long int));
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_llrintf, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_llrintf, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, N * sizeof(long long int), hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -277,7 +277,7 @@ bool run_lrintf() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, N * sizeof(long int));
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_lrintf, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_lrintf, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, N * sizeof(long int), hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -310,7 +310,7 @@ bool run_rintf() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rintf, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_rintf, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -344,7 +344,7 @@ bool run_llroundf() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, N * sizeof(long long int));
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_llroundf, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_llroundf, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, N * sizeof(long long int), hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -377,7 +377,7 @@ bool run_lroundf() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, N * sizeof(long int));
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_lroundf, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_lroundf, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, N * sizeof(long int), hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -420,7 +420,7 @@ bool run_norm3df() {
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_norm3df, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
+    hipLaunchKernelGGL(test_norm3df, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
     hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -469,7 +469,7 @@ bool run_norm4df() {
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_norm4df, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
+    hipLaunchKernelGGL(test_norm4df, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
     hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -510,7 +510,7 @@ bool run_normf() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_normf, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_normf, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -547,7 +547,7 @@ bool run_rhypotf() {
     hipMalloc((void**)&Cd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rhypotf, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_rhypotf, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
     hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -590,7 +590,7 @@ bool run_rnorm3df() {
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm3df, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
+    hipLaunchKernelGGL(test_rnorm3df, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
     hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -639,7 +639,7 @@ bool run_rnorm4df() {
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm4df, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
+    hipLaunchKernelGGL(test_rnorm4df, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
     hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -680,7 +680,7 @@ bool run_rnormf() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnormf, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_rnormf, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -712,7 +712,7 @@ bool run_erfinvf() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_erfinvf, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_erfinvf, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {