Moved device code to mimic cuda header behavior

1. All fp32 and fp64 math device/host functions should be in math_functions.h/.cpp. 2. All fp32 and fp64 fast-math intrinsics for device/host functions should be in device_functions.h/.cpp. 3. All device code implementations should be in device_util.h/.cpp. 4. Hence, moved code accordingly and created new header files. 5. Added math_functions.cpp/.h. 6. Changed the #ifndef include guards so that headers with the same name (e.g. hip/hip_runtime.h and hip/hcc_detail/hip_runtime.h) do not conflict. 7. Changed tests to fit the code changes so that they include the appropriate headers. 8. Added math_functions.cpp to CMakeLists.txt. 9. Some of the tests are still broken, mostly those for host math functions; they will be fixed in the next commit. 10. TODO: Fix compilation issues for host math functions. Change-Id: I7a17637d7e294a7d224ffba932c1a08668febd26
2017-01-17 14:57:51 -06:00
@@ -181,7 +181,8 @@ if(HIP_PLATFORM STREQUAL "hcc")
        src/device_util.cpp
        src/hip_ldg.cpp
        src/hip_fp16.cpp
-        src/device_functions.cpp)
+        src/device_functions.cpp
+        src/math_functions.cpp)

    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${HCC_HOME}/lib -lmcwamp -Wl,-Bsymbolic -Wl,-rpath ${HCC_HOME}/lib")
    add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME})
@@ -23,6 +23,173 @@ THE SOFTWARE.
 #include <hip/hip_runtime.h>
 #include <hip/hip_vector_types.h>

+// Fast-math backend declarations (single precision, then double precision)
+extern __attribute__((const)) float __hip_fast_cosf(float) __asm("llvm.cos.f32");  // Backends used by the fast-math wrappers in this header; the __asm label sets the symbol name the declaration binds to.
+extern __attribute__((const)) float __hip_fast_exp2f(float) __asm("llvm.exp2.f32");
+__device__ float __hip_fast_exp10f(float);  // No __asm label: a plain __device__ declaration whose body is not in this hunk.
+__device__ float __hip_fast_expf(float);
+__device__ float __hip_fast_frsqrt_rn(float);
+extern __attribute__((const)) float __hip_fast_fsqrt_rd(float) __asm("llvm.sqrt.f32");  // NOTE(review): only the _rd variant is bound to llvm.sqrt.f32; _rn/_ru/_rz are separate __device__ functions -- confirm intended.
+__device__ float __hip_fast_fsqrt_rn(float);
+__device__ float __hip_fast_fsqrt_ru(float);
+__device__ float __hip_fast_fsqrt_rz(float);
+__device__ float __hip_fast_log10f(float);
+extern __attribute__((const)) float __hip_fast_log2f(float) __asm("llvm.log2.f32");
+__device__ float __hip_fast_logf(float);
+__device__ float __hip_fast_powf(float, float);
+__device__ void __hip_fast_sincosf(float,float*,float*);  // Results are written through the two pointer arguments.
+extern __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32");
+__device__ float __hip_fast_tanf(float);
+extern __attribute__((const)) float __hip_fast_fmaf(float,float,float) __asm("llvm.fma.f32");
+extern __attribute__((const)) float __hip_fast_frcp(float) __asm("llvm.amdgcn.rcp.f32");
+
+extern __attribute__((const)) double __hip_fast_dsqrt(double) __asm("llvm.sqrt.f64");  // Double-precision backends.
+extern __attribute__((const)) double __hip_fast_fma(double,double,double) __asm("llvm.fma.f64");
+extern __attribute__((const)) double __hip_fast_drcp(double) __asm("llvm.amdgcn.rcp.f64");
+
+
+// Fast-math intrinsic wrappers (single precision, then double precision)
+__device__ inline float __cosf(float x) {  // Each wrapper in this group forwards its arguments unchanged to the matching __hip_fast_* backend.
+  return __hip_fast_cosf(x);
+}
+
+__device__ inline float __exp10f(float x) {
+  return __hip_fast_exp10f(x);
+}
+
+__device__ inline float __expf(float x) {
+  return __hip_fast_expf(x);
+}
+
+__device__ inline float __frsqrt_rn(float x) {
+  return __hip_fast_frsqrt_rn(x);
+}
+
+__device__ inline float __fsqrt_rd(float x) {
+  return __hip_fast_fsqrt_rd(x);
+}
+
+__device__ inline float __fsqrt_rn(float x) {
+  return __hip_fast_fsqrt_rn(x);
+}
+
+__device__ inline float __fsqrt_ru(float x) {
+  return __hip_fast_fsqrt_ru(x);
+}
+
+__device__ inline float __fsqrt_rz(float x) {
+  return __hip_fast_fsqrt_rz(x);
+}
+
+__device__ inline float __log10f(float x) {
+  return __hip_fast_log10f(x);
+}
+
+__device__ inline float __log2f(float x) {
+  return __hip_fast_log2f(x);
+}
+
+__device__ inline float __logf(float x) {
+  return __hip_fast_logf(x);
+}
+
+__device__ inline float __powf(float base, float exponent) {
+  return __hip_fast_powf(base, exponent);
+}
+
+__device__ inline void __sincosf(float x, float *s, float *c) {  // Passes s and c straight through to the backend; nothing is returned.
+  return __hip_fast_sincosf(x, s, c);  // Returning a void expression from a void function; equivalent to a plain call.
+}
+
+__device__ inline float __sinf(float x) {
+  return __hip_fast_sinf(x);
+}
+
+__device__ inline float __tanf(float x) {
+  return __hip_fast_tanf(x);
+}
+
+__device__ inline float __fmaf_rd(float x, float y, float z) {  // NOTE(review): the _rd/_rn/_ru/_rz variants all forward to the same __hip_fast_fmaf; the rounding suffix selects nothing here.
+  return __hip_fast_fmaf(x, y, z);
+}
+
+__device__ inline float __fmaf_rn(float x, float y, float z) {
+  return __hip_fast_fmaf(x, y, z);
+}
+
+__device__ inline float __fmaf_ru(float x, float y, float z) {
+  return __hip_fast_fmaf(x, y, z);
+}
+
+__device__ inline float __fmaf_rz(float x, float y, float z) {
+  return __hip_fast_fmaf(x, y, z);
+}
+
+__device__ inline float __frcp_rd(float x) {  // NOTE(review): likewise, all four __frcp_* variants forward to the same __hip_fast_frcp.
+  return __hip_fast_frcp(x);
+}
+
+__device__ inline float __frcp_rn(float x) {
+  return __hip_fast_frcp(x);
+}
+
+__device__ inline float __frcp_ru(float x) {
+  return __hip_fast_frcp(x);
+}
+
+__device__ inline float __frcp_rz(float x) {
+  return __hip_fast_frcp(x);
+}
+
+__device__ inline double __dsqrt_rd(double x) {  // Double-precision wrappers. NOTE(review): all four __dsqrt_* variants forward to the same __hip_fast_dsqrt.
+  return __hip_fast_dsqrt(x);
+}
+
+__device__ inline double __dsqrt_rn(double x) {
+  return __hip_fast_dsqrt(x);
+}
+
+__device__ inline double __dsqrt_ru(double x) {
+  return __hip_fast_dsqrt(x);
+}
+
+__device__ inline double __dsqrt_rz(double x) {
+  return __hip_fast_dsqrt(x);
+}
+
+__device__ inline double __fma_rd(double x, double y, double z) {  // NOTE(review): all four __fma_* variants forward to the same __hip_fast_fma.
+  return __hip_fast_fma(x, y, z);
+}
+
+__device__ inline double __fma_rn(double x, double y, double z) {
+  return __hip_fast_fma(x, y, z);
+}
+
+__device__ inline double __fma_ru(double x, double y, double z) {
+  return __hip_fast_fma(x, y, z);
+}
+
+__device__ inline double __fma_rz(double x, double y, double z) {
+  return __hip_fast_fma(x, y, z);
+}
+
+__device__ inline double __drcp_rd(double x) {  // NOTE(review): all four __drcp_* variants forward to the same __hip_fast_drcp.
+  return __hip_fast_drcp(x);
+}
+
+__device__ inline double __drcp_rn(double x) {
+  return __hip_fast_drcp(x);
+}
+
+__device__ inline double __drcp_ru(double x) {
+  return __hip_fast_drcp(x);
+}
+
+__device__ inline double __drcp_rz(double x) {
+  return __hip_fast_drcp(x);
+}
+
+
 extern "C" unsigned int __hip_hc_ir_umul24_int(unsigned int, unsigned int);
 extern "C" signed int __hip_hc_ir_mul24_int(signed int, signed int);
 extern "C" signed int __hip_hc_ir_mulhi_int(signed int, signed int);
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#ifndef HIP_FP16_H
-#define HIP_FP16_H
+#ifndef HIP_HCC_DETAIL_FP16_H
+#define HIP_HCC_DETAIL_FP16_H

 #include "hip/hip_runtime.h"

@@ -452,8 +452,6 @@ typedef struct __attribute__((aligned(4))){
 } __half2;


-
-
 #endif


@@ -121,208 +121,6 @@ extern int HIP_TRACE_API;
 #define __HCC_C__
 #endif

-__device__ float acosf(float x);
-__device__ float acoshf(float x);
-__device__ float asinf(float x);
-__device__ float asinhf(float x);
-__device__ float atan2f(float y, float x);
-__device__ float atanf(float x);
-__device__ float atanhf(float x);
-__device__ float cbrtf(float x);
-__device__ float ceilf(float x);
-__device__ float copysignf(float x, float y);
-__device__ float coshf(float x);
-__device__ float cyl_bessel_i0f(float x);
-__device__ float cyl_bessel_i1f(float x);
-__device__ float erfcf(float x);
-__device__  float erfcinvf(float y);
-
-__device__ float erfcxf(float x);
-__device__ float erff(float x);
-__device__ float erfinvf(float y);
-__device__ float exp2f(float x);
-__device__ float expm1f(float x);
-__device__ float fabsf(float x);
-__device__ float fdimf(float x, float y);
-__device__ __host__ float fdividef(float x, float y);
-__device__ float floorf(float x);
-__device__ float fmaf(float x, float y, float z);
-__device__ float fmaxf(float x, float y);
-__device__ float fminf(float x, float y);
-__device__ float fmodf(float x, float y);
-__device__ float frexpf(float x, float y);
-__device__ float hypotf(float x, float y);
-__device__ float ilogbf(float x);
-__host__ __device__ unsigned isfinite(float a);
-__device__ unsigned isinf(float a);
-__device__ unsigned isnan(float a);
-__device__ float j0f(float x);
-__device__ float j1f(float x);
-__device__ float jnf(int n, float x);
-__device__ float ldexpf(float x, int exp);
-__device__ float lgammaf(float x);
-__device__ long long int llrintf(float x);
-__device__ long long int llroundf(float x);
-__device__ float log1pf(float x);
-__device__ float logbf(float x);
-__device__ long int lrintf(float x);
-__device__ long int lroundf(float x);
-__device__ float modff(float x, float *iptr);
-__device__ float nanf(const char* tagp);
-__device__ float nearbyintf(float x);
-__device__ float nextafterf(float x, float y);
-__device__ float norm3df(float a, float b, float c);
-__device__ float norm4df(float a, float b, float c, float d);
-__device__ float normcdff(float y);
-__device__ float normcdfinvf(float y);
-__device__ float normf(int dim, const float *a);
-__device__ float rcbrtf(float x);
-__device__ float remainderf(float x, float y);
-__device__ float remquof(float x, float y, int *quo);
-__device__ float rhypotf(float x, float y);
-__device__ float rintf(float x);
-__device__ float rnorm3df(float a, float b, float c);
-__device__ float rnorm4df(float a, float b, float c, float d);
-__device__ float rnormf(int dim, const float* a);
-__device__ float roundf(float x);
-__device__ float rsqrtf(float x);
-__device__ float scalblnf(float x, long int n);
-__device__ float scalbnf(float x, int n);
-__host__ __device__ unsigned signbit(float a);
-__device__ void sincospif(float x, float *sptr, float *cptr);
-__device__ float sinhf(float x);
-__device__ float sinpif(float x);
-__device__ float sqrtf(float x);
-__device__ float tanhf(float x);
-__device__ float tgammaf(float x);
-__device__ float truncf(float x);
-__device__ float y0f(float x);
-__device__ float y1f(float x);
-__device__ float ynf(int n, float x);
-
-__host__ __device__ float cospif(float x);
-__host__ __device__ float sinpif(float x);
-// /__device__ float sqrtf(float x);
-__host__ __device__ float rsqrtf(float x);
-__host__ float normcdff(float y);
-
-__host__ float erfcinvf(float y);
-__host__ float erfcxf(float x);
-__host__ float erfinvf(float y);
-__host__ float norm3df(float a, float b, float c);
-__host__ float normcdfinvf(float y);
-__host__ float norm4df(float a, float b, float c, float d);
-__host__ float rcbrtf(float x);
-__host__ float rhypotf(float x, float y);
-__host__ float rnorm3df(float a, float b, float c);
-__host__ float rnormf(int dim, const float* a);
-__host__ float rnorm4df(float a, float b, float c, float d);
-__host__ void sincospif(float x, float *sptr, float *cptr);
-
-__device__ double acos(double x);
-__device__ double acosh(double x);
-__device__ double asin(double x);
-__device__ double asinh(double x);
-__device__ double atan(double x);
-__device__ double atan2(double y, double x);
-__device__ double atanh(double x);
-__device__ double cbrt(double x);
-__device__ double ceil(double x);
-__device__ double copysign(double x, double y);
-__device__ double cos(double x);
-__device__ double cosh(double x);
-__host__ __device__ double cospi(double x);
-__device__ double cyl_bessel_i0(double x);
-__device__ double cyl_bessel_i1(double x);
-__device__ double erf(double x);
-__device__ double erfc(double x);
-__device__ double erfcinv(double y);
-__device__ double erfcx(double x);
-__device__ double erfinv(double x);
-__device__ double exp(double x);
-__device__ double exp10(double x);
-__device__ double exp2(double x);
-__device__ double expm1(double x);
-__device__ double fabs(double x);
-__device__ double fdim(double x, double y);
-__device__ double fdivide(double x, double y);
-__device__ double floor(double x);
-__device__ double fma(double x, double y, double z);
-__device__ double fmax(double x, double y);
-__device__ double fmin(double x, double y);
-__device__ double fmod(double x, double y);
-__device__ double frexp(double x, int *nptr);
-__device__ double hypot(double x, double y);
-__device__ double ilogb(double x);
-__host__ __device__ unsigned isfinite(double x);
-__device__ unsigned isinf(double x);
-__device__ unsigned isnan(double x);
-__device__ double j0(double x);
-__device__ double j1(double x);
-__device__ double jn(int n, double x);
-__device__ double ldexp(double x, int exp);
-__device__ double lgamma(double x);
-__device__ long long llrint(double x);
-__device__ long long llround(double x);
-__device__ double log(double x);
-__device__ double log10(double x);
-__device__ double log1p(double x);
-__device__ double log2(double x);
-__device__ double logb(double x);
-__device__ long int lrint(double x);
-__device__ long int lround(double x);
-__device__ double modf(double x, double *iptr);
-__device__ double nan(const char* tagp);
-__device__ double nearbyint(double x);
-__device__ double nextafter(double x, double y);
-__device__ double norm(int dim, const double* t);
-__device__ double norm3d(double a, double b, double c);
-__host__ double norm3d(double a, double b, double c);
-__device__ double norm4d(double a, double b, double c, double d);
-__host__ double norm4d(double a, double b, double c, double d);
-__device__ double normcdf(double y);
-__host__ double normcdf(double y);
-__device__ double normcdfinv(double y);
-__host__ double normcdfinv(double y);
-__device__ double pow(double x, double y);
-__device__ double rcbrt(double x);
-__host__ double rcbrt(double x);
-__device__ double remainder(double x, double y);
-__device__ double remquo(double x, double y, int *quo);
-__device__ double rhypot(double x, double y);
-__host__ double rhypot(double x, double y);
-__device__ double rint(double x);
-__device__ double rnorm(int dim, const double* t);
-__host__ double rnorm(int dim, const double* t);
-__device__ double rnorm3d(double a, double b, double c);
-__host__ double rnorm3d(double a, double b, double c);
-__device__ double rnorm4d(double a, double b, double c, double d);
-__host__ double rnorm4d(double a, double b, double c, double d);
-__device__ double round(double x);
-__host__ __device__ double rsqrt(double x);
-__device__ double scalbln(double x, long int n);
-__device__ double scalbn(double x, int n);
-__host__ __device__ unsigned signbit(double a);
-__device__ double sin(double a);
-__device__ void sincos(double x, double *sptr, double *cptr);
-__device__ void sincospi(double x, double *sptr, double *cptr);
-__host__ void sincospi(double x, double *sptr, double *cptr);
-__device__ double sinh(double x);
-__host__ __device__ double sinpi(double x);
-__device__ double sqrt(double x);
-__device__ double tan(double x);
-__device__ double tanh(double x);
-__device__ double tgamma(double x);
-__device__ double trunc(double x);
-__device__ double y0(double x);
-__device__ double y1(double y);
-__device__ double yn(int n, double x);
-
-__host__ double erfcinv(double y);
-__host__ double erfcx(double x);
-__host__ double erfinv(double y);
-__host__ double fdivide(double x, double y);
-
 // TODO - hipify-clang - change to use the function call.
 //#define warpSize hc::__wavesize()
 extern const int warpSize;
@@ -451,252 +249,6 @@ __host__ __device__ int max(int arg1, int arg2);

 __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr();

-//TODO - add a couple fast math operations here, the set here will grow :
-
-// Single Precision Precise Math
-__device__ float __hip_precise_cosf(float);
-__device__ float __hip_precise_exp10f(float);
-__device__ float __hip_precise_expf(float);
-__device__ float __hip_precise_frsqrt_rn(float);
-__device__ float __hip_precise_fsqrt_rd(float);
-__device__ float __hip_precise_fsqrt_rn(float);
-__device__ float __hip_precise_fsqrt_ru(float);
-__device__ float __hip_precise_fsqrt_rz(float);
-__device__ float __hip_precise_log10f(float);
-__device__ float __hip_precise_log2f(float);
-__device__ float __hip_precise_logf(float);
-__device__ float __hip_precise_powf(float, float);
-__device__ void __hip_precise_sincosf(float,float*,float*);
-__device__ float __hip_precise_sinf(float);
-__device__ float __hip_precise_tanf(float);
-
-// Double Precision Precise Math
-__device__ double __hip_precise_dsqrt_rd(double);
-__device__ double __hip_precise_dsqrt_rn(double);
-__device__ double __hip_precise_dsqrt_ru(double);
-__device__ double __hip_precise_dsqrt_rz(double);
-
-// Single Precision Fast Math
-extern __attribute__((const)) float __hip_fast_cosf(float) __asm("llvm.cos.f32");
-extern __attribute__((const)) float __hip_fast_exp2f(float) __asm("llvm.exp2.f32");
-__device__ float __hip_fast_exp10f(float);
-__device__ float __hip_fast_expf(float);
-__device__ float __hip_fast_frsqrt_rn(float);
-extern __attribute__((const)) float __hip_fast_fsqrt_rd(float) __asm("llvm.sqrt.f32");
-__device__ float __hip_fast_fsqrt_rn(float);
-__device__ float __hip_fast_fsqrt_ru(float);
-__device__ float __hip_fast_fsqrt_rz(float);
-__device__ float __hip_fast_log10f(float);
-extern __attribute__((const)) float __hip_fast_log2f(float) __asm("llvm.log2.f32");
-__device__ float __hip_fast_logf(float);
-__device__ float __hip_fast_powf(float, float);
-__device__ void __hip_fast_sincosf(float,float*,float*);
-extern __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32");
-__device__ float __hip_fast_tanf(float);
-extern __attribute__((const)) float __hip_fast_fmaf(float,float,float) __asm("llvm.fma.f32");
-extern __attribute__((const)) float __hip_fast_frcp(float) __asm("llvm.amdgcn.rcp.f32");
-
-extern __attribute__((const)) double __hip_fast_dsqrt(double) __asm("llvm.sqrt.f64");
-extern __attribute__((const)) double __hip_fast_fma(double,double,double) __asm("llvm.fma.f64");
-extern __attribute__((const)) double __hip_fast_drcp(double) __asm("llvm.amdgcn.rcp.f64");
-
-#ifdef HIP_FAST_MATH
-// Single Precision Precise Math when enabled
-
-__device__ inline float cosf(float x) {
-  return __hip_fast_cosf(x);
-}
-
-__device__ inline float exp10f(float x) {
-  return __hip_fast_exp10f(x);
-}
-
-__device__ inline float expf(float x) {
-  return __hip_fast_expf(x);
-}
-
-__device__ inline float log10f(float x) {
-  return __hip_fast_log10f(x);
-}
-
-__device__ inline float log2f(float x) {
-  return __hip_fast_log2f(x);
-}
-
-__device__ inline float logf(float x) {
-  return __hip_fast_logf(x);
-}
-
-__device__ inline float powf(float base, float exponent) {
-  return __hip_fast_powf(base, exponent);
-}
-
-__device__ inline void sincosf(float x, float *s, float *c) {
-  return __hip_fast_sincosf(x, s, c);
-}
-
-__device__ inline float sinf(float x) {
-  return __hip_fast_sinf(x);
-}
-
-__device__ inline float tanf(float x) {
-  return __hip_fast_tanf(x);
-}
-
-#else
-
-__device__ float sinf(float);
-__device__ float cosf(float);
-__device__ float tanf(float);
-__device__ void sincosf(float, float*, float*);
-__device__ float logf(float);
-__device__ float log2f(float);
-__device__ float log10f(float);
-__device__ float expf(float);
-__device__ float exp10f(float);
-__device__ float powf(float, float);
-
-#endif
-// Single Precision Fast Math
-__device__ inline float __cosf(float x) {
-  return __hip_fast_cosf(x);
-}
-
-__device__ inline float __exp10f(float x) {
-  return __hip_fast_exp10f(x);
-}
-
-__device__ inline float __expf(float x) {
-  return __hip_fast_expf(x);
-}
-
-__device__ inline float __frsqrt_rn(float x) {
-  return __hip_fast_frsqrt_rn(x);
-}
-
-__device__ inline float __fsqrt_rd(float x) {
-  return __hip_fast_fsqrt_rd(x);
-}
-
-__device__ inline float __fsqrt_rn(float x) {
-  return __hip_fast_fsqrt_rn(x);
-}
-
-__device__ inline float __fsqrt_ru(float x) {
-  return __hip_fast_fsqrt_ru(x);
-}
-
-__device__ inline float __fsqrt_rz(float x) {
-  return __hip_fast_fsqrt_rz(x);
-}
-
-__device__ inline float __log10f(float x) {
-  return __hip_fast_log10f(x);
-}
-
-__device__ inline float __log2f(float x) {
-  return __hip_fast_log2f(x);
-}
-
-__device__ inline float __logf(float x) {
-  return __hip_fast_logf(x);
-}
-
-__device__ inline float __powf(float base, float exponent) {
-  return __hip_fast_powf(base, exponent);
-}
-
-__device__ inline void __sincosf(float x, float *s, float *c) {
-  return __hip_fast_sincosf(x, s, c);
-}
-
-__device__ inline float __sinf(float x) {
-  return __hip_fast_sinf(x);
-}
-
-__device__ inline float __tanf(float x) {
-  return __hip_fast_tanf(x);
-}
-
-__device__ inline float __fmaf_rd(float x, float y, float z) {
-  return __hip_fast_fmaf(x, y, z);
-}
-
-__device__ inline float __fmaf_rn(float x, float y, float z) {
-  return __hip_fast_fmaf(x, y, z);
-}
-
-__device__ inline float __fmaf_ru(float x, float y, float z) {
-  return __hip_fast_fmaf(x, y, z);
-}
-
-__device__ inline float __fmaf_rz(float x, float y, float z) {
-  return __hip_fast_fmaf(x, y, z);
-}
-
-__device__ inline float __frcp_rd(float x) {
-  return __hip_fast_frcp(x);
-}
-
-__device__ inline float __frcp_rn(float x) {
-  return __hip_fast_frcp(x);
-}
-
-__device__ inline float __frcp_ru(float x) {
-  return __hip_fast_frcp(x);
-}
-
-__device__ inline float __frcp_rz(float x) {
-  return __hip_fast_frcp(x);
-}
-
-__device__ inline double __dsqrt_rd(double x) {
-  return __hip_fast_dsqrt(x);
-}
-
-__device__ inline double __dsqrt_rn(double x) {
-  return __hip_fast_dsqrt(x);
-}
-
-__device__ inline double __dsqrt_ru(double x) {
-  return __hip_fast_dsqrt(x);
-}
-
-__device__ inline double __dsqrt_rz(double x) {
-  return __hip_fast_dsqrt(x);
-}
-
-__device__ inline double __fma_rd(double x, double y, double z) {
-  return __hip_fast_fma(x, y, z);
-}
-
-__device__ inline double __fma_rn(double x, double y, double z) {
-  return __hip_fast_fma(x, y, z);
-}
-
-__device__ inline double __fma_ru(double x, double y, double z) {
-  return __hip_fast_fma(x, y, z);
-}
-
-__device__ inline double __fma_rz(double x, double y, double z) {
-  return __hip_fast_fma(x, y, z);
-}
-
-__device__ inline double __drcp_rd(double x) {
-  return __hip_fast_drcp(x);
-}
-
-__device__ inline double __drcp_rn(double x) {
-  return __hip_fast_drcp(x);
-}
-
-__device__ inline double __drcp_ru(double x) {
-  return __hip_fast_drcp(x);
-}
-
-__device__ inline double __drcp_rz(double x) {
-  return __hip_fast_drcp(x);
-}

 /**
 * CUDA 8 device function features
@@ -0,0 +1,288 @@
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_HCC_DETAIL_MATH_FUNCTIONS_H
+#define HIP_HCC_DETAIL_MATH_FUNCTIONS_H
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_vector_types.h>
+#include <hip/hcc_detail/device_functions.h>
+
+__device__ float acosf(float x);  // Single-precision math function declarations; bodies are not part of this header.
+__device__ float acoshf(float x);
+__device__ float asinf(float x);
+__device__ float asinhf(float x);
+__device__ float atan2f(float y, float x);
+__device__ float atanf(float x);
+__device__ float atanhf(float x);
+__device__ float cbrtf(float x);
+__device__ float ceilf(float x);
+__device__ float copysignf(float x, float y);
+__device__ float coshf(float x);
+__device__ float cyl_bessel_i0f(float x);
+__device__ float cyl_bessel_i1f(float x);
+__device__ float erfcf(float x);
+__device__  float erfcinvf(float y);
+
+__device__ float erfcxf(float x);
+__device__ float erff(float x);
+__device__ float erfinvf(float y);
+__device__ float exp2f(float x);
+__device__ float expm1f(float x);
+__device__ float fabsf(float x);
+__device__ float fdimf(float x, float y);
+__device__ __host__ float fdividef(float x, float y);
+__device__ float floorf(float x);
+__device__ float fmaf(float x, float y, float z);
+__device__ float fmaxf(float x, float y);
+__device__ float fminf(float x, float y);
+__device__ float fmodf(float x, float y);
+__device__ float frexpf(float x, float y);  // NOTE(review): second parameter is float here, while the double-precision frexp in this header takes int *nptr -- confirm.
+__device__ float hypotf(float x, float y);
+__device__ float ilogbf(float x);  // NOTE(review): declared as returning float; the C library ilogbf returns int -- confirm.
+__host__ __device__ int isfinite(float a);  // NOTE(review): returns int, whereas isfinite(double) and isinf/isnan in this header return unsigned -- confirm intended.
+__device__ unsigned isinf(float a);
+__device__ unsigned isnan(float a);
+__device__ float j0f(float x);
+__device__ float j1f(float x);
+__device__ float jnf(int n, float x);
+__device__ float ldexpf(float x, int exp);
+__device__ float lgammaf(float x);
+__device__ long long int llrintf(float x);
+__device__ long long int llroundf(float x);
+__device__ float log1pf(float x);
+__device__ float logbf(float x);
+__device__ long int lrintf(float x);
+__device__ long int lroundf(float x);
+__device__ float modff(float x, float *iptr);
+__device__ float nanf(const char* tagp);
+__device__ float nearbyintf(float x);
+__device__ float nextafterf(float x, float y);
+__device__ float norm3df(float a, float b, float c);
+__device__ float norm4df(float a, float b, float c, float d);
+__device__ float normcdff(float y);
+__device__ float normcdfinvf(float y);
+__device__ float normf(int dim, const float *a);
+__device__ float rcbrtf(float x);
+__device__ float remainderf(float x, float y);
+__device__ float remquof(float x, float y, int *quo);
+__device__ float rhypotf(float x, float y);
+__device__ float rintf(float x);
+__device__ float rnorm3df(float a, float b, float c);
+__device__ float rnorm4df(float a, float b, float c, float d);
+__device__ float rnormf(int dim, const float* a);
+__device__ float roundf(float x);
+__device__ float rsqrtf(float x);  // Declared again below as __host__ __device__.
+__device__ float scalblnf(float x, long int n);
+__device__ float scalbnf(float x, int n);
+__host__ __device__ unsigned signbit(float a);
+__device__ void sincospif(float x, float *sptr, float *cptr);
+__device__ float sinhf(float x);
+__device__ float sinpif(float x);  // Declared again below as __host__ __device__.
+__device__ float sqrtf(float x);
+__device__ float tanhf(float x);
+__device__ float tgammaf(float x);
+__device__ float truncf(float x);
+__device__ float y0f(float x);
+__device__ float y1f(float x);
+__device__ float ynf(int n, float x);
+
+__host__ __device__ float cospif(float x);  // From here on the declarations carry __host__ (some also __device__).
+__host__ __device__ float sinpif(float x);
+// __device__ float sqrtf(float x);  (disabled; sqrtf is already declared above)
+__host__ __device__ float rsqrtf(float x);
+__host__ float normcdff(float y);
+
+__host__ float erfcinvf(float y);
+__host__ float erfcxf(float x);
+__host__ float erfinvf(float y);
+__host__ float norm3df(float a, float b, float c);
+__host__ float normcdfinvf(float y);
+__host__ float norm4df(float a, float b, float c, float d);
+__host__ float rcbrtf(float x);
+__host__ float rhypotf(float x, float y);
+__host__ float rnorm3df(float a, float b, float c);
+__host__ float rnormf(int dim, const float* a);
+__host__ float rnorm4df(float a, float b, float c, float d);
+__host__ void sincospif(float x, float *sptr, float *cptr);
+
+__device__ double acos(double x);  // Double-precision math function declarations; bodies are not part of this header.
+__device__ double acosh(double x);
+__device__ double asin(double x);
+__device__ double asinh(double x);
+__device__ double atan(double x);
+__device__ double atan2(double y, double x);
+__device__ double atanh(double x);
+__device__ double cbrt(double x);
+__device__ double ceil(double x);
+__device__ double copysign(double x, double y);
+__device__ double cos(double x);
+__device__ double cosh(double x);
+__host__ __device__ double cospi(double x);
+__device__ double cyl_bessel_i0(double x);
+__device__ double cyl_bessel_i1(double x);
+__device__ double erf(double x);
+__device__ double erfc(double x);
+__device__ double erfcinv(double y);
+__device__ double erfcx(double x);
+__device__ double erfinv(double x);
+__device__ double exp(double x);
+__device__ double exp10(double x);
+__device__ double exp2(double x);
+__device__ double expm1(double x);
+__device__ double fabs(double x);
+__device__ double fdim(double x, double y);
+__device__ double floor(double x);
+__device__ double fma(double x, double y, double z);
+__device__ double fmax(double x, double y);
+__device__ double fmin(double x, double y);
+__device__ double fmod(double x, double y);
+__device__ double frexp(double x, int *nptr);
+__device__ double hypot(double x, double y);
+__device__ double ilogb(double x);  // NOTE(review): declared as returning double; the C library ilogb returns int -- confirm.
+__host__ __device__ unsigned isfinite(double x);  // NOTE(review): returns unsigned, while isfinite(float) in this header returns int -- confirm intended.
+__device__ unsigned isinf(double x);
+__device__ unsigned isnan(double x);
+__device__ double j0(double x);
+__device__ double j1(double x);
+__device__ double jn(int n, double x);
+__device__ double ldexp(double x, int exp);
+__device__ double lgamma(double x);
+__device__ long long llrint(double x);
+__device__ long long llround(double x);
+__device__ double log(double x);
+__device__ double log10(double x);
+__device__ double log1p(double x);
+__device__ double log2(double x);
+__device__ double logb(double x);
+__device__ long int lrint(double x);
+__device__ long int lround(double x);
+__device__ double modf(double x, double *iptr);
+__device__ double nan(const char* tagp);
+__device__ double nearbyint(double x);
+__device__ double nextafter(double x, double y);
+__device__ double norm(int dim, const double* t);
+__device__ double norm3d(double a, double b, double c);  // Several functions below are declared twice: once __device__, once __host__, with identical signatures.
+__host__ double norm3d(double a, double b, double c);
+__device__ double norm4d(double a, double b, double c, double d);
+__host__ double norm4d(double a, double b, double c, double d);
+__device__ double normcdf(double y);
+__host__ double normcdf(double y);
+__device__ double normcdfinv(double y);
+__host__ double normcdfinv(double y);
+__device__ double pow(double x, double y);
+__device__ double rcbrt(double x);
+__host__ double rcbrt(double x);
+__device__ double remainder(double x, double y);
+__device__ double remquo(double x, double y, int *quo);
+__device__ double rhypot(double x, double y);
+__host__ double rhypot(double x, double y);
+__device__ double rint(double x);
+__device__ double rnorm(int dim, const double* t);
+__host__ double rnorm(int dim, const double* t);
+__device__ double rnorm3d(double a, double b, double c);
+__host__ double rnorm3d(double a, double b, double c);
+__device__ double rnorm4d(double a, double b, double c, double d);
+__host__ double rnorm4d(double a, double b, double c, double d);
+__device__ double round(double x);
+__host__ __device__ double rsqrt(double x);
+__device__ double scalbln(double x, long int n);
+__device__ double scalbn(double x, int n);
+__host__ __device__ unsigned signbit(double a);
+__device__ double sin(double a);
+__device__ void sincos(double x, double *sptr, double *cptr);
+__device__ void sincospi(double x, double *sptr, double *cptr);
+__host__ void sincospi(double x, double *sptr, double *cptr);
+__device__ double sinh(double x);
+__host__ __device__ double sinpi(double x);
+__device__ double sqrt(double x);
+__device__ double tan(double x);
+__device__ double tanh(double x);
+__device__ double tgamma(double x);
+__device__ double trunc(double x);
+__device__ double y0(double x);
+__device__ double y1(double y);
+__device__ double yn(int n, double x);
+
+__host__ double erfcinv(double y);  // Host-only declarations.
+__host__ double erfcx(double x);
+__host__ double erfinv(double y);
+__host__ double fdivide(double x, double y);  // NOTE(review): only a __host__ declaration of fdivide(double, double) appears in this header -- confirm no __device__ one is needed.
+__host__ double norm(int dim, const double *t);  // Host declaration of norm; first parameter is int dim so the signature matches the __device__ norm(int dim, const double*) declared earlier in this header. The definition in math_functions.cpp must use the same signature.
+
+#ifdef HIP_FAST_MATH
+// With HIP_FAST_MATH defined, the standard single-precision names are inline wrappers over the __hip_fast_* backends
+
+__device__ inline float cosf(float x) {
+  return __hip_fast_cosf(x);
+}
+
+__device__ inline float exp10f(float x) {
+  return __hip_fast_exp10f(x);
+}
+
+__device__ inline float expf(float x) {
+  return __hip_fast_expf(x);
+}
+
+__device__ inline float log10f(float x) {
+  return __hip_fast_log10f(x);
+}
+
+__device__ inline float log2f(float x) {
+  return __hip_fast_log2f(x);
+}
+
+__device__ inline float logf(float x) {
+  return __hip_fast_logf(x);
+}
+
+__device__ inline float powf(float base, float exponent) {
+  return __hip_fast_powf(base, exponent);
+}
+
+__device__ inline void sincosf(float x, float *s, float *c) {  // Passes s and c straight through to the backend; nothing is returned.
+  return __hip_fast_sincosf(x, s, c);  // Returning a void expression from a void function; equivalent to a plain call.
+}
+
+__device__ inline float sinf(float x) {
+  return __hip_fast_sinf(x);
+}
+
+__device__ inline float tanf(float x) {
+  return __hip_fast_tanf(x);
+}
+
+#else  // HIP_FAST_MATH not defined: the same ten names are only declared here, without inline bodies.
+
+__device__ float sinf(float);
+__device__ float cosf(float);
+__device__ float tanf(float);
+__device__ void sincosf(float, float*, float*);
+__device__ float logf(float);
+__device__ float log2f(float);
+__device__ float log10f(float);
+__device__ float expf(float);
+__device__ float exp10f(float);
+__device__ float powf(float, float);
+
+#endif  // HIP_FAST_MATH
+
+
+#endif
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+//! HIP = Heterogeneous-compute Interface for Portability
+//!
+//! Defines an extremely thin runtime layer that allows source code to be compiled unmodified
+//! through either AMD HCC or NVCC.   Key features tend to be in the spirit
+//! and terminology of CUDA, but with a portable path to other accelerators as well:
+//
+//! Both paths support rich C++ features including classes, templates, lambdas, etc.
+//! Runtime API is C
+//! Memory management is based on pure pointers and resembles malloc/free/copy.
+//
+//! hip_runtime.h     : includes everything in hip_api.h, plus math builtins and kernel launch macros.
+//! hip_runtime_api.h : Defines HIP API.  This is a C header file and does not use any C++ features.
+
+#pragma once
+
+// Some standard header files, these are included by hc.hpp and so want to make them avail on both
+// paths to provide a consistent include env and avoid "missing symbol" errors that only appears
+// on NVCC path:
+
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/math_functions.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#include <hip/nvcc_detail/math_functions.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
@@ -523,3 +523,71 @@ __device__ unsigned long long __umul64hi(unsigned long long int x, unsigned long
  uHold1.ul = uHold1.ui[1] * uHold2.ui[1];
  return uHold1.ul;
 }
+
+/*
+HIP specific device functions
+*/
+
+__device__ unsigned __hip_ds_bpermute(int index, unsigned src) {
+    return hc::__amdgcn_ds_bpermute(index, src);
+}
+
+__device__ float __hip_ds_bpermutef(int index, float src) {
+    return hc::__amdgcn_ds_bpermute(index, src);
+}
+
+__device__ unsigned __hip_ds_permute(int index, unsigned src) {
+    return hc::__amdgcn_ds_permute(index, src);
+}
+
+__device__ float __hip_ds_permutef(int index, float src) {
+    return hc::__amdgcn_ds_permute(index, src);
+}
+
+__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern) {
+    return hc::__amdgcn_ds_swizzle(src, pattern);
+}
+
+__device__ float __hip_ds_swizzlef(float src, int pattern) {
+    return hc::__amdgcn_ds_swizzle(src, pattern);
+}
+
+__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl) {
+    return hc::__amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
+}
+
+#define MASK1 0x00ff00ff
+#define MASK2 0xff00ff00
+
+// Packed per-byte add: even and odd byte lanes are summed separately and carries are masked off.
+__device__ char4 __hip_hc_add8pk(char4 in1, char4 in2) {
+    char4 out;
+    unsigned evenA = in1.a & MASK1;
+    unsigned evenB = in2.a & MASK1;
+    unsigned oddA = in1.a & MASK2;
+    unsigned oddB = in2.a & MASK2;
+    out.a = ((evenA + evenB) & MASK1) | ((oddA + oddB) & MASK2);
+    return out;
+}
+
+// Packed per-byte subtract (mod 256); a guard bit above each lane stops borrows from leaking.
+__device__ char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
+    char4 out;
+    // Even bytes (0, 2): guard bits 8 and 24 absorb any borrow and are masked off afterwards.
+    unsigned even = (((in1.a & MASK1) | 0x01000100u) - (in2.a & MASK1)) & MASK1;
+    // Odd bytes (1, 3): guard bit 16 protects byte 3; byte 3's own borrow falls off the top.
+    unsigned odd = (((in1.a & MASK2) | 0x00010000u) - (in2.a & MASK2)) & MASK2;
+    out.a = even | odd;
+    return out;
+}
+
+// Packed per-byte multiply: each 8-bit lane receives the low byte of the product of the two lanes.
+__device__ char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
+    char4 out;
+    unsigned packed = 0;
+    // Lanes are multiplied one at a time; a single wide multiply would mix in cross-lane terms.
+    for (int shift = 0; shift < 32; shift += 8)
+        packed |= ((((in1.a >> shift) & 0xffu) * ((in2.a >> shift) & 0xffu)) & 0xffu) << shift;
+    out.a = packed;
+    return out;
+}
@@ -23,6 +23,8 @@ THE SOFTWARE.
 #ifndef DEVICE_UTIL_H
 #define DEVICE_UTIL_H

+#include<hip/hcc_detail/hip_runtime.h>
+
 /*
 Heap size computation for malloc and free device functions.
 */
@@ -35,4 +37,119 @@ THE SOFTWARE.
 #define SIZE_MALLOC NUM_PAGES * SIZE_OF_PAGE
 #define SIZE_OF_HEAP SIZE_MALLOC

+#define HIP_SQRT_2 1.41421356237
+#define HIP_SQRT_PI 1.77245385091
+
+#define __hip_erfinva3 -0.140543331
+#define __hip_erfinva2 0.914624893
+#define __hip_erfinva1 -1.645349621
+#define __hip_erfinva0 0.886226899
+
+#define __hip_erfinvb4 0.012229801
+#define __hip_erfinvb3 -0.329097515
+#define __hip_erfinvb2 1.442710462
+#define __hip_erfinvb1 -2.118377725
+#define __hip_erfinvb0 1
+
+#define __hip_erfinvc3 1.641345311
+#define __hip_erfinvc2 3.429567803
+#define __hip_erfinvc1 -1.62490649
+#define __hip_erfinvc0 -1.970840454
+
+#define __hip_erfinvd2 1.637067800
+#define __hip_erfinvd1 3.543889200
+#define __hip_erfinvd0 1
+
+#define HIP_PI 3.14159265358979323846
+
+__device__ void* __hip_hc_malloc(size_t size);
+__device__ void* __hip_hc_free(void* ptr);
+
+__device__ float __hip_erfinvf(float x);
+__device__ double __hip_erfinv(double x);
+
+__device__ float __hip_j0f(float x);
+__device__ double __hip_j0(double x);
+
+__device__ float __hip_j1f(float x);
+__device__ double __hip_j1(double x);
+
+__device__ float __hip_y0f(float x);
+__device__ double __hip_y0(double x);
+
+__device__ float __hip_y1f(float x);
+__device__ double __hip_y1(double x);
+
+__device__ float __hip_jnf(int n, float x);
+__device__ double __hip_jn(int n, double x);
+
+__device__ float __hip_ynf(int n, float x);
+__device__ double __hip_yn(int n, double x);
+
+__device__ float __hip_precise_cosf(float x);
+__device__ float __hip_precise_exp10f(float x);
+__device__ float __hip_precise_expf(float x);
+__device__ float __hip_precise_frsqrt_rn(float x);
+__device__ float __hip_precise_fsqrt_rd(float x);
+__device__ float __hip_precise_fsqrt_rn(float x);
+__device__ float __hip_precise_fsqrt_ru(float x);
+__device__ float __hip_precise_fsqrt_rz(float x);
+__device__ float __hip_precise_log10f(float x);
+__device__ float __hip_precise_log2f(float x);
+__device__ float __hip_precise_logf(float x);
+__device__ float __hip_precise_powf(float base, float exponent);
+__device__ void __hip_precise_sincosf(float x, float *s, float *c);
+__device__ float __hip_precise_sinf(float x);
+__device__ float __hip_precise_tanf(float x);
+// Double Precision Math
+__device__ double __hip_precise_dsqrt_rd(double x);
+__device__ double __hip_precise_dsqrt_rn(double x);
+__device__ double __hip_precise_dsqrt_ru(double x);
+__device__ double __hip_precise_dsqrt_rz(double x);
+
+
+
+// Float Fast Math
+__device__ float __hip_fast_exp10f(float x);
+__device__ float __hip_fast_expf(float x);
+__device__ float __hip_fast_frsqrt_rn(float x);
+__device__ float __hip_fast_fsqrt_rn(float x);
+__device__ float __hip_fast_fsqrt_ru(float x);
+__device__ float __hip_fast_fsqrt_rz(float x);
+__device__ float __hip_fast_log10f(float x);
+__device__ float __hip_fast_logf(float x);
+__device__ float __hip_fast_powf(float base, float exponent);
+__device__ void __hip_fast_sincosf(float x, float *s, float *c);
+__device__ float __hip_fast_tanf(float x);
+// Double Precision Math
+__device__ double __hip_fast_dsqrt_rd(double x);
+__device__ double __hip_fast_dsqrt_rn(double x);
+__device__ double __hip_fast_dsqrt_ru(double x);
+__device__ double __hip_fast_dsqrt_rz(double x);
+__device__ void  __threadfence_system(void);
+
+float __hip_host_erfinvf(float x);
+double __hip_host_erfinv(double x);
+
+float __hip_host_erfcinvf(float y);
+double __hip_host_erfcinv(double y);
+
+float __hip_host_j0f(float x);
+double __hip_host_j0(double x);
+
+float __hip_host_j1f(float x);
+double __hip_host_j1(double x);
+
+float __hip_host_y0f(float x);
+double __hip_host_y0(double x);  // double-precision counterpart of __hip_host_y0f
+
+float __hip_host_y1f(float x);
+double __hip_host_y1(double x);
+
+float __hip_host_jnf(int n, float x);
+double __hip_host_jn(int n, double x);
+
+float __hip_host_ynf(int n, float x);
+double __hip_host_yn(int n, double x);
+
 #endif
@@ -0,0 +1,971 @@
+
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hc.hpp>
+#include <grid_launch.h>
+#include <hc_math.hpp>
+#include "device_util.h"
+#include "hip/hcc_detail/device_functions.h"
+#include "hip/hip_runtime.h"
+
+__device__ float acosf(float x)
+{
+    return hc::precise_math::acosf(x);
+}
+__device__ float acoshf(float x)
+{
+    return hc::precise_math::acoshf(x);
+}
+__device__ float asinf(float x)
+{
+    return hc::precise_math::asinf(x);
+}
+__device__ float asinhf(float x)
+{
+    return hc::precise_math::asinhf(x);
+}
+// Two-argument arc tangent; arguments are forwarded in (y, x) order like the double atan2 wrapper.
+__device__ float atan2f(float y, float x) {
+    return hc::precise_math::atan2f(y, x);
+}
+__device__ float atanf(float x)
+{
+    return hc::precise_math::atanf(x);
+}
+__device__ float atanhf(float x)
+{
+    return hc::precise_math::atanhf(x);
+}
+__device__ float cbrtf(float x)
+{
+    return hc::precise_math::cbrtf(x);
+}
+__device__ float ceilf(float x)
+{
+    return hc::precise_math::ceilf(x);
+}
+__device__ float copysignf(float x, float y)
+{
+    return hc::precise_math::copysignf(x, y);
+}
+__device__ float cosf(float x)
+{
+    return hc::precise_math::cosf(x);
+}
+__device__ float coshf(float x)
+{
+    return hc::precise_math::coshf(x);
+}
+__device__ float cyl_bessel_i0f(float x);
+__device__ float cyl_bessel_i1f(float x);
+__device__ float erfcf(float x)
+{
+    return hc::precise_math::erfcf(x);
+}
+__device__ float erfcinvf(float y)
+{
+    return __hip_erfinvf(1 - y);
+}
+__device__ float erfcxf(float x)
+{
+    return hc::precise_math::expf(x*x)*hc::precise_math::erfcf(x);
+}
+__device__ float erff(float x)
+{
+    return hc::precise_math::erff(x);
+}
+__device__ float erfinvf(float y)
+{
+    return __hip_erfinvf(y);
+}
+__device__ float exp10f(float x)
+{
+    return hc::precise_math::exp10f(x);
+}
+__device__ float exp2f(float x)
+{
+    return hc::precise_math::exp2f(x);
+}
+__device__ float expf(float x)
+{
+    return hc::precise_math::expf(x);
+}
+__device__ float expm1f(float x)
+{
+    return hc::precise_math::expm1f(x);
+}
+__device__ float fabsf(float x)
+{
+    return hc::precise_math::fabsf(x);
+}
+__device__ float fdimf(float x, float y)
+{
+    return hc::precise_math::fdimf(x, y);
+}
+__device__ float fdividef(float x, float y)
+{
+    return x/y;
+}
+__device__ float floorf(float x)
+{
+    return hc::precise_math::floorf(x);
+}
+__device__ float fmaf(float x, float y, float z)
+{
+    return hc::precise_math::fmaf(x, y, z);
+}
+__device__ float fmaxf(float x, float y)
+{
+    return hc::precise_math::fmaxf(x, y);
+}
+__device__ float fminf(float x, float y)
+{
+    return hc::precise_math::fminf(x, y);
+}
+__device__ float fmodf(float x, float y)
+{
+    return hc::precise_math::fmodf(x, y);
+}
+__device__ float frexpf(float x, int *nptr)
+{
+    return hc::precise_math::frexpf(x, nptr);
+}
+__device__ float hypotf(float x, float y)
+{
+    return hc::precise_math::hypotf(x, y);
+}
+__device__ float ilogbf(float x)
+{
+    return hc::precise_math::ilogbf(x);
+}
+__device__ unsigned isfinite(float a)
+{
+    return hc::precise_math::isfinite(a);
+}
+__device__ unsigned isinf(float a)
+{
+    return hc::precise_math::isinf(a);
+}
+__device__ unsigned isnan(float a)
+{
+    return hc::precise_math::isnan(a);
+}
+__device__ float j0f(float x)
+{
+    return __hip_j0f(x);
+}
+__device__ float j1f(float x)
+{
+    return __hip_j1f(x);
+}
+__device__ float jnf(int n, float x)
+{
+    return __hip_jnf(n, x);
+}
+__device__ float ldexpf(float x, int exp)
+{
+    return hc::precise_math::ldexpf(x, exp);
+}
+__device__ float lgammaf(float x, int *sign)
+{
+    return hc::precise_math::lgammaf(x, sign);
+}
+// Rounds x with nearbyintf (rint-style rounding rather than roundf) and converts to long long.
+__device__ long long int llrintf(float x)
+{
+    // Converted directly: the former int temporary narrowed the value to 32 bits first.
+    return static_cast<long long int>(hc::precise_math::nearbyintf(x));
+}
+// Rounds x with roundf and converts the result straight to long long.
+__device__ long long int llroundf(float x)
+{
+    // Converted directly: the former int temporary narrowed the value to 32 bits first.
+    return static_cast<long long int>(hc::precise_math::roundf(x));
+}
+__device__ float log10f(float x) {
+    return hc::precise_math::log10f(x);
+}
+__device__ float log1pf(float x)
+{
+    return hc::precise_math::log1pf(x);
+}
+__device__ float log2f(float x)
+{
+    return hc::precise_math::log2f(x);
+}
+__device__ float logbf(float x)
+{
+    return hc::precise_math::logbf(x);
+}
+__device__ float logf(float x)
+{
+    return hc::precise_math::logf(x);
+}
+// Rounds x with nearbyintf (rint-style rounding rather than roundf) and converts to long.
+__device__ long int lrintf(float x)
+{
+    // Converted directly: the former int temporary narrowed the value before widening to long.
+    return static_cast<long int>(hc::precise_math::nearbyintf(x));
+}
+__device__ long int lroundf(float x)
+{
+    long int y = hc::precise_math::roundf(x);
+    return y;
+}
+__device__ float modff(float x, float *iptr)
+{
+    return hc::precise_math::modff(x, iptr);
+}
+__device__ float nanf(const char* tagp)
+{
+    return hc::precise_math::nanf((int)*tagp);
+}
+__device__ float nearbyintf(float x)
+{
+    return hc::precise_math::nearbyintf(x);
+}
+__device__ float nextafterf(float x, float y)
+{
+    return hc::precise_math::nextafter(x, y);
+}
+__device__ float norm3df(float a, float b, float c)
+{
+     float x = a*a + b*b + c*c;
+     return hc::precise_math::sqrtf(x);
+}
+__device__ float norm4df(float a, float b, float c, float d)
+{
+     float x = a*a + b*b;
+     float y = c*c + d*d;
+     return hc::precise_math::sqrtf(x+y);
+}
+
+__device__ float normcdff(float y)
+{
+     return ((hc::precise_math::erff(y)/1.41421356237) + 1)/2;
+}
+__device__ float normcdfinvf(float y)
+{
+     return HIP_SQRT_2 * __hip_erfinvf(2*y-1);
+}
+__device__ float normf(int dim, const float *a)
+{
+    float x = 0.0f;
+    for(int i=0;i<dim;i++)
+    {
+        x = hc::precise_math::fmaf(a[i], a[i], x);
+    }
+    return hc::precise_math::sqrtf(x);
+}
+__device__ float powf(float x, float y)
+{
+    return hc::precise_math::powf(x, y);
+}
+__device__ float rcbrtf(float x)
+{
+    return hc::precise_math::rcbrtf(x);
+}
+__device__ float remainderf(float x, float y)
+{
+    return hc::precise_math::remainderf(x, y);
+}
+__device__ float remquof(float x, float y, int *quo)
+{
+    return hc::precise_math::remquof(x, y, quo);
+}
+__device__ float rhypotf(float x, float y)
+{
+    return 1/hc::precise_math::hypotf(x, y);
+}
+// Rounds to an integral value via nearbyintf; roundf would break ties away from zero instead.
+__device__ float rintf(float x) {
+    return hc::precise_math::nearbyintf(x);
+}
+__device__ float rnorm3df(float a, float b, float c)
+{
+    float x = a*a + b*b + c*c;
+    return 1/hc::precise_math::sqrtf(x);
+}
+__device__ float rnorm4df(float a, float b, float c, float d)
+{
+    float x = a*a + b*b;
+    float y = c*c + d*d;
+    return 1/hc::precise_math::sqrtf(x+y);
+}
+__device__ float rnormf(int dim, const float* a)
+{
+    float x = 0.0f;
+    for(int i=0;i<dim;i++)
+    {
+        x = hc::precise_math::fmaf(a[i], a[i], x);
+    }
+    return 1/hc::precise_math::sqrtf(x);
+}
+__device__ float roundf(float x)
+{
+    return hc::precise_math::roundf(x);
+}
+__device__ float scalblnf(float x, long int n)
+{
+    return hc::precise_math::scalb(x, n);
+}
+__device__ float scalbnf(float x, int n)
+{
+    return hc::precise_math::scalbnf(x, n);
+}
+__device__ unsigned signbit(float a)
+{
+    return hc::precise_math::signbit(a);
+}
+__device__ void sincosf(float x, float *sptr, float *cptr)
+{
+    *sptr = hc::precise_math::sinf(x);
+    *cptr = hc::precise_math::cosf(x);
+}
+__device__ void sincospif(float x, float *sptr, float *cptr)
+{
+    *sptr = hc::precise_math::sinpif(x);
+    *cptr = hc::precise_math::cospif(x);
+}
+__device__ float sinf(float x)
+{
+    return hc::precise_math::sinf(x);
+}
+__device__ float sinhf(float x)
+{
+    return hc::precise_math::sinhf(x);
+}
+__device__ float tanf(float x)
+{
+    return hc::precise_math::tanf(x);
+}
+__device__ float tanhf(float x)
+{
+    return hc::precise_math::tanhf(x);
+}
+__device__ float tgammaf(float x)
+{
+    return hc::precise_math::tgammaf(x);
+}
+__device__ float truncf(float x)
+{
+    return hc::precise_math::truncf(x);
+}
+__device__ float y0f(float x)
+{
+    return __hip_y0f(x);
+}
+__device__ float y1f(float x)
+{
+    return __hip_y1f(x);
+}
+__device__ float ynf(int n, float x)
+{
+    return __hip_ynf(n, x);
+}
+__device__ float cospif(float x)
+{
+    return hc::precise_math::cospif(x);
+}
+__device__ float sinpif(float x)
+{
+    return hc::precise_math::sinpif(x);
+}
+__device__ float sqrtf(float x)
+{
+    return hc::precise_math::sqrtf(x);
+}
+__device__ float rsqrtf(float x)
+{
+    return hc::precise_math::rsqrtf(x);
+}
+
+/*
+ * Double precision device math functions
+ */
+
+__device__ double acos(double x)
+{
+    return hc::precise_math::acos(x);
+}
+__device__ double acosh(double x)
+{
+    return hc::precise_math::acosh(x);
+}
+__device__ double asin(double x)
+{
+    return hc::precise_math::asin(x);
+}
+__device__ double asinh(double x)
+{
+    return hc::precise_math::asinh(x);
+}
+__device__ double atan(double x)
+{
+    return hc::precise_math::atan(x);
+}
+__device__ double atan2(double y, double x)
+{
+    return hc::precise_math::atan2(y, x);
+}
+__device__ double atanh(double x)
+{
+    return hc::precise_math::atanh(x);
+}
+__device__ double cbrt(double x)
+{
+    return hc::precise_math::cbrt(x);
+}
+__device__ double ceil(double x)
+{
+    return hc::precise_math::ceil(x);
+}
+__device__ double copysign(double x, double y)
+{
+    return hc::precise_math::copysign(x, y);
+}
+__device__ double cos(double x)
+{
+    return hc::precise_math::cos(x);
+}
+__device__ double cosh(double x)
+{
+    return hc::precise_math::cosh(x);
+}
+__device__ double cospi(double x)
+{
+    return hc::precise_math::cospi(x);
+}
+__device__ double cyl_bessel_i0(double x);
+__device__ double cyl_bessel_i1(double x);
+__device__ double erf(double x)
+{
+    return hc::precise_math::erf(x);
+}
+__device__ double erfc(double x)
+{
+    return hc::precise_math::erfc(x);
+}
+__device__ double erfcinv(double x)
+{
+    return __hip_erfinv(1 - x);
+}
+__device__ double erfcx(double x)
+{
+    return hc::precise_math::exp(x*x)*hc::precise_math::erf(x);
+}
+__device__ double erfinv(double x)
+{
+    return __hip_erfinv(x);
+}
+__device__ double exp(double x)
+{
+    return hc::precise_math::exp(x);
+}
+__device__ double exp10(double x)
+{
+    return hc::precise_math::exp10(x);
+}
+__device__ double exp2(double x)
+{
+    return hc::precise_math::exp2(x);
+}
+__device__ double expm1(double x)
+{
+    return hc::precise_math::expm1(x);
+}
+__device__ double fabs(double x)
+{
+    return hc::precise_math::fabs(x);
+}
+__device__ double fdim(double x, double y)
+{
+    return hc::precise_math::fdim(x, y);
+}
+__device__ double fdivide(double x, double y)
+{
+    return x/y;
+}
+__device__ double floor(double x)
+{
+    return hc::precise_math::floor(x);
+}
+__device__ double fma(double x, double y, double z)
+{
+    return hc::precise_math::fma(x, y, z);
+}
+__device__ double fmax(double x, double y)
+{
+    return hc::precise_math::fmax(x, y);
+}
+__device__ double fmin(double x, double y)
+{
+    return hc::precise_math::fmin(x, y);
+}
+__device__ double fmod(double x, double y)
+{
+    return hc::precise_math::fmod(x, y);
+}
+__device__ double frexp(double x, int *y)
+{
+    return hc::precise_math::frexp(x, y);
+}
+__device__ double hypot(double x, double y)
+{
+    return hc::precise_math::hypot(x, y);
+}
+__device__ double ilogb(double x)
+{
+    return hc::precise_math::ilogb(x);
+}
+__device__ unsigned isfinite(double x)
+{
+    return hc::precise_math::isfinite(x);
+}
+__device__ unsigned isinf(double x)
+{
+    return hc::precise_math::isinf(x);
+}
+__device__ unsigned isnan(double x)
+{
+    return hc::precise_math::isnan(x);
+}
+__device__ double j0(double x)
+{
+    return __hip_j0(x);
+}
+__device__ double j1(double x)
+{
+    return __hip_j1(x);
+}
+__device__ double jn(int n, double x)
+{
+    return __hip_jn(n, x);
+}
+__device__ double ldexp(double x, int exp)
+{
+    return hc::precise_math::ldexp(x, exp);
+}
+__device__ double lgamma(double x, int *sign)
+{
+    return hc::precise_math::lgamma(x, sign);
+}
+// Rounds x with nearbyint (rint-style rounding rather than round) and converts to long long.
+__device__ long long int llrint(double x)
+{
+    return static_cast<long long int>(hc::precise_math::nearbyint(x));
+}
+__device__ long long int llround(double x)
+{
+    long long int y = hc::precise_math::round(x);
+    return y;
+}
+__device__ double log(double x)
+{
+    return hc::precise_math::log(x);
+}
+__device__ double log10(double x)
+{
+    return hc::precise_math::log10(x);
+}
+__device__ double log1p(double x)
+{
+    return hc::precise_math::log1p(x);
+}
+__device__ double log2(double x)
+{
+    return hc::precise_math::log2(x);
+}
+__device__ double logb(double x)
+{
+    return hc::precise_math::logb(x);
+}
+// Rounds x with nearbyint (rint-style rounding rather than round) and converts to long.
+__device__ long int lrint(double x)
+{
+    return static_cast<long int>(hc::precise_math::nearbyint(x));
+}
+__device__ long int lround(double x)
+{
+    long int y = hc::precise_math::round(x);
+    return y;
+}
+__device__ double modf(double x, double *iptr)
+{
+    return hc::precise_math::modf(x, iptr);
+}
+__device__ double nan(const char *tagp)
+{
+    return hc::precise_math::nan((int)*tagp);
+}
+__device__ double nearbyint(double x)
+{
+    return hc::precise_math::nearbyint(x);
+}
+__device__ double nextafter(double x, double y)
+{
+    return hc::precise_math::nextafter(x, y);
+}
+__device__ double norm3d(double a, double b, double c)
+{
+    double x = a*a + b*b + c*c;
+    return hc::precise_math::sqrt(x);
+}
+__device__ double norm4d(double a, double b, double c, double d)
+{
+    double x = a*a + b*b;
+    double y = c*c + d*d;
+    return hc::precise_math::sqrt(x+y);
+}
+__device__ double normcdf(double y)
+{
+     return ((hc::precise_math::erf(y)/HIP_SQRT_2) + 1)/2;
+}
+__device__ double pow(double x, double y)
+{
+    return hc::precise_math::pow(x, y);
+}
+__device__ double rcbrt(double x)
+{
+    return hc::precise_math::rcbrt(x);
+}
+__device__ double remainder(double x, double y)
+{
+    return hc::precise_math::remainder(x, y);
+}
+__device__ double remquo(double x, double y, int *quo)
+{
+    return hc::precise_math::remquo(x, y, quo);
+}
+// Reciprocal hypotenuse computed through hypot, consistent with the float rhypotf wrapper.
+__device__ double rhypot(double x, double y) {
+    return 1/hc::precise_math::hypot(x, y);
+}
+// Rounds to an integral value via nearbyint; round would break ties away from zero instead.
+__device__ double rint(double x) {
+    return hc::precise_math::nearbyint(x);
+}
+__device__ double rnorm3d(double a, double b, double c)
+{
+    return hc::precise_math::rsqrt(a*a + b*b + c*c);
+}
+__device__ double rnorm4d(double a, double b, double c, double d)
+{
+    return hc::precise_math::rsqrt(a*a + b*b + c*c + d*d);
+}
+// Reciprocal Euclidean norm of the first dim elements of t: 1 / sqrt(sum of t[i]^2).
+__device__ double rnorm(int dim, const double* t) {
+    double sumSq = 0.0;
+    for(int i=0;i<dim;i++)
+    {
+        sumSq = hc::precise_math::fma(t[i], t[i], sumSq);
+    }
+    return 1/hc::precise_math::sqrt(sumSq);  // the square root was missing before
+}
+__device__ double round(double x)
+{
+    return hc::precise_math::round(x);
+}
+__device__ double rsqrt(double x)
+{
+    return hc::precise_math::rsqrt(x);
+}
+__device__ double scalbln(double x, long int n)
+{
+    return hc::precise_math::scalb(x, n);
+}
+__device__ double scalbn(double x, int n)
+{
+    return hc::precise_math::scalbn(x, n);
+}
+__device__ unsigned signbit(double x)
+{
+    return hc::precise_math::signbit(x);
+}
+__device__ double sin(double x)
+{
+    return hc::precise_math::sin(x);
+}
+__device__ void sincos(double x, double *sptr, double *cptr)
+{
+    *sptr = hc::precise_math::sin(x);
+    *cptr = hc::precise_math::cos(x);
+}
+__device__ void sincospi(double x, double *sptr, double *cptr)
+{
+    *sptr = hc::precise_math::sinpi(x);
+    *cptr = hc::precise_math::cospi(x);
+}
+__device__ double sinh(double x)
+{
+    return hc::precise_math::sinh(x);
+}
+__device__ double sinpi(double x)
+{
+    return hc::precise_math::sinpi(x);
+}
+__device__ double sqrt(double x)
+{
+    return hc::precise_math::sqrt(x);
+}
+__device__ double tan(double x)
+{
+    return hc::precise_math::tan(x);
+}
+__device__ double tanh(double x)
+{
+    return hc::precise_math::tanh(x);
+}
+__device__ double tgamma(double x)
+{
+    return hc::precise_math::tgamma(x);
+}
+__device__ double trunc(double x)
+{
+    return hc::precise_math::trunc(x);
+}
+__device__ double y0(double x)
+{
+    return __hip_y0(x);
+}
+__device__ double y1(double x)
+{
+    return __hip_y1(x);
+}
+__device__ double yn(int n, double x)
+{
+    return __hip_yn(n, x);
+}
+
+
+__host__ float cospif(float x)
+{
+    return std::cos(x*HIP_PI);
+}
+
+__host__ float fdividef(float x, float y)
+{
+  return x / y;
+}
+
+__host__ int isfinite(float x)
+{
+  return std::isfinite(x);
+}
+
+__host__ int signbit(float x)
+{
+  return std::signbit(x);
+}
+
+// Host sine of pi*x; returns float (the former int return type truncated every result).
+__host__ float sinpif(float x) {
+  return std::sin(x*HIP_PI);
+}
+
+__host__ float rsqrtf(float x)
+{
+  return 1 / std::sqrt(x);
+}
+
+__host__ float modff(float x, float *iptr)
+{
+    return std::modf(x, iptr);
+}
+
+__host__ float erfcinvf(float y)
+{
+    return __hip_host_erfcinvf(y);
+}
+
+__host__ double erfcinv(double y)
+{
+    return __hip_host_erfcinv(y);
+}
+
+__host__ float erfinvf(float x)
+{
+    return __hip_host_erfinvf(x);
+}
+
+__host__ double erfinv(double x)
+{
+    return __hip_host_erfinv(x);
+}
+
+__host__ double fdivide(double x, double y)
+{
+    return x/y;
+}
+
+__host__ float normcdff(float t)
+{
+     return (1 - std::erf(-t/std::sqrt(2)))/2;
+}
+
+__host__ double normcdf(double x)
+{
+     return (1 - std::erf(-x/std::sqrt(2)))/2;
+}
+
+__host__ float erfcxf(float x)
+{
+     return std::exp(x*x) * std::erfc(x);
+}
+
+__host__ double erfcx(double x)
+{
+     return std::exp(x*x) * std::erfc(x);
+}
+
+__host__ float rhypotf(float x, float y)
+{
+     return 1 / std::sqrt(x*x + y*y);
+}
+
+__host__ double rhypot(double x, double y)
+{
+    return 1 / std::sqrt(x*x + y*y);
+}
+
+__host__ float rcbrtf(float a)
+{
+    return 1 / std::cbrt(a);
+}
+
+__host__ double rcbrt(double a)
+{
+    return 1 / std::cbrt(a);
+}
+
+// Euclidean norm of the first dim elements of a: sqrt(sum of a[i]^2), as in the device normf.
+__host__ float normf(int dim, const float *a) {
+    float sumSq = 0.0f;
+    for(int i=0;i<dim;i++)
+    {
+        sumSq = sumSq + a[i] * a[i];
+    }
+    return std::sqrt(sumSq);  // previously returned the squared norm
+}
+
+// Euclidean norm of the first dim elements of a: sqrt(sum of a[i]^2).
+__host__ double norm(int dim, const double *a) {
+    double sumSq = 0.0;
+    for(int i=0;i<dim;i++)
+    {
+        sumSq = sumSq + a[i] * a[i];
+    }
+    return std::sqrt(sumSq);  // previously returned the squared norm
+}
+
+__host__ float rnormf(int dim, const float *t)
+{
+    float val = 0.0f;
+    for(int i=0;i<dim;i++)
+    {
+        val = val + t[i] * t[i];
+    }
+    return 1 / std::sqrt(val);
+}
+
+__host__ double rnorm(int dim, const double *t)
+{
+    double val = 0.0;
+    for(int i=0;i<dim;i++)
+    {
+        val = val + t[i] * t[i];
+    }
+    return 1 / std::sqrt(val);
+}
+
+__host__ float rnorm4df(float a, float b, float c, float d)
+{
+    return 1 / std::sqrt(a*a + b*b + c*c + d*d);
+}
+
+__host__ double rnorm4d(double a, double b, double c, double d)
+{
+    return 1 / std::sqrt(a*a + b*b + c*c + d*d);
+}
+
+__host__ float rnorm3df(float a, float b, float c)
+{
+    return 1 / std::sqrt(a*a + b*b + c*c);
+}
+
+__host__ double rnorm3d(double a, double b, double c)
+{
+    return 1 / std::sqrt(a*a + b*b + c*c);
+}
+
+__host__ void sincospif(float x, float *sptr, float *cptr)
+{
+    *sptr = std::sin(HIP_PI*x);
+    *cptr = std::cos(HIP_PI*x);
+}
+
+__host__ void sincospi(double x, double *sptr, double *cptr)
+{
+    *sptr = std::sin(HIP_PI*x);
+    *cptr = std::cos(HIP_PI*x);
+}
+
+__host__ float normcdfinvf(float x)
+{
+    return std::sqrt(2) * erfinv(2*x-1);
+}
+
+__host__ double normcdfinv(double x)
+{
+    return std::sqrt(2) * erfinv(2*x-1);
+}
+
+__host__ float nextafterf(float x, float y)
+{
+    return std::nextafter(x, y);
+}
+
+__host__ double nextafter(double x, double y)
+{
+    return std::nextafter(x, y);
+}
+
+__host__ float norm3df(float a, float b, float c)
+{
+    return std::sqrt(a*a + b*b + c*c);
+}
+
+__host__ float norm4df(float a, float b, float c, float d)
+{
+    return std::sqrt(a*a + b*b + c*c + d*d);
+}
+
+__host__ double norm3d(double a, double b, double c)
+{
+    return std::sqrt(a*a + b*b + c*c);
+}
+
+__host__ double norm4d(double a, double b, double c, double d)
+{
+    return std::sqrt(a*a + b*b + c*c + d*d);
+}
@@ -19,7 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
-#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/device_functions.h>
 #include "test_common.h"

 #pragma GCC diagnostic ignored "-Wall"
@@ -27,18 +28,18 @@ THE SOFTWARE.

 __device__ void double_precision_intrinsics()
 {
-    //__dadd_rd(0.0, 1.0);
-    //__dadd_rn(0.0, 1.0);
-    //__dadd_ru(0.0, 1.0);
-    //__dadd_rz(0.0, 1.0);
-    //__ddiv_rd(0.0, 1.0);
-    //__ddiv_rn(0.0, 1.0);
-    //__ddiv_ru(0.0, 1.0);
-    //__ddiv_rz(0.0, 1.0);
-    //__dmul_rd(1.0, 2.0);
-    //__dmul_rn(1.0, 2.0);
-    //__dmul_ru(1.0, 2.0);
-    //__dmul_rz(1.0, 2.0);
+    __dadd_rd(0.0, 1.0);
+    __dadd_rn(0.0, 1.0);
+    __dadd_ru(0.0, 1.0);
+    __dadd_rz(0.0, 1.0);
+    __ddiv_rd(0.0, 1.0);
+    __ddiv_rn(0.0, 1.0);
+    __ddiv_ru(0.0, 1.0);
+    __ddiv_rz(0.0, 1.0);
+    __dmul_rd(1.0, 2.0);
+    __dmul_rn(1.0, 2.0);
+    __dmul_ru(1.0, 2.0);
+    __dmul_rz(1.0, 2.0);
    __drcp_rd(2.0);
    __drcp_rn(2.0);
    __drcp_ru(2.0);
@@ -47,10 +48,10 @@ __device__ void double_precision_intrinsics()
    __dsqrt_rn(4.0);
    __dsqrt_ru(4.0);
    __dsqrt_rz(4.0);
-    //__dsub_rd(2.0, 1.0);
-    //__dsub_rn(2.0, 1.0);
-    //__dsub_ru(2.0, 1.0);
-    //__dsub_rz(2.0, 1.0);
+    __dsub_rd(2.0, 1.0);
+    __dsub_rn(2.0, 1.0);
+    __dsub_ru(2.0, 1.0);
+    __dsub_rz(2.0, 1.0);
    __fma_rd(1.0, 2.0, 3.0);
    __fma_rn(1.0, 2.0, 3.0);
    __fma_ru(1.0, 2.0, 3.0);
@@ -19,7 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
-#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/math_functions.h>
 #include "test_common.h"

 #pragma GCC diagnostic ignored "-Wall"
@@ -43,8 +44,8 @@ __device__ void double_precision_math_functions()
    cos(0.0);
    cosh(0.0);
    cospi(0.0);
-    //cyl_bessel_i0(0.0);
-    //cyl_bessel_i1(0.0);
+    cyl_bessel_i0(0.0);
+    cyl_bessel_i1(0.0);
    erf(0.0);
    erfc(0.0);
    erfcinv(2.0);
@@ -61,7 +62,7 @@ __device__ void double_precision_math_functions()
    fmax(0.0, 0.0);
    fmin(0.0, 0.0);
    fmod(0.0, 1.0);
-    //frexp(0.0, &iX);
+    frexp(0.0, &iX);
    hypot(1.0, 0.0);
    ilogb(1.0);
    isfinite(0.0);
@@ -71,7 +72,7 @@ __device__ void double_precision_math_functions()
    j1(0.0);
    jn(-1.0, 1.0);
    ldexp(0.0, 0);
-    //lgamma(1.0);
+    lgamma(1.0);
    llrint(0.0);
    llround(0.0);
    log(1.0);
@@ -81,19 +82,19 @@ __device__ void double_precision_math_functions()
    logb(1.0);
    lrint(0.0);
    lround(0.0);
-    //modf(0.0, &fX);
+    modf(0.0, &fX);
    nan("1");
    nearbyint(0.0);
-    //nextafter(0.0);
-    //fX = 1.0; norm(1, &fX);
+    nextafter(0.0, 0.0);
+    fX = 1.0; norm(1, &fX);
    norm3d(1.0, 0.0, 0.0);
    norm4d(1.0, 0.0, 0.0, 0.0);
    normcdf(0.0);
-    //normcdfinv(1.0);
+    normcdfinv(1.0);
    pow(1.0, 0.0);
    rcbrt(1.0);
    remainder(2.0, 1.0);
-    //remquo(1.0, 2.0, &iX);
+    remquo(1.0, 2.0, &iX);
    rhypot(0.0, 1.0);
    rint(1.0);
    fX = 1.0; rnorm(1, &fX);
@@ -19,7 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
-#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/math_functions.h>
 #include "test_common.h"

 #pragma GCC diagnostic ignored "-Wall"
@@ -85,7 +86,7 @@ __host__ void double_precision_math_functions()
    nan("1");
    nearbyint(0.0);
    //nextafter(0.0);
-    //fX = 1.0; norm(1, &fX);
+    fX = 1.0; norm(1, &fX);
 #if defined(__HIP_PLATFORM_HCC__)
    norm3d(1.0, 0.0, 0.0);
    norm4d(1.0, 0.0, 0.0, 0.0);
@@ -27,6 +27,7 @@ THE SOFTWARE.
 */

 #include "test_common.h"
+#include <hip/device_functions.h>

 #define LEN 512
 #define SIZE LEN<<2
@@ -19,7 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
-#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/math_functions.h>
 #include "test_common.h"

 __global__ void FloatMathPrecise(hipLaunchParm lp)
@@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
-#include "hip/hip_runtime.h"
-#include "hip/device_functions.h"
+#include <hip/hip_runtime.h>
+#include <hip/device_functions.h>
 #include "test_common.h"

 #pragma GCC diagnostic ignored "-Wall"
@@ -19,7 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
-#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/device_functions.h>
 #include "test_common.h"

 #pragma GCC diagnostic ignored "-Wall"
@@ -30,44 +31,44 @@ __device__ void single_precision_intrinsics()
    float fX, fY;

    __cosf(0.0f);
-    //__exp10f(0.0f);
+    __exp10f(0.0f);
    __expf(0.0f);
-    //__fadd_rd(0.0f, 1.0f);
-    //__fadd_rn(0.0f, 1.0f);
-    //__fadd_ru(0.0f, 1.0f);
-    //__fadd_rz(0.0f, 1.0f);
-    //__fdiv_rd(4.0f, 2.0f);
-    //__fdiv_rn(4.0f, 2.0f);
-    //__fdiv_ru(4.0f, 2.0f);
-    //__fdiv_rz(4.0f, 2.0f);
-    //__fdividef(4.0f, 2.0f);
-    //__fmaf_rd(1.0f, 2.0f, 3.0f);
-    //__fmaf_rn(1.0f, 2.0f, 3.0f);
-    //__fmaf_ru(1.0f, 2.0f, 3.0f);
-    //__fmaf_rz(1.0f, 2.0f, 3.0f);
-    //__fmul_rd(1.0f, 2.0f);
-    //__fmul_rn(1.0f, 2.0f);
-    //__fmul_ru(1.0f, 2.0f);
-    //__fmul_rz(1.0f, 2.0f);
-    //__frcp_rd(2.0f);
-    //__frcp_rn(2.0f);
-    //__frcp_ru(2.0f);
-    //__frcp_rz(2.0f);
+    __fadd_rd(0.0f, 1.0f);
+    __fadd_rn(0.0f, 1.0f);
+    __fadd_ru(0.0f, 1.0f);
+    __fadd_rz(0.0f, 1.0f);
+    __fdiv_rd(4.0f, 2.0f);
+    __fdiv_rn(4.0f, 2.0f);
+    __fdiv_ru(4.0f, 2.0f);
+    __fdiv_rz(4.0f, 2.0f);
+    __fdividef(4.0f, 2.0f);
+    __fmaf_rd(1.0f, 2.0f, 3.0f);
+    __fmaf_rn(1.0f, 2.0f, 3.0f);
+    __fmaf_ru(1.0f, 2.0f, 3.0f);
+    __fmaf_rz(1.0f, 2.0f, 3.0f);
+    __fmul_rd(1.0f, 2.0f);
+    __fmul_rn(1.0f, 2.0f);
+    __fmul_ru(1.0f, 2.0f);
+    __fmul_rz(1.0f, 2.0f);
+    __frcp_rd(2.0f);
+    __frcp_rn(2.0f);
+    __frcp_ru(2.0f);
+    __frcp_rz(2.0f);
    __frsqrt_rn(4.0f);
    __fsqrt_rd(4.0f);
    __fsqrt_rn(4.0f);
    __fsqrt_ru(4.0f);
    __fsqrt_rz(4.0f);
-    //__fsub_rd(2.0f, 1.0f);
-    //__fsub_rn(2.0f, 1.0f);
-    //__fsub_ru(2.0f, 1.0f);
-    //__fsub_rz(2.0f, 1.0f);
+    __fsub_rd(2.0f, 1.0f);
+    __fsub_rn(2.0f, 1.0f);
+    __fsub_ru(2.0f, 1.0f);
+    __fsub_rz(2.0f, 1.0f);
    __log10f(1.0f);
    __log2f(1.0f);
    __logf(1.0f);
    __powf(1.0f, 0.0f);
-    //__saturatef(0.1f);
-    //__sincosf(0.0f, &fX, &fY);
+    __saturatef(0.1f);
+    __sincosf(0.0f, &fX, &fY);
    __sinf(0.0f);
    __tanf(0.0f);
 }
@@ -19,7 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
-#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/math_functions.h>
 #include "test_common.h"

 #pragma GCC diagnostic ignored "-Wall"
@@ -19,7 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
-#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/math_functions.h>
 #include "test_common.h"

 #pragma GCC diagnostic ignored "-Wall"
@@ -24,8 +24,9 @@ THE SOFTWARE.
 */

 #include"test_common.h"
-#include "hip/hip_runtime.h"
-#include "hip/hip_runtime_api.h"
+#include <hip/hip_runtime.h>
+#include <hip/math_functions.h>
+#include <hip/hip_runtime_api.h>

 #define N 512
 #define SIZE N*sizeof(float)
@@ -24,8 +24,9 @@ THE SOFTWARE.
 */

 #include"test_common.h"
-#include "hip/hip_runtime.h"
-#include "hip/hip_runtime_api.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hip/math_functions.h>

 #define N 512
 #define SIZE N*sizeof(double)
@@ -29,7 +29,8 @@ THE SOFTWARE.
 #include <stdio.h>
 #include <iostream>

-#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/device_functions.h>
 #define HIP_ASSERT(x) (assert((x)==hipSuccess))

 __global__ void
@@ -25,8 +25,8 @@ THE SOFTWARE.

 #include <iostream>

-#include "hip/hip_runtime.h"
-#include "hip/device_functions.h"
+#include <hip/hip_runtime.h>
+#include <hip/device_functions.h>

 #define HIP_ASSERT(x) (assert((x)==hipSuccess))

@@ -32,7 +32,7 @@ THE SOFTWARE.
 #include <stdlib.h>
 #include <iostream>
 #include "hip/hip_runtime.h"
-#include "hip/device_functions.h"
+#include <hip/device_functions.h>

 #define HIP_ASSERT(x) (assert((x)==hipSuccess))

@@ -32,7 +32,7 @@ THE SOFTWARE.
 #include <stdlib.h>
 #include <iostream>
 #include "hip/hip_runtime.h"
-#include "hip/device_functions.h"
+#include <hip/device_functions.h>

 #define HIP_ASSERT(x) (assert((x)==hipSuccess))
 #define WIDTH     8
@@ -31,8 +31,8 @@ THE SOFTWARE.
 #include <algorithm>
 #include <stdlib.h>
 #include <iostream>
-#include "hip/hip_runtime.h"
-#include "hip/device_functions.h"
+#include <hip/hip_runtime.h>
+#include <hip/device_functions.h>

 #define HIP_ASSERT(x) (assert((x)==hipSuccess))

@@ -31,8 +31,8 @@ THE SOFTWARE.
 #include <algorithm>
 #include <stdlib.h>
 #include <iostream>
-#include "hip/hip_runtime.h"
-#include "hip/device_functions.h"
+#include <hip/hip_runtime.h>
+#include <hip/device_functions.h>

 #define HIP_ASSERT(x) (assert((x)==hipSuccess))

@@ -21,7 +21,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s 
+ * BUILD: %t %s
 * RUN: %t
 * HIT_END
 */
@@ -30,6 +30,7 @@ THE SOFTWARE.
 #include<hip/hip_runtime.h>
 #include<iostream>
 #include"test_common.h"
+#include<hip/device_functions.h>

 #define LEN 512
 #define SIZE LEN<<2
@@ -24,6 +24,7 @@ THE SOFTWARE.

 #include<iostream>
 #include"test_common.h"
+#include"hip/math_functions.h"

 const int NN = 1 << 21;

@@ -31,7 +32,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
 	int tid = hipThreadIdx_x;
 	if(tid < 1){
 		for(int i=0;i<n;i++){
-			x[i] = sqrt(pow(3.14159,i));
+			x[i] = sqrt(powf(3.14159,i));
 		}
 		y[tid] = y[tid] + 1.0f;
 	}
@@ -26,6 +26,7 @@ THE SOFTWARE.

 #include<iostream>
 #include"test_common.h"
+#include"hip/math_functions.h"

 const int NN = 1 << 21;

@@ -33,7 +34,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
 	int tid = hipThreadIdx_x;
 	if(tid < 1){
 		for(int i=0;i<n;i++){
-			x[i] = sqrt(pow(3.14159,i));
+			x[i] = sqrt(powf(3.14159,i));
 		}
 		y[tid] = y[tid] + 1.0f;
 	}