Merge "added erfinv software implementation" into amd-master

[ROCm/clr commit: 1dbb2f5205]
2016-06-14 02:13:01 -04:00
parent 46e60b1bd1 657a1222d7
commit bdfa974253
4 changed files with 183 additions and 3 deletions
@@ -220,6 +220,7 @@ __device__ double erf(double x);
 __device__ double erfc(double x);
 __device__ double erfcinv(double y);
 __device__ double erfcx(double x);
+__device__ double erfinv(double x);
 __device__ double exp(double x);
 __device__ double exp10(double x);
 __device__ double exp2(double x);
@@ -27,6 +27,117 @@ THE SOFTWARE.
 using namespace hc::precise_math;
 #endif

+#define __hip_erfinva3 -0.140543331
+#define __hip_erfinva2 0.914624893
+#define __hip_erfinva1 -1.645349621
+#define __hip_erfinva0 0.886226899
+
+#define __hip_erfinvb4 0.012229801
+#define __hip_erfinvb3 -0.329097515
+#define __hip_erfinvb2 1.442710462
+#define __hip_erfinvb1 -2.118377725
+#define __hip_erfinvb0 1
+
+#define __hip_erfinvc3 1.641345311
+#define __hip_erfinvc2 3.429567803
+#define __hip_erfinvc1 -1.62490649
+#define __hip_erfinvc0 -1.970840454
+
+#define __hip_erfinvd2 1.637067800
+#define __hip_erfinvd1 3.543889200
+#define __hip_erfinvd0 1
+
+#define HIP_PI 3.14159265358979323846
+
+__device__ float __hip_erfinvf(float x){
+    float ret;
+    int  sign;
+    if (x < -1 || x > 1){
+        return NAN;
+    }
+    if (x == 0){
+        return 0;
+    }
+    if (x > 0){
+        sign = 1;
+    } else {
+        sign = -1;
+        x = -x;
+    }
+    if (x <= 0.7) {
+        float x1 = x * x;
+        float x2 = hc::precise_math::fmaf(__hip_erfinva3, x1, __hip_erfinva2);
+        float x3 = hc::precise_math::fmaf(x2, x1, __hip_erfinva1);
+        float x4 = x * hc::precise_math::fmaf(x3, x1, __hip_erfinva0);
+
+        float r1 = hc::precise_math::fmaf(__hip_erfinvb4, x1, __hip_erfinvb3);
+        float r2 = hc::precise_math::fmaf(r1, x1, __hip_erfinvb2);
+        float r3 = hc::precise_math::fmaf(r2, x1, __hip_erfinvb1);
+        ret = x4 / hc::precise_math::fmaf(r3, x1, __hip_erfinvb0);
+    } else {
+        float x1 = hc::precise_math::sqrtf(-hc::precise_math::logf((1 - x) / 2));
+        float x2 = hc::precise_math::fmaf(__hip_erfinvc3, x1, __hip_erfinvc2);
+        float x3 = hc::precise_math::fmaf(x2, x1, __hip_erfinvc1);
+        float x4 = hc::precise_math::fmaf(x3, x1, __hip_erfinvc0);
+
+        float r1 = hc::precise_math::fmaf(__hip_erfinvd2, x1, __hip_erfinvd1);
+        ret = x4 / hc::precise_math::fmaf(r1, x1, __hip_erfinvd0);
+    }
+
+    ret = ret * sign;
+    x = x * sign;
+
+    ret -= (hc::precise_math::erff(ret) - x) / (2 / hc::precise_math::sqrtf(HIP_PI) * hc::precise_math::expf(-ret * ret));
+    ret -= (hc::precise_math::erff(ret) - x) / (2 / hc::precise_math::sqrtf(HIP_PI) * hc::precise_math::expf(-ret * ret));
+
+    return ret;
+}
+
+__device__ double __hip_erfinv(double x){
+    double ret;
+    int  sign;
+    if (x < -1 || x > 1){
+        return NAN;
+    }
+    if (x == 0){
+        return 0;
+    }
+    if (x > 0){
+        sign = 1;
+    } else {
+        sign = -1;
+        x = -x;
+    }
+    if (x <= 0.7) {
+        double x1 = x * x;
+        double x2 = hc::precise_math::fma(__hip_erfinva3, x1, __hip_erfinva2);
+        double x3 = hc::precise_math::fma(x2, x1, __hip_erfinva1);
+        double x4 = x * hc::precise_math::fma(x3, x1, __hip_erfinva0);
+
+        double r1 = hc::precise_math::fma(__hip_erfinvb4, x1, __hip_erfinvb3);
+        double r2 = hc::precise_math::fma(r1, x1, __hip_erfinvb2);
+        double r3 = hc::precise_math::fma(r2, x1, __hip_erfinvb1);
+        ret = x4 / hc::precise_math::fma(r3, x1, __hip_erfinvb0);
+    } else {
+        double x1 = hc::precise_math::sqrt(-hc::precise_math::log((1 - x) / 2));
+        double x2 = hc::precise_math::fma(__hip_erfinvc3, x1, __hip_erfinvc2);
+        double x3 = hc::precise_math::fma(x2, x1, __hip_erfinvc1);
+        double x4 = hc::precise_math::fma(x3, x1, __hip_erfinvc0);
+
+        double r1 = hc::precise_math::fma(__hip_erfinvd2, x1, __hip_erfinvd1);
+        ret = x4 / hc::precise_math::fma(r1, x1, __hip_erfinvd0);
+    }
+
+    ret = ret * sign;
+    x = x * sign;
+
+    ret -= (hc::precise_math::erf(ret) - x) / (2 / hc::precise_math::sqrt(HIP_PI) * hc::precise_math::exp(-ret * ret));
+    ret -= (hc::precise_math::erf(ret) - x) / (2 / hc::precise_math::sqrt(HIP_PI) * hc::precise_math::exp(-ret * ret));
+
+    return ret;
+}
+
+
 __device__ float acosf(float x)
 {
    return hc::precise_math::acosf(x);
@@ -87,7 +198,10 @@ __device__ float erff(float x)
 {
    return hc::precise_math::erff(x);
 }
-__device__ float erfinvf(float y);
+__device__ float erfinvf(float y)
+{
+    return __hip_erfinvf(y);
+}
 __device__ float exp10f(float x)
 {
    return hc::precise_math::exp10f(x);
@@ -442,6 +556,10 @@ __device__ double erfc(double x)
 {
    return hc::precise_math::erfc(x);
 }
+__device__ double erfinv(double x)
+{
+    return __hip_erfinv(x);
+}
 __device__ double exp(double x)
 {
    return hc::precise_math::exp(x);
@@ -98,6 +98,10 @@ __global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
    b[tid] = rnormf(N, a);
 }

+__global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
+    int tid = hipThreadIdx_x;
+    b[tid] = erff(erfinvf(a[tid]));
+}


 bool run_sincosf(){
@@ -591,13 +595,39 @@ assert(passed == 1);
 return false;
 }

+bool run_erfinvf(){
+float *A, *Ad, *B, *Bd;
+A = new float[N];
+B = new float[N];
+for(int i=0;i<N;i++){
+A[i] = -0.6f;
+B[i] = 0.0f;
+}
+hipMalloc((void**)&Ad, SIZE);
+hipMalloc((void**)&Bd, SIZE);
+hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+hipLaunchKernel(test_erfinvf, dim3(1), dim3(N), 0, 0, Ad, Bd);
+hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
+int passed = 0;
+for(int i=0;i<512;i++){
+    if(B[i] - A[i] < 0.000001){
+        passed = 1;
+    }
+}
+free(A);
+if(passed == 1){
+    return true;
+}
+assert(passed == 1);
+return false;
+}

 int main(){
 if(run_sincosf() && run_sincospif() && run_fdividef() && 
   run_llrintf() && run_norm3df() && run_norm4df() &&
   run_normf() && run_rnorm3df() && run_rnorm4df() &&
   run_rnormf() && run_lroundf() && run_llroundf() &&
-   run_rintf() && run_rhypotf()
+   run_rintf() && run_rhypotf() && run_erfinvf()
 ){
 passed();
 }
@@ -88,6 +88,11 @@ __global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
    b[tid] = rnorm(N, a);
 }

+__global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
+    int tid = hipThreadIdx_x;
+    b[tid] = erf(erfinv(a[tid]));
+}
+
 bool run_sincos(){
 double *A, *Ad, *B, *C, *Bd, *Cd;
 A = new double[N];
@@ -517,12 +522,38 @@ assert(passed == 1);
 return false;
 }

+bool run_erfinv(){
+double *A, *Ad, *B, *Bd;
+A = new double[N];
+B = new double[N];
+for(int i=0;i<N;i++){
+A[i] = -0.6;
+B[i] = 0.0;
+}
+hipMalloc((void**)&Ad, SIZE);
+hipMalloc((void**)&Bd, SIZE);
+hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+hipLaunchKernel(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
+hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
+int passed = 0;
+for(int i=0;i<512;i++){
+    if(B[i] - A[i] < 0.000001){
+        passed = 1;
+    }
+}
+free(A);
+if(passed == 1){
+    return true;
+}
+assert(passed == 1);
+return false;
+}

 int main(){
 if(run_sincos() && run_sincospi() && run_llrint() && run_norm3d() && run_norm4d() &&
   run_rnorm3d() && run_rnorm4d() &&
   run_rnorm() && run_lround() && run_llround() &&
-   run_rint() && run_rhypot()
+   run_rint() && run_rhypot() && run_erfinv()
 ){
 passed();
 }