Merge pull request #415 from deven-amd/master

Checkin to fix bugs in math functions.

[ROCm/hip commit: e17ade3afb]
Этот коммит содержится в:
Maneesh Gupta
2018-05-01 12:29:03 +05:30
коммит произвёл GitHub
родитель b272e57c33 43c2724907
Коммит ef13aa3a0b
5 изменённых файлов: 299 добавлений и 16 удалений
+1
Просмотреть файл
@@ -57,6 +57,7 @@ __device__ float exp2f(float x);
__device__ float expf(float x);
__device__ float expm1f(float x);
__device__ int abs(int x);
__device__ long long abs(long long x);
__device__ float fabsf(float x);
__device__ float fdimf(float x, float y);
__device__ float fdividef(float x, float y);
+16 -8
Просмотреть файл
@@ -29,10 +29,6 @@ struct hipHalfHolder {
};
};
#define HINF 65504
__device__ static struct hipHalfHolder __hInfValue = {HINF};
__device__ __half __hadd(__half a, __half b) { return a + b; }
__device__ __half __hadd_sat(__half a, __half b) { return a + b; }
@@ -63,9 +59,21 @@ __device__ bool __hge(__half a, __half b) { return a >= b ? true : false; }
__device__ bool __hgt(__half a, __half b) { return a > b ? true : false; }
__device__ bool __hisinf(__half a) { return a == HINF ? true : false; }
__device__ bool __hisinf(__half a) {
hipHalfHolder hH;
hH.h = a;
// mask with 0x7fff to drop the sign bit
// 0x7c00 is bit pattern for inf (exp = 11111, significand = 0)
return ((hH.s & 0x7fff) == 0x7c00) ? true : false;
}
__device__ bool __hisnan(__half a) { return a > HINF ? true : false; }
__device__ bool __hisnan(__half a) {
hipHalfHolder hH;
hH.h = a;
// mask with 0x7fff to drop the sign bit
// 0x7cXX is bit pattern for inf (exp = 11111, significand = 0)
return ((hH.s & 0x7fff) > 0x7c00) ? true : false;
}
__device__ bool __hle(__half a, __half b) { return a <= b ? true : false; }
@@ -124,8 +132,8 @@ __device__ __half2 __hgt2(__half2 a, __half2 b) {
__device__ __half2 __hisnan2(__half2 a) {
__half2 c;
c.x = (a.x > HINF) ? (__half)1 : (__half)0;
c.y = (a.y > HINF) ? (__half)1 : (__half)0;
c.x = (__hisnan(a.x)) ? (__half)1 : (__half)0;
c.y = (__hisnan(a.y)) ? (__half)1 : (__half)0;
return c;
}
+4 -8
Просмотреть файл
@@ -56,6 +56,9 @@ __device__ float expm1f(float x) { return hc::precise_math::expm1f(x); }
__device__ int abs(int x) {
return x >= 0 ? x : -x; // TODO - optimize with OCML
}
__device__ long long abs(long long x) {
return x >= 0 ? x : -x;
}
__device__ float fabsf(float x) { return hc::precise_math::fabsf(x); }
__device__ float fdimf(float x, float y) { return hc::precise_math::fdimf(x, y); }
__device__ float fdividef(float x, float y) { return x / y; }
@@ -220,14 +223,7 @@ __device__ double j0(double x) { return __hip_j0(x); }
__device__ double j1(double x) { return __hip_j1(x); }
__device__ double jn(int n, double x) { return __hip_jn(n, x); }
__device__ double ldexp(double x, int exp) { return hc::precise_math::ldexp(x, exp); }
__device__ double lgamma(double x) {
double val = 0.0;
double y = x - 1;
while (y > 0) {
val += log(y--);
}
return val;
}
__device__ double lgamma(double x) { return hc::precise_math::lgamma(x); }
__device__ long long int llrint(double x) {
long long int y = hc::precise_math::round(x);
return y;
+133
Просмотреть файл
@@ -20,12 +20,145 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* RUN: %t
* HIT_END
*/
#include "hip/hip_runtime.h"
#include "test_common.h"
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__
__global__ void kernel_abs_int64(hipLaunchParm lp, long long *input, long long *output) {
int tx = threadIdx.x;
output[tx] = abs(input[tx]);
}
__global__ void kernel_lgamma_double(hipLaunchParm lp, double *input, double *output) {
int tx = threadIdx.x;
output[tx] = lgamma(input[tx]);
}
#endif
#define CHECK_LGAMMA_DOUBLE(IN, OUT, EXP) \
{ \
if (OUT != EXP) { \
failed("check_abs_int64 failed on %f (output = %f, expected = %fd)\n", IN, OUT, EXP); \
} \
}
#define CHECK_ABS_INT64(IN, OUT, EXP) \
{ \
if (OUT != EXP) { \
failed("check_abs_int64 failed on %lld (output = %lld, expected = %lld)\n", IN, OUT, EXP); \
} \
}
void check_lgamma_double() {
using datatype_t = double;
const int NUM_INPUTS = 8;
auto memsize = NUM_INPUTS * sizeof(datatype_t);
// allocate memories
datatype_t *inputCPU = (datatype_t *) malloc(memsize);
datatype_t *outputCPU = (datatype_t *) malloc(memsize);
datatype_t *inputGPU = nullptr; hipMalloc((void**)&inputGPU, memsize);
datatype_t *outputGPU = nullptr; hipMalloc((void**)&outputGPU, memsize);
// populate input
for (int i=0; i<NUM_INPUTS; i++) {
inputCPU[i] = -3.5 + i;
}
// copy inputs to device
hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
// launch kernel
hipLaunchKernel(kernel_lgamma_double, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
// copy outputs from device
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
// check outputs
for (int i=0; i<NUM_INPUTS; i++) {
CHECK_LGAMMA_DOUBLE(inputCPU[i], outputCPU[i], lgamma(inputCPU[i]));
}
// free memories
hipFree(inputGPU);
hipFree(outputGPU);
free(inputCPU);
free(outputCPU);
// done
return;
}
void check_abs_int64() {
using datatype_t = long long;
const int NUM_INPUTS = 8;
auto memsize = NUM_INPUTS * sizeof(datatype_t);
// allocate memories
datatype_t *inputCPU = (datatype_t *) malloc(memsize);
datatype_t *outputCPU = (datatype_t *) malloc(memsize);
datatype_t *inputGPU = nullptr; hipMalloc((void**)&inputGPU, memsize);
datatype_t *outputGPU = nullptr; hipMalloc((void**)&outputGPU, memsize);
// populate input
inputCPU[0] = -81985529216486895ll;
inputCPU[1] = 81985529216486895ll;
inputCPU[2] = -1250999896491ll;
inputCPU[3] = 1250999896491ll;
inputCPU[4] = -19088743ll;
inputCPU[5] = 19088743ll;
inputCPU[6] = -291ll;
inputCPU[7] = 291ll;
// copy inputs to device
hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
// launch kernel
hipLaunchKernel(kernel_abs_int64, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
// copy outputs from device
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
// check outputs
CHECK_ABS_INT64(inputCPU[0], outputCPU[0], outputCPU[1]);
CHECK_ABS_INT64(inputCPU[1], outputCPU[1], outputCPU[1]);
CHECK_ABS_INT64(inputCPU[2], outputCPU[2], outputCPU[3]);
CHECK_ABS_INT64(inputCPU[3], outputCPU[3], outputCPU[3]);
CHECK_ABS_INT64(inputCPU[4], outputCPU[4], outputCPU[5]);
CHECK_ABS_INT64(inputCPU[5], outputCPU[5], outputCPU[5]);
CHECK_ABS_INT64(inputCPU[6], outputCPU[6], outputCPU[7]);
CHECK_ABS_INT64(inputCPU[7], outputCPU[7], outputCPU[7]);
// free memories
hipFree(inputGPU);
hipFree(outputGPU);
free(inputCPU);
free(outputCPU);
// done
return;
}
int main(int argc, char* argv[]) {
HipTest::parseStandardArguments(argc, argv, true);
check_abs_int64();
// check_lgamma_double();
passed();
}
+145
Просмотреть файл
@@ -17,6 +17,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* RUN: %t
* HIT_END
*/
#include <iostream>
#include <hip/hip_fp16.h>
#include "hip/hip_runtime.h"
@@ -59,8 +65,143 @@ __global__ void __half2Math(hipLaunchParm lp, __half2* A, __half2* B, __half2* C
c = __hmul2_sat(b, c);
}
__global__ void kernel_hisnan(hipLaunchParm lp, __half* input, int* output) {
int tx = threadIdx.x;
output[tx] = __hisnan(input[tx]);
}
__global__ void kernel_hisinf(hipLaunchParm lp, __half* input, int* output) {
int tx = threadIdx.x;
output[tx] = __hisinf(input[tx]);
}
#endif
__half host_ushort_as_half(unsigned short s) {
union {__half h; unsigned short s; } converter;
converter.s = s;
return converter.h;
}
void check_hisnan(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) {
// allocate memory
auto memsize = NUM_INPUTS * sizeof(int);
int* outputGPU = nullptr;
hipMalloc((void**)&outputGPU, memsize);
// launch the kernel
hipLaunchKernel(kernel_hisnan, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
// copy output from device
int* outputCPU = (int*) malloc(memsize);
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
// check output
for (int i=0; i<NUM_INPUTS; i++) {
if ((2 <= i) && (i <= 5)) { // inputs are nan, output should be true
if (outputCPU[i] == 0) {
failed("__hisnan() returned false for %f (input idx = %d)\n", inputCPU[i], i);
}
}
else { // inputs are NOT nan, output should be false
if (outputCPU[i] != 0) {
failed("__hisnan() returned true for %f (input idx = %d)\n", inputCPU[i], i);
}
}
}
// free memory
free(outputCPU);
hipFree(outputGPU);
// done
return;
}
void check_hisinf(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) {
// allocate memory
auto memsize = NUM_INPUTS * sizeof(int);
int* outputGPU = nullptr;
hipMalloc((void**)&outputGPU, memsize);
// launch the kernel
hipLaunchKernel(kernel_hisinf, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
// copy output from device
int* outputCPU = (int*) malloc(memsize);
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
// check output
for (int i=0; i<NUM_INPUTS; i++) {
if ((0 <= i) && (i <= 1)) { // inputs are inf, output should be true
if (outputCPU[i] == 0) {
failed("__hisinf() returned false for %f (input idx = %d)\n", inputCPU[i], i);
}
}
else { // inputs are NOT inf, output should be false
if (outputCPU[i] != 0) {
failed("__hisinf() returned true for %f (input idx = %d)\n", inputCPU[i], i);
}
}
}
// free memory
free(outputCPU);
hipFree(outputGPU);
// done
return;
}
void checkFunctional() {
// allocate memory
const int NUM_INPUTS = 16;
auto memsize = NUM_INPUTS * sizeof(__half);
__half* inputCPU = (__half*) malloc(memsize);
// populate inputs
inputCPU[0] = host_ushort_as_half(0x7c00); // inf
inputCPU[1] = host_ushort_as_half(0xfc00); // -inf
inputCPU[2] = host_ushort_as_half(0x7c01); // nan
inputCPU[3] = host_ushort_as_half(0x7e00); // nan
inputCPU[4] = host_ushort_as_half(0xfc01); // nan
inputCPU[5] = host_ushort_as_half(0xfe00); // nan
inputCPU[6] = host_ushort_as_half(0x0000); // 0
inputCPU[7] = host_ushort_as_half(0x8000); // -0
inputCPU[8] = host_ushort_as_half(0x7bff); // max +ve normal
inputCPU[9] = host_ushort_as_half(0xfbff); // max -ve normal
inputCPU[10] = host_ushort_as_half(0x0400); // min +ve normal
inputCPU[11] = host_ushort_as_half(0x8400); // min -ve normal
inputCPU[12] = host_ushort_as_half(0x03ff); // max +ve sub-normal
inputCPU[13] = host_ushort_as_half(0x83ff); // max -ve sub-normal
inputCPU[14] = host_ushort_as_half(0x0001); // min +ve sub-normal
inputCPU[15] = host_ushort_as_half(0x8001); // min -ve sub-normal
// copy inputs to the GPU
__half* inputGPU = nullptr;
hipMalloc((void**)&inputGPU, memsize);
hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
// run checks
check_hisnan(NUM_INPUTS, inputCPU, inputGPU);
check_hisinf(NUM_INPUTS, inputCPU, inputGPU);
// free memory
hipFree(inputGPU);
free(inputCPU);
// all done
return;
}
int main() {
__half *A, *B, *C;
hipMalloc(&A, HALF_SIZE);
@@ -78,5 +219,9 @@ int main() {
hipFree(A2);
hipFree(B2);
hipFree(C2);
// run some functional checks
checkFunctional();
passed();
}