Merge pull request #415 from deven-amd/master
Checkin to fix bugs in math functions.
[ROCm/hip commit: e17ade3afb]
Этот коммит содержится в:
@@ -57,6 +57,7 @@ __device__ float exp2f(float x);
|
||||
__device__ float expf(float x);
|
||||
__device__ float expm1f(float x);
|
||||
__device__ int abs(int x);
|
||||
__device__ long long abs(long long x);
|
||||
__device__ float fabsf(float x);
|
||||
__device__ float fdimf(float x, float y);
|
||||
__device__ float fdividef(float x, float y);
|
||||
|
||||
@@ -29,10 +29,6 @@ struct hipHalfHolder {
|
||||
};
|
||||
};
|
||||
|
||||
#define HINF 65504
|
||||
|
||||
__device__ static struct hipHalfHolder __hInfValue = {HINF};
|
||||
|
||||
__device__ __half __hadd(__half a, __half b) { return a + b; }
|
||||
|
||||
__device__ __half __hadd_sat(__half a, __half b) { return a + b; }
|
||||
@@ -63,9 +59,21 @@ __device__ bool __hge(__half a, __half b) { return a >= b ? true : false; }
|
||||
|
||||
__device__ bool __hgt(__half a, __half b) { return a > b ? true : false; }
|
||||
|
||||
__device__ bool __hisinf(__half a) { return a == HINF ? true : false; }
|
||||
__device__ bool __hisinf(__half a) {
|
||||
hipHalfHolder hH;
|
||||
hH.h = a;
|
||||
// mask with 0x7fff to drop the sign bit
|
||||
// 0x7c00 is bit pattern for inf (exp = 11111, significand = 0)
|
||||
return ((hH.s & 0x7fff) == 0x7c00) ? true : false;
|
||||
}
|
||||
|
||||
__device__ bool __hisnan(__half a) { return a > HINF ? true : false; }
|
||||
__device__ bool __hisnan(__half a) {
|
||||
hipHalfHolder hH;
|
||||
hH.h = a;
|
||||
// mask with 0x7fff to drop the sign bit
|
||||
// 0x7cXX is bit pattern for inf (exp = 11111, significand = 0)
|
||||
return ((hH.s & 0x7fff) > 0x7c00) ? true : false;
|
||||
}
|
||||
|
||||
__device__ bool __hle(__half a, __half b) { return a <= b ? true : false; }
|
||||
|
||||
@@ -124,8 +132,8 @@ __device__ __half2 __hgt2(__half2 a, __half2 b) {
|
||||
|
||||
__device__ __half2 __hisnan2(__half2 a) {
|
||||
__half2 c;
|
||||
c.x = (a.x > HINF) ? (__half)1 : (__half)0;
|
||||
c.y = (a.y > HINF) ? (__half)1 : (__half)0;
|
||||
c.x = (__hisnan(a.x)) ? (__half)1 : (__half)0;
|
||||
c.y = (__hisnan(a.y)) ? (__half)1 : (__half)0;
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
@@ -56,6 +56,9 @@ __device__ float expm1f(float x) { return hc::precise_math::expm1f(x); }
|
||||
__device__ int abs(int x) {
|
||||
return x >= 0 ? x : -x; // TODO - optimize with OCML
|
||||
}
|
||||
__device__ long long abs(long long x) {
|
||||
return x >= 0 ? x : -x;
|
||||
}
|
||||
__device__ float fabsf(float x) { return hc::precise_math::fabsf(x); }
|
||||
__device__ float fdimf(float x, float y) { return hc::precise_math::fdimf(x, y); }
|
||||
__device__ float fdividef(float x, float y) { return x / y; }
|
||||
@@ -220,14 +223,7 @@ __device__ double j0(double x) { return __hip_j0(x); }
|
||||
__device__ double j1(double x) { return __hip_j1(x); }
|
||||
__device__ double jn(int n, double x) { return __hip_jn(n, x); }
|
||||
__device__ double ldexp(double x, int exp) { return hc::precise_math::ldexp(x, exp); }
|
||||
__device__ double lgamma(double x) {
|
||||
double val = 0.0;
|
||||
double y = x - 1;
|
||||
while (y > 0) {
|
||||
val += log(y--);
|
||||
}
|
||||
return val;
|
||||
}
|
||||
__device__ double lgamma(double x) { return hc::precise_math::lgamma(x); }
|
||||
__device__ long long int llrint(double x) {
|
||||
long long int y = hc::precise_math::round(x);
|
||||
return y;
|
||||
|
||||
@@ -20,12 +20,145 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
|
||||
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__
|
||||
|
||||
__global__ void kernel_abs_int64(hipLaunchParm lp, long long *input, long long *output) {
|
||||
int tx = threadIdx.x;
|
||||
output[tx] = abs(input[tx]);
|
||||
}
|
||||
|
||||
__global__ void kernel_lgamma_double(hipLaunchParm lp, double *input, double *output) {
|
||||
int tx = threadIdx.x;
|
||||
output[tx] = lgamma(input[tx]);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define CHECK_LGAMMA_DOUBLE(IN, OUT, EXP) \
|
||||
{ \
|
||||
if (OUT != EXP) { \
|
||||
failed("check_abs_int64 failed on %f (output = %f, expected = %fd)\n", IN, OUT, EXP); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CHECK_ABS_INT64(IN, OUT, EXP) \
|
||||
{ \
|
||||
if (OUT != EXP) { \
|
||||
failed("check_abs_int64 failed on %lld (output = %lld, expected = %lld)\n", IN, OUT, EXP); \
|
||||
} \
|
||||
}
|
||||
|
||||
void check_lgamma_double() {
|
||||
|
||||
using datatype_t = double;
|
||||
|
||||
const int NUM_INPUTS = 8;
|
||||
auto memsize = NUM_INPUTS * sizeof(datatype_t);
|
||||
|
||||
// allocate memories
|
||||
datatype_t *inputCPU = (datatype_t *) malloc(memsize);
|
||||
datatype_t *outputCPU = (datatype_t *) malloc(memsize);
|
||||
datatype_t *inputGPU = nullptr; hipMalloc((void**)&inputGPU, memsize);
|
||||
datatype_t *outputGPU = nullptr; hipMalloc((void**)&outputGPU, memsize);
|
||||
|
||||
// populate input
|
||||
for (int i=0; i<NUM_INPUTS; i++) {
|
||||
inputCPU[i] = -3.5 + i;
|
||||
}
|
||||
|
||||
// copy inputs to device
|
||||
hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
|
||||
|
||||
// launch kernel
|
||||
hipLaunchKernel(kernel_lgamma_double, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
|
||||
|
||||
// copy outputs from device
|
||||
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
|
||||
|
||||
// check outputs
|
||||
for (int i=0; i<NUM_INPUTS; i++) {
|
||||
CHECK_LGAMMA_DOUBLE(inputCPU[i], outputCPU[i], lgamma(inputCPU[i]));
|
||||
}
|
||||
|
||||
// free memories
|
||||
hipFree(inputGPU);
|
||||
hipFree(outputGPU);
|
||||
free(inputCPU);
|
||||
free(outputCPU);
|
||||
|
||||
// done
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void check_abs_int64() {
|
||||
|
||||
using datatype_t = long long;
|
||||
|
||||
const int NUM_INPUTS = 8;
|
||||
auto memsize = NUM_INPUTS * sizeof(datatype_t);
|
||||
|
||||
// allocate memories
|
||||
datatype_t *inputCPU = (datatype_t *) malloc(memsize);
|
||||
datatype_t *outputCPU = (datatype_t *) malloc(memsize);
|
||||
datatype_t *inputGPU = nullptr; hipMalloc((void**)&inputGPU, memsize);
|
||||
datatype_t *outputGPU = nullptr; hipMalloc((void**)&outputGPU, memsize);
|
||||
|
||||
// populate input
|
||||
inputCPU[0] = -81985529216486895ll;
|
||||
inputCPU[1] = 81985529216486895ll;
|
||||
inputCPU[2] = -1250999896491ll;
|
||||
inputCPU[3] = 1250999896491ll;
|
||||
inputCPU[4] = -19088743ll;
|
||||
inputCPU[5] = 19088743ll;
|
||||
inputCPU[6] = -291ll;
|
||||
inputCPU[7] = 291ll;
|
||||
|
||||
// copy inputs to device
|
||||
hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
|
||||
|
||||
// launch kernel
|
||||
hipLaunchKernel(kernel_abs_int64, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
|
||||
|
||||
// copy outputs from device
|
||||
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
|
||||
|
||||
// check outputs
|
||||
CHECK_ABS_INT64(inputCPU[0], outputCPU[0], outputCPU[1]);
|
||||
CHECK_ABS_INT64(inputCPU[1], outputCPU[1], outputCPU[1]);
|
||||
CHECK_ABS_INT64(inputCPU[2], outputCPU[2], outputCPU[3]);
|
||||
CHECK_ABS_INT64(inputCPU[3], outputCPU[3], outputCPU[3]);
|
||||
CHECK_ABS_INT64(inputCPU[4], outputCPU[4], outputCPU[5]);
|
||||
CHECK_ABS_INT64(inputCPU[5], outputCPU[5], outputCPU[5]);
|
||||
CHECK_ABS_INT64(inputCPU[6], outputCPU[6], outputCPU[7]);
|
||||
CHECK_ABS_INT64(inputCPU[7], outputCPU[7], outputCPU[7]);
|
||||
|
||||
// free memories
|
||||
hipFree(inputGPU);
|
||||
hipFree(outputGPU);
|
||||
free(inputCPU);
|
||||
free(outputCPU);
|
||||
|
||||
// done
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
|
||||
check_abs_int64();
|
||||
|
||||
// check_lgamma_double();
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -17,6 +17,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include "hip/hip_runtime.h"
|
||||
@@ -59,8 +65,143 @@ __global__ void __half2Math(hipLaunchParm lp, __half2* A, __half2* B, __half2* C
|
||||
c = __hmul2_sat(b, c);
|
||||
}
|
||||
|
||||
__global__ void kernel_hisnan(hipLaunchParm lp, __half* input, int* output) {
|
||||
int tx = threadIdx.x;
|
||||
output[tx] = __hisnan(input[tx]);
|
||||
}
|
||||
|
||||
__global__ void kernel_hisinf(hipLaunchParm lp, __half* input, int* output) {
|
||||
int tx = threadIdx.x;
|
||||
output[tx] = __hisinf(input[tx]);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
__half host_ushort_as_half(unsigned short s) {
|
||||
union {__half h; unsigned short s; } converter;
|
||||
converter.s = s;
|
||||
return converter.h;
|
||||
}
|
||||
|
||||
|
||||
void check_hisnan(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) {
|
||||
|
||||
// allocate memory
|
||||
auto memsize = NUM_INPUTS * sizeof(int);
|
||||
int* outputGPU = nullptr;
|
||||
hipMalloc((void**)&outputGPU, memsize);
|
||||
|
||||
// launch the kernel
|
||||
hipLaunchKernel(kernel_hisnan, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
|
||||
|
||||
// copy output from device
|
||||
int* outputCPU = (int*) malloc(memsize);
|
||||
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
|
||||
|
||||
// check output
|
||||
for (int i=0; i<NUM_INPUTS; i++) {
|
||||
if ((2 <= i) && (i <= 5)) { // inputs are nan, output should be true
|
||||
if (outputCPU[i] == 0) {
|
||||
failed("__hisnan() returned false for %f (input idx = %d)\n", inputCPU[i], i);
|
||||
}
|
||||
}
|
||||
else { // inputs are NOT nan, output should be false
|
||||
if (outputCPU[i] != 0) {
|
||||
failed("__hisnan() returned true for %f (input idx = %d)\n", inputCPU[i], i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// free memory
|
||||
free(outputCPU);
|
||||
hipFree(outputGPU);
|
||||
|
||||
// done
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void check_hisinf(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) {
|
||||
// allocate memory
|
||||
auto memsize = NUM_INPUTS * sizeof(int);
|
||||
int* outputGPU = nullptr;
|
||||
hipMalloc((void**)&outputGPU, memsize);
|
||||
|
||||
// launch the kernel
|
||||
hipLaunchKernel(kernel_hisinf, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
|
||||
|
||||
// copy output from device
|
||||
int* outputCPU = (int*) malloc(memsize);
|
||||
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
|
||||
|
||||
// check output
|
||||
for (int i=0; i<NUM_INPUTS; i++) {
|
||||
if ((0 <= i) && (i <= 1)) { // inputs are inf, output should be true
|
||||
if (outputCPU[i] == 0) {
|
||||
failed("__hisinf() returned false for %f (input idx = %d)\n", inputCPU[i], i);
|
||||
}
|
||||
}
|
||||
else { // inputs are NOT inf, output should be false
|
||||
if (outputCPU[i] != 0) {
|
||||
failed("__hisinf() returned true for %f (input idx = %d)\n", inputCPU[i], i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// free memory
|
||||
free(outputCPU);
|
||||
hipFree(outputGPU);
|
||||
|
||||
// done
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void checkFunctional() {
|
||||
|
||||
// allocate memory
|
||||
const int NUM_INPUTS = 16;
|
||||
auto memsize = NUM_INPUTS * sizeof(__half);
|
||||
__half* inputCPU = (__half*) malloc(memsize);
|
||||
|
||||
// populate inputs
|
||||
inputCPU[0] = host_ushort_as_half(0x7c00); // inf
|
||||
inputCPU[1] = host_ushort_as_half(0xfc00); // -inf
|
||||
inputCPU[2] = host_ushort_as_half(0x7c01); // nan
|
||||
inputCPU[3] = host_ushort_as_half(0x7e00); // nan
|
||||
inputCPU[4] = host_ushort_as_half(0xfc01); // nan
|
||||
inputCPU[5] = host_ushort_as_half(0xfe00); // nan
|
||||
inputCPU[6] = host_ushort_as_half(0x0000); // 0
|
||||
inputCPU[7] = host_ushort_as_half(0x8000); // -0
|
||||
inputCPU[8] = host_ushort_as_half(0x7bff); // max +ve normal
|
||||
inputCPU[9] = host_ushort_as_half(0xfbff); // max -ve normal
|
||||
inputCPU[10] = host_ushort_as_half(0x0400); // min +ve normal
|
||||
inputCPU[11] = host_ushort_as_half(0x8400); // min -ve normal
|
||||
inputCPU[12] = host_ushort_as_half(0x03ff); // max +ve sub-normal
|
||||
inputCPU[13] = host_ushort_as_half(0x83ff); // max -ve sub-normal
|
||||
inputCPU[14] = host_ushort_as_half(0x0001); // min +ve sub-normal
|
||||
inputCPU[15] = host_ushort_as_half(0x8001); // min -ve sub-normal
|
||||
|
||||
// copy inputs to the GPU
|
||||
__half* inputGPU = nullptr;
|
||||
hipMalloc((void**)&inputGPU, memsize);
|
||||
hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
|
||||
|
||||
// run checks
|
||||
|
||||
check_hisnan(NUM_INPUTS, inputCPU, inputGPU);
|
||||
|
||||
check_hisinf(NUM_INPUTS, inputCPU, inputGPU);
|
||||
|
||||
// free memory
|
||||
hipFree(inputGPU);
|
||||
free(inputCPU);
|
||||
|
||||
// all done
|
||||
return;
|
||||
}
|
||||
|
||||
int main() {
|
||||
__half *A, *B, *C;
|
||||
hipMalloc(&A, HALF_SIZE);
|
||||
@@ -78,5 +219,9 @@ int main() {
|
||||
hipFree(A2);
|
||||
hipFree(B2);
|
||||
hipFree(C2);
|
||||
|
||||
// run some functional checks
|
||||
checkFunctional();
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user