Added script for generating math api docs

1. Commented out unsupported device math functions. 2. Moved function signatures to the top of implementation snippets. 3. Added a script to generate Markdown documentation for the device math APIs. 4. Added the file generated by the script, which should always be present. Change-Id: Ic579dd8b8fdffa6e1b4d4f5f3fd8a803f4dcaac7 [ROCm/clr commit: 3d4dcee35d]
2017-01-18 14:40:50 -06:00
@@ -0,0 +1,508 @@
+"""
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+"""
+1. This file requires Python 3 to run
+
+List of device functions:
+acosf
+acoshf
+asinf
+asinhf
+atan2f
+atanf
+atanhf
+cbrtf
+ceilf
+copysignf
+cosf
+coshf
+cospif
+cyl_bessel_i0f
+cyl_bessel_i1f
+erfcf
+erfcinvf
+erfcxf
+erff
+erfinvf
+exp10f
+exp2f
+expf
+expm1f
+fabsf
+fdimf
+fdividef
+floorf
+fmaf
+fmaxf
+fminf
+fmodf
+frexpf
+hypotf
+ilogbf
+isfinite
+isinf
+isnan
+j0f
+j1f
+jnf
+ldexpf
+lgammaf
+llrintf
+llroundf
+log10f
+log1pf
+logbf
+lrintf
+lroundf
+modff
+nanf
+nearbyintf
+nextafterf
+norm3df
+norm4df
+normcdff
+normcdfinvf
+normf
+powf
+rcbrtf
+remainderf
+remquof
+rhypotf
+rintf
+rnorm3df
+rnorm4df
+rnormf
+roundf
+rsqrtf
+scalblnf
+scalbnf
+signbit
+sincosf
+sincospif
+sinf
+sinhf
+sinpif
+sqrtf
+tanf
+tanhf
+tgammaf
+truncf
+y0f
+y1f
+ynf
+acos
+acosh
+asin
+asinh
+atan
+atan2
+atanh
+cbrt
+ceil
+copysign
+cos
+cosh
+cospi
+cyl_bessel_i0
+cyl_bessel_i1
+erf
+erfc
+erfcinv
+erfcx
+erfinv
+exp
+exp10
+exp2
+expm1
+fabs
+fdim
+floor
+fma
+fmax
+fmin
+fmod
+frexp
+hypot
+ilogb
+isfinite
+isinf
+isnan
+j0
+j1
+jn
+ldexp
+lgamma
+llrint
+llround
+log
+log10
+log1p
+log2
+logb
+lrint
+lround
+modf
+nan
+nearbyint
+nextafter
+norm
+norm3d
+norm4d
+normcdf
+normcdfinv
+pow
+rcbrt
+remainder
+remquo
+rhypot
+rint
+rnorm
+rnorm3d
+rnorm4d
+round
+rsqrt
+scalbln
+scalbn
+signbit
+sin
+sincos
+sincospi
+sinh
+sinpi
+sqrt
+tan
+tanh
+tgamma
+trunc
+y0
+y1
+yn
+__cosf
+__exp10f
+__expf
+__fadd_rd
+__fadd_rn
+__fadd_ru
+__fadd_rz
+__fdiv_rd
+__fdiv_rn
+__fdiv_ru
+__fdiv_rz
+__fdividef
+__fmaf_rd
+__fmaf_rn
+__fmaf_ru
+__fmaf_rz
+__fmul_rd
+__fmul_rn
+__fmul_ru
+__fmul_rz
+__frcp_rd
+__frcp_rn
+__frcp_ru
+__frcp_rz
+__frsqrt_rn
+__fsqrt_rd
+__fsqrt_rn
+__fsqrt_ru
+__fsqrt_rz
+__fsub_rd
+__fsub_rn
+__fsub_ru
+__log10f
+__log2f
+__logf
+__powf
+__saturatef
+__sincosf
+__sinf
+__tanf
+__dadd_rd
+__dadd_rn
+__dadd_ru
+__dadd_rz
+__ddiv_rd
+__ddiv_rn
+__ddiv_ru
+__ddiv_rz
+__dmul_rd
+__dmul_rn
+__dmul_ru
+__dmul_rz
+__drcp_rd
+__drcp_rn
+__drcp_ru
+__drcp_rz
+__dsqrt_rd
+__dsqrt_rn
+__dsqrt_ru
+__dsqrt_rz
+__dsub_rd
+__dsub_rn
+__dsub_ru
+__dsub_rz
+__fma_rd
+__fma_rn
+__fma_ru
+__fma_rz
+__brev
+__brevll
+__byte_perm
+__clz
+__clzll
+__ffs
+__ffsll
+__hadd
+__mul24
+__mul64hi
+__mulhi
+__popc
+__popcll
+__rhadd
+__sad
+__uhadd
+__umul24
+__umul64hi
+__umulhi
+__urhadd
+__usad
+__double2float_rd
+__double2float_rn
+__double2float_ru
+__double2float_rz
+__double2hiint
+__double2int_rd
+__double2int_rn
+__double2int_ru
+__double2int_rz
+__double2ll_rd
+__double2ll_rn
+__double2ll_ru
+__double2ll_rz
+__double2loint
+__double2uint_rd
+__double2uint_rn
+__double2uint_ru
+__double2uint_rz
+__double2ull_rd
+__double2ull_rn
+__double2ull_ru
+__double2ull_rz
+__double_as_longlong
+__float2half_rn
+__half2float
+__float2half_rn
+__half2float
+__float2int_rd
+__float2int_rn
+__float2int_ru
+__float2int_rz
+__float2ll_rd
+__float2ll_rn
+__float2ll_ru
+__float2ll_rz
+__float2uint_rd
+__float2uint_rn
+__float2uint_ru
+__float2uint_rz
+__float2ull_rd
+__float2ull_rn
+__float2ull_ru
+__float2ull_rz
+__float_as_int
+__float_as_uint
+__hiloint2double
+__int2double_rn
+__int2float_rd
+__int2float_rn
+__int2float_ru
+__int2float_rz
+__int_as_float
+__ll2double_rd
+__ll2double_rn
+__ll2double_ru
+__ll2double_rz
+__ll2float_rd
+__ll2float_rn
+__ll2float_ru
+__ll2float_rz
+__longlong_as_double
+__uint2double_rn
+__uint2float_rd
+__uint2float_rn
+__uint2float_ru
+__uint2float_rz
+__uint_as_float
+__ull2double_rd
+__ull2double_rn
+__ull2double_ru
+__ull2double_rz
+__ull2float_rd
+__ull2float_rn
+__ull2float_ru
+__ull2float_rz
+__heq
+__hge
+__hgt
+__hisinf
+__hisnan
+__hle
+__hlt
+__hne
+__hbeq2
+__hbge2
+__hbgt2
+__hble2
+__hblt2
+__hbne2
+__heq2
+__hge2
+__hgt2
+__hisnan2
+__hle2
+__hlt2
+__hne2
+__float22half2_rn
+__float2half
+__float2half2_rn
+__float2half_rd
+__float2half_rn
+__float2half_ru
+__float2half_rz
+__floats2half2_rn
+__half22float2
+__half2float
+__half2half2
+__half2int_rd
+__half2int_rn
+__half2int_ru
+__half2int_rz
+__half2ll_rd
+__half2ll_rn
+__half2ll_ru
+__half2ll_rz
+__half2short_rd
+__half2short_rn
+__half2short_ru
+__half2short_rz
+__half2uint_rd
+__half2uint_rn
+__half2uint_ru
+__half2uint_rz
+__half2ull_rd
+__half2ull_rn
+__half2ull_ru
+__half2ull_rz
+__half2ushort_rd
+__half2ushort_rn
+__half2ushort_ru
+__half2ushort_rz
+__half_as_short
+__half_as_ushort
+__halves2half2
+__high2float
+__high2half
+__high2half2
+__highs2half2
+__int2half_rd
+__int2half_rn
+__int2half_ru
+__int2half_rz
+__ll2half_rd
+__ll2half_rn
+__ll2half_ru
+__ll2half_rz
+__low2float
+__low2half
+__low2half2
+__low2half2
+__lowhigh2highlow
+__lows2half2
+__short2half_rd
+__short2half_rn
+__short2half_ru
+__short2half_rz
+__uint2half_rd
+__uint2half_rn
+__uint2half_ru
+__uint2half_rz
+__ull2half_rd
+__ull2half_rn
+__ull2half_ru
+__ull2half_rz
+__ushort2half_rd
+__ushort2half_rn
+__ushort2half_ru
+__ushort2half_rz
+__ushort_as_half
+"""
+# The dictionary is to place description of each device function. Expand it to all the device functions
+deviceFuncDesc = {'acosf': "This function returns floating point of arc cosine from a floating point input"}
+
+fnames = ["../../include/hip/hcc_detail/math_functions.h","../../include/hip/hcc_detail/device_functions.h","../../include/hip/hcc_detail/hip_fp16.h"]
+markdownFileName = "./hip-math-api.md"
+
+preamble = "# HIP MATH APIs Documentation \n"+\
+"HIP supports most of the device functions supported by CUDA. Way to find the unsupported one is to search for the function and check its description\n" + \
+"Note: This document is not human generated. Any changes to this file will be discarded. Please make changes to Python3 script docs/markdown/device_md_gen.py\n\n" + \
+"## For Developers \n" + \
+"If you add or fixed a device function, make sure to add a signature of the function and definition later.\n" + \
+"For example, if you want to add `__device__ float __dotf(float4, float4)`, which does a dot product on 4 float vector components \n" + \
+"The way to add to the header is, \n" + \
+"```cpp \n" + \
+"__device__ static float __dotf(float4, float4); \n" + \
+"/*Way down in the file....*/\n" + \
+"__device__ static inline float __dotf(float4 x, float4 y) { \n" + \
+" /*implementation*/\n}\n" + \
+"```\n\n" + \
+"This helps python script to add the device function newly declared into markdown documentation (as it looks at functions with `;` at the end and `__device__` at the beginning)\n\n" + \
+"The next step would be to add Description to  `deviceFuncDesc`.\n" + \
+"From the above example, it can be writtern as,\n`deviceFuncDesc['__dotf'] = 'This functions takes 2 4 component float vector and outputs dot product across them'`\n\n"
+
+def generateSnippet(name, description, signature):
+    return "### " + name + "\n" + \
+    "```cpp \n" + signature + "\n```\n" + \
+    "**Description:**  " + description + "\n\n\n"
+
+def getName(line):
+    l1 = line.split('(')
+    l2 = l1[0].split(' ')
+    return l2[-1]
+
+with open(markdownFileName, 'w') as mdfd:
+    mdfd.truncate()
+    mdfd.write(preamble)
+    for fname in fnames:
+        with open(fname) as fd:
+            lines = fd.readlines()
+            for line in lines:
+                if line.find('HIP_FAST_MATH') != -1:
+                    break;
+                if line.find('__device__') != -1 and line.find(';') != -1 and line.find('hip') == -1:
+                    name = getName(line)
+                    if line.find('//') == -1:
+                        if name in deviceFuncDesc:
+                            mdfd.write(generateSnippet(name, deviceFuncDesc[name], line))
+                        else:
+                            mdfd.write(generateSnippet(name, "Supported", line))
+                    else:
+                        mdfd.write(generateSnippet(name, "**NOT Supported**", line))
+            fd.close()
+    mdfd.close()
@@ -23,6 +23,86 @@ THE SOFTWARE.
 #include <hip/hip_runtime.h>
 #include <hip/hip_vector_types.h>

+
+
+
+
+// Single Precision Fast Math: signatures only, one declaration per line for the markdown doc generator
+__device__  float __cosf(float x);
+__device__  float __exp10f(float x);
+__device__  float __expf(float x);
+__device__ static  float __fadd_rd(float x, float y);
+__device__ static  float __fadd_rn(float x, float y);
+__device__ static  float __fadd_ru(float x, float y);
+__device__ static  float __fadd_rz(float x, float y);
+__device__ static  float __fdiv_rd(float x, float y);
+__device__ static  float __fdiv_rn(float x, float y);
+__device__ static  float __fdiv_ru(float x, float y);
+__device__ static  float __fdiv_rz(float x, float y);
+__device__ static  float __fdividef(float x, float y);
+__device__  float __fmaf_rd(float x, float y, float z);
+__device__  float __fmaf_rn(float x, float y, float z);
+__device__  float __fmaf_ru(float x, float y, float z);
+__device__  float __fmaf_rz(float x, float y, float z);
+__device__ static  float __fmul_rd(float x, float y);
+__device__ static  float __fmul_rn(float x, float y);
+__device__ static  float __fmul_ru(float x, float y);
+__device__ static  float __fmul_rz(float x, float y);
+__device__  float __frcp_rd(float x);
+__device__  float __frcp_rn(float x);
+__device__  float __frcp_ru(float x);
+__device__  float __frcp_rz(float x);
+__device__  float __frsqrt_rn(float x);
+__device__  float __fsqrt_rd(float x);
+__device__  float __fsqrt_rn(float x);
+__device__  float __fsqrt_ru(float x);
+__device__  float __fsqrt_rz(float x);
+__device__ static  float __fsub_rd(float x, float y);
+__device__ static  float __fsub_rn(float x, float y);
+__device__ static  float __fsub_ru(float x, float y);
+__device__  float __log10f(float x);
+__device__  float __log2f(float x);
+__device__  float __logf(float x);
+__device__  float __powf(float base, float exponent);
+__device__ static  float __saturatef(float x);
+__device__  void __sincosf(float x, float *s, float *c);
+__device__  float __sinf(float x);
+__device__  float __tanf(float x);
+
+
+/*
+Double Precision Intrinsics
+Signatures only, one declaration per line, so the markdown doc generator can
+list them. Keep comments off the declaration lines: the generator reports a
+declaration line that carries a line-comment marker as "NOT Supported".
+*/
+
+__device__ static  double __dadd_rd(double x, double y);
+__device__ static  double __dadd_rn(double x, double y);
+__device__ static  double __dadd_ru(double x, double y);
+__device__ static  double __dadd_rz(double x, double y);
+__device__ static  double __ddiv_rd(double x, double y);
+__device__ static  double __ddiv_rn(double x, double y);
+__device__ static  double __ddiv_ru(double x, double y);
+__device__ static  double __ddiv_rz(double x, double y);
+__device__ static  double __dmul_rd(double x, double y);
+__device__ static  double __dmul_rn(double x, double y);
+__device__ static  double __dmul_ru(double x, double y);
+__device__ static  double __dmul_rz(double x, double y);
+__device__  double __drcp_rd(double x);
+__device__  double __drcp_rn(double x);
+__device__  double __drcp_ru(double x);
+__device__  double __drcp_rz(double x);
+__device__  double __dsqrt_rd(double x);
+__device__  double __dsqrt_rn(double x);
+__device__  double __dsqrt_ru(double x);
+__device__  double __dsqrt_rz(double x);
+__device__ static  double __dsub_rd(double x, double y);
+__device__ static  double __dsub_rn(double x, double y);
+__device__ static  double __dsub_ru(double x, double y);
+__device__ static  double __dsub_rz(double x, double y);
+__device__  double __fma_rd(double x, double y, double z);
+__device__  double __fma_rn(double x, double y, double z);
+__device__  double __fma_ru(double x, double y, double z);
+__device__  double __fma_rz(double x, double y, double z);
+
 // Single Precision Fast Math
 extern __attribute__((const)) float __hip_fast_cosf(float) __asm("llvm.cos.f32");
 extern __attribute__((const)) float __hip_fast_exp2f(float) __asm("llvm.exp2.f32");
@@ -349,6 +429,21 @@ __device__ unsigned int __clz(int x);
 __device__ unsigned int __clzll(long long int x);
 __device__ unsigned int __ffs(int x);
 __device__ unsigned int __ffsll(long long int x);
+// Signatures hoisted above the implementations so the markdown doc generator can list them, one per line
+__device__ static unsigned int __hadd(int x, int y);
+__device__ static int __mul24(int x, int y);
+__device__ long long int __mul64hi(long long int x, long long int y);
+__device__ static int __mulhi(int x, int y);
+__device__ unsigned int __popc(unsigned int x);
+__device__ unsigned int __popcll(unsigned long long int x);
+__device__ static int __rhadd(int x, int y);
+__device__ static unsigned int __sad(int x, int y, int z);
+__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
+__device__ static int __umul24(unsigned int x, unsigned int y);
+__device__ unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
+__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
+__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
+__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
+
+
 __device__ static inline unsigned int __hadd(int x, int y) {
  int z = x + y;
  int sign = z & 0x8000000;
@@ -358,12 +453,9 @@ __device__ static inline unsigned int __hadd(int x, int y) {
 // Forwards x and y to the __hip_hc_ir_mul24_int helper and returns its result.
 // NOTE(review): presumably a 24-bit multiply, going by the name -- confirm against the helper
 __device__ static inline int __mul24(int x, int y) {
  return __hip_hc_ir_mul24_int(x, y);
 }
-__device__ long long int __mul64hi(long long int x, long long int y);
 // Forwards x and y to the __hip_hc_ir_mulhi_int helper and returns its result.
 // NOTE(review): presumably the high half of the full product, going by the name -- confirm against the helper
 __device__ static inline int __mulhi(int x, int y) {
  return __hip_hc_ir_mulhi_int(x, y);
 }
-__device__ unsigned int __popc( unsigned int x);
-__device__ unsigned int __popcll( unsigned long long int x);
 __device__ static inline int __rhadd(int x, int y) {
  int z = x + y + 1;
  int sign = z & 0x8000000;
@@ -373,14 +465,12 @@ __device__ static inline int __rhadd(int x, int y) {
 // Adds z to the difference between the larger and the smaller of x and y
 // and returns the sum converted to unsigned int.
 __device__ static inline unsigned int __sad(int x, int y, int z) {
  if (x > y) {
   return x - y + z;
  }
  return y - x + z;
 }
-
 // Average of two unsigned values, rounded down. The sum x + y is never
 // formed, so the result stays correct when that sum would overflow unsigned int.
 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
  // (x & y) holds the bits both operands share, ((x ^ y) >> 1) is half of the rest.
  return (x & y) + ((x ^ y) >> 1);
 }
 // Forwards x and y to the __hip_hc_ir_umul24_int helper and returns its result as int.
 // NOTE(review): presumably an unsigned 24-bit multiply, going by the name -- confirm against the helper
 __device__ static inline int __umul24(unsigned int x, unsigned int y) {
  return __hip_hc_ir_umul24_int(x, y);
 }
-__device__ unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
 // Forwards x and y to the __hip_hc_ir_umulhi_int helper and returns its result.
 // NOTE(review): presumably the high half of the unsigned product, going by the name -- confirm against the helper
 __device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
  return __hip_hc_ir_umulhi_int(x, y);
 }
@@ -440,10 +530,10 @@ CUDA implements half as unsigned short whereas, HIP doesn't.

 */

-__device__ int float2int_rd(float x);
-__device__ int float2int_rn(float x);
-__device__ int float2int_ru(float x);
-__device__ int float2int_rz(float x);
+__device__ int __float2int_rd(float x);
+__device__ int __float2int_rn(float x);
+__device__ int __float2int_ru(float x);
+__device__ int __float2int_rz(float x);

 __device__ long long int __float2ll_rd(float x);
 __device__ long long int __float2ll_rn(float x);
@@ -141,8 +141,8 @@ __device__ double copysign(double x, double y);
 __device__ double cos(double x);
 __device__ double cosh(double x);
 __device__ __host__ double cospi(double x);
-__device__ double cyl_bessel_i0(double x);
-__device__ double cyl_bessel_i1(double x);
+//__device__ double cyl_bessel_i0(double x);
+//__device__ double cyl_bessel_i1(double x);
 __device__ double erf(double x);
 __device__ double erfc(double x);
 __device__ double erfcinv(double y);
@@ -232,6 +232,8 @@ __host__ double rnorm3d(double a, double b, double c);
 __host__ double rnorm4d(double a, double b, double c, double d);
 __host__ void sincospi(double x, double *sptr, double *cptr);

+// ENDPARSER
+
 #ifdef HIP_FAST_MATH
 // Single Precision Precise Math when enabled