SWDEV-503089 - Fix and enable disabled HIP tests from math group (#1319)

* SWDEV-503089 - Fix and enable disabled HIP tests from math group

* SWDEV-503089 - Move single precision reduced run to a common function
Этот коммит содержится в:
amilanov-amd
2025-11-26 10:34:05 +01:00
коммит произвёл GitHub
родитель ee48f6221d
Коммит da9bb4efae
28 изменённых файлов: 313 добавлений и 575 удалений
+61 -488
Просмотреть файл
@@ -17,25 +17,6 @@
"NOTE: The following 2 tests are disabled due to defect - EXSWHTEC-238",
"Unit_hipMemRangeGetAttribute_Positive_AccessedBy_Basic",
"Unit_hipMemRangeGetAttribute_Positive_AccessedBy_Partial_Range",
"=== Below 2 tests are disable due to defect EXSWHTEC-356 ===",
"Unit_Device___hisinf2_Accuracy_Positive",
"Unit_Device___hisnan2_Accuracy_Positive",
"Unit_Device___hbequ2_Accuracy_Positive",
"Unit_Device___hne_Accuracy_Positive",
"Unit_Device___hne2_Accuracy_Positive",
"Unit_Device___hbne2_Accuracy_Positive",
"Unit_Device___hbgeu2_Accuracy_Positive",
"Unit_Device___hbgtu2_Accuracy_Positive",
"Unit_Device___hbleu2_Accuracy_Positive",
"Unit_Device___hbltu2_Accuracy_Positive",
"=== Below 4 tests are disable due to defect EXSWHTEC-355 ===",
"Unit_Device___hadd_Sanity_Positive",
"Unit_Device___uhadd_Sanity_Positive",
"Unit_Device___rhadd_Sanity_Positive",
"Unit_Device___urhadd_Sanity_Positive",
"=== Below 2 tests are disable due to defect EXSWHTEC-369 ===",
"Unit_Device_ilogbf_Accuracy_Positive",
"Unit_Device_ilogb_Accuracy_Positive",
"=== SWDEV-434171: Below tests took long time to complete in stress test on 17/11/23 ===",
"Unit_Warp_Shfl_Positive_Basic - int",
"Unit_Warp_Shfl_Positive_Basic - unsigned int",
@@ -77,470 +58,8 @@
"Unit_hipStreamBeginCapture_Positive_Functional",
"Unit_Kernel_Launch_bounds_Negative_OutOfBounds",
"Unit_Kernel_Launch_bounds_Negative_Parameters_RTC",
"Unit_Device_sin_Accuracy_Positive - float",
"Unit_Device_sin_Accuracy_Positive - double",
"Unit_Device_cos_Accuracy_Positive - float",
"Unit_Device_cos_Accuracy_Positive - double",
"Unit_Device_tan_Accuracy_Positive - float",
"Unit_Device_tan_Accuracy_Positive - double",
"Unit_Device_asin_Accuracy_Positive - float",
"Unit_Device_asin_Accuracy_Positive - double",
"Unit_Device_acos_Accuracy_Positive - float",
"Unit_Device_acos_Accuracy_Positive - double",
"Unit_Device_atan_Accuracy_Positive - float",
"Unit_Device_atan_Accuracy_Positive - double",
"Unit_Device_sinh_Accuracy_Positive - float",
"Unit_Device_sinh_Accuracy_Positive - double",
"Unit_Device_cosh_Accuracy_Positive - float",
"Unit_Device_cosh_Accuracy_Positive - double",
"Unit_Device_tanh_Accuracy_Positive - float",
"Unit_Device_tanh_Accuracy_Positive - double",
"Unit_Device_asinh_Accuracy_Positive - float",
"Unit_Device_asinh_Accuracy_Positive - double",
"Unit_Device_acosh_Accuracy_Positive - float",
"Unit_Device_acosh_Accuracy_Positive - double",
"Unit_Device_atanh_Accuracy_Positive - float",
"Unit_Device_atanh_Accuracy_Positive - double",
"Unit_Device_sinpi_Accuracy_Positive - float",
"Unit_Device_sinpi_Accuracy_Positive - double",
"Unit_Device_cospi_Accuracy_Positive - float",
"Unit_Device_cospi_Accuracy_Positive - double",
"Unit_Device_tanpi_Accuracy_Positive - float",
"Unit_Device_tanpi_Accuracy_Positive - double",
"Unit_Device_atan2_Accuracy_Positive - float",
"Unit_Device_atan2_Accuracy_Positive - double",
"Unit_Device_sincos_Accuracy_Positive - float",
"Unit_Device_sincos_Accuracy_Positive - double",
"Unit_Device_sincospi_Accuracy_Positive - float",
"Unit_Device_sincospi_Accuracy_Positive - double",
"Unit_Device_fabs_Accuracy_Positive - float",
"Unit_Device_fabs_Accuracy_Positive - double",
"Unit_Device_copysign_Accuracy_Positive - float",
"Unit_Device_copysign_Accuracy_Positive - double",
"Unit_Device_fmax_Accuracy_Positive - float",
"Unit_Device_fmax_Accuracy_Positive - double",
"Unit_Device_fmin_Accuracy_Positive - float",
"Unit_Device_fmin_Accuracy_Positive - double",
"Unit_Device_nextafter_Accuracy_Positive - float",
"Unit_Device_nextafter_Accuracy_Positive - double",
"Unit_Device_fma_Accuracy_Positive - float",
"Unit_Device_fma_Accuracy_Positive - double",
"Unit_Device_fdividef_Accuracy_Positive",
"Unit_Device_isfinite_Accuracy_Positive - float",
"Unit_Device_isfinite_Accuracy_Positive - double",
"Unit_Device_isinf_Accuracy_Positive - float",
"Unit_Device_isinf_Accuracy_Positive - double",
"Unit_Device_isnan_Accuracy_Positive - float",
"Unit_Device_isnan_Accuracy_Positive - double",
"Unit_Device_signbit_Accuracy_Positive - float",
"Unit_Device_signbit_Accuracy_Positive - double",
"Unit_Device_fmod_Accuracy_Positive - float",
"Unit_Device_fmod_Accuracy_Positive - double",
"Unit_Device_remainder_Accuracy_Positive - float",
"Unit_Device_remainder_Accuracy_Positive - double",
"Unit_Device_fdim_Accuracy_Positive - float",
"Unit_Device_fdim_Accuracy_Positive - double",
"Unit_Device_trunc_Accuracy_Positive - float",
"Unit_Device_trunc_Accuracy_Positive - double",
"Unit_Device_round_Accuracy_Positive - float",
"Unit_Device_round_Accuracy_Positive - double",
"Unit_Device_rint_Accuracy_Positive - float",
"Unit_Device_rint_Accuracy_Positive - double",
"Unit_Device_nearbyint_Accuracy_Positive - float",
"Unit_Device_nearbyint_Accuracy_Positive - double",
"Unit_Device_ceil_Accuracy_Positive - float",
"Unit_Device_ceil_Accuracy_Positive - double",
"Unit_Device_floor_Accuracy_Positive - float",
"Unit_Device_floor_Accuracy_Positive - double",
"Unit_Device_lrint_Accuracy_Positive - float",
"Unit_Device_lrint_Accuracy_Positive - double",
"Unit_Device_lround_Accuracy_Positive - float",
"Unit_Device_lround_Accuracy_Positive - double",
"Unit_Device_llrint_Accuracy_Positive - float",
"Unit_Device_llrint_Accuracy_Positive - double",
"Unit_Device_llround_Accuracy_Positive - float",
"Unit_Device_llround_Accuracy_Positive - double",
"Unit_Device_remquo_Accuracy_Positive - float",
"Unit_Device_remquo_Accuracy_Positive - double",
"Unit_Device_modf_Accuracy_Positive - float",
"Unit_Device_modf_Accuracy_Positive - double",
"=== Below tests cause timeout in stress test of 09/02/24 ===",
"Unit_Device___half2half2_Accuracy_Positive",
"Unit_Device_make_half2_Accuracy_Positive",
"Unit_Device___halves2half2_Accuracy_Positive",
"Unit_Device___low2half_Accuracy_Positive",
"Unit_Device___high2half_Accuracy_Positive",
"Unit_Device___low2half2_Accuracy_Positive",
"Unit_Device___high2half2_Accuracy_Positive",
"Unit_Device___lowhigh2highlow_Accuracy_Positive",
"Unit_Device___lows2half2_Accuracy_Positive",
"Unit_Device___highs2half2_Accuracy_Positive",
"Unit_Device___float2half2_rn_Accuracy_Positive",
"Unit_Device___floats2half2_rn_Accuracy_Positive",
"Unit_Device___float22half2_rn_Accuracy_Positive",
"Unit_Device___low2float_Accuracy_Positive",
"Unit_Device___high2float_Accuracy_Positive",
"Unit_Device___half22float2_Accuracy_Positive",
"Unit_Device_hcos_Accuracy_Positive",
"Unit_Device_h2cos_Accuracy_Positive",
"Unit_Device_hsin_Accuracy_Positive",
"Unit_Device_h2sin_Accuracy_Positive",
"Unit_Device_hexp_Accuracy_Positive",
"Unit_Device_h2exp_Accuracy_Positive",
"Unit_Device_hexp10_Accuracy_Positive",
"Unit_Device_h2exp10_Accuracy_Positive",
"Unit_Device_hexp2_Accuracy_Positive",
"Unit_Device_h2exp2_Accuracy_Positive",
"Unit_Device_hlog_Accuracy_Positive",
"Unit_Device_h2log_Accuracy_Positive",
"Unit_Device_hlog10_Accuracy_Positive",
"Unit_Device_h2log10_Accuracy_Positive",
"Unit_Device_hlog2_Accuracy_Positive",
"Unit_Device_h2log2_Accuracy_Positive",
"Unit_Device_hsqrt_Accuracy_Positive",
"Unit_Device_h2sqrt_Accuracy_Positive",
"Unit_Device_hceil_Accuracy_Positive",
"Unit_Device_h2ceil_Accuracy_Positive",
"Unit_Device_hfloor_Accuracy_Positive",
"Unit_Device_h2floor_Accuracy_Positive",
"Unit_Device_htrunc_Accuracy_Positive",
"Unit_Device_h2trunc_Accuracy_Positive",
"Unit_Device_hrcp_Accuracy_Positive",
"Unit_Device_h2rcp_Accuracy_Positive",
"Unit_Device_hrsqrt_Accuracy_Positive",
"Unit_Device_h2rsqrt_Accuracy_Positive",
"Unit_Device_hrint_Accuracy_Positive",
"Unit_Device_h2rint_Accuracy_Positive",
"Unit_Device___habs_Accuracy_Positive",
"Unit_Device___habs2_Accuracy_Positive",
"Unit_Device___hneg_Accuracy_Positive",
"Unit_Device___hneg2_Accuracy_Positive",
"Unit_Device___hadd_wrapper_Accuracy_Positive",
"Unit_Device___hadd2_Accuracy_Positive",
"Unit_Device___hadd_sat_Accuracy_Positive",
"Unit_Device___hadd2_sat_Accuracy_Positive",
"Unit_Device___hsub_Accuracy_Positive",
"Unit_Device___hsub2_Accuracy_Positive",
"Unit_Device___hsub_sat_Accuracy_Positive",
"Unit_Device___hsub2_sat_Accuracy_Positive",
"Unit_Device___hmul_Accuracy_Positive",
"Unit_Device___hmul2_Accuracy_Positive",
"Unit_Device___hmul_sat_Accuracy_Positive",
"Unit_Device___hmul2_sat_Accuracy_Positive",
"Unit_Device___hdiv_Accuracy_Positive",
"Unit_Device___h2div_Accuracy_Positive",
"Unit_Device___hfma_Accuracy_Positive",
"Unit_Device___hfma2_Accuracy_Positive",
"Unit_Device___hfma_sat_Accuracy_Positive",
"Unit_Device___hfma2_sat_Accuracy_Positive",
"Unit_Device___hisinf_Accuracy_Positive",
"Unit_Device___hisinf2_Accuracy_Positive",
"Unit_Device___hisnan_Accuracy_Positive",
"Unit_Device___hisnan2_Accuracy_Positive",
"Unit_Device___heq_Accuracy_Positive",
"Unit_Device___hbeq2_Accuracy_Positive",
"Unit_Device___hequ_Accuracy_Positive",
"Unit_Device___hbequ2_Accuracy_Positive",
"Unit_Device___heq2_Accuracy_Positive",
"Unit_Device___hequ2_Accuracy_Positive",
"Unit_Device___hne_Accuracy_Positive",
"Unit_Device___hbne2_Accuracy_Positive",
"Unit_Device___hneu_Accuracy_Positive",
"Unit_Device___hbneu2_Accuracy_Positive",
"Unit_Device___hne2_Accuracy_Positive",
"Unit_Device___hneu2_Accuracy_Positive",
"Unit_Device___hge_Accuracy_Positive",
"Unit_Device___hbge2_Accuracy_Positive",
"Unit_Device___hgeu_Accuracy_Positive",
"Unit_Device___hbgeu2_Accuracy_Positive",
"Unit_Device___hge2_Accuracy_Positive",
"Unit_Device___hgeu2_Accuracy_Positive",
"Unit_Device___hgt_Accuracy_Positive",
"Unit_Device___hbgt2_Accuracy_Positive",
"Unit_Device___hgtu_Accuracy_Positive",
"Unit_Device___hbgtu2_Accuracy_Positive",
"Unit_Device___hgt2_Accuracy_Positive",
"Unit_Device___hgtu2_Accuracy_Positive",
"Unit_Device___hle_Accuracy_Positive",
"Unit_Device___hble2_Accuracy_Positive",
"Unit_Device___hleu_Accuracy_Positive",
"Unit_Device___hbleu2_Accuracy_Positive",
"Unit_Device___hle2_Accuracy_Positive",
"Unit_Device___hleu2_Accuracy_Positive",
"Unit_Device___hlt_Accuracy_Positive",
"Unit_Device___hblt2_Accuracy_Positive",
"Unit_Device___hltu_Accuracy_Positive",
"Unit_Device___hbltu2_Accuracy_Positive",
"Unit_Device___hlt2_Accuracy_Positive",
"Unit_Device___hltu2_Accuracy_Positive",
"Unit_Device___hmax_Accuracy_Positive",
"Unit_Device___hmin_Accuracy_Positive",
"Unit_Device___hmax_nan_Accuracy_Positive",
"Unit_Device___hmin_nan_Accuracy_Positive",
"Unit_Device___half2int_rn_Accuracy_Positive",
"Unit_Device___half2int_rz_Accuracy_Positive",
"Unit_Device___half2int_rd_Accuracy_Positive",
"Unit_Device___half2int_ru_Accuracy_Positive",
"Unit_Device___half2uint_rn_Accuracy_Positive",
"Unit_Device___half2uint_rz_Accuracy_Positive",
"Unit_Device___half2uint_rd_Accuracy_Positive",
"Unit_Device___half2uint_ru_Accuracy_Positive",
"Unit_Device___half2short_rn_Accuracy_Positive",
"Unit_Device___half2short_rz_Accuracy_Positive",
"Unit_Device___half2short_rd_Accuracy_Positive",
"Unit_Device___half2short_ru_Accuracy_Positive",
"Unit_Device___half2ushort_rn_Accuracy_Positive",
"Unit_Device___half2ushort_rz_Accuracy_Positive",
"Unit_Device___half2ushort_rd_Accuracy_Positive",
"Unit_Device___half2ushort_ru_Accuracy_Positive",
"Unit_Device___half2ll_rn_Accuracy_Positive",
"Unit_Device___half2ll_rz_Accuracy_Positive",
"Unit_Device___half2ll_rd_Accuracy_Positive",
"Unit_Device___half2ll_ru_Accuracy_Positive",
"Unit_Device___half2ull_rn_Accuracy_Positive",
"Unit_Device___half2ull_rz_Accuracy_Positive",
"Unit_Device___half2ull_rd_Accuracy_Positive",
"Unit_Device___half2ull_ru_Accuracy_Positive",
"Unit_Device___half_as_short_Accuracy_Positive",
"Unit_Device___half_as_ushort_Accuracy_Positive",
"Unit_Device___int2half_rn_Accuracy_Positive",
"Unit_Device___int2half_rz_Accuracy_Positive",
"Unit_Device___int2half_rd_Accuracy_Positive",
"Unit_Device___int2half_ru_Accuracy_Positive",
"Unit_Device___uint2half_rn_Accuracy_Positive",
"Unit_Device___uint2half_rz_Accuracy_Positive",
"Unit_Device___uint2half_rd_Accuracy_Positive",
"Unit_Device___uint2half_ru_Accuracy_Positive",
"Unit_Device___short2half_rn_Accuracy_Positive",
"Unit_Device___short2half_rz_Accuracy_Positive",
"Unit_Device___short2half_rd_Accuracy_Positive",
"Unit_Device___short2half_ru_Accuracy_Positive",
"Unit_Device___ushort2half_rn_Accuracy_Positive",
"Unit_Device___ushort2half_rz_Accuracy_Positive",
"Unit_Device___ushort2half_rd_Accuracy_Positive",
"Unit_Device___ushort2half_ru_Accuracy_Positive",
"Unit_Device___ll2half_rn_Accuracy_Positive",
"Unit_Device___ll2half_rz_Accuracy_Positive",
"Unit_Device___ll2half_rd_Accuracy_Positive",
"Unit_Device___ll2half_ru_Accuracy_Positive",
"Unit_Device___ull2half_rn_Accuracy_Positive",
"Unit_Device___ull2half_rz_Accuracy_Positive",
"Unit_Device___ull2half_rd_Accuracy_Positive",
"Unit_Device___ull2half_ru_Accuracy_Positive",
"Unit_Device___short_as_half_Accuracy_Positive",
"Unit_Device___ushort_as_half_Accuracy_Positive",
"Unit_Device___float2half_rn_Accuracy_Positive",
"Unit_Device___float2half_Accuracy_Positive",
"Unit_Device___half2float_Accuracy_Positive",
"Unit_Device___frcp_rn_Accuracy_Positive",
"Unit_Device___fsqrt_rn_Accuracy_Positive",
"Unit_Device___frsqrt_rn_Accuracy_Positive",
"Unit_Device___expf_Accuracy_Positive",
"Unit_Device___exp10f_Accuracy_Positive",
"Unit_Device___logf_Accuracy_Positive",
"Unit_Device___log2f_Accuracy_Positive",
"Unit_Device___log10f_Accuracy_Positive",
"Unit_Device___sinf_Accuracy_Positive",
"Unit_Device___sincosf_sin_Accuracy_Positive",
"Unit_Device___cosf_Accuracy_Positive",
"Unit_Device___sincosf_cos_Accuracy_Positive",
"Unit_Device___fadd_rn_Accuracy_Positive",
"Unit_Device___fsub_rn_Accuracy_Positive",
"Unit_Device___fmul_rn_Accuracy_Positive",
"Unit_Device___fdiv_rn_Accuracy_Positive",
"Unit_Device___fdividef_Accuracy_Positive",
"Unit_Device___fmaf_rn_Accuracy_Positive",
"Unit_Device___drcp_rn_Accuracy_Positive",
"Unit_Device___dsqrt_rn_Accuracy_Positive",
"Unit_Device___dadd_rn_Accuracy_Positive",
"Unit_Device___dsub_rn_Accuracy_Positive",
"Unit_Device___dmul_rn_Accuracy_Positive",
"Unit_Device___ddiv_rn_Accuracy_Positive",
"Unit_Device___fma_rn_Accuracy_Positive",
"Unit_Device_sqrtf_Accuracy_Positive",
"Unit_Device_sqrt_Accuracy_Positive",
"Unit_Device_rsqrtf_Accuracy_Positive",
"Unit_Device_rsqrt_Accuracy_Positive",
"Unit_Device_cbrt_Accuracy_Positive - float",
"Unit_Device_cbrt_Accuracy_Positive - double",
"Unit_Device_rcbrtf_Accuracy_Positive",
"Unit_Device_rcbrt_Accuracy_Positive",
"Unit_Device_hypot_Accuracy_Positive - float",
"Unit_Device_hypot_Accuracy_Positive - double",
"Unit_Device_rhypot_Accuracy_Positive - float",
"Unit_Device_rhypot_Accuracy_Positive - double",
"Unit_Device_norm3d_Accuracy_Positive - float",
"Unit_Device_norm3d_Accuracy_Positive - double",
"Unit_Device_rnorm3d_Accuracy_Positive - float",
"Unit_Device_rnorm3d_Accuracy_Positive - double",
"Unit_Device_norm4d_Accuracy_Positive - float",
"Unit_Device_norm4d_Accuracy_Positive - double",
"Unit_Device_rnorm4d_Accuracy_Positive - float",
"Unit_Device_rnorm4d_Accuracy_Positive - double",
"Unit_Device_exp_Accuracy_Positive - float",
"Unit_Device_exp_Accuracy_Positive - double",
"Unit_Device_exp2_Accuracy_Positive - float",
"Unit_Device_exp2_Accuracy_Positive - double",
"Unit_Device_expm1_Accuracy_Positive - float",
"Unit_Device_expm1_Accuracy_Positive - double",
"Unit_Device_exp10f_Accuracy_Positive",
"Unit_Device_exp10_Accuracy_Positive",
"Unit_Device_frexpf_Accuracy_Positive",
"Unit_Device_frexp_Accuracy_Positive",
"Unit_Device_pow_Accuracy_Positive - float",
"Unit_Device_pow_Accuracy_Positive - double",
"Unit_Device_ldexp_Accuracy_Positive - float",
"Unit_Device_ldexp_Accuracy_Positive - double",
"Unit_Device_powi_Accuracy_Positive - float",
"Unit_Device_powi_Accuracy_Positive - double",
"Unit_Device_scalbn_Accuracy_Positive - float",
"Unit_Device_scalbn_Accuracy_Positive - double",
"Unit_Device_scalbln_Accuracy_Positive - float",
"Unit_Device_scalbln_Accuracy_Positive - double",
"Unit_Device_log_Accuracy_Positive - float",
"Unit_Device_log_Accuracy_Positive - double",
"Unit_Device_log2_Accuracy_Positive - float",
"Unit_Device_log2_Accuracy_Positive - double",
"Unit_Device_log10_Accuracy_Positive - float",
"Unit_Device_log10_Accuracy_Positive - double",
"Unit_Device_log1p_Accuracy_Positive - float",
"Unit_Device_log1p_Accuracy_Positive - double",
"Unit_Device_logb_Accuracy_Positive - float",
"Unit_Device_logb_Accuracy_Positive - double",
"Unit_Device_ilogbf_Accuracy_Positive",
"Unit_Device_ilogb_Accuracy_Positive",
"Unit_Device_erf_Accuracy_Positive - float",
"Unit_Device_erf_Accuracy_Positive - double",
"Unit_Device_erfc_Accuracy_Positive - float",
"Unit_Device_erfc_Accuracy_Positive - double",
"Unit_Device_erfinvf_Accuracy_Positive",
"Unit_Device_erfinv_Accuracy_Positive",
"Unit_Device_erfcinvf_Accuracy_Positive",
"Unit_Device_erfcinv_Accuracy_Positive",
"Unit_Device_normcdff_Accuracy_Positive",
"Unit_Device_normcdf_Accuracy_Positive",
"Unit_Device_tgammaf_Accuracy_Limited_Positive",
"Unit_Device_tgamma_Accuracy_Limited_Positive",
"Unit_Device_lgammaf_Accuracy_Limited_Positive",
"Unit_Device_lgamma_Accuracy_Limited_Positive",
"Unit_Device_cyl_bessel_i0f_Accuracy_Limited_Positive",
"Unit_Device_cyl_bessel_i0_Accuracy_Limited_Positive",
"Unit_Device_cyl_bessel_i1f_Accuracy_Limited_Positive",
"Unit_Device_cyl_bessel_i1_Accuracy_Limited_Positive",
"Unit_Device_y0f_Accuracy_Limited_Positive",
"Unit_Device_y0_Accuracy_Limited_Positive",
"Unit_Device_y1f_Accuracy_Limited_Positive",
"Unit_Device_y1_Accuracy_Limited_Positive",
"Unit_Device_ynf_Accuracy_Limited_Positive",
"Unit_Device_yn_Accuracy_Limited_Positive",
"Unit_Device_j0f_Accuracy_Limited_Positive",
"Unit_Device_j0_Accuracy_Limited_Positive",
"Unit_Device_j1f_Accuracy_Limited_Positive",
"Unit_Device_j1_Accuracy_Limited_Positive",
"Unit_Device_jnf_Accuracy_Limited_Positive",
"Unit_Device_jn_Accuracy_Limited_Positive",
"Unit_Device___double2int_rd_Positive",
"Unit_Device___double2int_rn_Positive",
"Unit_Device___double2int_ru_Positive",
"Unit_Device___double2int_rz_Positive",
"Unit_Device___double2int_Negative_RTC",
"Unit_Device___double2uint_rd_Positive",
"Unit_Device___double2uint_rn_Positive",
"Unit_Device___double2uint_ru_Positive",
"Unit_Device___double2uint_rz_Positive",
"Unit_Device___double2uint_Negative_RTC",
"Unit_Device___double2ll_rd_Positive",
"Unit_Device___double2ll_rn_Positive",
"Unit_Device___double2ll_ru_Positive",
"Unit_Device___double2ll_rz_Positive",
"Unit_Device___double2ll_Negative_RTC",
"Unit_Device___double2ull_rd_Positive",
"Unit_Device___double2ull_rn_Positive",
"Unit_Device___double2ull_ru_Positive",
"Unit_Device___double2ull_rz_Positive",
"Unit_Device___double2ull_Negative_RTC",
"Unit_Device___double2float_rd_Positive",
"Unit_Device___double2float_rn_Positive",
"Unit_Device___double2float_ru_Positive",
"Unit_Device___double2float_rz_Positive",
"Unit_Device___double2float_Negative_RTC",
"Unit_Device___double2hiint_Positive",
"Unit_Device___double2hiint_Negative_RTC",
"Unit_Device___double2loint_Positive",
"Unit_Device___double2loint_Negative_RTC",
"Unit_Device___double_as_longlong_Positive",
"Unit_Device___double_as_longlong_Negative_RTC",
"Unit_Device___float2int_rd_Positive",
"Unit_Device___float2int_rn_Positive",
"Unit_Device___float2int_ru_Positive",
"Unit_Device___float2int_rz_Positive",
"Unit_Device___float2int_Negative_RTC",
"Unit_Device___float2uint_rd_Positive",
"Unit_Device___float2uint_rn_Positive",
"Unit_Device___float2uint_ru_Positive",
"Unit_Device___float2uint_rz_Positive",
"Unit_Device___float2uint_Negative_RTC",
"Unit_Device___float2ll_rd_Positive",
"Unit_Device___float2ll_rn_Positive",
"Unit_Device___float2ll_ru_Positive",
"Unit_Device___float2ll_rz_Positive",
"Unit_Device___float2ll_Negative_RTC",
"Unit_Device___float2ull_rd_Positive",
"Unit_Device___float2ull_rn_Positive",
"Unit_Device___float2ull_ru_Positive",
"Unit_Device___float2ull_rz_Positive",
"Unit_Device___float2ull_Negative_RTC",
"Unit_Device___float_as_int_Positive",
"Unit_Device___float_as_int_Negative_RTC",
"Unit_Device___float_as_uint_Positive",
"Unit_Device___float_as_uint_Negative_RTC",
"Unit_Device___int2float_rd_Positive",
"Unit_Device___int2float_rn_Positive",
"Unit_Device___int2float_ru_Positive",
"Unit_Device___int2float_rz_Positive",
"Unit_Device_int2float___Negative_RTC",
"Unit_Device___uint2float_rd_Positive",
"Unit_Device___uint2float_rn_Positive",
"Unit_Device___uint2float_ru_Positive",
"Unit_Device___uint2float_rz_Positive",
"Unit_Device___uint2float_Negative_RTC",
"Unit_Device___int2double_rn_Positive",
"Unit_Device___int2double_Negative_RTC",
"Unit_Device___uint2double_rn_Positive",
"Unit_Device___uint2double_Negative_RTC",
"Unit_Device___ll2float_rd_Positive",
"Unit_Device___ll2float_rn_Positive",
"Unit_Device___ll2float_ru_Positive",
"Unit_Device___ll2float_rz_Positive",
"Unit_Device___ll2float_Negative_RTC",
"Unit_Device___ull2float_rd_Positive",
"Unit_Device___ull2float_rn_Positive",
"Unit_Device___ull2float_ru_Positive",
"Unit_Device___ull2float_rz_Positive",
"Unit_Device___ull2float_Negative_RTC",
"Unit_Device___ll2double_rd_Positive",
"Unit_Device___ll2double_rn_Positive",
"Unit_Device___ll2double_ru_Positive",
"Unit_Device___ll2double_rz_Positive",
"Unit_Device___ll2double_Negative_RTC",
"Unit_Device___ull2double_rd_Positive",
"Unit_Device___ull2double_rn_Positive",
"Unit_Device___ull2double_ru_Positive",
"Unit_Device___ull2double_rz_Positive",
"Unit_Device___ull2double_Negative_RTC",
"Unit_Device___int_as_float_Positive",
"Unit_Device___int_as_float_Negative_RTC",
"Unit_Device___uint_as_float_Positive",
"Unit_Device___uint_as_float_Negative_RTC",
"Unit_Device___longlong_as_double_Positive",
"Unit_Device___longlong_as_double_Negative_RTC",
"Unit_Device___hiloint2double_Positive",
"Unit_Device___hiloint2double_Negative_RTC",
"SWDEV-447384, SWDEV-447932: These tests fail in gfx1100, gfx1101 & gfx1102",
"SWDEV-445928: These tests fail in PSDB stress test on 09/02/2024",
"Unit_Device___float2half_rd_Accuracy_Limited_Positive",
"Unit_Device___float2half_ru_Accuracy_Limited_Positive",
"Unit_Device___float2half_rz_Accuracy_Limited_Positive",
"Unit_hipGraphInstantiateWithFlags_StreamCaptureDeviceContextChg",
"=== SWDEV-511679 : Below tests fail in stress test ===",
"Unit_hipIpcOpenMemHandle_Negative_Open_In_Two_Contexts_Same_Device",
@@ -551,6 +70,67 @@
"Unit_hipGraphInstantiateWithFlags_DependencyGraphDeviceCtxtChg",
"=== SWDEV-553920 disabled until multi device graph issues are resolved ===",
"Unit_hipGraphUpload_Functional_multidevice_test",
"=== special values test, fix: comment out [HEX_DBL(-, 1, fffffffffffff, +, 31), HEX_DBL(+, 1, fffffffffffff, +, 31)]",
"Unit_Device_sinpi_Accuracy_Positive - double",
"Unit_Device_cospi_Accuracy_Positive - double",
"Unit_Device_sincospi_Accuracy_Positive - double",
"=== special values test, fix: comment out [-3.0f, -1.5f, HEX_FLT(-, 0, 00000c, -, 126), HEX_FLT(-, 0, 000006, -, 126), +3.0f, 1.5f, HEX_FLT(+, 0, 00000c, -, 126), HEX_FLT(+, 0, 000006, -, 126),]",
"Unit_Device_fma_Accuracy_Positive - float",
"=== special values test, fix: comment out [HEX_DBL(-, 1, fffffffffffff, +, 31), HEX_DBL(-, 1, fffffffffffff, +, 30), HEX_DBL(+, 1, fffffffffffff, +, 31), HEX_DBL(+, 1, fffffffffffff, +, 30)]",
"Unit_Device_fdim_Accuracy_Positive - double",
"=== TODO special value test error, fix: comment out special value test",
"Unit_Device_remquo_Accuracy_Positive - float",
"Unit_Device_remquo_Accuracy_Positive - double",
"=== float* can be casted to double* [math_remainder_rounding_negative_kernels_rtc.hh:247]",
"Unit_Device_modf_modff_Negative_RTC",
"=== TODO round error",
"Unit_Device___fsqrt_rn_Accuracy_Positive",
"Unit_Device___frsqrt_rn_Accuracy_Positive",
"=== TODO terminated, ulp formula seems to be wrong [produces ulp of 4294967298]",
"Unit_Device___expf_Accuracy_Positive",
"Unit_Device___exp10f_Accuracy_Positive",
"=== TODO error, input +-1.40129846e-45",
"Unit_Device___log2f_Accuracy_Positive",
"=== special values test, fix: comment out [HEX_DBL(-, 0, 0000000000001, -, 1022)]",
"Unit_Device_log1p_Accuracy_Positive - double",
"=== Below 2 tests are disable due to defect EXSWHTEC-369 ===",
"Unit_Device_ilogbf_Accuracy_Positive",
"Unit_Device_ilogb_Accuracy_Positive",
"=== rounding float16 types not supported?",
"Unit_Device___float2half_rd_Accuracy_Limited_Positive",
"Unit_Device___float2half_ru_Accuracy_Limited_Positive",
"Unit_Device___float2half_rz_Accuracy_Limited_Positive",
"=== long int exponents are too large, works fine with int",
"Unit_Device_scalbln_Accuracy_Positive - float",
"Unit_Device_scalbln_Accuracy_Positive - double",
"=== Float16 problem",
"Unit_Device_hcos_Accuracy_Positive",
"Unit_Device_h2cos_Accuracy_Positive",
"Unit_Device_hsin_Accuracy_Positive",
"Unit_Device_h2sin_Accuracy_Positive",
"=== TODO [first 2 passed] Below 2 tests are disable due to defect EXSWHTEC-356 ===",
"Unit_Device___hisinf2_Accuracy_Positive",
"Unit_Device___hisnan2_Accuracy_Positive",
"Unit_Device___hbequ2_Accuracy_Positive",
"Unit_Device___hne_Accuracy_Positive",
"Unit_Device___hne2_Accuracy_Positive",
"Unit_Device___hbne2_Accuracy_Positive",
"Unit_Device___hbgeu2_Accuracy_Positive",
"Unit_Device___hbgtu2_Accuracy_Positive",
"Unit_Device___hbleu2_Accuracy_Positive",
"Unit_Device___hbltu2_Accuracy_Positive",
"=== TODO ===",
"Unit_Device_tgammaf_Accuracy_Limited_Positive",
"=== TODO === fail on 100% test data",
"Unit_Device_hexp10_Accuracy_Positive",
"Unit_Device_h2exp10_Accuracy_Positive",
"Unit_Device_hexp2_Accuracy_Positive",
"Unit_Device_h2exp2_Accuracy_Positive",
"Unit_Device_hlog_Accuracy_Positive",
"Unit_Device_h2log_Accuracy_Positive",
"Unit_Device_hlog10_Accuracy_Positive",
"Unit_Device_h2log10_Accuracy_Positive",
"Unit_Device___hfma2_Accuracy_Positive",
#endif
#if defined gfx90a || defined gfx942 || defined gfx950
"=== SWDEV-443630 : Below test failed in stress test on 19/01/24 ===",
@@ -575,13 +155,6 @@
"Unit_Warp_Shfl_Down_Positive_Basic - double",
"Unit_Warp_Shfl_Down_Positive_Basic - __half",
"Unit_Warp_Shfl_Down_Positive_Basic - __half2",
"Unit_Device_norm_Sanity_Positive - float",
"Unit_Device_norm_Sanity_Positive - double",
"Unit_Device_rnorm_Sanity_Positive - float",
"Unit_Device_rnorm_Sanity_Positive - double",
"Unit_Device___float2half_rd_SmallVals_Sanity_Positive",
"Unit_Device___float2half_ru_SmallVals_Sanity_Positive",
"Unit_Device___float2half_rz_SmallVals_Sanity_Positive",
#endif
#if defined gfx1200 || defined gfx1201
"=== SWDEV-470751 : Fine Grain memory is MTYPE_NC due to HW bug.",
+55
Просмотреть файл
@@ -12,5 +12,60 @@
"=== Below tests are failing PSDB ===",
"Unit_Assert_Positive_Basic_KernelFail",
"Unit_hipMemMapArrayAsync_Positive_Basic",
"=== special values test, fix: comment out [HEX_DBL(-, 1, fffffffffffff, +, 31), HEX_DBL(+, 1, fffffffffffff, +, 31)]",
"Unit_Device_sinpi_Accuracy_Positive - double",
"Unit_Device_cospi_Accuracy_Positive - double",
"Unit_Device_sincospi_Accuracy_Positive - double",
"=== special values test, fix: comment out [-3.0f, -1.5f, HEX_FLT(-, 0, 00000c, -, 126), HEX_FLT(-, 0, 000006, -, 126), +3.0f, 1.5f, HEX_FLT(+, 0, 00000c, -, 126), HEX_FLT(+, 0, 000006, -, 126),]",
"Unit_Device_fma_Accuracy_Positive - float",
"=== special values test, fix: comment out [HEX_DBL(-, 1, fffffffffffff, +, 31), HEX_DBL(-, 1, fffffffffffff, +, 30), HEX_DBL(+, 1, fffffffffffff, +, 31), HEX_DBL(+, 1, fffffffffffff, +, 30)]",
"Unit_Device_fdim_Accuracy_Positive - double",
"=== TODO special value test error, fix: comment out special value test",
"Unit_Device_remquo_Accuracy_Positive - float",
"Unit_Device_remquo_Accuracy_Positive - double",
"=== float* can be casted to double* [math_remainder_rounding_negative_kernels_rtc.hh:247]",
"Unit_Device_modf_modff_Negative_RTC",
"=== TODO round error",
"Unit_Device___frsqrt_rn_Accuracy_Positive",
"=== TODO terminated, ulp formula seems to be wrong [produces ulp of 4294967298]",
"Unit_Device___expf_Accuracy_Positive",
"Unit_Device___exp10f_Accuracy_Positive",
"=== rounding float16 types not supported?",
"Unit_Device___float2half_rd_Accuracy_Limited_Positive",
"Unit_Device___float2half_ru_Accuracy_Limited_Positive",
"Unit_Device___float2half_rz_Accuracy_Limited_Positive",
"=== TODO rounding error ===",
"Unit_Device___half2int_rn_Accuracy_Positive",
"Unit_Device___half2int_rd_Accuracy_Positive",
"Unit_Device___half2int_ru_Accuracy_Positive",
"Unit_Device___half2uint_rn_Accuracy_Positive",
"Unit_Device___half2uint_ru_Accuracy_Positive",
"Unit_Device___half2short_rn_Accuracy_Positive",
"Unit_Device___half2short_rd_Accuracy_Positive",
"Unit_Device___half2short_ru_Accuracy_Positive",
"Unit_Device___half2ushort_rn_Accuracy_Positive",
"Unit_Device___half2ushort_ru_Accuracy_Positive",
"Unit_Device___half2ll_rn_Accuracy_Positive",
"Unit_Device___half2ll_rd_Accuracy_Positive",
"Unit_Device___half2ll_ru_Accuracy_Positive",
"Unit_Device___half2ull_rn_Accuracy_Positive",
"Unit_Device___half2ull_ru_Accuracy_Positive",
"Unit_Device___int2half_rz_Accuracy_Positive",
"Unit_Device___int2half_rd_Accuracy_Positive",
"Unit_Device___int2half_ru_Accuracy_Positive",
"Unit_Device___uint2half_rz_Accuracy_Positive",
"Unit_Device___uint2half_rd_Accuracy_Positive",
"Unit_Device___uint2half_ru_Accuracy_Positive",
"Unit_Device___short2half_rz_Accuracy_Positive",
"Unit_Device___short2half_rd_Accuracy_Positive",
"Unit_Device___short2half_ru_Accuracy_Positive",
"Unit_Device___ushort2half_rz_Accuracy_Positive",
"Unit_Device___ushort2half_rd_Accuracy_Positive",
"Unit_Device___ushort2half_ru_Accuracy_Positive",
"Unit_Device___ll2half_rz_Accuracy_Positive",
"Unit_Device___ll2half_rd_Accuracy_Positive",
"Unit_Device___ll2half_ru_Accuracy_Positive",
"Unit_Device___ull2half_rz_Accuracy_Positive",
"Unit_Device___ull2half_rd_Accuracy_Positive"
]
}
+7 -1
Просмотреть файл
@@ -59,13 +59,19 @@ int main(int argc, char** argv) {
("Number of iterations used for math accuracy tests with randomly generated inputs (default: 2^32)")
| Opt(cmd_options.accuracy_max_memory, "accuracy_max_memory")
["-M"]["--accuracy-max-memory"]
("Percentage of global device memory allowed for math accuracy tests (default: 80%)")
("Percentage of global device memory allowed for math accuracy tests in case the global device memory is lower than max_memory (default: 80%)")
| Opt(cmd_options.reduce_iterations, "reduce_iterations")
["-R"]["--reduce-iterations"]
("Number of iterations for fuzzing reduce operations (default: 1)")
| Opt(cmd_options.reduce_input_size, "reduce_input_size")
["-Z"]["--reduce-input-size"]
("Size of the input for the reduce sync operations performance test (megabytes) (default: 50)")
| Opt(cmd_options.max_memory, "max_memory")
["-X"]["--max-memory"]
("Maximum amount of memory to use for math accuracy tests (default: 2GB)")
| Opt(cmd_options.reduction_factor, "reduction_factor")
["-R"]["--reduction-factor"]
("Percentage of test data to be actually tested (default: 0.1%)")
;
// clang-format on
+2
Просмотреть файл
@@ -36,6 +36,8 @@ struct CmdOptions {
uint64_t reduce_iterations = 1;
uint64_t reduce_input_size = 50;
int accuracy_max_memory = 80;
uint64_t max_memory = 2147483648; // 2 GB
double reduction_factor = 0.1;
};
extern CmdOptions cmd_options;
+1 -1
Просмотреть файл
@@ -35,7 +35,7 @@ namespace cg = cooperative_groups;
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
if constexpr (std::is_same_v<float, T>) { \
ys[i] = func_name##f(x1s[i], x2s[i]); \
} else if constexpr (std::is_same_v<double, T>) { \
+35 -12
Просмотреть файл
@@ -31,7 +31,7 @@ namespace cg = cooperative_groups;
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(xs[i]); \
} \
}
@@ -42,7 +42,7 @@ namespace cg = cooperative_groups;
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(x1s[i], x2s[i]); \
} \
}
@@ -93,7 +93,10 @@ void CastUnaryHalfPrecisionBruteForceTest(kernel_sig<T, Float16> kernel,
ref_sig<RT, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
uint64_t stop = std::numeric_limits<uint16_t>::max() + 1ul;
const auto reduction_factor = GetTestReductionFactor();
const auto inv_reduction_factor = 1 / reduction_factor;
const auto stop = static_cast<uint64_t>(
std::ceil((std::numeric_limits<uint16_t>::max() + 1ul) * reduction_factor));
const auto max_batch_size =
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(Float16) + sizeof(T)), stop);
LinearAllocGuard<Float16> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(Float16)};
@@ -114,10 +117,11 @@ void CastUnaryHalfPrecisionBruteForceTest(kernel_sig<T, Float16> kernel,
const auto sub_batch_size = min_sub_batch_size + (i < tail);
thread_pool.Post([=, &values] {
auto t = v;
auto t = v * inv_reduction_factor;
uint16_t val;
for (auto j = 0u; j < sub_batch_size; ++j) {
val = static_cast<uint16_t>(t++);
val = static_cast<uint16_t>(std::floor(t));
t += inv_reduction_factor;
values.ptr()[base_idx + j] = *reinterpret_cast<Float16*>(&val);
if (std::isnan(values.ptr()[base_idx + j]) || std::isinf(values.ptr()[base_idx + j])) {
values.ptr()[base_idx + j] = 0;
@@ -178,15 +182,24 @@ void CastIntRangeTest(kernel_sig<T, TArg> kernel, ref_sig<RT, RTArg> ref_func,
const TArg a = std::numeric_limits<TArg>::lowest(),
const TArg b = std::numeric_limits<TArg>::max()) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const auto reduction_factor = GetTestReductionFactor();
const auto inv_reduction_factor = 1 / reduction_factor;
const auto max_batch_size = GetMaxAllowedDeviceMemoryUsage() / (sizeof(T) + sizeof(TArg));
LinearAllocGuard<TArg> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(TArg)};
MathTest math_test(kernel, max_batch_size);
bool running = true;
size_t inserted = 0u;
for (TArg v = a; v <= b; v++) {
values.ptr()[inserted++] = v;
if (inserted < max_batch_size) continue;
auto v = static_cast<double>(a);
while (running) {
if (std::floor(v) > b) {
running = false;
} else {
values.ptr()[inserted++] = static_cast<TArg>(std::floor(v));
v += inv_reduction_factor;
if (inserted < max_batch_size) continue;
}
math_test.Run(validator_builder, grid_size, block_size, ref_func, inserted, values.ptr());
inserted = 0u;
@@ -240,17 +253,27 @@ void CastBinaryIntRangeTest(kernel_sig<T1, T2, T2> kernel, ref_sig<T1, T2, T2> r
const T2 a = std::numeric_limits<T2>::lowest(),
const T2 b = std::numeric_limits<T2>::max()) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const auto reduction_factor = GetTestReductionFactor();
const auto inv_reduction_factor = 1 / reduction_factor;
const auto max_batch_size = GetMaxAllowedDeviceMemoryUsage() / (sizeof(T1) + 2 * sizeof(T2));
LinearAllocGuard<T2> values1{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(T2)};
LinearAllocGuard<T2> values2{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(T2)};
MathTest math_test(kernel, max_batch_size);
bool running = true;
size_t inserted = 0u;
for (T2 v = a; v <= b; v++) {
values1.ptr()[inserted] = v;
values2.ptr()[inserted++] = b - v;
if (inserted < max_batch_size) continue;
auto v = static_cast<double>(a);
while (running) {
if (std::floor(v) > b) {
running = false;
} else {
const auto t = static_cast<T2>(std::floor(v));
values1.ptr()[inserted] = t;
values2.ptr()[inserted++] = b - t;
v += inv_reduction_factor;
if (inserted < max_batch_size) continue;
}
math_test.Run(validator_builder, grid_size, block_size, ref_func, inserted, values1.ptr(),
values2.ptr());
+2 -2
Просмотреть файл
@@ -31,7 +31,7 @@ namespace cg = cooperative_groups;
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(__half2{xs[i], -xs[i]}); \
} \
}
@@ -42,7 +42,7 @@ namespace cg = cooperative_groups;
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(__half2{x1s[i], -x1s[i]}, __half2{x2s[i], -x2s[i]}); \
} \
}
+1 -1
Просмотреть файл
@@ -318,7 +318,7 @@ __global__ void __float22half2_rn_kernel(__half2* const ys, const size_t num_xs,
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
for (size_t i = tid; i < num_xs; i += stride) {
ys[i] = __float22half2_rn(make_float2(xs[i], -xs[i]));
}
}
+2 -2
Просмотреть файл
@@ -36,7 +36,7 @@ THE SOFTWARE.
TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Limited_Positive") { \
Float16 (*ref)(float) = kern_name##_ref; \
UnarySinglePrecisionRangeTest(kern_name##_kernel, ref, EqValidatorBuilderFactory<Float16>(), \
std::numeric_limits<float>::min(), 0.f); \
std::numeric_limits<float>::lowest(), 0.f); \
UnarySinglePrecisionRangeTest(kern_name##_kernel, ref, EqValidatorBuilderFactory<Float16>(), \
0.0001f, std::numeric_limits<float>::max()); \
}
@@ -48,7 +48,7 @@ THE SOFTWARE.
TEST_CASE("Unit_Device_" #kern_name "_Accuracy_Positive") { \
Float16 (*ref)(float) = kern_name##_ref; \
UnarySinglePrecisionRangeTest(kern_name##_kernel, ref, EqValidatorBuilderFactory<Float16>(), \
std::numeric_limits<float>::min(), \
std::numeric_limits<float>::lowest(), \
std::numeric_limits<float>::max()); \
}
+1 -1
Просмотреть файл
@@ -688,7 +688,7 @@ __global__ void __hiloint2double_kernel(double* const ys, const size_t num_xs, i
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
for (size_t i = tid; i < num_xs; i += stride) {
ys[i] = __hiloint2double(x1s[i], x2s[i]);
}
}
+3 -3
Просмотреть файл
@@ -33,7 +33,7 @@ THE SOFTWARE.
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(xs[i]); \
} \
}
@@ -100,7 +100,7 @@ MATH_UNARY_DP_TEST_DEF_IMPL(__dsqrt_rn, static_cast<double (*)(double)>(std::sqr
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(x1s[i], x2s[i]); \
} \
}
@@ -206,7 +206,7 @@ MATH_BINARY_DP_TEST_DEF_IMPL(__ddiv_rn, __ddiv_rn_ref, EqValidatorBuilderFactory
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(x1s[i], x2s[i], x3s[i]); \
} \
}
+3 -3
Просмотреть файл
@@ -34,7 +34,7 @@ THE SOFTWARE.
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(xs[i]); \
} \
}
@@ -59,7 +59,7 @@ THE SOFTWARE.
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(x1s[i], x2s[i]); \
} \
}
@@ -85,7 +85,7 @@ THE SOFTWARE.
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(x1s[i], x2s[i], x3s[i]); \
} \
}
+2 -2
Просмотреть файл
@@ -35,7 +35,7 @@ THE SOFTWARE.
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(xs[i]); \
} \
} \
@@ -120,7 +120,7 @@ MATH_UNARY_HP_TEST_DEF_IMPL(__hisnan2, __hisnan2_ref, EqValidatorBuilderFactory<
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(x1s[i], x2s[i]); \
} \
} \
+4 -3
Просмотреть файл
@@ -35,7 +35,8 @@ THE SOFTWARE.
* ------------------------
* - Tests the numerical accuracy of `logf(x)` for all possible inputs and `log(x)` against a
* table of difficult values, followed by a large number of randomly generated values. The results
* are compared against reference function `T std::log(T)`. The maximum ulp error is 1.
* are compared against reference function `T std::log(T)`. The maximum ulp error
* for single precision is 2 and for double precision is 1.
*
* Test source
* ------------------------
@@ -44,7 +45,7 @@ THE SOFTWARE.
* ------------------------
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_WITHIN_ULP_STL_REF_TEST_DEF(log, 1, 1)
MATH_UNARY_WITHIN_ULP_STL_REF_TEST_DEF(log, 2, 1)
/**
* Test Description
@@ -187,7 +188,7 @@ __global__ void ilogb_kernel(int* const ys, const size_t num_xs, T* const xs) {
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
for (size_t i = tid; i < num_xs; i += stride) {
if constexpr (std::is_same_v<float, T>) {
ys[i] = ilogbf(xs[i]);
} else if constexpr (std::is_same_v<double, T>) {
+77 -2
Просмотреть файл
@@ -212,10 +212,17 @@ template <typename F> auto GetOccupancyMaxPotentialBlockSize(F kernel) {
inline size_t GetMaxAllowedDeviceMemoryUsage() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
return props.totalGlobalMem * (cmd_options.accuracy_max_memory * 0.01f);
return props.totalGlobalMem > cmd_options.max_memory
? cmd_options.max_memory
: props.totalGlobalMem * (cmd_options.accuracy_max_memory * 0.01f);
}
inline uint64_t GetTestIterationCount() { return cmd_options.accuracy_iterations; }
inline double GetTestReductionFactor() { return cmd_options.reduction_factor * 0.01; }
inline uint64_t GetTestIterationCount() {
return static_cast<uint64_t>(
std::ceil(cmd_options.accuracy_iterations * GetTestReductionFactor()));
}
template <typename T, typename... Ts> using kernel_sig = void (*)(T*, const size_t, Ts*...);
@@ -254,3 +261,71 @@ template <int error_num> void NegativeTestRTCWrapper(const char* program_source)
HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
REQUIRE(error_count == expected_error_count);
}
inline void SinglePrecisionReducedRun(std::function<void(size_t)> run,
const LinearAllocGuard<float>& values, const float a,
const float b, const double reduction_factor,
const size_t max_batch_size) {
bool test_positive = true;
float positive_start = 0.0f;
float positive_end = 0.0f;
bool test_negative = true;
float negative_start = 0.0f;
float negative_end = 0.0f;
if (a < 0 && b <= 0) {
test_positive = false;
negative_start = -b;
negative_end = -a;
}
if (a < 0 && b > 0) {
positive_start = 0.0f;
positive_end = b;
negative_start = 0.0f;
negative_end = -a;
}
if (a >= 0 && b > 0) {
positive_start = a;
positive_end = b;
test_negative = false;
}
const auto inv_reduction_factor = 1 / reduction_factor;
size_t inserted = 0u;
constexpr int radix = std::numeric_limits<float>::radix;
float increment = std::numeric_limits<float>::min() * std::numeric_limits<float>::epsilon();
float limit = std::numeric_limits<float>::min() * radix;
const auto iterate = [&](float start, float end, int positive) {
for (float v = start; v < end; limit *= radix, increment *= radix) {
const auto start_v = v;
double count = 0ul;
while (v < limit && v < end) {
values.ptr()[inserted++] = (v * positive);
count += inv_reduction_factor;
v = start_v + increment * static_cast<uint64_t>(std::floor(count));
if (inserted < max_batch_size) continue;
run(inserted);
inserted = 0u;
}
}
if (inserted > 0u) {
run(inserted);
inserted = 0u;
}
};
if (test_positive) {
iterate(positive_start, positive_end, 1);
}
increment = std::numeric_limits<float>::min() * std::numeric_limits<float>::epsilon();
limit = std::numeric_limits<float>::min() * radix;
if (test_negative) {
iterate(negative_start, negative_end, -1);
}
}
+4 -4
Просмотреть файл
@@ -37,12 +37,12 @@ TEST_CASE("Unit_Device_fmax_fmaxf_Negative_RTC") { NegativeTestRTCWrapper<8>(kFm
MATH_BINARY_WITHIN_ULP_TEST_DEF(fmin, std::fmin, 0, 0)
TEST_CASE("Unit_Device_fmin_fminf_Negative_RTC") { NegativeTestRTCWrapper<8>(kFmin); }
MATH_BINARY_WITHIN_ULP_TEST_DEF(nextafter, std::nextafter, 0, 0)
MATH_BINARY_WITHIN_ULP_TEST_DEF(nextafter, std::nextafter, 1, 1)
TEST_CASE("Unit_Device_nextafter_nextafterf_Negative_RTC") {
NegativeTestRTCWrapper<8>(kNextAfter);
}
MATH_TERNARY_WITHIN_ULP_TEST_DEF(fma, std::fma, 0, 0)
MATH_TERNARY_WITHIN_ULP_TEST_DEF(fma, std::fma, 0, 1)
TEST_CASE("Unit_Device_fma_fmaf_Negative_RTC") { NegativeTestRTCWrapper<12>(kFma); }
__global__ void fdividef_kernel(float* const ys, const size_t num_xs, float* const x1s,
@@ -50,7 +50,7 @@ __global__ void fdividef_kernel(float* const ys, const size_t num_xs, float* con
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
for (size_t i = tid; i < num_xs; i += stride) {
ys[i] = fdividef(x1s[i], x2s[i]);
}
}
@@ -68,7 +68,7 @@ TEST_CASE("Unit_Device_fdividef_Negative_RTC") { NegativeTestRTCWrapper<4>(kFdiv
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = kern_name(xs[i]); \
} \
} \
+3 -3
Просмотреть файл
@@ -123,9 +123,9 @@ class Dummy {
__device__ ~Dummy() {}
};
__global__ void fdividef_kernel_v1(float* x, float y) { float result = fdividef(x, y); }
__global__ void fdividef_kernel_v2(Dummy x, float y) { float result = fdivide(x); }
__global__ void fdividef_kernel_v3(float x, float* y) { float result = fdivide(x); }
__global__ void fdividef_kernel_v4(float x, Dummy y) { float result = fdivide(x); }
__global__ void fdividef_kernel_v2(Dummy x, float y) { float result = fdividef(x, y); }
__global__ void fdividef_kernel_v3(float x, float* y) { float result = fdividef(x, y); }
__global__ void fdividef_kernel_v4(float x, Dummy y) { float result = fdividef(x, y); }
)"};
static constexpr auto kIsFinite{R"(
+1 -1
Просмотреть файл
@@ -35,7 +35,7 @@ namespace cg = cooperative_groups;
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
if constexpr (std::is_same_v<float, T1>) { \
ys[i] = func_name##f(x1s[i], x2s[i]); \
} else if constexpr (std::is_same_v<double, T1>) { \
+1 -1
Просмотреть файл
@@ -183,7 +183,7 @@ __global__ void frexp_kernel(std::pair<T, int>* const ys, const size_t num_xs, T
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
for (size_t i = tid; i < num_xs; i += stride) {
if constexpr (std::is_same_v<float, T>) {
ys[i].first = frexpf(xs[i], &ys[i].second);
} else if constexpr (std::is_same_v<double, T>) {
+1 -1
Просмотреть файл
@@ -35,7 +35,7 @@ namespace cg = cooperative_groups;
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
if constexpr (std::is_same_v<float, T>) { \
ys[i] = func_name##f(x1s[i], x2s[i], x3s[i], x4s[i]); \
} else if constexpr (std::is_same_v<double, T>) { \
+2 -2
Просмотреть файл
@@ -91,7 +91,7 @@ template <typename T> __global__ void remquo_kernel(std::pair<T, int>* const ys,
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
for (size_t i = tid; i < num_xs; i += stride) {
if constexpr (std::is_same_v<float, T>) {
ys[i].first = remquof(x1s[i], x2s[i], &ys[i].second);
} else if constexpr (std::is_same_v<double, T>) {
@@ -123,7 +123,7 @@ __global__ void modf_kernel(std::pair<T, T>* const ys, const size_t num_xs, T* c
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
for (size_t i = tid; i < num_xs; i += stride) {
if constexpr (std::is_same_v<float, T>) {
ys[i].first = modff(xs[i], &ys[i].second);
} else if constexpr (std::is_same_v<double, T>) {
+7 -7
Просмотреть файл
@@ -265,7 +265,7 @@ MATH_BINARY_KERNEL_DEF(rhypot)
* ------------------------
* - Tests the numerical accuracy of `rhypotf(x, y)` and `rhypot(x, y)`against a table of
* difficult values, followed by a large number of randomly generated values. The maximum ulp error
* for single precision is 2 and for double precision is 1.
* is 2.
*
* Test source
* ------------------------
@@ -278,7 +278,7 @@ TEMPLATE_TEST_CASE("Unit_Device_rhypot_Accuracy_Positive", "", float, double) {
using RT = RefType_t<TestType>;
auto rhypot_ref = [](RT arg1, RT arg2) -> RT { return 1. / std::hypot(arg1, arg2); };
RT (*ref)(RT, RT) = rhypot_ref;
const auto ulp = std::is_same_v<float, TestType> ? 2 : 1;
const auto ulp = std::is_same_v<float, TestType> ? 2 : 2;
BinaryFloatingPointTest(rhypot_kernel<TestType>, ref, ULPValidatorBuilderFactory<TestType>(ulp));
}
@@ -348,7 +348,7 @@ MATH_TERNARY_KERNEL_DEF(rnorm3d)
* ------------------------
* - Tests the numerical accuracy of `rnorm3df(x, y, z)` and `rnorm3d(x, y, z)`against a table of
* difficult values, followed by a large number of randomly generated values. The maximum ulp error
* for single precision is 2 and for double precision is 1.
* is 2.
*
* Test source
* ------------------------
@@ -366,7 +366,7 @@ TEMPLATE_TEST_CASE("Unit_Device_rnorm3d_Accuracy_Positive", "", float, double) {
return 1. / std::sqrt(arg1 * arg1 + arg2 * arg2 + arg3 * arg3);
};
RT (*ref)(RT, RT, RT) = rnorm3d_ref;
const auto ulp = std::is_same_v<float, TestType> ? 2 : 1;
const auto ulp = std::is_same_v<float, TestType> ? 2 : 2;
TernaryFloatingPointTest(rnorm3d_kernel<TestType>, ref,
ULPValidatorBuilderFactory<TestType>(ulp));
}
@@ -438,7 +438,7 @@ MATH_QUATERNARY_KERNEL_DEF(rnorm4d)
* ------------------------
* - Tests the numerical accuracy of `rnorm4df(x, y, z, t)` and `rnorm4d(x, y, z, t)`against a
* table of difficult values, followed by a large number of randomly generated values. The maximum
* ulp error for single precision is 2 and for double precision is 1.
* ulp error is 2.
*
* Test source
* ------------------------
@@ -456,7 +456,7 @@ TEMPLATE_TEST_CASE("Unit_Device_rnorm4d_Accuracy_Positive", "", float, double) {
return 1. / std::sqrt(arg1 * arg1 + arg2 * arg2 + arg3 * arg3 + arg4 * arg4);
};
RT (*ref)(RT, RT, RT, RT) = rnorm4d_ref;
const auto ulp = std::is_same_v<float, TestType> ? 2 : 1;
const auto ulp = std::is_same_v<float, TestType> ? 2 : 2;
QuaternaryFloatingPointTest(rnorm4d_kernel<TestType>, ref,
ULPValidatorBuilderFactory<TestType>(ulp));
}
@@ -488,7 +488,7 @@ TEST_CASE("Unit_Device_rnorm4d_rnorm4df_Negative_RTC") { NegativeTestRTCWrapper<
template <typename T, typename F, typename RF, typename ValidatorBuilder>
void NormSimpleTest(F kernel, RF ref_func, const ValidatorBuilder& validator_builder) {
const auto max_dim = 10000;
const auto max_dim = static_cast<uint64_t>(std::ceil(10000 * GetTestReductionFactor()));
LinearAllocGuard<T> x{LinearAllocs::hipHostMalloc, max_dim * sizeof(T)};
LinearAllocGuard<T> x_dev{LinearAllocs::hipMalloc, max_dim * sizeof(T)};
+3 -3
Просмотреть файл
@@ -33,7 +33,7 @@ THE SOFTWARE.
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(xs[i]); \
} \
}
@@ -359,7 +359,7 @@ MATH_UNARY_SP_TEST_DEF_IMPL(__sincosf_cos, static_cast<double (*)(double)>(std::
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(x1s[i], x2s[i]); \
} \
}
@@ -493,7 +493,7 @@ MATH_BINARY_SP_TEST_DEF(__fdividef, __fdiv_rn_ref);
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
ys[i] = func_name(x1s[i], x2s[i], x3s[i]); \
} \
}
+6 -9
Просмотреть файл
@@ -32,7 +32,7 @@ namespace cg = cooperative_groups;
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
if constexpr (std::is_same_v<float, T>) { \
ys[i] = func_name##f(n[i], xs[i]); \
} else if constexpr (std::is_same_v<double, T>) { \
@@ -96,6 +96,7 @@ void BesselSinglePrecisionRangeTest(kernel_bessel_n_sig<float> kernel,
const ValidatorBuilder& validator_builder, int n_input,
const float a, const float b) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const auto reduction_factor = GetTestReductionFactor();
const auto max_batch_size = GetMaxAllowedDeviceMemoryUsage() / (sizeof(float) * 2 + sizeof(int));
LinearAllocGuard<int> x1s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(int)};
LinearAllocGuard<float> x2s{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(float)};
@@ -103,15 +104,11 @@ void BesselSinglePrecisionRangeTest(kernel_bessel_n_sig<float> kernel,
MathTest math_test(kernel, max_batch_size);
std::fill_n(x1s.ptr(), max_batch_size, n_input);
size_t inserted = 0u;
for (float v = a; v != b; v = std::nextafter(v, b)) {
x2s.ptr()[inserted++] = v;
if (inserted < max_batch_size) continue;
const auto run = [&, gs = grid_size, bs = block_size](size_t inserted) {
math_test.Run(validator_builder, gs, bs, ref_func, inserted, x1s.ptr(), x2s.ptr());
};
math_test.Run(validator_builder, grid_size, block_size, ref_func, inserted, x1s.ptr(),
x2s.ptr());
inserted = 0u;
}
SinglePrecisionReducedRun(run, x2s, a, b, reduction_factor, max_batch_size);
}
template <typename T, typename F, typename ValidatorBuilder>
+4 -4
Просмотреть файл
@@ -103,7 +103,7 @@ MATH_UNARY_KERNEL_DEF(erfinv)
* ------------------------
* - Tests the numerical accuracy of `erfinvf(x)` for all possible inputs. The results are
* compared against reference function `double boost::math::erf_inv(double)`. The maximum ulp error
* is 2.
* is 4.
*
* Test source
* ------------------------
@@ -124,7 +124,7 @@ TEST_CASE("Unit_Device_erfinvf_Accuracy_Positive") {
return boost::math::erf_inv(arg);
};
double (*ref)(double) = erfinv_ref;
UnarySinglePrecisionTest(erfinv_kernel<float>, ref, ULPValidatorBuilderFactory<float>(2));
UnarySinglePrecisionTest(erfinv_kernel<float>, ref, ULPValidatorBuilderFactory<float>(4));
}
/**
@@ -535,9 +535,9 @@ MATH_UNARY_KERNEL_DEF(lgamma)
*/
TEST_CASE("Unit_Device_lgammaf_Accuracy_Limited_Positive") {
double (*ref)(double) = std::lgamma;
UnarySinglePrecisionRangeTest(lgamma_kernel<float>, ref, ULPValidatorBuilderFactory<float>(6),
UnarySinglePrecisionRangeTest(lgamma_kernel<float>, ref, ULPValidatorBuilderFactory<float>(7),
std::numeric_limits<float>::lowest(), -11.0001f);
UnarySinglePrecisionRangeTest(lgamma_kernel<float>, ref, ULPValidatorBuilderFactory<float>(6),
UnarySinglePrecisionRangeTest(lgamma_kernel<float>, ref, ULPValidatorBuilderFactory<float>(7),
-2.2636f, std::numeric_limits<float>::max());
}
+1 -1
Просмотреть файл
@@ -35,7 +35,7 @@ namespace cg = cooperative_groups;
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
if constexpr (std::is_same_v<float, T>) { \
ys[i] = func_name##f(x1s[i], x2s[i], x3s[i]); \
} else if constexpr (std::is_same_v<double, T>) { \
+3 -3
Просмотреть файл
@@ -36,7 +36,7 @@ TEST_CASE("Unit_Device_cos_cosf_Negative_RTC") { NegativeTestRTCWrapper<4>(kCos)
MATH_UNARY_WITHIN_ULP_TEST_DEF(tan, std::tan, 4, 2)
TEST_CASE("Unit_Device_tan_tanf_Negative_RTC") { NegativeTestRTCWrapper<4>(kTan); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(asin, std::asin, 2, 2)
MATH_UNARY_WITHIN_ULP_TEST_DEF(asin, std::asin, 3, 2)
TEST_CASE("Unit_Device_asin_asinf_Negative_RTC") { NegativeTestRTCWrapper<4>(kAsin); }
MATH_UNARY_WITHIN_ULP_TEST_DEF(acos, std::acos, 2, 2)
@@ -78,7 +78,7 @@ __global__ void sincos_kernel(std::pair<T, T>* const ys, const size_t num_xs, T*
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
for (size_t i = tid; i < num_xs; i += stride) {
if constexpr (std::is_same_v<float, T>) {
sincosf(xs[i], &ys[i].first, &ys[i].second);
} else if constexpr (std::is_same_v<double, T>) {
@@ -109,7 +109,7 @@ __global__ void sincospi_kernel(std::pair<T, T>* const ys, const size_t num_xs,
const auto tid = cg::this_grid().thread_rank();
const auto stride = cg::this_grid().size();
for (auto i = tid; i < num_xs; i += stride) {
for (size_t i = tid; i < num_xs; i += stride) {
if constexpr (std::is_same_v<float, T>) {
sincospif(xs[i], &ys[i].first, &ys[i].second);
} else if constexpr (std::is_same_v<double, T>) {
+21 -15
Просмотреть файл
@@ -34,7 +34,7 @@ namespace cg = cooperative_groups;
const auto tid = cg::this_grid().thread_rank(); \
const auto stride = cg::this_grid().size(); \
\
for (auto i = tid; i < num_xs; i += stride) { \
for (size_t i = tid; i < num_xs; i += stride) { \
if constexpr (std::is_same_v<float, T>) { \
ys[i] = func_name##f(xs[i]); \
} else if constexpr (std::is_same_v<double, T>) { \
@@ -47,7 +47,10 @@ template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
void UnaryHalfPrecisionBruteForceTest(kernel_sig<T, Float16> kernel, ref_sig<RT, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
uint64_t stop = std::numeric_limits<uint16_t>::max() + 1ul;
const auto reduction_factor = GetTestReductionFactor();
const auto inv_reduction_factor = 1 / reduction_factor;
const auto stop = static_cast<uint64_t>(
std::ceil((std::numeric_limits<uint16_t>::max() + 1ul) * reduction_factor));
const auto max_batch_size =
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(Float16) + sizeof(T)), stop);
LinearAllocGuard<Float16> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(Float16)};
@@ -68,10 +71,11 @@ void UnaryHalfPrecisionBruteForceTest(kernel_sig<T, Float16> kernel, ref_sig<RT,
const auto sub_batch_size = min_sub_batch_size + (i < tail);
thread_pool.Post([=, &values] {
auto t = v;
auto t = v * inv_reduction_factor;
uint16_t val;
for (auto j = 0u; j < sub_batch_size; ++j) {
val = static_cast<uint16_t>(t++);
val = static_cast<uint16_t>(std::floor(t));
t += inv_reduction_factor;
values.ptr()[base_idx + j] = *reinterpret_cast<Float16*>(&val);
}
});
@@ -90,7 +94,10 @@ template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>
void UnarySinglePrecisionBruteForceTest(kernel_sig<T, float> kernel, ref_sig<RT, RTArg> ref_func,
const ValidatorBuilder& validator_builder) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
uint64_t stop = std::numeric_limits<uint32_t>::max() + 1ul;
const auto reduction_factor = GetTestReductionFactor();
const auto inv_reduction_factor = 1 / reduction_factor;
const auto stop = static_cast<uint64_t>(
std::ceil((std::numeric_limits<uint32_t>::max() + 1ul) * reduction_factor));
const auto max_batch_size =
std::min(GetMaxAllowedDeviceMemoryUsage() / (sizeof(float) + sizeof(T)), stop);
LinearAllocGuard<float> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(float)};
@@ -111,10 +118,11 @@ void UnarySinglePrecisionBruteForceTest(kernel_sig<T, float> kernel, ref_sig<RT,
const auto sub_batch_size = min_sub_batch_size + (i < tail);
thread_pool.Post([=, &values] {
auto t = v;
auto t = v * inv_reduction_factor;
uint32_t val;
for (auto j = 0u; j < sub_batch_size; ++j) {
val = static_cast<uint32_t>(t++);
val = static_cast<uint32_t>(std::floor(t));
t += inv_reduction_factor;
values.ptr()[base_idx + j] = *reinterpret_cast<float*>(&val);
}
});
@@ -134,19 +142,17 @@ void UnarySinglePrecisionRangeTest(kernel_sig<T, float> kernel, ref_sig<RT, RTAr
const ValidatorBuilder& validator_builder, const float a,
const float b) {
const auto [grid_size, block_size] = GetOccupancyMaxPotentialBlockSize(kernel);
const auto reduction_factor = GetTestReductionFactor();
const auto max_batch_size = GetMaxAllowedDeviceMemoryUsage() / (sizeof(float) + sizeof(T));
LinearAllocGuard<float> values{LinearAllocs::hipHostMalloc, max_batch_size * sizeof(float)};
MathTest math_test(kernel, max_batch_size);
size_t inserted = 0u;
for (float v = a; v != b; v = std::nextafter(v, b)) {
values.ptr()[inserted++] = v;
if (inserted < max_batch_size) continue;
const auto run = [&, gs = grid_size, bs = block_size](size_t inserted) {
math_test.Run(validator_builder, gs, bs, ref_func, inserted, values.ptr());
};
math_test.Run(validator_builder, grid_size, block_size, ref_func, inserted, values.ptr());
inserted = 0u;
}
SinglePrecisionReducedRun(run, values, a, b, reduction_factor, max_batch_size);
}
template <typename T, typename RT, typename RTArg, typename ValidatorBuilder>