From efa52f77efa389e825a723b2fbea403acccc10b4 Mon Sep 17 00:00:00 2001 From: foreman Date: Mon, 2 May 2016 16:43:00 -0400 Subject: [PATCH] P4 to Git Change 1264269 by smekhano@stas-nova-hsa on 2016/05/02 16:32:25 SWDEV-77584 - HSA HLC: refactoring of min/max processing and folding 1. Fixed correctness bug: if a source contains code like (x > y) ? x : y, HLC was folding this and similar patterns to min and max instructions. The problem is with NaN handling. Such a pattern may return NaN if one of two arguments is a NaN. All our instructions return a number in this case, except for gcn instruction returning a qNaN if input is sNaN. For a qNaN a number is returned in any case. Therefore such folding is only correct if NaN handling is disabled. Patterns are predicated to work with -cl-finite-math-only or -cl-fast-relaxed-math which includes the former option. NB: Performance regressions are expected in programs which do not use either of these options. 2. Compiler lib did not handle -cl-finite-math-only. Also added handling of -cl-no-signed-zeros, even though it does not affect code generation because there is no llvm counterpart for it. 3. Patterns for NaN agnostic comparison codes are added. We get these when finite-only math is requested. 4. Removed patterns for __hsail_min_f* and __hsail_max_f*. Instead these intrinsics are lowered to fminnum and fmaxnum llvm operations with the same semantics. This decreases the number of patterns and simplifies handling. 5. For f32 we were only producing gcn versions of min and max with source patterns if gcn is enabled. Added similar lowering to standard min/max HSAIL operations if gcn is disabled. 6. Added lowering of fmaxnum/fminnum to more efficient gcn operations if gcn is enabled. Neither OpenCL nor LLVM IR semantics are violated by this. 7. Moved GCN media intrinsics definitions into the GCN directory. 8. 
Added folding of gcn f32 instructions min(max), min(min), max(max) into corresponding gcn instructions med3, min3 and max3. This should have been helpful for color clamping. Performance testing showed these are slow, however. T-Rex test from compubench has slowed down 50-fold for no obvious reason. Therefore folding is disabled by default. The option -enable-gcn-mm3 is added to enable the folding for testing purposes. Testing: smoke, precheckin, luxmark, compubench, BasemarkCL, conformance: commonfns, bruteforce -w, relationals, select Reviewed by Brian Sumner Affected files ... ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/codegen.cpp#68 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/opt_level.cpp#29 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/options.cpp#35 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/GCN/HSAILArithmetic.td#3 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/GCN/HSAILFusion.td#3 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/GCN/HSAILIntrinsics.td#4 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILArithmetic.td#45 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILFusion.td#28 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILISelDAGToDAG.cpp#68 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILISelLowering.cpp#113 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILInstrInfo.td#21 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/HSAIL/HSAILIntrinsics.td#70 edit ... //depot/stg/opencl/drivers/opencl/tests/hsa/src/llc/opt/minmax/minmaxf3pat.cl#1 add ... 
//depot/stg/opencl/drivers/opencl/tests/hsa/tlst/llc_opt.tlst#93 edit --- rocclr/compiler/lib/backends/common/codegen.cpp | 2 +- rocclr/compiler/lib/backends/common/opt_level.cpp | 2 +- rocclr/compiler/lib/utils/options.cpp | 12 ++++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/rocclr/compiler/lib/backends/common/codegen.cpp b/rocclr/compiler/lib/backends/common/codegen.cpp index 528f6fa497..739a28cb87 100644 --- a/rocclr/compiler/lib/backends/common/codegen.cpp +++ b/rocclr/compiler/lib/backends/common/codegen.cpp @@ -546,7 +546,7 @@ llvmCodeGen( OptionsObj->oVariables->EnableMAD; targetOptions.NoInfsFPMath = OptionsObj->oVariables->FiniteMathOnly; // Need to add a support for OptionsObj->oVariables->NoSignedZeros, - targetOptions.NoNaNsFPMath = OptionsObj->oVariables->FastRelaxedMath; + targetOptions.NoNaNsFPMath = OptionsObj->oVariables->FiniteMathOnly; std::auto_ptr target(TheTarget->createTargetMachine(TheTriple.getTriple(), diff --git a/rocclr/compiler/lib/backends/common/opt_level.cpp b/rocclr/compiler/lib/backends/common/opt_level.cpp index fc7af4875a..5c6500000e 100644 --- a/rocclr/compiler/lib/backends/common/opt_level.cpp +++ b/rocclr/compiler/lib/backends/common/opt_level.cpp @@ -119,7 +119,7 @@ OptLevel::run(aclBinary *elf) targetOptions.LessPreciseFPMADOption = Options()->oVariables->MadEnable || Options()->oVariables->EnableMAD; targetOptions.NoInfsFPMath = Options()->oVariables->FiniteMathOnly; - targetOptions.NoNaNsFPMath = Options()->oVariables->FastRelaxedMath; + targetOptions.NoNaNsFPMath = Options()->oVariables->FiniteMathOnly; llvm::CodeGenOpt::Level OLvl = CodeGenOpt::None; switch (Options()->oVariables->OptLevel) { diff --git a/rocclr/compiler/lib/utils/options.cpp b/rocclr/compiler/lib/utils/options.cpp index 131cc56eda..671b167299 100644 --- a/rocclr/compiler/lib/utils/options.cpp +++ b/rocclr/compiler/lib/utils/options.cpp @@ -740,6 +740,18 @@ processOption(int OptDescTableIx, Options& Opts, const std::string& 
Value, } break; + case OID_FiniteMathOnly: + Opts.setFlag(OID_FiniteMathOnly, 1); + tod = &OptDescTable[OID_FiniteMathOnly]; + (void)setOptionVariable (tod, ovars, (int64_t)1, NULL); + break; + + case OID_NoSignedZeros: + Opts.setFlag(OID_NoSignedZeros, 1); + tod = &OptDescTable[OID_NoSignedZeros]; + (void)setOptionVariable (tod, ovars, (int64_t)1, NULL); + break; + case OID_FastRelaxedMath: // -cl-fast-relaxed-math implies: // -cl-finite-math-only