added threadfence feature for hcc

1. Added feature for __threadfence and __threadfence_block 2. Added feature for using LLVM IR files directly while compilation 3. Added test for threadfence and threadfence_block Change-Id: Ib7e5d89b4cca1a135952b317e5809cd05b56a3c9 [ROCm/clr commit: 09c9953649]
2016-10-10 15:29:50 -05:00
@@ -222,6 +222,7 @@ if(HIP_PLATFORM STREQUAL "hcc")

    # Install .buildInfo
    install(FILES ${PROJECT_BINARY_DIR}/.buildInfo DESTINATION lib)
+    install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_ir.ll DESTINATION lib)
 endif()

 # Install .version
@@ -196,6 +196,11 @@ if($HIP_PLATFORM eq "hcc"){
  }
 }

+if($HIP_PLATFORM eq "hcc"){
+    $EXPORT_LL=" ";
+    $ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_ir.ll\n";
+}
+
 if($HIP_PLATFORM eq "nvcc"){
  $ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
  if($ARGV[0] eq "--genco"){
@@ -533,7 +533,8 @@ __device__ float __dsqrt_rz(double x);
 *
 * @warning __threadfence_block is a stub and map to no-op.
 */
-__device__ void  __threadfence_block(void);
+// __device__ void  __threadfence_block(void);
+extern "C" __device__ void __threadfence_block(void);

 /**
  * @brief threadfence makes wirtes visible to other threads running on same GPU.
@@ -544,7 +545,8 @@ __device__ void  __threadfence_block(void);
 *
 * @warning __threadfence is a stub and map to no-op, application should set "export HSA_DISABLE_CACHE=1" to disable both L1 and L2 caches.
 */
-__device__ void  __threadfence(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional")));
+// __device__ void  __threadfence(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional")));
+extern "C" __device__ void __threadfence(void);

 /**
 * @brief threadfence_system makes writes to pinned system memory visible on host CPU.
@@ -0,0 +1,15 @@
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "amdgcn--amdhsa"
+
+
+define void @__threadfence() #1 {
+    fence syncscope(2) seq_cst
+    ret void
+}
+
+define void @__threadfence_block()  #1 {
+    fence syncscope(3) seq_cst
+    ret void
+}
+
+attributes #1 = { alwaysinline nounwind }
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+#include<hip/hip_runtime_api.h>
+#include<hip/hip_runtime.h>
+
+#define NUM 1024
+#define SIZE NUM*sizeof(float)
+
+__global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
+{
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    In4[tid] = In1[tid] + In2[tid];
+    __threadfence();
+    In3[tid] = In3[tid] + In4[tid];
+    __threadfence_block();
+    Out[tid] = In4[tid] + In3[tid];
+
+}
+
+int main(){
+    float *In1 = new float[1024];
+    float *In2 = new float[1024];
+    float *In3 = new float[1024];
+    float *In4 = new float[1024];
+    float *Out = new float[1024];
+
+    for(uint32_t i=0;i<1024;i++)
+    {
+        In1[i] = 1.0f;
+        In2[i] = 1.0f;
+        In3[i] = 1.0f;
+        In4[i] = 1.0f;
+    }
+
+    float *In1d, *In2d, *In3d, *In4d, *Outd;
+    hipMalloc((void**)&In1d, SIZE);
+    hipMalloc((void**)&In2d, SIZE);
+    hipMalloc((void**)&In3d, SIZE);
+    hipMalloc((void**)&In4d, SIZE);
+    hipMalloc((void**)&Outd, SIZE);
+
+    hipMemcpy(In1d, In1, SIZE, hipMemcpyHostToDevice);
+    hipMemcpy(In2d, In2, SIZE, hipMemcpyHostToDevice);
+    hipMemcpy(In3d, In3, SIZE, hipMemcpyHostToDevice);
+    hipMemcpy(In4d, In4, SIZE, hipMemcpyHostToDevice);
+
+    hipLaunchKernel(vAdd, dim3(32,1,1), dim3(32,1,1), 0, 0, In1d, In2d, In3d, In4d, Outd);
+    hipMemcpy(Out, Outd, SIZE, hipMemcpyDeviceToHost);
+    assert(Out[10] == 2*In1[10] + 2*In2[10] + In3[10]);
+
+}