Merge pull request #138 from scchan/threadfence_system

implement __threadfence_system
2017-08-10 12:51:05 +05:30
@@ -130,7 +130,7 @@ extern int HIP_TRACE_API;
 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__          (0)

 //sync
-#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__        (0)
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__        (1)
 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__            (0)

 // misc
@@ -1264,7 +1264,7 @@ __device__ double __hip_fast_dsqrt_rz(double x) {
 }

 __device__ void  __threadfence_system(void){
-    // no-op
+  std::atomic_thread_fence(std::memory_order_seq_cst);
 }

 double __hip_host_j0(double x)
@@ -0,0 +1,128 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * RUN: %t
+ * HIT_END
+ */
+
+#include <thread>
+#include <cassert>
+#include <cstdio>
+#include "hip/hip_runtime.h"
+#include "hip/device_functions.h"
+#include "test_common.h"
+
+
+#define HIP_ASSERT(x) (assert((x)==hipSuccess))
+
+__host__ void fence_system() {
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+}
+
+__device__ void fence_system() {
+  __threadfence_system();
+}
+
+__host__ __device__ void round_robin(const int id, const int num_dev, const int num_iter, volatile int* data, volatile int* flag) {
+  for (int i = 0; i < num_iter; i++) {
+    while(*flag%num_dev != id)
+      fence_system();  // invalid the cache for read
+
+    (*data)++;
+    fence_system();    // make sure the store to data is sequenced before the store to flag
+    (*flag)++;
+    fence_system();    // invalid the cache to flush out flag
+  }
+}
+
+__global__ void gpu_round_robin(const int id, const int num_dev, const int num_iter, volatile int* data, volatile int* flag) {
+  round_robin(id, num_dev, num_iter, data, flag); 
+}
+
+int main() {
+
+  int num_gpus = 0;
+  HIP_ASSERT(hipGetDeviceCount(&num_gpus));
+  if (num_gpus == 0) {
+    passed();
+    return 0;
+  }
+
+  volatile int* data;
+  HIP_ASSERT(hipHostMalloc(&data, sizeof(int), hipHostMallocCoherent));
+  constexpr int init_data = 1000;
+  *data = init_data;
+
+  volatile int* flag;
+  HIP_ASSERT(hipHostMalloc(&flag, sizeof(int), hipHostMallocCoherent));
+  *flag = 0;
+
+  // number of rounds per device
+  constexpr int num_iter = 1000;
+
+  // one CPU thread + 1 kernel/GPU
+  const int num_dev = num_gpus + 1;
+
+  int next_id = 0;
+  std::vector<std::thread> threads;
+
+  // create a CPU thread for the round_robin
+  threads.push_back(std::thread(round_robin, next_id++, num_dev, num_iter, data, flag));
+
+  // run one thread per GPU
+  dim3 dim_block(1,1,1);
+  dim3 dim_grid(1,1,1);
+
+  // launch one kernel per device for the round robin
+  for (; next_id < num_dev; ++next_id) {
+    threads.push_back(std::thread([=]() {
+      HIP_ASSERT(hipSetDevice(next_id-1));
+      hipLaunchKernelGGL(gpu_round_robin, dim_grid, dim_block, 0, 0x0
+                          , next_id, num_dev, num_iter, data, flag);
+      HIP_ASSERT(hipDeviceSynchronize());
+    }));
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  int expected_data = init_data + num_dev * num_iter;
+  int expected_flag = num_dev * num_iter;
+
+  bool passed = *data == expected_data 
+                && *flag == expected_flag;
+
+  HIP_ASSERT(hipHostFree((void*)data));
+  HIP_ASSERT(hipHostFree((void*)flag));
+
+  if (passed) {
+    passed();
+  }
+  else {
+    failed("Failed Verification!\n");
+  }
+
+  return 0;
+}