SWDEV-389689 - Rework oversubscription test to use threads instead of process. Also consider system memory when oversubscribing (#317)

Change-Id: If063552e9e2815f07e944259237310f6fef37ad5 [ROCm/hip-tests commit: 329a350ec0]
2023-06-28 12:47:48 +05:30
parent 16eee59975
commit be8a0d7d2c
@@ -55,7 +55,6 @@
 	   "Disabling test tracked SWDEV-391718",
 	   "Unit_hipMemRangeGetAttribute_TstCountParam",
 	   "Fails in Stress test SWDEV-398971",
-	   "Unit_HMM_OverSubscriptionTst",
 	   "SWDEV-398975 Seg faults in stress test",
 	   "Unit_hipMemcpyWithStream_MultiThread",
 	   "SWDEV-398977 fails in stress tests",
@@ -31,6 +31,8 @@ THE SOFTWARE.
 #include <random>
 #include <fstream>
 #include <streambuf>
+#include <thread>
+#include <future>

 namespace hip {
 /*
@@ -46,6 +48,7 @@ class SpawnProc {
  std::string exeName;
  std::string resultStr;
  std::string tmpFileName;
+  std::future<int> ret_from_run;
  bool captureOutput;

  std::string getRandomString(size_t len = 6) {
@@ -68,7 +71,7 @@ class SpawnProc {
    exeName = dir.string();
    // On Windows, fs::exists returns false without extension.
    if (TestContext::get().isWindows()) {
-      if(fs::path(exeName).extension().empty()) {
+      if (fs::path(exeName).extension().empty()) {
        exeName += ".exe";
      }
    }
@@ -112,6 +115,15 @@ class SpawnProc {
 #endif
  }

+  void run_async(std::string commandLineArgs = "") {
+    ret_from_run = std::async(std::launch::async, &hip::SpawnProc::run, this, commandLineArgs);
+  }
+
+  int wait() {
+    ret_from_run.wait();
+    return ret_from_run.get();
+  }
+
  std::string getOutput() { return resultStr; }
 };
 }  // namespace hip
@@ -7,6 +7,13 @@ set(TEST_SRC
    hipHostMallocStress.cc
 )

+if(UNIX)
+  set(TEST_SRC ${TEST_SRC}
+               hipHmmOvrSubscriptionTst.cc)
+  add_executable(hold_memory EXCLUDE_FROM_ALL hold_memory.cc)
+  add_dependencies(stress_test hold_memory)
+endif()
+
 hip_add_exe_to_target(NAME memory_stress
                      TEST_SRC ${TEST_SRC}
                      TEST_TARGET_NAME stress_test)
@@ -0,0 +1,114 @@
+/*
+Copyright (c) 2021-Present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* Test Case Description: This test case tests the working of OverSubscription
+   feature which is part of HMM.*/
+
+#include <hip_test_common.hh>
+#include <hip_test_helper.hh>
+#include <hip_test_process.hh>
+
+__global__ void floatx2(float* ptr, size_t size) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    ptr[i] *= 2;
+  }
+}
+
+TEST_CASE("Stress_HMM_OverSubscriptionTst") {
+  int hmm = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&hmm, hipDeviceAttributeManagedMemory, 0));
+
+  bool shouldRun = []() -> bool {
+#if HT_AMD  // For AMD this gcn arch needs to have xnack+
+    int device = 0;
+    hipDeviceProp_t props{};
+    HIP_CHECK(hipGetDevice(&device));
+    HIP_CHECK(hipGetDeviceProperties(&props, device));
+    std::string arch(props.gcnArchName);
+    return arch.find("xnack+") != std::string::npos;
+#else  // For CUDA this depends on SM and attribute check should be fine
+    return true;
+#endif
+  }();
+
+  if (hmm == 1 && shouldRun) {
+    hip::SpawnProc proc("hold_memory", true);
+    proc.run_async();
+    size_t freeMem, totalMem;
+    HIP_CHECK(hipMemGetInfo(&freeMem, &totalMem));
+
+    constexpr float oversub_factor = 1.2f;
+    auto system_ram = HipTest::getMemoryAmount();  // In MB
+
+    // Take in account of system memory
+    size_t max_memory = std::min(freeMem / (1024 * 1024), system_ram);
+
+    size_t max_mem_used = (max_memory * oversub_factor) / 1024;  // GB
+
+    auto OneGBTest = []() {
+      constexpr size_t oneGB = 1024 * 1024 * 1024;
+
+      hipStream_t stream;
+      HIP_CHECK_THREAD(hipStreamCreate(&stream));
+
+      float* data;
+      constexpr size_t alloc_elem = oneGB / sizeof(float);
+      HIP_CHECK_THREAD(hipMallocManaged(&data, oneGB, hipMemAttachGlobal));
+
+      constexpr float init_val = 1.1f;
+
+      std::for_each(data, data + alloc_elem, [](float& a) { a = init_val; });
+
+      // basic sanity - first and last val are same
+      REQUIRE_THREAD(data[0] == init_val);
+      REQUIRE_THREAD(data[alloc_elem - 1] == init_val);
+
+      // Page migrated to GPU
+      floatx2<<<(alloc_elem / 256) + 1, 256, 0, stream>>>(data, alloc_elem);
+
+      HIP_CHECK_THREAD(hipStreamSynchronize(stream));
+
+      // Back to host
+      REQUIRE_THREAD(
+          std::all_of(data, data + alloc_elem, [](float a) { return a == (2.0f * init_val); }));
+
+      HIP_CHECK_THREAD(hipFree(data));
+      HIP_CHECK_THREAD(hipStreamDestroy(stream));
+    };
+
+    std::vector<std::thread> thread_pool;
+    thread_pool.reserve(max_mem_used);
+
+    for (size_t i = 0; i < max_mem_used; i++) {
+      thread_pool.emplace_back(std::thread(OneGBTest));
+    }
+
+    std::for_each(thread_pool.begin(), thread_pool.end(),
+                  [](std::thread& thread) { thread.join(); });
+
+    HIP_CHECK_THREAD_FINALIZE();
+    REQUIRE(proc.wait() == 0);
+  } else {
+    HipTest::HIP_SKIP_TEST("Tests only supposed to run on xnack+ devices");
+  }
+}
@@ -66,7 +66,7 @@ static void ReleaseResource(int *Hmm, hipStream_t *strm) {
 /* The following test allocates a managed memory and prefetch it in
   one-to-all and all-to-one fahsion followed by kernel launch within available
   devices*/
-TEST_CASE("Unit_hipMemPrefetchAsyncOneToAll") {
+TEST_CASE("Stress_hipMemPrefetchAsyncOneToAll") {
  int MangdMem = HmmAttrPrint();
  if (MangdMem == 1) {
    int *Hmm1 = nullptr, NumDevs, MemSz = (4096 * 4);
@@ -0,0 +1,45 @@
+/*
+   Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+ */
+
+#include <hip/hip_runtime.h>
+#include <iostream>
+#include <chrono>
+#include <thread>
+
+#define HIP_CHECK(call)                                                                            \
+  {                                                                                                \
+    auto res_ = (call);                                                                            \
+    if (res_ != hipSuccess) {                                                                      \
+      std::cout << "Failed in: " << #call << std::endl;                                            \
+      return -1;                                                                                   \
+    }                                                                                              \
+  }
+
+int main() {
+  size_t freeMem = 0, totalMem = 0;
+  HIP_CHECK(hipMemGetInfo(&freeMem, &totalMem));
+
+  void* ptr;
+  HIP_CHECK(hipMalloc(&ptr, 0.4 * totalMem));  // hold 40% of total gpu memory
+  std::cout << "Sleeping..." << std::endl;
+  std::this_thread::sleep_for(
+      std::chrono::seconds(4));  //  sleep for few seconds till test complete
+  std::cout << "Waking up..." << std::endl;
+  HIP_CHECK(hipFree(ptr));
+}
@@ -120,13 +120,6 @@ else()
  set(TEST_SRC ${TEST_SRC} hipGetSymbolSizeAddress.cc)
 endif()

-# skipped due to os related code in tests need to work on them when all the
-# tests are enabled
-if(UNIX)
-  set(TEST_SRC ${TEST_SRC} hipHmmOvrSubscriptionTst.cc
-               hipMemoryAllocateCoherent.cc)
-endif()
-
 hip_add_exe_to_target(NAME MemoryTest
  TEST_SRC ${TEST_SRC}
  TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
@@ -1,221 +0,0 @@
-/*
-Copyright (c) 2021-Present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* Test Case Description: This test case tests the working of OverSubscription
-   feature which is part of HMM.*/
-
-#include <hip_test_common.hh>
-#ifdef __linux__
-#include <sys/types.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <sys/stat.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/wait.h>
-#endif
-#include <list>
-
-#define INIT_VAL 2.5
-#define NUM_ELMS 268435456  // 268435456 * 4 = 1GB
-#define ITERATIONS 10
-#define ONE_GB 1024 * 1024 * 1024
-
-static void GetTotGpuMem(int *TotMem);
-static void DisplayHmmFlgs(int *Signal);
-// Kernel function
-__global__ void Square(int n, float *x) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = index; i < n; i += stride) {
-    x[i] = x[i] + 10;
-  }
-}
-
-static void OneGBMemTest(int dev) {
-  int DataMismatch = 0;
-  float *HmmAG = nullptr;
-  hipStream_t strm;
-  HIP_CHECK(hipStreamCreate(&strm));
-  // Testing hipMemAttachGlobal Flag
-  HIP_CHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float),
-                            hipMemAttachGlobal));
-
-  // Initializing HmmAG memory
-  for (int i = 0; i < NUM_ELMS; i++) {
-    HmmAG[i] = INIT_VAL;
-  }
-
-  int blockSize = 256;
-  int numBlocks = (NUM_ELMS + blockSize - 1) / blockSize;
-  dim3 dimGrid(numBlocks, 1, 1);
-  dim3 dimBlock(blockSize, 1, 1);
-  HIP_CHECK(hipSetDevice(dev));
-  for (int i = 0; i < ITERATIONS; ++i) {
-    Square<<<dimGrid, dimBlock, 0, strm>>>(NUM_ELMS, HmmAG);
-  }
-  HIP_CHECK(hipStreamSynchronize(strm));
-  for (int j = 0; j < NUM_ELMS; ++j) {
-    if (HmmAG[j] != (INIT_VAL + ITERATIONS * 10)) {
-      DataMismatch++;
-      break;
-    }
-  }
-  if (DataMismatch != 0) {
-    WARN("Data Mismatch observed when kernel launched on device: " << dev);
-    REQUIRE(false);
-  }
-  HIP_CHECK(hipFree(HmmAG));
-  HIP_CHECK(hipStreamDestroy(strm));
-}
-
-static void GetTotGpuMem(int *TotMem) {
-  size_t FreeMem, TotGpuMem;
-  HIP_CHECK(hipMemGetInfo(&FreeMem, &TotGpuMem));
-  TotMem[0] = (TotGpuMem/(ONE_GB));
-  TotMem[1] = 1;
-}
-
-static void DisplayHmmFlgs(int *Signal) {
-  int managed = 0;
-  WARN("The following are the attribute values related to HMM for"
-         " device 0:\n");
-  HIP_CHECK(hipDeviceGetAttribute(&managed,
-                    hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
-  WARN("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
-  HIP_CHECK(hipDeviceGetAttribute(&managed,
-                                 hipDeviceAttributeConcurrentManagedAccess, 0));
-  WARN("hipDeviceAttributeConcurrentManagedAccess: " << managed);
-  HIP_CHECK(hipDeviceGetAttribute(&managed,
-                                 hipDeviceAttributePageableMemoryAccess, 0));
-  WARN("hipDeviceAttributePageableMemoryAccess: " << managed);
-  HIP_CHECK(hipDeviceGetAttribute(&managed,
-              hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
-  WARN("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:"
-          << managed);
-
-  HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory,
-                                  0));
-  WARN("hipDeviceAttributeManagedMemory: " << managed);
-
-  // Checking for Vega20 or MI100
-  hipDeviceProp_t prop;
-  HIP_CHECK(hipGetDeviceProperties(&prop, 0));
-  char *p = NULL;
-  p = strstr(prop.gcnArchName, "gfx906");
-  if (p) {
-    WARN("This system has MI60 gpu hence OverSubscription test will be");
-    WARN(" skipped");
-    Signal[2] = 1;
-  }
-  p = strstr(prop.gcnArchName, "gfx908");
-  if (p) {
-    WARN("This system has MI100 gpu hence OverSubscription test will be");
-    WARN(" skipped");
-    Signal[2] = 1;
-  }
-  Signal[1] = managed;
-  Signal[0] = 1;
-}
-
-TEST_CASE("Unit_HMM_OverSubscriptionTst") {
-  hipDeviceProp_t prop;
-  HIP_CHECK(hipGetDeviceProperties(&prop, 0));
-  char *p = nullptr;
-  p = strstr(prop.gcnArchName, "xnack+");
-  if (p == nullptr) {
-    INFO("Skipped due current device is non xnack device.");
-    return;
-  }
-  int HmmEnabled = 0;
-  // The following Shared Mem is to get Max GPU Mem
-  // The size requested is for three ints
-  // 1) To get Max GPU Mem in GB
-  // 2) To Signal parent that req. info is available to consume
-  // 3) To know if MI60 or MI100 gpu are there in the system
-  key_t key = ftok("shmTotMem", 66);
-  int shmid = shmget(key, (3 * sizeof(int)), 0666|IPC_CREAT);
-  int *TotGpuMem = reinterpret_cast<int*>(shmat(shmid, NULL, 0));
-  TotGpuMem[0] = 0; TotGpuMem[1] = 0;
-  // The following function DisplayHmmFlgs() displays the flag values related
-  // to HMM and also sends us ManagedMemory attribute value
-  if (fork() == 0) {
-    DisplayHmmFlgs(TotGpuMem);
-    exit(1);
-  }
-  while (TotGpuMem[0] == 0) {
-      sleep(2);
-  }
-  // The following if block will skip test if either of MI60 or MI100 is found
-  if (TotGpuMem[2] == 1) {
-    SUCCEED("Test is skipped!!");
-    REQUIRE(true);
-  } else {
-    HmmEnabled = TotGpuMem[1];
-
-    // Re-setting the shared memory values for further usage
-    TotGpuMem[0] = 0;
-    TotGpuMem[1] = 0;
-
-    std::list<pid_t> PidLst;
-    // The following function gets the MaxGpu memory in GBs and also launches
-    // OverSubscription test
-    if (HmmEnabled) {
-      if ((setenv("HSA_XNACK", "1", 1)) != 0) {
-        WARN("Unable to turn on HSA_XNACK, hence terminating the Test case!");
-        REQUIRE(false);
-      }
-      if (fork() == 0) {
-        GetTotGpuMem(TotGpuMem);
-      }
-      while (TotGpuMem[1] == 0) {
-        sleep(2);
-      }
-      int NumGB = TotGpuMem[0], TotalThreads = (NumGB + 10);
-      WARN("Launching " << TotalThreads);
-      WARN(" processes to test OverSubscription.");
-      pid_t pid;
-      for (int k = 0; k < TotalThreads; ++k) {
-        pid = fork();
-        PidLst.push_back(pid);
-        if (pid == 0) {
-          OneGBMemTest(0);
-          exit(10);
-        }
-      }
-    } else {
-      SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
-              "attribute. Hence skipping the testing with Pass result.\n");
-    }
-    int status;
-    for (pid_t pd : PidLst) {
-      waitpid(pd, &status, 0);
-      if (!(WIFEXITED(status))) {
-        REQUIRE(false);
-      }
-    }
-  }
-  shmdt(TotGpuMem);
-  shmctl(shmid, IPC_RMID, NULL);
-}