SWDEV-389689 - Rework oversubscription test to use threads instead of process. Also consider system memory when oversubscribing (#317)
Change-Id: If063552e9e2815f07e944259237310f6fef37ad5
[ROCm/hip-tests commit: 329a350ec0]
This commit is contained in:
committato da
GitHub
parent
16eee59975
commit
be8a0d7d2c
@@ -55,7 +55,6 @@
|
||||
"Disabling test tracked SWDEV-391718",
|
||||
"Unit_hipMemRangeGetAttribute_TstCountParam",
|
||||
"Fails in Stress test SWDEV-398971",
|
||||
"Unit_HMM_OverSubscriptionTst",
|
||||
"SWDEV-398975 Seg faults in stress test",
|
||||
"Unit_hipMemcpyWithStream_MultiThread",
|
||||
"SWDEV-398977 fails in stress tests",
|
||||
|
||||
@@ -31,6 +31,8 @@ THE SOFTWARE.
|
||||
#include <random>
|
||||
#include <fstream>
|
||||
#include <streambuf>
|
||||
#include <thread>
|
||||
#include <future>
|
||||
|
||||
namespace hip {
|
||||
/*
|
||||
@@ -46,6 +48,7 @@ class SpawnProc {
|
||||
std::string exeName;
|
||||
std::string resultStr;
|
||||
std::string tmpFileName;
|
||||
std::future<int> ret_from_run;
|
||||
bool captureOutput;
|
||||
|
||||
std::string getRandomString(size_t len = 6) {
|
||||
@@ -68,7 +71,7 @@ class SpawnProc {
|
||||
exeName = dir.string();
|
||||
// On Windows, fs::exists returns false without extension.
|
||||
if (TestContext::get().isWindows()) {
|
||||
if(fs::path(exeName).extension().empty()) {
|
||||
if (fs::path(exeName).extension().empty()) {
|
||||
exeName += ".exe";
|
||||
}
|
||||
}
|
||||
@@ -112,6 +115,15 @@ class SpawnProc {
|
||||
#endif
|
||||
}
|
||||
|
||||
void run_async(std::string commandLineArgs = "") {
|
||||
ret_from_run = std::async(std::launch::async, &hip::SpawnProc::run, this, commandLineArgs);
|
||||
}
|
||||
|
||||
int wait() {
|
||||
ret_from_run.wait();
|
||||
return ret_from_run.get();
|
||||
}
|
||||
|
||||
std::string getOutput() { return resultStr; }
|
||||
};
|
||||
} // namespace hip
|
||||
|
||||
@@ -7,6 +7,13 @@ set(TEST_SRC
|
||||
hipHostMallocStress.cc
|
||||
)
|
||||
|
||||
if(UNIX)
|
||||
set(TEST_SRC ${TEST_SRC}
|
||||
hipHmmOvrSubscriptionTst.cc)
|
||||
add_executable(hold_memory EXCLUDE_FROM_ALL hold_memory.cc)
|
||||
add_dependencies(stress_test hold_memory)
|
||||
endif()
|
||||
|
||||
hip_add_exe_to_target(NAME memory_stress
|
||||
TEST_SRC ${TEST_SRC}
|
||||
TEST_TARGET_NAME stress_test)
|
||||
|
||||
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
Copyright (c) 2021-Present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Test Case Description: This test case tests the working of OverSubscription
|
||||
feature which is part of HMM.*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_helper.hh>
|
||||
#include <hip_test_process.hh>
|
||||
|
||||
__global__ void floatx2(float* ptr, size_t size) {
|
||||
auto i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (i < size) {
|
||||
ptr[i] *= 2;
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Stress_HMM_OverSubscriptionTst") {
|
||||
int hmm = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&hmm, hipDeviceAttributeManagedMemory, 0));
|
||||
|
||||
bool shouldRun = []() -> bool {
|
||||
#if HT_AMD // For AMD this gcn arch needs to have xnack+
|
||||
int device = 0;
|
||||
hipDeviceProp_t props{};
|
||||
HIP_CHECK(hipGetDevice(&device));
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, device));
|
||||
std::string arch(props.gcnArchName);
|
||||
return arch.find("xnack+") != std::string::npos;
|
||||
#else // For CUDA this depends on SM and attribute check should be fine
|
||||
return true;
|
||||
#endif
|
||||
}();
|
||||
|
||||
if (hmm == 1 && shouldRun) {
|
||||
hip::SpawnProc proc("hold_memory", true);
|
||||
proc.run_async();
|
||||
size_t freeMem, totalMem;
|
||||
HIP_CHECK(hipMemGetInfo(&freeMem, &totalMem));
|
||||
|
||||
constexpr float oversub_factor = 1.2f;
|
||||
auto system_ram = HipTest::getMemoryAmount(); // In MB
|
||||
|
||||
// Take in account of system memory
|
||||
size_t max_memory = std::min(freeMem / (1024 * 1024), system_ram);
|
||||
|
||||
size_t max_mem_used = (max_memory * oversub_factor) / 1024; // GB
|
||||
|
||||
auto OneGBTest = []() {
|
||||
constexpr size_t oneGB = 1024 * 1024 * 1024;
|
||||
|
||||
hipStream_t stream;
|
||||
HIP_CHECK_THREAD(hipStreamCreate(&stream));
|
||||
|
||||
float* data;
|
||||
constexpr size_t alloc_elem = oneGB / sizeof(float);
|
||||
HIP_CHECK_THREAD(hipMallocManaged(&data, oneGB, hipMemAttachGlobal));
|
||||
|
||||
constexpr float init_val = 1.1f;
|
||||
|
||||
std::for_each(data, data + alloc_elem, [](float& a) { a = init_val; });
|
||||
|
||||
// basic sanity - first and last val are same
|
||||
REQUIRE_THREAD(data[0] == init_val);
|
||||
REQUIRE_THREAD(data[alloc_elem - 1] == init_val);
|
||||
|
||||
// Page migrated to GPU
|
||||
floatx2<<<(alloc_elem / 256) + 1, 256, 0, stream>>>(data, alloc_elem);
|
||||
|
||||
HIP_CHECK_THREAD(hipStreamSynchronize(stream));
|
||||
|
||||
// Back to host
|
||||
REQUIRE_THREAD(
|
||||
std::all_of(data, data + alloc_elem, [](float a) { return a == (2.0f * init_val); }));
|
||||
|
||||
HIP_CHECK_THREAD(hipFree(data));
|
||||
HIP_CHECK_THREAD(hipStreamDestroy(stream));
|
||||
};
|
||||
|
||||
std::vector<std::thread> thread_pool;
|
||||
thread_pool.reserve(max_mem_used);
|
||||
|
||||
for (size_t i = 0; i < max_mem_used; i++) {
|
||||
thread_pool.emplace_back(std::thread(OneGBTest));
|
||||
}
|
||||
|
||||
std::for_each(thread_pool.begin(), thread_pool.end(),
|
||||
[](std::thread& thread) { thread.join(); });
|
||||
|
||||
HIP_CHECK_THREAD_FINALIZE();
|
||||
REQUIRE(proc.wait() == 0);
|
||||
} else {
|
||||
HipTest::HIP_SKIP_TEST("Tests only supposed to run on xnack+ devices");
|
||||
}
|
||||
}
|
||||
@@ -66,7 +66,7 @@ static void ReleaseResource(int *Hmm, hipStream_t *strm) {
|
||||
/* The following test allocates a managed memory and prefetch it in
|
||||
one-to-all and all-to-one fahsion followed by kernel launch within available
|
||||
devices*/
|
||||
TEST_CASE("Unit_hipMemPrefetchAsyncOneToAll") {
|
||||
TEST_CASE("Stress_hipMemPrefetchAsyncOneToAll") {
|
||||
int MangdMem = HmmAttrPrint();
|
||||
if (MangdMem == 1) {
|
||||
int *Hmm1 = nullptr, NumDevs, MemSz = (4096 * 4);
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
|
||||
#define HIP_CHECK(call) \
|
||||
{ \
|
||||
auto res_ = (call); \
|
||||
if (res_ != hipSuccess) { \
|
||||
std::cout << "Failed in: " << #call << std::endl; \
|
||||
return -1; \
|
||||
} \
|
||||
}
|
||||
|
||||
int main() {
|
||||
size_t freeMem = 0, totalMem = 0;
|
||||
HIP_CHECK(hipMemGetInfo(&freeMem, &totalMem));
|
||||
|
||||
void* ptr;
|
||||
HIP_CHECK(hipMalloc(&ptr, 0.4 * totalMem)); // hold 40% of total gpu memory
|
||||
std::cout << "Sleeping..." << std::endl;
|
||||
std::this_thread::sleep_for(
|
||||
std::chrono::seconds(4)); // sleep for few seconds till test complete
|
||||
std::cout << "Waking up..." << std::endl;
|
||||
HIP_CHECK(hipFree(ptr));
|
||||
}
|
||||
@@ -120,13 +120,6 @@ else()
|
||||
set(TEST_SRC ${TEST_SRC} hipGetSymbolSizeAddress.cc)
|
||||
endif()
|
||||
|
||||
# skipped due to os related code in tests need to work on them when all the
|
||||
# tests are enabled
|
||||
if(UNIX)
|
||||
set(TEST_SRC ${TEST_SRC} hipHmmOvrSubscriptionTst.cc
|
||||
hipMemoryAllocateCoherent.cc)
|
||||
endif()
|
||||
|
||||
hip_add_exe_to_target(NAME MemoryTest
|
||||
TEST_SRC ${TEST_SRC}
|
||||
TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
|
||||
|
||||
@@ -1,221 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2021-Present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Test Case Description: This test case tests the working of OverSubscription
|
||||
feature which is part of HMM.*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#ifdef __linux__
|
||||
#include <sys/types.h>
|
||||
#include <sys/ipc.h>
|
||||
#include <sys/shm.h>
|
||||
#include <sys/stat.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/wait.h>
|
||||
#endif
|
||||
#include <list>
|
||||
|
||||
#define INIT_VAL 2.5
|
||||
#define NUM_ELMS 268435456 // 268435456 * 4 = 1GB
|
||||
#define ITERATIONS 10
|
||||
#define ONE_GB 1024 * 1024 * 1024
|
||||
|
||||
static void GetTotGpuMem(int *TotMem);
|
||||
static void DisplayHmmFlgs(int *Signal);
|
||||
// Kernel function
|
||||
__global__ void Square(int n, float *x) {
|
||||
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (int i = index; i < n; i += stride) {
|
||||
x[i] = x[i] + 10;
|
||||
}
|
||||
}
|
||||
|
||||
static void OneGBMemTest(int dev) {
|
||||
int DataMismatch = 0;
|
||||
float *HmmAG = nullptr;
|
||||
hipStream_t strm;
|
||||
HIP_CHECK(hipStreamCreate(&strm));
|
||||
// Testing hipMemAttachGlobal Flag
|
||||
HIP_CHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float),
|
||||
hipMemAttachGlobal));
|
||||
|
||||
// Initializing HmmAG memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
HmmAG[i] = INIT_VAL;
|
||||
}
|
||||
|
||||
int blockSize = 256;
|
||||
int numBlocks = (NUM_ELMS + blockSize - 1) / blockSize;
|
||||
dim3 dimGrid(numBlocks, 1, 1);
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
for (int i = 0; i < ITERATIONS; ++i) {
|
||||
Square<<<dimGrid, dimBlock, 0, strm>>>(NUM_ELMS, HmmAG);
|
||||
}
|
||||
HIP_CHECK(hipStreamSynchronize(strm));
|
||||
for (int j = 0; j < NUM_ELMS; ++j) {
|
||||
if (HmmAG[j] != (INIT_VAL + ITERATIONS * 10)) {
|
||||
DataMismatch++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (DataMismatch != 0) {
|
||||
WARN("Data Mismatch observed when kernel launched on device: " << dev);
|
||||
REQUIRE(false);
|
||||
}
|
||||
HIP_CHECK(hipFree(HmmAG));
|
||||
HIP_CHECK(hipStreamDestroy(strm));
|
||||
}
|
||||
|
||||
static void GetTotGpuMem(int *TotMem) {
|
||||
size_t FreeMem, TotGpuMem;
|
||||
HIP_CHECK(hipMemGetInfo(&FreeMem, &TotGpuMem));
|
||||
TotMem[0] = (TotGpuMem/(ONE_GB));
|
||||
TotMem[1] = 1;
|
||||
}
|
||||
|
||||
static void DisplayHmmFlgs(int *Signal) {
|
||||
int managed = 0;
|
||||
WARN("The following are the attribute values related to HMM for"
|
||||
" device 0:\n");
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
|
||||
WARN("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributeConcurrentManagedAccess, 0));
|
||||
WARN("hipDeviceAttributeConcurrentManagedAccess: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributePageableMemoryAccess, 0));
|
||||
WARN("hipDeviceAttributePageableMemoryAccess: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
|
||||
WARN("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:"
|
||||
<< managed);
|
||||
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory,
|
||||
0));
|
||||
WARN("hipDeviceAttributeManagedMemory: " << managed);
|
||||
|
||||
// Checking for Vega20 or MI100
|
||||
hipDeviceProp_t prop;
|
||||
HIP_CHECK(hipGetDeviceProperties(&prop, 0));
|
||||
char *p = NULL;
|
||||
p = strstr(prop.gcnArchName, "gfx906");
|
||||
if (p) {
|
||||
WARN("This system has MI60 gpu hence OverSubscription test will be");
|
||||
WARN(" skipped");
|
||||
Signal[2] = 1;
|
||||
}
|
||||
p = strstr(prop.gcnArchName, "gfx908");
|
||||
if (p) {
|
||||
WARN("This system has MI100 gpu hence OverSubscription test will be");
|
||||
WARN(" skipped");
|
||||
Signal[2] = 1;
|
||||
}
|
||||
Signal[1] = managed;
|
||||
Signal[0] = 1;
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_HMM_OverSubscriptionTst") {
|
||||
hipDeviceProp_t prop;
|
||||
HIP_CHECK(hipGetDeviceProperties(&prop, 0));
|
||||
char *p = nullptr;
|
||||
p = strstr(prop.gcnArchName, "xnack+");
|
||||
if (p == nullptr) {
|
||||
INFO("Skipped due current device is non xnack device.");
|
||||
return;
|
||||
}
|
||||
int HmmEnabled = 0;
|
||||
// The following Shared Mem is to get Max GPU Mem
|
||||
// The size requested is for three ints
|
||||
// 1) To get Max GPU Mem in GB
|
||||
// 2) To Signal parent that req. info is available to consume
|
||||
// 3) To know if MI60 or MI100 gpu are there in the system
|
||||
key_t key = ftok("shmTotMem", 66);
|
||||
int shmid = shmget(key, (3 * sizeof(int)), 0666|IPC_CREAT);
|
||||
int *TotGpuMem = reinterpret_cast<int*>(shmat(shmid, NULL, 0));
|
||||
TotGpuMem[0] = 0; TotGpuMem[1] = 0;
|
||||
// The following function DisplayHmmFlgs() displays the flag values related
|
||||
// to HMM and also sends us ManagedMemory attribute value
|
||||
if (fork() == 0) {
|
||||
DisplayHmmFlgs(TotGpuMem);
|
||||
exit(1);
|
||||
}
|
||||
while (TotGpuMem[0] == 0) {
|
||||
sleep(2);
|
||||
}
|
||||
// The following if block will skip test if either of MI60 or MI100 is found
|
||||
if (TotGpuMem[2] == 1) {
|
||||
SUCCEED("Test is skipped!!");
|
||||
REQUIRE(true);
|
||||
} else {
|
||||
HmmEnabled = TotGpuMem[1];
|
||||
|
||||
// Re-setting the shared memory values for further usage
|
||||
TotGpuMem[0] = 0;
|
||||
TotGpuMem[1] = 0;
|
||||
|
||||
std::list<pid_t> PidLst;
|
||||
// The following function gets the MaxGpu memory in GBs and also launches
|
||||
// OverSubscription test
|
||||
if (HmmEnabled) {
|
||||
if ((setenv("HSA_XNACK", "1", 1)) != 0) {
|
||||
WARN("Unable to turn on HSA_XNACK, hence terminating the Test case!");
|
||||
REQUIRE(false);
|
||||
}
|
||||
if (fork() == 0) {
|
||||
GetTotGpuMem(TotGpuMem);
|
||||
}
|
||||
while (TotGpuMem[1] == 0) {
|
||||
sleep(2);
|
||||
}
|
||||
int NumGB = TotGpuMem[0], TotalThreads = (NumGB + 10);
|
||||
WARN("Launching " << TotalThreads);
|
||||
WARN(" processes to test OverSubscription.");
|
||||
pid_t pid;
|
||||
for (int k = 0; k < TotalThreads; ++k) {
|
||||
pid = fork();
|
||||
PidLst.push_back(pid);
|
||||
if (pid == 0) {
|
||||
OneGBMemTest(0);
|
||||
exit(10);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
|
||||
"attribute. Hence skipping the testing with Pass result.\n");
|
||||
}
|
||||
int status;
|
||||
for (pid_t pd : PidLst) {
|
||||
waitpid(pd, &status, 0);
|
||||
if (!(WIFEXITED(status))) {
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
shmdt(TotGpuMem);
|
||||
shmctl(shmid, IPC_RMID, NULL);
|
||||
}
|
||||
Fai riferimento in un nuovo problema
Block a user