From da360c2aabb93d398b730c11bf9d1ecff5d4267a Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Fri, 21 May 2021 02:31:28 -0700 Subject: [PATCH] SWDEV-277697 - [CatchTest] Fix Documentation, Add test to AMD specific, Add HIP Macros, New Binary for multiproc tests Change-Id: I3783caf85c694b724ed55b778220b8ef9a39f84b --- catch/CMakeLists.txt | 14 +- catch/README.md | 7 +- catch/hipTestMain/CMakeLists.txt | 30 +++- catch/hipTestMain/hip_test_context.cc | 63 +------- catch/hipTestMain/main.cc | 2 +- catch/include/hip_test_checkers.hh | 164 +++++++++++++++++++++ catch/include/hip_test_common.hh | 14 ++ catch/include/hip_test_context.hh | 5 - catch/include/hip_test_kernels.hh | 62 ++++++++ catch/multiproc/CMakeLists.txt | 13 ++ catch/multiproc/childMalloc.cc | 62 ++++++++ catch/multiproc/hipMallocConcurrency.cc | 188 ++++++++++++++++++++++++ catch/unit/deviceLib/CMakeLists.txt | 9 ++ catch/unit/kernels/add.cc | 9 +- catch/unit/rtc/CMakeLists.txt | 18 +-- catch/unit/rtc/saxpy.cc | 67 +++++---- catch/unit/rtc/test.cc | 6 - 17 files changed, 603 insertions(+), 130 deletions(-) create mode 100644 catch/include/hip_test_checkers.hh create mode 100644 catch/include/hip_test_kernels.hh create mode 100644 catch/multiproc/CMakeLists.txt create mode 100644 catch/multiproc/childMalloc.cc create mode 100644 catch/multiproc/hipMallocConcurrency.cc delete mode 100644 catch/unit/rtc/test.cc diff --git a/catch/CMakeLists.txt b/catch/CMakeLists.txt index 53122d59f0..9aa59126be 100644 --- a/catch/CMakeLists.txt +++ b/catch/CMakeLists.txt @@ -44,15 +44,25 @@ include_directories( ${HIP_PATH}/include ${JSON_PARSER} ) + +if(HIP_PLATFORM MATCHES "amd" AND HIP_COMPILER MATCHES "clang") + add_compile_options(-Wall -Wextra -pedantic -Werror) +endif() + cmake_policy(PUSH) if(POLICY CMP0037) cmake_policy(SET CMP0037 OLD) endif() + +# Use clang as host compiler with nvcc +if(HIP_COMPILER MATCHES "nvcc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ccbin clang") +endif() + add_custom_target(build_tests) -add_custom_target(test COMMAND ${CMAKE_CTEST_COMMAND}) -add_dependencies(test build_tests) add_subdirectory(unit) add_subdirectory(hipTestMain) add_subdirectory(stress) +add_subdirectory(multiproc) cmake_policy(POP) diff --git a/catch/README.md b/catch/README.md index 38d507c9c0..9d57c8a59d 100644 --- a/catch/README.md +++ b/catch/README.md @@ -12,7 +12,7 @@ Tests in Catch2 are declared via ```TEST_CASE```. ## Taking care of existing features - Don’t build on platform: EXCLUDE_(HIP_PLATFORM/HIP_RUNTIME), can be done via CMAKE. Adding source in if(HIP_PLATFORM == amd/nvidia). -- HIPCC_OPTIONS/CLANG Options: Can be done via: set_source_files_properties(src.cc PROPERTIES COMPILE_FLAGS “…”). +- HIPCC_OPTIONS/CLANG Options: Can be done via: set_source_files_properties(src.cc PROPERTIES COMPILE_FLAGS “…”). - Additional libraries: Can be done via target_link_libraries() - Multiple runs with different args: This can be done by Catch’s Feature: GENERATE(…) Running Subtest: ctest –R “...” (Regex to match the subtest name) @@ -31,8 +31,6 @@ Some useful functions are: - `bool isLinux()` : true if os is linux - `bool isAmd()` : true if platform is AMD - `bool isNvidia()` : true if platform is NVIDIA -- `std::vector getDevices()` : returns a vector of strings that contains device names (eg: For AMD: gfx906, gfx908 etc / For NVIDIA: RTX 2070 Super) -- `std::vector getTargetId()` : (AMD Only) returns target id for gpus (eg: gfx906:sramecc+:xnack- etc) This information can be accessed in any test via using: `TestContext::get().isAmd()`. @@ -72,5 +70,4 @@ Catch2 allows multiple ways in which you can debug the test case. ## External Libs being used - [Catch2](https://github.com/catchorg/Catch2) - Testing framework -- [taocpp/json](https://github.com/taocpp/json) - For config file parsing -- [taocpp/PEGTL](https://github.com/taocpp/PEGTL) - Helper lib for taojson +- [picojson](https://github.com/kazuho/picojson) - For config file parsing diff --git a/catch/hipTestMain/CMakeLists.txt b/catch/hipTestMain/CMakeLists.txt index e0a7dfb0b6..3895a296f7 100644 --- a/catch/hipTestMain/CMakeLists.txt +++ b/catch/hipTestMain/CMakeLists.txt @@ -1,11 +1,29 @@ if(CMAKE_BUILD_TYPE MATCHES "^Debug$") add_definitions(-DHT_LOG_ENABLE) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTAO_PEGTL_STD_EXPERIMENTAL_FILESYSTEM=1") -add_library(ht_context SHARED EXCLUDE_FROM_ALL hip_test_context.cc) -set_property(TARGET ht_context PROPERTY CXX_STANDARD 17) -target_link_libraries(ht_context PRIVATE stdc++fs) -add_executable(UnitTests EXCLUDE_FROM_ALL main.cc) -target_link_libraries(UnitTests PRIVATE ht_context DeviceLibs MemoryTest Kernels stdc++fs) + +add_executable(UnitTests EXCLUDE_FROM_ALL main.cc hip_test_context.cc) +set_property(TARGET UnitTests PROPERTY CXX_STANDARD 17) + +target_link_libraries(UnitTests PRIVATE DeviceLibs + MemoryTest + Kernels + stdc++fs) + +# Add AMD Only Tests +if(HIP_PLATFORM MATCHES "amd") + target_link_libraries(UnitTests PRIVATE RTC) +endif() + catch_discover_tests(UnitTests PROPERTIES SKIP_REGULAR_EXPRESSION "HIP_SKIP_THIS_TEST") add_dependencies(build_tests UnitTests) + +# Add Multiproc tests as seperate binary +if(UNIX) + add_executable(MultiProcTests EXCLUDE_FROM_ALL main.cc hip_test_context.cc) + set_property(TARGET MultiProcTests PROPERTY CXX_STANDARD 17) + target_link_libraries(MultiProcTests PRIVATE MultiProc + stdc++fs) + catch_discover_tests(MultiProcTests PROPERTIES SKIP_REGULAR_EXPRESSION "HIP_SKIP_THIS_TEST") + add_dependencies(build_tests MultiProcTests) +endif() diff --git a/catch/hipTestMain/hip_test_context.cc b/catch/hipTestMain/hip_test_context.cc index 4fb6b70314..e10dc1a5f6 100644 --- a/catch/hipTestMain/hip_test_context.cc +++ b/catch/hipTestMain/hip_test_context.cc @@ -15,26 +15,6 @@ namespace fs = std::experimental::filesystem; #include -static std::string getValue(std::string option, const std::string& opt_str) { - std::string s_opt = opt_str; - return s_opt.erase(0, option.size()); -} - -static std::string trimName(std::string input, char trim) { - auto pos_ = input.find(trim); - auto res = input; - if (pos_ == std::string::npos) { - input = ""; - } else { - res = input.substr(0, pos_); - input = input.substr(pos_); - } - return res; -} - -const std::vector& TestContext::getDevices() const { return config_.devices; } -const std::vector& TestContext::getTargetId() const { return config_.targetId; } - void TestContext::detectOS() { #if (HT_WIN == 1) p_windows = true; @@ -57,18 +37,21 @@ void TestContext::fillConfig() { (env_config != nullptr) ? env_config : "Not found, using default config"); // Check if path has been provided + std::string def_config_json = "config.json"; std::string config_str; if (env_config != nullptr) { config_str = env_config; } else { - config_str = "config.json"; + config_str = def_config_json; } fs::path config_path = config_str; - if (config_path.has_parent_path()) { + if (config_path.has_parent_path() && config_path.has_filename()) { config_.json_file = config_str; + } else if (config_path.has_parent_path()) { + config_.json_file = config_path / def_config_json; } else { - config_.json_file = exe_path + config_str; + config_.json_file = exe_path + def_config_json; } LogPrintf("Config file path: %s", config_.json_file.c_str()); @@ -79,37 +62,6 @@ void TestContext::fillConfig() { LogPrintf("%s", "Either Config or Os is unknown, this wont end well"); abort(); } - - int deviceCount = 0; - auto res = hipGetDeviceCount(&deviceCount); - if (res != hipSuccess) { - LogPrintf("HIP Device Count query failed with: %s", hipGetErrorString(res)); - abort(); - } - if (deviceCount == 0) { - LogPrintf("%s", "No hip devices found"); - abort(); - } - config_.devices.reserve(deviceCount); - for (int i = 0; i < deviceCount; i++) { - hipDeviceProp_t props; - res = hipGetDeviceProperties(&props, i); - if (res != hipSuccess) { - LogPrintf("HIP Device Count query failed with: %s", hipGetErrorString(res)); - abort(); - } - if (amd) { - std::string tid = std::string(props.gcnArchName); - config_.targetId.push_back(tid); - std::string dev = trimName(tid, ':'); - config_.devices.push_back(dev); - } else if (nvidia) { - config_.devices.push_back(std::string(props.name)); - } else { - LogPrintf("%s", "Unknown platform"); - abort(); - } - } } TestContext::TestContext(int argc, char** argv) { @@ -122,6 +74,7 @@ TestContext::TestContext(int argc, char** argv) { } void TestContext::setExePath(int argc, char** argv) { + if (argc == 0) return; fs::path p = std::string(argv[0]); if (p.has_filename()) p.remove_filename(); exe_path = p.string(); @@ -168,8 +121,6 @@ bool TestContext::parseJsonFile() { picojson::value v; std::string err; - const char* json_end = - picojson::parse(v, json_str.data(), json_str.data() + json_str.size(), &err); if (err.size() > 1) { LogPrintf("Error from PicoJson: %s", err.data()); return false; diff --git a/catch/hipTestMain/main.cc b/catch/hipTestMain/main.cc index c70bf755cf..886aa7a8dc 100644 --- a/catch/hipTestMain/main.cc +++ b/catch/hipTestMain/main.cc @@ -6,7 +6,7 @@ int main(int argc, char** argv) { auto& context = TestContext::get(argc, argv); if (context.skipTest()) { // CTest uses this regex to figure out if the test has been skipped - std::cout << "HIP_SKIP_THIS_TEST" << context.getCurrentTest() << std::endl; + std::cout << "HIP_SKIP_THIS_TEST" << std::endl; return 0; } return Catch::Session().run(argc, argv); diff --git a/catch/include/hip_test_checkers.hh b/catch/include/hip_test_checkers.hh new file mode 100644 index 0000000000..68f0a1e405 --- /dev/null +++ b/catch/include/hip_test_checkers.hh @@ -0,0 +1,164 @@ +#pragma once +#include "hip_test_common.hh" + +namespace HipTest { +template +size_t checkVectors(T* A, T* B, T* Out, size_t N, T (*F)(T a, T b), bool expectMatch = true, + bool reportMismatch = true) { + size_t mismatchCount = 0; + size_t firstMismatch = 0; + size_t mismatchesToPrint = 10; + for (size_t i = 0; i < N; i++) { + T expected = F(A[i], B[i]); + if (Out[i] != expected) { + if (mismatchCount == 0) { + firstMismatch = i; + } + mismatchCount++; + if ((mismatchCount <= mismatchesToPrint) && expectMatch) { + INFO("Mismatch at " << i << " Computed: " << Out[i] << " Expeted: " << expected); + CHECK(false); + } + } + } + + if (reportMismatch) { + if (expectMatch) { + if (mismatchCount) { + INFO(mismatchCount << " Mismatches First Mismatch at index : " << firstMismatch); + REQUIRE(false); + } + } else { + if (mismatchCount == 0) { + INFO("Expected Mismatch but not found any"); + REQUIRE(false); + } + } + } + + return mismatchCount; +} + +template +size_t checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch = true, + bool reportMismatch = true) { + return checkVectors( + A_h, B_h, result_H, N, [](T a, T b) { return a + b; }, expectMatch, reportMismatch); +} + +template +void checkTest(T* expected_H, T* result_H, size_t N, bool expectMatch = true) { + checkVectors( + expected_H, expected_H, result_H, N, + [](T a, T b) { + assert(a == b); + return a; + }, + expectMatch); +} + + +// Setters and Memory Management + +template void setDefaultData(size_t numElements, T* A_h, T* B_h, T* C_h) { + // Initialize the host data: + for (size_t i = 0; i < numElements; i++) { + if (A_h) (A_h)[i] = 3.146f + i; // Pi + if (B_h) (B_h)[i] = 1.618f + i; // Phi + if (C_h) (C_h)[i] = 0.0f + i; + } +} + +template +bool initArraysForHost(T** A_h, T** B_h, T** C_h, size_t N, bool usePinnedHost = false) { + size_t Nbytes = N * sizeof(T); + + if (usePinnedHost) { + if (A_h) { + HIPCHECK(hipHostMalloc((void**)A_h, Nbytes)); + } + if (B_h) { + HIPCHECK(hipHostMalloc((void**)B_h, Nbytes)); + } + if (C_h) { + HIPCHECK(hipHostMalloc((void**)C_h, Nbytes)); + } + } else { + if (A_h) { + *A_h = (T*)malloc(Nbytes); + REQUIRE(*A_h != NULL); + } + + if (B_h) { + *B_h = (T*)malloc(Nbytes); + REQUIRE(*B_h != NULL); + } + + if (C_h) { + *C_h = (T*)malloc(Nbytes); + REQUIRE(*C_h != NULL); + } + } + + setDefaultData(N, A_h ? *A_h : NULL, B_h ? *B_h : NULL, C_h ? *C_h : NULL); + return true; +} + +template +bool initArrays(T** A_d, T** B_d, T** C_d, T** A_h, T** B_h, T** C_h, size_t N, + bool usePinnedHost = false) { + size_t Nbytes = N * sizeof(T); + + if (A_d) { + HIPCHECK(hipMalloc(A_d, Nbytes)); + } + if (B_d) { + HIPCHECK(hipMalloc(B_d, Nbytes)); + } + if (C_d) { + HIPCHECK(hipMalloc(C_d, Nbytes)); + } + + return initArraysForHost(A_h, B_h, C_h, N, usePinnedHost); +} + +template bool freeArraysForHost(T* A_h, T* B_h, T* C_h, bool usePinnedHost) { + if (usePinnedHost) { + if (A_h) { + HIPCHECK(hipHostFree(A_h)); + } + if (B_h) { + HIPCHECK(hipHostFree(B_h)); + } + if (C_h) { + HIPCHECK(hipHostFree(C_h)); + } + } else { + if (A_h) { + free(A_h); + } + if (B_h) { + free(B_h); + } + if (C_h) { + free(C_h); + } + } + return true; +} + +template +bool freeArrays(T* A_d, T* B_d, T* C_d, T* A_h, T* B_h, T* C_h, bool usePinnedHost) { + if (A_d) { + HIPCHECK(hipFree(A_d)); + } + if (B_d) { + HIPCHECK(hipFree(B_d)); + } + if (C_d) { + HIPCHECK(hipFree(C_d)); + } + + return freeArraysForHost(A_h, B_h, C_h, usePinnedHost); +} +} // namespace HipTest diff --git a/catch/include/hip_test_common.hh b/catch/include/hip_test_common.hh index b654445486..a6e07973f9 100644 --- a/catch/include/hip_test_common.hh +++ b/catch/include/hip_test_common.hh @@ -1,2 +1,16 @@ +#pragma once #include "hip_test_context.hh" #include + +#define HIP_PRINT_STATUS(status) INFO(hipGetErrorName(status) << " at line: " << __LINE__); + +#define HIPCHECK(error) \ + { \ + hipError_t localError = error; \ + if ((localError != hipSuccess) && (localError != hipErrorPeerAccessAlreadyEnabled)) { \ + INFO("Error: " << hipGetErrorString(localError) << " Code: " << localError << " Str: " \ + << #error << " In File: " << __FILE__ << " At line: " << __LINE__); \ + REQUIRE(false); \ + } \ + } + diff --git a/catch/include/hip_test_context.hh b/catch/include/hip_test_context.hh index 6b0100fe12..cd81024aa6 100644 --- a/catch/include/hip_test_context.hh +++ b/catch/include/hip_test_context.hh @@ -34,12 +34,9 @@ static int _log_enable = (std::getenv("HT_LOG_ENABLE") ? 1 : 0); } \ } - typedef struct Config_ { std::string json_file; // Json file std::string platform; // amd/nvidia - std::vector devices; // gfx906, etc - std::vector targetId; // Target Ids, only for AMD, gfx906:sramecc+:xnack- std::string os; // windows/linux } Config; @@ -73,8 +70,6 @@ class TestContext { bool isNvidia() const; bool isAmd() const; bool skipTest() const; - const std::vector& getDevices() const; - const std::vector& getTargetId() const; const std::string& getCurrentTest() const { return current_test; } std::string currentPath(); diff --git a/catch/include/hip_test_kernels.hh b/catch/include/hip_test_kernels.hh new file mode 100644 index 0000000000..7196accd97 --- /dev/null +++ b/catch/include/hip_test_kernels.hh @@ -0,0 +1,62 @@ +#pragma once + +#include + +namespace HipTest { +template __global__ void vectorADD(const T* A_d, const T* B_d, T* C_d, size_t NELEM) { + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = offset; i < NELEM; i += stride) { + C_d[i] = A_d[i] + B_d[i]; + } +} + + +template +__global__ void vectorADDReverse(const T* A_d, const T* B_d, T* C_d, size_t NELEM) { + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (int64_t i = NELEM - stride + offset; i >= 0; i -= stride) { + C_d[i] = A_d[i] + B_d[i]; + } +} + + +template __global__ void addCount(const T* A_d, T* C_d, size_t NELEM, int count) { + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i = 0; i < count; i++) { + for (size_t i = offset; i < NELEM; i += stride) { + C_d[i] = A_d[i] + (T)count; + } + } +} + + +template +__global__ void addCountReverse(const T* A_d, T* C_d, int64_t NELEM, int count) { + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i = 0; i < count; i++) { + for (int64_t i = NELEM - stride + offset; i >= 0; i -= stride) { + C_d[i] = A_d[i] + (T)count; + } + } +} + + +template __global__ void memsetReverse(T* C_d, T val, int64_t NELEM) { + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (int64_t i = NELEM - stride + offset; i >= 0; i -= stride) { + C_d[i] = val; + } +} +} // namespace HipTest \ No newline at end of file diff --git a/catch/multiproc/CMakeLists.txt b/catch/multiproc/CMakeLists.txt new file mode 100644 index 0000000000..a782262380 --- /dev/null +++ b/catch/multiproc/CMakeLists.txt @@ -0,0 +1,13 @@ +# AMD Tests +set(LINUX_TEST_SRC + hipMallocConcurrency.cc + childMalloc.cc +) + +if(UNIX) + # Create shared lib of all tests + add_library(MultiProc SHARED EXCLUDE_FROM_ALL ${LINUX_TEST_SRC}) + + # Add dependency on build_tests to build it on this custom target + add_dependencies(build_tests MultiProc) +endif() diff --git a/catch/multiproc/childMalloc.cc b/catch/multiproc/childMalloc.cc new file mode 100644 index 0000000000..858fd0878e --- /dev/null +++ b/catch/multiproc/childMalloc.cc @@ -0,0 +1,62 @@ +#include +#include +#include + +#ifdef __linux__ +#include +#include +#include +#include +#endif + + +bool testMallocFromChild() { + int fd[2]; + pid_t childpid; + bool testResult = false; + + // create pipe descriptors + pipe(fd); + + childpid = fork(); + if (childpid > 0) { // Parent + close(fd[1]); + // parent will wait to read the device cnt + read(fd[0], &testResult, sizeof(testResult)); + + // close the read-descriptor + close(fd[0]); + + // wait for child exit + wait(NULL); + + return testResult; + + } else if (!childpid) { // Child + // writing only, no need for read-descriptor + close(fd[0]); + + char* A_d = nullptr; + hipError_t ret = hipMalloc(&A_d, 1024); + + printf("hipMalloc returned : %s\n", hipGetErrorString(ret)); + if (ret == hipSuccess) + testResult = true; + else + testResult = false; + + // send the value on the write-descriptor: + write(fd[1], &testResult, sizeof(testResult)); + + // close the write descriptor: + close(fd[1]); + exit(0); + } + return false; +} + + +TEST_CASE("ChildMalloc") { + auto res = testMallocFromChild(); + REQUIRE(res == true); +} diff --git a/catch/multiproc/hipMallocConcurrency.cc b/catch/multiproc/hipMallocConcurrency.cc new file mode 100644 index 0000000000..72d17c26a0 --- /dev/null +++ b/catch/multiproc/hipMallocConcurrency.cc @@ -0,0 +1,188 @@ +#include +#include +#include +#ifdef __linux__ +#include +#include +#endif +#include +#include +#include +#include + + +#include + +size_t N = 4 * 1024 * 1024; +unsigned blocksPerCU = 6; // to hide latency +unsigned threadsPerBlock = 256; + + +unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N) { + int device; + HIPCHECK(hipGetDevice(&device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + + unsigned blocks = props.multiProcessorCount * blocksPerCU; + if (blocks * threadsPerBlock > N) { + blocks = (N + threadsPerBlock - 1) / threadsPerBlock; + } + + return blocks; +} + + +/** + * Validates data consitency on supplied gpu + */ +bool validateMemoryOnGPU(int gpu, bool concurOnOneGPU = false) { + size_t Nbytes = N * sizeof(int); + int *A_d, *B_d, *C_d; + int *A_h, *B_h, *C_h; + size_t prevAvl, prevTot, curAvl, curTot; + bool TestPassed = true; + + HIPCHECK(hipSetDevice(gpu)); + HIPCHECK(hipMemGetInfo(&prevAvl, &prevTot)); + printf("tgs allocating..\n"); + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + unsigned blocks = setNumBlocks(blocksPerCU, threadsPerBlock, N); + + HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, + static_cast(A_d), static_cast(B_d), C_d, N); + + HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + if (!HipTest::checkVectorADD(A_h, B_h, C_h, N)) { + printf("Validation PASSED for gpu %d from pid %d\n", gpu, getpid()); + } else { + printf("%s : Validation FAILED for gpu %d from pid %d\n", __func__, gpu, getpid()); + TestPassed &= false; + } + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIPCHECK(hipMemGetInfo(&curAvl, &curTot)); + + if (!concurOnOneGPU && (prevAvl != curAvl || prevTot != curTot)) { + // In concurrent calls on one GPU, we cannot verify leaking in this way + printf( + "%s : Memory allocation mismatch observed." + "Possible memory leak.\n", + __func__); + TestPassed &= false; + } + + return TestPassed; +} + + +#if 1 +/** + * Fetches Gpu device count + */ +void getDeviceCount1(int* pdevCnt) { +#ifdef __linux__ + int fd[2], val = 0; + pid_t childpid; + + // create pipe descriptors + pipe(fd); + + // disable visible_devices env from shell + unsetenv("ROCR_VISIBLE_DEVICES"); + unsetenv("HIP_VISIBLE_DEVICES"); + + childpid = fork(); + + if (childpid > 0) { // Parent + close(fd[1]); + // parent will wait to read the device cnt + read(fd[0], &val, sizeof(val)); + + // close the read-descriptor + close(fd[0]); + + // wait for child exit + wait(NULL); + + *pdevCnt = val; + } else if (!childpid) { // Child + int devCnt = 1; + // writing only, no need for read-descriptor + close(fd[0]); + + HIPCHECK(hipGetDeviceCount(&devCnt)); + // send the value on the write-descriptor: + write(fd[1], &devCnt, sizeof(devCnt)); + + // close the write descriptor: + close(fd[1]); + exit(0); + } else { // failure + *pdevCnt = 1; + return; + } + +#else + HIPCHECK(hipGetDeviceCount(pdevCnt)); +#endif +} +#endif + + +TEST_CASE("hipMallocChild_Concurrency_MultiGpu") { + bool TestPassed = false; +#ifdef __linux__ + // Parallel execution on multiple gpus from different child processes + int devCnt = 1, pid = 0; + + // Get GPU count + getDeviceCount1(&devCnt); + + // Spawn child for each GPU + for (int gpu = 0; gpu < devCnt; gpu++) { + if ((pid = fork()) < 0) { + INFO("Child_Concurrency_MultiGpu : fork() returned error" << pid); + REQUIRE(false); + + } else if (!pid) { // Child process + bool TestPassedChild = false; + TestPassedChild = validateMemoryOnGPU(gpu); + + if (TestPassedChild) { + printf("returning exit(1) for success\n"); + exit(1); // child exit with success status + } else { + printf("Child_Concurrency_MultiGpu : childpid %d failed\n", getpid()); + exit(2); // child exit with failure status + } + } + } + + // Parent shall wait for child to complete + int cnt = 0; + + for (int i = 0; i < devCnt; i++) { + int pidwait = 0, exitStatus; + pidwait = wait(&exitStatus); + + printf("exitStatus for iter:%d is %d\n", i, exitStatus); + if (pidwait < 0) { + break; + } + + if (WEXITSTATUS(exitStatus) == 1) cnt++; + } + + if (cnt && (cnt == devCnt)) TestPassed = true; + +#else + INFO("Test hipMallocChild_Concurrency_MultiGpu skipped on non-linux"); +#endif + REQUIRE(TestPassed == true); +} diff --git a/catch/unit/deviceLib/CMakeLists.txt b/catch/unit/deviceLib/CMakeLists.txt index 421261b518..22de79c687 100644 --- a/catch/unit/deviceLib/CMakeLists.txt +++ b/catch/unit/deviceLib/CMakeLists.txt @@ -1,9 +1,18 @@ # Common Tests - Test independent of all platforms set(TEST_SRC floatMath.cc +) + +# AMD only tests +set(AMD_TEST_SRC vectorTypesDevice.cc ) +if(HIP_PLATFORM MATCHES "amd") + set(TEST_SRC ${TEST_SRC} ${AMD_TEST_SRC}) +endif() + + # Create shared lib of all tests add_library(DeviceLibs SHARED EXCLUDE_FROM_ALL ${TEST_SRC}) diff --git a/catch/unit/kernels/add.cc b/catch/unit/kernels/add.cc index 7adfde51d2..4f70ffef77 100644 --- a/catch/unit/kernels/add.cc +++ b/catch/unit/kernels/add.cc @@ -2,12 +2,11 @@ #include template __global__ void add(T* a, T* b, T* c, size_t size) { - int i = threadIdx.x; - c[i] = a[i] + b[i]; + size_t i = threadIdx.x; + if (i < size) c[i] = a[i] + b[i]; } TEMPLATE_TEST_CASE("Add Kernel", "[kernel][add]", int, long, float, long long, double) { - auto addKernel = add; auto size = GENERATE(as{}, 100, 500, 1000); TestType *d_a, *d_b, *d_c; auto res = hipMalloc(&d_a, sizeof(TestType) * size); @@ -18,7 +17,7 @@ TEMPLATE_TEST_CASE("Add Kernel", "[kernel][add]", int, long, float, long long, d REQUIRE(res == hipSuccess); std::vector a, b, c; - for (int i = 0; i < size; i++) { + for (size_t i = 0; i < size; i++) { a.push_back(i + 1); b.push_back(i + 1); c.push_back(2 * (i + 1)); @@ -29,7 +28,7 @@ TEMPLATE_TEST_CASE("Add Kernel", "[kernel][add]", int, long, float, long long, d res = hipMemcpy(d_b, b.data(), sizeof(TestType) * size, hipMemcpyHostToDevice); REQUIRE(res == hipSuccess); - hipLaunchKernelGGL(addKernel, 1, size, 0, 0, d_a, d_b, d_c, size); + hipLaunchKernelGGL(add, 1, size, 0, 0, d_a, d_b, d_c, size); res = hipMemcpy(a.data(), d_c, sizeof(TestType) * size, hipMemcpyDeviceToHost); REQUIRE(res == hipSuccess); diff --git a/catch/unit/rtc/CMakeLists.txt b/catch/unit/rtc/CMakeLists.txt index 435d372fbc..062e4153c6 100644 --- a/catch/unit/rtc/CMakeLists.txt +++ b/catch/unit/rtc/CMakeLists.txt @@ -1,14 +1,12 @@ -# Common Tests - Test independent of all platforms -set(TEST_SRC +# AMD Tests +set(AMD_TEST_SRC saxpy.cc ) -# Set source File properties -set_source_files_properties(saxpy.cc PROPERTIES COMPILE_FLAGS " -std=c++14 ") -set_source_files_properties(test.cc PROPERTIES COMPILE_FLAGS " -std=c++17 ") +if(HIP_PLATFORM MATCHES "amd") + # Create shared lib of all tests + add_library(RTC SHARED EXCLUDE_FROM_ALL ${AMD_TEST_SRC}) -# Create shared lib of all tests -add_library(RTC SHARED EXCLUDE_FROM_ALL ${TEST_SRC}) - -# Add dependency on build_tests to build it on this custom target -add_dependencies(build_tests RTC) + # Add dependency on build_tests to build it on this custom target + add_dependencies(build_tests RTC) +endif() diff --git a/catch/unit/rtc/saxpy.cc b/catch/unit/rtc/saxpy.cc index 186349ae00..af7ca24a2f 100644 --- a/catch/unit/rtc/saxpy.cc +++ b/catch/unit/rtc/saxpy.cc @@ -15,7 +15,7 @@ static constexpr auto NUM_THREADS{128}; static constexpr auto NUM_BLOCKS{32}; static constexpr auto saxpy{ - R"( +R"( #include extern "C" __global__ @@ -23,8 +23,7 @@ void saxpy(float a, float* x, float* y, float* out, size_t n) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { - out[tid] = a * x[tid] + y[tid] ; - + out[tid] = a * x[tid] + y[tid]; } } @@ -72,42 +71,42 @@ TEST_CASE("saxpy", "[hiprtc][saxpy]") { unique_ptr hX{new float[n]}; unique_ptr hY{new float[n]}; unique_ptr hOut{new float[n]}; -for (size_t i = 0; i < n; ++i) { - hX[i] = static_cast(i); - hY[i] = static_cast(i * 2); - } + for (size_t i = 0; i < n; ++i) { + hX[i] = static_cast(i); + hY[i] = static_cast(i * 2); + } - hipDeviceptr_t dX, dY, dOut; - hipMalloc(&dX, bufferSize); - hipMalloc(&dY, bufferSize); - hipMalloc(&dOut, bufferSize); - hipMemcpyHtoD(dX, hX.get(), bufferSize); - hipMemcpyHtoD(dY, hY.get(), bufferSize); + hipDeviceptr_t dX, dY, dOut; + hipMalloc(&dX, bufferSize); + hipMalloc(&dY, bufferSize); + hipMalloc(&dOut, bufferSize); + hipMemcpyHtoD(dX, hX.get(), bufferSize); + hipMemcpyHtoD(dY, hY.get(), bufferSize); - struct { - float a_; - hipDeviceptr_t b_; - hipDeviceptr_t c_; - hipDeviceptr_t d_; - size_t e_; - } args{a, dX, dY, dOut, n}; + struct { + float a_; + hipDeviceptr_t b_; + hipDeviceptr_t c_; + hipDeviceptr_t d_; + size_t e_; + } args{a, dX, dY, dOut, n}; - auto size = sizeof(args); - void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, - HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, - HIP_LAUNCH_PARAM_END}; + auto size = sizeof(args); + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END}; - hipModuleLaunchKernel(kernel, NUM_BLOCKS, 1, 1, NUM_THREADS, 1, 1, - 0, nullptr, nullptr, config); - hipMemcpyDtoH(hOut.get(), dOut, bufferSize); + hipModuleLaunchKernel(kernel, NUM_BLOCKS, 1, 1, NUM_THREADS, 1, 1, 0, nullptr, nullptr, config); + hipMemcpyDtoH(hOut.get(), dOut, bufferSize); - for (size_t i = 0; i < n; ++i) { - REQUIRE(fabs(a * hX[i] + hY[i] - hOut[i]) > fabs(hOut[i]) * 1e-6); - } + for (size_t i = 0; i < n; ++i) { + INFO("For " << i << " Value: " << fabs(a * hX[i] + hY[i] - hOut[i]) + << " with: " << (fabs(hOut[i] * 1.0f) * 1e-6)); + REQUIRE(fabs(a * hX[i] + hY[i] - hOut[i]) <= fabs(hOut[i]) * 1e-6); + } - hipFree(dX); - hipFree(dY); - hipFree(dOut); + hipFree(dX); + hipFree(dY); + hipFree(dOut); - hipModuleUnload(module); + hipModuleUnload(module); } diff --git a/catch/unit/rtc/test.cc b/catch/unit/rtc/test.cc deleted file mode 100644 index 3b12610458..0000000000 --- a/catch/unit/rtc/test.cc +++ /dev/null @@ -1,6 +0,0 @@ -#include - -TEST_CASE("cpp17 test") { - constexpr auto l = []() { return 2 * 10 * 30; }; - REQUIRE(l() == 600); -}