EXSWHTEC-74 - Implement tests for hipMemcpy and derivatives (#15)
- Basic positive tests - Negative parameter tests
Cette révision appartient à :
@@ -0,0 +1,316 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <utils.hh>
|
||||
#include <resource_guards.hh>
|
||||
|
||||
static inline unsigned int GenerateLinearAllocationFlagCombinations(
|
||||
const LinearAllocs allocation_type) {
|
||||
switch (allocation_type) {
|
||||
case LinearAllocs::hipHostMalloc:
|
||||
return GENERATE(hipHostMallocDefault, hipHostMallocPortable, hipHostMallocMapped,
|
||||
hipHostMallocWriteCombined);
|
||||
case LinearAllocs::mallocAndRegister:
|
||||
case LinearAllocs::hipMallocManaged:
|
||||
case LinearAllocs::malloc:
|
||||
case LinearAllocs::hipMalloc:
|
||||
return 0u;
|
||||
default:
|
||||
assert("Invalid LinearAllocs enumerator");
|
||||
throw std::invalid_argument("Invalid LinearAllocs enumerator");
|
||||
}
|
||||
}
|
||||
|
||||
template <bool should_synchronize, typename F>
|
||||
void MemcpyDeviceToHostShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
|
||||
using LA = LinearAllocs;
|
||||
const auto allocation_size = GENERATE(kPageSize / 2, kPageSize, kPageSize * 2);
|
||||
const auto host_allocation_type = GENERATE(LA::malloc, LA::hipHostMalloc);
|
||||
const auto host_allocation_flags = GenerateLinearAllocationFlagCombinations(host_allocation_type);
|
||||
|
||||
LinearAllocGuard<int> host_allocation(host_allocation_type, allocation_size,
|
||||
host_allocation_flags);
|
||||
LinearAllocGuard<int> device_allocation(LA::hipMalloc, allocation_size);
|
||||
|
||||
const auto element_count = allocation_size / sizeof(*device_allocation.ptr());
|
||||
constexpr auto thread_count = 1024;
|
||||
const auto block_count = element_count / thread_count + 1;
|
||||
constexpr int expected_value = 42;
|
||||
VectorSet<<<block_count, thread_count, 0, kernel_stream>>>(device_allocation.ptr(),
|
||||
expected_value, element_count);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
|
||||
HIP_CHECK(memcpy_func(host_allocation.host_ptr(), device_allocation.ptr(), allocation_size));
|
||||
if constexpr (should_synchronize) {
|
||||
HIP_CHECK(hipStreamSynchronize(kernel_stream));
|
||||
}
|
||||
|
||||
ArrayFindIfNot(host_allocation.host_ptr(), expected_value, element_count);
|
||||
}
|
||||
|
||||
template <bool should_synchronize, typename F>
|
||||
void MemcpyHostToDeviceShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
|
||||
using LA = LinearAllocs;
|
||||
const auto allocation_size = GENERATE(kPageSize / 2, kPageSize, kPageSize * 2);
|
||||
const auto host_allocation_type = GENERATE(LA::malloc, LA::hipHostMalloc);
|
||||
const auto host_allocation_flags = GenerateLinearAllocationFlagCombinations(host_allocation_type);
|
||||
|
||||
LinearAllocGuard<int> src_host_allocation(host_allocation_type, allocation_size,
|
||||
host_allocation_flags);
|
||||
LinearAllocGuard<int> dst_host_allocation(LA::hipHostMalloc, allocation_size);
|
||||
LinearAllocGuard<int> device_allocation(LA::hipMalloc, allocation_size);
|
||||
|
||||
const auto element_count = allocation_size / sizeof(*device_allocation.ptr());
|
||||
constexpr int fill_value = 42;
|
||||
std::fill_n(src_host_allocation.host_ptr(), element_count, fill_value);
|
||||
std::fill_n(dst_host_allocation.host_ptr(), element_count, 0);
|
||||
|
||||
HIP_CHECK(memcpy_func(device_allocation.ptr(), src_host_allocation.host_ptr(), allocation_size));
|
||||
if constexpr (should_synchronize) {
|
||||
HIP_CHECK(hipStreamSynchronize(kernel_stream));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpy(dst_host_allocation.host_ptr(), device_allocation.ptr(), allocation_size,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
ArrayFindIfNot(dst_host_allocation.host_ptr(), fill_value, element_count);
|
||||
}
|
||||
|
||||
template <bool should_synchronize, typename F>
|
||||
void MemcpyHostToHostShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
|
||||
using LA = LinearAllocs;
|
||||
const auto allocation_size = GENERATE(kPageSize / 2, kPageSize, kPageSize * 2);
|
||||
const auto src_allocation_type = GENERATE(LA::malloc, LA::hipHostMalloc);
|
||||
const auto dst_allocation_type = GENERATE(LA::malloc, LA::hipHostMalloc);
|
||||
const auto src_allocation_flags = GenerateLinearAllocationFlagCombinations(src_allocation_type);
|
||||
const auto dst_allocation_flags = GenerateLinearAllocationFlagCombinations(dst_allocation_type);
|
||||
|
||||
LinearAllocGuard<int> src_allocation(src_allocation_type, allocation_size, src_allocation_flags);
|
||||
LinearAllocGuard<int> dst_allocation(dst_allocation_type, allocation_size, dst_allocation_flags);
|
||||
|
||||
const auto element_count = allocation_size / sizeof(*src_allocation.host_ptr());
|
||||
constexpr auto expected_value = 42;
|
||||
std::fill_n(src_allocation.host_ptr(), element_count, expected_value);
|
||||
|
||||
HIP_CHECK(memcpy_func(dst_allocation.host_ptr(), src_allocation.host_ptr(), allocation_size));
|
||||
if constexpr (should_synchronize) {
|
||||
HIP_CHECK(hipStreamSynchronize(kernel_stream));
|
||||
}
|
||||
|
||||
ArrayFindIfNot(dst_allocation.host_ptr(), expected_value, element_count);
|
||||
}
|
||||
|
||||
template <bool should_synchronize, bool enable_peer_access, typename F>
|
||||
void MemcpyDeviceToDeviceShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
|
||||
const auto allocation_size = GENERATE(kPageSize / 2, kPageSize, kPageSize * 2);
|
||||
const auto device_count = HipTest::getDeviceCount();
|
||||
const auto src_device = GENERATE_COPY(range(0, device_count));
|
||||
const auto dst_device = GENERATE_COPY(range(0, device_count));
|
||||
INFO("Src device: " << src_device << ", Dst device: " << dst_device);
|
||||
|
||||
HIP_CHECK(hipSetDevice(src_device));
|
||||
if constexpr (enable_peer_access) {
|
||||
if (src_device == dst_device) {
|
||||
return;
|
||||
}
|
||||
int can_access_peer = 0;
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&can_access_peer, src_device, dst_device));
|
||||
if (!can_access_peer) {
|
||||
INFO("Peer access cannot be enabled between devices " << src_device << " " << dst_device);
|
||||
REQUIRE(can_access_peer);
|
||||
}
|
||||
HIP_CHECK(hipDeviceEnablePeerAccess(dst_device, 0));
|
||||
}
|
||||
|
||||
LinearAllocGuard<int> src_allocation(LinearAllocs::hipMalloc, allocation_size);
|
||||
LinearAllocGuard<int> result(LinearAllocs::hipHostMalloc, allocation_size, hipHostMallocPortable);
|
||||
HIP_CHECK(hipSetDevice(dst_device));
|
||||
LinearAllocGuard<int> dst_allocation(LinearAllocs::hipMalloc, allocation_size);
|
||||
|
||||
const auto element_count = allocation_size / sizeof(*src_allocation.ptr());
|
||||
constexpr auto thread_count = 1024;
|
||||
const auto block_count = element_count / thread_count + 1;
|
||||
constexpr int expected_value = 42;
|
||||
HIP_CHECK(hipSetDevice(src_device));
|
||||
VectorSet<<<block_count, thread_count, 0, kernel_stream>>>(src_allocation.ptr(), expected_value,
|
||||
element_count);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
|
||||
HIP_CHECK(memcpy_func(dst_allocation.ptr(), src_allocation.ptr(), allocation_size));
|
||||
if constexpr (should_synchronize) {
|
||||
HIP_CHECK(hipStreamSynchronize(kernel_stream));
|
||||
}
|
||||
|
||||
HIP_CHECK(
|
||||
hipMemcpy(result.host_ptr(), dst_allocation.ptr(), allocation_size, hipMemcpyDeviceToHost));
|
||||
if constexpr (enable_peer_access) {
|
||||
// If we've gotten this far, EnablePeerAccess must have succeeded, so we only need to check this
|
||||
// condition
|
||||
HIP_CHECK(hipDeviceDisablePeerAccess(dst_device));
|
||||
}
|
||||
|
||||
ArrayFindIfNot(result.host_ptr(), expected_value, element_count);
|
||||
}
|
||||
|
||||
template <bool should_synchronize, typename F> void MemcpyWithDirectionCommonTests(F memcpy_func) {
|
||||
using namespace std::placeholders;
|
||||
SECTION("Device to host") {
|
||||
MemcpyDeviceToHostShell<should_synchronize>(
|
||||
std::bind(memcpy_func, _1, _2, _3, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
SECTION("Device to host with default kind") {
|
||||
MemcpyDeviceToHostShell<should_synchronize>(
|
||||
std::bind(memcpy_func, _1, _2, _3, hipMemcpyDefault));
|
||||
}
|
||||
|
||||
SECTION("Host to device") {
|
||||
MemcpyHostToDeviceShell<should_synchronize>(
|
||||
std::bind(memcpy_func, _1, _2, _3, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
SECTION("Host to device with default kind") {
|
||||
MemcpyHostToDeviceShell<should_synchronize>(
|
||||
std::bind(memcpy_func, _1, _2, _3, hipMemcpyDefault));
|
||||
}
|
||||
|
||||
SECTION("Host to host") {
|
||||
MemcpyHostToHostShell<should_synchronize>(
|
||||
std::bind(memcpy_func, _1, _2, _3, hipMemcpyHostToHost));
|
||||
}
|
||||
|
||||
SECTION("Host to host with default kind") {
|
||||
MemcpyHostToHostShell<should_synchronize>(std::bind(memcpy_func, _1, _2, _3, hipMemcpyDefault));
|
||||
}
|
||||
|
||||
SECTION("Device to device") {
|
||||
SECTION("Peer access enabled") {
|
||||
MemcpyDeviceToDeviceShell<should_synchronize, true>(
|
||||
std::bind(memcpy_func, _1, _2, _3, hipMemcpyDeviceToDevice));
|
||||
}
|
||||
SECTION("Peer access disabled") {
|
||||
MemcpyDeviceToDeviceShell<should_synchronize, false>(
|
||||
std::bind(memcpy_func, _1, _2, _3, hipMemcpyDeviceToDevice));
|
||||
}
|
||||
}
|
||||
|
||||
SECTION("Device to device with default kind") {
|
||||
SECTION("Peer access enabled") {
|
||||
MemcpyDeviceToDeviceShell<should_synchronize, true>(
|
||||
std::bind(memcpy_func, _1, _2, _3, hipMemcpyDefault));
|
||||
}
|
||||
SECTION("Peer access disabled") {
|
||||
MemcpyDeviceToDeviceShell<should_synchronize, false>(
|
||||
std::bind(memcpy_func, _1, _2, _3, hipMemcpyDefault));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Synchronization behavior checks
|
||||
template <typename F>
|
||||
void MemcpySyncBehaviorCheck(F memcpy_func, const bool should_sync,
|
||||
const hipStream_t kernel_stream) {
|
||||
LaunchDelayKernel(std::chrono::milliseconds{100}, kernel_stream);
|
||||
HIP_CHECK(memcpy_func());
|
||||
if (should_sync) {
|
||||
HIP_CHECK(hipStreamQuery(kernel_stream));
|
||||
} else {
|
||||
HIP_CHECK_ERROR(hipStreamQuery(kernel_stream), hipErrorNotReady);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void MemcpyHtoDSyncBehavior(F memcpy_func, const bool should_sync,
|
||||
const hipStream_t kernel_stream = nullptr) {
|
||||
using LA = LinearAllocs;
|
||||
const auto host_alloc_type = GENERATE(LA::malloc, LA::hipHostMalloc);
|
||||
LinearAllocGuard<int> host_alloc(host_alloc_type, kPageSize);
|
||||
LinearAllocGuard<int> device_alloc(LA::hipMalloc, kPageSize);
|
||||
MemcpySyncBehaviorCheck(std::bind(memcpy_func, device_alloc.ptr(), host_alloc.ptr(), kPageSize),
|
||||
should_sync, kernel_stream);
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void MemcpyDtoHPageableSyncBehavior(F memcpy_func, const bool should_sync,
|
||||
const hipStream_t kernel_stream = nullptr) {
|
||||
LinearAllocGuard<int> host_alloc(LinearAllocs::malloc, kPageSize);
|
||||
LinearAllocGuard<int> device_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
MemcpySyncBehaviorCheck(std::bind(memcpy_func, host_alloc.ptr(), device_alloc.ptr(), kPageSize),
|
||||
should_sync, kernel_stream);
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void MemcpyDtoHPinnedSyncBehavior(F memcpy_func, const bool should_sync,
|
||||
const hipStream_t kernel_stream = nullptr) {
|
||||
LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
LinearAllocGuard<int> device_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
MemcpySyncBehaviorCheck(std::bind(memcpy_func, host_alloc.ptr(), device_alloc.ptr(), kPageSize),
|
||||
should_sync, kernel_stream);
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void MemcpyDtoDSyncBehavior(F memcpy_func, const bool should_sync,
|
||||
const hipStream_t kernel_stream = nullptr) {
|
||||
LinearAllocGuard<int> src_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
LinearAllocGuard<int> dst_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
MemcpySyncBehaviorCheck(std::bind(memcpy_func, dst_alloc.ptr(), src_alloc.ptr(), kPageSize),
|
||||
should_sync, kernel_stream);
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void MemcpyHtoHSyncBehavior(F memcpy_func, const bool should_sync,
|
||||
const hipStream_t kernel_stream = nullptr) {
|
||||
using LA = LinearAllocs;
|
||||
const auto src_alloc_type = GENERATE(LA::malloc, LA::hipHostMalloc);
|
||||
const auto dst_alloc_type = GENERATE(LA::malloc, LA::hipHostMalloc);
|
||||
|
||||
LinearAllocGuard<int> src_alloc(src_alloc_type, kPageSize);
|
||||
LinearAllocGuard<int> dst_alloc(dst_alloc_type, kPageSize);
|
||||
MemcpySyncBehaviorCheck(std::bind(memcpy_func, dst_alloc.ptr(), src_alloc.ptr(), kPageSize),
|
||||
should_sync, kernel_stream);
|
||||
}
|
||||
|
||||
// Common negative tests
|
||||
template <typename F> void MemcpyCommonNegativeTests(F f, void* dst, void* src, size_t count) {
|
||||
SECTION("dst == nullptr") { HIP_CHECK_ERROR(f(nullptr, src, count), hipErrorInvalidValue); }
|
||||
SECTION("src == nullptr") { HIP_CHECK_ERROR(f(dst, nullptr, count), hipErrorInvalidValue); }
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void MemcpyWithDirectionCommonNegativeTests(F f, void* dst, void* src, size_t count,
|
||||
hipMemcpyKind kind) {
|
||||
using namespace std::placeholders;
|
||||
MemcpyCommonNegativeTests(std::bind(f, _1, _2, _3, kind), dst, src, count);
|
||||
|
||||
// Disabled on AMD due to defect - EXSWHTEC-128
|
||||
#if HT_NVIDIA
|
||||
SECTION("Invalid MemcpyKind") {
|
||||
HIP_CHECK_ERROR(f(dst, src, count, static_cast<hipMemcpyKind>(-1)),
|
||||
hipErrorInvalidMemcpyDirection);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -65,6 +65,7 @@ set(TEST_SRC
|
||||
hipMemPoolApi.cc
|
||||
hipMemcpyPeer.cc
|
||||
hipMemcpyPeerAsync.cc
|
||||
hipMemcpyWithStream_old.cc
|
||||
hipMemcpyWithStream.cc
|
||||
hipMemcpyWithStreamMultiThread.cc
|
||||
hipMemsetAsyncAndKernel.cc
|
||||
@@ -74,7 +75,9 @@ set(TEST_SRC
|
||||
hipMemcpyDtoD.cc
|
||||
hipMemcpyDtoDAsync.cc
|
||||
hipHostMalloc.cc
|
||||
hipMemcpy_old.cc
|
||||
hipMemcpy.cc
|
||||
hipMemcpy_derivatives.cc
|
||||
hipMemcpyAsync.cc
|
||||
hipMemsetFunctional.cc
|
||||
hipMallocPitch.cc
|
||||
@@ -142,6 +145,7 @@ set(TEST_SRC
|
||||
hipMemPoolApi.cc
|
||||
hipMemcpyPeer.cc
|
||||
hipMemcpyPeerAsync.cc
|
||||
hipMemcpyWithStream_old.cc
|
||||
hipMemcpyWithStream.cc
|
||||
hipMemcpyWithStreamMultiThread.cc
|
||||
hipMemsetAsyncAndKernel.cc
|
||||
@@ -151,7 +155,9 @@ set(TEST_SRC
|
||||
hipMemcpyDtoD.cc
|
||||
hipMemcpyDtoDAsync.cc
|
||||
hipHostMalloc.cc
|
||||
hipMemcpy_old.cc
|
||||
hipMemcpy.cc
|
||||
hipMemcpy_derivatives.cc
|
||||
hipMemcpyAsync.cc
|
||||
hipMemsetFunctional.cc
|
||||
hipMallocPitch.cc
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
@@ -17,603 +19,83 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
This testcase verifies following scenarios
|
||||
1. hipMemcpy API along with kernel launch with different data types
|
||||
2. H2D-D2D-D2H scenarios for unpinned and pinned memory
|
||||
3. Boundary checks with different sizes
|
||||
4. Multithread scenario
|
||||
5. device offset scenario
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <memcpy1d_tests_common.hh>
|
||||
#include <resource_guards.hh>
|
||||
#include <utils.hh>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include "sys/types.h"
|
||||
#include "sys/sysinfo.h"
|
||||
#endif
|
||||
|
||||
|
||||
static constexpr auto NUM_ELM{4*1024 * 1024};
|
||||
static unsigned blocksPerCU{6}; // to hide latency
|
||||
static unsigned threadsPerBlock{256};
|
||||
|
||||
template<typename T>
|
||||
class DeviceMemory {
|
||||
public:
|
||||
explicit DeviceMemory(size_t numElements);
|
||||
DeviceMemory() = delete;
|
||||
~DeviceMemory();
|
||||
T* A_d() const { return _A_d + _offset; }
|
||||
T* B_d() const { return _B_d + _offset; }
|
||||
T* C_d() const { return _C_d + _offset; }
|
||||
T* C_dd() const { return _C_dd + _offset; }
|
||||
size_t maxNumElements() const { return _maxNumElements; }
|
||||
void offset(int offset) { _offset = offset; }
|
||||
int offset() const { return _offset; }
|
||||
private:
|
||||
T* _A_d;
|
||||
T* _B_d;
|
||||
T* _C_d;
|
||||
T* _C_dd;
|
||||
size_t _maxNumElements;
|
||||
int _offset;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
DeviceMemory<T>::DeviceMemory(size_t numElements) :
|
||||
_maxNumElements(numElements), _offset(0) {
|
||||
T** np = nullptr;
|
||||
HipTest::initArrays(&_A_d, &_B_d, &_C_d, np, np, np, numElements, 0);
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
HIP_CHECK(hipMalloc(&_C_dd, sizeElements));
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
DeviceMemory<T>::~DeviceMemory() {
|
||||
T* np = nullptr;
|
||||
HipTest::freeArrays<T>(_A_d, _B_d, _C_d, np, np, np, 0);
|
||||
HIP_CHECK(hipFree(_C_dd));
|
||||
_C_dd = NULL;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
class HostMemory {
|
||||
public:
|
||||
HostMemory(size_t numElements, bool usePinnedHost);
|
||||
HostMemory() = delete;
|
||||
void reset(size_t numElements, bool full = false);
|
||||
~HostMemory();
|
||||
T* A_h() const { return _A_h + _offset; }
|
||||
T* B_h() const { return _B_h + _offset; }
|
||||
T* C_h() const { return _C_h + _offset; }
|
||||
|
||||
size_t maxNumElements() const { return _maxNumElements; }
|
||||
void offset(int offset) { _offset = offset; }
|
||||
int offset() const { return _offset; }
|
||||
|
||||
// Host arrays, secondary copy
|
||||
T* A_hh;
|
||||
T* B_hh;
|
||||
bool _usePinnedHost;
|
||||
|
||||
private:
|
||||
size_t _maxNumElements;
|
||||
int _offset;
|
||||
|
||||
// Host arrays
|
||||
T* _A_h;
|
||||
T* _B_h;
|
||||
T* _C_h;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
HostMemory<T>::HostMemory(size_t numElements, bool usePinnedHost)
|
||||
: _usePinnedHost(usePinnedHost), _maxNumElements(numElements), _offset(0) {
|
||||
T** np = nullptr;
|
||||
HipTest::initArrays(np, np, np, &_A_h, &_B_h, &_C_h,
|
||||
numElements, usePinnedHost);
|
||||
|
||||
A_hh = NULL;
|
||||
B_hh = NULL;
|
||||
|
||||
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
|
||||
if (usePinnedHost) {
|
||||
HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&A_hh), sizeElements,
|
||||
hipHostMallocDefault));
|
||||
HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&B_hh), sizeElements,
|
||||
hipHostMallocDefault));
|
||||
} else {
|
||||
A_hh = reinterpret_cast<T*>(malloc(sizeElements));
|
||||
B_hh = reinterpret_cast<T*>(malloc(sizeElements));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostMemory<T>::reset(size_t numElements, bool full) {
|
||||
// Initialize the host data:
|
||||
for (size_t i = 0; i < numElements; i++) {
|
||||
(A_hh)[i] = 1097.0 + i;
|
||||
(B_hh)[i] = 1492.0 + i; // Phi
|
||||
|
||||
if (full) {
|
||||
(_A_h)[i] = 3.146f + i; // Pi
|
||||
(_B_h)[i] = 1.618f + i; // Phi
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostMemory<T>::~HostMemory() {
|
||||
HipTest::freeArraysForHost(_A_h, _B_h, _C_h, _usePinnedHost);
|
||||
|
||||
if (_usePinnedHost) {
|
||||
HIP_CHECK(hipHostFree(A_hh));
|
||||
HIP_CHECK(hipHostFree(B_hh));
|
||||
|
||||
} else {
|
||||
free(A_hh);
|
||||
free(B_hh);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
void memcpytest2_get_host_memory(size_t *free, size_t *total) {
|
||||
MEMORYSTATUSEX status;
|
||||
status.dwLength = sizeof(status);
|
||||
GlobalMemoryStatusEx(&status);
|
||||
// Windows doesn't allow allocating more than half of system memory to the gpu
|
||||
// Since the runtime also needs space for its internal allocations,
|
||||
// we should not try to allocate more than 40% of reported system memory,
|
||||
// otherwise we can run into OOM issues.
|
||||
*free = static_cast<size_t>(0.4 * status.ullAvailPhys);
|
||||
*total = static_cast<size_t>(0.4 * status.ullTotalPhys);
|
||||
}
|
||||
#else
|
||||
struct sysinfo memInfo;
|
||||
void memcpytest2_get_host_memory(size_t *free, size_t *total) {
|
||||
sysinfo(&memInfo);
|
||||
uint64_t freePhysMem = memInfo.freeram;
|
||||
freePhysMem *= memInfo.mem_unit;
|
||||
*free = freePhysMem;
|
||||
uint64_t totalPhysMem = memInfo.totalram;
|
||||
totalPhysMem *= memInfo.mem_unit;
|
||||
*total = totalPhysMem;
|
||||
}
|
||||
#endif
|
||||
|
||||
//---
|
||||
// Test many different kinds of memory copies.
|
||||
// The subroutine allocates memory , copies to device, runs a vector
|
||||
// add kernel, copies back, and
|
||||
// checks the result.
|
||||
//
|
||||
// IN: numElements controls the number of elements used for allocations.
|
||||
// IN: usePinnedHost : If true, allocate host with hipHostMalloc and is pinned
|
||||
// else allocate host
|
||||
// memory with malloc. IN: useHostToHost : If true, add an extra
|
||||
// host-to-host copy. IN:
|
||||
// useDeviceToDevice : If true, add an extra deviceto-device copy after
|
||||
// result is produced. IN:
|
||||
// useMemkindDefault : If true, use memkinddefault
|
||||
// (runtime figures out direction). if false, use
|
||||
// explicit memcpy direction.
|
||||
//
|
||||
template <typename T>
|
||||
void memcpytest2(DeviceMemory<T>* dmem, HostMemory<T>* hmem,
|
||||
size_t numElements, bool useHostToHost,
|
||||
bool useDeviceToDevice, bool useMemkindDefault) {
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
|
||||
hmem->reset(numElements);
|
||||
|
||||
assert(numElements <= dmem->maxNumElements());
|
||||
assert(numElements <= hmem->maxNumElements());
|
||||
|
||||
|
||||
if (useHostToHost) {
|
||||
// Do some extra host-to-host copies here to mix things up:
|
||||
HIP_CHECK(hipMemcpy(hmem->A_hh, hmem->A_h(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
HIP_CHECK(hipMemcpy(hmem->B_hh, hmem->B_h(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
|
||||
|
||||
HIP_CHECK(hipMemcpy(dmem->A_d(), hmem->A_hh, sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(dmem->B_d(), hmem->B_hh, sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpy(dmem->A_d(), hmem->A_h(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1), 0, 0,
|
||||
static_cast<const T*>(dmem->A_d()), static_cast<const T*>(dmem->B_d()),
|
||||
dmem->C_d(), numElements);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
|
||||
if (useDeviceToDevice) {
|
||||
// Do an extra device-to-device copy here to mix things up:
|
||||
HIP_CHECK(hipMemcpy(dmem->C_dd(), dmem->C_d(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyDeviceToDevice));
|
||||
|
||||
// Destroy the original dmem->C_d():
|
||||
HIP_CHECK(hipMemset(dmem->C_d(), 0x5A, sizeElements));
|
||||
|
||||
HIP_CHECK(hipMemcpy(hmem->C_h(), dmem->C_dd(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyDeviceToHost));
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpy(hmem->C_h(), dmem->C_d(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyDeviceToHost));
|
||||
}
|
||||
TEST_CASE("Unit_hipMemcpy_Positive_Basic") { MemcpyWithDirectionCommonTests<false>(hipMemcpy); }
|
||||
|
||||
TEST_CASE("Unit_hipMemcpy_Positive_Synchronization_Behavior") {
|
||||
using namespace std::placeholders;
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
HipTest::checkVectorADD(hmem->A_h(), hmem->B_h(), hmem->C_h(), numElements);
|
||||
|
||||
// For transfers from pageable host memory to device memory, a stream sync is performed before
|
||||
// the copy is initiated. The function will return once the pageable buffer has been copied to
|
||||
// the staging memory for DMA transfer to device memory, but the DMA to final destination may
|
||||
// not have completed.
|
||||
// For transfers from pinned host memory to device memory, the function is synchronous with
|
||||
// respect to the host
|
||||
SECTION("Host memory to device memory") {
|
||||
MemcpyHtoDSyncBehavior(std::bind(hipMemcpy, _1, _2, _3, hipMemcpyHostToDevice), true);
|
||||
}
|
||||
|
||||
printf(" %s success\n", __func__);
|
||||
}
|
||||
// For transfers from device to either pageable or pinned host memory, the function returns only
|
||||
// once the copy has completed
|
||||
SECTION("Device memory to host memory") {
|
||||
const auto f = std::bind(hipMemcpy, _1, _2, _3, hipMemcpyDeviceToHost);
|
||||
MemcpyDtoHPageableSyncBehavior(f, true);
|
||||
MemcpyDtoHPinnedSyncBehavior(f, true);
|
||||
}
|
||||
|
||||
// Try all the 16 possible combinations to memcpytest2 - usePinnedHost,
|
||||
// useHostToHost,
|
||||
// useDeviceToDevice, useMemkindDefault
|
||||
template <typename T>
|
||||
void memcpytest2_for_type(size_t numElements) {
|
||||
DeviceMemory<T> memD(numElements);
|
||||
HostMemory<T> memU(numElements, 0 /*usePinnedHost*/);
|
||||
HostMemory<T> memP(numElements, 1 /*usePinnedHost*/);
|
||||
// For transfers from device memory to device memory, no host-side synchronization is performed.
|
||||
SECTION("Device memory to device memory") {
|
||||
// This behavior differs on NVIDIA and AMD, on AMD the hipMemcpy calls is synchronous with
|
||||
// respect to the host
|
||||
#if HT_AMD
|
||||
HipTest::HIP_SKIP_TEST(
|
||||
"EXSWCPHIPT-127 - Memcpy from device to device memory behavior differs on AMD and Nvidia");
|
||||
return;
|
||||
#endif
|
||||
MemcpyDtoDSyncBehavior(std::bind(hipMemcpy, _1, _2, _3, hipMemcpyDeviceToDevice), false);
|
||||
}
|
||||
|
||||
for (int usePinnedHost = 0; usePinnedHost <= 1; usePinnedHost++) {
|
||||
for (int useHostToHost = 0; useHostToHost <= 1; useHostToHost++) {
|
||||
for (int useDeviceToDevice = 0; useDeviceToDevice <= 1;
|
||||
useDeviceToDevice++) {
|
||||
for (int useMemkindDefault = 0; useMemkindDefault <= 1;
|
||||
useMemkindDefault++) {
|
||||
memcpytest2<T>(&memD, usePinnedHost ? &memP : &memU,
|
||||
numElements, useHostToHost,
|
||||
useDeviceToDevice, useMemkindDefault);
|
||||
}
|
||||
}
|
||||
}
|
||||
// For transfers from any host memory to any host memory, the function is fully synchronous with
|
||||
// respect to the host
|
||||
SECTION("Host memory to host memory") {
|
||||
MemcpyHtoHSyncBehavior(std::bind(hipMemcpy, _1, _2, _3, hipMemcpyHostToHost), true);
|
||||
}
|
||||
}
|
||||
|
||||
// Try many different sizes to memory copy.
|
||||
template <typename T>
|
||||
void memcpytest2_sizes(size_t maxElem = 0) {
|
||||
int deviceId;
|
||||
HIP_CHECK(hipGetDevice(&deviceId));
|
||||
TEST_CASE("Unit_hipMemcpy_Negative_Parameters") {
|
||||
using namespace std::placeholders;
|
||||
|
||||
size_t free, total, freeCPU, totalCPU;
|
||||
HIP_CHECK(hipMemGetInfo(&free, &total));
|
||||
memcpytest2_get_host_memory(&freeCPU, &totalCPU);
|
||||
|
||||
if (maxElem == 0) {
|
||||
// Use lesser maxElem if not enough host memory available
|
||||
size_t maxElemGPU = free / sizeof(T) / 8;
|
||||
size_t maxElemCPU = freeCPU / sizeof(T) / 8;
|
||||
maxElem = maxElemGPU < maxElemCPU ? maxElemGPU : maxElemCPU;
|
||||
SECTION("Host to device") {
|
||||
LinearAllocGuard<int> device_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
MemcpyWithDirectionCommonNegativeTests(hipMemcpy, device_alloc.ptr(), host_alloc.ptr(),
|
||||
kPageSize, hipMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipDeviceReset());
|
||||
DeviceMemory<T> memD(maxElem);
|
||||
HostMemory<T> memU(maxElem, 0 /*usePinnedHost*/);
|
||||
HostMemory<T> memP(maxElem, 1 /*usePinnedHost*/);
|
||||
SECTION("Device to host") {
|
||||
LinearAllocGuard<int> device_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
MemcpyWithDirectionCommonNegativeTests(hipMemcpy, host_alloc.ptr(), device_alloc.ptr(),
|
||||
kPageSize, hipMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
for (size_t elem = 1; elem <= maxElem; elem *= 2) {
|
||||
memcpytest2<T>(&memD, &memU, elem, 1, 1, 0); // unpinned host
|
||||
memcpytest2<T>(&memD, &memP, elem, 1, 1, 0); // pinned host
|
||||
}
|
||||
}
|
||||
|
||||
// Try many different sizes to memory copy.
|
||||
template <typename T>
|
||||
void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) {
|
||||
int deviceId;
|
||||
HIP_CHECK(hipGetDevice(&deviceId));
|
||||
|
||||
size_t free, total;
|
||||
HIP_CHECK(hipMemGetInfo(&free, &total));
|
||||
|
||||
HIP_CHECK(hipDeviceReset());
|
||||
DeviceMemory<T> memD(maxElem);
|
||||
HostMemory<T> memU(maxElem, 0 /*usePinnedHost*/);
|
||||
HostMemory<T> memP(maxElem, 1 /*usePinnedHost*/);
|
||||
|
||||
size_t elem = maxElem / 2;
|
||||
|
||||
for (size_t offset = 0; offset < 512; offset++) {
|
||||
assert(elem + offset < maxElem);
|
||||
if (devOffsets) {
|
||||
memD.offset(offset);
|
||||
}
|
||||
if (hostOffsets) {
|
||||
memU.offset(offset);
|
||||
memP.offset(offset);
|
||||
}
|
||||
memcpytest2<T>(&memD, &memU, elem, 1, 1, 0); // unpinned host
|
||||
memcpytest2<T>(&memD, &memP, elem, 1, 1, 0); // pinned host
|
||||
}
|
||||
|
||||
for (size_t offset = 512; offset < elem; offset *= 2) {
|
||||
assert(elem + offset < maxElem);
|
||||
if (devOffsets) {
|
||||
memD.offset(offset);
|
||||
}
|
||||
if (hostOffsets) {
|
||||
memU.offset(offset);
|
||||
memP.offset(offset);
|
||||
}
|
||||
memcpytest2<T>(&memD, &memU, elem, 1, 1, 0); // unpinned host
|
||||
memcpytest2<T>(&memD, &memP, elem, 1, 1, 0); // pinned host
|
||||
}
|
||||
}
|
||||
|
||||
// Create multiple threads to stress multi-thread locking behavior in the
|
||||
// allocation/deallocation/tracking logic:
|
||||
template <typename T>
|
||||
void multiThread_1(bool serialize, bool usePinnedHost) {
|
||||
DeviceMemory<T> memD(NUM_ELM);
|
||||
HostMemory<T> mem1(NUM_ELM, usePinnedHost);
|
||||
HostMemory<T> mem2(NUM_ELM, usePinnedHost);
|
||||
|
||||
std::thread t1(memcpytest2<T>, &memD, &mem1, NUM_ELM, 0, 0, 0);
|
||||
if (serialize) {
|
||||
t1.join();
|
||||
}
|
||||
|
||||
|
||||
std::thread t2(memcpytest2<T>, &memD, &mem2, NUM_ELM, 0, 0, 0);
|
||||
if (serialize) {
|
||||
t2.join();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
This testcase verifies hipMemcpy API
|
||||
Initializes device variables
|
||||
Launches kernel and performs the sum of device variables
|
||||
copies the result to host variable and validates the result.
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_hipMemcpy_KernelLaunch", "", int, float,
|
||||
double) {
|
||||
size_t Nbytes = NUM_ELM * sizeof(TestType);
|
||||
|
||||
TestType *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
|
||||
TestType *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr};
|
||||
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, NUM_ELM, false);
|
||||
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1), 0, 0,
|
||||
static_cast<const TestType*>(A_d),
|
||||
static_cast<const TestType*>(B_d), C_d, NUM_ELM);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM);
|
||||
|
||||
HipTest::freeArrays<TestType>(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
}
|
||||
|
||||
/*
|
||||
This testcase verifies the following scenarios
|
||||
1. H2H,H2PinMem and PinnedMem2Host
|
||||
2. H2D-D2D-D2H in same GPU
|
||||
3. Pinned Host Memory to device variables in same GPU
|
||||
4. Device context change
|
||||
5. H2D-D2D-D2H peer GPU
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_hipMemcpy_H2H-H2D-D2H-H2PinMem", "", int,
|
||||
float, double) {
|
||||
TestType *A_d{nullptr}, *B_d{nullptr};
|
||||
TestType *A_h{nullptr}, *B_h{nullptr};
|
||||
TestType *A_Ph{nullptr}, *B_Ph{nullptr};
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HipTest::initArrays<TestType>(&A_d, &B_d, nullptr,
|
||||
&A_h, &B_h, nullptr,
|
||||
NUM_ELM*sizeof(TestType));
|
||||
HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
|
||||
&A_Ph, &B_Ph, nullptr,
|
||||
NUM_ELM*sizeof(TestType), true);
|
||||
|
||||
SECTION("H2H, H2PinMem and PinMem2H") {
|
||||
HIP_CHECK(hipMemcpy(B_h, A_h, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(A_Ph, B_h, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_Ph, A_Ph, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HipTest::checkTest(A_h, B_Ph, NUM_ELM);
|
||||
}
|
||||
|
||||
SECTION("H2D-D2D-D2H-SameGPU") {
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, NUM_ELM*sizeof(TestType), hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_d, A_d, NUM_ELM*sizeof(TestType), hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_h, B_d, NUM_ELM*sizeof(TestType), hipMemcpyDefault));
|
||||
HipTest::checkTest(A_h, B_h, NUM_ELM);
|
||||
}
|
||||
|
||||
SECTION("pH2D-D2D-D2pH-SameGPU") {
|
||||
HIP_CHECK(hipMemcpy(A_d, A_Ph, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_d, A_d, NUM_ELM*sizeof(TestType), hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_Ph, B_d, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HipTest::checkTest(A_Ph, B_Ph, NUM_ELM);
|
||||
}
|
||||
SECTION("H2D-D2D-D2H-DeviceContextChange") {
|
||||
int deviceCount = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&deviceCount));
|
||||
if (deviceCount < 2) {
|
||||
SUCCEED("deviceCount less then 2");
|
||||
} else {
|
||||
int canAccessPeer = 0;
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
|
||||
if (canAccessPeer) {
|
||||
HIP_CHECK(hipSetDevice(1));
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_d, A_d, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_h, B_d, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HipTest::checkTest(A_h, B_h, NUM_ELM);
|
||||
} else {
|
||||
SUCCEED("P2P capability is not present");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SECTION("H2D-D2D-D2H-PeerGPU") {
|
||||
int deviceCount = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&deviceCount));
|
||||
if (deviceCount < 2) {
|
||||
SUCCEED("deviceCount less then 2");
|
||||
} else {
|
||||
int canAccessPeer = 0;
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
|
||||
if (canAccessPeer) {
|
||||
HIP_CHECK(hipSetDevice(1));
|
||||
TestType *C_d{nullptr};
|
||||
HipTest::initArrays<TestType>(nullptr, nullptr, &C_d,
|
||||
nullptr, nullptr, nullptr,
|
||||
NUM_ELM*sizeof(TestType));
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(C_d, A_d, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_h, C_d, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HipTest::checkTest(A_h, B_h, NUM_ELM);
|
||||
HIP_CHECK(hipFree(C_d));
|
||||
} else {
|
||||
SUCCEED("P2P capability is not present");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
HipTest::freeArrays<TestType>(A_d, B_d, nullptr, A_h, B_h, nullptr, false);
|
||||
HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr, A_Ph,
|
||||
B_Ph, nullptr, true);
|
||||
}
|
||||
/*
|
||||
This testcase verifies the multi thread scenario
|
||||
*/
|
||||
TEST_CASE("Unit_hipMemcpy_MultiThreadWithSerialization") {
|
||||
HIP_CHECK(hipDeviceReset());
|
||||
|
||||
// Simplest cases: serialize the threads, and also used pinned memory:
|
||||
// This verifies that the sub-calls to memcpytest2 are correct.
|
||||
multiThread_1<float>(true, true);
|
||||
|
||||
// Serialize, but use unpinned memory to stress the unpinned memory xfer path.
|
||||
multiThread_1<float>(true, false);
|
||||
}
|
||||
|
||||
/*
|
||||
This testcase verifies hipMemcpy API with pinnedMemory and hostRegister
|
||||
along with kernel launches
|
||||
*/
|
||||
|
||||
TEMPLATE_TEST_CASE("Unit_hipMemcpy_PinnedRegMemWithKernelLaunch",
|
||||
"", int, float, double) {
|
||||
int numDevices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
||||
if (numDevices < 2) {
|
||||
SUCCEED("No of devices are less than 2");
|
||||
} else {
|
||||
// 1 refers to pinned Memory
|
||||
// 2 refers to register Memory
|
||||
int MallocPinType = GENERATE(0, 1);
|
||||
size_t Nbytes = NUM_ELM * sizeof(TestType);
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
|
||||
threadsPerBlock, NUM_ELM);
|
||||
|
||||
TestType *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
|
||||
TestType *X_d{nullptr}, *Y_d{nullptr}, *Z_d{nullptr};
|
||||
TestType *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr};
|
||||
if (MallocPinType) {
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, NUM_ELM, true);
|
||||
} else {
|
||||
A_h = reinterpret_cast<TestType*>(malloc(Nbytes));
|
||||
HIP_CHECK(hipHostRegister(A_h, Nbytes, hipHostRegisterDefault));
|
||||
B_h = reinterpret_cast<TestType*>(malloc(Nbytes));
|
||||
HIP_CHECK(hipHostRegister(B_h, Nbytes, hipHostRegisterDefault));
|
||||
C_h = reinterpret_cast<TestType*>(malloc(Nbytes));
|
||||
HIP_CHECK(hipHostRegister(C_h, Nbytes, hipHostRegisterDefault));
|
||||
HipTest::initArrays<TestType>(&A_d, &B_d, &C_d, nullptr, nullptr,
|
||||
nullptr, NUM_ELM, false);
|
||||
HipTest::setDefaultData<TestType>(NUM_ELM, A_h, B_h, C_h);
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, 0, static_cast<const TestType*>(A_d),
|
||||
static_cast<const TestType*>(B_d), C_d, NUM_ELM);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM);
|
||||
|
||||
unsigned int seed = time(0);
|
||||
HIP_CHECK(hipSetDevice(HipTest::RAND_R(&seed) % (numDevices-1)+1));
|
||||
|
||||
int device;
|
||||
HIP_CHECK(hipGetDevice(&device));
|
||||
std::cout <<"hipMemcpy is set to happen between device 0 and device "
|
||||
<<device << std::endl;
|
||||
HipTest::initArrays<TestType>(&X_d, &Y_d, &Z_d, nullptr,
|
||||
nullptr, nullptr, NUM_ELM, false);
|
||||
|
||||
for (int j = 0; j < NUM_ELM; j++) {
|
||||
A_h[j] = 0;
|
||||
B_h[j] = 0;
|
||||
C_h[j] = 0;
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(X_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(B_h, B_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(Y_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, 0, static_cast<const TestType*>(X_d),
|
||||
static_cast<const TestType*>(Y_d), Z_d, NUM_ELM);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM);
|
||||
|
||||
if (MallocPinType) {
|
||||
HipTest::freeArrays<TestType>(A_d, B_d, C_d, A_h, B_h, C_h, true);
|
||||
} else {
|
||||
HIP_CHECK(hipHostUnregister(A_h));
|
||||
free(A_h);
|
||||
HIP_CHECK(hipHostUnregister(B_h));
|
||||
free(B_h);
|
||||
HIP_CHECK(hipHostUnregister(C_h));
|
||||
free(C_h);
|
||||
HipTest::freeArrays<TestType>(A_d, B_d, C_d, nullptr,
|
||||
nullptr, nullptr, false);
|
||||
}
|
||||
HipTest::freeArrays<TestType>(X_d, Y_d, Z_d, nullptr,
|
||||
nullptr, nullptr, false);
|
||||
SECTION("Host to host") {
|
||||
LinearAllocGuard<int> src_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
LinearAllocGuard<int> dst_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
MemcpyWithDirectionCommonNegativeTests(hipMemcpy, dst_alloc.ptr(), src_alloc.ptr(), kPageSize,
|
||||
hipMemcpyHostToHost);
|
||||
}
|
||||
|
||||
SECTION("Device to device") {
|
||||
LinearAllocGuard<int> src_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
LinearAllocGuard<int> dst_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
MemcpyWithDirectionCommonNegativeTests(hipMemcpy, dst_alloc.ptr(), src_alloc.ptr(), kPageSize,
|
||||
hipMemcpyDeviceToDevice);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
/*
|
||||
Copyright (c) 2021-22-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
@@ -17,570 +19,83 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Different test for checking functionality of
|
||||
* hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,hipMemcpyKind kind,
|
||||
* hipStream_t stream);
|
||||
*/
|
||||
/*
|
||||
This testfile verifies the following scenarios
|
||||
1. hipMemcpyWithStream with one stream
|
||||
2. hipMemcpyWithStream with two streams
|
||||
3. Multi GPU and single stream
|
||||
4. hipMemcpyWithStream API with testkind DtoH
|
||||
5. hipMemcpyWithStream API with testkind DtoD
|
||||
6. hipMemcpyWithStream API with testkind HtoH
|
||||
7. hipMemcpyWithStream API with testkind TestkindDefault
|
||||
8. hipMemcpyWithStream API with testkind TestkindDefaultForDtoD
|
||||
9. hipMemcpyWithStream API DtoD on same device
|
||||
*/
|
||||
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <memcpy1d_tests_common.hh>
|
||||
#include <resource_guards.hh>
|
||||
#include <utils.hh>
|
||||
|
||||
#include<vector>
|
||||
#include<thread>
|
||||
#include<chrono>
|
||||
TEST_CASE("Unit_hipMemcpy_Positive_Basic") { MemcpyWithDirectionCommonTests<false>(hipMemcpy); }
|
||||
|
||||
#define LEN 64
|
||||
#define SIZE LEN << 2
|
||||
#define THREADS 2
|
||||
#define MAX_THREADS 16
|
||||
TEST_CASE("Unit_hipMemcpy_Positive_Synchronization_Behavior") {
|
||||
using namespace std::placeholders;
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
static constexpr size_t N{4 * 1024 * 1024};
|
||||
static const auto MaxGPUDevices{256};
|
||||
static constexpr unsigned blocksPerCU{6}; // to hide latency
|
||||
static constexpr unsigned threadsPerBlock{256};
|
||||
|
||||
enum class ops
|
||||
{ TestwithOnestream,
|
||||
TestwithTwoStream,
|
||||
TestOnMultiGPUwithOneStream,
|
||||
TestkindDtoH,
|
||||
TestkindDtoD,
|
||||
TestkindHtoH,
|
||||
TestkindDefault,
|
||||
TestkindDefaultForDtoD,
|
||||
TestDtoDonSameDevice,
|
||||
END_OF_LIST
|
||||
};
|
||||
|
||||
struct joinable_thread : std::thread {
|
||||
template <class... Xs>
|
||||
explicit joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
|
||||
{} // NOLINT
|
||||
|
||||
joinable_thread& operator=(joinable_thread&& other) = default;
|
||||
joinable_thread(joinable_thread&& other) = default;
|
||||
|
||||
~joinable_thread() {
|
||||
if (this->joinable())
|
||||
this->join();
|
||||
}
|
||||
};
|
||||
|
||||
void TestwithOnestream(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream, static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d), C_d, N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
void TestwithTwoStream(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
const int NUM_STREAMS = 2;
|
||||
int *A_d[NUM_STREAMS], *B_d[NUM_STREAMS], *C_d[NUM_STREAMS];
|
||||
int *A_h[NUM_STREAMS], *B_h[NUM_STREAMS], *C_h[NUM_STREAMS];
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HipTest::initArrays(&A_d[i], &B_d[i], &C_d[i],
|
||||
&A_h[i], &B_h[i], &C_h[i], N, false);
|
||||
// For transfers from pageable host memory to device memory, a stream sync is performed before
|
||||
// the copy is initiated. The function will return once the pageable buffer has been copied to
|
||||
// the staging memory for DMA transfer to device memory, but the DMA to final destination may
|
||||
// not have completed.
|
||||
// For transfers from pinned host memory to device memory, the function is synchronous with
|
||||
// respect to the host
|
||||
SECTION("Host memory to device memory") {
|
||||
MemcpyHtoDSyncBehavior(std::bind(hipMemcpy, _1, _2, _3, hipMemcpyHostToDevice), true);
|
||||
}
|
||||
|
||||
hipStream_t stream[NUM_STREAMS];
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
// For transfers from device to either pageable or pinned host memory, the function returns only
|
||||
// once the copy has completed
|
||||
SECTION("Device memory to host memory") {
|
||||
const auto f = std::bind(hipMemcpy, _1, _2, _3, hipMemcpyDeviceToHost);
|
||||
MemcpyDtoHPageableSyncBehavior(f, true);
|
||||
MemcpyDtoHPinnedSyncBehavior(f, true);
|
||||
}
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[i], A_h[i], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[i]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[i], B_h[i], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[i]));
|
||||
}
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream[i], static_cast<const int*>(A_d[i]),
|
||||
static_cast<const int*>(B_d[i]), C_d[i], N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream[i]));
|
||||
HIP_CHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h[i], B_h[i], C_h[i], N);
|
||||
}
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HipTest::freeArrays(A_d[i], B_d[i], C_d[i], A_h[i], B_h[i], C_h[i], false);
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
}
|
||||
|
||||
void TestDtoDonSameDevice(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
const int NUM_STREAMS = 2;
|
||||
int *A_d[NUM_STREAMS], *B_d[NUM_STREAMS], *C_d[NUM_STREAMS];
|
||||
int *A_h[NUM_STREAMS], *B_h[NUM_STREAMS], *C_h[NUM_STREAMS];
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0],
|
||||
&A_h[0], &B_h[0], &C_h[0], N, false);
|
||||
|
||||
|
||||
hipStream_t stream[NUM_STREAMS];
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipMalloc(&A_d[1], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&B_d[1], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&C_d[1], Nbytes));
|
||||
C_h[1] = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_ASSERT(C_h[1] != NULL);
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[1], A_d[0], Nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[1]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[1], B_d[0], Nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[1]));
|
||||
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream[i], static_cast<const int*>(A_d[i]),
|
||||
static_cast<const int*>(B_d[i]), C_d[i], N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipStreamSynchronize(stream[i]));
|
||||
HIP_CHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
|
||||
}
|
||||
|
||||
|
||||
HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
|
||||
|
||||
if (A_d[1]) {
|
||||
HIP_CHECK(hipFree(A_d[1]));
|
||||
}
|
||||
if (B_d[1]) {
|
||||
HIP_CHECK(hipFree(B_d[1]));
|
||||
}
|
||||
if (C_d[1]) {
|
||||
HIP_CHECK(hipFree(C_d[1]));
|
||||
}
|
||||
if (C_h[1]) {
|
||||
free(C_h[1]);
|
||||
}
|
||||
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
}
|
||||
|
||||
void TestOnMultiGPUwithOneStream(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int NumDevices = 0;
|
||||
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevices));
|
||||
// If you have single GPU machine the return
|
||||
if (NumDevices <= 1) {
|
||||
SUCCEED("NumDevices <2");
|
||||
} else {
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
int *A_d[MaxGPUDevices], *B_d[MaxGPUDevices], *C_d[MaxGPUDevices];
|
||||
int *A_h[MaxGPUDevices], *B_h[MaxGPUDevices], *C_h[MaxGPUDevices];
|
||||
|
||||
hipStream_t stream[MaxGPUDevices];
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HipTest::initArrays(&A_d[i], &B_d[i], &C_d[i],
|
||||
&A_h[i], &B_h[i], &C_h[i], N, false);
|
||||
}
|
||||
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[i], A_h[i], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[i]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[i], B_h[i], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[i]));
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
|
||||
dim3(threadsPerBlock), 0, stream[i],
|
||||
static_cast<const int*>(A_d[i]),
|
||||
static_cast<const int*>(B_d[i]), C_d[i], N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamSynchronize(stream[i]));
|
||||
HIP_CHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h[i], B_h[i], C_h[i], N);
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HipTest::freeArrays(A_d[i], B_d[i], C_d[i],
|
||||
A_h[i], B_h[i], C_h[i], false);
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestkindDtoH(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream, static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d), C_d, N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipMemcpyWithStream(C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost, stream));
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
void TestkindDtoD(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int NumDevices = 0;
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevices));
|
||||
// If you have single GPU machine the return
|
||||
if (NumDevices <= 1) {
|
||||
SUCCEED("NumDevices are less than 2");
|
||||
} else {
|
||||
int *A_d[MaxGPUDevices], *B_d[MaxGPUDevices], *C_d[MaxGPUDevices];
|
||||
int *A_h[MaxGPUDevices], *B_h[MaxGPUDevices], *C_h[MaxGPUDevices];
|
||||
|
||||
hipStream_t stream[MaxGPUDevices];
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
// Initialize and create the host and device elements for first device
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0],
|
||||
&A_h[0], &B_h[0], &C_h[0], N, false);
|
||||
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i))
|
||||
HIP_CHECK(hipMalloc(&A_d[i], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&B_d[i], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&C_d[i], Nbytes));
|
||||
C_h[i] = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_ASSERT(C_h[i] != NULL);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
|
||||
// Copying device data from 1st GPU to the rest of the the GPUs that is
|
||||
// NumDevices in the setup. 1st GPU start numbering from 0,1,2..n etc.
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[i], A_d[0], Nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[i]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[i], B_d[0], Nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[i]));
|
||||
}
|
||||
|
||||
|
||||
// Launching the kernel including the 1st GPU to the no of GPUs present
|
||||
// in the setup. 1st GPU start numbering from 0,1,2..n etc.
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0, stream[i], static_cast<const int*>(A_d[i]),
|
||||
static_cast<const int*>(B_d[i]), C_d[i], N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamSynchronize(stream[i]));
|
||||
HIP_CHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
|
||||
}
|
||||
|
||||
HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
|
||||
HIP_CHECK(hipStreamDestroy(stream[0]));
|
||||
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
if (A_d[i]) {
|
||||
HIP_CHECK(hipFree(A_d[i]));
|
||||
}
|
||||
if (B_d[i]) {
|
||||
HIP_CHECK(hipFree(B_d[i]));
|
||||
}
|
||||
if (C_d[i]) {
|
||||
HIP_CHECK(hipFree(C_d[i]));
|
||||
}
|
||||
if (C_h[i]) {
|
||||
free(C_h[i]);
|
||||
}
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestkindDefault(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d, A_h, Nbytes, hipMemcpyDefault, stream));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d, B_h, Nbytes, hipMemcpyDefault, stream));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream, static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d), C_d, N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipMemcpyWithStream(C_h, C_d, Nbytes, hipMemcpyDefault, stream));
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
void TestkindDefaultForDtoD(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int NumDevices = 0;
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevices));
|
||||
// Test case will not run on single GPU setup.
|
||||
if (NumDevices <= 1) {
|
||||
SUCCEED("No of Devices < 2");
|
||||
} else {
|
||||
int *A_d[MaxGPUDevices], *B_d[MaxGPUDevices], *C_d[MaxGPUDevices];
|
||||
int *A_h[MaxGPUDevices], *B_h[MaxGPUDevices], *C_h[MaxGPUDevices];
|
||||
|
||||
// Initialize and create the host and device elements for first device
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0],
|
||||
&A_h[0], &B_h[0], &C_h[0], N, false);
|
||||
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipMalloc(&A_d[i], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&B_d[i], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&C_d[i], Nbytes));
|
||||
C_h[i] = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_ASSERT(C_h[i] != NULL);
|
||||
}
|
||||
|
||||
hipStream_t stream[MaxGPUDevices];
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
|
||||
// Copying device data from 1st GPU to the rest of the the GPUs
|
||||
// using hipMemcpyDefault kind that is NumDevices in the setup.
|
||||
// 1st GPU start numbering from 0,1,2..n etc.
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[i], A_d[0], Nbytes,
|
||||
hipMemcpyDefault, stream[i]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[i], B_d[0], Nbytes,
|
||||
hipMemcpyDefault, stream[i]));
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0, stream[i], static_cast<const int*>(A_d[i]),
|
||||
static_cast<const int*>(B_d[i]), C_d[i], N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i)); // hipMemcpy will be on this device
|
||||
HIP_CHECK(hipStreamSynchronize(stream[i]));
|
||||
HIP_CHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
|
||||
// Output of each GPU is getting validated with input of 1st GPU.
|
||||
HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
|
||||
}
|
||||
|
||||
HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
|
||||
HIP_CHECK(hipStreamDestroy(stream[0]));
|
||||
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
if (A_d[i]) {
|
||||
HIP_CHECK(hipFree(A_d[i]));
|
||||
}
|
||||
if (B_d[i]) {
|
||||
HIP_CHECK(hipFree(B_d[i]));
|
||||
}
|
||||
if (C_d[i]) {
|
||||
HIP_CHECK(hipFree(C_d[i]));
|
||||
}
|
||||
if (C_h[i]) {
|
||||
free(C_h[i]);
|
||||
}
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestkindHtoH(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int *A_h, *B_h;
|
||||
|
||||
|
||||
// Allocate memory to A_h and B_h
|
||||
A_h = static_cast<int*>(malloc(Nbytes));
|
||||
HIP_ASSERT(A_h != NULL);
|
||||
B_h = static_cast<int*>(malloc(Nbytes));
|
||||
HIP_ASSERT(B_h != NULL);
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
if (A_h) {
|
||||
(A_h)[i] = 3.146f + i; // Pi
|
||||
}
|
||||
}
|
||||
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(B_h, A_h, Nbytes, hipMemcpyHostToHost, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
HIP_ASSERT(A_h[i] == B_h[i]);
|
||||
}
|
||||
|
||||
if (A_h) {
|
||||
free(A_h);
|
||||
}
|
||||
if (B_h) {
|
||||
free(B_h);
|
||||
}
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestWithOneStream") {
|
||||
TestwithOnestream();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestwithTwoStream") {
|
||||
TestwithTwoStream();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestkindDtoH") {
|
||||
TestkindDtoH();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestkindHtoH") {
|
||||
TestkindHtoH();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestkindDtoD") {
|
||||
TestkindDtoD();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestOnMultiGPUwithOneStream") {
|
||||
TestOnMultiGPUwithOneStream();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestkindDefault") {
|
||||
TestkindDefault();
|
||||
}
|
||||
#ifndef __HIP_PLATFORM_NVCC__
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestkindDefaultForDtoD") {
|
||||
TestkindDefaultForDtoD();
|
||||
}
|
||||
// For transfers from device memory to device memory, no host-side synchronization is performed.
|
||||
SECTION("Device memory to device memory") {
|
||||
// This behavior differs on NVIDIA and AMD, on AMD the hipMemcpy calls is synchronous with
|
||||
// respect to the host
|
||||
#if HT_AMD
|
||||
HipTest::HIP_SKIP_TEST(
|
||||
"EXSWCPHIPT-127 - Memcpy from device to device memory behavior differs on AMD and Nvidia");
|
||||
return;
|
||||
#endif
|
||||
MemcpyDtoDSyncBehavior(std::bind(hipMemcpy, _1, _2, _3, hipMemcpyDeviceToDevice), false);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestDtoDonSameDevice") {
|
||||
TestDtoDonSameDevice();
|
||||
// For transfers from any host memory to any host memory, the function is fully synchronous with
|
||||
// respect to the host
|
||||
SECTION("Host memory to host memory") {
|
||||
MemcpyHtoHSyncBehavior(std::bind(hipMemcpy, _1, _2, _3, hipMemcpyHostToHost), true);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpy_Negative_Parameters") {
|
||||
using namespace std::placeholders;
|
||||
|
||||
SECTION("Host to device") {
|
||||
LinearAllocGuard<int> device_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
MemcpyWithDirectionCommonNegativeTests(hipMemcpy, device_alloc.ptr(), host_alloc.ptr(),
|
||||
kPageSize, hipMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
SECTION("Device to host") {
|
||||
LinearAllocGuard<int> device_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
MemcpyWithDirectionCommonNegativeTests(hipMemcpy, host_alloc.ptr(), device_alloc.ptr(),
|
||||
kPageSize, hipMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
SECTION("Host to host") {
|
||||
LinearAllocGuard<int> src_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
LinearAllocGuard<int> dst_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
MemcpyWithDirectionCommonNegativeTests(hipMemcpy, dst_alloc.ptr(), src_alloc.ptr(), kPageSize,
|
||||
hipMemcpyHostToHost);
|
||||
}
|
||||
|
||||
SECTION("Device to device") {
|
||||
LinearAllocGuard<int> src_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
LinearAllocGuard<int> dst_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
MemcpyWithDirectionCommonNegativeTests(hipMemcpy, dst_alloc.ptr(), src_alloc.ptr(), kPageSize,
|
||||
hipMemcpyDeviceToDevice);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,586 @@
|
||||
/*
|
||||
Copyright (c) 2021-22-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Different test for checking functionality of
|
||||
* hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,hipMemcpyKind kind,
|
||||
* hipStream_t stream);
|
||||
*/
|
||||
/*
|
||||
This testfile verifies the following scenarios
|
||||
1. hipMemcpyWithStream with one stream
|
||||
2. hipMemcpyWithStream with two streams
|
||||
3. Multi GPU and single stream
|
||||
4. hipMemcpyWithStream API with testkind DtoH
|
||||
5. hipMemcpyWithStream API with testkind DtoD
|
||||
6. hipMemcpyWithStream API with testkind HtoH
|
||||
7. hipMemcpyWithStream API with testkind TestkindDefault
|
||||
8. hipMemcpyWithStream API with testkind TestkindDefaultForDtoD
|
||||
9. hipMemcpyWithStream API DtoD on same device
|
||||
*/
|
||||
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
|
||||
#include<vector>
|
||||
#include<thread>
|
||||
#include<chrono>
|
||||
|
||||
#define LEN 64
|
||||
#define SIZE LEN << 2
|
||||
#define THREADS 2
|
||||
#define MAX_THREADS 16
|
||||
|
||||
static constexpr size_t N{4 * 1024 * 1024};
|
||||
static const auto MaxGPUDevices{256};
|
||||
static constexpr unsigned blocksPerCU{6}; // to hide latency
|
||||
static constexpr unsigned threadsPerBlock{256};
|
||||
|
||||
enum class ops
|
||||
{ TestwithOnestream,
|
||||
TestwithTwoStream,
|
||||
TestOnMultiGPUwithOneStream,
|
||||
TestkindDtoH,
|
||||
TestkindDtoD,
|
||||
TestkindHtoH,
|
||||
TestkindDefault,
|
||||
TestkindDefaultForDtoD,
|
||||
TestDtoDonSameDevice,
|
||||
END_OF_LIST
|
||||
};
|
||||
|
||||
struct joinable_thread : std::thread {
|
||||
template <class... Xs>
|
||||
explicit joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
|
||||
{} // NOLINT
|
||||
|
||||
joinable_thread& operator=(joinable_thread&& other) = default;
|
||||
joinable_thread(joinable_thread&& other) = default;
|
||||
|
||||
~joinable_thread() {
|
||||
if (this->joinable())
|
||||
this->join();
|
||||
}
|
||||
};
|
||||
|
||||
void TestwithOnestream(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream, static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d), C_d, N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
void TestwithTwoStream(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
const int NUM_STREAMS = 2;
|
||||
int *A_d[NUM_STREAMS], *B_d[NUM_STREAMS], *C_d[NUM_STREAMS];
|
||||
int *A_h[NUM_STREAMS], *B_h[NUM_STREAMS], *C_h[NUM_STREAMS];
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HipTest::initArrays(&A_d[i], &B_d[i], &C_d[i],
|
||||
&A_h[i], &B_h[i], &C_h[i], N, false);
|
||||
}
|
||||
|
||||
hipStream_t stream[NUM_STREAMS];
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[i], A_h[i], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[i]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[i], B_h[i], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[i]));
|
||||
}
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream[i], static_cast<const int*>(A_d[i]),
|
||||
static_cast<const int*>(B_d[i]), C_d[i], N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream[i]));
|
||||
HIP_CHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h[i], B_h[i], C_h[i], N);
|
||||
}
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HipTest::freeArrays(A_d[i], B_d[i], C_d[i], A_h[i], B_h[i], C_h[i], false);
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
}
|
||||
|
||||
void TestDtoDonSameDevice(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
const int NUM_STREAMS = 2;
|
||||
int *A_d[NUM_STREAMS], *B_d[NUM_STREAMS], *C_d[NUM_STREAMS];
|
||||
int *A_h[NUM_STREAMS], *B_h[NUM_STREAMS], *C_h[NUM_STREAMS];
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0],
|
||||
&A_h[0], &B_h[0], &C_h[0], N, false);
|
||||
|
||||
|
||||
hipStream_t stream[NUM_STREAMS];
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipMalloc(&A_d[1], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&B_d[1], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&C_d[1], Nbytes));
|
||||
C_h[1] = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_ASSERT(C_h[1] != NULL);
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[1], A_d[0], Nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[1]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[1], B_d[0], Nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[1]));
|
||||
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream[i], static_cast<const int*>(A_d[i]),
|
||||
static_cast<const int*>(B_d[i]), C_d[i], N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipStreamSynchronize(stream[i]));
|
||||
HIP_CHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
|
||||
}
|
||||
|
||||
|
||||
HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
|
||||
|
||||
if (A_d[1]) {
|
||||
HIP_CHECK(hipFree(A_d[1]));
|
||||
}
|
||||
if (B_d[1]) {
|
||||
HIP_CHECK(hipFree(B_d[1]));
|
||||
}
|
||||
if (C_d[1]) {
|
||||
HIP_CHECK(hipFree(C_d[1]));
|
||||
}
|
||||
if (C_h[1]) {
|
||||
free(C_h[1]);
|
||||
}
|
||||
|
||||
|
||||
for (int i=0; i < NUM_STREAMS; ++i) {
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
}
|
||||
|
||||
void TestOnMultiGPUwithOneStream(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int NumDevices = 0;
|
||||
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevices));
|
||||
// If you have single GPU machine the return
|
||||
if (NumDevices <= 1) {
|
||||
SUCCEED("NumDevices <2");
|
||||
} else {
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
int *A_d[MaxGPUDevices], *B_d[MaxGPUDevices], *C_d[MaxGPUDevices];
|
||||
int *A_h[MaxGPUDevices], *B_h[MaxGPUDevices], *C_h[MaxGPUDevices];
|
||||
|
||||
hipStream_t stream[MaxGPUDevices];
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HipTest::initArrays(&A_d[i], &B_d[i], &C_d[i],
|
||||
&A_h[i], &B_h[i], &C_h[i], N, false);
|
||||
}
|
||||
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[i], A_h[i], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[i]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[i], B_h[i], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[i]));
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
|
||||
dim3(threadsPerBlock), 0, stream[i],
|
||||
static_cast<const int*>(A_d[i]),
|
||||
static_cast<const int*>(B_d[i]), C_d[i], N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamSynchronize(stream[i]));
|
||||
HIP_CHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h[i], B_h[i], C_h[i], N);
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HipTest::freeArrays(A_d[i], B_d[i], C_d[i],
|
||||
A_h[i], B_h[i], C_h[i], false);
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestkindDtoH(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream, static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d), C_d, N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipMemcpyWithStream(C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost, stream));
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
void TestkindDtoD(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int NumDevices = 0;
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevices));
|
||||
// If you have single GPU machine the return
|
||||
if (NumDevices <= 1) {
|
||||
SUCCEED("NumDevices are less than 2");
|
||||
} else {
|
||||
int *A_d[MaxGPUDevices], *B_d[MaxGPUDevices], *C_d[MaxGPUDevices];
|
||||
int *A_h[MaxGPUDevices], *B_h[MaxGPUDevices], *C_h[MaxGPUDevices];
|
||||
|
||||
hipStream_t stream[MaxGPUDevices];
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
// Initialize and create the host and device elements for first device
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0],
|
||||
&A_h[0], &B_h[0], &C_h[0], N, false);
|
||||
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i))
|
||||
HIP_CHECK(hipMalloc(&A_d[i], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&B_d[i], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&C_d[i], Nbytes));
|
||||
C_h[i] = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_ASSERT(C_h[i] != NULL);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
|
||||
// Copying device data from 1st GPU to the rest of the the GPUs that is
|
||||
// NumDevices in the setup. 1st GPU start numbering from 0,1,2..n etc.
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[i], A_d[0], Nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[i]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[i], B_d[0], Nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[i]));
|
||||
}
|
||||
|
||||
|
||||
// Launching the kernel including the 1st GPU to the no of GPUs present
|
||||
// in the setup. 1st GPU start numbering from 0,1,2..n etc.
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0, stream[i], static_cast<const int*>(A_d[i]),
|
||||
static_cast<const int*>(B_d[i]), C_d[i], N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamSynchronize(stream[i]));
|
||||
HIP_CHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
|
||||
}
|
||||
|
||||
HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
|
||||
HIP_CHECK(hipStreamDestroy(stream[0]));
|
||||
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
if (A_d[i]) {
|
||||
HIP_CHECK(hipFree(A_d[i]));
|
||||
}
|
||||
if (B_d[i]) {
|
||||
HIP_CHECK(hipFree(B_d[i]));
|
||||
}
|
||||
if (C_d[i]) {
|
||||
HIP_CHECK(hipFree(C_d[i]));
|
||||
}
|
||||
if (C_h[i]) {
|
||||
free(C_h[i]);
|
||||
}
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestkindDefault(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d, A_h, Nbytes, hipMemcpyDefault, stream));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d, B_h, Nbytes, hipMemcpyDefault, stream));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream, static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d), C_d, N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipMemcpyWithStream(C_h, C_d, Nbytes, hipMemcpyDefault, stream));
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
void TestkindDefaultForDtoD(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int NumDevices = 0;
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevices));
|
||||
// Test case will not run on single GPU setup.
|
||||
if (NumDevices <= 1) {
|
||||
SUCCEED("No of Devices < 2");
|
||||
} else {
|
||||
int *A_d[MaxGPUDevices], *B_d[MaxGPUDevices], *C_d[MaxGPUDevices];
|
||||
int *A_h[MaxGPUDevices], *B_h[MaxGPUDevices], *C_h[MaxGPUDevices];
|
||||
|
||||
// Initialize and create the host and device elements for first device
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0],
|
||||
&A_h[0], &B_h[0], &C_h[0], N, false);
|
||||
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipMalloc(&A_d[i], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&B_d[i], Nbytes));
|
||||
HIP_CHECK(hipMalloc(&C_d[i], Nbytes));
|
||||
C_h[i] = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_ASSERT(C_h[i] != NULL);
|
||||
}
|
||||
|
||||
hipStream_t stream[MaxGPUDevices];
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes,
|
||||
hipMemcpyHostToDevice, stream[0]));
|
||||
|
||||
// Copying device data from 1st GPU to the rest of the the GPUs
|
||||
// using hipMemcpyDefault kind that is NumDevices in the setup.
|
||||
// 1st GPU start numbering from 0,1,2..n etc.
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipMemcpyWithStream(A_d[i], A_d[0], Nbytes,
|
||||
hipMemcpyDefault, stream[i]));
|
||||
HIP_CHECK(hipMemcpyWithStream(B_d[i], B_d[0], Nbytes,
|
||||
hipMemcpyDefault, stream[i]));
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0, stream[i], static_cast<const int*>(A_d[i]),
|
||||
static_cast<const int*>(B_d[i]), C_d[i], N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (int i=0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i)); // hipMemcpy will be on this device
|
||||
HIP_CHECK(hipStreamSynchronize(stream[i]));
|
||||
HIP_CHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
|
||||
// Output of each GPU is getting validated with input of 1st GPU.
|
||||
HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
|
||||
}
|
||||
|
||||
HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
|
||||
HIP_CHECK(hipStreamDestroy(stream[0]));
|
||||
|
||||
for (int i=1; i < NumDevices; ++i) {
|
||||
if (A_d[i]) {
|
||||
HIP_CHECK(hipFree(A_d[i]));
|
||||
}
|
||||
if (B_d[i]) {
|
||||
HIP_CHECK(hipFree(B_d[i]));
|
||||
}
|
||||
if (C_d[i]) {
|
||||
HIP_CHECK(hipFree(C_d[i]));
|
||||
}
|
||||
if (C_h[i]) {
|
||||
free(C_h[i]);
|
||||
}
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestkindHtoH(void) {
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int *A_h, *B_h;
|
||||
|
||||
|
||||
// Allocate memory to A_h and B_h
|
||||
A_h = static_cast<int*>(malloc(Nbytes));
|
||||
HIP_ASSERT(A_h != NULL);
|
||||
B_h = static_cast<int*>(malloc(Nbytes));
|
||||
HIP_ASSERT(B_h != NULL);
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
if (A_h) {
|
||||
(A_h)[i] = 3.146f + i; // Pi
|
||||
}
|
||||
}
|
||||
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyWithStream(B_h, A_h, Nbytes, hipMemcpyHostToHost, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
HIP_ASSERT(A_h[i] == B_h[i]);
|
||||
}
|
||||
|
||||
if (A_h) {
|
||||
free(A_h);
|
||||
}
|
||||
if (B_h) {
|
||||
free(B_h);
|
||||
}
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestWithOneStream") {
|
||||
TestwithOnestream();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestwithTwoStream") {
|
||||
TestwithTwoStream();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestkindDtoH") {
|
||||
TestkindDtoH();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestkindHtoH") {
|
||||
TestkindHtoH();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestkindDtoD") {
|
||||
TestkindDtoD();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestOnMultiGPUwithOneStream") {
|
||||
TestOnMultiGPUwithOneStream();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestkindDefault") {
|
||||
TestkindDefault();
|
||||
}
|
||||
#ifndef __HIP_PLATFORM_NVCC__
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestkindDefaultForDtoD") {
|
||||
TestkindDefaultForDtoD();
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyWithStream_TestDtoDonSameDevice") {
|
||||
TestDtoDonSameDevice();
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <memcpy1d_tests_common.hh>
|
||||
#include <resource_guards.hh>
|
||||
#include <utils.hh>
|
||||
|
||||
// hipMemcpyDtoH
|
||||
TEST_CASE("Unit_hipMemcpyDtoH_Positive_Basic") {
|
||||
MemcpyDeviceToHostShell<false>([](void* dst, void* src, size_t count) {
|
||||
return hipMemcpyDtoH(dst, reinterpret_cast<hipDeviceptr_t>(src), count);
|
||||
});
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyDtoH_Positive_Synchronization_Behavior") {
|
||||
const auto f = [](void* dst, void* src, size_t count) {
|
||||
return hipMemcpyDtoH(dst, reinterpret_cast<hipDeviceptr_t>(src), count);
|
||||
};
|
||||
MemcpyDtoHPageableSyncBehavior(f, true);
|
||||
MemcpyDtoHPinnedSyncBehavior(f, true);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyDtoH_Negative_Parameters") {
|
||||
using namespace std::placeholders;
|
||||
LinearAllocGuard<int> device_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
|
||||
MemcpyCommonNegativeTests(
|
||||
[](void* dst, void* src, size_t count) {
|
||||
return hipMemcpyDtoH(dst, reinterpret_cast<hipDeviceptr_t>(src), count);
|
||||
},
|
||||
host_alloc.ptr(), device_alloc.ptr(), kPageSize);
|
||||
}
|
||||
|
||||
// hipMemcpyHtoD
|
||||
TEST_CASE("Unit_hipMemcpyHtoD_Positive_Basic") {
|
||||
MemcpyHostToDeviceShell<false>([](void* dst, void* src, size_t count) {
|
||||
return hipMemcpyHtoD(reinterpret_cast<hipDeviceptr_t>(dst), src, count);
|
||||
});
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyHtoD_Positive_Synchronization_Behavior") {
|
||||
MemcpyHtoDSyncBehavior(
|
||||
[](void* dst, void* src, size_t count) {
|
||||
return hipMemcpyHtoD(reinterpret_cast<hipDeviceptr_t>(dst), src, count);
|
||||
},
|
||||
true);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyHtoD_Negative_Parameters") {
|
||||
using namespace std::placeholders;
|
||||
LinearAllocGuard<int> device_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, kPageSize);
|
||||
|
||||
MemcpyCommonNegativeTests(
|
||||
[](void* dst, void* src, size_t count) {
|
||||
return hipMemcpyHtoD(reinterpret_cast<hipDeviceptr_t>(dst), src, count);
|
||||
},
|
||||
device_alloc.ptr(), host_alloc.ptr(), kPageSize);
|
||||
}
|
||||
|
||||
// hipMemcpyDtoD
|
||||
TEST_CASE("Unit_hipMemcpyDtoD_Positive_Basic") {
|
||||
const auto f = [](void* dst, void* src, size_t count) {
|
||||
return hipMemcpyDtoD(reinterpret_cast<hipDeviceptr_t>(dst),
|
||||
reinterpret_cast<hipDeviceptr_t>(src), count);
|
||||
};
|
||||
SECTION("Peer access enabled") { MemcpyDeviceToDeviceShell<false, true>(f); }
|
||||
SECTION("Peer access disabled") { MemcpyDeviceToDeviceShell<false, false>(f); }
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyDtoD_Positive_Synchronization_Behavior") {
|
||||
// This behavior differs on NVIDIA and AMD, on AMD the hipMemcpy calls is synchronous with
|
||||
// respect to the host
|
||||
#if HT_AMD
|
||||
HipTest::HIP_SKIP_TEST(
|
||||
"EXSWCPHIPT-127 - Memcpy from device to device memory behavior differs on AMD and Nvidia");
|
||||
return;
|
||||
#endif
|
||||
MemcpyDtoDSyncBehavior(
|
||||
[](void* dst, void* src, size_t count) {
|
||||
return hipMemcpyDtoD(reinterpret_cast<hipDeviceptr_t>(dst),
|
||||
reinterpret_cast<hipDeviceptr_t>(src), count);
|
||||
},
|
||||
false);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemcpyDtoD_Negative_Parameters") {
|
||||
using namespace std::placeholders;
|
||||
LinearAllocGuard<int> src_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
LinearAllocGuard<int> dst_alloc(LinearAllocs::hipMalloc, kPageSize);
|
||||
|
||||
MemcpyCommonNegativeTests(
|
||||
[](void* dst, void* src, size_t count) {
|
||||
return hipMemcpyDtoD(reinterpret_cast<hipDeviceptr_t>(dst),
|
||||
reinterpret_cast<hipDeviceptr_t>(src), count);
|
||||
},
|
||||
dst_alloc.ptr(), src_alloc.ptr(), kPageSize);
|
||||
}
|
||||
@@ -0,0 +1,619 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
This testcase verifies following scenarios
|
||||
1. hipMemcpy API along with kernel launch with different data types
|
||||
2. H2D-D2D-D2H scenarios for unpinned and pinned memory
|
||||
3. Boundary checks with different sizes
|
||||
4. Multithread scenario
|
||||
5. device offset scenario
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include "sys/types.h"
|
||||
#include "sys/sysinfo.h"
|
||||
#endif
|
||||
|
||||
|
||||
static constexpr auto NUM_ELM{4*1024 * 1024};
|
||||
static unsigned blocksPerCU{6}; // to hide latency
|
||||
static unsigned threadsPerBlock{256};
|
||||
|
||||
template<typename T>
|
||||
class DeviceMemory {
|
||||
public:
|
||||
explicit DeviceMemory(size_t numElements);
|
||||
DeviceMemory() = delete;
|
||||
~DeviceMemory();
|
||||
T* A_d() const { return _A_d + _offset; }
|
||||
T* B_d() const { return _B_d + _offset; }
|
||||
T* C_d() const { return _C_d + _offset; }
|
||||
T* C_dd() const { return _C_dd + _offset; }
|
||||
size_t maxNumElements() const { return _maxNumElements; }
|
||||
void offset(int offset) { _offset = offset; }
|
||||
int offset() const { return _offset; }
|
||||
private:
|
||||
T* _A_d;
|
||||
T* _B_d;
|
||||
T* _C_d;
|
||||
T* _C_dd;
|
||||
size_t _maxNumElements;
|
||||
int _offset;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
DeviceMemory<T>::DeviceMemory(size_t numElements) :
|
||||
_maxNumElements(numElements), _offset(0) {
|
||||
T** np = nullptr;
|
||||
HipTest::initArrays(&_A_d, &_B_d, &_C_d, np, np, np, numElements, 0);
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
HIP_CHECK(hipMalloc(&_C_dd, sizeElements));
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
DeviceMemory<T>::~DeviceMemory() {
|
||||
T* np = nullptr;
|
||||
HipTest::freeArrays<T>(_A_d, _B_d, _C_d, np, np, np, 0);
|
||||
HIP_CHECK(hipFree(_C_dd));
|
||||
_C_dd = NULL;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
class HostMemory {
|
||||
public:
|
||||
HostMemory(size_t numElements, bool usePinnedHost);
|
||||
HostMemory() = delete;
|
||||
void reset(size_t numElements, bool full = false);
|
||||
~HostMemory();
|
||||
T* A_h() const { return _A_h + _offset; }
|
||||
T* B_h() const { return _B_h + _offset; }
|
||||
T* C_h() const { return _C_h + _offset; }
|
||||
|
||||
size_t maxNumElements() const { return _maxNumElements; }
|
||||
void offset(int offset) { _offset = offset; }
|
||||
int offset() const { return _offset; }
|
||||
|
||||
// Host arrays, secondary copy
|
||||
T* A_hh;
|
||||
T* B_hh;
|
||||
bool _usePinnedHost;
|
||||
|
||||
private:
|
||||
size_t _maxNumElements;
|
||||
int _offset;
|
||||
|
||||
// Host arrays
|
||||
T* _A_h;
|
||||
T* _B_h;
|
||||
T* _C_h;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
HostMemory<T>::HostMemory(size_t numElements, bool usePinnedHost)
|
||||
: _usePinnedHost(usePinnedHost), _maxNumElements(numElements), _offset(0) {
|
||||
T** np = nullptr;
|
||||
HipTest::initArrays(np, np, np, &_A_h, &_B_h, &_C_h,
|
||||
numElements, usePinnedHost);
|
||||
|
||||
A_hh = NULL;
|
||||
B_hh = NULL;
|
||||
|
||||
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
|
||||
if (usePinnedHost) {
|
||||
HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&A_hh), sizeElements,
|
||||
hipHostMallocDefault));
|
||||
HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&B_hh), sizeElements,
|
||||
hipHostMallocDefault));
|
||||
} else {
|
||||
A_hh = reinterpret_cast<T*>(malloc(sizeElements));
|
||||
B_hh = reinterpret_cast<T*>(malloc(sizeElements));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostMemory<T>::reset(size_t numElements, bool full) {
|
||||
// Initialize the host data:
|
||||
for (size_t i = 0; i < numElements; i++) {
|
||||
(A_hh)[i] = 1097.0 + i;
|
||||
(B_hh)[i] = 1492.0 + i; // Phi
|
||||
|
||||
if (full) {
|
||||
(_A_h)[i] = 3.146f + i; // Pi
|
||||
(_B_h)[i] = 1.618f + i; // Phi
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostMemory<T>::~HostMemory() {
|
||||
HipTest::freeArraysForHost(_A_h, _B_h, _C_h, _usePinnedHost);
|
||||
|
||||
if (_usePinnedHost) {
|
||||
HIP_CHECK(hipHostFree(A_hh));
|
||||
HIP_CHECK(hipHostFree(B_hh));
|
||||
|
||||
} else {
|
||||
free(A_hh);
|
||||
free(B_hh);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
void memcpytest2_get_host_memory(size_t *free, size_t *total) {
|
||||
MEMORYSTATUSEX status;
|
||||
status.dwLength = sizeof(status);
|
||||
GlobalMemoryStatusEx(&status);
|
||||
// Windows doesn't allow allocating more than half of system memory to the gpu
|
||||
// Since the runtime also needs space for its internal allocations,
|
||||
// we should not try to allocate more than 40% of reported system memory,
|
||||
// otherwise we can run into OOM issues.
|
||||
*free = static_cast<size_t>(0.4 * status.ullAvailPhys);
|
||||
*total = static_cast<size_t>(0.4 * status.ullTotalPhys);
|
||||
}
|
||||
#else
|
||||
struct sysinfo memInfo;
|
||||
void memcpytest2_get_host_memory(size_t *free, size_t *total) {
|
||||
sysinfo(&memInfo);
|
||||
uint64_t freePhysMem = memInfo.freeram;
|
||||
freePhysMem *= memInfo.mem_unit;
|
||||
*free = freePhysMem;
|
||||
uint64_t totalPhysMem = memInfo.totalram;
|
||||
totalPhysMem *= memInfo.mem_unit;
|
||||
*total = totalPhysMem;
|
||||
}
|
||||
#endif
|
||||
|
||||
//---
|
||||
// Test many different kinds of memory copies.
|
||||
// The subroutine allocates memory , copies to device, runs a vector
|
||||
// add kernel, copies back, and
|
||||
// checks the result.
|
||||
//
|
||||
// IN: numElements controls the number of elements used for allocations.
|
||||
// IN: usePinnedHost : If true, allocate host with hipHostMalloc and is pinned
|
||||
// else allocate host
|
||||
// memory with malloc. IN: useHostToHost : If true, add an extra
|
||||
// host-to-host copy. IN:
|
||||
// useDeviceToDevice : If true, add an extra deviceto-device copy after
|
||||
// result is produced. IN:
|
||||
// useMemkindDefault : If true, use memkinddefault
|
||||
// (runtime figures out direction). if false, use
|
||||
// explicit memcpy direction.
|
||||
//
|
||||
template <typename T>
|
||||
void memcpytest2(DeviceMemory<T>* dmem, HostMemory<T>* hmem,
|
||||
size_t numElements, bool useHostToHost,
|
||||
bool useDeviceToDevice, bool useMemkindDefault) {
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
|
||||
hmem->reset(numElements);
|
||||
|
||||
assert(numElements <= dmem->maxNumElements());
|
||||
assert(numElements <= hmem->maxNumElements());
|
||||
|
||||
|
||||
if (useHostToHost) {
|
||||
// Do some extra host-to-host copies here to mix things up:
|
||||
HIP_CHECK(hipMemcpy(hmem->A_hh, hmem->A_h(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
HIP_CHECK(hipMemcpy(hmem->B_hh, hmem->B_h(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
|
||||
|
||||
HIP_CHECK(hipMemcpy(dmem->A_d(), hmem->A_hh, sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(dmem->B_d(), hmem->B_hh, sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpy(dmem->A_d(), hmem->A_h(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1), 0, 0,
|
||||
static_cast<const T*>(dmem->A_d()), static_cast<const T*>(dmem->B_d()),
|
||||
dmem->C_d(), numElements);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
|
||||
if (useDeviceToDevice) {
|
||||
// Do an extra device-to-device copy here to mix things up:
|
||||
HIP_CHECK(hipMemcpy(dmem->C_dd(), dmem->C_d(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyDeviceToDevice));
|
||||
|
||||
// Destroy the original dmem->C_d():
|
||||
HIP_CHECK(hipMemset(dmem->C_d(), 0x5A, sizeElements));
|
||||
|
||||
HIP_CHECK(hipMemcpy(hmem->C_h(), dmem->C_dd(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyDeviceToHost));
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpy(hmem->C_h(), dmem->C_d(), sizeElements,
|
||||
useMemkindDefault ? hipMemcpyDefault : hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
HipTest::checkVectorADD(hmem->A_h(), hmem->B_h(), hmem->C_h(), numElements);
|
||||
|
||||
|
||||
printf(" %s success\n", __func__);
|
||||
}
|
||||
|
||||
// Try all the 16 possible combinations to memcpytest2 - usePinnedHost,
|
||||
// useHostToHost,
|
||||
// useDeviceToDevice, useMemkindDefault
|
||||
template <typename T>
|
||||
void memcpytest2_for_type(size_t numElements) {
|
||||
DeviceMemory<T> memD(numElements);
|
||||
HostMemory<T> memU(numElements, 0 /*usePinnedHost*/);
|
||||
HostMemory<T> memP(numElements, 1 /*usePinnedHost*/);
|
||||
|
||||
for (int usePinnedHost = 0; usePinnedHost <= 1; usePinnedHost++) {
|
||||
for (int useHostToHost = 0; useHostToHost <= 1; useHostToHost++) {
|
||||
for (int useDeviceToDevice = 0; useDeviceToDevice <= 1;
|
||||
useDeviceToDevice++) {
|
||||
for (int useMemkindDefault = 0; useMemkindDefault <= 1;
|
||||
useMemkindDefault++) {
|
||||
memcpytest2<T>(&memD, usePinnedHost ? &memP : &memU,
|
||||
numElements, useHostToHost,
|
||||
useDeviceToDevice, useMemkindDefault);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try many different sizes to memory copy.
|
||||
template <typename T>
|
||||
void memcpytest2_sizes(size_t maxElem = 0) {
|
||||
int deviceId;
|
||||
HIP_CHECK(hipGetDevice(&deviceId));
|
||||
|
||||
size_t free, total, freeCPU, totalCPU;
|
||||
HIP_CHECK(hipMemGetInfo(&free, &total));
|
||||
memcpytest2_get_host_memory(&freeCPU, &totalCPU);
|
||||
|
||||
if (maxElem == 0) {
|
||||
// Use lesser maxElem if not enough host memory available
|
||||
size_t maxElemGPU = free / sizeof(T) / 8;
|
||||
size_t maxElemCPU = freeCPU / sizeof(T) / 8;
|
||||
maxElem = maxElemGPU < maxElemCPU ? maxElemGPU : maxElemCPU;
|
||||
}
|
||||
|
||||
HIP_CHECK(hipDeviceReset());
|
||||
DeviceMemory<T> memD(maxElem);
|
||||
HostMemory<T> memU(maxElem, 0 /*usePinnedHost*/);
|
||||
HostMemory<T> memP(maxElem, 1 /*usePinnedHost*/);
|
||||
|
||||
for (size_t elem = 1; elem <= maxElem; elem *= 2) {
|
||||
memcpytest2<T>(&memD, &memU, elem, 1, 1, 0); // unpinned host
|
||||
memcpytest2<T>(&memD, &memP, elem, 1, 1, 0); // pinned host
|
||||
}
|
||||
}
|
||||
|
||||
// Try many different sizes to memory copy.
|
||||
template <typename T>
|
||||
void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) {
|
||||
int deviceId;
|
||||
HIP_CHECK(hipGetDevice(&deviceId));
|
||||
|
||||
size_t free, total;
|
||||
HIP_CHECK(hipMemGetInfo(&free, &total));
|
||||
|
||||
HIP_CHECK(hipDeviceReset());
|
||||
DeviceMemory<T> memD(maxElem);
|
||||
HostMemory<T> memU(maxElem, 0 /*usePinnedHost*/);
|
||||
HostMemory<T> memP(maxElem, 1 /*usePinnedHost*/);
|
||||
|
||||
size_t elem = maxElem / 2;
|
||||
|
||||
for (size_t offset = 0; offset < 512; offset++) {
|
||||
assert(elem + offset < maxElem);
|
||||
if (devOffsets) {
|
||||
memD.offset(offset);
|
||||
}
|
||||
if (hostOffsets) {
|
||||
memU.offset(offset);
|
||||
memP.offset(offset);
|
||||
}
|
||||
memcpytest2<T>(&memD, &memU, elem, 1, 1, 0); // unpinned host
|
||||
memcpytest2<T>(&memD, &memP, elem, 1, 1, 0); // pinned host
|
||||
}
|
||||
|
||||
for (size_t offset = 512; offset < elem; offset *= 2) {
|
||||
assert(elem + offset < maxElem);
|
||||
if (devOffsets) {
|
||||
memD.offset(offset);
|
||||
}
|
||||
if (hostOffsets) {
|
||||
memU.offset(offset);
|
||||
memP.offset(offset);
|
||||
}
|
||||
memcpytest2<T>(&memD, &memU, elem, 1, 1, 0); // unpinned host
|
||||
memcpytest2<T>(&memD, &memP, elem, 1, 1, 0); // pinned host
|
||||
}
|
||||
}
|
||||
|
||||
// Create multiple threads to stress multi-thread locking behavior in the
|
||||
// allocation/deallocation/tracking logic:
|
||||
template <typename T>
|
||||
void multiThread_1(bool serialize, bool usePinnedHost) {
|
||||
DeviceMemory<T> memD(NUM_ELM);
|
||||
HostMemory<T> mem1(NUM_ELM, usePinnedHost);
|
||||
HostMemory<T> mem2(NUM_ELM, usePinnedHost);
|
||||
|
||||
std::thread t1(memcpytest2<T>, &memD, &mem1, NUM_ELM, 0, 0, 0);
|
||||
if (serialize) {
|
||||
t1.join();
|
||||
}
|
||||
|
||||
|
||||
std::thread t2(memcpytest2<T>, &memD, &mem2, NUM_ELM, 0, 0, 0);
|
||||
if (serialize) {
|
||||
t2.join();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
This testcase verifies hipMemcpy API
|
||||
Initializes device variables
|
||||
Launches kernel and performs the sum of device variables
|
||||
copies the result to host variable and validates the result.
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_hipMemcpy_KernelLaunch", "", int, float,
|
||||
double) {
|
||||
size_t Nbytes = NUM_ELM * sizeof(TestType);
|
||||
|
||||
TestType *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
|
||||
TestType *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr};
|
||||
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, NUM_ELM, false);
|
||||
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1), 0, 0,
|
||||
static_cast<const TestType*>(A_d),
|
||||
static_cast<const TestType*>(B_d), C_d, NUM_ELM);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM);
|
||||
|
||||
HipTest::freeArrays<TestType>(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
}
|
||||
|
||||
/*
|
||||
This testcase verifies the following scenarios
|
||||
1. H2H,H2PinMem and PinnedMem2Host
|
||||
2. H2D-D2D-D2H in same GPU
|
||||
3. Pinned Host Memory to device variables in same GPU
|
||||
4. Device context change
|
||||
5. H2D-D2D-D2H peer GPU
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_hipMemcpy_H2H-H2D-D2H-H2PinMem", "", int,
|
||||
float, double) {
|
||||
TestType *A_d{nullptr}, *B_d{nullptr};
|
||||
TestType *A_h{nullptr}, *B_h{nullptr};
|
||||
TestType *A_Ph{nullptr}, *B_Ph{nullptr};
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HipTest::initArrays<TestType>(&A_d, &B_d, nullptr,
|
||||
&A_h, &B_h, nullptr,
|
||||
NUM_ELM*sizeof(TestType));
|
||||
HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
|
||||
&A_Ph, &B_Ph, nullptr,
|
||||
NUM_ELM*sizeof(TestType), true);
|
||||
|
||||
SECTION("H2H, H2PinMem and PinMem2H") {
|
||||
HIP_CHECK(hipMemcpy(B_h, A_h, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(A_Ph, B_h, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_Ph, A_Ph, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HipTest::checkTest(A_h, B_Ph, NUM_ELM);
|
||||
}
|
||||
|
||||
SECTION("H2D-D2D-D2H-SameGPU") {
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, NUM_ELM*sizeof(TestType), hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_d, A_d, NUM_ELM*sizeof(TestType), hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_h, B_d, NUM_ELM*sizeof(TestType), hipMemcpyDefault));
|
||||
HipTest::checkTest(A_h, B_h, NUM_ELM);
|
||||
}
|
||||
|
||||
SECTION("pH2D-D2D-D2pH-SameGPU") {
|
||||
HIP_CHECK(hipMemcpy(A_d, A_Ph, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_d, A_d, NUM_ELM*sizeof(TestType), hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_Ph, B_d, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HipTest::checkTest(A_Ph, B_Ph, NUM_ELM);
|
||||
}
|
||||
SECTION("H2D-D2D-D2H-DeviceContextChange") {
|
||||
int deviceCount = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&deviceCount));
|
||||
if (deviceCount < 2) {
|
||||
SUCCEED("deviceCount less then 2");
|
||||
} else {
|
||||
int canAccessPeer = 0;
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
|
||||
if (canAccessPeer) {
|
||||
HIP_CHECK(hipSetDevice(1));
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_d, A_d, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_h, B_d, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HipTest::checkTest(A_h, B_h, NUM_ELM);
|
||||
} else {
|
||||
SUCCEED("P2P capability is not present");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SECTION("H2D-D2D-D2H-PeerGPU") {
|
||||
int deviceCount = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&deviceCount));
|
||||
if (deviceCount < 2) {
|
||||
SUCCEED("deviceCount less then 2");
|
||||
} else {
|
||||
int canAccessPeer = 0;
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
|
||||
if (canAccessPeer) {
|
||||
HIP_CHECK(hipSetDevice(1));
|
||||
TestType *C_d{nullptr};
|
||||
HipTest::initArrays<TestType>(nullptr, nullptr, &C_d,
|
||||
nullptr, nullptr, nullptr,
|
||||
NUM_ELM*sizeof(TestType));
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(C_d, A_d, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HIP_CHECK(hipMemcpy(B_h, C_d, NUM_ELM*sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
HipTest::checkTest(A_h, B_h, NUM_ELM);
|
||||
HIP_CHECK(hipFree(C_d));
|
||||
} else {
|
||||
SUCCEED("P2P capability is not present");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
HipTest::freeArrays<TestType>(A_d, B_d, nullptr, A_h, B_h, nullptr, false);
|
||||
HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr, A_Ph,
|
||||
B_Ph, nullptr, true);
|
||||
}
|
||||
/*
|
||||
This testcase verifies the multi thread scenario
|
||||
*/
|
||||
TEST_CASE("Unit_hipMemcpy_MultiThreadWithSerialization") {
|
||||
HIP_CHECK(hipDeviceReset());
|
||||
|
||||
// Simplest cases: serialize the threads, and also used pinned memory:
|
||||
// This verifies that the sub-calls to memcpytest2 are correct.
|
||||
multiThread_1<float>(true, true);
|
||||
|
||||
// Serialize, but use unpinned memory to stress the unpinned memory xfer path.
|
||||
multiThread_1<float>(true, false);
|
||||
}
|
||||
|
||||
/*
|
||||
This testcase verifies hipMemcpy API with pinnedMemory and hostRegister
|
||||
along with kernel launches
|
||||
*/
|
||||
|
||||
TEMPLATE_TEST_CASE("Unit_hipMemcpy_PinnedRegMemWithKernelLaunch",
|
||||
"", int, float, double) {
|
||||
int numDevices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
||||
if (numDevices < 2) {
|
||||
SUCCEED("No of devices are less than 2");
|
||||
} else {
|
||||
// 1 refers to pinned Memory
|
||||
// 2 refers to register Memory
|
||||
int MallocPinType = GENERATE(0, 1);
|
||||
size_t Nbytes = NUM_ELM * sizeof(TestType);
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
|
||||
threadsPerBlock, NUM_ELM);
|
||||
|
||||
TestType *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
|
||||
TestType *X_d{nullptr}, *Y_d{nullptr}, *Z_d{nullptr};
|
||||
TestType *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr};
|
||||
if (MallocPinType) {
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, NUM_ELM, true);
|
||||
} else {
|
||||
A_h = reinterpret_cast<TestType*>(malloc(Nbytes));
|
||||
HIP_CHECK(hipHostRegister(A_h, Nbytes, hipHostRegisterDefault));
|
||||
B_h = reinterpret_cast<TestType*>(malloc(Nbytes));
|
||||
HIP_CHECK(hipHostRegister(B_h, Nbytes, hipHostRegisterDefault));
|
||||
C_h = reinterpret_cast<TestType*>(malloc(Nbytes));
|
||||
HIP_CHECK(hipHostRegister(C_h, Nbytes, hipHostRegisterDefault));
|
||||
HipTest::initArrays<TestType>(&A_d, &B_d, &C_d, nullptr, nullptr,
|
||||
nullptr, NUM_ELM, false);
|
||||
HipTest::setDefaultData<TestType>(NUM_ELM, A_h, B_h, C_h);
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, 0, static_cast<const TestType*>(A_d),
|
||||
static_cast<const TestType*>(B_d), C_d, NUM_ELM);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM);
|
||||
|
||||
unsigned int seed = time(0);
|
||||
HIP_CHECK(hipSetDevice(HipTest::RAND_R(&seed) % (numDevices-1)+1));
|
||||
|
||||
int device;
|
||||
HIP_CHECK(hipGetDevice(&device));
|
||||
std::cout <<"hipMemcpy is set to happen between device 0 and device "
|
||||
<<device << std::endl;
|
||||
HipTest::initArrays<TestType>(&X_d, &Y_d, &Z_d, nullptr,
|
||||
nullptr, nullptr, NUM_ELM, false);
|
||||
|
||||
for (int j = 0; j < NUM_ELM; j++) {
|
||||
A_h[j] = 0;
|
||||
B_h[j] = 0;
|
||||
C_h[j] = 0;
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(X_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(B_h, B_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(Y_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, 0, static_cast<const TestType*>(X_d),
|
||||
static_cast<const TestType*>(Y_d), Z_d, NUM_ELM);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM);
|
||||
|
||||
if (MallocPinType) {
|
||||
HipTest::freeArrays<TestType>(A_d, B_d, C_d, A_h, B_h, C_h, true);
|
||||
} else {
|
||||
HIP_CHECK(hipHostUnregister(A_h));
|
||||
free(A_h);
|
||||
HIP_CHECK(hipHostUnregister(B_h));
|
||||
free(B_h);
|
||||
HIP_CHECK(hipHostUnregister(C_h));
|
||||
free(C_h);
|
||||
HipTest::freeArrays<TestType>(A_d, B_d, C_d, nullptr,
|
||||
nullptr, nullptr, false);
|
||||
}
|
||||
HipTest::freeArrays<TestType>(X_d, Y_d, Z_d, nullptr,
|
||||
nullptr, nullptr, false);
|
||||
}
|
||||
}
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur