Files
Alysa Liu e79af13068 rocrtst: add VMM memory accounting test (#1666)
Add VMM test for memory accounting.
2025-12-03 11:27:51 -05:00

1663 خطوط
59 KiB
C++

/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2018, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <sys/mman.h>
#include <fcntl.h>
#include <algorithm>
#include <iostream>
#include <vector>
#include <memory>
#include <sys/socket.h>
#include "suites/functional/virtual_memory.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "common/hsatimer.h"
#include "common/concurrent_utils.h"
#include "gtest/gtest.h"
#include "hsa/hsa.h"
// Wrap printf to add first or second process indicator
#define PROCESS_LOG(format, ...) \
{ \
if (verbosity() >= VERBOSE_STANDARD || !parentProcess_) { \
fprintf(stdout, "line:%d P%u: " format, __LINE__, static_cast<int>(!parentProcess_), \
##__VA_ARGS__); \
} \
}
// Fork safe ASSERT_EQ.
#define MSG(y, msg, ...) msg
#define Y(y, ...) y
#define FORK_ASSERT_EQ(x, ...) \
if ((x) != (Y(__VA_ARGS__))) { \
if ((x) != (Y(__VA_ARGS__))) { \
std::cout << MSG(__VA_ARGS__, ""); \
if (parentProcess_) { \
shared_->parent_status = -1; \
} else { \
shared_->child_status = -1; \
} \
ASSERT_EQ(x, Y(__VA_ARGS__)); \
} \
}
static const char kSubTestSeparator[] = " **************************";
static void PrintMemorySubtestHeader(const char* header) {
std::cout << " *** Virtual Memory Functional Subtest: " << header << " ***" << std::endl;
}
static void PrintAgentNameAndType(hsa_agent_t agent) {
hsa_status_t err;
char ag_name[64];
hsa_device_type_t ag_type;
err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, ag_name);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &ag_type);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
std::cout << " Agent: " << ag_name << " (";
switch (ag_type) {
case HSA_DEVICE_TYPE_CPU:
std::cout << "CPU)";
break;
case HSA_DEVICE_TYPE_GPU:
std::cout << "GPU)";
break;
case HSA_DEVICE_TYPE_DSP:
std::cout << "DSP)";
break;
case HSA_DEVICE_TYPE_AIE:
std::cout << "AIE)";
break;
}
std::cout << std::endl;
return;
}
VirtMemoryTestBasic::VirtMemoryTestBasic(void) : TestBase() {
set_title("ROCr Virtual Memory Basic Tests");
set_description(" Tests virtual memory API functions");
}
VirtMemoryTestBasic::~VirtMemoryTestBasic(void) {}
void VirtMemoryTestBasic::TestCreateDestroy(hsa_agent_t agent, hsa_amd_memory_pool_t pool) {
hsa_agent_t* agents_accessible;
hsa_amd_pointer_info_t ptrInfo = {};
uint32_t num_agents_accessible = 0;
std::vector<hsa_agent_t> gpus;
rocrtst::pool_info_t pool_i;
hsa_device_type_t ag_type;
char ag_name[64];
void* addrRangeUnmapped;
hsa_status_t err;
void* addrRange;
ASSERT_SUCCESS(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &ag_type));
ASSERT_SUCCESS(rocrtst::AcquirePoolInfo(pool, &pool_i));
if (ag_type != HSA_DEVICE_TYPE_GPU || !pool_i.alloc_allowed) return;
size_t granule_size = pool_i.alloc_granule;
const size_t sizeof_addrRangeUnmapped = 10 * granule_size;
const size_t sizeof_addrRange = 20 * granule_size;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
ASSERT_SUCCESS(hsa_amd_vmem_address_reserve(&addrRange, sizeof_addrRange, 0, 0));
ASSERT_SUCCESS(hsa_amd_vmem_address_reserve(&addrRangeUnmapped, sizeof_addrRangeUnmapped, 0, 0));
/* Verify that pointer info for unmapped VA's return expected values */
ptrInfo.size = sizeof(ptrInfo);
ASSERT_SUCCESS(hsa_amd_pointer_info(addrRangeUnmapped, &ptrInfo, &malloc, &num_agents_accessible,
&agents_accessible));
ASSERT_EQ(ptrInfo.type, HSA_EXT_POINTER_TYPE_RESERVED_ADDR);
ASSERT_EQ(ptrInfo.hostBaseAddress, addrRangeUnmapped);
/* For unmapped VA, then size is equal to size of address reservation */
ASSERT_EQ(ptrInfo.sizeInBytes, sizeof_addrRangeUnmapped);
ASSERT_EQ(num_agents_accessible, 0);
/* Verify that pointer info for unmapped VA offset return expected values */
ptrInfo.size = sizeof(ptrInfo);
ASSERT_SUCCESS(hsa_amd_pointer_info(reinterpret_cast<uint8_t*>(addrRangeUnmapped) + 10, &ptrInfo, &malloc,
&num_agents_accessible, &agents_accessible));
ASSERT_EQ(ptrInfo.type, HSA_EXT_POINTER_TYPE_RESERVED_ADDR);
ASSERT_EQ(ptrInfo.hostBaseAddress,
addrRangeUnmapped); // hostBaseAddress is address of reservation instead of offset.
/* For unmapped VA, then size is equal to size of address reservation */
ASSERT_EQ(ptrInfo.sizeInBytes, sizeof_addrRangeUnmapped);
ASSERT_EQ(num_agents_accessible, 0);
hsa_amd_vmem_alloc_handle_t mem_handle;
const size_t sizeof_mem_handle = 10 * granule_size;
ASSERT_SUCCESS(
hsa_amd_vmem_handle_create(pool, sizeof_mem_handle, MEMORY_TYPE_NONE, 0, &mem_handle));
/* Test alloc properties returns correct memory type and pool handle */
hsa_amd_memory_pool_t poolRet;
hsa_amd_memory_type_t memTypeRet;
ASSERT_SUCCESS(hsa_amd_vmem_get_alloc_properties_from_handle(mem_handle, &poolRet, &memTypeRet));
ASSERT_EQ(poolRet.handle, pool.handle);
ASSERT_EQ(memTypeRet, MEMORY_TYPE_NONE);
hsa_amd_vmem_alloc_handle_t mem_handleTypePinned;
ASSERT_SUCCESS(hsa_amd_vmem_handle_create(pool, sizeof_mem_handle, MEMORY_TYPE_PINNED, 0,
&mem_handleTypePinned));
ASSERT_SUCCESS(
hsa_amd_vmem_get_alloc_properties_from_handle(mem_handleTypePinned, &poolRet, &memTypeRet));
ASSERT_EQ(poolRet.handle, pool.handle);
ASSERT_EQ(memTypeRet, MEMORY_TYPE_PINNED);
ASSERT_SUCCESS(hsa_amd_vmem_map(addrRange, sizeof_mem_handle, 0, mem_handle, 0));
/* Verity pointer info on mapped addresses */
ptrInfo.size = sizeof(ptrInfo);
ASSERT_SUCCESS(hsa_amd_pointer_info(addrRange, &ptrInfo, &malloc, &num_agents_accessible,
&agents_accessible));
ASSERT_EQ(ptrInfo.type, HSA_EXT_POINTER_TYPE_HSA_VMEM);
ASSERT_EQ(ptrInfo.sizeInBytes, sizeof_mem_handle); // size matches memory handle
ASSERT_EQ(num_agents_accessible, 0);
// Access to each GPU should be None
for (auto gpuIt = gpus.begin(); gpuIt != gpus.end(); ++gpuIt) {
hsa_access_permission_t perm = HSA_ACCESS_PERMISSION_RW;
ASSERT_SUCCESS(hsa_amd_vmem_get_access(addrRange, &perm, *gpuIt));
ASSERT_EQ(perm, HSA_ACCESS_PERMISSION_NONE);
}
/* Set RO Access to all GPUs */
{
int descIndex = 0;
hsa_amd_memory_access_desc_t desc[gpus.size()];
for (auto gpuIt = gpus.begin(); gpuIt != gpus.end(); ++gpuIt) {
desc[descIndex++] = {HSA_ACCESS_PERMISSION_RO, *gpuIt};
}
ASSERT_SUCCESS(hsa_amd_vmem_set_access(addrRange, 10 * granule_size, desc, gpus.size()));
}
/* Verity pointer info accessible agents on mapped addresses */
ptrInfo.size = sizeof(ptrInfo);
ASSERT_SUCCESS(hsa_amd_pointer_info(addrRange, &ptrInfo, &malloc, &num_agents_accessible,
&agents_accessible));
ASSERT_EQ(ptrInfo.type, HSA_EXT_POINTER_TYPE_HSA_VMEM);
ASSERT_EQ(ptrInfo.sizeInBytes, sizeof_mem_handle); // size matches memory handle
ASSERT_EQ(num_agents_accessible, gpus.size());
ASSERT_NE(agents_accessible, nullptr);
/* Verify agents_accessible is valid */
for (auto gpuIt = gpus.begin(); gpuIt != gpus.end(); ++gpuIt) {
bool found = false;
for (auto i = 0; i < gpus.size(); i++) {
if (agents_accessible[i].handle == (*gpuIt).handle) {
found = true;
break;
}
}
ASSERT_EQ(found, true);
}
free(agents_accessible);
for (auto gpuIt = gpus.begin(); gpuIt != gpus.end(); ++gpuIt) {
hsa_access_permission_t perm = HSA_ACCESS_PERMISSION_NONE;
ASSERT_SUCCESS(hsa_amd_vmem_get_access(addrRange, &perm, *gpuIt));
ASSERT_EQ(perm, HSA_ACCESS_PERMISSION_RO);
/* addrRangeUnmapped was never mapped, so this is an invalid mapping */
err = hsa_amd_vmem_get_access(addrRangeUnmapped, &perm, *gpuIt);
ASSERT_EQ(err, HSA_STATUS_ERROR_INVALID_ALLOCATION);
}
if (gpus.size() > 1) {
/* Call set_access with a smaller list of agents, this should leave access to
* the other GPUs unchanged */
hsa_amd_memory_access_desc_t desc = {HSA_ACCESS_PERMISSION_RW, gpus[1]};
ASSERT_SUCCESS(hsa_amd_vmem_set_access(addrRange, 10 * granule_size, &desc, 1));
size_t i = 0;
for (i = 0; i < gpus.size(); i++) {
hsa_access_permission_t perm = HSA_ACCESS_PERMISSION_NONE;
/* Only 2nd GPU should have RW access */
ASSERT_SUCCESS(hsa_amd_vmem_get_access(addrRange, &perm, gpus[i]));
if (i == 1) {
ASSERT_EQ(perm, HSA_ACCESS_PERMISSION_RW);
} else {
ASSERT_EQ(perm, HSA_ACCESS_PERMISSION_RO);
}
}
}
ASSERT_SUCCESS(hsa_amd_vmem_unmap(addrRange, sizeof_mem_handle));
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(mem_handle));
ASSERT_SUCCESS(hsa_amd_vmem_address_free(addrRange, sizeof_addrRange));
ASSERT_SUCCESS(hsa_amd_vmem_address_free(addrRangeUnmapped, sizeof_addrRangeUnmapped));
}
void VirtMemoryTestBasic::TestCreateDestroy(void) {
hsa_status_t err;
std::vector<std::shared_ptr<rocrtst::agent_pools_t>> agent_pools;
if (verbosity() > 0) {
PrintMemorySubtestHeader("CreateDestroy Test");
}
bool supp = false;
ASSERT_SUCCESS(hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*)&supp));
if (!supp) {
if (verbosity() > 0) {
std::cout << " Virtual Memory API not supported on this system - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
ASSERT_SUCCESS(rocrtst::GetAgentPools(&agent_pools));
auto pool_idx = 0;
for (auto a : agent_pools) {
for (auto p : a->pools) {
TestCreateDestroy(a->agent, p);
}
}
if (verbosity() > 0) {
std::cout << " Subtest finished" << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
}
void VirtMemoryTestBasic::TestRefCount(hsa_agent_t agent, hsa_amd_memory_pool_t pool) {
rocrtst::pool_info_t pool_i;
hsa_device_type_t ag_type;
char ag_name[64];
void* addrRangeUnmapped;
hsa_status_t err;
void* addrRange;
ASSERT_SUCCESS(hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, ag_name));
ASSERT_SUCCESS(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &ag_type));
ASSERT_SUCCESS(rocrtst::AcquirePoolInfo(pool, &pool_i));
if (ag_type != HSA_DEVICE_TYPE_GPU || !pool_i.alloc_allowed) return;
size_t granule_size = pool_i.alloc_granule;
ASSERT_SUCCESS(hsa_amd_vmem_address_reserve(&addrRange, 10 * granule_size, 0, 0));
hsa_amd_vmem_alloc_handle_t mem_handleA1;
ASSERT_SUCCESS(
hsa_amd_vmem_handle_create(pool, 10 * granule_size, MEMORY_TYPE_NONE, 0, &mem_handleA1));
ASSERT_SUCCESS(hsa_amd_vmem_map(addrRange, 10 * granule_size, 0, mem_handleA1, 0));
/* Allocate duplicate handle */
hsa_amd_vmem_alloc_handle_t mem_handleA1Dup;
ASSERT_SUCCESS(hsa_amd_vmem_retain_alloc_handle(&mem_handleA1Dup, addrRange));
/* Try to unmap with incorrect size */
err = hsa_amd_vmem_unmap(addrRange, 5 * granule_size);
ASSERT_NE(err, HSA_STATUS_SUCCESS);
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(mem_handleA1));
/* Try to release duplicate handle twice - second time should fail */
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(mem_handleA1Dup));
/* Already released so should fail*/
err = hsa_amd_vmem_handle_release(mem_handleA1Dup);
ASSERT_NE(err, HSA_STATUS_SUCCESS);
/* Unmap with correct size - un-mapping after releasing the handle is valid */
ASSERT_SUCCESS(hsa_amd_vmem_unmap(addrRange, 10 * granule_size));
/* Try to free with incorrect size */
err = hsa_amd_vmem_address_free(addrRange, 5 * granule_size);
ASSERT_NE(err, HSA_STATUS_SUCCESS);
/* Free with correct size */
ASSERT_SUCCESS(hsa_amd_vmem_address_free(addrRange, 10 * granule_size));
}
void VirtMemoryTestBasic::TestRefCount(void) {
hsa_status_t err;
std::vector<std::shared_ptr<rocrtst::agent_pools_t>> agent_pools;
if (verbosity() > 0) {
PrintMemorySubtestHeader("Reference Count Test");
}
bool supp = false;
ASSERT_SUCCESS(hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*)&supp));
if (!supp) {
if (verbosity() > 0) {
std::cout << " Virtual Memory API not supported on this system - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
ASSERT_SUCCESS(rocrtst::GetAgentPools(&agent_pools));
auto pool_idx = 0;
for (auto a : agent_pools) {
for (auto p : a->pools) TestRefCount(a->agent, p);
}
if (verbosity() > 0) {
std::cout << " Subtest finished" << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
}
void VirtMemoryTestBasic::TestPartialMapping(hsa_agent_t agent, hsa_amd_memory_pool_t pool) {
rocrtst::pool_info_t pool_i;
hsa_device_type_t ag_type;
char ag_name[64];
void* addrRangeUnmapped;
hsa_status_t err;
void* addrRange;
ASSERT_SUCCESS(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &ag_type));
ASSERT_SUCCESS(rocrtst::AcquirePoolInfo(pool, &pool_i));
if (ag_type != HSA_DEVICE_TYPE_GPU || !pool_i.alloc_allowed) return;
size_t granule_size = pool_i.alloc_granule;
/************************************************************************************************
Map partial chunks within the address range and confirm what overlaps fail.
Units below are in multiples of granule_size.
------------------------------------------------------------------
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
------------------------------------------------------------------
Step 1: A A A A A A
Step 2: B B B
Step 3: B
Step 4: B B B
***********************************************************************************************/
ASSERT_SUCCESS(hsa_amd_vmem_address_reserve(&addrRange, 15 * granule_size, 0, 0));
hsa_amd_vmem_alloc_handle_t mem_handleA;
// Step 1
ASSERT_SUCCESS(
hsa_amd_vmem_handle_create(pool, 8 * granule_size, MEMORY_TYPE_NONE, 0, &mem_handleA));
ASSERT_SUCCESS(hsa_amd_vmem_map((void*)((uint64_t)addrRange + (2 * granule_size)),
6 * granule_size, 0, mem_handleA, 0));
// Step 2
hsa_amd_vmem_alloc_handle_t mem_handleB;
ASSERT_SUCCESS(
hsa_amd_vmem_handle_create(pool, 8 * granule_size, MEMORY_TYPE_NONE, 0, &mem_handleB));
ASSERT_SUCCESS(hsa_amd_vmem_map((void*)((uint64_t)addrRange + (11 * granule_size)),
3 * granule_size, 0, mem_handleB, 0));
// Step 3
// Should fail as this is exceeding size of address range
err = hsa_amd_vmem_map((void*)((uint64_t)addrRange + (14 * granule_size)),
2 * granule_size, 0, mem_handleB, 0);
ASSERT_NE(err, HSA_STATUS_SUCCESS);
ASSERT_SUCCESS(hsa_amd_vmem_map((void*)((uint64_t)addrRange + (14 * granule_size)),
1 * granule_size, 0, mem_handleB, 0));
// Step 4
// Should fail as this is overlapping with AddressRange[11] already mapped
err = hsa_amd_vmem_map((void*)((uint64_t)addrRange + (8 * granule_size)),
4 * granule_size, 0, mem_handleB, 0);
ASSERT_NE(err, HSA_STATUS_SUCCESS);
ASSERT_SUCCESS(hsa_amd_vmem_map((void*)((uint64_t)addrRange + (8 * granule_size)),
3 * granule_size, 0, mem_handleB, 0));
// Done, unmap all
ASSERT_SUCCESS(
hsa_amd_vmem_unmap((void*)((uint64_t)addrRange + (2 * granule_size)), 6 * granule_size));
ASSERT_SUCCESS(
hsa_amd_vmem_unmap((void*)((uint64_t)addrRange + (8 * granule_size)), 3 * granule_size));
ASSERT_SUCCESS(
hsa_amd_vmem_unmap((void*)((uint64_t)addrRange + (11 * granule_size)), 3 * granule_size));
ASSERT_SUCCESS(
hsa_amd_vmem_unmap((void*)((uint64_t)addrRange + (14 * granule_size)), 1 * granule_size));
ASSERT_SUCCESS(hsa_amd_vmem_address_free(addrRange, 15 * granule_size));
}
void VirtMemoryTestBasic::TestPartialMapping(void) {
hsa_status_t err;
std::vector<std::shared_ptr<rocrtst::agent_pools_t>> agent_pools;
if (verbosity() > 0) {
PrintMemorySubtestHeader("Partial Mapping Test");
}
bool supp = false;
ASSERT_SUCCESS(hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*)&supp));
if (!supp) {
if (verbosity() > 0) {
std::cout << " Virtual Memory API not supported on this system - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
ASSERT_SUCCESS(rocrtst::GetAgentPools(&agent_pools));
auto pool_idx = 0;
for (auto a : agent_pools) {
for (auto p : a->pools) TestPartialMapping(a->agent, p);
}
if (verbosity() > 0) {
std::cout << " Subtest finished" << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
}
typedef struct __attribute__((aligned(16))) args_t {
int* a;
int* b;
int* c;
} args;
args* kernArgsVirt = NULL;
// Test to check CPU can read & write to GPU memory
void VirtMemoryTestBasic::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent,
hsa_amd_memory_pool_t device_pool) {
hsa_status_t err;
rocrtst::pool_info_t pool_i;
ASSERT_SUCCESS(rocrtst::AcquirePoolInfo(device_pool, &pool_i));
if (!(pool_i.segment == HSA_AMD_SEGMENT_GLOBAL &&
pool_i.global_flag == HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED))
return;
hsa_amd_memory_pool_access_t access;
hsa_amd_agent_memory_pool_get_info(cpuAgent, device_pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&access);
if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
if (verbosity() > 0) {
std::cout << " Test not applicable as system is not large bar - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 || pool_i.alloc_alignment == 0) {
if (verbosity() > 0) {
std::cout << " Test not applicable. Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
auto max_alloc_size = pool_i.alloc_granule * 100;
unsigned int max_element = max_alloc_size / sizeof(unsigned int);
unsigned int* dev_data = NULL;
unsigned int* host_data = NULL;
host_data = (unsigned int*)malloc(max_alloc_size);
ASSERT_NE(host_data, nullptr);
for (unsigned int i = 0; i < max_element; ++i) {
host_data[i] = i;
}
hsa_amd_memory_access_desc_t permsAccess[] = {{HSA_ACCESS_PERMISSION_RW, cpuAgent},
{HSA_ACCESS_PERMISSION_RW, gpuAgent}};
hsa_amd_vmem_alloc_handle_t mem_handle_host, mem_handle_dev;
ASSERT_SUCCESS(
hsa_amd_vmem_address_reserve(reinterpret_cast<void**>(&dev_data), max_alloc_size, 0, 0));
ASSERT_NE(dev_data, nullptr);
ASSERT_SUCCESS(hsa_amd_vmem_handle_create(device_pool, max_alloc_size, MEMORY_TYPE_NONE, 0,
&mem_handle_dev));
ASSERT_SUCCESS(
hsa_amd_vmem_map(reinterpret_cast<void*>(dev_data), max_alloc_size, 0, mem_handle_dev, 0));
// Give device access to host data
ASSERT_SUCCESS(hsa_amd_vmem_set_access(dev_data, max_alloc_size, permsAccess, 2));
// Verify CPU can read & write to GPU memory
std::cout << " Verify CPU can read & write to GPU memory" << std::endl;
for (unsigned int i = 0; i < max_element; ++i) {
dev_data[i] = i; // Write to gpu memory directly
}
for (unsigned int i = 0; i < max_element; ++i) {
if (host_data[i] != dev_data[i]) { // Reading GPU memory
fprintf(stdout,
" Values not mathing !! host_data[%d]:%d ,"
"dev_data[%d]\n",
host_data[i], i, dev_data[i]);
}
}
std::cout << " CPU have read & write to GPU memory successfully" << std::endl;
ASSERT_SUCCESS(hsa_amd_vmem_unmap(dev_data, max_alloc_size));
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(mem_handle_dev));
ASSERT_SUCCESS(hsa_amd_vmem_address_free(reinterpret_cast<void*>(dev_data), max_alloc_size));
free(host_data);
}
void VirtMemoryTestBasic::CPUAccessToGPUMemoryTest(void) {
hsa_status_t err;
// find all cpu agents
std::vector<hsa_agent_t> cpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus));
// find all gpu agents
std::vector<hsa_agent_t> gpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
if (verbosity() > 0) PrintMemorySubtestHeader("CPU To GPU Access test");
bool supp = false;
ASSERT_SUCCESS(hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*)&supp));
if (!supp) {
if (verbosity() > 0) {
std::cout << " Virtual Memory API not supported on this system - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
for (unsigned int i = 0; i < gpus.size(); ++i) {
hsa_amd_memory_pool_t gpu_pool;
memset(&gpu_pool, 0, sizeof(gpu_pool));
ASSERT_SUCCESS(
hsa_amd_agent_iterate_memory_pools(gpus[i], rocrtst::GetGlobalMemoryPool, &gpu_pool));
if (gpu_pool.handle == 0) {
std::cout << " No global mempool in gpu agent" << std::endl;
return;
}
CPUAccessToGPUMemoryTest(cpus[0], gpus[i], gpu_pool);
}
if (verbosity() > 0) {
std::cout << " Subtest finished" << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
}
// Test to check GPU can read & write to CPU memory
void VirtMemoryTestBasic::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent,
hsa_amd_memory_pool_t device_pool) {
rocrtst::pool_info_t pool_i;
hsa_device_type_t ag_type;
char ag_name[64];
hsa_status_t err;
ASSERT_SUCCESS(rocrtst::AcquirePoolInfo(device_pool, &pool_i));
if (!pool_i.alloc_allowed || pool_i.segment != HSA_AMD_SEGMENT_GLOBAL ||
pool_i.global_flag != HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)
return;
hsa_amd_memory_pool_access_t access;
ASSERT_SUCCESS(hsa_amd_agent_memory_pool_get_info(
cpuAgent, device_pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access));
if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
if (verbosity() > 0) {
std::cout << " Test not applicable as system is not large bar - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
return;
}
}
hsa_queue_t* queue = NULL; // command queue
hsa_signal_t signal = {0}; // completion signal
size_t& granule_size = pool_i.alloc_granule;
size_t alloc_size = granule_size * 100;
static const int kMemoryAllocSize = 1024;
unsigned int max_element = alloc_size / sizeof(unsigned int);
// get queue size
uint32_t queue_size = 0;
ASSERT_SUCCESS(hsa_agent_get_info(gpuAgent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size));
// create queue
ASSERT_SUCCESS(
hsa_queue_create(gpuAgent, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue));
// Find a memory pool that supports kernel arguments.
hsa_amd_memory_pool_t kernarg_pool;
ASSERT_SUCCESS(
hsa_amd_agent_iterate_memory_pools(cpuAgent, rocrtst::GetKernArgMemoryPool, &kernarg_pool));
// Get System Memory Pool on the cpuAgent to allocate host side buffers
hsa_amd_memory_pool_t global_pool;
ASSERT_SUCCESS(
hsa_amd_agent_iterate_memory_pools(cpuAgent, rocrtst::GetGlobalMemoryPool, &global_pool));
struct host_data_t {
int data[kMemoryAllocSize * 4];
int dup_data[kMemoryAllocSize * 4];
int result[kMemoryAllocSize * 4];
};
struct dev_data_t {
int result[kMemoryAllocSize * 4];
};
struct host_data_t* host_data;
struct dev_data_t* dev_data;
ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(global_pool, sizeof(*host_data), 0,
reinterpret_cast<void**>(&host_data)));
// Allow gpuAgent access to all allocated system memory.
ASSERT_SUCCESS(hsa_amd_agents_allow_access(1, &gpuAgent, NULL, host_data));
ASSERT_SUCCESS(hsa_amd_vmem_address_reserve((void**)&dev_data, sizeof(*dev_data), 0, 0));
hsa_amd_vmem_alloc_handle_t mem_handle;
ASSERT_SUCCESS(
hsa_amd_vmem_handle_create(device_pool, sizeof(*dev_data), MEMORY_TYPE_NONE, 0, &mem_handle));
ASSERT_SUCCESS(hsa_amd_vmem_map(dev_data, sizeof(*dev_data), 0, mem_handle, 0));
// Give host and device access to device data
hsa_amd_memory_access_desc_t permsAccess[] = {{HSA_ACCESS_PERMISSION_RW, gpuAgent},
{HSA_ACCESS_PERMISSION_RW, cpuAgent}};
ASSERT_SUCCESS(hsa_amd_vmem_set_access(dev_data, sizeof(*dev_data), permsAccess, 2));
// Allocate the kernel argument buffer from the kernarg_pool.
ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(kernarg_pool, sizeof(args_t), 0,
reinterpret_cast<void**>(&kernArgsVirt)));
// initialize the host buffers
for (int i = 0; i < kMemoryAllocSize; ++i) {
unsigned int seed = time(NULL);
host_data->data[i] = 1 + rand_r(&seed) % 1;
host_data->dup_data[i] = host_data->data[i];
}
memset(host_data->result, 0, sizeof(host_data->result));
memset(dev_data->result, 0, sizeof(dev_data->result));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(1, &gpuAgent, NULL, kernArgsVirt));
kernArgsVirt->a = host_data->data;
kernArgsVirt->b = host_data->result; // system memory passed to gpu for write
kernArgsVirt->c = dev_data->result; // gpu memory to verify that gpu read system data
// Create the executable, get symbol by name and load the code object
set_kernel_file_name("gpuReadWrite_kernels.hsaco");
set_kernel_name("gpuReadWrite");
ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, &gpuAgent));
// Fill the dispatch packet with
// workgroup_size, grid_size, kernelArgs and completion signal
// Put it on the queue and launch the kernel by ringing the doorbell
// create completion signal
ASSERT_SUCCESS(hsa_signal_create(1, 0, NULL, &signal));
// create aql packet
hsa_kernel_dispatch_packet_t aql;
memset(&aql, 0, sizeof(aql));
// initialize aql packet
aql.workgroup_size_x = 256;
aql.workgroup_size_y = 1;
aql.workgroup_size_z = 1;
aql.grid_size_x = kMemoryAllocSize;
aql.grid_size_y = 1;
aql.grid_size_z = 1;
aql.private_segment_size = 0;
aql.group_segment_size = 0;
aql.kernel_object = kernel_object(); // kernel_code;
aql.kernarg_address = kernArgsVirt;
aql.completion_signal = signal;
// const uint32_t queue_size = queue->size;
const uint32_t queue_mask = queue->size - 1;
// write to command queue
uint64_t index = hsa_queue_load_write_index_relaxed(queue);
hsa_queue_store_write_index_relaxed(queue, index + 1);
rocrtst::WriteAQLToQueueLoc(queue, index, &aql);
hsa_kernel_dispatch_packet_t* q_base_addr =
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address);
rocrtst::AtomicSetPacketHeader(
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE),
(1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(&q_base_addr[index & queue_mask]));
// ringdoor bell
hsa_signal_store_relaxed(queue->doorbell_signal, index);
// wait for the signal and reset it for future use
while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
HSA_WAIT_STATE_ACTIVE)) {
}
hsa_signal_store_relaxed(signal, 1);
// compare device and host side results
if (verbosity() > 0) {
std::cout << " Check GPU has read the system memory" << std::endl;
}
for (int i = 0; i < kMemoryAllocSize; ++i) {
// printf("Verifying data at index[%d]\n", i);
ASSERT_EQ(dev_data->result[i], host_data->dup_data[i]);
}
if (verbosity() > 0) {
std::cout << " GPU has read the system memory successfully" << std::endl;
std::cout << " Check GPU has written to system memory" << std::endl;
}
for (int i = 0; i < kMemoryAllocSize; ++i) {
ASSERT_EQ(host_data->result[i], i);
}
if (verbosity() > 0) {
std::cout << " GPU has written to system memory successfully" << std::endl;
}
ASSERT_SUCCESS(hsa_amd_vmem_unmap(dev_data, sizeof(*dev_data)));
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(mem_handle));
if (dev_data) {
ASSERT_SUCCESS(hsa_amd_vmem_address_free(dev_data, sizeof(*dev_data)));
}
if (host_data) hsa_memory_free(host_data);
if (kernArgsVirt) {
hsa_memory_free(kernArgsVirt);
}
if (signal.handle) {
hsa_signal_destroy(signal);
}
if (queue) {
hsa_queue_destroy(queue);
}
}
void VirtMemoryTestBasic::GPUAccessToCPUMemoryTest(void) {
hsa_status_t err;
// find all cpu agents
std::vector<hsa_agent_t> cpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus));
// find all gpu agents
std::vector<hsa_agent_t> gpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
if (verbosity() > 0) PrintMemorySubtestHeader("CPU To GPU Access test");
bool supp = false;
ASSERT_SUCCESS(hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*)&supp));
if (!supp) {
if (verbosity() > 0) {
std::cout << " Virtual Memory API not supported on this system - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
for (unsigned int i = 0; i < gpus.size(); ++i) {
hsa_amd_memory_pool_t gpu_pool;
memset(&gpu_pool, 0, sizeof(gpu_pool));
ASSERT_SUCCESS(
hsa_amd_agent_iterate_memory_pools(gpus[i], rocrtst::GetGlobalMemoryPool, &gpu_pool));
if (gpu_pool.handle == 0) {
std::cout << "no global mempool in GPU agent" << std::endl;
return;
}
GPUAccessToCPUMemoryTest(cpus[0], gpus[i], gpu_pool);
}
if (verbosity() > 0) {
std::cout << " Subtest finished" << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
}
// Test to check GPU can read & write to GPU memory
void VirtMemoryTestBasic::GPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent,
hsa_amd_memory_pool_t device_pool) {
rocrtst::pool_info_t pool_i;
hsa_device_type_t ag_type;
char ag_name[64];
hsa_status_t err;
ASSERT_SUCCESS(rocrtst::AcquirePoolInfo(device_pool, &pool_i));
if (!pool_i.alloc_allowed || pool_i.segment != HSA_AMD_SEGMENT_GLOBAL ||
pool_i.global_flag != HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)
return;
hsa_amd_memory_pool_access_t access;
ASSERT_SUCCESS(hsa_amd_agent_memory_pool_get_info(
cpuAgent, device_pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access));
if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
if (verbosity() > 0) {
std::cout << " Test not applicable as system is not large bar - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
return;
}
}
hsa_queue_t* queue = NULL; // command queue
hsa_signal_t signal = {0}; // completion signal
size_t& granule_size = pool_i.alloc_granule;
size_t alloc_size = granule_size * 100;
static const int kMemoryAllocSize = 4096;
unsigned int max_element = alloc_size / sizeof(unsigned int);
// get queue size
uint32_t queue_size = 0;
ASSERT_SUCCESS(hsa_agent_get_info(gpuAgent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size));
// create queue
ASSERT_SUCCESS(
hsa_queue_create(gpuAgent, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue));
// Find a memory pool that supports kernel arguments.
hsa_amd_memory_pool_t kernarg_pool;
ASSERT_SUCCESS(
hsa_amd_agent_iterate_memory_pools(cpuAgent, rocrtst::GetKernArgMemoryPool, &kernarg_pool));
// Get System Memory Pool on the cpuAgent to allocate host side buffers
hsa_amd_memory_pool_t global_pool;
ASSERT_SUCCESS(
hsa_amd_agent_iterate_memory_pools(cpuAgent, rocrtst::GetGlobalMemoryPool, &global_pool));
struct host_data_t {
int data[kMemoryAllocSize * 4];
int gpuWrite[kMemoryAllocSize * 4];
int result[kMemoryAllocSize * 4];
};
struct dev_data_t {
int data[kMemoryAllocSize * 4];
int result[kMemoryAllocSize * 4];
};
struct host_data_t* host_data;
struct dev_data_t* dev_data;
ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(global_pool, sizeof(*host_data), 0,
reinterpret_cast<void**>(&host_data)));
// Allow gpuAgent access to all allocated system memory.
ASSERT_SUCCESS(hsa_amd_agents_allow_access(1, &gpuAgent, NULL, host_data));
ASSERT_SUCCESS(hsa_amd_vmem_address_reserve((void**)&dev_data, sizeof(*dev_data), 0, 0));
hsa_amd_vmem_alloc_handle_t mem_handle;
ASSERT_SUCCESS(hsa_amd_vmem_handle_create(device_pool, sizeof(*dev_data), MEMORY_TYPE_PINNED, 0,
&mem_handle));
ASSERT_SUCCESS(hsa_amd_vmem_map(dev_data, sizeof(*dev_data), 0, mem_handle, 0));
// Give host and device access to device data
hsa_amd_memory_access_desc_t permsAccess[] = {{HSA_ACCESS_PERMISSION_RW, gpuAgent}};
ASSERT_SUCCESS(
hsa_amd_vmem_set_access(dev_data, sizeof(*dev_data), permsAccess, ARRAY_SIZE(permsAccess)));
// Allocate the kernel argument buffer from the kernarg_pool.
ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(kernarg_pool, sizeof(args_t), 0,
reinterpret_cast<void**>(&kernArgsVirt)));
// create completion signal
ASSERT_SUCCESS(hsa_signal_create(1, 0, NULL, &signal));
// initialize the host buffers
for (int i = 0; i < kMemoryAllocSize; ++i) {
unsigned int seed = time(NULL);
host_data->data[i] = 1 + rand_r(&seed) % 1;
}
ASSERT_SUCCESS(hsa_amd_memory_async_copy(dev_data->data, gpuAgent, host_data->data, cpuAgent,
kMemoryAllocSize * 4, 0, NULL, signal));
while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
HSA_WAIT_STATE_ACTIVE)) {
}
hsa_signal_store_relaxed(signal, 1);
memset(host_data->result, 0, sizeof(host_data->result));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(1, &gpuAgent, NULL, kernArgsVirt));
kernArgsVirt->a = dev_data->data;
kernArgsVirt->b = host_data->gpuWrite; // system memory passed to gpu for write
kernArgsVirt->c = dev_data->result; // gpu memory to verify that gpu read system data
// Create the executable, get symbol by name and load the code object
set_kernel_file_name("gpuReadWrite_kernels.hsaco");
set_kernel_name("gpuReadWrite");
ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, &gpuAgent));
// Fill the dispatch packet with
// workgroup_size, grid_size, kernelArgs and completion signal
// Put it on the queue and launch the kernel by ringing the doorbell
// create aql packet
hsa_kernel_dispatch_packet_t aql;
memset(&aql, 0, sizeof(aql));
// initialize aql packet
aql.workgroup_size_x = 256;
aql.workgroup_size_y = 1;
aql.workgroup_size_z = 1;
aql.grid_size_x = kMemoryAllocSize;
aql.grid_size_y = 1;
aql.grid_size_z = 1;
aql.private_segment_size = 0;
aql.group_segment_size = 0;
aql.kernel_object = kernel_object(); // kernel_code;
aql.kernarg_address = kernArgsVirt;
aql.completion_signal = signal;
const uint32_t queue_mask = queue->size - 1;
// write to command queue
uint64_t index = hsa_queue_load_write_index_relaxed(queue);
hsa_queue_store_write_index_relaxed(queue, index + 1);
rocrtst::WriteAQLToQueueLoc(queue, index, &aql);
hsa_kernel_dispatch_packet_t* q_base_addr =
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address);
rocrtst::AtomicSetPacketHeader(
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE),
(1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(&q_base_addr[index & queue_mask]));
// ringdoor bell
hsa_signal_store_relaxed(queue->doorbell_signal, index);
// wait for the signal and reset it for future use
while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
HSA_WAIT_STATE_ACTIVE)) {
}
hsa_signal_store_relaxed(signal, 1);
ASSERT_SUCCESS(hsa_amd_memory_async_copy(host_data->result, cpuAgent, dev_data->result, gpuAgent,
kMemoryAllocSize * 4, 0, NULL, signal));
while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
HSA_WAIT_STATE_ACTIVE)) {
}
// compare device and host side results
if (verbosity() > 0) {
std::cout << " Check GPU has read the system memory" << std::endl;
}
for (int i = 0; i < kMemoryAllocSize; ++i) {
// printf("Verifying data at index[%d]\n", i);
ASSERT_EQ(host_data->result[i], host_data->data[i]);
}
if (verbosity() > 0) {
std::cout << " GPU has read the system memory successfully" << std::endl;
std::cout << " Check GPU has written to system memory" << std::endl;
}
for (int i = 0; i < kMemoryAllocSize; ++i) {
ASSERT_EQ(host_data->gpuWrite[i], i);
}
if (verbosity() > 0) {
std::cout << " GPU has written to system memory successfully" << std::endl;
}
ASSERT_SUCCESS(hsa_amd_vmem_unmap(dev_data, sizeof(*dev_data)));
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(mem_handle));
if (dev_data) {
ASSERT_SUCCESS(hsa_amd_vmem_address_free(dev_data, sizeof(*dev_data)));
}
if (host_data) hsa_memory_free(host_data);
if (kernArgsVirt) {
hsa_memory_free(kernArgsVirt);
}
if (signal.handle) {
hsa_signal_destroy(signal);
}
if (queue) {
hsa_queue_destroy(queue);
}
}
void VirtMemoryTestBasic::GPUAccessToGPUMemoryTest(void) {
hsa_status_t err;
// find all cpu agents
std::vector<hsa_agent_t> cpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus));
// find all gpu agents
std::vector<hsa_agent_t> gpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
if (verbosity() > 0) PrintMemorySubtestHeader("GPU To GPU Access test");
bool supp = false;
ASSERT_SUCCESS(hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*)&supp));
if (!supp) {
if (verbosity() > 0) {
std::cout << " Virtual Memory API not supported on this system - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
for (unsigned int i = 0; i < gpus.size(); ++i) {
hsa_amd_memory_pool_t gpu_pool;
memset(&gpu_pool, 0, sizeof(gpu_pool));
ASSERT_SUCCESS(
hsa_amd_agent_iterate_memory_pools(gpus[i], rocrtst::GetGlobalMemoryPool, &gpu_pool));
if (gpu_pool.handle == 0) {
std::cout << "no global mempool in GPU agent" << std::endl;
return;
}
GPUAccessToGPUMemoryTest(cpus[0], gpus[i], gpu_pool);
}
if (verbosity() > 0) {
std::cout << " Subtest finished" << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
}
void VirtMemoryTestBasic::MemoryAccountingTest(hsa_agent_t agent, hsa_amd_memory_pool_t pool) {
if (verbosity() > 0) {
PrintAgentNameAndType(agent);
}
hsa_status_t err;
hsa_amd_memory_pool_access_t access;
ASSERT_SUCCESS(hsa_amd_agent_memory_pool_get_info(agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access));
if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) return;
rocrtst::pool_info_t pool_info;
err = rocrtst::AcquirePoolInfo(pool, &pool_info);
if (err != HSA_STATUS_SUCCESS || !pool_info.alloc_allowed) return;
if (pool_info.segment != HSA_AMD_SEGMENT_GLOBAL) return;
hsa_device_type_t device_type;
err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
if (err != HSA_STATUS_SUCCESS || device_type != HSA_DEVICE_TYPE_GPU) return;
bool vmem_supported = false;
err = hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, &vmem_supported);
if (err != HSA_STATUS_SUCCESS || !vmem_supported) return;
rocrtst::pool_info_t pool_i;
ASSERT_SUCCESS(rocrtst::AcquirePoolInfo(pool, &pool_i));
size_t granule_size = pool_i.alloc_rec_granule;
size_t allocation_size = 10 * granule_size;
size_t amount_begin = 0, amount_current = 0;
void* reserved_addr = nullptr;
hsa_amd_vmem_alloc_handle_t mem_handle;
ASSERT_SUCCESS(hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MEMORY_AVAIL, &amount_begin));
ASSERT_SUCCESS(hsa_amd_vmem_address_reserve(&reserved_addr, allocation_size, 0, 0));
hsa_amd_pointer_info_t ptr_info = {};
ptr_info.size = sizeof(ptr_info);
ASSERT_SUCCESS(hsa_amd_pointer_info(reserved_addr, &ptr_info, nullptr, nullptr, nullptr));
ASSERT_SUCCESS(hsa_amd_vmem_handle_create(pool, allocation_size, MEMORY_TYPE_NONE, 0, &mem_handle));
ASSERT_SUCCESS(hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MEMORY_AVAIL, &amount_current));
ASSERT_NEAR(amount_begin - amount_current, allocation_size, 4096);
ASSERT_SUCCESS(hsa_amd_vmem_map(reserved_addr, allocation_size, 0, mem_handle, 0));
hsa_amd_memory_access_desc_t access_desc = {HSA_ACCESS_PERMISSION_RW, agent};
ASSERT_SUCCESS(hsa_amd_vmem_set_access(reserved_addr, allocation_size, &access_desc, 1));
ASSERT_SUCCESS(hsa_amd_vmem_unmap(reserved_addr, allocation_size));
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(mem_handle));
ASSERT_SUCCESS(hsa_amd_vmem_address_free(reserved_addr, allocation_size));
ASSERT_SUCCESS(hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MEMORY_AVAIL, &amount_current));
ASSERT_EQ(amount_begin, amount_current);
}
void VirtMemoryTestBasic::MemoryAccountingTest(void) {
hsa_status_t err;
std::vector<std::shared_ptr<rocrtst::agent_pools_t>> agent_pools;
if (verbosity() > 0) {
PrintMemorySubtestHeader("Memory Accounting Test");
}
err = rocrtst::GetAgentPools(&agent_pools);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
auto pool_idx = 0;
for (auto a : agent_pools) {
for (auto p : a->pools) {
if (verbosity() > 0) {
std::cout << " Pool " << pool_idx++ << ":" << std::endl;
}
MemoryAccountingTest(a->agent, p);
}
}
if (verbosity() > 0) {
std::cout << " Subtest finished" << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
}
void VirtMemoryTestBasic::NonContiguousChunks(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent,
hsa_amd_memory_pool_t device_pool) {
rocrtst::pool_info_t pool_i;
hsa_device_type_t ag_type;
char ag_name[64];
hsa_status_t err;
ASSERT_SUCCESS(rocrtst::AcquirePoolInfo(device_pool, &pool_i));
if (!pool_i.alloc_allowed || pool_i.segment != HSA_AMD_SEGMENT_GLOBAL ||
pool_i.global_flag != HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)
return;
hsa_amd_memory_pool_access_t access;
ASSERT_SUCCESS(hsa_amd_agent_memory_pool_get_info(
cpuAgent, device_pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access));
if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
if (verbosity() > 0) {
std::cout << " Test not applicable as system is not large bar - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
return;
}
}
size_t& granule_size = pool_i.alloc_granule;
size_t alloc_size = granule_size * 512;
const unsigned NUM_BUFFERS = 6;
void* addr;
void* addr_chunks[NUM_BUFFERS];
hsa_amd_vmem_alloc_handle_t mem_handles[NUM_BUFFERS];
static const int kMemoryAllocSize = 4096;
unsigned int max_element = alloc_size / sizeof(unsigned int);
ASSERT_SUCCESS(hsa_amd_vmem_address_reserve((void**)&addr, NUM_BUFFERS * alloc_size, 0, 0));
for (unsigned i = 0; i < NUM_BUFFERS; i++) {
// Allocate 6 separate memory memory handles
ASSERT_SUCCESS(hsa_amd_vmem_handle_create(device_pool, alloc_size, MEMORY_TYPE_PINNED, 0,
&(mem_handles[i])));
addr_chunks[i] = ((uint8_t*)addr) + (i * alloc_size);
}
for (unsigned i = 0; i < NUM_BUFFERS; i++) {
// Map each chunk in reverse order
ASSERT_SUCCESS(hsa_amd_vmem_map(addr_chunks[i], alloc_size, 0, mem_handles[NUM_BUFFERS - i - 1],
alloc_size));
}
hsa_amd_memory_access_desc_t permsAccess[] = {{HSA_ACCESS_PERMISSION_RW, gpuAgent}};
ASSERT_SUCCESS(hsa_amd_vmem_set_access(addr, NUM_BUFFERS * alloc_size, permsAccess,
ARRAY_SIZE(permsAccess)));
for (unsigned i = 0; i < NUM_BUFFERS; i++) {
// TODO Map them in opposite order
ASSERT_SUCCESS(hsa_amd_vmem_unmap(addr_chunks[i], alloc_size));
}
ASSERT_SUCCESS(hsa_amd_vmem_address_free(addr, NUM_BUFFERS * alloc_size));
}
void VirtMemoryTestBasic::NonContiguousChunks(void) {
hsa_status_t err;
if (verbosity() > 0) PrintMemorySubtestHeader("GPU To GPU Access test");
bool supp = false;
ASSERT_SUCCESS(hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*)&supp));
if (!supp) {
if (verbosity() > 0) {
std::cout << " Virtual Memory API not supported on this system - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
// find all cpu agents
std::vector<hsa_agent_t> cpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus));
// find all gpu agents
std::vector<hsa_agent_t> gpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
for (unsigned int i = 0; i < gpus.size(); ++i) {
hsa_amd_memory_pool_t gpu_pool;
memset(&gpu_pool, 0, sizeof(gpu_pool));
ASSERT_SUCCESS(
hsa_amd_agent_iterate_memory_pools(gpus[i], rocrtst::GetGlobalMemoryPool, &gpu_pool));
if (gpu_pool.handle == 0) {
std::cout << "no global mempool in GPU agent" << std::endl;
return;
}
NonContiguousChunks(cpus[0], gpus[i], gpu_pool);
}
if (verbosity() > 0) {
std::cout << " Subtest finished" << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
}
void VirtMemoryTestBasic::SetUp(void) {
hsa_status_t err;
TestBase::SetUp();
ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
return;
}
void VirtMemoryTestBasic::Run(void) {
// Compare required profile for this test case with what we're actually
// running on
if (!rocrtst::CheckProfile(this)) {
return;
}
TestBase::Run();
}
void VirtMemoryTestBasic::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); }
void VirtMemoryTestBasic::DisplayResults(void) const {
// Compare required profile for this test case with what we're actually
// running on
if (!rocrtst::CheckProfile(this)) {
return;
}
return;
}
void VirtMemoryTestBasic::Close() {
// This will close handles opened within rocrtst utility calls and call
// hsa_shut_down(), so it should be done after other hsa cleanup
TestBase::Close();
}
VirtMemoryTestInterProcess::VirtMemoryTestInterProcess(void) : TestBase() {
set_title("ROCr Virtual Memory Test - InterProcess ");
set_description(" Tests Virtual Memory API with memory shared between two processes");
}
VirtMemoryTestInterProcess::~VirtMemoryTestInterProcess(void) {}
// See if the other process wrote an error value to the token; if not, write
// the newVal to the token.
static int CheckAndSetToken(std::atomic<int>* token, int newVal) {
if (*token == -1) {
return -1;
} else {
*token = newVal;
}
return 0;
}
static void ClearShared(SharedVirtMem* s) {
s->token = 0;
s->count = 0;
s->size = 0;
s->child_status = 0;
s->parent_status = 0;
memset(&s->sv, 0, sizeof(s->sv));
}
// Any 1-time setup involving member variables used in the rest of the test
// should be done here.
void VirtMemoryTestInterProcess::SetUp(void) {
hsa_status_t err;
// We must fork process before doing HSA stuff, specifically, hsa_init, as
// each process needs to do this.
// Allocate linux shared_ memory.
shared_ = reinterpret_cast<SharedVirtMem*>(mmap(
nullptr, sizeof(SharedVirtMem), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0));
ASSERT_NE(shared_, MAP_FAILED) << "mmap failed to allocated shared_ memory";
// Initialize shared control block to zeros. The field "token"
// is used to signal state changes between the 2 processes.
ClearShared(shared_);
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, shared_->sv) != 0) {
std::cout << "Failed to create Unix-domain socket pair" << std::endl;
ASSERT_EQ(0, 1);
}
// Spawn second process and verify communication
child_ = 0;
child_ = fork();
ASSERT_NE(-1, child_) << "fork failed";
std::atomic<int>* token = &shared_->token;
if (child_ != 0) {
parentProcess_ = true;
// Signal to other process we are waiting, and then wait...
*token = 1;
while (*token == 1) {
sched_yield();
}
PROCESS_LOG("Second process observed, handshake...\n");
*token = 1;
while (*token == 1) {
sched_yield();
}
} else {
parentProcess_ = false;
set_verbosity(0);
PROCESS_LOG("Second process running.\n");
while (*token == 0) {
sched_yield();
}
int ret;
ret = CheckAndSetToken(token, 0);
ASSERT_EQ(0, ret) << "Error detected in child process\n";
// Wait for handshake
while (*token == 0) {
sched_yield();
}
ret = CheckAndSetToken(token, 0);
ASSERT_EQ(0, ret) << "Error detected in child process\n";
}
TestBase::SetUp();
ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
ASSERT_SUCCESS(hsa_amd_memory_pool_get_info(
device_pool(), HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &min_gpu_mem_granule));
ASSERT_SUCCESS(hsa_amd_memory_pool_get_info(
device_pool(), HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE, &rec_gpu_mem_granule));
return;
}
void VirtMemoryTestInterProcess::Run(void) {
// Compare required profile for this test case with what we're actually
// running on
if (!rocrtst::CheckProfile(this)) {
return;
}
TestBase::Run();
// Note: Close() (and hsa_shut_down()) will be called from main()
// processOne is true for parent process, false for child process
if (parentProcess_) {
ParentProcessImpl();
} else {
ChildProcessImpl();
exit(0);
}
}
void VirtMemoryTestInterProcess::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); }
void VirtMemoryTestInterProcess::DisplayResults(void) const {
// Compare required profile for this test case with what we're actually
// running on
if (!rocrtst::CheckProfile(this)) {
return;
}
return;
}
void VirtMemoryTestInterProcess::Close() {
// This will close handles opened within rocrtst utility calls and call
// hsa_shut_down(), so it should be done after other hsa cleanup
TestBase::Close();
}
/* Send the dmabuf_fd to another process via Unix socket */
int VirtMemoryTestInterProcess::SendDmaBufFd(int socket, int dmabuf_fd) {
char* iov_str = (char*)"rocrtst";
struct msghdr msg = {0};
char buf[CMSG_SPACE(sizeof(dmabuf_fd))];
memset(buf, '\0', sizeof(buf));
struct iovec io = {.iov_base = iov_str, .iov_len = strlen(iov_str)};
msg.msg_iov = &io;
msg.msg_iovlen = 1;
msg.msg_control = buf;
msg.msg_controllen = sizeof(buf);
struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(dmabuf_fd));
// memmove(CMSG_DATA(cmsg), &dmabuf_fd, sizeof(dmabuf_fd));
memcpy(CMSG_DATA(cmsg), &dmabuf_fd, sizeof(dmabuf_fd));
msg.msg_controllen = CMSG_SPACE(sizeof(dmabuf_fd));
size_t sent = sendmsg(socket, &msg, 0);
return (sent < 0) ? -1 : 0;
}
/* Receive the dmabuf_fd to from process via Unix socket */
int VirtMemoryTestInterProcess::ReceiveDmaBufFd(int socket) {
struct msghdr msg = {0};
/* On Mac OS X, the struct iovec is needed, even if it points to minimal data */
char m_buffer[1];
struct iovec io = {.iov_base = m_buffer, .iov_len = sizeof(m_buffer)};
msg.msg_iov = &io;
msg.msg_iovlen = 1;
char c_buffer[256];
msg.msg_control = c_buffer;
msg.msg_controllen = sizeof(c_buffer);
size_t rcv = recvmsg(socket, &msg, 0);
if (rcv < 0) return -1;
struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
int fd;
memmove(&fd, CMSG_DATA(cmsg), sizeof(fd));
return fd;
}
void VirtMemoryTestInterProcess::ParentProcessImpl() {
hsa_status_t err;
void* addrRange = NULL;
bool supp = false;
ASSERT_SUCCESS(hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*)&supp));
if (!supp) {
if (verbosity() > 0) {
std::cout << " Virtual Memory API not supported on this system - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
ASSERT_SUCCESS(hsa_amd_vmem_address_reserve(&addrRange, 20 * rec_gpu_mem_granule, 0, 0));
hsa_amd_vmem_alloc_handle_t exported_handle;
ASSERT_SUCCESS(hsa_amd_vmem_handle_create(device_pool(), 20 * rec_gpu_mem_granule,
MEMORY_TYPE_NONE, 0, &exported_handle));
int dmabuf_fd;
ASSERT_SUCCESS(hsa_amd_vmem_export_shareable_handle(&dmabuf_fd, exported_handle, 0));
ASSERT_GE(dmabuf_fd, 0);
// Signal child process that the gpu buffer is ready to read.
PROCESS_LOG("Parent: Signalling child proces process\n");
CheckAndSetToken(&shared_->token, 1);
close(shared_->sv[1]);
ASSERT_EQ(SendDmaBufFd(shared_->sv[0], dmabuf_fd), 0);
hsa_amd_vmem_alloc_handle_t imported_handle;
ASSERT_SUCCESS(hsa_amd_vmem_import_shareable_handle(dmabuf_fd, &imported_handle));
/* Test importing same handle twice */
hsa_amd_vmem_alloc_handle_t imported_handle2;
ASSERT_SUCCESS(hsa_amd_vmem_import_shareable_handle(dmabuf_fd, &imported_handle2));
ASSERT_SUCCESS(hsa_amd_vmem_map(addrRange, 10 * rec_gpu_mem_granule, 0, imported_handle, 0));
ASSERT_SUCCESS(hsa_amd_vmem_unmap(addrRange, 10 * rec_gpu_mem_granule));
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(imported_handle));
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(imported_handle2));
PROCESS_LOG("Parent: Waiting for child process to signal\n");
while (shared_->token == 1) {
sched_yield();
}
if (shared_->token != 2) {
shared_->token = -1;
}
FORK_ASSERT_EQ(2, shared_->token, "Parent: Error detected in signaling token\n");
PROCESS_LOG("Parent: Waking upon signal from child process\n");
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(exported_handle));
ASSERT_SUCCESS(hsa_amd_vmem_address_free(addrRange, 20 * rec_gpu_mem_granule));
PROCESS_LOG("Parent: Virtual Memory test PASSED\n");
}
void VirtMemoryTestInterProcess::ChildProcessImpl() {
int dmabuf_fd = -1;
bool supp = false;
hsa_status_t err;
ASSERT_SUCCESS(hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*)&supp));
if (!supp) {
if (verbosity() > 0) {
std::cout << " Virtual Memory API not supported on this system - Skipping." << std::endl;
std::cout << kSubTestSeparator << std::endl;
}
return;
}
void* addrRange = NULL;
ASSERT_SUCCESS(hsa_amd_vmem_address_reserve(&addrRange, 20 * rec_gpu_mem_granule, 0, 0));
// Yield until shared token value changes i.e. is updated by parent.
// Validate parent's update is per expectation
PROCESS_LOG("Child: Waiting for parent process to signal\n");
while (shared_->token == 0) {
sched_yield();
}
if (shared_->token != 1) {
shared_->token = -1;
}
FORK_ASSERT_EQ(1, shared_->token, "Child: Error detected in signaling token\n");
PROCESS_LOG("Child: Waking upon signal from parent process\n");
close(shared_->sv[0]);
dmabuf_fd = ReceiveDmaBufFd(shared_->sv[1]);
hsa_amd_vmem_alloc_handle_t imported_handle;
ASSERT_SUCCESS(hsa_amd_vmem_import_shareable_handle(dmabuf_fd, &imported_handle));
ASSERT_SUCCESS(hsa_amd_vmem_map(addrRange, 10 * rec_gpu_mem_granule, 0, imported_handle, 0));
ASSERT_SUCCESS(hsa_amd_vmem_unmap(addrRange, 10 * rec_gpu_mem_granule));
PROCESS_LOG("Child: Signalling parent process\n");
CheckAndSetToken(&shared_->token, 2);
ASSERT_SUCCESS(hsa_amd_vmem_handle_release(imported_handle));
PROCESS_LOG("Child: Virtual Memory test PASSED\n");
}