diff --git a/projects/rocr-runtime/rocrtst/suites/functional/memory_access.cc b/projects/rocr-runtime/rocrtst/suites/functional/memory_access.cc new file mode 100755 index 0000000000..e4fc136b38 --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/functional/memory_access.cc @@ -0,0 +1,502 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + + +#include +#include +#include +#include +#include + +#include "suites/functional/memory_access.h" +#include "common/base_rocr_utils.h" +#include "common/common.h" +#include "common/helper_funcs.h" +#include "common/hsatimer.h" +#include "gtest/gtest.h" +#include "hsa/hsa.h" +#include "hsa/hsa_ext_finalize.h" + +static const uint32_t kNumBufferElements = 256; + + + + + +#define RET_IF_HSA_ERR(err) { \ + if ((err) != HSA_STATUS_SUCCESS) { \ + const char* msg = 0; \ + hsa_status_string(err, &msg); \ + std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \ + __FILE__ << ". Call returned " << err << std::endl; \ + std::cout << msg << std::endl; \ + return (err); \ + } \ +} + + + +MemoryAccessTest::MemoryAccessTest(void) : + TestBase() { + set_num_iteration(10); // Number of iterations to execute of the main test; + // This is a default value which can be overridden + // on the command line. + + set_title("RocR Memory Access Tests"); + set_description("This series of tests check memory allocation" + "on GPU and CPU, i.e. GPU access to system memory " + "and CPU access to GPU memory."); +} + +MemoryAccessTest::~MemoryAccessTest(void) { +} + +// Any 1-time setup involving member variables used in the rest of the test +// should be done here. +void MemoryAccessTest::SetUp(void) { + hsa_status_t err; + + TestBase::SetUp(); + + err = rocrtst::SetDefaultAgents(this); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + + err = rocrtst::SetPoolsTypical(this); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + return; +} + +void MemoryAccessTest::Run(void) { + // Compare required profile for this test case with what we're actually + // running on + if (!rocrtst::CheckProfile(this)) { + return; + } + + TestBase::Run(); +} + +void MemoryAccessTest::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void MemoryAccessTest::DisplayResults(void) const { + // Compare required profile for this test case with what we're actually + // running on + if (!rocrtst::CheckProfile(this)) { + return; + } + + return; +} + +void MemoryAccessTest::Close() { + // This will close handles opened within rocrtst utility calls and call + // hsa_shut_down(), so it should be done after other hsa cleanup + TestBase::Close(); +} + + + + + +typedef struct __attribute__ ((aligned(16))) args_t { + int *a; + int *b; + int *c; + } args; + + args *kernArgs = NULL; + +static const char kSubTestSeparator[] = " **************************"; + +static void PrintMemorySubtestHeader(const char *header) { + std::cout << " *** Memory Subtest: " << header << " ***" << std::endl; +} + +static const int kMemoryAllocSize = 1024; +// Test to check GPU can read & write to system memory +void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, + hsa_agent_t gpuAgent) { + hsa_status_t err; + + // Get Global Memory Pool on the gpuAgent to allocate gpu buffers + hsa_amd_memory_pool_t gpu_pool; + err = hsa_amd_agent_iterate_memory_pools(gpuAgent, + rocrtst::GetGlobalMemoryPool, + &gpu_pool); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + hsa_amd_memory_pool_access_t access; + hsa_amd_agent_memory_pool_get_info(cpuAgent, gpu_pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &access); + if (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + // hsa objects + hsa_queue_t *queue = NULL; // command queue + hsa_signal_t signal = {0}; // completion signal + + + // get queue size + uint32_t queue_size = 0; + err = hsa_agent_get_info(gpuAgent, + HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + // create queue + err = hsa_queue_create(gpuAgent, + queue_size, HSA_QUEUE_TYPE_MULTI, + NULL, NULL, 0, 0, &queue); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + // Get System Memory Pool on the cpuAgent to allocate host side buffers + hsa_amd_memory_pool_t global_pool; + err = hsa_amd_agent_iterate_memory_pools(cpuAgent, + rocrtst::GetGlobalMemoryPool, + &global_pool); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + + + // Find a memory pool that supports kernel arguments. + hsa_amd_memory_pool_t kernarg_pool; + err = hsa_amd_agent_iterate_memory_pools(cpuAgent, + rocrtst::GetKernArgMemoryPool, + &kernarg_pool); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + // Allocate the host side buffers + // (sys_data,dup_sys_data,cpuResult,kernArg) on system memory + int *sys_data = NULL; + int *dup_sys_data = NULL; + int *cpuResult = NULL; + int *gpuResult = NULL; + + err = hsa_amd_memory_pool_allocate(global_pool, + kMemoryAllocSize, 0, + reinterpret_cast(&cpuResult)); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + err = hsa_amd_memory_pool_allocate(global_pool, + kMemoryAllocSize, 0, + reinterpret_cast(&sys_data)); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + err = hsa_amd_memory_pool_allocate(global_pool, + kMemoryAllocSize, 0, + reinterpret_cast(&dup_sys_data)); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + + // Allocate the kernel argument buffer from the kernarg_pool. + err = hsa_amd_memory_pool_allocate(kernarg_pool, sizeof(args_t), 0, + reinterpret_cast(&kernArgs)); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + // initialize the host buffers + for (int i = 0; i < kMemoryAllocSize; ++i) { + unsigned int seed = time(NULL); + sys_data[i] = 1 + rand_r(&seed) % 1; + dup_sys_data[i] = sys_data[i]; + } + + memset(cpuResult, 0, kMemoryAllocSize * sizeof(int)); + + // for the dGPU, we have coarse grained local memory, + // so allocate memory for it on the GPU's GLOBAL segment . + + // Get local memory of GPU to allocate device side buffers + + err = hsa_amd_memory_pool_allocate(gpu_pool, kMemoryAllocSize, 0, + reinterpret_cast(&gpuResult)); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + + // Allow cpuAgent access to all allocated GPU memory. + err = hsa_amd_agents_allow_access(1, &cpuAgent, NULL, gpuResult); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + memset(gpuResult, 0, kMemoryAllocSize * sizeof(int)); + + // Allow gpuAgent access to all allocated system memory. + err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, cpuResult); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, sys_data); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, dup_sys_data); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, kernArgs); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + kernArgs->a = sys_data; + kernArgs->b = cpuResult; // system memory passed to gpu for write + kernArgs->c = gpuResult; // gpu memory to verify that gpu read system data + + + // Create the executable, get symbol by name and load the code object + set_kernel_file_name("gpuReadWrite_kernels.hsaco"); + set_kernel_name("gpuReadWrite"); + err = rocrtst::LoadKernelFromObjFile(this, &gpuAgent); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + + + // Fill the dispatch packet with + // workgroup_size, grid_size, kernelArgs and completion signal + // Put it on the queue and launch the kernel by ringing the doorbell + + // create completion signal + err = hsa_signal_create(1, 0, NULL, &signal); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + // create aql packet + hsa_kernel_dispatch_packet_t aql; + memset(&aql, 0, sizeof(aql)); + + // initialize aql packet + aql.header = + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + aql.setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + aql.workgroup_size_x = 256; + aql.workgroup_size_y = 1; + aql.workgroup_size_z = 1; + aql.grid_size_x = kMemoryAllocSize; + aql.grid_size_y = 1; + aql.grid_size_z = 1; + aql.private_segment_size = 0; + aql.group_segment_size = 0; + aql.kernel_object = kernel_object(); // kernel_code; + aql.kernarg_address = kernArgs; + aql.completion_signal = signal; + + // const uint32_t queue_size = queue->size; + const uint32_t queue_mask = queue->size - 1; + + // write to command queue + uint64_t index = hsa_queue_load_write_index_relaxed(queue); + reinterpret_cast + (queue->base_address)[index & queue_mask] = aql; + hsa_queue_store_write_index_relaxed(queue, index + 1); + + // ringdoor bell + hsa_signal_store_relaxed(queue->doorbell_signal, index); + // wait for the signal and reset it for future use + while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, + (uint64_t)-1, HSA_WAIT_STATE_ACTIVE)) { } + hsa_signal_store_relaxed(signal, 1); + + // compare device and host side results + if (verbosity() > 0) { + std::cout<< "check gpu has read the system memory"<< std::endl; + } + for (int i = 0; i < kMemoryAllocSize; ++i) { + ASSERT_EQ(gpuResult[i], dup_sys_data[i]); + } + + if (verbosity() > 0) { + std::cout<< "gpu has read the system memory successfully"<< std::endl; + std::cout<< "check gpu has written to system memory"<< std::endl; + } + for (int i = 0; i < kMemoryAllocSize; ++i) { + ASSERT_EQ(cpuResult[i], i); + } + + if (verbosity() > 0) { + std::cout<< "gpu has written to system memory successfully"<< std::endl; + } + + if (sys_data) { hsa_memory_free(sys_data); } + if (dup_sys_data) { hsa_memory_free(dup_sys_data); } + if (cpuResult) {hsa_memory_free(cpuResult); } + if (gpuResult) {hsa_memory_free(gpuResult); } + if (kernArgs) { hsa_memory_free(kernArgs); } + if (signal.handle) { hsa_signal_destroy(signal); } + if (queue) { hsa_queue_destroy(queue); } + } else { + if (verbosity() > 0) { + std::cout<< "Test not applicable as system is not large bar." + "Skipping."<< std::endl; + std::cout << kSubTestSeparator << std::endl; + } + return; + } +} + + + + +// Test to check cpu can read & write to GPU memory +void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, + hsa_agent_t gpuAgent, + hsa_amd_memory_pool_t pool) { + hsa_status_t err; + + rocrtst::pool_info_t pool_i; + err = rocrtst::AcquirePoolInfo(pool, &pool_i); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + + if (pool_i.segment == HSA_AMD_SEGMENT_GLOBAL && + pool_i.global_flag == HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { + hsa_amd_memory_pool_access_t access; + hsa_amd_agent_memory_pool_get_info(cpuAgent, pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &access); + if (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 || + pool_i.alloc_alignment == 0) { + if (verbosity() > 0) { + std::cout << " Test not applicable. Skipping." << std::endl; + std::cout << kSubTestSeparator << std::endl; + } + return; + } + + + auto gran_sz = pool_i.alloc_granule; + auto pool_sz = pool_i.size / gran_sz; + auto max_alloc_size = pool_sz/2; + unsigned int max_element = max_alloc_size/sizeof(unsigned int); + unsigned int *gpu_data; + unsigned int *sys_data; + sys_data = (unsigned int*)malloc(max_alloc_size); + memset(sys_data, 0, max_alloc_size); + for (unsigned int i = 1; i <= max_element; ++i) { + sys_data[i] = i; + } + // err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, sys_data); + // EXPECT_EQ(err, HSA_STATUS_SUCCESS); + err = hsa_amd_memory_pool_allocate(pool, max_alloc_size, 0, + reinterpret_cast(&gpu_data)); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + /* + if (err == HSA_STATUS_ERROR) { + err = hsa_amd_memory_pool_free(gpu_data); + }*/ + + err = hsa_amd_agents_allow_access(1, &cpuAgent, NULL, gpu_data); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + // EXPECT_EQ(HSA_STATUS_SUCCESS, err); + memset(gpu_data, 0, max_alloc_size); + + // Verify CPU can read & write to GPU memory + std::cout<< "Verify CPU can read & write to GPU memory"<< std::endl; + for (unsigned int i = 1; i <= max_element; ++i) { + gpu_data[i] = i; // Write to gpu memory directly + } + + for (unsigned int i = 1; i <= max_element; ++i) { + if (sys_data[i] != gpu_data[i]) { // Reading GPU memory + fprintf(stdout, "Values not mathing !! sys_data[%d]:%d ," + "gpu_data[%d]\n", sys_data[i], i, gpu_data[i]); + } + } + std::cout<< "CPU have read & write to GPU memory successfully"<< std::endl; + err = hsa_amd_memory_pool_free(gpu_data); + free(sys_data); + } else { + if (verbosity() > 0) { + std::cout<< "Test not applicable as system is not large bar." + "Skipping."<< std::endl; + std::cout << kSubTestSeparator << std::endl; + } + return; + } + } +} + + +void MemoryAccessTest::CPUAccessToGPUMemoryTest(void) { + hsa_status_t err; + + PrintMemorySubtestHeader("CPUAccessToGPUMemoryTest in Memory Pools"); + // find all cpu agents + std::vector cpus; + err = hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + // find all gpu agents + std::vector gpus; + err = hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + for (unsigned int i = 0 ; i< gpus.size(); ++i) { + hsa_amd_memory_pool_t gpu_pool; + err = hsa_amd_agent_iterate_memory_pools(gpus[i], + rocrtst::GetGlobalMemoryPool, + &gpu_pool); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + CPUAccessToGPUMemoryTest(cpus[0], gpus[i], gpu_pool); + } + if (verbosity() > 0) { + std::cout << "subtest Passed" << std::endl; + std::cout << kSubTestSeparator << std::endl; + } +} + +void MemoryAccessTest::GPUAccessToCPUMemoryTest(void) { + hsa_status_t err; + + PrintMemorySubtestHeader("GPUAccessToCPUMemoryTest in Memory Pools"); + // find all cpu agents + std::vector cpus; + err = hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + // find all gpu agents + std::vector gpus; + err = hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + for (unsigned int i = 0 ; i< gpus.size(); ++i) { + GPUAccessToCPUMemoryTest(cpus[0], gpus[i]); + } + + if (verbosity() > 0) { + std::cout << "subtest Passed" << std::endl; + std::cout << kSubTestSeparator << std::endl; + } +} + +#undef RET_IF_HSA_ERR diff --git a/projects/rocr-runtime/rocrtst/suites/functional/memory_access.h b/projects/rocr-runtime/rocrtst/suites/functional/memory_access.h new file mode 100755 index 0000000000..be0b9197cc --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/functional/memory_access.h @@ -0,0 +1,90 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ +#ifndef ROCRTST_SUITES_FUNCTIONAL_MEMORY_ACCESS_H_ +#define ROCRTST_SUITES_FUNCTIONAL_MEMORY_ACCESS_H_ + + +#include "common/base_rocr.h" +#include "hsa/hsa.h" +#include "suites/test_common/test_base.h" + +class MemoryAccessTest : public TestBase { + public: + MemoryAccessTest(); + + // @Brief: Destructor for test case of MemoryTest + virtual ~MemoryAccessTest(); + + // @Brief: Setup the environment for measurement + virtual void SetUp(); + + // @Brief: Core measurement execution + virtual void Run(); + + // @Brief: Clean up and retrive the resource + virtual void Close(); + + // @Brief: Display results + virtual void DisplayResults() const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); + + + // @Brief: This test verify that CPU is able to Read & write GPU memory + void CPUAccessToGPUMemoryTest(void); + + // @Brief: This test verify that GPU is able to Read & write CPU memory + void GPUAccessToCPUMemoryTest(void); + + + private: + void CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, + hsa_agent_t gpuAgent, + hsa_amd_memory_pool_t pool); + void GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent); +}; + +#endif // ROCRTST_SUITES_FUNCTIONAL_MEMORY_ACCESS_H_ diff --git a/projects/rocr-runtime/rocrtst/suites/performance/enqueueLatency.cc b/projects/rocr-runtime/rocrtst/suites/performance/enqueueLatency.cc new file mode 100755 index 0000000000..4f4fbc8a6c --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/performance/enqueueLatency.cc @@ -0,0 +1,350 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ +#include +#include +#include + +#include "suites/performance/enqueueLatency.h" +#include "common/base_rocr_utils.h" +#include "common/common.h" +#include "common/os.h" +#include "common/helper_funcs.h" +#include "common/hsatimer.h" +#include "gtest/gtest.h" +#include "hsa/hsa.h" +#include "hsa/hsa_ext_finalize.h" + +#define RET_IF_HSA_ERR(err) { \ + if ((err) != HSA_STATUS_SUCCESS) { \ + const char* msg = 0; \ + hsa_status_string(err, &msg); \ + std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \ + __FILE__ << ". Call returned " << err << std::endl; \ + std::cout << msg << std::endl; \ + return (err); \ + } \ +} + +EnqueueLatency:: +EnqueueLatency(bool enqueueSinglePacket) : TestBase(), + enqueue_single_(enqueueSinglePacket) { + queue_size_ = 0; + num_of_pkts_ = 100000; + memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t)); + enqueue_time_mean_ = 0.0; + set_num_iteration(100); + + std::string name; + std::string desc; + + name = "Average Enqueue Time"; + desc = "This test measures the time when the packet enqueue to the" + " queue and before the door bell is ring to notify the command processor " + "to execute the packet"; + + + + if (enqueueSinglePacket) { + name += ", Single Packet"; + desc += " One Packet at a time in queue."; + } else { + name += ", Multiple Packets"; + desc += " Multiple i.e. maximum Packets equeued to queue at one time"; + } + + set_title(name); + set_description(desc); +} + +EnqueueLatency::~EnqueueLatency() { +} + +void EnqueueLatency::SetUp() { + hsa_status_t err; + TestBase::SetUp(); + // If it indicates to use default signal, set env var properly + + err = SetDefaultAgents(this); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); +} + +void EnqueueLatency::Run() { + if (!rocrtst::CheckProfile(this)) { + return; + } + hsa_status_t err; + TestBase::Run(); + + // find all gpu agents + std::vector gpus; + err = hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + for (unsigned int i = 0 ; i< gpus.size(); ++i) { + hsa_agent_t* gpu_dev = &gpus[i]; + char agent_name[64]; + err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_NAME, agent_name); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + set_agent_name(agent_name); + + // Create a queue + hsa_queue_t* q = nullptr; + rocrtst::CreateQueue(*gpu_dev, &q); + ASSERT_NE(q, nullptr); + set_main_queue(q); + + set_kernel_file_name("dispatch_time_kernels.hsaco"); + set_kernel_name("empty_kernel"); + err = rocrtst::LoadKernelFromObjFile(this, gpu_dev); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + // Fill up the kernel packet except header + err = rocrtst::InitializeAQLPacket(this, &aql()); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + + aql().workgroup_size_x = 1; + aql().grid_size_x = 1; + + // Here, modify the batch size if it is larger than the queue size + if (enqueue_single_) { + EnqueueSinglePacket(); + } else { + hsa_status_t err; + uint32_t size = 0; + err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &size); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + num_of_pkts_ = num_of_pkts_ > size ? size : num_of_pkts_; + EnqueueMultiPackets(); + } + } +} + + + +size_t EnqueueLatency::RealIterationNum() { + return num_iteration() * 1.2 + 1; +} + +void EnqueueLatency::EnqueueSinglePacket() { + std::vector timer; + + int it = RealIterationNum(); + const uint32_t queue_mask = main_queue()->size - 1; + + // queue should be empty + ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()), + hsa_queue_load_write_index_scacquire(main_queue())); + + void *q_base_addr = main_queue()->base_address; + rocrtst::PerfTimer p_timer; + for (int i = 0; i < it; i++) { + // Get timing stamp and ring the doorbell to dispatch the kernel. + int id = p_timer.CreateTimer(); + p_timer.StartTimer(id); + // Obtain the current queue write index. + uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1); + + ASSERT_LT(index, main_queue()->size + index); + + // Write the aql packet at the calculated queue index address. + reinterpret_cast( + q_base_addr)[index & queue_mask] = aql(); + + reinterpret_cast( + q_base_addr)[index & queue_mask].header |= + HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; + + p_timer.StopTimer(id); + + timer.push_back(p_timer.ReadTimer(id)); + hsa_signal_store_screlease(main_queue()->doorbell_signal, index); + + // Wait on the dispatch signal until the kernel is finished. + while (hsa_signal_wait_scacquire(aql().completion_signal, + HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) { + } + + + + hsa_signal_store_screlease(aql().completion_signal, 1); + + if (verbosity() >= VERBOSE_PROGRESS) { + std::cout << "."; + fflush(stdout); + } + } + + if (verbosity() >= VERBOSE_PROGRESS) { + std::cout << std::endl; + } + + // Abandon the first result and after sort, delete the last 2% value + timer.erase(timer.begin()); + std::sort(timer.begin(), timer.end()); + + timer.erase(timer.begin() + num_iteration(), timer.end()); + + enqueue_time_mean_ = rocrtst::CalcMean(timer); + + return; +} + + + +void EnqueueLatency::EnqueueMultiPackets() { + std::vector timer; + int it = RealIterationNum(); + const uint32_t queue_mask = main_queue()->size - 1; + + // queue should be empty + ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()), + hsa_queue_load_write_index_scacquire(main_queue())); + + rocrtst::PerfTimer p_timer; + + for (int i = 0; i < it; i++) { + // Get timing stamp and ring the doorbell to dispatch the kernel. + int id = p_timer.CreateTimer(); + p_timer.StartTimer(id); + uint64_t* index = + reinterpret_cast(malloc(sizeof(uint64_t) * num_of_pkts_)); + + hsa_signal_store_screlease(aql().completion_signal, num_of_pkts_); + + for (uint32_t j = 0; j < num_of_pkts_; j++) { + // index[j] = hsa_queue_add_write_index_scacq_screl(main_queue(), 1); + index[j] = hsa_queue_add_write_index_relaxed(main_queue(), 1); + + // Write the aql packet at the calculated queue index address. + (reinterpret_cast(( + main_queue()->base_address)))[index[j] & queue_mask] = aql(); + + if (j == num_of_pkts_ - 1) { + (reinterpret_cast( + main_queue()->base_address))[index[j] & queue_mask].header |= + 1 << HSA_PACKET_HEADER_BARRIER; + } + } + + // Set packet header reversly; set all headers except the very first + // one, for now. + for (uint32_t j = num_of_pkts_ - 1; j > 0; j--) { + reinterpret_cast( + (main_queue()->base_address))[index[j] & queue_mask].header |= + HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; + } + + + // Set the very first header... + (reinterpret_cast( + main_queue()->base_address))[index[0] & queue_mask].header |= + HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; + + p_timer.StopTimer(id); + + timer.push_back(p_timer.ReadTimer(id)); + + for (uint32_t j = 0; j < num_of_pkts_; j++) { + hsa_signal_store_screlease(main_queue()->doorbell_signal, index[j]); + } + + // Wait on the dispatch signal until the kernel is finished. + while (hsa_signal_wait_scacquire(aql().completion_signal, + HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) { + } + + + hsa_signal_store_screlease(aql().completion_signal, 1); + + free(index); + + if (verbosity() >= VERBOSE_PROGRESS) { + std::cout << "."; + fflush(stdout); + } + } + + std::cout << std::endl; + + // Abandon the first result and after sort, delete the last 2% value + timer.erase(timer.begin()); + std::sort(timer.begin(), timer.end()); + + timer.erase(timer.begin() + num_iteration(), timer.end()); + + enqueue_time_mean_ = rocrtst::CalcMean(timer); + + return; +} + + + +void EnqueueLatency::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void EnqueueLatency::DisplayResults(void) const { + if (!rocrtst::CheckProfile(this)) { + return; + } + + TestBase::DisplayResults(); + + std::cout << "Average Time to Completion: "; + if (enqueue_single_) { + std::cout << enqueue_time_mean_ * 1e6; + } else { + std::cout << enqueue_time_mean_ * 1e6 / num_of_pkts_; + } + + std::cout << " uS" << std::endl; + return; +} + +void EnqueueLatency::Close() { + TestBase::Close(); + return; +} diff --git a/projects/rocr-runtime/rocrtst/suites/performance/enqueueLatency.h b/projects/rocr-runtime/rocrtst/suites/performance/enqueueLatency.h new file mode 100755 index 0000000000..b59dfd20bc --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/performance/enqueueLatency.h @@ -0,0 +1,109 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#ifndef ROCRTST_SUITES_PERFORMANCE_ENQUEUELATENCY_H_ +#define ROCRTST_SUITES_PERFORMANCE_ENQUEUELATENCY_H_ +#include + +#include "suites/test_common/test_base.h" +#include "common/base_rocr.h" +#include "common/common.h" +#include "hsa/hsa.h" + +// @Brief: This class is defined to measure the mean latency of enqueuing +// the packets to an empty kernel + +class EnqueueLatency : public TestBase { + public: + // @Brief: Constructor + explicit EnqueueLatency(bool launchSingleKernel); + + // @Brief: Destructor + virtual ~EnqueueLatency(void); + + // @Brief: Set up the environment for the test + virtual void SetUp(void); + + // @Brief: Run the test case + virtual void Run(void); + + // @Brief: Display results we got + virtual void DisplayResults(void) const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); + + // @Brief: Clean up and close the runtime + virtual void Close(void); + + // @Brief: Create the executable, get symbol by name and load the code object + // virtual void LoadCodeObject(hsa_agent_t gpuAgent,uint64_t &kernel_code); + + private: + // @Brief: Get actual iteration number + virtual size_t RealIterationNum(void); + + // @Brief: Launch single packet each time + virtual void EnqueueSinglePacket(void); + + // @Brief: Launch multiple packets each time + virtual void EnqueueMultiPackets(void); + + + // @Brief: Indicate if we enqueued single pkt or not + bool enqueue_single_; + + // @Brief: Store the size of queue + uint32_t queue_size_; + + // @Brief: Number of packets in a batch + uint32_t num_of_pkts_; + + // @Brief: Ave. dispatch time + double enqueue_time_mean_; +}; + +#endif // ROCRTST_SUITES_PERFORMANCE_ENQUEUELATENCY_H_ + diff --git a/projects/rocr-runtime/rocrtst/suites/test_common/CMakeLists.txt b/projects/rocr-runtime/rocrtst/suites/test_common/CMakeLists.txt index ef5e895b1a..e8db4c1388 100755 --- a/projects/rocr-runtime/rocrtst/suites/test_common/CMakeLists.txt +++ b/projects/rocr-runtime/rocrtst/suites/test_common/CMakeLists.txt @@ -304,11 +304,16 @@ build_sample_for_devices("test_case_template") #set(CL_FILE_LIST "${KERNELS_DIR}/p2p_mem_access_kernels.cl") #build_sample_for_devices("p2p_mem_access") -# Dispatch Time +# Dispatch Time set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}") set(CL_FILE_LIST "${KERNELS_DIR}/dispatch_time_kernels.cl") build_sample_for_devices("dispatch_time") +# gpuReadWrite +set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}") +set(CL_FILE_LIST "${KERNELS_DIR}/gpuReadWrite_kernels.cl") +build_sample_for_devices("gpuReadWrite") + # Build rules add_executable(${ROCRTST} ${performanceSources} ${functionalSources} ${common_srcs} ${testCommonSources}) diff --git a/projects/rocr-runtime/rocrtst/suites/test_common/kernels/gpuReadWrite_kernels.cl b/projects/rocr-runtime/rocrtst/suites/test_common/kernels/gpuReadWrite_kernels.cl new file mode 100755 index 0000000000..ac45e079f6 --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/test_common/kernels/gpuReadWrite_kernels.cl @@ -0,0 +1,53 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +__kernel void gpuReadWrite(__global const int * a, + __global int * b, __global int * c) { + int i = get_global_id(0); + // Reading the system memory and writing to gpu memory + c[i] = a[i]; // a[i] point to system memory while c[i] to gpu memory. + //writing to system memory + b[i] = i; +} diff --git a/projects/rocr-runtime/rocrtst/suites/test_common/main.cc b/projects/rocr-runtime/rocrtst/suites/test_common/main.cc index d22245e0ca..45b8096930 100755 --- a/projects/rocr-runtime/rocrtst/suites/test_common/main.cc +++ b/projects/rocr-runtime/rocrtst/suites/test_common/main.cc @@ -49,10 +49,12 @@ #include "gtest/gtest.h" #include "suites/functional/memory_basic.h" +#include "suites/functional/memory_access.h" #include "suites/functional/ipc.h" #include "suites/performance/dispatch_time.h" #include "suites/performance/memory_async_copy.h" #include "suites/performance/memory_async_copy_numa.h" +#include "suites/performance/enqueueLatency.h" #include "suites/test_common/test_case_template.h" #include "suites/test_common/main.h" #include "suites/test_common/test_common.h" @@ -136,6 +138,13 @@ TEST(rocrtstFunc, IPC) { RunGenericTest(&ipc); } +TEST(rocrtstFunc, MemoryAccessTests) { + MemoryAccessTest mt; + RunCustomTestProlog(&mt); + mt.CPUAccessToGPUMemoryTest(); + mt.GPUAccessToCPUMemoryTest(); + RunCustomTestEpilog(&mt); +} // Temporarily disable this test until hsa_shut_down() is (probably not the // same as with the IPC test above) is addressed. To override the disable, // run with --gtest-also_run_disabled_tests flag. @@ -147,6 +156,13 @@ TEST(rocrtstFunc, DISABLED_Memory_Max_Mem) { RunCustomTestEpilog(&mt); } +TEST(rocrtstPerf, ENQUEUE_LATENCY) { + EnqueueLatency singlePacketequeue(true); + EnqueueLatency multiPacketequeue(false); + RunGenericTest(&singlePacketequeue); + RunGenericTest(&multiPacketequeue); +} + TEST(rocrtstPerf, Memory_Async_Copy) { MemoryAsyncCopy mac; // To do full test, uncomment this: