From 58ca2b745c88d2351f71a8c24d30bb496d37ae62 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Thu, 12 Dec 2019 19:41:34 -0500 Subject: [PATCH 01/39] libhsakmt: Support gfx90a Change-Id: I1ad594eab093f5aa30143ade4e72f2379c9e3616 Signed-off-by: Yong Zhao --- src/libhsakmt.h | 1 + src/pmc_table.c | 1 + src/queues.c | 10 +++++++++- src/topology.c | 6 ++++++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/libhsakmt.h b/src/libhsakmt.h index cb20026069..4fa6f9c6d3 100644 --- a/src/libhsakmt.h +++ b/src/libhsakmt.h @@ -129,6 +129,7 @@ enum asic_family_type { CHIP_NAVI12, /* 16 */ CHIP_NAVI14, /* 17 */ CHIP_SIENNA_CICHLID, /* 18 */ + CHIP_ALDEBARAN, /* 19 */ CHIP_LAST }; diff --git a/src/pmc_table.c b/src/pmc_table.c index 5d30391497..c4a2dec6a4 100644 --- a/src/pmc_table.c +++ b/src/pmc_table.c @@ -2128,6 +2128,7 @@ HSAKMT_STATUS get_block_properties(uint32_t node_id, case CHIP_RAVEN: case CHIP_RENOIR: case CHIP_ARCTURUS: + case CHIP_ALDEBARAN: *block = vega_blocks[block_id]; break; case CHIP_NAVI10: diff --git a/src/queues.c b/src/queues.c index 206e66e379..aaf667c296 100644 --- a/src/queues.c +++ b/src/queues.c @@ -42,7 +42,8 @@ #define DOORBELL_SIZE_GFX9 8 #define DOORBELLS_PAGE_SIZE(ds) (1024 * (ds)) -#define VGPR_SIZE_PER_CU(asic_family) (asic_family == CHIP_ARCTURUS ? 0x80000 : 0x40000) +#define VGPR_SIZE_PER_CU(asic_family) ((asic_family == CHIP_ARCTURUS || \ + asic_family == CHIP_ALDEBARAN) ? 
0x80000 : 0x40000) #define SGPR_SIZE_PER_CU 0x4000 #define LDS_SIZE_PER_CU 0x10000 #define HWREG_SIZE_PER_CU 0x1000 @@ -146,6 +147,12 @@ const struct device_info arcturus_device_info = { .doorbell_size = DOORBELL_SIZE_GFX9, }; +const struct device_info aldebaran_device_info = { + .asic_family = CHIP_ALDEBARAN, + .eop_buffer_size = 4096, + .doorbell_size = DOORBELL_SIZE_GFX9, +}; + const struct device_info navi10_device_info = { .asic_family = CHIP_NAVI10, .eop_buffer_size = 4096, @@ -186,6 +193,7 @@ static const struct device_info *dev_lookup_table[] = { [CHIP_RAVEN] = &raven_device_info, [CHIP_RENOIR] = &renoir_device_info, [CHIP_ARCTURUS] = &arcturus_device_info, + [CHIP_ALDEBARAN] = &aldebaran_device_info, [CHIP_NAVI10] = &navi10_device_info, [CHIP_NAVI12] = &navi12_device_info, [CHIP_NAVI14] = &navi14_device_info, diff --git a/src/topology.c b/src/topology.c index 502337dff1..368be3aaa8 100644 --- a/src/topology.c +++ b/src/topology.c @@ -222,6 +222,12 @@ static const struct hsa_gfxip_table gfxip_lookup_table[] = { { 0x738C, 9, 0, 8, 1, "Arcturus", CHIP_ARCTURUS }, { 0x738E, 9, 0, 8, 1, "Arcturus", CHIP_ARCTURUS }, { 0x7390, 9, 0, 8, 1, "Arcturus", CHIP_ARCTURUS }, + /* Aldebaran */ + { 0x50, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, + { 0x51, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, + { 0x52, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, + { 0x60, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, + { 0x62, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, /* Navi10 */ { 0x7310, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 }, { 0x7312, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 }, From be096582474982946e7f76f042ea5c6950282f92 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 23 Mar 2020 21:00:07 -0400 Subject: [PATCH 02/39] Program mmCOMPUTE_PGM_RSRC3 on gfx90a Change-Id: If387d137ebd388f5aea930a5f7bca3413dcbfcce Signed-off-by: Yong Zhao --- tests/kfdtest/src/Dispatch.cpp | 8 ++++++++ tests/kfdtest/src/IsaGenerator.cpp | 1 + tests/kfdtest/src/KFDTestFlags.hpp | 1 + 
tests/kfdtest/src/KFDTestUtil.cpp | 4 +++- 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/kfdtest/src/Dispatch.cpp b/tests/kfdtest/src/Dispatch.cpp index 6ed67ce39b..3aa3892f38 100644 --- a/tests/kfdtest/src/Dispatch.cpp +++ b/tests/kfdtest/src/Dispatch.cpp @@ -30,6 +30,8 @@ #include "KFDBaseComponentTest.hpp" +#define mmCOMPUTE_PGM_RSRC3 0x2e2d + Dispatch::Dispatch(const HsaMemoryBuffer& isaBuf, const bool eventAutoReset) :m_IsaBuf(isaBuf), m_IndirectBuf(PACKETTYPE_PM4, PAGE_SIZE / sizeof(unsigned int), isaBuf.Node()), m_DimX(1), m_DimY(1), m_DimZ(1), m_pArg1(NULL), m_pArg2(NULL), m_pEop(NULL), m_ScratchEn(false), @@ -218,6 +220,12 @@ void Dispatch::BuildIb() { m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC1, COMPUTE_PGM_RSRC, ARRAY_SIZE(COMPUTE_PGM_RSRC))); + if (m_FamilyId == FAMILY_AL) { + const unsigned int COMPUTE_PGM_RSRC3[] = {9}; + m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC3, COMPUTE_PGM_RSRC3, + ARRAY_SIZE(COMPUTE_PGM_RSRC3))); + } + m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_RESOURCE_LIMITS, COMPUTE_RESOURCE_LIMITS, ARRAY_SIZE(COMPUTE_RESOURCE_LIMITS))); m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_TMPRING_SIZE, COMPUTE_TMPRING_SIZE, diff --git a/tests/kfdtest/src/IsaGenerator.cpp b/tests/kfdtest/src/IsaGenerator.cpp index 9c7376a0a4..3534e02159 100644 --- a/tests/kfdtest/src/IsaGenerator.cpp +++ b/tests/kfdtest/src/IsaGenerator.cpp @@ -92,6 +92,7 @@ IsaGenerator* IsaGenerator::Create(unsigned int familyId) { case FAMILY_AI: case FAMILY_RV: case FAMILY_AR: + case FAMILY_AL: return new IsaGenerator_Gfx9; case FAMILY_NV: return new IsaGenerator_Gfx10; diff --git a/tests/kfdtest/src/KFDTestFlags.hpp b/tests/kfdtest/src/KFDTestFlags.hpp index 921b8bc832..9087ba23f8 100644 --- a/tests/kfdtest/src/KFDTestFlags.hpp +++ b/tests/kfdtest/src/KFDTestFlags.hpp @@ -59,6 +59,7 @@ enum KfdFamilyId { FAMILY_AI, // Arctic Islands FAMILY_RV, // Raven FAMILY_AR, // Arcturus + FAMILY_AL, // 
Aldebaran FAMILY_NV, // Navi10 }; diff --git a/tests/kfdtest/src/KFDTestUtil.cpp b/tests/kfdtest/src/KFDTestUtil.cpp index b55cd80247..c4ff186686 100644 --- a/tests/kfdtest/src/KFDTestUtil.cpp +++ b/tests/kfdtest/src/KFDTestUtil.cpp @@ -149,8 +149,10 @@ unsigned int FamilyIdFromNode(const HsaNodeProperties *props) { familyId = FAMILY_AI; if (props->EngineId.ui32.Stepping == 2) familyId = FAMILY_RV; - if (props->EngineId.ui32.Stepping == 8) + else if (props->EngineId.ui32.Stepping == 8) familyId = FAMILY_AR; + else if (props->EngineId.ui32.Stepping == 10) + familyId = FAMILY_AL; break; case 10: familyId = FAMILY_NV; From 51b6bcf40d39918fc935274a436ffa0fa69fd05d Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Thu, 12 Dec 2019 20:23:10 -0500 Subject: [PATCH 03/39] kfdtest: Support gfx90a Change-Id: I879ea534729e7adca4892be897dc86f6153aa190 Signed-off-by: Yong Zhao --- tests/kfdtest/scripts/kfdtest.exclude | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude index 9ecbcc1b55..cc307c1300 100644 --- a/tests/kfdtest/scripts/kfdtest.exclude +++ b/tests/kfdtest/scripts/kfdtest.exclude @@ -192,6 +192,11 @@ FILTER[arcturus]=\ "KFDQMTest.BasicCuMaskingEven:"\ "KFDEvictTest.BurstyTest" +FILTER[aldebaran]=\ +"$BLACKLIST_ALL_ASICS:"\ +"KFDExceptionTest.FaultStorm:"\ +"KFDEvictTest.BurstyTest" + FILTER[navi10]=\ "$BLACKLIST_ALL_ASICS:"\ "KFDMemoryTest.MMBench" From 8881075ab2a3d547474dd92687e640a492d23029 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 20 Apr 2020 18:09:16 -0400 Subject: [PATCH 04/39] kfdtest: Improve the message when CWSR basic test does not pass This will give out more info. 
Change-Id: I407422b84bebdf39a886c57736093a035ff02118 Signed-off-by: Yong Zhao --- tests/kfdtest/src/KFDCWSRTest.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/kfdtest/src/KFDCWSRTest.cpp b/tests/kfdtest/src/KFDCWSRTest.cpp index 5a4dbb4340..da2ad9bbd8 100644 --- a/tests/kfdtest/src/KFDCWSRTest.cpp +++ b/tests/kfdtest/src/KFDCWSRTest.cpp @@ -188,11 +188,13 @@ TEST_F(KFDCWSRTest, BasicTest) { int i; for (i = 0 ; i < wave_number; ++i) { if (result1[i] != count1) { - LOG() << "Dispatch 1, work item " << i << ' ' << result1[i] << std::endl; + LOG() << "Dispatch 1, work item [" << std::dec << i << "] " + << result1[i] << " != " << count1 << std::endl; break; } if (result2[i] != count2) { - LOG() << "Dispatch 2, work item " << i << ' ' << result2[i] << std::endl; + LOG() << "Dispatch 2, work item [" << std::dec << i << "] " + << result2[i] << " != " << count2 << std::endl; break; } } From 2464bfc7149b4003be849c85bffb71fb74893658 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Tue, 12 May 2020 15:12:52 -0400 Subject: [PATCH 05/39] libhsakmt: add new flag for memory mapped as uncached It is to provide an option to map specific memory as uncached on A+A HW platform. Signed-off-by: Eric Huang Change-Id: Ib665cb306a0e78aba3ea5ee2f0e46cb62ae139f8 --- include/hsakmttypes.h | 3 ++- include/linux/kfd_ioctl.h | 1 + src/fmm.c | 10 ++++++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h index 7a7dc2ad10..b821268c50 100644 --- a/include/hsakmttypes.h +++ b/include/hsakmttypes.h @@ -541,7 +541,8 @@ typedef struct _HsaMemFlags // and optimal alignment requirements unsigned int FixedAddress : 1; // Allocate memory at specified virtual address. Fail if address is not free. 
unsigned int NoNUMABind: 1; // Don't bind system memory to a specific NUMA node - unsigned int Reserved : 15; + unsigned int Uncached: 1; // Caching flag for fine-grained memory on A+A HW platform + unsigned int Reserved : 14; } ui32; HSAuint32 Value; diff --git a/include/linux/kfd_ioctl.h b/include/linux/kfd_ioctl.h index 6a7dd1f44a..aa00e919dd 100644 --- a/include/linux/kfd_ioctl.h +++ b/include/linux/kfd_ioctl.h @@ -479,6 +479,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) +#define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) /* Allocate memory for later SVM (shared virtual memory) mapping. * diff --git a/src/fmm.c b/src/fmm.c index 6867c69599..5fe6518ea6 100644 --- a/src/fmm.c +++ b/src/fmm.c @@ -1334,8 +1334,11 @@ void *fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInB aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture; } - if (!flags.ui32.CoarseGrain || svm.disable_cache) + if (!flags.ui32.CoarseGrain || svm.disable_cache) { ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; + if (flags.ui32.Uncached) + ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; + } mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset, ioc_flags, &vm_obj); @@ -1540,8 +1543,11 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, void *address, else aperture = svm.dgpu_alt_aperture; /* always coherent */ - if (!flags.ui32.CoarseGrain || svm.disable_cache) + if (!flags.ui32.CoarseGrain || svm.disable_cache) { ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; + if (flags.ui32.Uncached) + ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; + } ioc_flags |= fmm_translate_hsa_to_ioc_flags(flags); if (flags.ui32.AQLQueueMemory) From 87f62056f1b613e4e017ebe2480bb3d2d478573e Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Sun, 26 Apr 2020 18:17:32 -0400 Subject: [PATCH 06/39] kfdtest: Move the package definitions to 
the beginning in CMakeLists.txt This ensures that similar logic stays together. Signed-off-by: Yong Zhao Change-Id: I32695d7d6a7366bcbf4169e22119d768d111c633 --- tests/kfdtest/CMakeLists.txt | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/kfdtest/CMakeLists.txt b/tests/kfdtest/CMakeLists.txt index d92a6154ad..06eb821b17 100644 --- a/tests/kfdtest/CMakeLists.txt +++ b/tests/kfdtest/CMakeLists.txt @@ -38,6 +38,16 @@ set ( CPACK_PACKAGE_VERSION_MINOR "0" ) set ( CPACK_PACKAGE_VERSION_PATCH "0" ) set ( CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" ) +## Define default variable and variables for the optional build target hsakmt-dev +set ( SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Location of hsakmt source code." ) +set ( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE STRING "Default installation directory." ) +set ( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" CACHE STRING "Default packaging prefix." ) +set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Default packaging generators." ) + +# Debian package specific variables +set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" ) + + #set ( CMAKE_VERBOSE_MAKEFILE on ) find_package(PkgConfig) @@ -57,15 +67,6 @@ else() include_directories(${DRM_AMDGPU_INCLUDE_DIRS}) endif() -## Define default variable and variables for the optional build target hsakmt-dev -set ( SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Location of hsakmt source code." ) -set ( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE STRING "Default installation directory." ) -set ( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" CACHE STRING "Default packaging prefix." ) -set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Default packaging generators."
) - -# Debian package specific variables -set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" ) - if( DEFINED ENV{LIBHSAKMT_PATH} ) set ( LIBHSAKMT_PATH $ENV{LIBHSAKMT_PATH} ) message ( "LIBHSAKMT_PATH environment variable is set" ) From 33c34506fa3a94c253502028a23c8f78dc318c66 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Sun, 26 Apr 2020 18:21:01 -0400 Subject: [PATCH 07/39] kfdtest: Fix a path error in CMakeLists.txt PKG_CONFIG_PATH environment variable should be set to ${ROCM_INSTALL_PATH}/share/pkgconfig (/opt/rocm/share/pkgconfig by default), because the *.pc file is located there. Signed-off-by: Yong Zhao Change-Id: Iec503b1c2409987e52fd88fea160c70762686a28 --- tests/kfdtest/CMakeLists.txt | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/kfdtest/CMakeLists.txt b/tests/kfdtest/CMakeLists.txt index 06eb821b17..63627ef04b 100644 --- a/tests/kfdtest/CMakeLists.txt +++ b/tests/kfdtest/CMakeLists.txt @@ -72,15 +72,14 @@ if( DEFINED ENV{LIBHSAKMT_PATH} ) message ( "LIBHSAKMT_PATH environment variable is set" ) else() if ( ${ROCM_INSTALL_PATH} ) - set ( ENV{PKG_CONFIG_PATH} ${ROCM_INSTALL_PATH} ) - pkg_check_modules(HSAKMT libhsakmt) + set ( ENV{PKG_CONFIG_PATH} ${ROCM_INSTALL_PATH}/share/pkgconfig ) else() - set ( ENV{PKG_CONFIG_PATH} /opt/rocm/libhsakmt/ ) - pkg_check_modules(HSAKMT libhsakmt) + set ( ENV{PKG_CONFIG_PATH} /opt/rocm/share/pkgconfig ) endif() + + pkg_check_modules(HSAKMT libhsakmt) + if( NOT HSAKMT_FOUND ) - set ( ENV{PKG_CONFIG_PATH} /opt/rocm/libhsakmt/ ) - pkg_check_modules(HSAKMT libhsakmt) set ( LIBHSAKMT_PATH $ENV{OUT_DIR} ) endif() endif() From 690a1484274a87d56f36cb849500d72a3153b31a Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Thu, 21 May 2020 23:29:41 -0400 Subject: [PATCH 08/39] kfdtest: Add a simple test case to test local memory Given the chance of local memory breakage is so high on emulators, we should use this simple test to check the local memory function.
Signed-off-by: Yong Zhao Change-Id: Ifc48c12e11d75cc777ed7ea13e03bf54c2458e12 --- tests/kfdtest/scripts/kfdtest.exclude | 1 + tests/kfdtest/src/KFDLocalMemoryTest.cpp | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude index cc307c1300..e20c5ad73f 100644 --- a/tests/kfdtest/scripts/kfdtest.exclude +++ b/tests/kfdtest/scripts/kfdtest.exclude @@ -20,6 +20,7 @@ FILTER[core_sws]=\ "KFDQMTest.AllSdmaQueues:"\ "KFDQMTest.AllXgmiSdmaQueues:"\ "KFDQMTest.AllQueues:"\ +"KFDLocalMemoryTest.AccessLocalMem:"\ "KFDEventTest.SignalEvent" # HWS mode diff --git a/tests/kfdtest/src/KFDLocalMemoryTest.cpp b/tests/kfdtest/src/KFDLocalMemoryTest.cpp index 4c86594ab9..33f33dc869 100644 --- a/tests/kfdtest/src/KFDLocalMemoryTest.cpp +++ b/tests/kfdtest/src/KFDLocalMemoryTest.cpp @@ -50,6 +50,28 @@ void KFDLocalMemoryTest::TearDown() { ROUTINE_END } +TEST_F(KFDLocalMemoryTest, AccessLocalMem) { + TEST_START(TESTPROFILE_RUNALL) + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; + + //local memory + HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false, true); + + PM4Queue queue; + + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + + queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0, 0)); + + queue.Wait4PacketConsumption(); + + EXPECT_SUCCESS(queue.Destroy()); + + TEST_END +} + TEST_F(KFDLocalMemoryTest, BasicTest) { TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); TEST_START(TESTPROFILE_RUNALL); From c861873dae35b31e896ade0c0f52e1be43990d23 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Mon, 8 Jun 2020 21:01:52 -0400 Subject: [PATCH 09/39] Add SP3 assembler support for gfx90a. Add updated SP3 static library with support for gfx90a and also add initial corresponding changes in kfdtest. 
Change-Id: I71bc6404ace7f9bf0dd74e712287136aa2b8a03d --- tests/kfdtest/CMakeLists.txt | 1 + tests/kfdtest/sp3/sp3.h | 2 + tests/kfdtest/src/IsaGenerator.cpp | 4 +- tests/kfdtest/src/IsaGenerator_Aldebaran.cpp | 113 +++++++++++++++++++ tests/kfdtest/src/IsaGenerator_Aldebaran.hpp | 49 ++++++++ tests/kfdtest/src/KFDMemoryTest.cpp | 31 ++++- 6 files changed, 197 insertions(+), 3 deletions(-) create mode 100644 tests/kfdtest/src/IsaGenerator_Aldebaran.cpp create mode 100644 tests/kfdtest/src/IsaGenerator_Aldebaran.hpp diff --git a/tests/kfdtest/CMakeLists.txt b/tests/kfdtest/CMakeLists.txt index 63627ef04b..6c12039143 100644 --- a/tests/kfdtest/CMakeLists.txt +++ b/tests/kfdtest/CMakeLists.txt @@ -109,6 +109,7 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp src/GoogleTestExtension.cpp src/IndirectBuffer.cpp src/IsaGenerator.cpp + src/IsaGenerator_Aldebaran.cpp src/IsaGenerator_Gfx10.cpp src/IsaGenerator_Gfx72.cpp src/IsaGenerator_Gfx8.cpp diff --git a/tests/kfdtest/sp3/sp3.h b/tests/kfdtest/sp3/sp3.h index d6235be5d8..e44ee406cf 100644 --- a/tests/kfdtest/sp3/sp3.h +++ b/tests/kfdtest/sp3/sp3.h @@ -108,9 +108,11 @@ struct sp3_shader { uint32_t nsgprs; ///< Number of scalar GPRs used. uint32_t nvgprs; ///< Number of vector GPRs used. uint32_t nsvgprs; ///< Number of shared vector GPRs used. + uint32_t naccvgprs; ///< Number of accumulator vector GPRs used (only available in certain projects). 
uint32_t nsgprs_manual_alloc; uint32_t nvgprs_manual_alloc; uint32_t nsvgprs_manual_alloc; + uint32_t naccvgprs_manual_alloc; uint32_t trap_present; uint32_t user_sgpr_count; uint32_t scratch_en; diff --git a/tests/kfdtest/src/IsaGenerator.cpp b/tests/kfdtest/src/IsaGenerator.cpp index 3534e02159..3e69b5f9df 100644 --- a/tests/kfdtest/src/IsaGenerator.cpp +++ b/tests/kfdtest/src/IsaGenerator.cpp @@ -30,6 +30,7 @@ #include "IsaGenerator_Gfx8.hpp" #include "IsaGenerator_Gfx9.hpp" #include "IsaGenerator_Gfx10.hpp" +#include "IsaGenerator_Aldebaran.hpp" #include "GoogleTestExtension.hpp" @@ -92,8 +93,9 @@ IsaGenerator* IsaGenerator::Create(unsigned int familyId) { case FAMILY_AI: case FAMILY_RV: case FAMILY_AR: - case FAMILY_AL: return new IsaGenerator_Gfx9; + case FAMILY_AL: + return new IsaGenerator_Aldbrn; case FAMILY_NV: return new IsaGenerator_Gfx10; diff --git a/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp b/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp new file mode 100644 index 0000000000..2fcb80fd32 --- /dev/null +++ b/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "IsaGenerator_Aldebaran.hpp" + +#include +#include + +const std::string IsaGenerator_Aldbrn::ASIC_NAME = "ALDEBARAN"; + +/* The binaries are generated from following ISA */ +#if 0 +/* flat_atomic_inc will not support by some PCIE, use flat_atomic_add instead */ +shader atomic_add +asic(ALDEBARAN) +type(CS) + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, 1 + flat_atomic_add v3, v[0:1], v2 slc glc + s_waitcnt 0 + s_endpgm +end + +shader copy_dword +asic(ALDEBARAN) +type(CS) +/* copy the parameters from scalar registers to vector registers */ + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, s2 + v_mov_b32 v3, s3 +/* copy a dword between the passed addresses */ + flat_load_dword v4, v[0:1] slc glc + s_waitcnt 0 + flat_store_dword v[2:3], v4 slc glc + s_endpgm +end + +shader main +asic(ALDEBARAN) +type(CS) +loop: + s_branch loop + s_endpgm +end + + +#endif + +const uint32_t IsaGenerator_Aldbrn::NOOP_ISA[] = { + 0xbf810000 +}; + +const uint32_t IsaGenerator_Aldbrn::COPY_DWORD_ISA[] = { + 0x7e000200, 0x7e020201, + 0x7e040202, 0x7e060203, + 0xdc530000, 0x047f0000, + 0xbf8c0000, 0xdc730000, + 0x007f0402, 0xbf810000 +}; + +const uint32_t IsaGenerator_Aldbrn::INFINITE_LOOP_ISA[] = { + 0xbf82ffff, 0xbf810000 +}; + +const uint32_t IsaGenerator_Aldbrn::ATOMIC_ADD_ISA[] = { + 0x7e000200, 0x7e020201, + 0x7e040281, 0xdd0b0000, + 0x037f0200, 0xbf8c0000, + 0xbf810000, 0x00000000 +}; + +void IsaGenerator_Aldbrn::GetNoopIsa(HsaMemoryBuffer& rBuf) { + std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As()); +} + +void IsaGenerator_Aldbrn::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { + std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As()); +} + +void 
IsaGenerator_Aldbrn::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { + std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As()); +} + +void IsaGenerator_Aldbrn::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { + std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As()); +} + +const std::string& IsaGenerator_Aldbrn::GetAsicName() { + return ASIC_NAME; +} + diff --git a/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp b/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp new file mode 100644 index 0000000000..5571b91c26 --- /dev/null +++ b/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#ifndef _ISAGENERATOR_ALDEBARAN_H_ +#define _ISAGENERATOR_ALDEBARAN_H_ + +#include +#include "IsaGenerator.hpp" + +class IsaGenerator_Aldbrn : public IsaGenerator { + public: + virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); + virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); + virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); + virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); + + protected: + virtual const std::string& GetAsicName(); + + private: + static const std::string ASIC_NAME; + + static const uint32_t NOOP_ISA[]; + static const uint32_t COPY_DWORD_ISA[]; + static const uint32_t INFINITE_LOOP_ISA[]; + static const uint32_t ATOMIC_ADD_ISA[]; +}; + +#endif // _ISAGENERATOR_ALDEBARAN_H_ diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp index aa816b84e4..c5c0c0afde 100644 --- a/tests/kfdtest/src/KFDMemoryTest.cpp +++ b/tests/kfdtest/src/KFDMemoryTest.cpp @@ -108,6 +108,29 @@ wave_size(32)\n\ end\n\ "; +const char* aldbrn_ScratchCopyDword = +"\ +shader ScratchCopyDword\n\ +asic(ALDEBARAN)\n\ +type(CS)\n\ +/*copy the parameters from scalar registers to vector registers*/\n\ + v_mov_b32 v0, s0\n\ + v_mov_b32 v1, s1\n\ + v_mov_b32 v2, s2\n\ + v_mov_b32 v3, s3\n\ +/*set up the scratch parameters. 
This assumes a single 16-reg block.*/\n\ + s_mov_b32 flat_scratch_lo, s4\n\ + s_mov_b32 flat_scratch_hi, s5\n\ +/*copy a dword between the passed addresses*/\n\ + flat_load_dword v4, v[0:1] slc\n\ + s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ + flat_store_dword v[2:3], v4 slc\n\ + \n\ + s_endpgm\n\ + \n\ +end\n\ +"; + /* Continuously poll src buffer and check buffer value @@ -650,8 +673,10 @@ TEST_F(KFDMemoryTest, FlatScratchAccess) { const char *pScratchCopyDword; if (m_FamilyId < FAMILY_AI) pScratchCopyDword = gfx8_ScratchCopyDword; - else if (m_FamilyId < FAMILY_NV) + else if (m_FamilyId < FAMILY_AL) pScratchCopyDword = gfx9_ScratchCopyDword; + else if (m_FamilyId == FAMILY_AL) + pScratchCopyDword = aldbrn_ScratchCopyDword; else pScratchCopyDword = gfx10_ScratchCopyDword; m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer); @@ -1508,8 +1533,10 @@ TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) { const char *pScratchCopyDword; if (m_FamilyId < FAMILY_AI) pScratchCopyDword = gfx8_ScratchCopyDword; - else if (m_FamilyId < FAMILY_NV) + else if (m_FamilyId < FAMILY_AL) pScratchCopyDword = gfx9_ScratchCopyDword; + else if (m_FamilyId == FAMILY_AL) + pScratchCopyDword = aldbrn_ScratchCopyDword; else pScratchCopyDword = gfx10_ScratchCopyDword; From 8c6dd3cbae4756158dd35599ca7585fb18ed651f Mon Sep 17 00:00:00 2001 From: Amber Lin Date: Fri, 19 Jun 2020 15:52:08 -0400 Subject: [PATCH 10/39] libhsakmt: Add device ID used in Simnow Simnow simulator uses 0x7400 as gfx90a's device ID Signed-off-by: Amber Lin Change-Id: I0022509ef643760bc906e537b4fc64f1523fd8bf --- src/topology.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/topology.c b/src/topology.c index 368be3aaa8..dc3c812378 100644 --- a/src/topology.c +++ b/src/topology.c @@ -228,6 +228,7 @@ static const struct hsa_gfxip_table gfxip_lookup_table[] = { { 0x52, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x60, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x62, 9, 0, 10, 1, "Aldebaran", 
CHIP_ALDEBARAN }, + { 0x7400, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, /* Navi10 */ { 0x7310, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 }, { 0x7312, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 }, From da3abfb0f85daa494710a4dbd8e3c80598964e96 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Wed, 17 Jun 2020 17:31:03 -0400 Subject: [PATCH 11/39] Update build script for SP3 static library Update build script and CMakeLists_sp3.txt file as SP3 directory structure has changed. The SP3 source code with gfx90a support is merged into a new branch mukjoshi/sp3_gfx90a. Please make sure to check out this branch before using the build script to generate the static library. Change-Id: I2bf0ade8b2d254cd7648cc8a6d69a83ee51344cd --- .../kfdtest/sp3/lib_helper/CMakeLists_sp3.txt | 24 +++++---- tests/kfdtest/sp3/lib_helper/build_sp3.sh | 2 +- tests/kfdtest/sp3/sp3.h | 53 ++++++++++++++++++- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt b/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt index db59e3716e..ce8a3cb33f 100644 --- a/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt +++ b/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt @@ -39,7 +39,7 @@ set ( SCLIB_SRC ${PROJECT_SOURCE_DIR} ) #endif() include_directories(${SCLIB_SRC}/sp3) -include_directories(${SCLIB_SRC}/sp3/release_headers) +#include_directories(${SCLIB_SRC}/sp3/release_headers) include_directories(${SCLIB_SRC}/sp3/gen) set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-asic.c ) @@ -53,15 +53,17 @@ set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-cipher.c ) set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-vm.c ) aux_source_directory(${SCLIB_SRC}/sp3/gen SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/si SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/ci SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/gfx8 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/gfx81 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/gfx9 SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/gfx10 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/release_headers/gfx81 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/release_headers/gfx9 SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/release_headers/gfx10 SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/si/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/ci/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx8/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx81/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx9/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx10/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/aldbrn/lib SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx81/arch SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx9/arch SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx10/arch SRC_FILES) +aux_source_directory(${SCLIB_SRC}/sp3/backend/aldbrn/arch SRC_FILES) message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} ) @@ -70,7 +72,7 @@ message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} ) # message(STATUS "${file}") #endforeach() -set ( CMAKE_C_FLAGS "-DSP3_STATIC_LIB -Wno-error -DPUBLIC_RELEASE -DLITTLEENDIAN_CPU -fPIC -DGFX10_BUILD" ) +set ( CMAKE_C_FLAGS "-DSP3_STATIC_LIB -Wno-error -DPUBLIC_RELEASE -DLITTLEENDIAN_CPU -fPIC -DGFX101_BUILD -DALDBRN_BUILD" ) add_library(amdsp3 ${SRC_FILES}) diff --git a/tests/kfdtest/sp3/lib_helper/build_sp3.sh b/tests/kfdtest/sp3/lib_helper/build_sp3.sh index 7cd20ccfb5..f93f145da6 100755 --- a/tests/kfdtest/sp3/lib_helper/build_sp3.sh +++ b/tests/kfdtest/sp3/lib_helper/build_sp3.sh @@ -44,7 +44,7 @@ popd rsync --progress -a build/libamdsp3.a $LIB_OUTPUT # Put the intermediate header files in the current folder for further processing -rsync --progress -a $SP3_PROJECT/sp3/sp3.h . +rsync --progress -a $SP3_PROJECT/sp3/public/lib/sp3.h . 
# Remove the build folder and CMakeLists.txt put into SP source folder rm -r build diff --git a/tests/kfdtest/sp3/sp3.h b/tests/kfdtest/sp3/sp3.h index e44ee406cf..513167d595 100644 --- a/tests/kfdtest/sp3/sp3.h +++ b/tests/kfdtest/sp3/sp3.h @@ -54,7 +54,9 @@ enum sp3_shtype { SP3_SHTYPE_HS = 4, SP3_SHTYPE_LS = 5, SP3_SHTYPE_CS = 6, +#ifdef NAVI10LITE_BUILD SP3_SHTYPE_ACV = 7, +#endif }; /// Assorted constants used by sp3 API. @@ -107,7 +109,7 @@ struct sp3_shader { uint32_t size; ///< Size of the compiled shader, in 32-bit words. uint32_t nsgprs; ///< Number of scalar GPRs used. uint32_t nvgprs; ///< Number of vector GPRs used. - uint32_t nsvgprs; ///< Number of shared vector GPRs used. + uint32_t nsvgprs; ///< Number of shared vector GPRs used (only available in certain projects). uint32_t naccvgprs; ///< Number of accumulator vector GPRs used (only available in certain projects). uint32_t nsgprs_manual_alloc; uint32_t nvgprs_manual_alloc; @@ -211,6 +213,13 @@ SP3_EXPORT struct sp3_context *sp3_new(void); /// /// Currently supported options: /// +/// stdlib (string) -- absolute path to standard library files. May be a colon-separated list +/// of paths that will be used to search for stdlib files. Used by sp3_parse_library(). +/// +/// The following options are deprecated because they take integer arguments; you should use +/// sp3_set_option_int() for these settings going forward. They will continue to be accepted by +/// this API to support legacy users. +/// /// Werror (boolean) -- indicates whether warnings should be treated as errors. /// /// wave_size (integer) -- sets the wave size being used by the draw calls that will be using @@ -222,11 +231,53 @@ SP3_EXPORT struct sp3_context *sp3_new(void); /// /// omit_code_end (boolean) -- omit generation of the S_CODE_END footer. /// +/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders. 
This is a +/// dangerous option to allow in general so you must explicitly enable this option, otherwise +/// the raw_bits() function will always error out. +/// SP3_EXPORT void sp3_set_option( struct sp3_context *state, const char *option, const char *value); +/// Set option for sp3. +/// +/// @param state sp3 context. +/// @param option Option name. Unknown options will raise an error. +/// @param value Option value. +/// +/// Currently supported options: +/// +/// Werror (boolean) -- indicates whether warnings should be treated as errors. +/// +/// wave_size (integer) -- sets the wave size being used by the draw calls that will be using +/// this shader. Ignored in certain ASICs. You may set this to 32, 64 or the special value 0 +/// to indicate no preference on wave size. The shader will be checked to ensure it is +/// compatible with the size specified here. +/// +/// omit_version (boolean) -- omit generation of the S_VERSION opcode. +/// +/// omit_code_end (boolean) -- omit generation of the S_CODE_END footer. +/// +/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders. This is a +/// dangerous option to allow in general so you must explicitly enable this option, otherwise +/// the raw_bits() function will always error out. +/// +/// secure_mode (boolean) -- run in secure mode. Disables macro language features in assembly +/// path including calls to custom functions. Useful if sp3 is used as a backend to a web-based +/// assembly tool. +/// +/// debug_encoding (boolean) -- if true, debug encoding selection logic for assembly. Only +/// supported in 10.4+ backends. +/// +/// no_vs_export_check (boolean) -- if true, disable VS export sanity check. Only supported in +/// 10.4+ backends. +/// +SP3_EXPORT void sp3_set_option_int( + struct sp3_context *state, + const char *option, + int32_t value); + /// Parse a file into a context. /// /// Use sp3_compile to generate binary microcode after the shader is parsed. 
From 97ae33f9de9b3ecca7398291f1a7ca276fd9f7c8 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Thu, 9 Jul 2020 11:38:57 -0500 Subject: [PATCH 12/39] Add gfx90a Gopher LSE DID (0x54) Change-Id: Ic0a1e3d01373e0d6ba58e42188dced394423de82 Signed-off-by: Oak Zeng --- src/topology.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/topology.c b/src/topology.c index dc3c812378..eb212f6c5c 100644 --- a/src/topology.c +++ b/src/topology.c @@ -226,6 +226,7 @@ static const struct hsa_gfxip_table gfxip_lookup_table[] = { { 0x50, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x51, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x52, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, + { 0x54, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x60, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x62, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x7400, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, From f398d6d20408c3d23974b66b89db719a03ba152c Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Thu, 16 Jul 2020 15:28:38 -0400 Subject: [PATCH 13/39] libhsakmt: add host trap send Adding host trap send command. Signed-off-by: Jonathan Kim Change-Id: I291c13f5905e00bc6685a980284a6abd0c98da78 --- include/hsakmt.h | 10 ++++++++++ include/linux/kfd_ioctl.h | 9 +++++++++ src/debug.c | 21 +++++++++++++++++++++ src/libhsakmt.ver | 1 + 4 files changed, 41 insertions(+) diff --git a/include/hsakmt.h b/include/hsakmt.h index 237f80bbc7..dd59962bca 100644 --- a/include/hsakmt.h +++ b/include/hsakmt.h @@ -844,6 +844,16 @@ hsaKmtGetQueueSnapshot( HSAuint32 *QssEntries // IN/OUT ); +/** + Send the host trap +*/ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSendHostTrap( + HSAuint32 NodeId, //IN + HSAuint32 Pid //IN + ); + /** Set the trap override mask. 
When debug trap is enabled by hsaKmtEnableDebugTrap() each wave launched has its initial diff --git a/include/linux/kfd_ioctl.h b/include/linux/kfd_ioctl.h index aa00e919dd..e2c4467cbd 100644 --- a/include/linux/kfd_ioctl.h +++ b/include/linux/kfd_ioctl.h @@ -224,6 +224,7 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_DBG_EV_STATUS_VMFAULT 2 #define KFD_DBG_EV_STATUS_SUSPENDED 4 #define KFD_DBG_EV_STATUS_NEW_QUEUE 8 +#define KFD_DBG_EV_STATUS_HOST_TRAP_TIMEDOUT 16 #define KFD_DBG_EV_FLAG_CLEAR_STATUS 1 #define KFD_INVALID_QUEUEID 0xffffffff @@ -309,6 +310,14 @@ struct kfd_ioctl_dbg_wave_control_args { */ #define KFD_IOC_DBG_TRAP_SET_ADDRESS_WATCH 9 +/* KFD_IOC_DBG_TRAP_SEND_HOST_TRAP: + * ptr: unused + * data1: unused + * data2: unused + * data3: unused + */ +#define KFD_IOC_DBG_TRAP_SEND_HOST_TRAP 10 + struct kfd_ioctl_dbg_trap_args { __u64 ptr; /* to KFD -- used for pointer arguments: queue arrays */ __u32 pid; /* to KFD */ diff --git a/src/debug.c b/src/debug.c index caddc79a8e..5ed6631932 100644 --- a/src/debug.c +++ b/src/debug.c @@ -731,3 +731,24 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtClearAddressWatch( NULL); return result; } + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSendHostTrap( + HSAuint32 NodeId, //IN + HSAuint32 Pid //IN + ) +{ + int result; + + result = debug_trap(NodeId, + KFD_IOC_DBG_TRAP_SEND_HOST_TRAP, + 0, + 0, + 0, + Pid, + 0, + NULL); + + return result; +} diff --git a/src/libhsakmt.ver b/src/libhsakmt.ver index d47f68d2d0..a3203e58c3 100644 --- a/src/libhsakmt.ver +++ b/src/libhsakmt.ver @@ -61,6 +61,7 @@ hsaKmtEnableDebugTrap; hsaKmtEnableDebugTrapWithPollFd; hsaKmtDisableDebugTrap; hsaKmtQueryDebugEvent; +hsaKmtSendHostTrap; hsaKmtGetQueueSnapshot; hsaKmtSetWaveLaunchTrapOverride; hsaKmtSetWaveLaunchMode; From 198b5bd450328477ea43d5692180aafd4d66ebb9 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Thu, 27 Aug 2020 17:17:43 -0400 Subject: [PATCH 14/39] kfdtest: add function to determine XGMI link to cpu Signed-off-by: Eric Huang Change-Id:
I7650f7857f0eecd2ad587634ae11c1cf5116bd97 --- tests/kfdtest/src/KFDTestUtil.cpp | 23 +++++++++++++++++++++++ tests/kfdtest/src/KFDTestUtil.hpp | 5 +++++ 2 files changed, 28 insertions(+) diff --git a/tests/kfdtest/src/KFDTestUtil.cpp b/tests/kfdtest/src/KFDTestUtil.cpp index c4ff186686..c3a528d9c5 100644 --- a/tests/kfdtest/src/KFDTestUtil.cpp +++ b/tests/kfdtest/src/KFDTestUtil.cpp @@ -663,3 +663,26 @@ int HsaNodeInfo::FindAccessiblePeers(std::vector *peers, HSAuint32 ds return peers->size(); } + +const bool HsaNodeInfo::IsNodeXGMItoCPU(int node) const { + const HsaNodeProperties *pNodeProperties; + bool ret = false; + + pNodeProperties = GetNodeProperties(node); + if (pNodeProperties && pNodeProperties->NumIOLinks) { + HsaIoLinkProperties *IolinkProperties = new HsaIoLinkProperties[pNodeProperties->NumIOLinks]; + EXPECT_SUCCESS(hsaKmtGetNodeIoLinkProperties(node, pNodeProperties->NumIOLinks, IolinkProperties)); + + for (int linkId = 0; linkId < pNodeProperties->NumIOLinks; linkId++) { + EXPECT_EQ(node, IolinkProperties[linkId].NodeFrom); + const HsaNodeProperties *pNodeProperties0 = + GetNodeProperties(IolinkProperties[linkId].NodeTo); + if (pNodeProperties0->NumFComputeCores == 0 && + IolinkProperties[linkId].IoLinkType == HSA_IOLINK_TYPE_XGMI) + ret = true; + } + delete [] IolinkProperties; + } + + return ret; +} diff --git a/tests/kfdtest/src/KFDTestUtil.hpp b/tests/kfdtest/src/KFDTestUtil.hpp index 28847f370e..e640d588fc 100644 --- a/tests/kfdtest/src/KFDTestUtil.hpp +++ b/tests/kfdtest/src/KFDTestUtil.hpp @@ -196,6 +196,11 @@ class HsaNodeInfo { const bool AreGPUNodesXGMI(int node0, int node1) const; int FindAccessiblePeers(std::vector *peers, HSAuint32 dstNode, bool bidirectional) const; + /* @brief: to determine if the node is XGMI-linked to CPU + * @param: node index of the node we are looking at + * @return: bool true or false + */ + const bool IsNodeXGMItoCPU(int node) const; }; #endif // __KFD__TEST__UTIL__H__ From 
4b3b941bb36b4a35006c00c59a2488d60447c13a Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Thu, 8 Oct 2020 16:18:11 -0400 Subject: [PATCH 15/39] libhsakmt: add device id(0x46) for gfx90a mGPU model in topology Signed-off-by: Eric Huang Change-Id: I43f7c12906c408576e1eb55871d51e7a30569ede --- src/topology.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/topology.c b/src/topology.c index eb212f6c5c..a884e41c67 100644 --- a/src/topology.c +++ b/src/topology.c @@ -230,6 +230,7 @@ static const struct hsa_gfxip_table gfxip_lookup_table[] = { { 0x60, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x62, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x7400, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, + { 0x46, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, /* Navi10 */ { 0x7310, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 }, { 0x7312, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 }, From 18ead8815c3eaf673a0b543ec23629254510b436 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Wed, 14 Oct 2020 11:07:15 -0400 Subject: [PATCH 16/39] KFDTest: fix an exception bug in P2PTest The largebar check will exit exceptionally from test when destination node is not set. 
Signed-off-by: Eric Huang Change-Id: I8bf0fed613250cc71468208e645fc562fb1a8757 --- tests/kfdtest/src/KFDQMTest.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/kfdtest/src/KFDQMTest.cpp b/tests/kfdtest/src/KFDQMTest.cpp index eab1b54885..71768cc2cf 100644 --- a/tests/kfdtest/src/KFDQMTest.cpp +++ b/tests/kfdtest/src/KFDQMTest.cpp @@ -1671,18 +1671,13 @@ TEST_F(KFDQMTest, P2PTest) { HsaMemFlags memFlags = {0}; HsaMemMapFlags mapFlags = {0}; memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB; - memFlags.ui32.HostAccess = 1; + memFlags.ui32.HostAccess = 0; memFlags.ui32.NonPaged = 1; memFlags.ui32.NoNUMABind = 1; unsigned int end = size / sizeof(HSAuint32) - 1; - if (!m_NodeInfo.IsGPUNodeLargeBar(g_TestDstNodeId) && - m_NodeInfo.AreGPUNodesXGMI(g_TestNodeId, g_TestDstNodeId)) { - memFlags.ui32.HostAccess = 0; - } - /* 1. Allocate a system buffer and allow the access to GPUs */ - EXPECT_SUCCESS(hsaKmtAllocMemory(0, size, memFlags, + EXPECT_SUCCESS(hsaKmtAllocMemory(0, size, m_MemoryFlags, reinterpret_cast(&sysBuf))); EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(sysBuf, size, NULL, mapFlags, nodes.size(), &nodes[0])); From 731a06c7047a67f996b3e1d951b3607d43fdf313 Mon Sep 17 00:00:00 2001 From: Kent Russell Date: Thu, 13 Aug 2020 11:01:21 -0400 Subject: [PATCH 17/39] Fix GCC warning regarding strncpy in CPU info strlen(src) should not be used as the length in strncpy. 
Use memcpy since we know the length of the string, and ensure that we NULL-terminate regardless of length Signed-off-by: Kent Russell Change-Id: I21cc6d106510c69464e7ac9d3fc7da3a1e6d1a68 --- src/topology.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/topology.c b/src/topology.c index a884e41c67..e5fb85e406 100644 --- a/src/topology.c +++ b/src/topology.c @@ -888,13 +888,10 @@ static HSAKMT_STATUS topology_parse_cpuinfo(struct proc_cpuinfo *cpuinfo, if (!strncmp("model name", read_buf, sizeof("model name") - 1)) { p = strchr(read_buf, ':'); p += 2; /* remove ": " */ - p_len = strlen(p); - if (p_len < HSA_PUBLIC_NAME_SIZE) { - /* -1 to remove \n from p */ - strncpy(cpuinfo[proc].model_name, p, p_len - 1); - cpuinfo[proc].model_name[p_len - 1] = '\0'; - } else - strncpy(cpuinfo[proc].model_name, p, HSA_PUBLIC_NAME_SIZE); + p_len = (strlen(p) > HSA_PUBLIC_NAME_SIZE ? + HSA_PUBLIC_NAME_SIZE : strlen(p)); + memcpy(cpuinfo[proc].model_name, p, p_len); + cpuinfo[proc].model_name[p_len - 1] = '\0'; continue; } From 5ae49f2321ae2287aeae8339941997a20fed8241 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 25 Nov 2019 13:24:36 -0500 Subject: [PATCH 18/39] libhsakmt: add kfd_ioctl.h svm and xnack support Add svm (shared virtual memory) range and xnack mode APIs. 
Change-Id: Ibd8d7fe566dc200730da0c892caa71aad7589ebd Signed-off-by: Philip Yang Signed-off-by: Alex Sierra --- include/linux/kfd_ioctl.h | 167 +++++++++++++++++++++++++++++++++++++- 1 file changed, 166 insertions(+), 1 deletion(-) diff --git a/include/linux/kfd_ioctl.h b/include/linux/kfd_ioctl.h index e2c4467cbd..a8be1df07d 100644 --- a/include/linux/kfd_ioctl.h +++ b/include/linux/kfd_ioctl.h @@ -638,6 +638,166 @@ struct kfd_ioctl_cross_memory_copy_args { __u64 bytes_copied; }; + +/* Guarantee host access to memory */ +#define KFD_IOCTL_SVM_FLAG_HOST_ACCESS 0x00000001 +/* Fine grained coherency between all devices with access */ +#define KFD_IOCTL_SVM_FLAG_COHERENT 0x00000002 +/* Use any GPU in same hive as preferred device */ +#define KFD_IOCTL_SVM_FLAG_HIVE_LOCAL 0x00000004 +/* GPUs only read, allows replication */ +#define KFD_IOCTL_SVM_FLAG_GPU_RO 0x00000008 +/* Allow execution on GPU */ +#define KFD_IOCTL_SVM_FLAG_GPU_EXEC 0x00000010 + +/** + * kfd_ioctl_svm_op - SVM ioctl operations + * + * @KFD_IOCTL_SVM_OP_SET_ATTR: Modify one or more attributes + * @KFD_IOCTL_SVM_OP_GET_ATTR: Query one or more attributes + */ +enum kfd_ioctl_svm_op { + KFD_IOCTL_SVM_OP_SET_ATTR, + KFD_IOCTL_SVM_OP_GET_ATTR +}; + +/** + * kfd_ioctl_svm_attr_type - SVM attribute types + * + * @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: gpuid of the preferred location, 0 for + * system memory + * @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: gpuid of the prefetch location, 0 for + * system memory. Setting this triggers an + * immediate prefetch (migration). + * @KFD_IOCTL_SVM_ATTR_ACCESS: + * @KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: + * @KFD_IOCTL_SVM_ATTR_NO_ACCESS: specify memory access for the gpuid given + * by the attribute value + * @KFD_IOCTL_SVM_ATTR_SET_FLAGS: bitmask of flags to set (see + * KFD_IOCTL_SVM_FLAG_...) 
+ * @KFD_IOCTL_SVM_ATTR_CLR_FLAGS: bitmask of flags to clear + * @KFD_IOCTL_SVM_ATTR_GRANULARITY: migration granularity + * (log2 num pages) + */ +enum kfd_ioctl_svm_attr_type { + KFD_IOCTL_SVM_ATTR_PREFERRED_LOC, + KFD_IOCTL_SVM_ATTR_PREFETCH_LOC, + KFD_IOCTL_SVM_ATTR_ACCESS, + KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE, + KFD_IOCTL_SVM_ATTR_NO_ACCESS, + KFD_IOCTL_SVM_ATTR_SET_FLAGS, + KFD_IOCTL_SVM_ATTR_CLR_FLAGS, + KFD_IOCTL_SVM_ATTR_GRANULARITY +}; + +/** kfd_ioctl_svm_location - Enum for preferred and prefetch locations + * + * GPU IDs are used to specify GPUs as preferred and prefetch locations. + * Below definitions are used for system memory or for leaving the preferred + * location unspecified. + */ +enum kfd_ioctl_svm_location { + KFD_IOCTL_SVM_LOCATION_SYSMEM = 0, + KFD_IOCTL_SVM_LOCATION_UNDEFINED = 0xffffffff +}; + +/** + * kfd_ioctl_svm_attribute - Attributes as pairs of type and value + * + * The meaning of the @value depends on the attribute type. + * + * @type: attribute type (see enum @kfd_ioctl_svm_attr_type) + * @value: attribute value + */ +struct kfd_ioctl_svm_attribute { + __u32 type; + __u32 value; +}; + +/** + * kfd_ioctl_svm_args - Arguments for SVM ioctl + * + * @op specifies the operation to perform (see enum + * @kfd_ioctl_svm_op). @start_addr and @size are common for all + * operations. + * + * A variable number of attributes can be given in @attrs. + * @nattr specifies the number of attributes. New attributes can be + * added in the future without breaking the ABI. If unknown attributes + * are given, the function returns -EINVAL. + * + * @KFD_IOCTL_SVM_OP_SET_ATTR sets attributes for a virtual address + * range. It may overlap existing virtual address ranges. If it does, + * the existing ranges will be split such that the attribute changes + * only apply to the specified address range. 
+ * + * @KFD_IOCTL_SVM_OP_GET_ATTR returns the intersection of attributes + * over all memory in the given range and returns the result as the + * attribute value. If different pages have different preferred or + * prefetch locations, 0xffffffff will be returned for + * @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC or + * @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC respectively. For + * @KFD_IOCTL_SVM_ATTR_SET_FLAGS, flags of all pages will be + * aggregated by bitwise AND. The minimum migration granularity + * throughout the range will be returned for + * @KFD_IOCTL_SVM_ATTR_GRANULARITY. + * + * Querying of accessibility attributes works by initializing the + * attribute type to @KFD_IOCTL_SVM_ATTR_ACCESS and the value to the + * GPUID being queried. Multiple attributes can be given to allow + * querying multiple GPUIDs. The ioctl function overwrites the + * attribute type to indicate the access for the specified GPU. + * + * @KFD_IOCTL_SVM_ATTR_CLR_FLAGS is invalid for + * @KFD_IOCTL_SVM_OP_GET_ATTR. + */ +struct kfd_ioctl_svm_args { + __u64 start_addr; + __u64 size; + __u32 op; + __u32 nattr; + /* Variable length array of attributes */ + struct kfd_ioctl_svm_attribute attrs[0]; +}; + +/** + * kfd_ioctl_set_xnack_mode_args - Arguments for set_xnack_mode + * + * @xnack_enabled: [in/out] Whether to enable XNACK mode for this process + * + * @xnack_enabled indicates whether recoverable page faults should be + * enabled for the current process. 0 means disabled, positive means + * enabled, negative means leave unchanged. If enabled, virtual address + * translations on GFXv9 and later AMD GPUs can return XNACK and retry + * the access until a valid PTE is available. This is used to implement + * device page faults. + * + * On output, @xnack_enabled returns the (new) current mode (0 or + * positive). Therefore, a negative input value can be used to query + * the current mode without changing it.
+ * + * The XNACK mode fundamentally changes the way SVM managed memory works + * in the driver, with subtle effects on application performance and + * functionality. + * + * Enabling XNACK mode requires shader programs to be compiled + * differently. Furthermore, not all GPUs support changing the mode + * per-process. Therefore changing the mode is only allowed while no + * user mode queues exist in the process. This ensures that no shader + * code is running that may be compiled for the wrong mode. And GPUs + * that cannot change to the requested mode will be disabled by + * failing subsequent requests to create user mode queues. + * + * This ioctl returns the status of the requested xnack mode. + * + * GFXv8 or older GPUs do not support 48 bit virtual addresses or SVM. + * + * Return: 0 on success, -errno on failure + */ +struct kfd_ioctl_set_xnack_mode_args { + __s32 xnack_enabled; +}; + #define AMDKFD_IOCTL_BASE 'K' #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) @@ -735,8 +895,13 @@ struct kfd_ioctl_cross_memory_copy_args { #define AMDKFD_IOC_ALLOC_QUEUE_GWS \ AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) +#define AMDKFD_IOC_SVM AMDKFD_IOWR(0x20, struct kfd_ioctl_svm_args) + +#define AMDKFD_IOC_SET_XNACK_MODE \ + AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x1F +#define AMDKFD_COMMAND_END 0x22 /* non-upstream ioctls */ #define AMDKFD_IOC_IPC_IMPORT_HANDLE \ From a352639df5b281100ad4beed24756312d449c458 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Mon, 23 Nov 2020 20:55:30 -0600 Subject: [PATCH 19/39] libhsakmt: add API to support svm and xnack Add function definitions to support SVM (shared virtual memory) and xnack set.
Change-Id: Ia97ad9d0c449d8d500d799f702e1a58e87d65a56 Signed-off-by: Alex Sierra Signed-off-by: Philip Yang Signed-off-by: Felix Kuehling --- include/hsakmt.h | 31 +++++++++++++++++++++++++++++++ include/hsakmttypes.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/include/hsakmt.h b/include/hsakmt.h index dd59962bca..51f2467d29 100644 --- a/include/hsakmt.h +++ b/include/hsakmt.h @@ -1219,6 +1219,37 @@ hsaKmtSetMemoryUserData( void * UserData //IN ); +/* Helper functions for calling KFD SVM ioctl */ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSVMSetAttr( + void *start_addr, // IN: Start of the virtual address range (page-aligned) + HSAuint64 size, // IN: size (page-aligned) + unsigned int nattr, // IN: number of attributes + HSA_SVM_ATTRIBUTE *attrs // IN: array of attributes +); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSVMGetAttr( + void *start_addr, // IN: Start of the virtual address range (page-aligned) + HSAuint64 size, // IN: size (page-aligned) + unsigned int nattr, // IN: number of attributes + HSA_SVM_ATTRIBUTE *attrs // IN/OUT: array of attributes +); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetXNACKMode( + HSAint32 enable // IN: enable/disable XNACK mode. +); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetXNACKMode( + HSAint32 * enable // OUT: returns XNACK value.
+); + #ifdef __cplusplus } //extern "C" #endif diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h index b821268c50..4d7b7d8d73 100644 --- a/include/hsakmttypes.h +++ b/include/hsakmttypes.h @@ -1291,6 +1291,35 @@ typedef struct _HsaMemoryRange { HSAuint64 SizeInBytes; // Size of above memory } HsaMemoryRange; +typedef enum _HSA_SVM_FLAGS { + HSA_SVM_FLAG_HOST_ACCESS = 0x00000001, // Guarantee host access to memory + HSA_SVM_FLAG_COHERENT = 0x00000002, // Fine grained coherency between all devices with access + HSA_SVM_FLAG_HIVE_LOCAL = 0x00000004, // Use any GPU in same hive as preferred device + HSA_SVM_FLAG_GPU_RO = 0x00000008, // GPUs only read, allows replication + HSA_SVM_FLAG_GPU_EXEC = 0x00000010, // Allow execution on GPU +} HSA_SVM_FLAGS; + +typedef enum _HSA_SVM_ATTR_TYPE { + HSA_SVM_ATTR_PREFERRED_LOC, // gpuid of the preferred location, 0 for + // system memory, INVALID_NODEID for + // "don't care" + HSA_SVM_ATTR_PREFETCH_LOC, // gpuid of the prefetch location, 0 for + // system memory. Setting this triggers an + // immediate prefetch (migration) + HSA_SVM_ATTR_ACCESS, + HSA_SVM_ATTR_ACCESS_IN_PLACE, + HSA_SVM_ATTR_NO_ACCESS, // specify memory access for the gpuid given + // by the attribute value + HSA_SVM_ATTR_SET_FLAGS, // bitmask of flags to set (see HSA_SVM_FLAGS) + HSA_SVM_ATTR_CLR_FLAGS, // bitmask of flags to clear + HSA_SVM_ATTR_GRANULARITY // migration granularity (log2 num pages) +} HSA_SVM_ATTR_TYPE; + +typedef struct _HSA_SVM_ATTRIBUTE { + HSAuint32 type; // attribute type (see enum HSA_SVM_ATTR_TYPE) + HSAuint32 value; // attribute value +} HSA_SVM_ATTRIBUTE; + #pragma pack(pop, hsakmttypes_h) From 75e8fe383f269ed8aac28c3e7904d234d9417ea2 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Mon, 23 Nov 2020 20:56:02 -0600 Subject: [PATCH 20/39] libhsakmt: add SVM thunk implementation Implement SVM (Shared Virtual Memory) in the thunk. 
Change-Id: I0380150d1d3da48070f9389a06f416d6059d6948 Signed-off-by: Philip Yang Signed-off-by: Sean Keely Signed-off-by: Felix Kuehling Signed-off-by: Alex Sierra --- CMakeLists.txt | 1 + src/libhsakmt.h | 1 + src/libhsakmt.ver | 2 + src/svm.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+) create mode 100644 src/svm.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bfe9e1d73..b31f09c1c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,6 +123,7 @@ set ( HSAKMT_SRC "src/debug.c" "src/time.c" "src/topology.c" "src/rbtree.c" + "src/svm.c" "src/version.c") ## Declare the library target name diff --git a/src/libhsakmt.h b/src/libhsakmt.h index 4fa6f9c6d3..86ec0cb9c0 100644 --- a/src/libhsakmt.h +++ b/src/libhsakmt.h @@ -26,6 +26,7 @@ #ifndef LIBHSAKMT_H_INCLUDED #define LIBHSAKMT_H_INCLUDED +#include "linux/kfd_ioctl.h" #include "hsakmt.h" #include "pci_ids.h" #include diff --git a/src/libhsakmt.ver b/src/libhsakmt.ver index a3203e58c3..25b3a0b725 100644 --- a/src/libhsakmt.ver +++ b/src/libhsakmt.ver @@ -72,6 +72,8 @@ hsaKmtGetKernelDebugTrapVersionInfo; hsaKmtGetThunkDebugTrapVersionInfo; hsaKmtSetAddressWatch; hsaKmtClearAddressWatch; +hsaKmtSVMSetAttr; +hsaKmtSVMGetAttr; local: *; }; diff --git a/src/svm.c b/src/svm.c new file mode 100644 index 0000000000..d4bf896069 --- /dev/null +++ b/src/svm.c @@ -0,0 +1,185 @@ +/* + * Copyright © 2020 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#include "libhsakmt.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Helper functions for calling KFD SVM ioctl */ + +HSAKMT_STATUS HSAKMTAPI +hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) +{ + struct kfd_ioctl_svm_args *args; + HSAuint64 s_attr; + HSAKMT_STATUS r; + HSAuint32 i; + + CHECK_KFD_OPEN(); + + pr_debug("%s: address 0x%p size 0x%lx\n", __func__, start_addr, size); + + if (!start_addr || !size) + return HSAKMT_STATUS_INVALID_PARAMETER; + if ((uint64_t)start_addr & (PAGE_SIZE - 1)) + return HSAKMT_STATUS_INVALID_PARAMETER; + if (size & (PAGE_SIZE - 1)) + return HSAKMT_STATUS_INVALID_PARAMETER; + + s_attr = sizeof(*attrs) * nattr; + args = alloca(sizeof(*args) + s_attr); + + args->start_addr = (uint64_t)start_addr; + args->size = size; + args->op = KFD_IOCTL_SVM_OP_SET_ATTR; + args->nattr = nattr; + memcpy(args->attrs, attrs, s_attr); + + for (i = 0; i < nattr; i++) { + if (attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFERRED_LOC && + attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFETCH_LOC && + attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS && + attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE && + attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS) + continue; + + if (attrs[i].type == KFD_IOCTL_SVM_ATTR_PREFERRED_LOC && + attrs[i].value == INVALID_NODEID) { + args->attrs[i].value = KFD_IOCTL_SVM_LOCATION_UNDEFINED; + continue; + } + + r = validate_nodeid(attrs[i].value, &args->attrs[i].value); + if (r != HSAKMT_STATUS_SUCCESS) { + pr_debug("invalid node ID: %d\n", attrs[i].value); + return r; + } else if (!args->attrs[i].value && + (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS || + attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE || + attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS)) { + pr_debug("CPU node invalid for access attribute\n"); + return HSAKMT_STATUS_INVALID_NODE_UNIT; + } + } + + /* Driver does one copy_from_user, with extra attrs size */ + r = 
kmtIoctl(kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args); + if (r) { + pr_debug("op set range attrs failed %s\n", strerror(errno)); + return HSAKMT_STATUS_ERROR; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) +{ + struct kfd_ioctl_svm_args *args; + HSAuint64 s_attr; + HSAKMT_STATUS r; + HSAuint32 i; + + CHECK_KFD_OPEN(); + + pr_debug("%s: address 0x%p size 0x%lx\n", __func__, start_addr, size); + + if (!start_addr || !size) + return HSAKMT_STATUS_INVALID_PARAMETER; + if ((uint64_t)start_addr & (PAGE_SIZE - 1)) + return HSAKMT_STATUS_INVALID_PARAMETER; + if (size & (PAGE_SIZE - 1)) + return HSAKMT_STATUS_INVALID_PARAMETER; + + s_attr = sizeof(*attrs) * nattr; + args = alloca(sizeof(*args) + s_attr); + + args->start_addr = (uint64_t)start_addr; + args->size = size; + args->op = KFD_IOCTL_SVM_OP_GET_ATTR; + args->nattr = nattr; + memcpy(args->attrs, attrs, s_attr); + + for (i = 0; i < nattr; i++) { + if (attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS && + attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE && + attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS) + continue; + + r = validate_nodeid(attrs[i].value, &args->attrs[i].value); + if (r != HSAKMT_STATUS_SUCCESS) { + pr_debug("invalid node ID: %d\n", attrs[i].value); + return r; + } else if (!args->attrs[i].value) { + pr_debug("CPU node invalid for access attribute\n"); + return HSAKMT_STATUS_INVALID_NODE_UNIT; + } + } + + /* Driver does one copy_from_user, with extra attrs size */ + r = kmtIoctl(kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args); + if (r) { + pr_debug("op get range attrs failed %s\n", strerror(errno)); + return HSAKMT_STATUS_ERROR; + } + + memcpy(attrs, args->attrs, s_attr); + + for (i = 0; i < nattr; i++) { + if (attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFERRED_LOC && + attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFETCH_LOC && + attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS 
&& + attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE && + attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS) + continue; + + switch (attrs[i].value) { + case KFD_IOCTL_SVM_LOCATION_SYSMEM: + attrs[i].value = 0; + break; + case KFD_IOCTL_SVM_LOCATION_UNDEFINED: + attrs[i].value = INVALID_NODEID; + break; + default: + r = gpuid_to_nodeid(attrs[i].value, &attrs[i].value); + if (r != HSAKMT_STATUS_SUCCESS) { + pr_debug("invalid GPU ID: %d\n", + attrs[i].value); + return r; + } + } + } + + return HSAKMT_STATUS_SUCCESS; +} From 3f45f602d404c77eb0b9e1234e73144bba69c910 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Mon, 23 Nov 2020 21:00:49 -0600 Subject: [PATCH 21/39] libhsakmt: add XNACK API set/get mode XNACK API for GPUs that support this mode. This API makes calls to amdgpu driver to configure xnack mode. It supports set xnack mode and query the current mode used. Change-Id: If865fd0e3f900f008243dc49504e1a0694e1791a Signed-off-by: Alex Sierra --- src/libhsakmt.ver | 3 ++- src/svm.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/libhsakmt.ver b/src/libhsakmt.ver index 25b3a0b725..8a695a82fd 100644 --- a/src/libhsakmt.ver +++ b/src/libhsakmt.ver @@ -74,7 +74,8 @@ hsaKmtSetAddressWatch; hsaKmtClearAddressWatch; hsaKmtSVMSetAttr; hsaKmtSVMGetAttr; - +hsaKmtSetXNACKMode; +hsaKmtGetXNACKMode; local: *; }; diff --git a/src/svm.c b/src/svm.c index d4bf896069..478217259e 100644 --- a/src/svm.c +++ b/src/svm.c @@ -183,3 +183,42 @@ hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, return HSAKMT_STATUS_SUCCESS; } + +static HSAKMT_STATUS +hsaKmtSetGetXNACKMode(HSAint32 * enable) +{ + struct kfd_ioctl_set_xnack_mode_args args; + + CHECK_KFD_OPEN(); + + args.xnack_enabled = *enable; + + if (kmtIoctl(kfd_fd, AMDKFD_IOC_SET_XNACK_MODE, &args)) { + if (errno == EPERM) { + pr_debug("set mode not supported %s\n", + strerror(errno)); + return HSAKMT_STATUS_NOT_SUPPORTED; + } else if (errno == EBUSY) { 
+ pr_debug("kmtIoctl queues not empty %s\n", + strerror(errno)); + } + return HSAKMT_STATUS_ERROR; + } + + *enable = args.xnack_enabled; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtSetXNACKMode(HSAint32 enable) +{ + return hsaKmtSetGetXNACKMode(&enable); +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetXNACKMode(HSAint32 * enable) +{ + *enable = -1; + return hsaKmtSetGetXNACKMode(enable); +} From 50debca7e9f52bfbceb6f8d77f164d964b0bbae8 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Wed, 25 Nov 2020 10:06:53 -0600 Subject: [PATCH 22/39] Support gfx90a real asic device id Change-Id: Ib223b4e890899c3c4e468993a88f849bccc5d182 Signed-off-by: Oak Zeng --- src/topology.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/topology.c b/src/topology.c index e5fb85e406..5be3c9af5a 100644 --- a/src/topology.c +++ b/src/topology.c @@ -230,6 +230,9 @@ static const struct hsa_gfxip_table gfxip_lookup_table[] = { { 0x60, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x62, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x7400, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, + { 0x7408, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, + { 0x740C, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, + { 0x740F, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, { 0x46, 9, 0, 10, 1, "Aldebaran", CHIP_ALDEBARAN }, /* Navi10 */ { 0x7310, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 }, From ec7ba38b230e81eb7c52e8cbcdb38cfa55be9256 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Mon, 30 Nov 2020 15:49:23 -0500 Subject: [PATCH 23/39] kfdtest: blacklist KFDMemoryTest.DeviceHdpFlush on gfx90a Due to cache coherence change, the remote vram mapping is changed to cached, the written value by remote shader will not be read by local shader. So the test will fail. 
Signed-off-by: Eric Huang Change-Id: I2b64e8a30bed0066e159bad9bb7febae5ebe84aa --- tests/kfdtest/scripts/kfdtest.exclude | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude index e20c5ad73f..7f6694dd93 100644 --- a/tests/kfdtest/scripts/kfdtest.exclude +++ b/tests/kfdtest/scripts/kfdtest.exclude @@ -196,7 +196,8 @@ FILTER[arcturus]=\ FILTER[aldebaran]=\ "$BLACKLIST_ALL_ASICS:"\ "KFDExceptionTest.FaultStorm:"\ -"KFDEvictTest.BurstyTest" +"KFDEvictTest.BurstyTest:"\ +"KFDMemoryTest.DeviceHdpFlush" FILTER[navi10]=\ "$BLACKLIST_ALL_ASICS:"\ From 1f05b54dc9b835ac370d1d0918d359d157700e8d Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Sun, 29 Nov 2020 20:58:56 -0600 Subject: [PATCH 24/39] Delete device stepping check On every new asic with new stepping, we need to manually relax this checking. This check is not very helpful. Delete it. Change-Id: I11f813023ca2566d82f6d11121d4be38c296674b Signed-off-by: Oak Zeng --- tests/kfdtest/src/KFDTopologyTest.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/kfdtest/src/KFDTopologyTest.cpp b/tests/kfdtest/src/KFDTopologyTest.cpp index 334317e943..c675e4ca88 100644 --- a/tests/kfdtest/src/KFDTopologyTest.cpp +++ b/tests/kfdtest/src/KFDTopologyTest.cpp @@ -58,7 +58,6 @@ TEST_F(KFDTopologyTest , BasicTest) { EXPECT_GT(pNodeProperties->EngineId.ui32.uCode, 0) << "uCode version is 0"; EXPECT_GE(pNodeProperties->EngineId.ui32.Major, 7) << "Major Version is less than 7"; EXPECT_LT(pNodeProperties->EngineId.ui32.Minor, 10) << "Minor Version is greater than 9"; - EXPECT_LT(pNodeProperties->EngineId.ui32.Stepping, 10) << "Stepping is greater than 9"; EXPECT_GT(pNodeProperties->uCodeEngineVersions.uCodeSDMA, 0) << "sDMA firmware version is 0"; } EXPECT_GT(pNodeProperties->NumMemoryBanks, HSAuint32(0)) << "Node index: " << node << "No MemoryBanks."; From e342c9c890eb57ac9238b6b6df08bbd05ba20af1 Mon Sep 17 00:00:00 2001 From: Amber Lin Date: 
Tue, 8 Dec 2020 13:44:13 -0500 Subject: [PATCH 25/39] kfdtest: Temporarily blacklist some tests Temporarily blacklist some tests on gfx90a until they are solved. Signed-off-by: Amber Lin Change-Id: I87cc3a996ea7d55ed8f20f5b4eecfd8bb691effd --- tests/kfdtest/scripts/kfdtest.exclude | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude index 7f6694dd93..d2357b241c 100644 --- a/tests/kfdtest/scripts/kfdtest.exclude +++ b/tests/kfdtest/scripts/kfdtest.exclude @@ -193,11 +193,16 @@ FILTER[arcturus]=\ "KFDQMTest.BasicCuMaskingEven:"\ "KFDEvictTest.BurstyTest" +# KFDCWSRTest.BasicTest and KFDEvictTest.QueueTest (SWDEV-263604) +# KFDPerformanceTest.P2PBandWidthTest (SWDEV-262388) FILTER[aldebaran]=\ "$BLACKLIST_ALL_ASICS:"\ "KFDExceptionTest.FaultStorm:"\ "KFDEvictTest.BurstyTest:"\ -"KFDMemoryTest.DeviceHdpFlush" +"KFDMemoryTest.DeviceHdpFlush:"\ +"KFDCWSRTest.BasicTest:"\ +"KFDEvictTest.QueueTest:"\ +"KFDPerformanceTest.P2PBandWidthTest"\ FILTER[navi10]=\ "$BLACKLIST_ALL_ASICS:"\ From a83f9b67ce33d8196bd48fdf28da0f07bedfbe9b Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Thu, 10 Sep 2020 11:39:15 -0700 Subject: [PATCH 26/39] Update the context save area size Reserve some space in the context save area for the debugger's use. There should be 32 bytes per wave for a given queue. 
Change-Id: I65ddb6123d0f6afd3149844617ad19023009101d --- include/hsakmttypes.h | 24 +++++++++++++++--------- src/queues.c | 29 ++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h index 4d7b7d8d73..7c2ed115a6 100644 --- a/include/hsakmttypes.h +++ b/include/hsakmttypes.h @@ -653,15 +653,16 @@ typedef enum _HSA_QUEUE_TYPE } HSA_QUEUE_TYPE; /** - The user context save area starts at offset 0 with the - HsaUserContextSaveAreaHeader header followed by the space for a - user space copy of the control stack and the user space wave save - state. The area must be dword aligned. The context save area is - valid for the duration that the associated queue exists. When a - context save occurs, the HsaUserContextSaveAreaHeader header will - be updated with information about the context save. The context save - area is not modified by any other operation, including a context - resume. + The user context save area is page aligned. The HsaUserContextSaveAreaHeader + header starts at offset 0. Space for a user space copy of the control stack + comes next and is immediately followed by the user space wave save state. The + start of the user space wave save state is page aligned. The debugger reserved + area comes next and is 64 byte aligned. + + The user context save area is valid for the duration that the associated + queue exists. When a context save occurs, the HsaUserContextSaveAreaHeader + header will be updated with information about the context save. The context + save area is not modified by any other operation, including a context resume. */ typedef struct @@ -677,6 +678,11 @@ typedef struct // of wave state data. Must be 4 byte aligned. HSAuint32 WaveStateSize; // Byte size of the last saved wave state data. // Must be 4 byte aligned. + HSAuint32 DebugOffset; // Byte offset from start of the user context + // save area to the memory reserved for the + // debugger. 
Must be 64 byte aligned. + HSAuint32 DebugSize; // Byte size of the memory reserved for the + // debugger. Must be 64 byte aligned. } HsaUserContextSaveAreaHeader; diff --git a/src/queues.c b/src/queues.c index aaf667c296..d8ce23e8a4 100644 --- a/src/queues.c +++ b/src/queues.c @@ -49,7 +49,9 @@ #define HWREG_SIZE_PER_CU 0x1000 #define WG_CONTEXT_DATA_SIZE_PER_CU(asic_family) (VGPR_SIZE_PER_CU(asic_family) + SGPR_SIZE_PER_CU + LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU) #define WAVES_PER_CU 32 -#define CNTL_STACK_BYTES_PER_WAVE 8 +#define CNTL_STACK_BYTES_PER_CU(asic_family) (WAVES_PER_CU * (asic_family >= CHIP_NAVI10 ? 12 : 8)) +#define DEBUGGER_BYTES_ALIGN 64 +#define DEBUGGER_BYTES_PER_CU(asic_family) (WAVES_PER_CU * 32) struct device_info { enum asic_family_type asic_family; @@ -208,6 +210,7 @@ struct queue { void *ctx_save_restore; uint32_t ctx_save_restore_size; uint32_t ctl_stack_size; + uint32_t debug_memory_size; const struct device_info *dev_info; bool use_ats; /* This queue structure is allocated from GPU with page aligned size @@ -426,13 +429,23 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q) uint32_t ctl_stack_size, wg_data_size; uint32_t cu_num = node.NumFComputeCores / node.NumSIMDPerCU; - ctl_stack_size = cu_num * WAVES_PER_CU * CNTL_STACK_BYTES_PER_WAVE + 8; + ctl_stack_size = cu_num * CNTL_STACK_BYTES_PER_CU(q->dev_info->asic_family) + 8; wg_data_size = cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(q->dev_info->asic_family); - q->ctl_stack_size = PAGE_ALIGN_UP(ctl_stack_size - + sizeof(HsaUserContextSaveAreaHeader)); + q->ctl_stack_size = PAGE_ALIGN_UP(sizeof(HsaUserContextSaveAreaHeader) + + ctl_stack_size); + if (q->dev_info->asic_family >= CHIP_NAVI10) { + /* HW design limits control stack size to 0x7000. + * This is insufficient for theoretical PM4 cases + * but sufficient for AQL, limited by SPI events. 
+ */ + q->ctl_stack_size = MIN(q->ctl_stack_size, 0x7000); + } + + q->debug_memory_size = + ALIGN_UP(cu_num * DEBUGGER_BYTES_PER_CU(q->dev_info->asic_family), DEBUGGER_BYTES_ALIGN); q->ctx_save_restore_size = q->ctl_stack_size - + PAGE_ALIGN_UP(wg_data_size); + + PAGE_ALIGN_UP(wg_data_size + q->debug_memory_size); return true; } return false; @@ -560,6 +573,8 @@ static int handle_concrete_asic(struct queue *q, ret = update_ctx_save_restore_size(NodeId, q); if (ret) { + HsaUserContextSaveAreaHeader *header; + args->ctx_save_restore_size = q->ctx_save_restore_size; args->ctl_stack_size = q->ctl_stack_size; q->ctx_save_restore = @@ -570,6 +585,10 @@ static int handle_concrete_asic(struct queue *q, return HSAKMT_STATUS_NO_MEMORY; args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore; + + header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore; + header->DebugOffset = q->ctx_save_restore_size - q->debug_memory_size; + header->DebugSize = q->debug_memory_size; } return HSAKMT_STATUS_SUCCESS; From 4cf11c3a7ed9602655268a28de1ee94e6416a881 Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Fri, 16 Oct 2020 00:12:38 -0700 Subject: [PATCH 27/39] libhsakmt: Fix the ctrl stack size calculation On gfx9, the maximum number of wavefronts per queue is the minimum of 40 waves per compute units, or 512 waves per shader engine. On gfx10, there can only be 32 waves per compute units. 
Signed-off-by: Laurent Morichetti Change-Id: I148d1a4fe6c07cdbfaa1f77939eb29311c81c008 --- src/queues.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/queues.c b/src/queues.c index d8ce23e8a4..8d3bd02a4e 100644 --- a/src/queues.c +++ b/src/queues.c @@ -48,10 +48,9 @@ #define LDS_SIZE_PER_CU 0x10000 #define HWREG_SIZE_PER_CU 0x1000 #define WG_CONTEXT_DATA_SIZE_PER_CU(asic_family) (VGPR_SIZE_PER_CU(asic_family) + SGPR_SIZE_PER_CU + LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU) -#define WAVES_PER_CU 32 -#define CNTL_STACK_BYTES_PER_CU(asic_family) (WAVES_PER_CU * (asic_family >= CHIP_NAVI10 ? 12 : 8)) +#define CNTL_STACK_BYTES_PER_WAVE(asic_family) (asic_family >= CHIP_NAVI10 ? 12 : 8) #define DEBUGGER_BYTES_ALIGN 64 -#define DEBUGGER_BYTES_PER_CU(asic_family) (WAVES_PER_CU * 32) +#define DEBUGGER_BYTES_PER_WAVE(asic_family) 32 struct device_info { enum asic_family_type asic_family; @@ -428,8 +427,11 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q) if (node.NumFComputeCores && node.NumSIMDPerCU) { uint32_t ctl_stack_size, wg_data_size; uint32_t cu_num = node.NumFComputeCores / node.NumSIMDPerCU; + uint32_t wave_num = (q->dev_info->asic_family < CHIP_NAVI10) + ? 
MIN(cu_num * 40, node.NumShaderBanks / node.NumArrays * 512) + : cu_num * 32; - ctl_stack_size = cu_num * CNTL_STACK_BYTES_PER_CU(q->dev_info->asic_family) + 8; + ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(q->dev_info->asic_family) + 8; wg_data_size = cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(q->dev_info->asic_family); q->ctl_stack_size = PAGE_ALIGN_UP(sizeof(HsaUserContextSaveAreaHeader) + ctl_stack_size); @@ -442,7 +444,7 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q) } q->debug_memory_size = - ALIGN_UP(cu_num * DEBUGGER_BYTES_PER_CU(q->dev_info->asic_family), DEBUGGER_BYTES_ALIGN); + ALIGN_UP(wave_num * DEBUGGER_BYTES_PER_WAVE(q->dev_info->asic_family), DEBUGGER_BYTES_ALIGN); q->ctx_save_restore_size = q->ctl_stack_size + PAGE_ALIGN_UP(wg_data_size + q->debug_memory_size); From 7c05c5240ff085ae8b7d48df5ab0c577651dd07f Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Wed, 30 Dec 2020 18:47:48 -0500 Subject: [PATCH 28/39] libhsakmt: A+A: Mark buffers accessed by CP as UC This change is for the A+A bring-up branch as it needs to made more generic to handle all ASICs. For A+A all the system buffers are mapped as NC (non coherent) unless explicitly marked as UC (uncached). The coherency is then expected to be handled by shader by explicitly using acquire/release instructions. However, CP doesn't have same feature. The buffers used by CP thus have to UC. For now queue buffer and Signal handler memory is marked as UC. This change shouldn't affect other ASICs since Uncached flag is not used in those. However, this change still need to be made more generic. 
Signed-off-by: Harish Kasiviswanathan Change-Id: I56c37a809913f7f08c94d01b0572d0f4864939aa --- src/events.c | 2 +- src/libhsakmt.h | 2 +- src/queues.c | 16 ++++++++++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/events.c b/src/events.c index 23fb710523..d4c751c0cc 100644 --- a/src/events.c +++ b/src/events.c @@ -76,7 +76,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, if (is_dgpu && !events_page) { events_page = allocate_exec_aligned_memory_gpu( - KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, true, false); + KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, true, false, true); if (!events_page) { pthread_mutex_unlock(&hsakmt_mutex); return HSAKMT_STATUS_ERROR; diff --git a/src/libhsakmt.h b/src/libhsakmt.h index 86ec0cb9c0..dee5929614 100644 --- a/src/libhsakmt.h +++ b/src/libhsakmt.h @@ -169,7 +169,7 @@ HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags); void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId, bool NonPaged, - bool DeviceLocal); + bool DeviceLocal, bool Uncached); void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align); HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes); void destroy_process_doorbells(void); diff --git a/src/queues.c b/src/queues.c index 8d3bd02a4e..b8572b6458 100644 --- a/src/queues.c +++ b/src/queues.c @@ -455,7 +455,8 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q) void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId, bool nonPaged, - bool DeviceLocal) + bool DeviceLocal, + bool Uncached) { void *mem; HSAuint64 gpu_va; @@ -469,6 +470,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, flags.ui32.NonPaged = nonPaged; flags.ui32.PageSize = HSA_PAGE_SIZE_4KB; flags.ui32.CoarseGrain = DeviceLocal; + flags.ui32.Uncached = Uncached; /* Get the closest cpu_id to GPU NodeId for system memory allocation * nonPaged=1 system memory allocation 
uses GTT path @@ -518,11 +520,13 @@ void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align) static void *allocate_exec_aligned_memory(uint32_t size, bool use_ats, uint32_t NodeId, - bool DeviceLocal) + bool DeviceLocal, + bool Uncached) { if (!use_ats) return allocate_exec_aligned_memory_gpu(size, PAGE_SIZE, NodeId, - DeviceLocal, DeviceLocal); + DeviceLocal, DeviceLocal, + Uncached); return allocate_exec_aligned_memory_cpu(size); } @@ -564,7 +568,7 @@ static int handle_concrete_asic(struct queue *q, q->eop_buffer = allocate_exec_aligned_memory(q->dev_info->eop_buffer_size, q->use_ats, - NodeId, true); + NodeId, true, /* Unused for VRAM */false); if (!q->eop_buffer) return HSAKMT_STATUS_NO_MEMORY; @@ -582,7 +586,7 @@ static int handle_concrete_asic(struct queue *q, q->ctx_save_restore = allocate_exec_aligned_memory(q->ctx_save_restore_size, q->use_ats, - NodeId, false); + NodeId, false, false); if (!q->ctx_save_restore) return HSAKMT_STATUS_NO_MEMORY; @@ -639,7 +643,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId, struct queue *q = allocate_exec_aligned_memory(sizeof(*q), use_ats, - NodeId, false); + NodeId, false, true); if (!q) return HSAKMT_STATUS_NO_MEMORY; From 10674916e4d0a8c66b50c55171e8ecd6957957f1 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Wed, 30 Dec 2020 18:49:42 -0500 Subject: [PATCH 29/39] libhsakmt: Explicitly mark AQL buffers as UC This change might be redundant if ROCr takes care of it Signed-off-by: Harish Kasiviswanathan Change-Id: I7b67143a8ad21baa61b7eda7b8e5fe0ac1e33830 --- src/fmm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fmm.c b/src/fmm.c index 5fe6518ea6..a3f4822297 100644 --- a/src/fmm.c +++ b/src/fmm.c @@ -1186,7 +1186,8 @@ static uint32_t fmm_translate_hsa_to_ioc_flags(HsaMemFlags flags) uint32_t ioc_flags = 0; if (flags.ui32.AQLQueueMemory) - ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM; + ioc_flags |= (KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM | + 
KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED); if (!flags.ui32.ReadOnly) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE; /* TODO: Since, ROCr interfaces doesn't allow caller to set page From 44adc3dafde7d8c2ae07ddfc6d8abc216d9247ef Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Sun, 27 Dec 2020 10:59:07 -0500 Subject: [PATCH 30/39] kfdtest: Add Uncached flag to HsaMemoryBuffer constructor Signed-off-by: Harish Kasiviswanathan Change-Id: I14b0a73ffb04f4798547fe7003de1440736b413d --- tests/kfdtest/src/KFDTestUtil.cpp | 4 +++- tests/kfdtest/src/KFDTestUtil.hpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/kfdtest/src/KFDTestUtil.cpp b/tests/kfdtest/src/KFDTestUtil.cpp index c3a528d9c5..21675dfdf9 100644 --- a/tests/kfdtest/src/KFDTestUtil.cpp +++ b/tests/kfdtest/src/KFDTestUtil.cpp @@ -203,7 +203,7 @@ HSAuint64 GetSystemTickCountInMicroSec() { const HsaMemoryBuffer HsaMemoryBuffer::Null; HsaMemoryBuffer::HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero, bool isLocal, bool isExec, - bool isScratch, bool isReadOnly) + bool isScratch, bool isReadOnly, bool isUncached) :m_Size(size), m_pUser(NULL), m_pBuf(NULL), @@ -224,11 +224,13 @@ HsaMemoryBuffer::HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero, b m_Flags.ui32.HostAccess = 0; m_Flags.ui32.NonPaged = 1; m_Flags.ui32.CoarseGrain = 1; + EXPECT_EQ(isUncached, 0) << "Uncached flag is relevant only for system or host memory"; } else { m_Flags.ui32.HostAccess = 1; m_Flags.ui32.NonPaged = 0; m_Flags.ui32.CoarseGrain = 0; m_Flags.ui32.NoNUMABind = 1; + m_Flags.ui32.Uncached = isUncached; } if (isExec) diff --git a/tests/kfdtest/src/KFDTestUtil.hpp b/tests/kfdtest/src/KFDTestUtil.hpp index e640d588fc..ef2dd57008 100644 --- a/tests/kfdtest/src/KFDTestUtil.hpp +++ b/tests/kfdtest/src/KFDTestUtil.hpp @@ -66,7 +66,7 @@ class HsaMemoryBuffer { public: HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero = true, bool isLocal = false, - bool isExec = false, bool isScratch = 
false, bool isReadOnly = false); + bool isExec = false, bool isScratch = false, bool isReadOnly = false, bool isUncached = false); HsaMemoryBuffer(void *addr, HSAuint64 size); template RetType As() { From 0e8500b886ba1a53a40ce50e0a2a8526333c7180 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Sun, 27 Dec 2020 11:01:01 -0500 Subject: [PATCH 31/39] kfdtest: A+A: Mark queue address as UC Refer to commit: " Mark buffers accessed by CP as UC" Signed-off-by: Harish Kasiviswanathan Change-Id: I1816e035dbb3178f28f5e34b050c20ecca282060 --- tests/kfdtest/src/BaseQueue.cpp | 3 ++- tests/kfdtest/src/IndirectBuffer.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/kfdtest/src/BaseQueue.cpp b/tests/kfdtest/src/BaseQueue.cpp index 835ab4c71d..56ebf6fd18 100644 --- a/tests/kfdtest/src/BaseQueue.cpp +++ b/tests/kfdtest/src/BaseQueue.cpp @@ -48,7 +48,8 @@ HSAKMT_STATUS BaseQueue::Create(unsigned int NodeId, unsigned int size, HSAuint6 memset(&m_Resources, 0, sizeof(m_Resources)); - m_QueueBuf = new HsaMemoryBuffer(size, NodeId, true/*zero*/, false/*local*/, true/*exec*/); + m_QueueBuf = new HsaMemoryBuffer(size, NodeId, true/*zero*/, false/*local*/, true/*exec*/, + /*isScratch */ false, /* isReadOnly */false, /* isUncached */true); if (type == HSA_QUEUE_COMPUTE_AQL) { m_Resources.Queue_read_ptr_aql = &pointers[0]; diff --git a/tests/kfdtest/src/IndirectBuffer.cpp b/tests/kfdtest/src/IndirectBuffer.cpp index b820230b0d..4e3907cc5b 100644 --- a/tests/kfdtest/src/IndirectBuffer.cpp +++ b/tests/kfdtest/src/IndirectBuffer.cpp @@ -30,7 +30,8 @@ IndirectBuffer::IndirectBuffer(PACKETTYPE type, unsigned int sizeInDWords, unsigned int NodeId) :m_NumOfPackets(0), m_MaxSize(sizeInDWords), m_ActualSize(0), m_PacketTypeAllowed(type) { m_IndirectBuf = new HsaMemoryBuffer(sizeInDWords*sizeof(unsigned int), NodeId, true/*zero*/, - false/*local*/, true/*exec*/); + false/*local*/, true/*exec*/, false/*isScratch*/, + false/*isReadOnly*/, true/*isUncached*/); } 
IndirectBuffer::~IndirectBuffer(void) { From 57f46b53ecc8cefe4d84747736077e86465b92f6 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Sun, 27 Dec 2020 11:05:19 -0500 Subject: [PATCH 32/39] kfdtest: A+A: CP writes to NC mem need flush Refer to commit "Mark buffers accessed by CP as UC" A+A buffers are mapped as NC. CP (PM4Writes) need ReleaseMem function to ensure the write go through to the memory Signed-off-by: Harish Kasiviswanathan Change-Id: I4ee55a6e40fba078f5950d95c8fee7ee076260bf --- tests/kfdtest/src/KFDLocalMemoryTest.cpp | 6 +++- tests/kfdtest/src/KFDMemoryTest.cpp | 7 +++- tests/kfdtest/src/KFDPMTest.cpp | 8 +++-- tests/kfdtest/src/KFDQMTest.cpp | 42 +++++++++++++++++------- 4 files changed, 48 insertions(+), 15 deletions(-) diff --git a/tests/kfdtest/src/KFDLocalMemoryTest.cpp b/tests/kfdtest/src/KFDLocalMemoryTest.cpp index 33f33dc869..6af6765ac3 100644 --- a/tests/kfdtest/src/KFDLocalMemoryTest.cpp +++ b/tests/kfdtest/src/KFDLocalMemoryTest.cpp @@ -58,6 +58,8 @@ TEST_F(KFDLocalMemoryTest, AccessLocalMem) { //local memory HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false, true); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); PM4Queue queue; @@ -65,10 +67,12 @@ TEST_F(KFDLocalMemoryTest, AccessLocalMem) { queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0, 0)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); + TEST_END } diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp index c5c0c0afde..ea24938116 100644 --- a/tests/kfdtest/src/KFDMemoryTest.cpp +++ b/tests/kfdtest/src/KFDMemoryTest.cpp @@ -432,14 +432,18 @@ TEST_F(KFDMemoryTest, AccessPPRMem) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf, 
0xABCDEF09, 0x12345678)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf, 0xABCDEF09); WaitOnValue(destBuf + 1, 0x12345678); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); /* This sleep hides the dmesg PPR message storm on Raven, which happens @@ -1455,6 +1459,7 @@ TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) { mem1 = reinterpret_cast(reinterpret_cast(mem) + VRAM_OFFSET + sizeof(HSAuint64)); PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int *)mem0, data0[0], data0[1])); queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int *)mem1, diff --git a/tests/kfdtest/src/KFDPMTest.cpp b/tests/kfdtest/src/KFDPMTest.cpp index 79b385cf72..98c2348a8c 100644 --- a/tests/kfdtest/src/KFDPMTest.cpp +++ b/tests/kfdtest/src/KFDPMTest.cpp @@ -78,8 +78,11 @@ TEST_F(KFDPMTest, SuspendWithIdleQueueAfterWork) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuffer.As(), 0x1, 0x2)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(&(destBuffer.As()[0]), 0x1); WaitOnValue(&(destBuffer.As()[1]), 0x2); @@ -88,7 +91,7 @@ TEST_F(KFDPMTest, SuspendWithIdleQueueAfterWork) { EXPECT_EQ(true, SuspendAndWakeUp()); queue.PlaceAndSubmitPacket(PM4WriteDataPacket(&(destBuffer.As()[2]), 0x3, 0x4)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); EXPECT_EQ(destBuffer.As()[0], 0); EXPECT_EQ(destBuffer.As()[1], 0); @@ -96,6 +99,7 @@ TEST_F(KFDPMTest, SuspendWithIdleQueueAfterWork) { WaitOnValue(&(destBuffer.As()[2]), 0x3); WaitOnValue(&(destBuffer.As()[3]), 0x4); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); TEST_END diff --git a/tests/kfdtest/src/KFDQMTest.cpp b/tests/kfdtest/src/KFDQMTest.cpp index 71768cc2cf..086d5e537b 100644 --- 
a/tests/kfdtest/src/KFDQMTest.cpp +++ b/tests/kfdtest/src/KFDQMTest.cpp @@ -78,13 +78,16 @@ TEST_F(KFDQMTest, SubmitNopCpQueue) { ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; PM4Queue queue; + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); ASSERT_SUCCESS(queue.Create(defaultGPUNode)); queue.PlaceAndSubmitPacket(PM4NopPacket()); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); TEST_END @@ -99,17 +102,19 @@ TEST_F(KFDQMTest, SubmitPacketCpQueue) { HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false); destBuf.Fill(0xFF); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); PM4Queue queue; - ASSERT_SUCCESS(queue.Create(defaultGPUNode)); queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0, 0)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); EXPECT_TRUE(WaitOnValue(destBuf.As(), 0)); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); TEST_END @@ -132,7 +137,7 @@ TEST_F(KFDQMTest, AllCpQueues) { for (unsigned int qidx = 0; qidx < m_numCpQueues; ++qidx) { queues[qidx].PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As()+qidx*2, qidx, qidx)); - + queues[qidx].PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, true, 0, 0)); queues[qidx].Wait4PacketConsumption(); EXPECT_TRUE(WaitOnValue(destBuf.As()+qidx*2, qidx)); @@ -330,6 +335,7 @@ TEST_F(KFDQMTest, AllQueues) { for (i = 0; i < numCpQueues; ++i) { cpQueues[i].PlaceAndSubmitPacket(PM4WriteDataPacket(destBufCp.As()+i*2, i, i)); + cpQueues[i].PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, true, 0, 0)); cpQueues[i].Wait4PacketConsumption(); @@ -460,9 +466,12 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithNullAddress) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + 
queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0, 0)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 0); @@ -480,10 +489,11 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithNullAddress) { EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 1); + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); TEST_END @@ -544,13 +554,16 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithZeroPercentage) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + PM4WriteDataPacket packet1, packet2; packet1.InitPacket(destBuf.As(), 0, 0); packet2.InitPacket(destBuf.As(), 1, 1); queue.PlaceAndSubmitPacket(packet1); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 0); @@ -568,7 +581,7 @@ TEST_F(KFDQMTest, DisableCpQueueByUpdateWithZeroPercentage) { EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 1); @@ -1228,6 +1241,8 @@ TEST_F(KFDQMTest, CpuWriteCoherence) { HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode); ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); /* The queue might be full and we fail to submit. There is always one word space unused in queue. * So let rptr one step ahead then we continually submit packet. 
@@ -1249,10 +1264,11 @@ TEST_F(KFDQMTest, CpuWriteCoherence) { */ queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0x42, 0x42)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 0x42); + hsaKmtDestroyEvent(event); TEST_END } @@ -1420,18 +1436,22 @@ TEST_F(KFDQMTest, CpQueueWraparound) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaEvent *event; + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event)); + for (unsigned int pktIdx = 0; pktIdx <= PAGE_SIZE/sizeof(PM4WRITE_DATA_CI); ++pktIdx) { queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), pktIdx, pktIdx)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), pktIdx); } for (unsigned int pktIdx = 0; pktIdx <= PAGE_SIZE/sizeof(PM4WRITE_DATA_CI); ++pktIdx) { queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), pktIdx, pktIdx)); - queue.Wait4PacketConsumption(); + queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), pktIdx); } + hsaKmtDestroyEvent(event); EXPECT_SUCCESS(queue.Destroy()); TEST_END From f132fb2cd0d4368336b990a9a61990fbc9d7cdcd Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Tue, 5 Jan 2021 13:13:24 -0600 Subject: [PATCH 33/39] Make GPU mapping of memory as uncached if HSA_DISABLE_CACHE is set Before gfx90a, coherent memory is uncached. So it was reasonable when environment variable HSA_DISABLE_CACHE is set, memory is mapped as coherent. On gfx90a, coherent memory can be cached, so mapping memory as coherent can't guarantee memory is uncached. When HSA_DISABLE_CACHE is set, we have to map memory as uncached. 
Change-Id: Ia5ed4cf0ad6aef5644dc8c9e6632b52d606f06f4 Signed-off-by: Oak Zeng --- src/fmm.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/fmm.c b/src/fmm.c index a3f4822297..463a3e2fd1 100644 --- a/src/fmm.c +++ b/src/fmm.c @@ -1335,11 +1335,11 @@ void *fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInB aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture; } - if (!flags.ui32.CoarseGrain || svm.disable_cache) { + if (!flags.ui32.CoarseGrain) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; - if (flags.ui32.Uncached) - ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; - } + + if (flags.ui32.Uncached || svm.disable_cache) + ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset, ioc_flags, &vm_obj); @@ -1544,11 +1544,12 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, void *address, else aperture = svm.dgpu_alt_aperture; /* always coherent */ - if (!flags.ui32.CoarseGrain || svm.disable_cache) { + if (!flags.ui32.CoarseGrain) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; - if (flags.ui32.Uncached) - ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; - } + + if (flags.ui32.Uncached || svm.disable_cache) + ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; + ioc_flags |= fmm_translate_hsa_to_ioc_flags(flags); if (flags.ui32.AQLQueueMemory) From 3a378fcf0b081c2e7cda56db0357159085c3000c Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Wed, 6 Jan 2021 10:26:34 -0500 Subject: [PATCH 34/39] kfdtest: remove some kfdtests filtered for gfx90a The three kfdtests have been fixed, so remove them from the filter list. 
Signed-off-by: Eric Huang Change-Id: I101a72476970a9d105e8c0b5c022847757fdd316 --- tests/kfdtest/scripts/kfdtest.exclude | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude index d2357b241c..7f6694dd93 100644 --- a/tests/kfdtest/scripts/kfdtest.exclude +++ b/tests/kfdtest/scripts/kfdtest.exclude @@ -193,16 +193,11 @@ FILTER[arcturus]=\ "KFDQMTest.BasicCuMaskingEven:"\ "KFDEvictTest.BurstyTest" -# KFDCWSRTest.BasicTest and KFDEvictTest.QueueTest (SWDEV-263604) -# KFDPerformanceTest.P2PBandWidthTest (SWDEV-262388) FILTER[aldebaran]=\ "$BLACKLIST_ALL_ASICS:"\ "KFDExceptionTest.FaultStorm:"\ "KFDEvictTest.BurstyTest:"\ -"KFDMemoryTest.DeviceHdpFlush:"\ -"KFDCWSRTest.BasicTest:"\ -"KFDEvictTest.QueueTest:"\ -"KFDPerformanceTest.P2PBandWidthTest"\ +"KFDMemoryTest.DeviceHdpFlush" FILTER[navi10]=\ "$BLACKLIST_ALL_ASICS:"\ From f7759df6e044bd7cad03b4c4a30fb880f5a0fda7 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Thu, 7 Jan 2021 13:54:19 -0500 Subject: [PATCH 35/39] kfdtest: fix KFDQMTest.Atomics test failure on A+A destBuf is mapped as cached, so the instruction flat_atomic_add operates on the cache, which causes the test to fail. Adding the scc modifier to the instruction fixes the issue. 
Signed-off-by: Eric Huang Change-Id: I8e138f93ae4f5e23020e3ac1549ef924968a74c5 --- tests/kfdtest/src/IsaGenerator_Aldebaran.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp b/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp index 2fcb80fd32..2c377f9111 100644 --- a/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp +++ b/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp @@ -37,7 +37,7 @@ type(CS) v_mov_b32 v0, s0 v_mov_b32 v1, s1 v_mov_b32 v2, 1 - flat_atomic_add v3, v[0:1], v2 slc glc + flat_atomic_add v3, v[0:1], v2 slc glc scc s_waitcnt 0 s_endpgm end @@ -86,7 +86,7 @@ const uint32_t IsaGenerator_Aldbrn::INFINITE_LOOP_ISA[] = { const uint32_t IsaGenerator_Aldbrn::ATOMIC_ADD_ISA[] = { 0x7e000200, 0x7e020201, - 0x7e040281, 0xdd0b0000, + 0x7e040281, 0xdf0b0000, 0x037f0200, 0xbf8c0000, 0xbf810000, 0x00000000 }; From 085005f07b6a959699baea84b1da1fda0258e93f Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Wed, 13 Jan 2021 15:54:03 -0500 Subject: [PATCH 36/39] kfdtest: Add gfx9_PollNCMemory function to support NC memory In A+A all system memory is mapped as NC. So add a new function gfx9_PollNCMemory which will support NC memory. Signed-off-by: Harish Kasiviswanathan Change-Id: I097b95fb156f73d6f480cd4fd262cc6fa5933f69 --- tests/kfdtest/src/KFDMemoryTest.cpp | 32 ++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp index ea24938116..54ab73d1c0 100644 --- a/tests/kfdtest/src/KFDMemoryTest.cpp +++ b/tests/kfdtest/src/KFDMemoryTest.cpp @@ -154,6 +154,32 @@ type(CS)\n\ end\n\ "; +/* Similar to gfx9_PollMemory except that the buffer + * polled can be non-coherent memory. SCC system-level + * cache coherence is not supported in the scalar (smem) path. 
+ * Use vmem operations with scc + */ +const char* gfx9_PollNCMemory = +"\ +shader ReadMemory\n\ +asic(ALDEBARAN)\n\ +wave_size(32)\n\ +type(CS)\n\ +/* Assume src address in s0, s1 and dst address in s2, s3*/\n\ + v_mov_b32 v6, 0x5678\n\ + v_mov_b32 v0, s0\n\ + v_mov_b32 v1, s1\n\ + LOOP:\n\ + flat_load_dword v4, v[0:1] scc\n\ + v_cmp_eq_u32 vcc, v4, v6\n\ + s_cbranch_vccz LOOP\n\ + v_mov_b32 v0, s2\n\ + v_mov_b32 v1, s3\n\ + flat_store_dword v[0:1], v6 scc\n\ + s_endpgm\n\ + end\n\ +"; + /* Input: A buffer of at least 3 dwords. * DW0: used as a signal. 0xcafe means it is signaled * DW1: Input buffer for device to read. @@ -336,7 +362,11 @@ TEST_F(KFDMemoryTest, MapUnmapToNodes) { HsaMemoryBuffer srcBuffer(PAGE_SIZE, defaultGPUNode); HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode); - m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer); + if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) + /* On A+A system memory is mapped as NC */ + m_pIsaGen->CompileShader(gfx9_PollNCMemory, "ReadMemory", isaBuffer); + else + m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer); PM4Queue pm4Queue; ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); From 9aa521d1ffa58c93abcdd187b6c83af9c0eed29d Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Wed, 15 Apr 2020 16:13:52 -0400 Subject: [PATCH 37/39] KFDTest: add cache coherence tests for gfx90a Three kfd subtests are added to verify new XGMI connection with cache coherence HW link on A+A. 
Signed-off-by: Eric Huang Change-Id: I6960ec91cbfb696c4e6acb3b79fd83107003acdd --- tests/kfdtest/src/KFDMemoryTest.cpp | 283 ++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp index 54ab73d1c0..95997c0641 100644 --- a/tests/kfdtest/src/KFDMemoryTest.cpp +++ b/tests/kfdtest/src/KFDMemoryTest.cpp @@ -229,6 +229,81 @@ type(CS)\n\ end\n\ "; +/* Continuously poll the flag at src buffer + * After the flag of s[0:1] is 1 filled, + * copy the value from s[0:1]+4 to dst buffer + */ +const char* gfx9_PollAndCopy = +"\ +shader CopyMemory\n\ +wave_size(32)\n\ +type(CS)\n\ +/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\ + s_movk_i32 s18, 0x1\n\ + LOOP:\n\ + s_load_dword s16, s[0:1], 0x0 glc\n\ + s_cmp_eq_i32 s16, s18\n\ + s_cbranch_scc0 LOOP\n\ + s_load_dword s17, s[0:1], 0x4 glc\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + s_store_dword s17, s[2:3], 0x0 glc:1\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + s_endpgm\n\ + end\n\ +"; + +const char* gfx9aldbrn_PollAndCopy = +"\ +shader CopyMemory\n\ +wave_size(32)\n\ +type(CS)\n\ +/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\ + v_mov_b32 v0, s0\n\ + v_mov_b32 v1, s1\n\ + v_mov_b32 v18, 0x1\n\ + LOOP:\n\ + flat_load_dword v16, v[0:1] scc:1\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + v_cmp_eq_i32 vcc, v16, v18\n\ + s_cbranch_vccz LOOP\n\ + buffer_invl2\n\ + s_load_dword s17, s[0:1], 0x4 glc\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + s_store_dword s17, s[2:3], 0x0 glc\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + buffer_wbl2\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + s_endpgm\n\ + end\n\ +"; + +/* Input0: A buffer of at least 2 dwords. + * DW0: used as a signal. Write 0x1 to signal + * DW1: Write the value from 2nd input buffer + * for other device to read. + * Input1: A buffer of at least 2 dwords. + * DW0: used as the value to be written. 
+ */ +const char* gfx9aldbrn_WriteFlagAndValue = +"\ +shader WriteMemory\n\ +wave_size(32)\n\ +type(CS)\n\ +/* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\ + v_mov_b32 v0, s0\n\ + v_mov_b32 v1, s1\n\ + s_load_dword s18, s[2:3], 0x0 glc\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + s_store_dword s18, s[0:1], 0x4 glc\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + buffer_wbl2\n\ + s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ + v_mov_b32 v16, 0x1\n\ + flat_store_dword v[0:1], v16 scc:1\n\ + s_endpgm\n\ + end\n\ +"; + //These gfx9_PullMemory, gfx9_CopyOnSignal, gfx9_WriteAndSignal shaders can be used by both gfx9 and gfx10 void KFDMemoryTest::SetUp() { @@ -2258,3 +2333,211 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) { TEST_END } + +/* Test is for new cache coherence on Aldebaran. It is to verify + * two GPUs can coherently share a fine grain FB. + */ +TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */); + volatile HSAuint32 *tmp = tmpBuffer.As(); + const int dwSource = 0x40 * sizeof(int); /* At 3rd cache line */ + const int dwLocation = 0x80 * sizeof(int); /* At 5th cache line */ + + if (m_FamilyId != FAMILY_AL) { + LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl; + return; + } + + const std::vector gpuNodes = m_NodeInfo.GetNodesWithGPU(); + if (gpuNodes.size() < 2) { + LOG() << "Skipping test: At least two GPUs are required." 
<< std::endl; + return; + } + + HSAuint32 nondefaultNode; + for (unsigned i = 0; i < gpuNodes.size(); i++) { + if (gpuNodes.at(i) != defaultGPUNode) { + nondefaultNode = gpuNodes.at(i); + break; + } + } + + unsigned int nodes[2] = {defaultGPUNode, nondefaultNode}; + + /* Allocate a local FB */ + HsaMemoryBuffer buffer(PAGE_SIZE, defaultGPUNode, false/*zero*/, true/*local*/, false/*exec*/); + buffer.MapMemToNodes(&nodes[0], 2); + SDMAQueue sdmaQueue; + ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); + buffer.Fill(0, sdmaQueue, 0, PAGE_SIZE); + buffer.Fill(0x5678, sdmaQueue, dwSource, 4); + + /* Read buffer[0] as flag from local shader to fill cache line (64 dws) + * which should have 0 at buffer[1] + */ + PM4Queue queue; + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); + m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); + Dispatch dispatch(isaBuffer); + dispatch.SetArgs(buffer.As(), buffer.As()+dwLocation); + dispatch.Submit(queue); + + /* Delay 100ms to make sure shader executed*/ + Delay(100); + + /* Using remote shader to write the flag and copy value from dwSource + * to dwLocation in buffer.
+ * Local shader should get the flag and execute CopyMemory + */ + PM4Queue queue1; + ASSERT_SUCCESS(queue1.Create(nondefaultNode)); + HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/); + m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1); + Dispatch dispatch1(isaBuffer1); + dispatch1.SetArgs(buffer.As(), buffer.As()+dwSource); + dispatch1.Submit(queue1); + dispatch1.Sync(g_TestTimeOut); + + /* Check test result*/ + dispatch.Sync(g_TestTimeOut); + EXPECT_EQ(buffer.IsPattern(dwLocation, 0x5678, sdmaQueue, tmp), true); + + // Clean up + EXPECT_SUCCESS(queue.Destroy()); + EXPECT_SUCCESS(queue1.Destroy()); + EXPECT_SUCCESS(sdmaQueue.Destroy()); + + TEST_END +} + +/* Test is for new cache coherence on A+A(Aldebaran). It is to verify + * new XGMI coherence HW link in caches between CPU and GPUs + * in local FB with fine grain mode. + */ +TEST_F(KFDMemoryTest, VramCacheCoherenceWithCPU) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + if (m_FamilyId != FAMILY_AL) { + LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl; + return; + } + + HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + const int dwLocation = 0x80; + + if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) { + LOG() << "Skipping test: XGMI link to CPU is required." 
<< std::endl; + return; + } + + unsigned int *buffer; + HsaMemFlags memFlags = {0}; + /* Allocate a fine grain local FB accessed by CPU */ + memFlags.ui32.HostAccess = 1; + memFlags.ui32.NonPaged = 1; + ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, memFlags, + reinterpret_cast(&buffer))); + ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(buffer, PAGE_SIZE, NULL)); + buffer[0] = 0; + buffer[dwLocation] = 0; + + /* Read buffer from shader to fill cache */ + PM4Queue queue; + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); + m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); + Dispatch dispatch(isaBuffer); + dispatch.SetArgs(buffer, buffer+dwLocation); + dispatch.Submit(queue); + + /* Delay 100ms to make sure shader executed*/ + Delay(100); + + /* CPU writes to buffer. Shader should get 0x5678 CPU writes + * after cache invalidating(buffer_invl2) and quits + */ + buffer[1] = 0x5678; + buffer[0] = 1; + + /* Check test result*/ + dispatch.Sync(g_TestTimeOut); + EXPECT_EQ(buffer[dwLocation], 0x5678); + + // Clean up + EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(buffer)); + EXPECT_SUCCESS(hsaKmtFreeMemory(buffer, PAGE_SIZE)); + EXPECT_SUCCESS(queue.Destroy()); + + TEST_END +} + +/* Test is for new cache coherence on Aldebaran. It is to verify + * new XGMI coherence HW link in caches between CPU and GPUs + * in system RAM. + */ +TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + if (m_FamilyId != FAMILY_AL) { + LOG() << "Skipping test: Test requires aldebaran series asics." 
<< std::endl; + return; + } + + unsigned int *fineBuffer = NULL; + unsigned int tmp; + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + const int dwLocation = 0x80; + + ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags, + reinterpret_cast(&fineBuffer))); + ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(fineBuffer, PAGE_SIZE, NULL)); + fineBuffer[0] = 0; + fineBuffer[1] = 0; + /* Read buffer from CPU to fill cache */ + tmp = fineBuffer[dwLocation]; + + /* Read fine grain buffer from shader to fill cache */ + PM4Queue queue; + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); + + if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) + m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); + else + m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer); + + Dispatch dispatch(isaBuffer); + dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation); + dispatch.Submit(queue); + + /* Delay 100ms to make sure shader executed*/ + Delay(100); + + /* CPU writes to buffer. Shader should get what CPU writes and quits*/ + fineBuffer[1] = 0x5678; + fineBuffer[0] = 1; + + /* Check test result, based on KFDEventTest.SignalEvent passed. + * if Sync times out, + * it means coherence issue that GPU doesn't read what CPU wrote. + * if buffer value is not expected, + * it means coherence issue that CPU doesn't read what GPU wrote. 
+ */ + dispatch.Sync(g_TestTimeOut); + EXPECT_EQ(fineBuffer[dwLocation], 0x5678); + + // Clean up + EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(fineBuffer)); + EXPECT_SUCCESS(hsaKmtFreeMemory(fineBuffer, PAGE_SIZE)); + EXPECT_SUCCESS(queue.Destroy()); + + TEST_END +} From ae0e74095ec03e279c46e714b4887b3a67984475 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Wed, 24 Feb 2021 19:03:35 -0600 Subject: [PATCH 38/39] Allocate coherent uncached memory when HSA_DISABLE_CACHE is set Set the KFD_IOC_ALLOC_MEM_FLAGS_COHERENT flag and KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED flag to allocate uncached coherent memory when HSA_DISABLE_CACHE environment variable is set. At KFD driver, Single KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED flag is not sufficient to allocate uncached memory. We have to use both two flags to allocate uncached memory. Change-Id: Ie490f37b2e696314e60048f5b1b57442431696e9 Signed-off-by: Oak Zeng --- src/fmm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fmm.c b/src/fmm.c index 463a3e2fd1..fc588a1617 100644 --- a/src/fmm.c +++ b/src/fmm.c @@ -1335,7 +1335,7 @@ void *fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInB aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture; } - if (!flags.ui32.CoarseGrain) + if (!flags.ui32.CoarseGrain || svm.disable_cache) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; if (flags.ui32.Uncached || svm.disable_cache) @@ -1544,7 +1544,7 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, void *address, else aperture = svm.dgpu_alt_aperture; /* always coherent */ - if (!flags.ui32.CoarseGrain) + if (!flags.ui32.CoarseGrain || svm.disable_cache) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; if (flags.ui32.Uncached || svm.disable_cache) From e35778ed4dcc83ba001f0a25b1319cb59716ee1a Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Thu, 25 Feb 2021 14:10:49 -0500 Subject: [PATCH 39/39] kfdtest: Temporarily blacklist KFDMemoryTest.PtraceAccess Possibly because of moving to gart table for vram access 
from Kernel. This test failure shouldn't be a blocker. Temporarily blacklist till a solution is found. Signed-off-by: Harish Kasiviswanathan Change-Id: I99725f368aced863188e30f619288ad4d033b9a6 --- tests/kfdtest/scripts/kfdtest.exclude | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude index 7f6694dd93..e30ba99958 100644 --- a/tests/kfdtest/scripts/kfdtest.exclude +++ b/tests/kfdtest/scripts/kfdtest.exclude @@ -197,6 +197,7 @@ FILTER[aldebaran]=\ "$BLACKLIST_ALL_ASICS:"\ "KFDExceptionTest.FaultStorm:"\ "KFDEvictTest.BurstyTest:"\ +"KFDMemoryTest.PtraceAccess:"\ "KFDMemoryTest.DeviceHdpFlush" FILTER[navi10]=\