SWDEV-452795 - Disable RAS plugin, fix XGMI
The RAS plugin loaded rocm-smi, which conflicts with the amd-smi library.

The main source of grief was the map 'devInfoTypesStrings', which is defined in both rocm-smi and amd-smi.

We assume that rocm-smi would get lazy-loaded by the RAS library and overwrite symbols defined in amd-smi. devInfoTypesStrings in rocm-smi contains a different number of elements, and the enums are also different. RDC relies on amd-smi's enums.

One such enum is kDevGpuMetrics:
rocm-smi: kDevGpuMetrics = 68
amd-smi: kDevGpuMetrics = 75

Example of overlapping map definitions:

$ objdump --dynamic-syms /opt/rocm/lib/libamd_smi.so | grep devInfoTypesStrings
00000000003c4980 g DO .data.rel.ro 0000000000000008 Base devInfoTypesStrings
00000000003db830 g DO .bss 0000000000000030 Base _ZN3amd3smi6Device19devInfoTypesStringsE

$ objdump --dynamic-syms /opt/rocm/lib/librocm_smi64.so | grep devInfoTypesStrings
00000000003dc590 g DO .bss 0000000000000030 Base _ZN3amd3smi6Device19devInfoTypesStringsE
00000000003c9c68 g DO .data.rel.ro 0000000000000008 Base devInfoTypesStrings

Change-Id: Ib2f2db32b6abd7ebe84e7807c25581461eb86bae
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
Этот коммит содержится в:
@@ -44,10 +44,6 @@ message("Package version: ${VERSION_STRING}")
|
||||
# which requires the gRPC
|
||||
option(BUILD_STANDALONE "Build targets for rdci and rdcd" ON)
|
||||
|
||||
# When cmake -DBUILD_RASLIB=off, it will not build the raslib
|
||||
# which requires the ROCT-Thunk-Interface.
|
||||
option(BUILD_RASLIB "Build targets for raslib" OFF)
|
||||
|
||||
# When cmake -DBUILD_RUNTIME=off, it will not build the librdc_rocr.so
|
||||
# which requires the Rocm run time.
|
||||
option(BUILD_RUNTIME "Build targets for librdc_rocr.so" ON)
|
||||
@@ -149,12 +145,6 @@ if(BUILD_STANDALONE AND GRPC_ROOT STREQUAL GRPC_ROOT_DEFAULT)
|
||||
Continuing without gRPC install")
|
||||
endif()
|
||||
|
||||
if(NOT EXISTS "${CMAKE_SOURCE_DIR}/raslib/.git" AND BUILD_RASLIB)
|
||||
message(FATAL_ERROR "The git submodule raslib is not available. Please run
|
||||
git submodule update --init --recursive
|
||||
If you do not want to build raslib, use cmake -DBUILD_RASLIB=off")
|
||||
endif()
|
||||
|
||||
find_package(SMI
|
||||
NAMES amd_smi
|
||||
HINTS ${ROCM_DIR}/lib/cmake
|
||||
@@ -167,12 +157,6 @@ if(NOT EXISTS "${SMI_INC_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
|
||||
make sure amd_smi is installed and present in ${SMI_INC_DIR}.")
|
||||
endif()
|
||||
|
||||
if(BUILD_RASLIB AND NOT DEFINED HSA_DIR)
|
||||
message(FATAL_ERROR "Please specify libhsakmt directory which is required by raslib
|
||||
cmake -DHSA_DIR=<libhsakmt directory>
|
||||
If you do not want to build raslib, use cmake -DBUILD_RASLIB=off")
|
||||
endif()
|
||||
|
||||
set(${RDC}_VERSION_MAJOR "${VERSION_MAJOR}")
|
||||
set(${RDC}_VERSION_MINOR "${VERSION_MINOR}")
|
||||
set(${RDC}_VERSION_PATCH "0")
|
||||
@@ -315,20 +299,6 @@ if(BUILD_STANDALONE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Raslib
|
||||
if(BUILD_RASLIB)
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
message(" Build raslib")
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
|
||||
add_subdirectory("raslib")
|
||||
else()
|
||||
add_library(rdc_ras INTERFACE
|
||||
${PROJECT_SOURCE_DIR}/ras_prebuild/librdc_ras.so)
|
||||
# needed for copying the pre-built library
|
||||
get_target_property(RAS_SOURCES rdc_ras SOURCES)
|
||||
endif()
|
||||
|
||||
# Folders for both standalone and embedded
|
||||
add_subdirectory("rdc_libs")
|
||||
|
||||
@@ -377,24 +347,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/example
|
||||
DESTINATION ${RDC_SHARE_INSTALL_PREFIX}
|
||||
COMPONENT dev)
|
||||
|
||||
# Prebuild packages to install
|
||||
install(PROGRAMS ${RAS_SOURCES}
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/${RDC}
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/ras_prebuild/config
|
||||
DESTINATION ${CMAKE_INSTALL_DATADIR}/${RDC}
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
# Don't sp3 grpc install because it floods the terminal
|
||||
set(OLD_CMAKE_INSTALL_MESSAGE ${CMAKE_INSTALL_MESSAGE})
|
||||
set(CMAKE_INSTALL_MESSAGE NEVER)
|
||||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/ras_prebuild/sp3
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/${RDC}
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
# Restore printing verbosity
|
||||
set(CMAKE_INSTALL_MESSAGE ${OLD_CMAKE_INSTALL_MESSAGE})
|
||||
unset(OLD_CMAKE_INSTALL_MESSAGE)
|
||||
|
||||
|
||||
#Identify between SLES and Centos for setting symlink for rdc.service
|
||||
#SLES needs service file in /usr/lib/systemd/system/rdc.service
|
||||
#CENTOS/RHEL Require file in /lib/systemd/system/rdc.service
|
||||
|
||||
@@ -59,13 +59,11 @@ function(create_library_symlink)
|
||||
# Symlink for private libraries
|
||||
set(LIB_RDC_ROCR "librdc_rocr.so")
|
||||
set(LIB_RDC_ROCP "librdc_rocp.so")
|
||||
set(LIB_RDC_RAS "librdc_ras.so")
|
||||
set(LIB_RDC_RVS "librdc_rvs.so")
|
||||
set(LIB_RDC_CLIENT_SMI "librdc_client_smi.so")
|
||||
set(library_files "${LIB_RDC_ROCR}" "${LIB_RDC_ROCR}.${MAJ_VERSION}" "${LIB_RDC_ROCR}.${SO_VERSION}" )
|
||||
set(library_files "${LIB_RDC_ROCP}" "${LIB_RDC_ROCP}.${MAJ_VERSION}" "${LIB_RDC_ROCP}.${SO_VERSION}" )
|
||||
set(library_files "${library_files}" "${LIB_RDC_CLIENT_SMI}" "${LIB_RDC_CLIENT_SMI}.${MAJ_VERSION}" "${LIB_RDC_CLIENT_SMI}.${SO_VERSION}" )
|
||||
set(library_files "${library_files}" "${LIB_RDC_RAS}")
|
||||
set(library_files "${library_files}" "${LIB_RDC_RVS}")
|
||||
|
||||
foreach(file_name ${library_files})
|
||||
|
||||
@@ -43,41 +43,57 @@ FLD_DESC_ENT(RDC_FI_GPU_TEMP, "GPU temperature in millidegrees Celsiu
|
||||
FLD_DESC_ENT(RDC_FI_POWER_USAGE, "Power usage in microwatts", "POWER_USAGE", true)
|
||||
FLD_DESC_ENT(RDC_FI_PCIE_TX, "PCIe Tx utilization in bytes/second", "PCIE_TX", true)
|
||||
FLD_DESC_ENT(RDC_FI_PCIE_RX, "PCIe Rx utilization in bytes/second", "PCIE_RX", true)
|
||||
FLD_DESC_ENT(RDC_FI_PCIE_BANDWIDTH, "PCIe bandwidth in GB/sec", "PCIE_BANDWIDTH", true)
|
||||
FLD_DESC_ENT(RDC_FI_PCIE_BANDWIDTH, "PCIe bandwidth in GB/sec", "PCIE_BANDWIDTH", true)
|
||||
|
||||
FLD_DESC_ENT(RDC_FI_GPU_UTIL, "GPU busy percentage", "GPU_UTIL", true)
|
||||
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_USAGE, "Memory usage of the GPU instance in bytes", "GPU_MEMORY_USAGE", true)
|
||||
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_TOTAL, "Total memory of the GPU instance", "GPU_MEMORY_TOTAL", true)
|
||||
|
||||
// ECC totals
|
||||
FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated Single Error Correction", "ECC_CORRECT", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated Double Error Detection", "ECC_UNCORRECT", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SDMA_SEC, "SDMA Single Error Correction", "ECC_SDMA_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SDMA_DED, "SDMA Double Error Detection", "ECC_SDMA_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_GFX_SEC, "GFX Single Error Correction", "ECC_GFX_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_GFX_DED, "GFX Double Error Detection", "ECC_GFX_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MMHUB_SEC, "MMHUB Single Error Correction", "ECC_MMHUB_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MMHUB_DED, "MMHUB Double Error Detection", "ECC_MMHUB_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_ATHUB_SEC, "ATHUB Single Error Correction", "ECC_ATHUB_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_ATHUB_DED, "ATHUB Double Error Detection", "ECC_ATHUB_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_BIF_SEC, "BIF Single Error Correction", "ECC_BIF_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_BIF_DED, "BIF Double Error Detection", "ECC_BIF_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_HDP_SEC, "HDP Single Error Correction", "ECC_HDP_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_HDP_DED, "HDP Double Error Detection", "ECC_HDP_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_XGMI_WAFL_SEC, "XGMI WAFL Single Error Correction", "ECC_XGMI_WAFL_SEC",true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_XGMI_WAFL_DED, "XGMI WAFL Double Error Detection", "ECC_XGMI_WAFL_DED",true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_DF_SEC, "DF Single Error Correction", "ECC_DF_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_DF_DED, "DF Double Error Detection", "ECC_DF_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SMN_SEC, "SMN Single Error Correction", "ECC_SMN_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SMN_DED, "SMN Double Error Detection", "ECC_SMN_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SEM_SEC, "SEM Single Error Correction", "ECC_SEM_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SEM_DED, "SEM Double Error Detection", "ECC_SEM_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MP0_SEC, "MP0 Single Error Correction", "ECC_MP0_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MP0_DED, "MP0 Double Error Detection", "ECC_MP0_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MP1_SEC, "MP1 Single Error Correction", "ECC_MP1_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MP1_DED, "MP1 Double Error Detection", "ECC_MP1_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_FUSE_SEC, "FUSE Single Error Correction", "ECC_FUSE_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_FUSE_DED, "FUSE Double Error Detection", "ECC_FUSE_DED", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_UMC_SEC, "UMC Single Error Correction", "ECC_UMC_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_UMC_DED, "UMC Double Error Detection", "ECC_UMC_DED", true)
|
||||
|
||||
// ECC blocks
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SDMA_CE, "SDMA Correctable Error", "ECC_SDMA_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SDMA_UE, "SDMA Uncorrectable Error", "ECC_SDMA_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_GFX_CE, "GFX Correctable Error", "ECC_GFX_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_GFX_UE, "GFX Uncorrectable Error", "ECC_GFX_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MMHUB_CE, "MMHUB Correctable Error", "ECC_MMHUB_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MMHUB_UE, "MMHUB Uncorrectable Error", "ECC_MMHUB_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_ATHUB_CE, "ATHUB Correctable Error", "ECC_ATHUB_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_ATHUB_UE, "ATHUB Uncorrectable Error", "ECC_ATHUB_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_PCIE_BIF_CE, "PCIE_BIF Correctable Error", "ECC_PCIE_BIF_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_PCIE_BIF_UE, "PCIE_BIF Uncorrectable Error", "ECC_PCIE_BIF_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_HDP_CE, "HDP Correctable Error", "ECC_HDP_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_HDP_UE, "HDP Uncorrectable Error", "ECC_HDP_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_XGMI_WAFL_CE, "XGMI_WAFL Correctable Error", "ECC_XGMI_WAFL_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_XGMI_WAFL_UE, "XGMI_WAFL Uncorrectable Error", "ECC_XGMI_WAFL_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_DF_CE, "DF Correctable Error", "ECC_DF_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_DF_UE, "DF Uncorrectable Error", "ECC_DF_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SMN_CE, "SMN Correctable Error", "ECC_SMN_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SMN_UE, "SMN Uncorrectable Error", "ECC_SMN_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SEM_CE, "SEM Correctable Error", "ECC_SEM_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_SEM_UE, "SEM Uncorrectable Error", "ECC_SEM_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MP0_CE, "MP0 Correctable Error", "ECC_MP0_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MP0_UE, "MP0 Uncorrectable Error", "ECC_MP0_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MP1_CE, "MP1 Correctable Error", "ECC_MP1_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MP1_UE, "MP1 Uncorrectable Error", "ECC_MP1_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_FUSE_CE, "FUSE Correctable Error", "ECC_FUSE_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_FUSE_UE, "FUSE Uncorrectable Error", "ECC_FUSE_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_UMC_CE, "UMC Correctable Error", "ECC_UMC_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_UMC_UE, "UMC Uncorrectable Error", "ECC_UMC_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MCA_CE, "MCA Correctable Error", "ECC_MCA_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MCA_UE, "MCA Uncorrectable Error", "ECC_MCA_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_VCN_CE, "VCN Correctable Error", "ECC_VCN_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_VCN_UE, "VCN Uncorrectable Error", "ECC_VCN_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_JPEG_CE, "JPEG Correctable Error", "ECC_JPEG_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_JPEG_UE, "JPEG Uncorrectable Error", "ECC_JPEG_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_IH_CE, "IH Correctable Error", "ECC_IH_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_IH_UE, "IH Uncorrectable Error", "ECC_IH_UE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MPIO_CE, "MPIO Correctable Error", "ECC_MPIO_CE", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_MPIO_UE, "MPIO Uncorrectable Error", "ECC_MPIO_UE", true)
|
||||
|
||||
// XGMI
|
||||
FLD_DESC_ENT(RDC_FI_XGMI_0_READ_KB, "XGMI0 accumulated data read size (KB)", "XGMI_0_READ", true)
|
||||
FLD_DESC_ENT(RDC_FI_XGMI_1_READ_KB, "XGMI1 accumulated data read size (KB)", "XGMI_1_READ", true)
|
||||
FLD_DESC_ENT(RDC_FI_XGMI_2_READ_KB, "XGMI2 accumulated data read size (KB)", "XGMI_2_READ", true)
|
||||
|
||||
+47
-41
@@ -185,47 +185,46 @@ typedef enum {
|
||||
RDC_FI_ECC_CORRECT_TOTAL = 600, //!< Accumulated correctable ECC errors
|
||||
RDC_FI_ECC_UNCORRECT_TOTAL, //!< Accumulated uncorrectable ECC errors
|
||||
|
||||
RDC_FI_ECC_SDMA_SEC, //!< SDMA Single Error Correction
|
||||
RDC_FI_ECC_SDMA_DED, //!< SDMA Double Error Detection
|
||||
|
||||
RDC_FI_ECC_GFX_SEC, //!< GFX Single Error Correction
|
||||
RDC_FI_ECC_GFX_DED, //!< GFX Double Error Detection
|
||||
|
||||
RDC_FI_ECC_MMHUB_SEC, //!< MMHUB Single Error Correction
|
||||
RDC_FI_ECC_MMHUB_DED, //!< MMHUB Double Error Detection
|
||||
|
||||
RDC_FI_ECC_ATHUB_SEC, //!< ATHUB Single Error Correction
|
||||
RDC_FI_ECC_ATHUB_DED, //!< ATHUB Double Error Detection
|
||||
|
||||
RDC_FI_ECC_BIF_SEC, //!< BIF Single Error Correction
|
||||
RDC_FI_ECC_BIF_DED, //!< BIF Double Error Detection
|
||||
|
||||
RDC_FI_ECC_HDP_SEC, //!< HDP Single Error Correction
|
||||
RDC_FI_ECC_HDP_DED, //!< HDP Double Error Detection
|
||||
|
||||
RDC_FI_ECC_XGMI_WAFL_SEC, //!< XGMI WAFL Single Error Correction
|
||||
RDC_FI_ECC_XGMI_WAFL_DED, //!< XGMI WAFL Double Error Detection
|
||||
|
||||
RDC_FI_ECC_DF_SEC, //!< DF Single Error Correction
|
||||
RDC_FI_ECC_DF_DED, //!< DF Double Error Detection
|
||||
|
||||
RDC_FI_ECC_SMN_SEC, //!< SMN Single Error Correction
|
||||
RDC_FI_ECC_SMN_DED, //!< SMN Double Error Detection
|
||||
|
||||
RDC_FI_ECC_SEM_SEC, //!< SEM Single Error Correction
|
||||
RDC_FI_ECC_SEM_DED, //!< SEM Double Error Detection
|
||||
|
||||
RDC_FI_ECC_MP0_SEC, //!< MP0 Single Error Correction
|
||||
RDC_FI_ECC_MP0_DED, //!< MP0 Double Error Detection
|
||||
|
||||
RDC_FI_ECC_MP1_SEC, //!< MP1 Single Error Correction
|
||||
RDC_FI_ECC_MP1_DED, //!< MP1 Double Error Detection
|
||||
|
||||
RDC_FI_ECC_FUSE_SEC, //!< FUSE Single Error Correction
|
||||
RDC_FI_ECC_FUSE_DED, //!< FUSE Double Error Detection
|
||||
|
||||
RDC_FI_ECC_UMC_SEC, //!< UMC Single Error Correction
|
||||
RDC_FI_ECC_UMC_DED, //!< UMC Double Error Detection
|
||||
RDC_FI_ECC_FIRST = 602, //!< FIRST Error Correction and Detection field
|
||||
RDC_FI_ECC_SDMA_CE = RDC_FI_ECC_FIRST,
|
||||
RDC_FI_ECC_SDMA_UE,
|
||||
RDC_FI_ECC_GFX_CE,
|
||||
RDC_FI_ECC_GFX_UE,
|
||||
RDC_FI_ECC_MMHUB_CE,
|
||||
RDC_FI_ECC_MMHUB_UE,
|
||||
RDC_FI_ECC_ATHUB_CE,
|
||||
RDC_FI_ECC_ATHUB_UE,
|
||||
RDC_FI_ECC_PCIE_BIF_CE,
|
||||
RDC_FI_ECC_PCIE_BIF_UE,
|
||||
RDC_FI_ECC_HDP_CE,
|
||||
RDC_FI_ECC_HDP_UE,
|
||||
RDC_FI_ECC_XGMI_WAFL_CE,
|
||||
RDC_FI_ECC_XGMI_WAFL_UE,
|
||||
RDC_FI_ECC_DF_CE,
|
||||
RDC_FI_ECC_DF_UE,
|
||||
RDC_FI_ECC_SMN_CE,
|
||||
RDC_FI_ECC_SMN_UE,
|
||||
RDC_FI_ECC_SEM_CE,
|
||||
RDC_FI_ECC_SEM_UE,
|
||||
RDC_FI_ECC_MP0_CE,
|
||||
RDC_FI_ECC_MP0_UE,
|
||||
RDC_FI_ECC_MP1_CE,
|
||||
RDC_FI_ECC_MP1_UE,
|
||||
RDC_FI_ECC_FUSE_CE,
|
||||
RDC_FI_ECC_FUSE_UE,
|
||||
RDC_FI_ECC_UMC_CE,
|
||||
RDC_FI_ECC_UMC_UE,
|
||||
RDC_FI_ECC_MCA_CE,
|
||||
RDC_FI_ECC_MCA_UE,
|
||||
RDC_FI_ECC_VCN_CE,
|
||||
RDC_FI_ECC_VCN_UE,
|
||||
RDC_FI_ECC_JPEG_CE,
|
||||
RDC_FI_ECC_JPEG_UE,
|
||||
RDC_FI_ECC_IH_CE,
|
||||
RDC_FI_ECC_IH_UE,
|
||||
RDC_FI_ECC_MPIO_CE,
|
||||
RDC_FI_ECC_MPIO_UE,
|
||||
RDC_FI_ECC_LAST = RDC_FI_ECC_MPIO_UE,
|
||||
|
||||
// In new ASICs, such as MI300, the XGMI events are not supported
|
||||
// Using below XGMI related fields to calculate the bandwidth.
|
||||
@@ -320,6 +319,13 @@ typedef enum {
|
||||
|
||||
RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_RING_HANG,
|
||||
} rdc_field_t;
|
||||
|
||||
// even and odd numbers are used for correctable and uncorrectable errors
|
||||
static_assert(RDC_FI_ECC_SDMA_CE % 2 == 0, "Correctable Error enum is not even");
|
||||
static_assert(RDC_FI_ECC_SDMA_UE % 2 == 1, "Uncorrectable Error enum is not odd");
|
||||
static_assert(RDC_FI_ECC_MPIO_CE % 2 == 0, "Correctable Error enum is not even");
|
||||
static_assert(RDC_FI_ECC_MPIO_UE % 2 == 1, "Uncorrectable Error enum is not odd");
|
||||
|
||||
#define RDC_EVNT_IS_NOTIF_FIELD(FIELD) \
|
||||
((FIELD) >= RDC_EVNT_NOTIF_FIRST && (FIELD) <= RDC_EVNT_NOTIF_LAST)
|
||||
/**
|
||||
|
||||
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
#ifndef INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_
|
||||
#define INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_
|
||||
|
||||
// The telemetry interface for libraries, for example, RAS.
|
||||
// The telemetry interface for libraries, for example, AMD-SMI.
|
||||
#include <rdc/rdc.h>
|
||||
|
||||
extern "C" {
|
||||
|
||||
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
#ifndef INCLUDE_RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_
|
||||
#define INCLUDE_RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_
|
||||
|
||||
// The telemetry interface for libraries, for example, RAS.
|
||||
// The telemetry interface for libraries, for example, AMD-SMI.
|
||||
#include <rdc/rdc.h>
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
@@ -84,7 +84,8 @@ class RdcMetricFetcherImpl final : public RdcMetricFetcher {
|
||||
std::shared_ptr<FieldSMIData> get_smi_data(RdcFieldKey key);
|
||||
|
||||
uint64_t now();
|
||||
void get_ecc_error(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
|
||||
void get_ecc(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
|
||||
void get_ecc_total(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
|
||||
|
||||
//!< return true if starting async_get
|
||||
bool async_get_pcie_throughput(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
|
||||
|
||||
@@ -1,89 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_
|
||||
#define INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "rdc_lib/RdcDiagnostic.h"
|
||||
#include "rdc_lib/RdcLibraryLoader.h"
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
class RdcRasLib : public RdcTelemetry, public RdcDiagnostic {
|
||||
public:
|
||||
// get support field ids
|
||||
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
|
||||
uint32_t* field_count) override;
|
||||
|
||||
// Fetch
|
||||
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count,
|
||||
rdc_field_value_f callback, void* user_data) override;
|
||||
|
||||
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) override;
|
||||
|
||||
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) override;
|
||||
|
||||
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
|
||||
uint32_t* test_case_count) override;
|
||||
|
||||
// Run a specific test case
|
||||
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) override;
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags) override;
|
||||
rdc_status_t rdc_diag_destroy() override;
|
||||
|
||||
RdcRasLib();
|
||||
|
||||
~RdcRasLib();
|
||||
|
||||
private:
|
||||
RdcLibraryLoader lib_loader_;
|
||||
rdc_status_t (*fields_value_get_)(rdc_gpu_field_t*, uint32_t, rdc_field_value_f, void*);
|
||||
rdc_status_t (*fields_query_)(uint32_t[MAX_NUM_FIELDS], uint32_t*);
|
||||
|
||||
rdc_status_t (*fields_watch_)(rdc_gpu_field_t*, uint32_t);
|
||||
rdc_status_t (*fields_unwatch_)(rdc_gpu_field_t*, uint32_t);
|
||||
|
||||
rdc_status_t (*rdc_module_init_)(uint64_t);
|
||||
rdc_status_t (*rdc_module_destroy_)();
|
||||
};
|
||||
typedef std::shared_ptr<RdcRasLib> RdcRasLibPtr;
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_
|
||||
@@ -29,7 +29,6 @@ THE SOFTWARE.
|
||||
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
|
||||
namespace amd {
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -1,26 +0,0 @@
|
||||
{
|
||||
"version": "0.0.1",
|
||||
"devices": [
|
||||
{
|
||||
"name": "VEGA20",
|
||||
"ids": [ "0x66A0", "0x66A1", "0x66A2", "0x66A3", "0x66A4", "0x66A7", "0x66AF" ],
|
||||
"config": "vega20.json",
|
||||
"gfx": "libgfx9.so",
|
||||
"sdma": "libsdma4.so"
|
||||
},
|
||||
{
|
||||
"name": "ARCTURUS",
|
||||
"ids": [ "0x738C", "0x7388", "0x738E" ],
|
||||
"config": "arcturus.json",
|
||||
"gfx": "libgfx9.so",
|
||||
"sdma": "libsdma4.so"
|
||||
},
|
||||
{
|
||||
"name": "SIENNA_CICHLID",
|
||||
"ids": [ "0x73A0", "0x73A2", "0x73A3", "0x73AB", "0x73AE", "0x73BF" ],
|
||||
"config": "sienna_cichlid.json",
|
||||
"gfx": "libgfx10.so",
|
||||
"sdma": "libsdma5.so"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,34 +0,0 @@
|
||||
{
|
||||
"version": "0.0.1",
|
||||
"type": {
|
||||
"parity": 1,
|
||||
"single_correctable": 2,
|
||||
"multi_uncorrectable": 4,
|
||||
"poison": 8
|
||||
},
|
||||
"block": {
|
||||
"umc": {
|
||||
"index": 0,
|
||||
"support": 1,
|
||||
"type": [
|
||||
"single_correctable",
|
||||
"multi_uncorrectable",
|
||||
"poison"
|
||||
]
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"name": "ras_umc.0.2",
|
||||
"block": "umc",
|
||||
"type": "single_correctable",
|
||||
"nullDispatchCS": "sp3/gfx10/edc/bin/sienna_cichlid/gc_edc_sqc_inst_bank_snop.bin"
|
||||
},
|
||||
{
|
||||
"name": "ras_umc.0.4",
|
||||
"block": "umc",
|
||||
"type": "multi_uncorrectable",
|
||||
"nullDispatchCS": "sp3/gfx10/edc/bin/sienna_cichlid/gc_edc_sqc_inst_bank_snop.bin"
|
||||
}
|
||||
]
|
||||
}
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Двоичные данные
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
@@ -1,31 +0,0 @@
|
||||
shader main
|
||||
asic(GFX10)
|
||||
wave_size(32)
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters
|
||||
|
||||
//s[0:1] the mmeory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//vo for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
for var i = 0; i < 1000; i++
|
||||
s_nop 0x1
|
||||
end
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
user_sgpr_count(0)
|
||||
|
||||
// Clear ACC VGPR
|
||||
for var vgpr = 0; vgpr < 256; ++vgpr
|
||||
v_accvgpr_write acc[vgpr], 0
|
||||
end
|
||||
|
||||
s_movk_i32 m0, 0x0000
|
||||
s_mov_b32 s10, 0x000000f8
|
||||
s_set_gpr_idx_on s10, 0x8
|
||||
label_0004:
|
||||
v_mov_b32 v0, 0
|
||||
v_mov_b32 v1, 0
|
||||
v_mov_b32 v2, 0
|
||||
v_mov_b32 v3, 0
|
||||
v_mov_b32 v4, 0
|
||||
v_mov_b32 v5, 0
|
||||
v_mov_b32 v6, 0
|
||||
v_mov_b32 v7, 0
|
||||
s_sub_u32 s10, s10, 8
|
||||
s_set_gpr_idx_idx s10
|
||||
s_cbranch_scc0 label_0004
|
||||
s_set_gpr_idx_off
|
||||
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
|
||||
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
|
||||
v_mul_u32_u24 v1, 8, v1
|
||||
s_getreg_b32 s11, hwreg(HW_REG_HW_ID, 4, 2)
|
||||
s_mulk_i32 s11, 0x4000
|
||||
v_add_co_u32 v1, vcc, v1, s11
|
||||
s_mov_b32 s10, 7
|
||||
s_mov_b32 m0, -1
|
||||
label_001B:
|
||||
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
|
||||
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
|
||||
v_add_co_u32 v1, vcc, 0x00000800, v1
|
||||
s_sub_u32 s10, s10, 1
|
||||
s_cbranch_scc0 label_001B
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
@@ -1,113 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
user_sgpr_count(8) // 2 for the buffer resource + 5 for thread/thread group parameters
|
||||
//s[0:1] the mmeory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//vo for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
// Clear ACC VGPR
|
||||
for var vgpr = 0; vgpr < 256; ++vgpr
|
||||
v_accvgpr_write acc[vgpr], 0
|
||||
end
|
||||
|
||||
//sp3 loop for lifetime
|
||||
s_mov_b32 s12, 0 //init loop idx s12
|
||||
label_0001:
|
||||
s_cmp_lt_i32 s12, s8 //scc = (s12 < s8) ? 1 : 0
|
||||
s_cbranch_scc0 label_0006 //if(scc == 0) then jump to label_0006; else nop
|
||||
v_mov_b32 v4,s12
|
||||
s_add_i32 s12, s12, 1 //add loop incr
|
||||
s_branch label_0001
|
||||
label_0006: //end of SP3 loop
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
s_load_dwordx4 s[40:43], s[0:1], 0x20
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
// Clear VGPR and LDS
|
||||
s_movk_i32 m0, 0x0000
|
||||
s_mov_b32 s12, 0x000000f8
|
||||
s_set_gpr_idx_on s12, 0x8
|
||||
label_0004:
|
||||
v_mov_b32 v0, 0
|
||||
v_mov_b32 v1, 0
|
||||
v_mov_b32 v2, 0
|
||||
v_mov_b32 v3, 0
|
||||
v_mov_b32 v4, 0
|
||||
v_mov_b32 v5, 0
|
||||
v_mov_b32 v6, 0
|
||||
v_mov_b32 v7, 0
|
||||
s_sub_u32 s12, s12, 8
|
||||
s_set_gpr_idx_idx s12
|
||||
s_cbranch_scc0 label_0004
|
||||
s_set_gpr_idx_off
|
||||
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
|
||||
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
|
||||
v_mul_u32_u24 v1, 8, v1
|
||||
s_getreg_b32 s13, hwreg(HW_REG_HW_ID, 4, 2)
|
||||
s_mulk_i32 s13, 0x4000
|
||||
v_add_co_u32 v1, vcc, v1, s13
|
||||
s_mov_b32 s12, 7
|
||||
s_mov_b32 m0, -1
|
||||
label_001B:
|
||||
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
|
||||
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
|
||||
v_add_co_u32 v1, vcc, 0x00000800, v1
|
||||
s_sub_u32 s12, s12, 1
|
||||
s_cbranch_scc0 label_001B
|
||||
|
||||
// Save coverage in the memory
|
||||
s_getreg_b32 s20, hwreg(HW_REG_HW_ID, 0, 32)
|
||||
// s12 = SIMD
|
||||
s_lshr_b32 s12,s20,4
|
||||
s_and_b32 s12, s12, 0x3
|
||||
// s13 = CU
|
||||
s_lshr_b32 s13,s20,8
|
||||
s_and_b32 s13, s13, 0xf
|
||||
// s14 = SE
|
||||
s_lshr_b32 s14,s20,13
|
||||
s_and_b32 s14, s14, 0x7
|
||||
// s15 = SE * 16 * 4 + CU * 4 + SIMD
|
||||
s_mul_i32 s16, s14, 64
|
||||
s_mul_i32 s17, s13, 4
|
||||
s_add_i32 s15, s16, s17
|
||||
s_add_i32 s15, s15, s12
|
||||
s_mul_i32 s16, s15, 4
|
||||
|
||||
s_buffer_store_dword s15, s24, s16 glc
|
||||
s_waitcnt 0
|
||||
|
||||
s_buffer_load_dword s17, s24, s16 glc
|
||||
s_waitcnt 0
|
||||
s_endpgm
|
||||
end
|
||||
@@ -1,59 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // 2 for the buffer resource + 5 for thread/thread group parameters
|
||||
|
||||
//s[0:1] the mmeory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//vo for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//read mem data
|
||||
s_mov_b32 s31, 0x0
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
v_mov_b32 v10, v0
|
||||
//buffer_load_dword v10, v9, s24, s31 idxen:1 glc:1
|
||||
//s_waitcnt 0
|
||||
//v_mov_b32 v11, v1
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
|
||||
s_mov_b32 s16, 0xa5a50000
|
||||
s_store_dword s16, s[0:1], 0x40 glc
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,60 +0,0 @@
|
||||
shader main //Compute shader: derives per-thread ids, then reads one dword per thread from GDS (no result is stored to memory)
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // 2 for the buffer resource address (s[0:1]) + 6 for thread/thread group parameters (s2..s7)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset (not referenced by any instruction in this shader)
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0 //load 4 dwords from the address in s[0:1] (offset 0) into s[24:27]; s[24:27] is not read again in this shader
|
||||
s_waitcnt 0 //wait for outstanding operations (the s_load above) to complete
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0 //v3 = tid_y*x + tid_x
|
||||
v_mad_u32_u24 v3, v2, s3, v3 //v3 = tid_z*(x*y) + v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5 //s28 = tgid_y*X
|
||||
s_add_i32 s28, s28, s_tgid_x //s28 = tgid_y*X + tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z //s29 = (X*Y)*tgid_z
|
||||
s_add_i32 s28, s29, s28 //s28 = thread_group_id
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28 //v9 = thread_group_id
|
||||
v_mad_u32_u24 v9, v9, s4, v3 //v9 = thread_group_id*(x*y*z) + thread_id_in_group; v9 is not read again in this shader
|
||||
|
||||
|
||||
//read from the GDS
|
||||
v_lshlrev_b32 v10, 2, v3 //v10 = thread_id_in_group << 2, i.e. a byte offset of 4 bytes per thread
|
||||
s_mov_b32 m0, 0xFFFF //NOTE(review): presumably M0 supplies the size/bounds limit for the DS access below - confirm against the ISA guide
|
||||
s_nop 0x1 //NOTE(review): presumably wait states between the M0 write and the DS instruction - confirm
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
|
||||
ds_read_b32 v11, v10 gds:1 //read one dword from GDS at address v10 into v11
|
||||
s_waitcnt 0 //wait for the GDS read to return before v11 is used
|
||||
|
||||
v_mov_b32 v12, v11 //copy the value read into v12; nothing in this shader consumes v12 afterwards
|
||||
|
||||
s_nop 0x1 //NOTE(review): four s_nop before exit; their purpose is not evident from this file
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
|
||||
s_endpgm //end of program
|
||||
end
|
||||
|
||||
@@ -1,673 +0,0 @@
|
||||
shader main
|
||||
|
||||
type(CS)
|
||||
|
||||
/*************************************************************************/
|
||||
/* control on how to run the shader */
|
||||
/*************************************************************************/
|
||||
//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in the EMU run)
|
||||
var EMU_RUN_HACK = 1
|
||||
var EMU_RUN_HACK_RESTORE_NORMAL = 0
|
||||
var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
|
||||
var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
|
||||
var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
|
||||
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
|
||||
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
|
||||
var SAVE_LDS = 0
|
||||
var WG_BASE_ADDR_LO = 0x9000a000
|
||||
var WG_BASE_ADDR_HI = 0x0
|
||||
var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
|
||||
var CTX_SAVE_CONTROL = 0x0
|
||||
var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
|
||||
var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code is not ready or there is no compute save & restore in the RTL run)
|
||||
var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write
|
||||
var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
|
||||
var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
|
||||
|
||||
/**************************************************************************/
|
||||
/* variables */
|
||||
/**************************************************************************/
|
||||
var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
|
||||
var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
|
||||
|
||||
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
|
||||
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
|
||||
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
|
||||
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
|
||||
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
|
||||
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
|
||||
|
||||
var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
|
||||
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
|
||||
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
|
||||
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
|
||||
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
|
||||
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
|
||||
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
|
||||
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
|
||||
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
|
||||
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
|
||||
|
||||
var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
|
||||
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
|
||||
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
|
||||
|
||||
var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
|
||||
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
|
||||
|
||||
|
||||
/* Save */
|
||||
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
|
||||
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
|
||||
|
||||
var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
|
||||
var S_SAVE_SPI_INIT_ATC_SHIFT = 27
|
||||
var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
|
||||
var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
|
||||
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
|
||||
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
|
||||
|
||||
var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
|
||||
var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
|
||||
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
|
||||
var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
|
||||
|
||||
var s_save_spi_init_lo = exec_lo
|
||||
var s_save_spi_init_hi = exec_hi
|
||||
|
||||
//tba_lo and tba_hi need to be saved/restored
|
||||
var tba_lo = ttmp12
|
||||
var tba_hi = ttmp13
|
||||
var tma_lo = ttmp14
|
||||
var tma_hi = ttmp15
|
||||
|
||||
var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
|
||||
var s_save_pc_hi = ttmp1
|
||||
var s_save_exec_lo = ttmp2
|
||||
var s_save_exec_hi = ttmp3
|
||||
var s_save_status = ttmp4
|
||||
var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
|
||||
var s_save_xnack_mask_lo = ttmp6
|
||||
var s_save_xnack_mask_hi = ttmp7
|
||||
var s_save_buf_rsrc0 = ttmp8
|
||||
var s_save_buf_rsrc1 = ttmp9
|
||||
var s_save_buf_rsrc2 = ttmp10
|
||||
var s_save_buf_rsrc3 = ttmp11
|
||||
|
||||
var s_save_mem_offset = tma_lo
|
||||
var s_save_alloc_size = s_save_trapsts //conflict
|
||||
var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
|
||||
var s_save_m0 = tma_hi
|
||||
|
||||
/* Restore */
|
||||
var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
|
||||
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
|
||||
|
||||
var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
|
||||
var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
|
||||
var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
|
||||
var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
|
||||
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
|
||||
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
|
||||
|
||||
var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
|
||||
var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
|
||||
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
|
||||
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
|
||||
|
||||
var s_restore_spi_init_lo = exec_lo
|
||||
var s_restore_spi_init_hi = exec_hi
|
||||
|
||||
var s_restore_mem_offset = ttmp2
|
||||
var s_restore_alloc_size = ttmp3
|
||||
var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored
|
||||
var s_restore_mem_offset_save = s_restore_tmp //no conflict
|
||||
|
||||
var s_restore_m0 = s_restore_alloc_size //no conflict
|
||||
|
||||
var s_restore_mode = ttmp7
|
||||
|
||||
var s_restore_pc_lo = ttmp0
|
||||
var s_restore_pc_hi = ttmp1
|
||||
var s_restore_exec_lo = tma_lo //no conflict
|
||||
var s_restore_exec_hi = tma_hi //no conflict
|
||||
var s_restore_status = ttmp4
|
||||
var s_restore_trapsts = ttmp5
|
||||
var s_restore_xnack_mask_lo = xnack_mask_lo
|
||||
var s_restore_xnack_mask_hi = xnack_mask_hi
|
||||
var s_restore_buf_rsrc0 = ttmp8
|
||||
var s_restore_buf_rsrc1 = ttmp9
|
||||
var s_restore_buf_rsrc2 = ttmp10
|
||||
var s_restore_buf_rsrc3 = ttmp11
|
||||
|
||||
/**************************************************************************/
|
||||
/* trap handler entry points */
|
||||
/**************************************************************************/
|
||||
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
|
||||
//FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
|
||||
s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
|
||||
s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
|
||||
s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
|
||||
//FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
|
||||
s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
|
||||
else
|
||||
s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
|
||||
end
|
||||
|
||||
L_JUMP_TO_RESTORE:
|
||||
s_branch L_RESTORE //restore
|
||||
|
||||
L_SKIP_RESTORE:
|
||||
|
||||
s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
|
||||
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
|
||||
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
|
||||
s_cbranch_scc1 L_SAVE //this is the operation for save
|
||||
//the potential code (such as restoring STATUS) on this path is for regular trap handling and is not relevant to compute save & restore
|
||||
|
||||
//EMU will not execute the code since in hack mode it is skipped while in normal mode there is no save in EMU
|
||||
//SIM will only execute the code in normal S/R mode but not in hack mode
|
||||
if (!EMU_RUN_HACK)
|
||||
L_ERROR: //to catch incorrect savectx setting in SIM assuming the trap handler is only used for save & restore
|
||||
s_branch L_ERROR
|
||||
end
|
||||
|
||||
/**************************************************************************/
|
||||
/* save routine */
|
||||
/**************************************************************************/
|
||||
|
||||
L_SAVE:
|
||||
|
||||
//check whether there is mem_viol
|
||||
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
|
||||
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
|
||||
s_cbranch_scc0 L_NO_PC_REWIND
|
||||
|
||||
//if so, need rewind PC assuming GDS operation gets NACKed
|
||||
s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
|
||||
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
|
||||
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
|
||||
s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
|
||||
s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
|
||||
|
||||
L_NO_PC_REWIND:
|
||||
s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
|
||||
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
|
||||
|
||||
s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
|
||||
s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi
|
||||
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
|
||||
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
|
||||
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
|
||||
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
|
||||
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
|
||||
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
|
||||
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
|
||||
s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
|
||||
|
||||
s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
|
||||
|
||||
/* inform SPI the readiness and wait for SPI's go signal */
|
||||
s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
|
||||
s_mov_b32 s_save_exec_hi, exec_hi
|
||||
s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
|
||||
if (EMU_RUN_HACK)
|
||||
|
||||
else
|
||||
s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
|
||||
end
|
||||
|
||||
L_SLEEP:
|
||||
s_sleep 0x2
|
||||
|
||||
if (EMU_RUN_HACK)
|
||||
|
||||
else
|
||||
s_cbranch_execz L_SLEEP
|
||||
end
|
||||
|
||||
|
||||
/* setup Resource Constants */
|
||||
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
|
||||
//calculate wd_addr using absolute thread id
|
||||
v_readlane_b32 s_save_tmp, v9, 0
|
||||
s_lshr_b32 s_save_tmp, s_save_tmp, 6
|
||||
s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
|
||||
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
|
||||
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
|
||||
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
|
||||
else
|
||||
end
|
||||
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
|
||||
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
|
||||
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
|
||||
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
|
||||
else
|
||||
end
|
||||
|
||||
|
||||
s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
|
||||
s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
|
||||
s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
|
||||
s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
|
||||
s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
|
||||
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
|
||||
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
|
||||
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
|
||||
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
|
||||
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
|
||||
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
|
||||
|
||||
//FIXME right now s_save_mem_offset/s_save_m0 use tma_lo/tma_hi (might need to save them before using them?)
|
||||
s_mov_b32 s_save_m0, m0 //save M0
|
||||
|
||||
/* global mem offset */
|
||||
s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
|
||||
|
||||
|
||||
/* the first wave in the threadgroup */
|
||||
s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG"
|
||||
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
|
||||
s_cbranch_scc0 L_SAVE_VGPR
|
||||
|
||||
/* save LDS */
|
||||
//////////////////////////////
|
||||
L_SAVE_LDS:
|
||||
|
||||
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
|
||||
s_mov_b32 exec_hi, 0xFFFFFFFF
|
||||
|
||||
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
|
||||
s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
|
||||
s_cbranch_scc0 L_SAVE_VGPR //no lds used? jump to L_SAVE_VGPR
|
||||
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
|
||||
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
|
||||
s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
s_mov_b32 m0, 0x0 //lds_offset initial value = 0
|
||||
|
||||
L_SAVE_LDS_LOOP:
|
||||
if (SAVE_LDS)
|
||||
buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1
|
||||
end
|
||||
s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes
|
||||
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes
|
||||
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
|
||||
|
||||
|
||||
/* save VGPRs */
|
||||
//////////////////////////////
|
||||
L_SAVE_VGPR:
|
||||
|
||||
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
|
||||
s_mov_b32 exec_hi, 0xFFFFFFFF
|
||||
|
||||
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
|
||||
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
|
||||
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
|
||||
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
|
||||
s_mov_b32 m0, 0x0 //VGPR initial index value =0
|
||||
s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
|
||||
s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
|
||||
|
||||
L_SAVE_VGPR_LOOP:
|
||||
v_mov_b32 v0, v0 //v0 = v[0+m0]
|
||||
|
||||
if(USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
|
||||
else
|
||||
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
|
||||
end
|
||||
|
||||
s_add_u32 m0, m0, 1 //next vgpr index
|
||||
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes
|
||||
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
|
||||
s_set_gpr_idx_off
|
||||
|
||||
/* save SGPRs */
|
||||
//////////////////////////////
|
||||
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
|
||||
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
|
||||
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
|
||||
|
||||
if (SGPR_SAVE_USE_SQC)
|
||||
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
|
||||
else
|
||||
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
|
||||
end
|
||||
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
|
||||
s_mov_b32 m0, 0x0 //SGPR initial index value =0
|
||||
s_nop 0x0 //Manually inserted wait states
|
||||
|
||||
L_SAVE_SGPR_LOOP:
|
||||
s_movrels_b32 s0, s0 //s0 = s[0+m0]
|
||||
write_sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4
|
||||
s_add_u32 m0, m0, 1 //next sgpr index
|
||||
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
|
||||
|
||||
/* save HW registers */
|
||||
//////////////////////////////
|
||||
L_SAVE_HWREG:
|
||||
s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
|
||||
|
||||
write_sgpr_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0
|
||||
|
||||
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
|
||||
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
|
||||
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
|
||||
s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
|
||||
s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
|
||||
end
|
||||
|
||||
write_sgpr_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC
|
||||
write_sgpr_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
write_sgpr_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC
|
||||
write_sgpr_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
write_sgpr_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS
|
||||
|
||||
//s_save_trapsts conflicts with s_save_alloc_size
|
||||
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
|
||||
write_sgpr_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS
|
||||
|
||||
write_sgpr_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO
|
||||
write_sgpr_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI
|
||||
|
||||
//use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
|
||||
s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
|
||||
write_sgpr_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
|
||||
write_sgpr_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TBA_LO
|
||||
write_sgpr_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TBA_HI
|
||||
|
||||
/* S_PGM_END_SAVED */ //FIXME graphics ONLY
|
||||
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
|
||||
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
|
||||
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
|
||||
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
|
||||
s_rfe_b64 s_save_pc_lo //Return to the main shader program
|
||||
else
|
||||
end
|
||||
|
||||
|
||||
s_branch L_END_PGM
|
||||
|
||||
|
||||
|
||||
/**************************************************************************/
|
||||
/* restore routine */
|
||||
/**************************************************************************/
|
||||
|
||||
L_RESTORE:
|
||||
/* Setup Resource Constants */
|
||||
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
|
||||
//calculate wd_addr using absolute thread id
|
||||
v_readlane_b32 s_restore_tmp, v9, 0
|
||||
s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
|
||||
s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
|
||||
s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
|
||||
s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
|
||||
s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
|
||||
else
|
||||
end
|
||||
|
||||
s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
|
||||
s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
|
||||
s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
|
||||
s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
|
||||
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
|
||||
s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
|
||||
s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
|
||||
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
|
||||
s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
|
||||
s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
|
||||
|
||||
/* global mem offset */
|
||||
s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
|
||||
|
||||
/* the first wave in the threadgroup */
|
||||
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
|
||||
s_cbranch_scc0 L_RESTORE_VGPR
|
||||
|
||||
/* restore LDS */
|
||||
//////////////////////////////
|
||||
L_RESTORE_LDS:
|
||||
|
||||
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
|
||||
s_mov_b32 exec_hi, 0xFFFFFFFF
|
||||
|
||||
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
|
||||
s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
|
||||
s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
|
||||
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
|
||||
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
|
||||
s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
s_mov_b32 m0, 0x0 //lds_offset initial value = 0
|
||||
|
||||
L_RESTORE_LDS_LOOP:
|
||||
if (SAVE_LDS)
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
|
||||
end
|
||||
s_add_u32 m0, m0, 256 //every buffer_load_dword does 256 bytes
|
||||
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
|
||||
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
|
||||
|
||||
|
||||
/* restore VGPRs */
|
||||
//////////////////////////////
|
||||
L_RESTORE_VGPR:
|
||||
|
||||
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
|
||||
s_mov_b32 exec_hi, 0xFFFFFFFF
|
||||
|
||||
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
|
||||
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
|
||||
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
|
||||
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
|
||||
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256
|
||||
s_mov_b32 m0, 1 //VGPR initial index value = 1
|
||||
s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
|
||||
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
|
||||
|
||||
L_RESTORE_VGPR_LOOP:
|
||||
if(USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
|
||||
else
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
|
||||
end
|
||||
s_waitcnt vmcnt(0) //ensure data ready
|
||||
v_mov_b32 v0, v0 //v[0+m0] = v0
|
||||
s_add_u32 m0, m0, 1 //next vgpr index
|
||||
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes
|
||||
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
|
||||
s_set_gpr_idx_off
|
||||
/* VGPR restore on v0 */
|
||||
if(USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
|
||||
else
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
|
||||
end
|
||||
|
||||
|
||||
/* restore SGPRs */
|
||||
//////////////////////////////
|
||||
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
|
||||
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
|
||||
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
|
||||
|
||||
if (SGPR_SAVE_USE_SQC)
|
||||
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
|
||||
else
|
||||
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
|
||||
end
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
read_sgpr_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp
|
||||
s_mov_b32 m0, 0x1 //SGPR initial index value =1 //go on with with s1
|
||||
|
||||
L_RESTORE_SGPR_LOOP:
|
||||
read_sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made
|
||||
s_waitcnt lgkmcnt(0) //ensure data ready
|
||||
s_movreld_b32 s0, s0 //s[0+m0] = s0
|
||||
s_add_u32 m0, m0, 1 //next sgpr index
|
||||
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
|
||||
s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */
|
||||
|
||||
/* restore HW registers */
|
||||
//////////////////////////////
|
||||
L_RESTORE_HWREG:
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
|
||||
read_sgpr_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0
|
||||
read_sgpr_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC
|
||||
read_sgpr_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
|
||||
read_sgpr_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC
|
||||
read_sgpr_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
|
||||
read_sgpr_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS
|
||||
read_sgpr_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS
|
||||
read_sgpr_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO
|
||||
read_sgpr_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI
|
||||
read_sgpr_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE
|
||||
read_sgpr_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TBA_LO
|
||||
read_sgpr_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TBA_HI
|
||||
|
||||
s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
|
||||
|
||||
s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
|
||||
|
||||
//for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
|
||||
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
|
||||
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
|
||||
s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
|
||||
end
|
||||
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
|
||||
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
|
||||
s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
|
||||
end
|
||||
|
||||
s_mov_b32 m0, s_restore_m0
|
||||
s_mov_b32 exec_lo, s_restore_exec_lo
|
||||
s_mov_b32 exec_hi, s_restore_exec_hi
|
||||
|
||||
s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
|
||||
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
|
||||
s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
|
||||
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
|
||||
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
|
||||
//s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
|
||||
s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
|
||||
//reuse s_restore_m0 as a temp register
|
||||
s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
|
||||
s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
|
||||
s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
|
||||
s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
|
||||
s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
|
||||
s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
|
||||
s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
|
||||
s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
|
||||
s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
|
||||
s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
|
||||
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
|
||||
s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
|
||||
s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status
|
||||
|
||||
s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
|
||||
|
||||
|
||||
// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
|
||||
s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
|
||||
|
||||
|
||||
/**************************************************************************/
|
||||
/* the END */
|
||||
/**************************************************************************/
|
||||
L_END_PGM:
|
||||
s_endpgm
|
||||
|
||||
end
|
||||
|
||||
|
||||
/**************************************************************************/
|
||||
/* the helper functions */
|
||||
/**************************************************************************/
|
||||
|
||||
// write_sgpr_to_mem: store one SGPR value to the save area and advance the
// running offset.
//   s            - scalar register holding the value to store
//   s_rsrc       - buffer resource used for the store
//   s_mem_offset - running offset; incremented by this helper (4 bytes on the
//                  scalar path, 256 bytes on both vector paths)
//   use_sqc      - when set, store with s_buffer_store_dword
//   use_mtbuf    - when use_sqc is clear: selects tbuffer_store_format_x
//                  instead of buffer_store_dword
// Side effects: the scalar path temporarily parks m0 in exec_lo (so exec_lo is
// clobbered); both vector paths clobber v0.
function write_sgpr_to_mem(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
    if (use_sqc)
        s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
        s_mov_b32 m0, s_mem_offset // m0 carries the store offset
        s_buffer_store_dword s, s_rsrc, m0 glc:1
        s_add_u32 s_mem_offset, s_mem_offset, 4
        s_mov_b32 m0, exec_lo // restore the caller's m0
    elsif (use_mtbuf)
        v_mov_b32 v0, s
        tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
        s_add_u32 s_mem_offset, s_mem_offset, 256
    else
        v_mov_b32 v0, s
        buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
        s_add_u32 s_mem_offset, s_mem_offset, 256
    end
end
|
||||
|
||||
|
||||
|
||||
// read_sgpr_from_mem: load one dword from the save area into an SGPR and
// advance the running offset.
//   s            - destination scalar register
//   s_rsrc       - buffer resource used for the load
//   s_mem_offset - running offset; incremented by this helper
//   use_sqc      - selects only the stride (4 bytes when set, 256 otherwise);
//                  the load itself is always s_buffer_load_dword
// The load is asynchronous: no s_waitcnt is issued here, so the caller must
// wait on lgkmcnt before consuming the loaded register.
function read_sgpr_from_mem(s, s_rsrc, s_mem_offset, use_sqc)
    s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
    if (use_sqc)
        s_add_u32 s_mem_offset, s_mem_offset, 4
    else
        s_add_u32 s_mem_offset, s_mem_offset, 256
    end
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
// Busy-loop compute shader: loads an iteration limit from the address in
// s[0:1], counts v5 up to that limit, then stores the final count to the
// address in s[2:3].
shader main
    type(CS)

    user_sgpr_count(4)              // s[0:1] = input address, s[2:3] = output address
    v_mov_b32 v0, s0
    v_mov_b32 v1, s1
    v_mov_b32 v2, s2
    v_mov_b32 v3, s3
    flat_load_dword v4, v[0:1] slc  // v4 = iteration limit
    s_waitcnt vmcnt(0)&lgkmcnt(0)
    v_mov_b32 v5, 0                 // v5 = loop counter
    // NOTE(review): 40000 looks larger than the s_sleep immediate field can
    // encode - confirm the effective delay against the ISA guide.
    s_sleep 40000
LOOP:
    v_add_co_u32 v5, vcc, 1, v5
    s_waitcnt vmcnt(0)&lgkmcnt(0)
    v_cmp_lt_u32 vcc, v5, v4        // keep looping while v5 < v4
    s_cbranch_vccnz LOOP
    // Register pair written as a range (v[2:3]) to match the v[0:1] form used
    // by the load above; the source had "v[2,3]".
    flat_store_dword v[2:3], v5
    s_waitcnt vmcnt(0)&lgkmcnt(0)
    s_endpgm
end
|
||||
@@ -1,69 +0,0 @@
|
||||
// Buffer store/load loop: every thread issues two dword stores per iteration
// for s8 iterations, then two dword loads per iteration for s8 iterations,
// stepping the scalar offset by 0x100 after each access.
shader main
    type(CS)

    user_sgpr_count(9)      // s[0:1] buffer resource address + s2..s8 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset
    // s8 loop

    tgid_x_en(1)            // s_tgid_x s9
    tgid_y_en(1)            // s_tgid_y s10
    tgid_z_en(1)            // s_tgid_z s11

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // store and load s8 times
    // NOTE(review): the loops decrement before testing, so s8 == 0 on entry
    // wraps around instead of skipping the loop - confirm callers pass s8 >= 1.
    s_mov_b32 s30, s8       // keep the loop count for the load phase
    s_mov_b32 s31, 0x80     // starting scalar offset

STORE_LOOP:
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x100
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x100
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 STORE_LOOP

    s_mov_b32 s8, s30       // reload the loop count
    s_mov_b32 s31, 0x80     // rewind to the first stored offset

    s_waitcnt 0
LOAD_LOOP:
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x100
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x100
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 LOAD_LOOP

    s_endpgm
end
|
||||
|
||||
@@ -1,131 +0,0 @@
|
||||
// Unrolled buffer store/load: every thread issues 20 dword stores followed by
// 20 dword loads, stepping the scalar offset by 4 after each access.
shader main
    type(CS)

    user_sgpr_count(9)      // s[0:1] buffer resource address + s2..s8 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset
    // s8 loop

    tgid_x_en(1)            // s_tgid_x s9
    tgid_y_en(1)            // s_tgid_y s10
    tgid_z_en(1)            // s_tgid_z s11

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // The access count is fixed by the unrolling below; s8 is only copied to
    // s30 and neither register is read again in this shader.
    s_mov_b32 s30, s8
    s_mov_b32 s31, 0x0      // starting scalar offset

    // 20 unrolled stores, scalar offsets 0x00 .. 0x4c
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4

    s_waitcnt 0

    // 20 unrolled loads.
    // NOTE(review): s31 is not rewound here, so the loads cover scalar offsets
    // 0x50 .. 0x9c rather than the offsets just stored - confirm this is
    // intended (the looped variant of this shader rewinds its offset).
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4

    s_endpgm
end
|
||||
|
||||
@@ -1,61 +0,0 @@
|
||||
// Strided buffer store: each thread scales its absolute id by 0x4000 to form a
// byte offset (offen), stores v0 s8 times with the scalar offset advancing by
// 0x10000 per iteration, then writes the marker 0xa5a50000 at offset 0x40 from
// the address in s[0:1].
shader main
    type(CS)

    user_sgpr_count(9)      // s[0:1] buffer resource address + s2..s8 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset
    // s8 loop

    tgid_x_en(1)            // s_tgid_x s9
    tgid_y_en(1)            // s_tgid_y s10
    tgid_z_en(1)            // s_tgid_z s11

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // turn the thread id into a per-thread byte offset (0x4000 apart)
    s_mov_b32 s32, 0x4000
    v_mul_i32_i24 v9, v9, s32

    // store s8 times (this shader has no load phase; s30 is not read again)
    s_mov_b32 s30, s8
    s_mov_b32 s31, 0x10000

STORE_LOOP:
    buffer_store_dword v0, v9, s24, s31 offen:1
    s_waitcnt 0
    s_add_u32 s31, s31, 0x10000
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 STORE_LOOP

    // completion marker written next to the resource descriptor
    // NOTE(review): no s_waitcnt / s_dcache_wb follows this scalar store before
    // s_endpgm - confirm the write is guaranteed to land.
    s_mov_b32 s16, 0xa5a50000
    s_store_dword s16, s[0:1], 0x40 glc

    s_endpgm
end
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
// GDS write/read loop: each thread loads one dword from the buffer, writes it
// to GDS s8 times (address stepping by 0x10), reads GDS back s8 times, then
// writes the marker 0xa5a50000 at offset 0x40 from the address in s[0:1].
shader main
    type(CS)

    user_sgpr_count(9)      // s[0:1] buffer resource address + s2..s8 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset
    // s8 loop

    tgid_x_en(1)            // s_tgid_x s9
    tgid_y_en(1)            // s_tgid_y s10
    tgid_z_en(1)            // s_tgid_z s11

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // read mem data
    s_mov_b32 s31, 0x0
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
    s_waitcnt 0

    // write it to GDS
    s_mov_b32 s30, s8               // keep the loop count for the read phase
    v_lshlrev_b32 v10, 2, v9        // write address = absolute thread id * 4
    s_mov_b32 m0, 0xFFFF            // presumably the DS address limit - TODO confirm
    s_nop 0x1
    s_nop 0x1
    s_nop 0x1

STORE_LOOP:
    ds_write_b32 v10, v0 gds:1      // GPU hang when GPU access the GDS with GFX queue
    s_waitcnt 0
    v_add_u16 v10, v10, 0x10
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 STORE_LOOP

    s_mov_b32 s8, s30
    // NOTE(review): the read address is built from v3 (thread id in group)
    // while the write address above used v9 (absolute thread id) - confirm
    // the mismatch is intended.
    v_lshlrev_b32 v10, 2, v3

LOAD_LOOP:
    ds_read_b32 v11, v10 gds:1
    s_waitcnt 0
    v_mov_b32 v12, v11
    v_add_u16 v10, v10, 0x10
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 LOAD_LOOP

    // completion marker written next to the resource descriptor
    s_mov_b32 s16, 0xa5a50000
    s_store_dword s16, s[0:1], 0x40 glc

    s_endpgm
end
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
// GDS read-out: each thread reads one dword from GDS at (thread id in group)*4
// and stores it to the buffer at scalar offset s7.
shader main
    type(CS)

    user_sgpr_count(8)      // s[0:1] buffer resource address + s2..s7 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset

    tgid_x_en(1)            // s_tgid_x s8
    tgid_y_en(1)            // s_tgid_y s9
    tgid_z_en(1)            // s_tgid_z s10

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // read data from GDS
    v_lshlrev_b32 v10, 2, v3        // GDS address = thread id in group * 4
    s_mov_b32 m0, 0xFFFF            // presumably the DS address limit - TODO confirm
    s_nop 1
    s_nop 1
    s_nop 1
    ds_read_b32 v11, v10 gds:1
    s_waitcnt 0

    // write the data to memory
    buffer_store_dword v11, v9, s24, s7 idxen:1 glc:1
    s_waitcnt 0

    s_endpgm
end
|
||||
|
||||
@@ -1,68 +0,0 @@
|
||||
// GDS ordered-count shader: reads HW_ID, performs a ds_ordered_count on GDS,
// then writes v0 to a GDS slot computed as
//   (0x80 + 0x200*pipe id + 0x40*ordered count + lane id) * 4.
shader main
    type(CS)

    user_sgpr_count(4)
    tgid_x_en(1)
    tgid_y_en(1)
    tgid_z_en(1)

    s_getreg_b32 s18, hwreg(HW_REG_HW_ID, 0, 32)
    s_bfe_u32 s16, s18, 0x2001e // get meid (s16 is not read again below)
    s_bfe_u32 s17, s18, 0x20006 // get pipeid
    //s_add_u32 s17, s17, s16

    // get ring id
    v_mov_b32 v20, s17
    s_and_b32 s17, s17, 0x7

    // Get thread_id inside wave
    v_mbcnt_lo_u32_b32 v8, 0xffffffff, 0
    v_mbcnt_hi_u32_b32 v9, 0xffffffff, v8

    s_waitcnt 0

    // init: gds write address
    v_mov_b32 v13, 0

    // the first 128DW is for ordered-append counter
    v_mov_b32 v14, 0x80

    // offset ring
    v_mov_b32 v15, 0x200

    v_mul_lo_u32 v15, v15, v20 // ring offset
    v_mov_b32 v16, 0x40 // wave_size

    // v18 = v1*s1 + v0; v17 = v18 >> 6.
    // NOTE(review): v17 is never read and v18 is overwritten after the
    // ordered count - this looks like leftover code, confirm.
    v_mul_lo_u32 v18, v1, s1
    v_add_co_u32 v18, vcc, v18, v0
    v_lshrrev_b32 v17,6 ,v18

    // build m0 for the ordered count: ((s12 >> 6) & 0x7ff) | (pipe id << 18)
    // NOTE(review): s12 is outside the 4 user SGPRs and the 3 tgid SGPRs
    // declared above - confirm what it holds at dispatch.
    s_mov_b32 s9, s12
    s_lshr_b32 s9, s9, 6
    s_and_b32 s9, s9, 0x7ff
    s_lshl_b32 s17, s17, 18
    s_or_b32 s9, s9, s17
    s_mov_b32 m0, s9

    v_mov_b32 v10, 1
    v_mov_b32 v11, 0
    ds_ordered_count v11, v10 gds:1 offset0:0 offset1:1
    s_waitcnt 0

    v_mov_b32 v18, v11

    v_mul_lo_u32 v16, v16, v18 // waves offset before.
    v_add_co_u32 v13, vcc, v13, v14
    v_add_co_u32 v13, vcc, v13, v15
    v_add_co_u32 v13, vcc, v13, v16
    v_add_co_u32 v13, vcc, v13, v9

    v_lshlrev_b32 v13,2,v13    // dword index -> byte address
    s_mov_b32 m0, 0x4000
    s_nop 0
    ds_write_b32 v13, v0 gds:1
    s_waitcnt 0

    s_endpgm
end
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
// LDS write/read loop: each thread loads one dword from the buffer, writes it
// to LDS s8 times (address stepping by 0x10), reads LDS back s8 times, then
// writes the marker 0xa5a50000 at offset 0x40 from the address in s[0:1].
shader main
    type(CS)

    user_sgpr_count(9)      // s[0:1] buffer resource address + s2..s8 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset
    // s8 loop

    tgid_x_en(1)            // s_tgid_x s9
    tgid_y_en(1)            // s_tgid_y s10
    tgid_z_en(1)            // s_tgid_z s11

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // read mem data
    s_mov_b32 s31, 0x0
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
    s_waitcnt 0

    // store and load s8 times
    s_mov_b32 s30, s8               // keep the loop count for the read phase
    v_lshlrev_b32 v10, 2, v3        // LDS address = thread id in group * 4
    s_mov_b32 m0, 0xFFFF            // presumably the DS address limit - TODO confirm
    s_nop 0x1
    s_nop 0x1
    s_nop 0x1

STORE_LOOP:
    ds_write_b32 v10, v0
    s_waitcnt 0
    v_add_u16 v10, v10, 0x10
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 STORE_LOOP

    s_mov_b32 s8, s30               // reload the loop count
    v_lshlrev_b32 v10, 2, v3        // rewind to the first written address

LOAD_LOOP:
    ds_read_b32 v11, v10
    s_waitcnt 0
    v_mov_b32 v12, v11
    v_add_u16 v10, v10, 0x10
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 LOAD_LOOP

    // completion marker written next to the resource descriptor
    s_mov_b32 s16, 0xa5a50000
    s_store_dword s16, s[0:1], 0x40 glc

    s_endpgm
end
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
// LDS read-out: each thread reads one dword from LDS at (thread id in group)*4
// and stores it to the buffer at scalar offset s7.
shader main
    type(CS)

    user_sgpr_count(8)      // s[0:1] buffer resource address + s2..s7 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset

    tgid_x_en(1)            // s_tgid_x s8
    tgid_y_en(1)            // s_tgid_y s9
    tgid_z_en(1)            // s_tgid_z s10

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // read it from LDS
    v_lshlrev_b32 v10, 2, v3        // LDS address = thread id in group * 4
    s_mov_b32 m0, 0xFFFF            // presumably the DS address limit - TODO confirm
    s_nop 1
    s_nop 1
    s_nop 1
    ds_read_b32 v0, v10
    s_waitcnt 0

    // write the data to memory
    buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
    s_waitcnt 0

    s_endpgm
end
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
// Memory copy: each thread loads one dword from the buffer at scalar offset 0
// and stores it back to the same buffer at scalar offset s7.
shader main
    type(CS)

    user_sgpr_count(8)      // s[0:1] buffer resource address + s2..s7 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset

    tgid_x_en(1)            // s_tgid_x s8
    tgid_y_en(1)            // s_tgid_y s9
    tgid_z_en(1)            // s_tgid_z s10

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // read mem data
    s_mov_b32 s31, 0x0
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
    s_waitcnt 0

    // export poisoned data to L2
    buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
    s_waitcnt 0

    s_endpgm
end
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
// VGPR copy chain: each thread loads one dword from the buffer, copies it
// through v10..v19 and then v29..v20, and stores the loaded value back to the
// buffer at scalar offset s7.
shader main
    type(CS)

    user_sgpr_count(8)      // s[0:1] buffer resource address + s2..s7 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset

    tgid_x_en(1)            // s_tgid_x s8
    tgid_y_en(1)            // s_tgid_y s9
    tgid_z_en(1)            // s_tgid_z s10

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // read mem data
    s_mov_b32 s31, 0x0
    // For vega20, we need to set bit 12 low. This bit will just be set low here in the shader.
    //s_mov_b32 s24, 0x15c000
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
    s_waitcnt 0

    // store it 10 times (into VGPRs v10..v19)
    v_mov_b32 v10, v0
    v_mov_b32 v11, v0
    v_mov_b32 v12, v0
    v_mov_b32 v13, v0
    v_mov_b32 v14, v0
    v_mov_b32 v15, v0
    v_mov_b32 v16, v0
    v_mov_b32 v17, v0
    v_mov_b32 v18, v0
    v_mov_b32 v19, v0

    // read them back (into v29..v20; the copies are not used afterwards)
    v_mov_b32 v29, v10
    v_mov_b32 v28, v11
    v_mov_b32 v27, v12
    v_mov_b32 v26, v13
    v_mov_b32 v25, v14
    v_mov_b32 v24, v15
    v_mov_b32 v23, v16
    v_mov_b32 v22, v17
    v_mov_b32 v21, v18
    v_mov_b32 v20, v19

    // export poisoned data to L2
    buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
    s_waitcnt 0

    s_endpgm
end
|
||||
|
||||
@@ -1,51 +0,0 @@
|
||||
// Memory copy: each thread loads one dword from the buffer at scalar offset 0
// and stores it back to the same buffer at scalar offset s7.
shader main
    type(CS)

    user_sgpr_count(8)      // s[0:1] buffer resource address + s2..s7 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset

    tgid_x_en(1)            // s_tgid_x s8
    tgid_y_en(1)            // s_tgid_y s9
    tgid_z_en(1)            // s_tgid_z s10

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // read mem data
    s_mov_b32 s31, 0x0
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
    s_waitcnt 0

    // export poisoned data to L2
    buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
    s_waitcnt 0

    s_endpgm
end
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
// SGPR copy: copies s0..s9 into s30..s39 and back again, then stores the
// original value of s2 (saved in s16) to the address in s[0:1].
shader main
    type(CS)

    user_sgpr_count(9)      // s[0:1] output address + s2..s8 parameters

    // s[0:1] the memory address the result is stored to
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset
    // s8 loop

    tgid_x_en(1)            // s_tgid_x s9
    tgid_y_en(1)            // s_tgid_y s10
    tgid_z_en(1)            // s_tgid_z s11

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // SPI may touch s0...sn before shader is run

    s_mov_b32 s16, s2       // value that is stored at the end

    // write data
    s_mov_b32 s30, s0
    s_mov_b32 s31, s1
    s_mov_b32 s32, s2
    s_mov_b32 s33, s3
    s_mov_b32 s34, s4
    s_mov_b32 s35, s5
    s_mov_b32 s36, s6
    s_mov_b32 s37, s7
    s_mov_b32 s38, s8
    s_mov_b32 s39, s9

    // read back
    s_mov_b32 s0, s30
    s_mov_b32 s1, s31
    s_mov_b32 s2, s32
    s_mov_b32 s3, s33
    s_mov_b32 s4, s34
    s_mov_b32 s5, s35
    s_mov_b32 s6, s36
    s_mov_b32 s7, s37
    s_mov_b32 s8, s38
    s_mov_b32 s9, s39

    // the store address s[0:1] has itself been round-tripped through s30/s31
    s_store_dword s16, s[0:1], 0x0 glc

    s_endpgm
end
|
||||
|
||||
@@ -1,75 +0,0 @@
|
||||
// Scalar store loop: loads two resource descriptors from s[0:1], then issues
// s8 scalar dword stores through s[20:23], advancing the offset in m0 by 4 KB
// per iteration. A matching scalar load loop is compiled only when
// DEBUG_FUNCTION is non-zero.
shader main
    type(CS)

    user_sgpr_count(9)      // s[0:1] buffer resource address + s2..s8 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset
    // s8 loop

    tgid_x_en(1)            // s_tgid_x s9
    tgid_y_en(1)            // s_tgid_y s10
    tgid_z_en(1)            // s_tgid_z s11

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // NOTE(review): MTYPE_UC is OR-ed into s27 (descriptor s[24:27]), but every
    // memory access below goes through s[20:23] - confirm whether s23 was meant.
    var MTYPE_UC = 0x38000000
    s_or_b32 s27, s27, MTYPE_UC

    s_mov_b32 s30, s8       // keep the loop count for the optional load phase
    s_mov_b32 m0, 0x0       // m0 carries the running byte offset

STORE_LOOP:
    s_buffer_store_dword s8, s[20:23], m0 glc:1
    s_waitcnt 0
    s_add_u32 m0, m0, 4*1024 // step one 4KB page table address
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 STORE_LOOP

    var DEBUG_FUNCTION = 0
    // Remove function check code to half shader run time...
    if DEBUG_FUNCTION
        s_mov_b32 s8, s30
        s_mov_b32 m0, 0x0

LOAD_LOOP:
        s_buffer_load_dword s0, s[20:23], m0 glc:1
        s_waitcnt 0
        s_add_u32 m0, m0, 4*1024
        s_sub_u32 s8, s8, 1
        s_cmpk_eq_u32 s8, 0
        s_cbranch_scc0 LOAD_LOOP
    end

    s_endpgm
end
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
// Scalar store/load bank loop: for s8 iterations, stores s8 with a scalar
// buffer store and reads it back with a scalar buffer load at offsets 0x0 and
// 0x40 of the buffer, then writes back the scalar data cache.
shader main
    type(CS)

    user_sgpr_count(9)      // s[0:1] buffer resource address + s2..s8 parameters

    // s[0:1] the memory address for the buffer resource
    // s2 x
    // s3 x*y
    // s4 x*y*z
    // s5 X
    // s6 X*Y
    // s7 output offset
    // s8 loop

    tgid_x_en(1)            // s_tgid_x s9
    tgid_y_en(1)            // s_tgid_y s10
    tgid_z_en(1)            // s_tgid_z s11

    // v0 for tid_x
    // v1 for tid_y
    // v2 for tid_z

    // fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    // s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    // v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    /*
    s_bfe_u32 s33, s8, 0x20004 // extract bank select bits
    s_lshl_b32 s33, s33, 6 // ((bank_sel & 0x3) << 6) , bank_sel = address[9:8] ^ address[7:6], if 4 bank enabled
    s_and_b32 s8, s8, 0xf
    */

    // store and load s8 times (s30 and s31 are not read again below)
    s_mov_b32 s30, s8
    s_mov_b32 s31, 0x0

    s_or_b32 s26, s26, 0x1000 //hack the buffer size to enough

STORE_LOOP:

    // Despite the name, only BANKA and BANKB are active; the BANKC and BANKD
    // accesses are commented out below.
    var TOUCH_4_BANKS=1
    if TOUCH_4_BANKS
        s_mov_b32 m0, 0x0 // BANKA
        s_buffer_store_dword s8, s[24:27], m0 glc:1
        s_waitcnt 0
        s_buffer_load_dword s32, s[24:27], m0 glc:1
        s_waitcnt 0

        s_mov_b32 m0, 0x40 // BANKB
        s_buffer_store_dword s8, s[24:27], m0 glc:1
        s_waitcnt 0
        s_buffer_load_dword s32, s[24:27], m0 glc:1
        s_waitcnt 0

        /*
        s_mov_b32 m0, 0x80 // BANKC
        s_buffer_store_dword s8, s[24:27], m0 glc:1
        s_waitcnt 0
        s_buffer_load_dword s32, s[24:27], m0 glc:1
        s_waitcnt 0

        s_mov_b32 m0, 0xC0 // BANKD
        s_buffer_store_dword s8, s[24:27], m0 glc:1
        s_waitcnt 0
        s_buffer_load_dword s32, s[24:27], m0 glc:1
        s_waitcnt 0
        */

    end

    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 STORE_LOOP

    s_dcache_wb // to make emu, sim img match...

    s_endpgm
end
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
// Compute shader: scalar store/load loop. Each of the s8 iterations stores s8 to buffer
// offsets 0x0 and 0x40 (tagged BANKA/BANKB below) with glc:0 and loads each back into s32.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // s[0:1] resource address + s2..s8 listed below (2 + 7 user SGPRs)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
// (v9 is not read again in this shader)
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
/*
|
||||
s_bfe_u32 s33, s8, 0x20004 // extract bank select bits
|
||||
s_lshl_b32 s33, s33, 6 // ((bank_sel & 0x3) << 6) , bank_sel = address[9:8] ^ address[7:6], if 4 bank enabled
|
||||
s_and_b32 s8, s8, 0xf
|
||||
*/
|
||||
|
||||
//store and load s8 times
// (s30 and s31 are written here but not read again in this shader)
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
|
||||
s_or_b32 s26, s26, 0x1000 //hack: OR 0x1000 into s26 (third dword of the loaded resource) so the buffer size is large enough
|
||||
|
||||
STORE_LOOP:
|
||||
|
||||
// Only the BANKA/BANKB accesses are live; the BANKC/BANKD accesses below are commented out.
var TOUCH_4_BANKS=1
|
||||
if TOUCH_4_BANKS
|
||||
s_mov_b32 m0, 0x0 // BANKA
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
|
||||
|
||||
s_mov_b32 m0, 0x40 // BANKB
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
|
||||
/*
|
||||
s_mov_b32 m0, 0x80 // BANKC
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
|
||||
s_mov_b32 m0, 0xC0 // BANKD
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
*/
|
||||
|
||||
end
|
||||
|
||||
// Count s8 down and repeat until it reaches 0.
// NOTE(review): s8 == 0 on entry would wrap in s_sub_u32 and keep looping;
// presumably the host always passes a non-zero loop count - confirm.
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
s_dcache_wb // to make emu, sim img match...
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,112 +0,0 @@
|
||||
// Compute shader: per iteration, for the offset pairs (s13, s12) and (s15, s14), issue a
// dwordx2 store with glc:1 at the first offset, a dwordx2 store with glc:0 at the second,
// then a dword load with glc:0 from the first offset. All four offsets advance by 0x80 per
// iteration, s8 iterations in total. Finishes by storing 0xa5a50000 at s[0:1] + 0x40.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // s[0:1] resource address + s2..s8 listed below (2 + 7 user SGPRs)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
s_or_b32 s27, s27, 0x8000000 // changing mtype to non volatile
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
// (v9 is not read again in this shader)
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
|
||||
//store and load s8 times
// (s30 and s31 are written here but not read again in this shader)
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
// s9..s11 held the thread-group ids consumed above; from here on they are store data.
s_mov_b32 s9, 0xaa
|
||||
s_mov_b32 s10, 0xbb
|
||||
s_mov_b32 s11, 0xcc
|
||||
|
||||
// BUFFER STORE OFFSETS FOR BANK A AND BANKB
|
||||
s_mov_b32 s12, 0x0
|
||||
s_mov_b32 s13, 0x10
|
||||
s_mov_b32 s14, 0x40
|
||||
s_mov_b32 s15, 0x50
|
||||
|
||||
|
||||
// The following sequence is needed to inject error in dirty bit ram. Sequence was provided by SQC designer 4/1/2015
|
||||
//1. you have an invalid line in data cache,
|
||||
//2. you write to some of the dwords in that line (the remaining dwords are still invalid),
|
||||
//3. then there is a read request that hit on that line, but it needs the dwords that are not yet there in that line
|
||||
//(in other words, it needs some of the invalid dwords of that line),
|
||||
//4. the request will go to TC,
|
||||
//5. when TC return comes back, the dirty bit ram will be read
|
||||
|
||||
STORE_LOOP:
|
||||
|
||||
var TOUCH_4_BANKS=1
|
||||
if TOUCH_4_BANKS
|
||||
|
||||
// NOTE(review): the comments below say "one dword" but every store is a dwordx2.
// s[8:9] is the live loop counter s8 plus 0xaa; s[10:11] is 0xbb, 0xcc.
s_mov_b32 m0, s13 // BANKA write one dword to tc
|
||||
s_buffer_store_dwordx2 s[8:9], s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
|
||||
s_mov_b32 m0, s12 // BANKA. write one dword to sqc
|
||||
s_buffer_store_dwordx2 s[10:11], s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
|
||||
s_mov_b32 m0, s13 // BANK A read the dword that is not in cache
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
|
||||
s_mov_b32 m0, s15 // BANKB write one dword to tc
|
||||
s_buffer_store_dwordx2 s[8:9], s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_mov_b32 m0, s14 // BANKB write one dword to sqc
|
||||
s_buffer_store_dwordx2 s[10:11], s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
|
||||
s_mov_b32 m0, s15 // BANK B read the dword that is not in cache
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
end
|
||||
|
||||
// Advance all four offsets by 0x80 for the next iteration, then count s8 down to 0.
// NOTE(review): s8 == 0 on entry would wrap in s_sub_u32 and keep looping - confirm the host never passes 0.
s_add_u32 s12, s12,0x80
|
||||
s_add_u32 s13, s13,0x80
|
||||
s_add_u32 s14, s14,0x80
|
||||
s_add_u32 s15, s15,0x80
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
s_dcache_wb // to make emu, sim img match...
|
||||
|
||||
// Store the value 0xa5a50000 at s[0:1] + 0x40 (presumably a completion marker for the host - confirm).
s_mov_b32 s16, 0xa5a50000
|
||||
s_store_dword s16, s[0:1], 0x40 glc
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,63 +0,0 @@
|
||||
// Compute shader whose body is 34 labelled blocks of 1024 instructions each
// (one s_cbranch_execnz to the next block's label followed by 1023 v_mov_b32),
// ending at label inst_page[34] with s_endpgm.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // s[0:1] resource address + s2..s8 listed below (2 + 7 user SGPRs)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
// (neither s[24:27] nor s[20:23] is used by a memory instruction later in this shader)
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
// (v9 is not read again in this shader)
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
|
||||
// OR the MTYPE_UC bits into s27, the last dword of the loaded resource.
var MTYPE_UC = 0x38000000
|
||||
s_or_b32 s27, s27, MTYPE_UC
|
||||
|
||||
label inst_page[34+1] // 34 4k pages
|
||||
|
||||
for var i =0; i < 34; i++
|
||||
inst_page[i]:
|
||||
//each block is 4KB in size: 1 branch + 1023 v_mov_b32, at 1 dword per instruction
|
||||
s_cbranch_execnz inst_page[i+1] //1 dword
|
||||
for var j = 0; j < (4*1024)/4 -1; j++
|
||||
v_mov_b32 v0, 0 // each with 1 dword
|
||||
end
|
||||
|
||||
end
|
||||
inst_page[34]:
|
||||
|
||||
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
// Compute shader built from 16 labels BLOCK_64B[0..15]. Blocks 0..14 are one s_branch to
// the next label plus 15 v_nop; after a further 15 v_nop, the final label is followed by
// 81 s_nop and s_endpgm.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // s[0:1] resource address + s2..s8 listed below (2 + 7 user SGPRs)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
// don't care about the loop count, fix 8 loops
|
||||
// Total number of cache lines equals 2 (A, B) * 8
|
||||
|
||||
var num_cache_lines = 16
|
||||
label BLOCK_64B[num_cache_lines]
|
||||
|
||||
|
||||
// Blocks 0..14: 1 branch + 15 v_nop = 16 instructions each
// (64 bytes if every instruction is one dword, as the "1DW" note implies).
for var loop = 0; loop < num_cache_lines - 1; loop++
|
||||
BLOCK_64B[loop]:
|
||||
s_branch BLOCK_64B[loop+1] // 1DW
|
||||
for var i = 0; i < 15; i++
|
||||
v_nop
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
// last block
// (these 15 v_nop sit before the final label, so the s_branch in block 14 jumps over them)
|
||||
for var i = 0; i < 15; i++
|
||||
v_nop
|
||||
end
|
||||
//For uei 2 msb and lsb flipped
|
||||
// s_nop will become v_nop and it will be a legal instruction
|
||||
BLOCK_64B[num_cache_lines-1]:
|
||||
for var i = 0; i < 81; i++
|
||||
s_nop 0x1
|
||||
end
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
/** comment, four bank interleave
|
||||
Addr 0x90000000 => Bank A
|
||||
Addr 0x90000040 => Bank B
|
||||
Addr 0x90000080 => Bank C
|
||||
Addr 0x900000c0 => Bank D
|
||||
Addr 0x90000100 => Bank B
|
||||
Addr 0x90000140 => Bank A
|
||||
Addr 0x90000180 => Bank D
|
||||
Addr 0x900001c0 => Bank C
|
||||
Addr 0x90000200 => Bank C
|
||||
Addr 0x90000240 => Bank D
|
||||
Addr 0x90000280 => Bank A
|
||||
Addr 0x900002c0 => Bank B
|
||||
Addr 0x90000300 => Bank D
|
||||
Addr 0x90000340 => Bank C
|
||||
Addr 0x90000380 => Bank B
|
||||
|
||||
**/
|
||||
@@ -1,29 +0,0 @@
|
||||
// Compute shader that emits 1000 "s_nop 0x1" instructions and then ends the program.
// None of the user SGPRs or thread-id VGPRs described below is read.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // s[0:1] resource address + s2..s8 listed below (2 + 7 user SGPRs)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
// Assembly-time loop: expands to 1000 copies of s_nop 0x1.
for var i = 0; i < 1000; i++
|
||||
s_nop 0x1
|
||||
end
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,51 +0,0 @@
|
||||
// Compute shader: each thread loads one dword at index v9 (its absolute thread id) with
// scalar offset 0 and stores it back at the same index with scalar offset s7.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // s[0:1] resource address + s2..s7 listed below (2 + 6 user SGPRs)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//read from memory
// (v0, which held tid_x, is overwritten with the loaded dword)
// NOTE(review): the resource operand is written as s24; presumably it denotes the
// descriptor loaded into s[24:27] - confirm against the assembler's syntax.
|
||||
s_mov_b32 s31, 0x0
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
//write the data to memory
// (same index v9, scalar offset s7 = output offset)
|
||||
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,73 +0,0 @@
|
||||
// Compute shader: stores 0xa5a50001 at s[0:1] + 0x40, runs s8 vector stores followed by
// s8 vector loads at index v9 (scalar offset s31 stepping by 4 from 0), then stores
// 0xa5a50000 at s[0:1] + 0x40.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // s[0:1] resource address + s2..s8 listed below (2 + 7 user SGPRs)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
// Store the value 0xa5a50001 at s[0:1] + 0x40 before doing any work
// (presumably an "in progress" marker for the host - confirm).
s_mov_b32 s16, 0xa5a50001
|
||||
s_store_dword s16, s[0:1], 0x40 glc
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//store and load s8 times
// s30 keeps the original count so that s8 can be reloaded for LOAD_LOOP.
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
// Store v0 at index v9, stepping the scalar offset s31 by 4 each pass, until s8 reaches 0.
// NOTE(review): s8 == 0 on entry would wrap in s_sub_u32 and keep looping - confirm the host never passes 0.
STORE_LOOP:
|
||||
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
// Reload the loop count and restart the scalar offset at 0 for the read-back pass.
s_mov_b32 s8, s30
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
// Load each location back into v0 and copy it to v12 (v12 is not read again in this shader).
LOAD_LOOP:
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
// Store the value 0xa5a50000 at s[0:1] + 0x40 once both loops are done
// (presumably a completion marker for the host - confirm).
s_mov_b32 s16, 0xa5a50000
|
||||
s_store_dword s16, s[0:1], 0x40 glc
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,71 +0,0 @@
|
||||
// Compute shader: clears bit 12 of s24 (first dword of the loaded resource), then runs
// s8 vector stores followed by s8 vector loads at index v9, with the scalar offset s31
// stepping by 4 from 0 in each loop.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // s[0:1] resource address + s2..s8 listed below (2 + 7 user SGPRs)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
//For vega20, we need to set bit 12 low to steer traffic to ea0
// (0xFFFFEFFF is all ones except bit 12)
|
||||
s_mov_b32 s32, 0xFFFFEFFF
|
||||
s_and_b32 s24, s24, s32
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//store and load s8 times
// s30 keeps the original count so that s8 can be reloaded for LOAD_LOOP.
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
// Store v0 at index v9, stepping the scalar offset s31 by 4 each pass, until s8 reaches 0.
// NOTE(review): s8 == 0 on entry would wrap in s_sub_u32 and keep looping - confirm the host never passes 0.
STORE_LOOP:
|
||||
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
// Reload the loop count and restart the scalar offset at 0 for the read-back pass.
s_mov_b32 s8, s30
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
// Load each location back into v0 and copy it to v12 (v12 is not read again in this shader).
LOAD_LOOP:
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,345 +0,0 @@
|
||||
// Compute shader in three phases:
//  1. every thread issues 40 buffer_store_dwordx4 at VGPR offset v9 (absolute thread id * 0x10),
//     with the scalar offset s31 stepping by 0x6000 from 0x6000;
//  2. s_barrier;
//  3. thread groups with s9 < 2 run ATOMIC_LOOP (40 buffer_atomic_add_x2 per pass, scalar offset
//     s32 stepping by 0x2000 from 0x6000); all others run LOAD_LOOP (40 buffer_load_dwordx4 per
//     pass, scalar offset s31 stepping by 0x4000 from 0xF0000). Either loop runs s8 passes.
// Bit 12 of s24 is cleared when s9 is odd and set when s9 is even (EA0 / EA1 per the comments below).
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // s[0:1] resource address + s2..s8 listed below (2 + 7 user SGPRs)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
//set bit 12 low to select EA0
|
||||
s_mov_b32 s32, 0xFFFFEFFF
|
||||
s_and_b32 s24, s24, s32
|
||||
|
||||
// Skip the EA1 selection when bit 0 of s9 (s_tgid_x per the header above) is set.
// NOTE(review): the label says "waves" but the test is on the thread-group x id - confirm intent.
s_and_b32 s31, s9, 0x1
|
||||
s_cmpk_eq_i32 s31, 0x1
|
||||
s_cbranch_scc1 ODD_WAVES
|
||||
|
||||
//set bit 12 high to select EA1
|
||||
s_mov_b32 s32, 0x1000
|
||||
s_or_b32 s24, s24, s32
|
||||
|
||||
ODD_WAVES:
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
// Scale both ids by 0x10 so they can be used as VGPR offsets (offen:1) below.
v_mul_i32_i24 v3, v3, 0x10
|
||||
v_mul_i32_i24 v9, v9, 0x10
|
||||
|
||||
// Phase 1: 40 stores at VGPR offset v9; scalar offset s31 = 0x6000, 0xC000, ... (+0x6000 per store).
s_mov_b32 s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
|
||||
// Phase 2: barrier between the store phase and the load/atomic phase.
s_barrier
|
||||
|
||||
// Starting scalar offsets for phase 3: s31 for LOAD_LOOP, s32 for ATOMIC_LOOP.
s_mov_b32 s31, 0xF0000
|
||||
s_mov_b32 s32, 0x6000
|
||||
|
||||
//store and load s8 times
// (s30 is the pass counter for whichever loop below is taken)
|
||||
s_mov_b32 s30, s8
|
||||
|
||||
// Thread groups with s9 < 2 go to ATOMIC_LOOP; the rest fall through into LOAD_LOOP.
s_cmpk_lt_i32 s9, 0x2
|
||||
s_cbranch_scc1 ATOMIC_LOOP
|
||||
|
||||
// Phase 3a: 40 loads per pass at VGPR offset v3. s31 is not reset between passes,
// so the scalar offset keeps growing by 0x4000 per load across all s30 passes.
// NOTE(review): s30 == 0 on entry would wrap in s_sub_u32 and keep looping - confirm the host never passes 0.
LOAD_LOOP:
|
||||
|
||||
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
|
||||
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
|
||||
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
|
||||
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
|
||||
s_sub_u32 s30, s30, 1
|
||||
s_cmpk_eq_u32 s30, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
|
||||
// Groups that ran LOAD_LOOP (s9 >= 2) skip the atomic phase.
s_cmpk_ge_i32 s9, 0x2
|
||||
s_cbranch_scc1 END
|
||||
|
||||
// Phase 3b: 40 atomic adds per pass at VGPR offset v3. s32 is not reset between passes,
// so the scalar offset keeps growing by 0x2000 per atomic across all s30 passes.
// NOTE(review): s30 == 0 on entry would wrap in s_sub_u32 and keep looping - confirm the host never passes 0.
ATOMIC_LOOP:
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
s_sub_u32 s30, s30, 1
|
||||
s_cmpk_eq_u32 s30, 0
|
||||
s_cbranch_scc0 ATOMIC_LOOP
|
||||
|
||||
//s_waitcnt 0
|
||||
|
||||
// Both paths meet here; wait for all outstanding memory operations before ending.
END:
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,509 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters
|
||||
|
||||
//s[0:1] the mmeory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//vo for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
//set bit 12 low to select EA0
|
||||
s_mov_b32 s32, 0xFFFFEFFF
|
||||
s_and_b32 s24, s24, s32
|
||||
|
||||
s_and_b32 s31, s9, 0x1
|
||||
s_cmpk_eq_i32 s31, 0x1
|
||||
s_cbranch_scc1 ODD_WAVES
|
||||
|
||||
//set bit 12 high to select EA1
|
||||
s_mov_b32 s32, 0x1000
|
||||
s_or_b32 s24, s24, s32
|
||||
|
||||
ODD_WAVES:
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
v_mul_i32_i24 v3, v3, 0x10
|
||||
v_mul_i32_i24 v9, v9, 0x10
|
||||
|
||||
s_mov_b32 s31, 0x9000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
|
||||
s_barrier
|
||||
|
||||
s_mov_b32 s31, 0xF0000
|
||||
s_mov_b32 s32, 0x9000
|
||||
|
||||
//run the atomic-add loop s8 times (s30 is the loop counter)
|
||||
s_mov_b32 s30, s8
|
||||
|
||||
s_cmpk_lt_i32 s9, 0x2
|
||||
s_cbranch_scc1 ATOMIC_LOOP
|
||||
|
||||
s_mov_b32 s20, 0x1
|
||||
|
||||
LOAD_LOOP:
|
||||
|
||||
s_atomic_add s20, s0, 0x100000
|
||||
s_atomic_add s20, s0, 0x100010
|
||||
s_atomic_add s20, s0, 0x100020
|
||||
s_atomic_add s20, s0, 0x100030
|
||||
s_atomic_add s20, s0, 0x100040
|
||||
s_atomic_add s20, s0, 0x100050
|
||||
s_atomic_add s20, s0, 0x100060
|
||||
s_atomic_add s20, s0, 0x100070
|
||||
s_atomic_add s20, s0, 0x100080
|
||||
s_atomic_add s20, s0, 0x100090
|
||||
|
||||
s_atomic_add s20, s0, 0x100100
|
||||
s_atomic_add s20, s0, 0x100110
|
||||
s_atomic_add s20, s0, 0x100120
|
||||
s_atomic_add s20, s0, 0x100130
|
||||
s_atomic_add s20, s0, 0x100140
|
||||
s_atomic_add s20, s0, 0x100150
|
||||
s_atomic_add s20, s0, 0x100160
|
||||
s_atomic_add s20, s0, 0x100170
|
||||
s_atomic_add s20, s0, 0x100180
|
||||
s_atomic_add s20, s0, 0x100190
|
||||
|
||||
s_atomic_add s20, s0, 0x100200
|
||||
s_atomic_add s20, s0, 0x100210
|
||||
s_atomic_add s20, s0, 0x100220
|
||||
s_atomic_add s20, s0, 0x100230
|
||||
s_atomic_add s20, s0, 0x100240
|
||||
s_atomic_add s20, s0, 0x100250
|
||||
s_atomic_add s20, s0, 0x100260
|
||||
s_atomic_add s20, s0, 0x100270
|
||||
s_atomic_add s20, s0, 0x100280
|
||||
s_atomic_add s20, s0, 0x100290
|
||||
|
||||
s_atomic_add s20, s0, 0x100300
|
||||
s_atomic_add s20, s0, 0x100310
|
||||
s_atomic_add s20, s0, 0x100320
|
||||
s_atomic_add s20, s0, 0x100330
|
||||
s_atomic_add s20, s0, 0x100340
|
||||
s_atomic_add s20, s0, 0x100350
|
||||
s_atomic_add s20, s0, 0x100360
|
||||
s_atomic_add s20, s0, 0x100370
|
||||
s_atomic_add s20, s0, 0x100380
|
||||
s_atomic_add s20, s0, 0x100390
|
||||
|
||||
s_atomic_add s20, s0, 0x100400
|
||||
s_atomic_add s20, s0, 0x100404
|
||||
s_atomic_add s20, s0, 0x100408
|
||||
s_atomic_add s20, s0, 0x10040c
|
||||
s_atomic_add s20, s0, 0x100410
|
||||
s_atomic_add s20, s0, 0x100414
|
||||
s_atomic_add s20, s0, 0x100418
|
||||
s_atomic_add s20, s0, 0x10041c
|
||||
s_atomic_add s20, s0, 0x100420
|
||||
s_atomic_add s20, s0, 0x100424
|
||||
s_atomic_add s20, s0, 0x100428
|
||||
s_atomic_add s20, s0, 0x10042c
|
||||
|
||||
s_atomic_add s20, s0, 0x100500
|
||||
s_atomic_add s20, s0, 0x100504
|
||||
s_atomic_add s20, s0, 0x100508
|
||||
s_atomic_add s20, s0, 0x10050c
|
||||
s_atomic_add s20, s0, 0x100510
|
||||
s_atomic_add s20, s0, 0x100514
|
||||
s_atomic_add s20, s0, 0x100518
|
||||
s_atomic_add s20, s0, 0x10051c
|
||||
s_atomic_add s20, s0, 0x100520
|
||||
s_atomic_add s20, s0, 0x100524
|
||||
s_atomic_add s20, s0, 0x100528
|
||||
s_atomic_add s20, s0, 0x10052c
|
||||
|
||||
s_atomic_add s20, s0, 0x100600
|
||||
s_atomic_add s20, s0, 0x100604
|
||||
s_atomic_add s20, s0, 0x100608
|
||||
s_atomic_add s20, s0, 0x10060c
|
||||
s_atomic_add s20, s0, 0x100610
|
||||
s_atomic_add s20, s0, 0x100614
|
||||
s_atomic_add s20, s0, 0x100618
|
||||
s_atomic_add s20, s0, 0x10061c
|
||||
s_atomic_add s20, s0, 0x100620
|
||||
s_atomic_add s20, s0, 0x100624
|
||||
s_atomic_add s20, s0, 0x100628
|
||||
s_atomic_add s20, s0, 0x10062c
|
||||
|
||||
s_atomic_add s20, s0, 0x100700
|
||||
s_atomic_add s20, s0, 0x100704
|
||||
s_atomic_add s20, s0, 0x100708
|
||||
s_atomic_add s20, s0, 0x10070c
|
||||
s_atomic_add s20, s0, 0x100710
|
||||
s_atomic_add s20, s0, 0x100714
|
||||
s_atomic_add s20, s0, 0x100718
|
||||
s_atomic_add s20, s0, 0x10071c
|
||||
s_atomic_add s20, s0, 0x100720
|
||||
s_atomic_add s20, s0, 0x100724
|
||||
s_atomic_add s20, s0, 0x100728
|
||||
s_atomic_add s20, s0, 0x10072c
|
||||
|
||||
s_atomic_add s20, s0, 0x100800
|
||||
s_atomic_add s20, s0, 0x100804
|
||||
s_atomic_add s20, s0, 0x100808
|
||||
s_atomic_add s20, s0, 0x10080c
|
||||
s_atomic_add s20, s0, 0x100810
|
||||
s_atomic_add s20, s0, 0x100814
|
||||
s_atomic_add s20, s0, 0x100818
|
||||
s_atomic_add s20, s0, 0x10081c
|
||||
s_atomic_add s20, s0, 0x100820
|
||||
s_atomic_add s20, s0, 0x100824
|
||||
s_atomic_add s20, s0, 0x100828
|
||||
s_atomic_add s20, s0, 0x10082c
|
||||
|
||||
s_atomic_add s20, s0, 0x100900
|
||||
s_atomic_add s20, s0, 0x100904
|
||||
s_atomic_add s20, s0, 0x100908
|
||||
s_atomic_add s20, s0, 0x10090c
|
||||
s_atomic_add s20, s0, 0x100910
|
||||
s_atomic_add s20, s0, 0x100914
|
||||
s_atomic_add s20, s0, 0x100918
|
||||
s_atomic_add s20, s0, 0x10091c
|
||||
s_atomic_add s20, s0, 0x100920
|
||||
s_atomic_add s20, s0, 0x100924
|
||||
s_atomic_add s20, s0, 0x100928
|
||||
s_atomic_add s20, s0, 0x10092c
|
||||
|
||||
s_atomic_add s20, s0, 0x100a00
|
||||
s_atomic_add s20, s0, 0x100a04
|
||||
s_atomic_add s20, s0, 0x100a08
|
||||
s_atomic_add s20, s0, 0x100a0c
|
||||
s_atomic_add s20, s0, 0x100a10
|
||||
s_atomic_add s20, s0, 0x100a14
|
||||
s_atomic_add s20, s0, 0x100a18
|
||||
s_atomic_add s20, s0, 0x100a1c
|
||||
s_atomic_add s20, s0, 0x100a20
|
||||
s_atomic_add s20, s0, 0x100a24
|
||||
s_atomic_add s20, s0, 0x100a28
|
||||
s_atomic_add s20, s0, 0x100a2c
|
||||
|
||||
s_atomic_add s20, s0, 0x100b00
|
||||
s_atomic_add s20, s0, 0x100b04
|
||||
s_atomic_add s20, s0, 0x100b08
|
||||
s_atomic_add s20, s0, 0x100b0c
|
||||
s_atomic_add s20, s0, 0x100b10
|
||||
s_atomic_add s20, s0, 0x100b14
|
||||
s_atomic_add s20, s0, 0x100b18
|
||||
s_atomic_add s20, s0, 0x100b1c
|
||||
s_atomic_add s20, s0, 0x100b20
|
||||
s_atomic_add s20, s0, 0x100b24
|
||||
s_atomic_add s20, s0, 0x100b28
|
||||
s_atomic_add s20, s0, 0x100b2c
|
||||
|
||||
s_atomic_add s20, s0, 0x100c00
|
||||
s_atomic_add s20, s0, 0x100c04
|
||||
s_atomic_add s20, s0, 0x100c08
|
||||
s_atomic_add s20, s0, 0x100c0c
|
||||
s_atomic_add s20, s0, 0x100c10
|
||||
s_atomic_add s20, s0, 0x100c14
|
||||
s_atomic_add s20, s0, 0x100c18
|
||||
s_atomic_add s20, s0, 0x100c1c
|
||||
s_atomic_add s20, s0, 0x100c20
|
||||
s_atomic_add s20, s0, 0x100c24
|
||||
s_atomic_add s20, s0, 0x100c28
|
||||
s_atomic_add s20, s0, 0x100c2c
|
||||
|
||||
|
||||
s_sub_u32 s30, s30, 1
|
||||
s_cmpk_eq_u32 s30, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
|
||||
s_cmpk_ge_i32 s9, 0x2
|
||||
s_cbranch_scc1 END
|
||||
|
||||
ATOMIC_LOOP:
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
s_sub_u32 s30, s30, 1
|
||||
s_cmpk_eq_u32 s30, 0
|
||||
s_cbranch_scc0 ATOMIC_LOOP
|
||||
|
||||
//s_waitcnt 0
|
||||
|
||||
END:
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -1,80 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2-s8
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
var MTYPE_UC = 0x38000000
|
||||
s_or_b32 s27, s27, MTYPE_UC
|
||||
|
||||
|
||||
//store and load s8 times
|
||||
s_mov_b32 s8, 33 // store 33 times to overflow atcl1 cache...
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
STORE_LOOP:
|
||||
v_add_co_u32 v0, vcc[0:1], v0, 2
|
||||
buffer_store_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 4*1024 // step one 4KB page size
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
|
||||
|
||||
var DEBUG_FUNCTION = 0
|
||||
//remove code to halve shader run time
|
||||
if DEBUG_FUNCTION
|
||||
s_mov_b32 s8, s30
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
LOAD_LOOP:
|
||||
buffer_load_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 4*1024
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
end
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2-s8
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
var MTYPE_UC = 0x38000000
|
||||
s_or_b32 s27, s27, MTYPE_UC
|
||||
|
||||
|
||||
//store and load s8 times
|
||||
s_mov_b32 s8, 33 // store 33 times to overflow atcl1 cache...
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
STORE_LOOP:
|
||||
v_add_co_u32 v0, vcc[0:1], v0, 2
|
||||
buffer_store_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 4*1024 // step one 4KB page size
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
|
||||
|
||||
var DEBUG_FUNCTION = 1
|
||||
//remove code to halve shader run time
|
||||
if DEBUG_FUNCTION
|
||||
s_mov_b32 s8, 0x20
|
||||
s_mov_b32 s31, 0xffc
|
||||
|
||||
LOAD_LOOP:
|
||||
buffer_load_dwordx2 v[0:1], v9, s20, s31 idxen:1 glc:1 slc:1
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 4*1024
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
end
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2-s8
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
//bump up the addresses being accessed to generate multiple reads to the pde memories
|
||||
v_mul_u32_u24 v9, 65536, v9
|
||||
//store and load s8 times
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
//Hack number of records to avoid range checking which we don't want since we want to generate
|
||||
//out of range accesses. we are really trying to generate many reads to the PDEs to get FUE.
|
||||
s_mov_b32 s26, 0xffffffff
|
||||
|
||||
STORE_LOOP:
|
||||
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
s_mov_b32 s8, s30
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
LOAD_LOOP:
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2-s8
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
//bump up the addresses being accessed to generate multiple reads to the pde memories
|
||||
v_mul_u32_u24 v9, 4096, v9
|
||||
//store and load s8 times
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
//Hack number of records to avoid range checking which we don't want since we want to generate
|
||||
//out of range accesses. we are really trying to generate many reads to the PDEs to get FUE.
|
||||
s_mov_b32 s26, 0xffffffff
|
||||
|
||||
STORE_LOOP:
|
||||
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
s_mov_b32 s8, s30
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
LOAD_LOOP:
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,47 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // 2 for the buffer resource address (s[0:1]) + 6 for the parameters in s2-s7
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//read mem data
|
||||
s_mov_b32 s31, 0x0
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2-s8
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//store s8 times
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
STORE_LOOP:
|
||||
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
@@ -1,54 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2-s8
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
s_mov_b32 s16, s2
|
||||
|
||||
//SPI may touch v0,v1,v2 before shader is run
|
||||
|
||||
//store it 10 times
|
||||
v_mov_b32 v10, v1
|
||||
v_mov_b32 v11, v2
|
||||
v_mov_b32 v12, v1
|
||||
v_mov_b32 v13, v2
|
||||
v_mov_b32 v14, v1
|
||||
v_mov_b32 v15, v2
|
||||
v_mov_b32 v16, v1
|
||||
v_mov_b32 v17, v2
|
||||
v_mov_b32 v18, v1
|
||||
v_mov_b32 v19, v0
|
||||
|
||||
// read them back
|
||||
v_mov_b32 v29, v10
|
||||
v_mov_b32 v28, v11
|
||||
v_mov_b32 v27, v12
|
||||
v_mov_b32 v26, v13
|
||||
v_mov_b32 v25, v14
|
||||
v_mov_b32 v24, v15
|
||||
v_mov_b32 v23, v16
|
||||
v_mov_b32 v22, v17
|
||||
v_mov_b32 v21, v18
|
||||
v_mov_b32 v20, v19
|
||||
|
||||
s_store_dword s16, s[0:1], 0x0 glc
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
@@ -1,75 +0,0 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(2) // 2 for the buffer resource address (s[0:1])
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s2
|
||||
tgid_y_en(1) //s_tgid_y s3
|
||||
tgid_z_en(1) //s_tgid_z s4
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
for var vgpr = 0; vgpr < 256; ++vgpr
|
||||
v_accvgpr_read v[vgpr], acc[vgpr]
|
||||
end
|
||||
|
||||
for var vgpr = 0; vgpr < 256; ++vgpr
|
||||
v_accvgpr_write acc[vgpr], v[vgpr]
|
||||
end
|
||||
|
||||
s_movk_i32 m0, 0x0000
|
||||
s_mov_b32 s10, 0x000000f8
|
||||
s_set_gpr_idx_on s10, 0x8
|
||||
label_0004:
|
||||
v_mov_b32 v0, 0
|
||||
v_mov_b32 v1, 0
|
||||
v_mov_b32 v2, 0
|
||||
v_mov_b32 v3, 0
|
||||
v_mov_b32 v4, 0
|
||||
v_mov_b32 v5, 0
|
||||
v_mov_b32 v6, 0
|
||||
v_mov_b32 v7, 0
|
||||
s_sub_u32 s10, s10, 8
|
||||
s_set_gpr_idx_idx s10
|
||||
s_cbranch_scc0 label_0004
|
||||
s_set_gpr_idx_off
|
||||
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
|
||||
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
|
||||
v_mul_u32_u24 v1, 8, v1
|
||||
s_getreg_b32 s11, hwreg(HW_REG_HW_ID, 4, 2)
|
||||
s_mulk_i32 s11, 0x4000
|
||||
v_add_co_u32 v1, vcc, v1, s11
|
||||
s_mov_b32 s10, 7
|
||||
s_mov_b32 m0, -1
|
||||
label_001B:
|
||||
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
|
||||
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
|
||||
v_add_co_u32 v1, vcc, 0x00000800, v1
|
||||
s_sub_u32 s10, s10, 1
|
||||
s_cbranch_scc0 label_001B
|
||||
|
||||
s_getreg_b32 s20, hwreg(HW_REG_HW_ID, 0, 32)
|
||||
// s12 = SIMD
|
||||
s_lshr_b32 s12,s20,4
|
||||
s_and_b32 s12, s12, 0x3
|
||||
// s13 = CU
|
||||
s_lshr_b32 s13,s20,8
|
||||
s_and_b32 s13, s13, 0xf
|
||||
// s14 = SE
|
||||
s_lshr_b32 s14,s20,13
|
||||
s_and_b32 s14, s14, 0x7
|
||||
// s15 = SE * 16 * 4 + CU * 4 + SIMD
|
||||
s_mul_i32 s16, s14, 64
|
||||
s_mul_i32 s17, s13, 4
|
||||
s_add_i32 s15, s16, s17
|
||||
s_add_i32 s15, s15, s12
|
||||
s_mul_i32 s16, s15, 4
|
||||
|
||||
s_store_dword s15, s[0:1], s16 glc
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
@@ -1,58 +0,0 @@
|
||||
//s[0:1]: buffer resource
|
||||
//s2: num_threads_x_full
|
||||
//s3: num_threads_x_full * num_threads_y_full
|
||||
//s4: num_threads_x_full * num_threads_y_full * num_threads_z_full
|
||||
//s5: COMPUTE_DIM_X
|
||||
//s6: COMPUTE_DIM_X * COMPUTE_DIM_Y
|
||||
//s7: loop_lifetime
|
||||
//s8: dispatch_offset
|
||||
//s[9:11]: thread group ID
|
||||
//v[0:2]: thread ID
|
||||
|
||||
shader main
|
||||
|
||||
type(CS)
|
||||
user_sgpr_count(9)
|
||||
tgid_x_en(1)
|
||||
tgid_y_en(1)
|
||||
tgid_z_en(1)
|
||||
|
||||
//sp3 loop for lifetime
|
||||
s_mov_b32 s12, 0 //init loop idx s12
|
||||
label_0004:
|
||||
s_cmp_lt_i32 s12, s7 //scc = (s12 < s7) ? 1 : 0
|
||||
s_cbranch_scc0 label_0006 //if(scc == 0) then jump to label_0006; else nop
|
||||
|
||||
v_mov_b32 v4,s12
|
||||
s_add_i32 s12, s12, 1 //add loop incr
|
||||
s_branch label_0004
|
||||
|
||||
label_0006: //end of SP3 loop
|
||||
|
||||
//v3 thread_id_in_group = (tid_z * num_threads_x_full * num_threads_y_full) + (tid_y * num_threads_x_full) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0 //v3 = tid_y * num_threads_x_full + tid_x
|
||||
v_mad_u32_u24 v3, v2, s3, v3 //v3 = tid_z * num_threads_x_ful * num_threads_y_full + v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z * COMPUTE_DIM_X * COMPUTE_DIM_Y) + (tgid_y * COMPUTE_DIM_X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5 //tgid_y * COMPUTE_DIM_X
|
||||
s_add_i32 s28, s28, s_tgid_x //tgid_y * COMPUTE_DIM_X + tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z //tgid_z * COMPUTE_DIM_X * COMPUTE_DIM_Y
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id * (num_threads_x_full * num_threads_y_full * num_threads_z_full) + thread_id_in_group
|
||||
v_mov_b32 v9, s28 //thread_group_id
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//fetch the buffer resource
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
//write absolute thread id using it as an index
|
||||
buffer_store_dword v9, v9, s24, s8 idxen:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_mov_b32 s16, 0xa5a50000
|
||||
s_store_dword s16, s[0:1], 0x40 glc
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Ссылка в новой задаче
Block a user