rocm-systems/rocclr/runtime/device/pal/palsettings.cpp

//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "top.hpp"
#include "os/os.hpp"
#include "device/device.hpp"
#include "device/pal/paldefs.hpp"
#include "device/pal/palsettings.hpp"

#include <algorithm>

#if defined(_WIN32)
#include "Windows.h"
#include "VersionHelpers.h"
#endif

namespace pal {

/*! \brief Describes an OS specific override of the maximum workload time
 *
 *  Settings::create() fills one instance while walking the ASIC list. On
 *  Windows, when \c time is non-zero, the running OS version is checked with
 *  VerifyVersionInfo() against major version 6 and \c minorVersion, using
 *  \c comparisonOps for both comparisons; on a match \c time replaces
 *  maxWorkloadTime_. A zero \c time means no override was requested.
 */
struct ModifyMaxWorkload {
  //! Replacement for maxWorkloadTime_, 0 when no adjustment is requested.
  //! NOTE(review): the unit is ambiguous - the original note said "10x ms",
  //! while Settings::create() annotates the value 250000 as "250ms"; confirm.
  uint32_t time;
  uint32_t minorVersion;  //!< OS minor version to compare against (major version is fixed to 6)
#if defined(_WIN32)
  BYTE comparisonOps;  //!< Condition passed to VER_SET_CONDITION (e.g. VER_EQUAL)
#endif
};


/*! \brief Sets every setting to its device independent default
 *
 *  Nothing in here looks at the hardware. Settings::create() adjusts the
 *  values once the PAL device properties are known.
 */
Settings::Settings() {
  // Baseline API level, debug flags and kernel library selection
  oclVersion_ = OpenCL12;
  debugFlags_ = 0;
  libSelector_ = amd::LibraryUndefined;

  // No ASIC specific feature level is assumed yet
  viPlus_ = false;
  aiPlus_ = false;
  gfx10Plus_ = false;
  // The device is treated as a discrete GPU until reported otherwise
  apuSystem_ = false;

  // Floating point capabilities
  // (double precision should become unconditional once the flag is dropped)
  doublePrecision_ = ::CL_KHR_FP64;
  // The FP_FAST_FMA defines are not reported by default
  reportFMAF_ = false;
  reportFMA_ = false;
  // Single precision denormals are not supported by default
  singleFpDenorm_ = false;

  // Memory placement and addressing
  remoteAlloc_ = REMOTE_ALLOC;
  // Staged read/write is enabled if local memory is used
  disablePersistent_ = false;
  hostMemDirectAccess_ = HostMemDisable;
  // Controls tiled images in persistent memory
  //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
  linearPersistentImage_ = false;
  // 64 bit pointers stay off by default
  use64BitPtr_ = false;
  // The largest single allocation is 16GB
  maxAllocSize_ = 16 * static_cast<uint64_t>(Gi);
  // Memory dependency tracking is off by default
  numMemDependencies_ = 0;
  // Platform atomics are not supported by default
  svmAtomics_ = false;

  // Images, LDS and cache (no cache is assumed to be present)
  imageSupport_ = false;
  imageDMA_ = GPU_IMAGE_DMA;
  hwLDSSize_ = 0;
  cacheLineSize_ = 0;
  cacheSize_ = 0;

  // Staged transfers
  stagedXferRead_ = true;
  stagedXferWrite_ = true;
  stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki;

  // The transfer buffer starts at 1MB
  xferBufSize_ = 1024 * Ki;

  // Host blit is the default engine
  blitEngine_ = BlitEngineHost;

  // Pinned transfers: the flag value is capped at 32 before scaling by Mi,
  // and the minimum size can never exceed the resulting maximum
  static const size_t PinnedXferSizeCap = 32;
  pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, PinnedXferSizeCap) * Mi;
  pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);

  // The SDMA workaround is off by default
  sdamPageFaultWar_ = false;

  // Workgroup sizes
  maxWorkGroupSize_ = 1024;
  preferredWorkGroupSize_ = 256;

  // Workload split is on by default (for 24 bit arithmetic or timeout)
  workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT;

  // Number of compute rings
  numComputeRings_ = 0;

  // Workload time limits
  minWorkloadTime_ = 1;       // 0.1 ms
  maxWorkloadTime_ = 500000;  // 500 ms

  // Scratch memory and command buffers
  useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;
  numScratchWavesPerCu_ = 16;
  maxCmdBuffers_ = 12;

  // Device enqueuing; the flag decides whether a device queue is used
  numDeviceEvents_ = 1024;
  numWaitEvents_ = 8;
  useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;

  // RGP SQTT capture defaults
  rgpSqttDispCount_ = PAL_RGP_DISP_COUNT;
  rgpSqttWaitIdle_ = true;
  rgpSqttForceDisable_ = false;

  // Sub allocation: the maximum size is limited by the chunk size
  subAllocationMinSize_ = 4 * Ki;
  subAllocationChunkSize_ = 64 * Mi;
  subAllocationMaxSize_ =
    std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);

  // Compiler selection and wavefront configuration
  useLightning_ = GPU_ENABLE_LC;
  enableWgpMode_ = false;
  enableWave32Mode_ = false;
  hsailExplicitXnack_ = false;
  lcWavefrontSize64_ = true;
}

/*! \brief Derives the device settings from the properties reported by PAL
 *
 *  The ASIC revision selects an entry point into a chain of switch cases that
 *  is ordered from the newest to the oldest supported family; every case falls
 *  through to the older ones, so a device accumulates the settings of all the
 *  families below it. Afterwards the extension list, the memory related
 *  settings and the resource cache size are filled in, and finally override()
 *  applies the user provided flags.
 *
 *  \param palProp  PAL device properties (GPU type, ASIC revision, GfxIP level
 *                  and GPU memory properties are read)
 *  \param heaps    PAL heap properties, indexed with the Pal::GpuHeap* values
 *                  (the sizes of the local, invisible and GART USWC heaps are read)
 *  \param wscaps   Workstation capabilities (only workStationBoard is read)
 *  \param reportAsOCL12Device  Report OpenCL 1.2 instead of the compiled-in
 *                  OpenCL version
 *
 *  \return True on success. False if the ASIC revision is not handled, or if
 *          the revision is Unknown and the GfxIP level is not handled either
 *          (both cases also trigger an assert).
 */
bool Settings::create(const Pal::DeviceProperties& palProp,
                      const Pal::GpuMemoryHeapProperties* heaps, const Pal::WorkStationCaps& wscaps,
                      bool reportAsOCL12Device)
{
  // Disable thread trace by default for all devices
  threadTraceEnable_ = false;
  bool doublePrecision = true;

  if (doublePrecision) {
    // Report FP_FAST_FMA define if double precision HW
    reportFMA_ = true;
    // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
    // Bonaire, Kalindi, Spectre and Spooky so disable
    // FP_FMA_FMAF for those parts in switch below
    reportFMAF_ = true;
  }

  // Update GPU specific settings and info structure if we have any
  ModifyMaxWorkload modifyMaxWorkload = {0};

  // APU systems
  if (palProp.gpuType == Pal::GpuType::Integrated) {
    apuSystem_ = true;
  }

  switch (palProp.revision) {
    case Pal::AsicRevision::Unknown:
      // The revision isn't recognized, so classify the device by its GfxIP level
      switch (palProp.gfxLevel) {
        case Pal::GfxIpLevel::GfxIp10_1:
        case Pal::GfxIpLevel::GfxIp10:
          gfx10Plus_ = true;
          // Fall through: gfx10 devices also get the gfx9 flag ...
        case Pal::GfxIpLevel::GfxIp9:
          aiPlus_ = true;
          break;
        default:
          assert(0 && "Unknown GfxIP type!");
          return false;
      }
      // Fall through to the family chain. The gfx10 setup below is guarded,
      // so a GfxIp9 device continues with the gfx9 settings only ...
    case Pal::AsicRevision::Navi12:
    case Pal::AsicRevision::Navi14:
    case Pal::AsicRevision::Navi10:
    case Pal::AsicRevision::Navi10_A0:
    case Pal::AsicRevision::Navi10Lite:
      // A named Navi revision is always gfx10. An Unknown revision arrives
      // with gfx10Plus_ already decided from the GfxIP level (it is false by
      // default, see the constructor). Without this guard the fall-through
      // would mark an unrecognized GfxIp9 device as gfx10 and switch on the
      // WGP/wave32 configuration for it.
      if (palProp.revision != Pal::AsicRevision::Unknown) {
        gfx10Plus_ = true;
      }
      if (gfx10Plus_) {
        hsailExplicitXnack_ = static_cast<uint>(palProp.gpuMemoryProperties.flags.pageMigrationEnabled
          || palProp.gpuMemoryProperties.flags.iommuv2Support);
        enableWgpMode_ = GPU_ENABLE_WGP_MODE;
        if (useLightning_) {
          enableWave32Mode_ = true;
        }
        if (palProp.revision == Pal::AsicRevision::Navi10Lite && useLightning_) {
          enableWave32Mode_ = false;
        }
        // An explicitly provided flag always wins
        if (!flagIsDefault(GPU_ENABLE_WAVE32_MODE)) {
          enableWave32Mode_ = GPU_ENABLE_WAVE32_MODE;
        }
        lcWavefrontSize64_ = !enableWave32Mode_;
      }
      // Fall through to AI (gfx9) ...
    case Pal::AsicRevision::Vega20:
    case Pal::AsicRevision::Vega12:
    case Pal::AsicRevision::Vega10:
    case Pal::AsicRevision::Raven:
    case Pal::AsicRevision::Raven2:
      aiPlus_ = true;
    // Fall through to VI ...
    case Pal::AsicRevision::Carrizo:
    case Pal::AsicRevision::Bristol:
    case Pal::AsicRevision::Stoney:
      if (!aiPlus_) {
        // Fix BSOD/TDR issues observed on Stoney Win7/8.1/10
        minWorkloadTime_ = 1000;
        modifyMaxWorkload.time = 1000;       // Decided by experiment
        modifyMaxWorkload.minorVersion = 1;  // Win 7
#if defined(_WIN32)
        modifyMaxWorkload.comparisonOps = VER_EQUAL;  // Limit to Win 7 only
#endif
      }
    // Fall through to the remaining VI parts ...
    case Pal::AsicRevision::Iceland:
    case Pal::AsicRevision::Tonga:
    case Pal::AsicRevision::Fiji:
    case Pal::AsicRevision::Polaris10:
    case Pal::AsicRevision::Polaris11:
    case Pal::AsicRevision::Polaris12:
      // Disable tiling aperture on VI+
      linearPersistentImage_ = true;
      // Keep this false even though we have support
      // singleFpDenorm_ = true;
      viPlus_ = true;
      // SDMA may have memory access outside of
      // the valid buffer range and cause a page fault
      sdamPageFaultWar_ = true;
      enableExtension(ClKhrFp16);
    // Fall through to CI ...
    case Pal::AsicRevision::Kalindi:
    case Pal::AsicRevision::Godavari:
    case Pal::AsicRevision::Spectre:
    case Pal::AsicRevision::Spooky:
      if (!viPlus_) {
        // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903)
        modifyMaxWorkload.time = 250000;     // 250ms
        modifyMaxWorkload.minorVersion = 1;  // Win 7
#if defined(_WIN32)
        modifyMaxWorkload.comparisonOps = VER_EQUAL;  // limit to Win 7
#endif
      }
    // Fall through ...
    case Pal::AsicRevision::Bonaire:
    case Pal::AsicRevision::Hawaii:
      // Settings shared by every supported ASIC
      threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE;
      // FP_FAST_FMAF is reported on Hawaii and gfx9+ only
      reportFMAF_ = false;
      if ((palProp.revision == Pal::AsicRevision::Hawaii) || aiPlus_) {
        reportFMAF_ = true;
      }
      // Cache line size is 64 bytes
      cacheLineSize_ = 64;
      // L1 cache size is 16KB
      cacheSize_ = 16 * Ki;

      libSelector_ = amd::GPU_Library_CI;
      if (LP64_SWITCH(false, true)) {
        oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
            ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
            : OpenCL12;
      }
      if (GPU_FORCE_OCL20_32BIT) {
        force32BitOcl20_ = true;
        oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
            ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
            : OpenCL12;
      }
      if (OPENCL_VERSION < 200) {
        oclVersion_ = OpenCL12;
      }
      numComputeRings_ = 8;

      // Cap at OpenCL20 for now
      if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20;

      // This needs to be cleaned once 64bit addressing is stable
      if (oclVersion_ < OpenCL20) {
        use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR)
            ? LP64_SWITCH(false,
                          /*calAttr.isWorkstation ||*/ true)
            : GPU_FORCE_64BIT_PTR;
      } else {
        if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) {
          use64BitPtr_ = true;
        }
      }

      if (oclVersion_ >= OpenCL20) {
        supportDepthsRGB_ = true;
      }
      if (use64BitPtr_) {
        if (amd::IS_HIP || (GPU_ENABLE_LARGE_ALLOCATION && wscaps.workStationBoard)) {
          maxAllocSize_ = 64ULL * Gi;
        } else {
          maxAllocSize_ = 4048 * Mi;
        }
      } else {
        maxAllocSize_ = 3ULL * Gi;
      }

      // Note: More than 4 command buffers may cause a HW hang
      // with HWSC on pre-gfx9 devices in OCLPerfKernelArguments
      if (!aiPlus_) {
        maxCmdBuffers_ = 4;
      }

      supportRA_ = false;
      numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY;
      break;
    default:
      assert(0 && "Unknown ASIC type!");
      return false;
  }

#if defined(_WIN32)
  // Apply the max workload time adjustment if the running OS version matches
  // the one requested for the ASIC (major version 6 + the requested minor version)
  if (modifyMaxWorkload.time > 0) {
    OSVERSIONINFOEX versionInfo = {0};
    versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
    versionInfo.dwMajorVersion = 6;
    versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;

    DWORDLONG conditionMask = 0;
    VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
    VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
    if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
      maxWorkloadTime_ = modifyMaxWorkload.time;
    }
  }
#endif  // defined(_WIN32)

  // Enable atomics support
  enableExtension(ClKhrInt64BaseAtomics);
  enableExtension(ClKhrInt64ExtendedAtomics);
  enableExtension(ClKhrGlobalInt32BaseAtomics);
  enableExtension(ClKhrGlobalInt32ExtendedAtomics);
  enableExtension(ClKhrLocalInt32BaseAtomics);
  enableExtension(ClKhrLocalInt32ExtendedAtomics);
  enableExtension(ClKhrByteAddressableStore);
  enableExtension(ClKhrGlSharing);
  enableExtension(ClKhrGlEvent);
  enableExtension(ClKhr3DImageWrites);
  enableExtension(ClKhrImage2dFromBuffer);
  enableExtension(ClAmdMediaOps);
  enableExtension(ClAmdMediaOps2);

  // These extensions are reported only when Lightning isn't used
  if (!useLightning_) {
    enableExtension(ClAmdPopcnt);
    enableExtension(ClAmdVec3);
    enableExtension(ClAmdPrintf);
    enableExtension(ClKhrSpir);
  }
  // Enable some platform extensions
  enableExtension(ClAmdDeviceAttributeQuery);


  // With ATI_OS_LINUX defined the extension requires bus addressable memory,
  // otherwise it is enabled unconditionally
#ifdef ATI_OS_LINUX
  if (palProp.gpuMemoryProperties.busAddressableMemSize > 0)
#endif
  {
    enableExtension(ClAMDLiquidFlash);
  }

  hwLDSSize_ = (IS_LINUX || gfx10Plus_) ? 64 * Ki : 32 * Ki;

  imageSupport_ = true;

  // Use kernels for blit if appropriate
  blitEngine_ = BlitEngineKernel;

  hostMemDirectAccess_ |= HostMemBuffer;
  // HW doesn't support untiled image writes
  // hostMemDirectAccess_ |= HostMemImage;

  // Make sure device actually supports double precision
  doublePrecision_ = (doublePrecision) ? doublePrecision_ : false;
  if (doublePrecision_) {
    // Enable KHR double precision extension
    enableExtension(ClKhrFp64);
  }

  if (!useLightning_ && doublePrecision) {
    // Enable AMD double precision extension
    doublePrecision_ = true;
    enableExtension(ClAmdFp64);
  }

  if (palProp.gpuMemoryProperties.busAddressableMemSize > 0) {
    // Enable bus addressable memory extension
    enableExtension(ClAMDBusAddressableMemory);
  }
  //! @todo
  /*
      if (calAttr.longIdleDetect) {
          // KMD is unable to detect if we map the visible memory for CPU access, so
          // accessing persistent staged buffer may fail if LongIdleDetct is enabled.
          disablePersistent_ = true;
      }
  */

  // Fine grain system SVM and SVM atomics follow the IOMMUv2 support
  svmFineGrainSystem_ = palProp.gpuMemoryProperties.flags.iommuv2Support;
  svmAtomics_ = svmFineGrainSystem_;

// SVM is not currently supported for DX Interop
#if defined(_WIN32)
  enableExtension(ClKhrD3d9Sharing);
  enableExtension(ClKhrD3d10Sharing);
  enableExtension(ClKhrD3d11Sharing);
#endif  // _WIN32

  // Enable some OpenCL 2.0 extensions
  if (oclVersion_ >= OpenCL20) {
    enableExtension(ClKhrGLDepthImages);
    enableExtension(ClKhrSubGroups);
    enableExtension(ClKhrDepthImages);

    if (GPU_MIPMAP) {
      enableExtension(ClKhrMipMapImage);
      enableExtension(ClKhrMipMapImageWrites);
    }

    // Enable HW debug
    if (GPU_ENABLE_HW_DEBUG) {
      enableHwDebug_ = true;
    }

#if defined(_WIN32)
    enableExtension(ClAmdPlanarYuv);
#endif
  }

  // Force remote allocations on APUs with less than 150MB of
  // local + invisible heap
  if (apuSystem_ &&
      ((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) < (150 * Mi))) {
    remoteAlloc_ = true;
  }

  // Update resource cache size: 1/8 of the heap in use, but not less than
  // the value of GPU_RESOURCE_CACHE_SIZE
  if (remoteAlloc_) {
    resourceCacheSize_ = std::max((heaps[Pal::GpuHeapGartUswc].heapSize / 8),
                                  (uint64_t)GPU_RESOURCE_CACHE_SIZE * Mi);
  } else {
    resourceCacheSize_ =
        std::max(((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) / 8),
                 (uint64_t)GPU_RESOURCE_CACHE_SIZE * Mi);
#if !defined(_LP64)
    resourceCacheSize_ = std::min(resourceCacheSize_, 1 * Gi);
#endif
  }

  // With Lightning single precision denormals are enabled on gfx9 and gfx10
  if (useLightning_) {
    switch (palProp.gfxLevel) {
      case Pal::GfxIpLevel::GfxIp10_1:
      case Pal::GfxIpLevel::GfxIp10:
      case Pal::GfxIpLevel::GfxIp9:
        singleFpDenorm_ = true;
        break;
      default:
        break;
    }
  }

  // Override current device settings
  override();

  return true;
}

void Settings::override() {
  // Limit reported workgroup size
  if (GPU_MAX_WORKGROUP_SIZE != 0) {
    preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
  }

  // Override blit engine type
  if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) {
    blitEngine_ = GPU_BLIT_ENGINE_TYPE;
  }

  if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
    debugFlags_ = DEBUG_GPU_FLAGS;
  }

  if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
    xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
  }

  if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) {
    numComputeRings_ = GPU_NUM_COMPUTE_RINGS;
  }

  if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) {
    resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
  }

  if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) {
    switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) {
      case 0:
        singleFpDenorm_ = false;
        break;
      case 1:
        singleFpDenorm_ = true;
        break;
      default:
        break;
    }
  }

  if (!flagIsDefault(GPU_MAX_COMMAND_BUFFERS)) {
    maxCmdBuffers_ = GPU_MAX_COMMAND_BUFFERS;
  }
}

}  // namespace pal