P4 to Git Change 1536925 by vsytchen@vsytchen-ocl-win10 on 2018/04/04 17:20:38

SWDEV-79445 - OCL generic changes and code clean-up 1. This change replaces the use of std::map with std::unordered_map to improve lookup/insert time. 2. Replace the use of std::make_pair and std::pair constructor with uniform initialization for cleaner code. 3. Replace the use of std::Container::iterator type with the auto keyword for cleaner code. 4. Use range based for loops where needed. ReviewBoardURL = http://ocltc.amd.com/reviews/r/14517/diff/ Affected files ... ... //depot/stg/opencl/drivers/opencl/api/hip/hip_platform.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_context.cpp#58 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d10.cpp#16 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d10_amd.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d11.cpp#24 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d11_amd.hpp#13 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d9.cpp#34 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d9_amd.hpp#17 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_gl.cpp#57 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_pipe.cpp#7 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_program.cpp#46 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_svm.cpp#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/appprofile.hpp#14 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpuprogram.cpp#72 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpuvirtual.cpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#216 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#297 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuappprofile.cpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpubinary.cpp#59 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpucompiler.cpp#158 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#587 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#322 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprintf.cpp#46 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#237 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.hpp#70 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#242 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#415 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#143 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcompiler.cpp#22 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#79 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.cpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#59 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#60 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#84 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#46 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/CMakeLists.txt#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/prodevice.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/prodevice.hpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocbinary.hpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccompiler.cpp#42 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccounters.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprintf.cpp#10 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#81 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#81 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#89 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/commandqueue.cpp#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/context.cpp#49 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/context.hpp#29 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/memory.cpp#129 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/memory.hpp#102 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/perfctr.hpp#7 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/program.cpp#91 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/program.hpp#43 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/sampler.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.cpp#17 edit [ROCm/clr commit: d09ca72f74]
2018-04-04 18:00:17 -04:00
@@ -4,7 +4,7 @@
 #ifndef APPPROFILE_HPP_
 #define APPPROFILE_HPP_

-#include <map>
+#include <unordered_map>
 #include <string>

 namespace amd {
@@ -34,7 +34,7 @@ class AppProfile {
    void* data_;      //!< Pointer to the data
  };

-  typedef std::map<std::string, PropertyData> DataMap;
+  typedef std::unordered_map<std::string, PropertyData> DataMap;

  DataMap propertyDataMap_;
  std::string appFileName_;  // without extension
@@ -702,9 +702,8 @@ bool Program::compileImpl(const std::string& sourceCode,
    std::string headerIncludeName(headerIncludeNames[i]);
    // replace / in path with current os's file separator
    if (amd::Os::fileSeparator() != '/') {
-      for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end();
-           it != end; ++it) {
-        if (*it == '/') *it = amd::Os::fileSeparator();
+      for (auto& it : headerIncludeName) {
+        if (it == '/') it = amd::Os::fileSeparator();
      }
    }
    size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
@@ -1028,8 +1027,8 @@ bool Program::linkImpl(const std::vector<device::Program*>& inputPrograms,
 #if defined(WITH_ONLINE_COMPILER)
  std::vector<std::string*> llvmBinaries(inputPrograms.size());
  std::vector<amd::OclElf::oclElfSections> elfSectionType(inputPrograms.size());
-  std::vector<device::Program*>::const_iterator it = inputPrograms.begin();
-  std::vector<device::Program*>::const_iterator itEnd = inputPrograms.end();
+  auto it = inputPrograms.cbegin();
+  const auto itEnd = inputPrograms.cend();
  for (size_t i = 0; it != itEnd; ++it, ++i) {
    Program* program = (Program*)*it;

@@ -430,10 +430,9 @@ void VirtualCPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd) {

  //! Go through ext objects by one and call member function to execute
  //! a sequence of external graphics API commands for each external object
-  for (std::vector<amd::Memory*>::const_iterator itr = cmd.getMemList().begin();
-       itr != cmd.getMemList().end(); itr++) {
-    if (*itr) {
-      bError |= !((*itr)->mapExtObjectInCQThread());
+  for (const auto& it : cmd.getMemList()) {
+    if (it) {
+      bError |= !(it->mapExtObjectInCQThread());
    }
  }
  if (bError) {
@@ -453,10 +452,9 @@ void VirtualCPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd) {

  bool bError = false;

-  for (std::vector<amd::Memory*>::const_iterator itr = cmd.getMemList().begin();
-       itr != cmd.getMemList().end(); itr++) {
-    if (*itr) {
-      bError |= !((*itr)->unmapExtObjectInCQThread());
+  for (const auto& it : cmd.getMemList()) {
+    if (it) {
+      bError |= !(it->unmapExtObjectInCQThread());
    }
  }
  if (bError) {
@@ -72,7 +72,7 @@ size_t SvmManager::size() {

 void SvmManager::AddSvmBuffer(const void* k, amd::Memory* v) {
  amd::ScopedLock lock(AllocatedLock_);
-  svmBufferMap_.insert(std::pair<uintptr_t, amd::Memory*>(reinterpret_cast<uintptr_t>(k), v));
+  svmBufferMap_.insert({reinterpret_cast<uintptr_t>(k), v});
 }

 void SvmManager::RemoveSvmBuffer(const void* k) {
@@ -83,7 +83,7 @@ void SvmManager::RemoveSvmBuffer(const void* k) {
 amd::Memory* SvmManager::FindSvmBuffer(const void* k) {
  amd::ScopedLock lock(AllocatedLock_);
  uintptr_t key = reinterpret_cast<uintptr_t>(k);
-  std::map<uintptr_t, amd::Memory*>::iterator it = svmBufferMap_.upper_bound(key);
+  auto it = svmBufferMap_.upper_bound(key);
  if (it == svmBufferMap_.begin()) {
    return NULL;
  }
@@ -320,8 +320,7 @@ device::Memory* Device::findMemoryFromVA(const void* ptr, size_t* offset) const
  amd::ScopedLock lk(*vaCacheAccess_);

  uintptr_t key = reinterpret_cast<uintptr_t>(ptr);
-  std::map<uintptr_t, device::Memory*>::iterator it =
-      vaCacheMap_->upper_bound(reinterpret_cast<uintptr_t>(ptr));
+  auto it = vaCacheMap_->upper_bound(reinterpret_cast<uintptr_t>(ptr));
  if (it == vaCacheMap_->begin()) {
    return nullptr;
  }
@@ -352,10 +351,10 @@ std::vector<Device*> Device::getDevices(cl_device_type type, bool offlineDevices
  }

  // Create the list of available devices
-  for (device_iterator it = devices_->begin(); it != devices_->end(); ++it) {
+  for (const auto& it : *devices_) {
    // Check if the device type is matched
-    if ((*it)->IsTypeMatching(type, offlineDevices)) {
-      result.push_back(*it);
+    if (it->IsTypeMatching(type, offlineDevices)) {
+      result.push_back(it);
    }
  }

@@ -369,9 +368,9 @@ size_t Device::numDevices(cl_device_type type, bool offlineDevices) {
    return 0;
  }

-  for (device_iterator it = devices_->begin(); it != devices_->end(); ++it) {
+  for (const auto& it : *devices_) {
    // Check if the device type is matched
-    if ((*it)->IsTypeMatching(type, offlineDevices)) {
+    if (it->IsTypeMatching(type, offlineDevices)) {
      ++result;
    }
  }
@@ -393,7 +392,7 @@ bool Device::getDeviceIDs(cl_device_type deviceType, cl_uint numEntries, cl_devi
    return false;
  }

-  std::vector<amd::Device*>::iterator it = ret.begin();
+  auto it = ret.cbegin();
  cl_uint count = std::min(numEntries, (cl_uint)ret.size());

  while (count--) {
@@ -707,7 +706,7 @@ void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin,

  // Insert into the map if it's the first region
  if (++pInfo->count_ == 1) {
-    writeMapInfo_.insert(std::pair<const void*, WriteMapInfo>(mapAddress, info));
+    writeMapInfo_.insert({mapAddress, info});
  }
 }

@@ -729,9 +728,8 @@ Program::~Program() { clear(); }

 void Program::clear() {
  // Destroy all device kernels
-  kernels_t::const_iterator it;
-  for (it = kernels_.begin(); it != kernels_.end(); ++it) {
-    delete it->second;
+  for (const auto& it : kernels_) {
+    delete it.second;
  }
  kernels_.clear();
 }
@@ -1035,8 +1033,8 @@ cl_int Program::build(const std::string& sourceCode, const char* origOptions,
 bool Program::getCompileOptionsAtLinking(const std::vector<Program*>& inputPrograms,
                                         const amd::option::Options* linkOptions) {
  amd::option::Options compileOptions;
-  std::vector<device::Program*>::const_iterator it = inputPrograms.begin();
-  std::vector<device::Program*>::const_iterator itEnd = inputPrograms.end();
+  auto it = inputPrograms.cbegin();
+  const auto itEnd = inputPrograms.cend();
  for (size_t i = 0; it != itEnd; ++it, ++i) {
    Program* program = *it;

@@ -1473,7 +1471,7 @@ bool ClBinary::createElfBinary(bool doencrypt, Program::type_t type) {
  return true;
 }

-Program::binary_t ClBinary::data() const { return std::make_pair(binary_, size_); }
+Program::binary_t ClBinary::data() const { return {binary_, size_}; }

 bool ClBinary::setBinary(const char* theBinary, size_t theBinarySize, bool allocated) {
  release();
@@ -828,9 +828,9 @@ class Memory : public amd::HeapObject {
  //! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
  //! not a physical map. When a memory object does not use USE_HOST_PTR we
  //! can use a remote resource and DMA, avoiding the additional CPU memcpy.
-  amd::Memory* mapMemory_;                            //!< Memory used as map target buffer
-  volatile size_t indirectMapCount_;                  //!< Number of maps
-  std::map<const void*, WriteMapInfo> writeMapInfo_;  //!< Saved write map info for partial unmap
+  amd::Memory* mapMemory_;                                      //!< Memory used as map target buffer
+  volatile size_t indirectMapCount_;                            //!< Number of maps
+  std::unordered_map<const void*, WriteMapInfo> writeMapInfo_;  //!< Saved write map info for partial unmap

  //! Increment map count
  void incIndMapCount() { ++indirectMapCount_; }
@@ -1017,7 +1017,7 @@ class Kernel : public amd::HeapObject {
 class Program : public amd::HeapObject {
 public:
  typedef std::pair<const void*, size_t> binary_t;
-  typedef std::map<std::string, Kernel*> kernels_t;
+  typedef std::unordered_map<std::string, Kernel*> kernels_t;
  // type of the program
  typedef enum {
    TYPE_NONE = 0,     // uncompiled
@@ -1347,14 +1347,14 @@ class ClBinary : public amd::HeapObject {

 inline const Program::binary_t Program::binary() const {
  if (clBinary() == NULL) {
-    return std::make_pair((const void*)0, 0);
+    return {(const void*)0, 0};
  }
  return clBinary()->data();
 }

 inline Program::binary_t Program::binary() {
  if (clBinary() == NULL) {
-    return std::make_pair((const void*)0, 0);
+    return {(const void*)0, 0};
  }
  return clBinary()->data();
 }
@@ -1750,7 +1750,6 @@ class Device : public RuntimeObject {
  static AppProfile* rocAppProfile_;
 #endif

-  typedef std::vector<Device*>::iterator device_iterator;
  static std::vector<Device*>* devices_;  //!< All known devices

  Device* parent_;                                    //!< This device's parent
@@ -11,10 +11,8 @@ namespace gpu {

 AppProfile::AppProfile()
    : amd::AppProfile(), enableHighPerformanceState_(true), reportAsOCL12Device_(false) {
-  propertyDataMap_.insert(DataMap::value_type(
-      "HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)));
+  propertyDataMap_.insert({"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)});

-  propertyDataMap_.insert(
-      DataMap::value_type("OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_)));
+  propertyDataMap_.insert({"OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_)});
 }
 }
@@ -73,16 +73,14 @@ bool ClBinary::loadKernels(NullProgram& program, bool* hasRecompiled) {
       functionNameMap[] maps from a function name (linkage name in the generated code)
       to ElfSymbol_t, which is defined as above.
     */
-    std::map<std::string, ElfSymbol_t*> functionNameMap;
+    std::unordered_map<std::string, ElfSymbol_t*> functionNameMap;

    // Keep all kernel ILs if -use-debugil is present (gpu debugging)
-    std::map<std::string, std::string> kernelILs;
+    std::unordered_map<std::string, std::string> kernelILs;

    ~TempWrapper() {
-      std::map<std::string, ElfSymbol_t *>::iterator I, IB = functionNameMap.begin(),
-                                                        IE = functionNameMap.end();
-      for (I = IB; I != IE; ++I) {
-        delete[](*I).second;
+      for (const auto& it : functionNameMap) {
+        delete[] it.second;
      }

      kernelILs.clear();
@@ -177,10 +175,8 @@ bool ClBinary::loadKernels(NullProgram& program, bool* hasRecompiled) {
    }

    // Append all function metadata to debugIL
-    std::map<std::string, ElfSymbol_t *>::iterator I, IB = tempObj.functionNameMap.begin(),
-                                                      IE = tempObj.functionNameMap.end();
-    for (I = IB; I != IE; ++I) {
-      ElfSymbol_t* elfsymbol = (*I).second;
+    for (const auto& it : tempObj.functionNameMap) {
+      ElfSymbol_t* elfsymbol = it.second;
      if (elfsymbol == NULL) {
        // Not valid, skip
        continue;
@@ -202,11 +198,9 @@ bool ClBinary::loadKernels(NullProgram& program, bool* hasRecompiled) {
    }

    // Now, patch the IL from debugIL into functionNameMap[]
-    std::map<std::string, std::string>::iterator KI, KIB = tempObj.kernelILs.begin(),
-                                                     KIE = tempObj.kernelILs.end();
-    for (KI = KIB; KI != KIE; ++KI) {
-      const std::string& kn = (*KI).first;
-      const std::string& ilstr = (*KI).second;
+    for (const auto& it : tempObj.kernelILs) {
+      const std::string& kn = it.first;
+      const std::string& ilstr = it.second;

      ElfSymbol_t* elfsymbol = tempObj.functionNameMap[kn];
      if (elfsymbol == NULL) {
@@ -225,10 +219,8 @@ bool ClBinary::loadKernels(NullProgram& program, bool* hasRecompiled) {

  bool recompiled = false;
  bool hasKernels = false;
-  std::map<std::string, ElfSymbol_t *>::iterator I, IB = tempObj.functionNameMap.begin(),
-                                                    IE = tempObj.functionNameMap.end();
-  for (I = IB; I != IE; ++I) {
-    ElfSymbol_t* elfsymbol = (*I).second;
+  for (const auto& it : tempObj.functionNameMap) {
+    ElfSymbol_t* elfsymbol = it.second;
    if (elfsymbol == NULL) {
      // Not valid, skip
      continue;
@@ -237,7 +229,7 @@ bool ClBinary::loadKernels(NullProgram& program, bool* hasRecompiled) {
      // and the new binary is needed.
      if (saveAMDIL() && (elfsymbol->SymInfo[NDX_METADATA].size > 0)) {
        std::string fmetadata = "__OpenCL_";
-        fmetadata.append((*I).first);
+        fmetadata.append(it.first);
        fmetadata.append("_fmetadata");

        if (!elfOut()->addSymbol(amd::OclElf::RODATA, fmetadata.c_str(),
@@ -250,7 +242,7 @@ bool ClBinary::loadKernels(NullProgram& program, bool* hasRecompiled) {
      continue;
    }
    amd::OclElf::SymbolInfo* sinfo = &(elfsymbol->SymInfo[0]);
-    std::string FName = (*I).first;
+    std::string FName = it.first;

    // For this kernel, get the demangled kernel name, which is used to identify each kernel.
    const size_t name_sz = FName.size() - (sizeof(_kernel) - 1) - (sizeof(__OpenCL_) - 1);
@@ -74,9 +74,8 @@ bool NullProgram::compileImpl(const std::string& src,
    std::string headerIncludeName(headerIncludeNames[i]);
    // replace / in path with current os's file separator
    if (amd::Os::fileSeparator() != '/') {
-      for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end();
-           it != end; ++it) {
-        if (*it == '/') *it = amd::Os::fileSeparator();
+      for (auto& it : headerIncludeName) {
+        if (it == '/') it = amd::Os::fileSeparator();
      }
    }
    size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
@@ -355,9 +354,8 @@ bool HSAILProgram::compileImpl(const std::string& sourceCode,
    std::string headerIncludeName(headerIncludeNames[i]);
    // replace / in path with current os's file separator
    if (amd::Os::fileSeparator() != '/') {
-      for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end();
-           it != end; ++it) {
-        if (*it == '/') *it = amd::Os::fileSeparator();
+      for (auto& it : headerIncludeName) {
+        if (it == '/') it = amd::Os::fileSeparator();
      }
    }
    size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
@@ -1152,7 +1152,7 @@ device::Program* Device::createProgram(amd::option::Options* options) {
 }

 //! Requested devices list as configured by the GPU_DEVICE_ORDINAL
-typedef std::map<int, bool> requestedDevices_t;
+typedef std::unordered_map<int, bool> requestedDevices_t;

 //! Parses the requested list of devices to be exposed to the user.
 static void parseRequestedDeviceList(requestedDevices_t& requestedDevices) {
@@ -1349,10 +1349,9 @@ bool Kernel::bindGlobalHwCb(VirtualGPU& gpu, VirtualGPU::GslKernelDesc* desc) co

  // Bind HW constant buffers used for the global data store
  const Program::HwConstBuffers& gds = prog().glbHwCb();
-  for (Program::HwConstBuffers::const_iterator it = gds.begin(); (it != gds.end() && result);
-       ++it) {
-    uint idx = it->first;
-    result = bindResource(gpu, *(it->second), idx, ConstantBuffer, idx);
+  for (const auto& it : gds) {
+    if (!result) break;  // Keep the old `&& result` loop guard: stop at the first failed bind
+    result = bindResource(gpu, *(it.second), it.first, ConstantBuffer, it.first);
  }

  return result;
@@ -1535,16 +1534,16 @@ void Kernel::debug(VirtualGPU& gpu) const {
    }
  }
  const Program::HwConstBuffers& gds = prog().glbHwCb();
-  for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) {
-    uint idx = it->first;
+  for (const auto& it : gds) {
+    uint idx = it.first;
    std::stringstream fileName;
    fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin";
    stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
    if (stubWrite.is_open()) {
-      address memory = reinterpret_cast<address>((it->second)->map(&gpu, Resource::ReadOnly));
+      address memory = reinterpret_cast<address>(it.second->map(&gpu, Resource::ReadOnly));
      // Check if we have OpenCL program
-      stubWrite.write(reinterpret_cast<char*>(memory), (it->second)->size());
-      (it->second)->unmap(&gpu);
+      stubWrite.write(reinterpret_cast<char*>(memory), it.second->size());
+      it.second->unmap(&gpu);
      stubWrite.close();
    }
  }
@@ -543,7 +543,7 @@ bool PrintfDbgHSA::init(VirtualGPU& gpu, bool printfEnabled) {
    // First DWORD = Offset to where next information is to
    // be written, initialized to 0
    // Second DWORD = Number of bytes available for printf data
-    // = buffer size – 2*sizeof(uint32_t)
+    // = buffer size – 2*sizeof(uint32_t)
    const uint8_t initSize = 2 * sizeof(uint32_t);
    uint8_t sysMem[initSize];
    memset(sysMem, 0, initSize);
@@ -601,8 +601,6 @@ bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled,
        return false;
      }

-
-      std::vector<uint>::const_iterator ita;
      uint sb = 0;
      uint sbt = 0;

@@ -614,8 +612,8 @@ bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled,
        }
        const PrintfInfo& info = printfInfo[(*dbgBufferPtr)];
        sb += sizeof(uint32_t);
-        for (ita = info.arguments_.begin(); ita != info.arguments_.end(); ++ita) {
-          sb += *ita;
+        for (const auto& it : info.arguments_) {
+          sb += it;
        }

        if (sbt + sb > bufSize) {
@@ -318,7 +318,7 @@ bool NullProgram::linkImpl(amd::option::Options* options) {
        std::string metadataStr;
        std::vector<ILFunc*> notCalled;
        std::vector<ILFunc*> called;
-        std::map<int, const char**> macros;
+        std::unordered_map<int, const char**> macros;
        size_t j;
        Kernel::InitData initData = {0};

@@ -464,8 +464,8 @@ bool NullProgram::linkImpl(const std::vector<device::Program*>& inputPrograms,
                           amd::option::Options* options, bool createLibrary) {
  std::vector<std::string*> llvmBinaries(inputPrograms.size());
  std::vector<amd::OclElf::oclElfSections> elfSectionType(inputPrograms.size());
-  std::vector<device::Program*>::const_iterator it = inputPrograms.begin();
-  std::vector<device::Program*>::const_iterator itEnd = inputPrograms.end();
+  auto it = inputPrograms.cbegin();
+  const auto itEnd = inputPrograms.cend();
  for (size_t i = 0; it != itEnd; ++it, ++i) {
    NullProgram* program = (NullProgram*)*it;

@@ -682,7 +682,7 @@ bool NullProgram::linkImpl(const std::vector<device::Program*>& inputPrograms,
        std::string metadataStr;
        std::vector<ILFunc*> notCalled;
        std::vector<ILFunc*> called;
-        std::map<int, const char**> macros;
+        std::unordered_map<int, const char**> macros;
        size_t j;
        Kernel::InitData initData = {0};

@@ -1433,7 +1433,7 @@ NullKernel* NullProgram::createKernel(const std::string& name, const Kernel::Ini
 }

 // Invoked from ClBinary
-bool NullProgram::getAllKernelILs(std::map<std::string, std::string>& allKernelILs,
+bool NullProgram::getAllKernelILs(std::unordered_map<std::string, std::string>& allKernelILs,
                                  std::string& programIL, const char* ilKernelName) {
  llvm::CompUnit compunit(programIL);
  if (ilKernelName != NULL) {
@@ -1471,8 +1471,8 @@ bool NullProgram::createBinary(amd::option::Options* options) {
 Program::~Program() {
  // Destroy the global HW constant buffers
  const Program::HwConstBuffers& gds = glbHwCb();
-  for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) {
-    delete it->second;
+  for (const auto& it : gds) {
+    delete it.second;
  }

  // Destroy the global data store
@@ -1634,8 +1634,8 @@ bool HSAILProgram::finiBuild(bool isBuildGood) {

 bool HSAILProgram::linkImpl(const std::vector<device::Program*>& inputPrograms,
                            amd::option::Options* options, bool createLibrary) {
-  std::vector<device::Program*>::const_iterator it = inputPrograms.begin();
-  std::vector<device::Program*>::const_iterator itEnd = inputPrograms.end();
+  auto it = inputPrograms.cbegin();
+  const auto itEnd = inputPrograms.cend();
  acl_error errorCode;

  // For each program we need to extract the LLVMIR and create
@@ -2037,13 +2037,12 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) {
    }
    std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames);
    delete [] kernelNames;
-    std::vector<std::string>::iterator it = vKernels.begin();
    bool dynamicParallelism = false;
    aclMetadata md;
    md.numHiddenKernelArgs = 0;
    size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs);
-    for (it; it != vKernels.end(); ++it) {
-      std::string kernelName(*it);
+    for (const auto& it : vKernels) {
+      std::string kernelName(it);
      std::string openclKernelName = Kernel::openclMangledName(kernelName);
      errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS,
                               openclKernelName.c_str(), &md.numHiddenKernelArgs,
@@ -256,7 +256,7 @@ class NullProgram : public device::Program {
  /*! Get all per-kernel IL from programIL, where programIL is the IL for the
   *  whole compilation unit.
   */
-  bool getAllKernelILs(std::map<std::string, std::string>& allKernelILs, std::string& programIL,
+  bool getAllKernelILs(std::unordered_map<std::string, std::string>& allKernelILs, std::string& programIL,
                       const char* ilKernelName);

 protected:
@@ -322,7 +322,7 @@ class Program : public NullProgram {
                                   size_t binarySize = 0           //!< the machine code size
                                   );

-  typedef std::map<uint, gpu::Memory*> HwConstBuffers;
+  typedef std::unordered_map<uint, gpu::Memory*> HwConstBuffers;

  //! Global HW constant buffers
  const HwConstBuffers& glbHwCb() const { return constBufs_; }
@@ -1744,7 +1744,7 @@ bool ResourceCache::addCalResource(Resource::CalResourceDesc* desc, GslResourceR
      memcpy(descCached, desc, sizeof(Resource::CalResourceDesc));

      // Add the current resource to the cache
-      resCache_.push_front(std::make_pair(descCached, ref));
+      resCache_.push_front({descCached, ref});
      cacheSize_ += size;
      result = true;
    }
@@ -545,9 +545,9 @@ VirtualGPU::~VirtualGPU() {

  uint i;
  // Destroy all kernels
-  for (GslKernels::const_iterator it = gslKernels_.begin(); it != gslKernels_.end(); ++it) {
-    if (it->first != 0) {
-      freeKernelDesc(it->second);
+  for (const auto& it : gslKernels_) {
+    if (it.first != 0) {
+      freeKernelDesc(it.second);
    }
  }
  gslKernels_.clear();
@@ -1365,10 +1365,9 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {

  profilingBegin(vcmd, true);

-  std::vector<amd::Memory*>::const_iterator itr;
-  for (itr = vcmd.memObjects().begin(); itr != vcmd.memObjects().end(); ++itr) {
+  for (const auto& it : vcmd.memObjects()) {
    // Find device memory
-    gpu::Memory* memory = dev().getGpuMemory(*itr);
+    gpu::Memory* memory = dev().getGpuMemory(it);

    if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
      memory->mgpuCacheWriteBack();
@@ -2016,7 +2015,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {

    // Loop through all outstanding command batches
    while (!cbList_.empty()) {
-      CommandBatchList::const_iterator it = cbList_.begin();
+      const auto it = cbList_.cbegin();
      // Wait for completion
      foundEvent = awaitCompletion(*it, vcmd.waitingEvent());
      // Release a command batch
@@ -2210,8 +2209,8 @@ void VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand&
      const size_t memObjSize = cmd.getMemoryObjectSize();
      const std::vector<amd::Memory*>& memObj = cmd.getMemList();
      size_t se = 0;
-      for (std::vector<amd::Memory *>::const_iterator itMemObj = memObj.begin();
-           itMemObj != memObj.end(); ++itMemObj, ++se) {
+      for (auto itMemObj = memObj.cbegin();
+           itMemObj != memObj.cend(); ++itMemObj, ++se) {
        // Find GSL Mem Object
        gslMemObject gslMemObj = dev().getGpuMemory(*itMemObj)->gslResource();

@@ -2297,15 +2296,14 @@ void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {

  profilingBegin(vcmd);

-  for (std::vector<amd::Memory*>::const_iterator it = vcmd.getMemList().begin();
-       it != vcmd.getMemList().end(); ++it) {
+  for (const auto& it : vcmd.getMemList()) {
    // amd::Memory object should never be NULL
-    assert(*it && "Memory object for interop is NULL");
-    gpu::Memory* memory = dev().getGpuMemory(*it);
+    assert(it && "Memory object for interop is NULL");
+    gpu::Memory* memory = dev().getGpuMemory(it);

    // If resource is a shared copy of original resource, then
    // runtime needs to copy data from original resource
-    (*it)->getInteropObj()->copyOrigToShared();
+    it->getInteropObj()->copyOrigToShared();

    // Check if OpenCL has direct access to the interop memory
    if (memory->interopType() == Memory::InteropDirectAccess) {
@@ -2336,11 +2334,10 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {

  profilingBegin(vcmd);

-  for (std::vector<amd::Memory*>::const_iterator it = vcmd.getMemList().begin();
-       it != vcmd.getMemList().end(); ++it) {
+  for (const auto& it : vcmd.getMemList()) {
    // amd::Memory object should never be NULL
-    assert(*it && "Memory object for interop is NULL");
-    gpu::Memory* memory = dev().getGpuMemory(*it);
+    assert(it && "Memory object for interop is NULL");
+    gpu::Memory* memory = dev().getGpuMemory(it);

    // Check if we can use HW interop
    if (memory->interopType() == Memory::InteropHwEmulation) {
@@ -2362,7 +2359,7 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {

    // If resource is a shared copy of original resource, then
    // runtime needs to copy data back to original resource
-    (*it)->getInteropObj()->copySharedToOrig();
+    it->getInteropObj()->copySharedToOrig();
  }

  profilingEnd(vcmd);
@@ -2513,7 +2510,7 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
  wait |= state_.forceWait_;
  // Loop through all outstanding command batches
  while (!cbList_.empty()) {
-    CommandBatchList::const_iterator it = cbList_.begin();
+    const auto it = cbList_.cbegin();
    // Check if command batch finished without a wait
    bool finished = true;
    for (uint i = 0; i < AllEngines; ++i) {
@@ -2537,8 +2534,8 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
 void VirtualGPU::enableSyncedBlit() const { return blitMgr_->enableSynchronization(); }

 void VirtualGPU::releaseMemObjects(bool scratch) {
-  for (GpuEvents::const_iterator it = gpuEvents_.begin(); it != gpuEvents_.end(); ++it) {
-    GpuEvent event = it->second;
+  for (const auto& it : gpuEvents_) {
+    GpuEvent event = it.second;
    waitForEvent(&event);
  }
  // Unbind all resources.So the queue won't have any bound mem objects
@@ -380,8 +380,8 @@ class VirtualGPU : public device::VirtualDevice, public CALGSLContext {
      ) const;

 private:
-  typedef std::map<CALimage, GslKernelDesc*> GslKernels;
-  typedef std::map<gslMemObject, GpuEvent> GpuEvents;
+  typedef std::unordered_map<CALimage, GslKernelDesc*> GslKernels;
+  typedef std::unordered_map<gslMemObject, GpuEvent> GpuEvents;

  //! Finds total amount of necessary iterations
  inline void findIterations(const amd::NDRangeContainer& sizes,  //!< Original workload sizes
@@ -11,10 +11,8 @@ namespace pal {

 AppProfile::AppProfile()
    : amd::AppProfile(), enableHighPerformanceState_(true), reportAsOCL12Device_(false) {
-  propertyDataMap_.insert(DataMap::value_type(
-      "HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)));
+  propertyDataMap_.insert({"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)});

-  propertyDataMap_.insert(
-      DataMap::value_type("OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_)));
+  propertyDataMap_.insert({"OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_)});
 }
 }
@@ -65,9 +65,8 @@ bool HSAILProgram::compileImpl(const std::string& sourceCode,
    std::string headerIncludeName(headerIncludeNames[i]);
    // replace / in path with current os's file separator
    if (amd::Os::fileSeparator() != '/') {
-      for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end();
-           it != end; ++it) {
-        if (*it == '/') *it = amd::Os::fileSeparator();
+      for (auto& it : headerIncludeName) {
+        if (it == '/') it = amd::Os::fileSeparator();
      }
    }
    size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
@@ -282,9 +281,8 @@ bool LightningProgram::compileImpl(const std::string& sourceCode,
    std::string headerIncludeName(headerIncludeNames[i]);
    // replace / in path with current os's file separator
    if (amd::Os::fileSeparator() != '/') {
-      for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end();
-           it != end; ++it) {
-        if (*it == '/') *it = amd::Os::fileSeparator();
+      for (auto& it : headerIncludeName) {
+        if (it == '/') it = amd::Os::fileSeparator();
      }
    }
    size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
@@ -343,10 +341,10 @@ bool LightningProgram::compileImpl(const std::string& sourceCode,
    case 100:
    case 110:
    case 120:
-      hdr = std::make_pair(opencl1_2_c_amdgcn, opencl1_2_c_amdgcn_size);
+      hdr = {opencl1_2_c_amdgcn, opencl1_2_c_amdgcn_size};
      break;
    case 200:
-      hdr = std::make_pair(opencl2_0_c_amdgcn, opencl2_0_c_amdgcn_size);
+      hdr = {opencl2_0_c_amdgcn, opencl2_0_c_amdgcn_size};
      break;
    default:
      buildLog_ += "Unsupported requested OpenCL C version (-cl-std).\n";
@@ -1081,7 +1081,7 @@ device::Program* Device::createProgram(amd::option::Options* options) {
 }

 //! Requested devices list as configured by the GPU_DEVICE_ORDINAL
-typedef std::map<int, bool> requestedDevices_t;
+typedef std::unordered_map<int, bool> requestedDevices_t;

 //! Parses the requested list of devices to be exposed to the user.
 static void parseRequestedDeviceList(requestedDevices_t& requestedDevices) {
@@ -539,7 +539,7 @@ bool PrintfDbgHSA::init(VirtualGPU& gpu, bool printfEnabled) {
    // First DWORD = Offset to where next information is to
    // be written, initialized to 0
    // Second DWORD = Number of bytes available for printf data
-    // = buffer size – 2*sizeof(uint32_t)
+    // = buffer size - 2*sizeof(uint32_t)
    const uint8_t initSize = 2 * sizeof(uint32_t);
    uint8_t sysMem[initSize];
    memset(sysMem, 0, initSize);
@@ -597,7 +597,6 @@ bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled,
        return false;
      }

-      std::vector<uint>::const_iterator ita;
      uint sb = 0;
      uint sbt = 0;

@@ -609,8 +608,8 @@ bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled,
        }
        const PrintfInfo& info = printfInfo[(*dbgBufferPtr)];
        sb += sizeof(uint32_t);
-        for (ita = info.arguments_.begin(); ita != info.arguments_.end(); ++ita) {
-          sb += *ita;
+        for (const auto& it : info.arguments_) {
+          sb += it;
        }

        if (sbt + sb > bufSize) {
@@ -228,8 +228,8 @@ bool HSAILProgram::linkImpl(const std::vector<device::Program*>& inputPrograms,
  assert(!"Should not reach here");
  return false;
 #else   // !defined(WITH_LIGHTNING_COMPILER)
-  std::vector<device::Program*>::const_iterator it = inputPrograms.begin();
-  std::vector<device::Program*>::const_iterator itEnd = inputPrograms.end();
+  auto it = inputPrograms.cbegin();
+  const auto itEnd = inputPrograms.cend();
  acl_error errorCode;

  // For each program we need to extract the LLVMIR and create
@@ -656,10 +656,9 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) {
    }
    std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames);
    delete [] kernelNames;
-    std::vector<std::string>::iterator it = vKernels.begin();
    bool dynamicParallelism = false;
-    for (it; it != vKernels.end(); ++it) {
-      std::string kernelName(*it);
+    for (const auto& it : vKernels) {
+      std::string kernelName(it);
      std::string openclKernelName = device::Kernel::openclMangledName(kernelName);

      HSAILKernel* aKernel =
@@ -1133,7 +1133,7 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
  Pal::GpuMemoryCreateInfo createInfo = {};
  createInfo.size = desc().width_ * elementSize_;
  createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
-  createInfo.alignment = MaxGpuAlignment;
+  createInfo.alignment = desc().scratch_ ? 64*Ki : MaxGpuAlignment;
  createInfo.vaRange = Pal::VaRange::Default;
  createInfo.priority = Pal::GpuMemPriority::Normal;

@@ -1970,7 +1970,7 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,

      amd::ScopedLock l(&lockCacheOps_);
      // Add the current resource to the cache
-      resCache_.push_front(std::make_pair(descCached, ref));
+      resCache_.push_front({descCached, ref});
      ref->gpu_ = nullptr;
      cacheSize_ += size;
      result = true;
@@ -1802,10 +1802,9 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {

  profilingBegin(vcmd, true);

-  std::vector<amd::Memory*>::const_iterator itr;
-  for (itr = vcmd.memObjects().begin(); itr != vcmd.memObjects().end(); ++itr) {
+  for (const auto& it : vcmd.memObjects()) {
    // Find device memory
-    pal::Memory* memory = dev().getGpuMemory(*itr);
+    pal::Memory* memory = dev().getGpuMemory(it);

    if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
      memory->mgpuCacheWriteBack();
@@ -2478,15 +2477,14 @@ void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {

  profilingBegin(vcmd);

-  for (std::vector<amd::Memory*>::const_iterator it = vcmd.getMemList().begin();
-       it != vcmd.getMemList().end(); ++it) {
+  for (const auto& it : vcmd.getMemList()) {
    // amd::Memory object should never be nullptr
-    assert(*it && "Memory object for interop is nullptr");
-    pal::Memory* memory = dev().getGpuMemory(*it);
+    assert(it && "Memory object for interop is nullptr");
+    pal::Memory* memory = dev().getGpuMemory(it);

    // If resource is a shared copy of original resource, then
    // runtime needs to copy data from original resource
-    (*it)->getInteropObj()->copyOrigToShared();
+    it->getInteropObj()->copyOrigToShared();

    // Check if OpenCL has direct access to the interop memory
    if (memory->interopType() == Memory::InteropDirectAccess) {
@@ -2517,11 +2515,10 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {

  profilingBegin(vcmd);

-  for (std::vector<amd::Memory*>::const_iterator it = vcmd.getMemList().begin();
-       it != vcmd.getMemList().end(); ++it) {
+  for (const auto& it : vcmd.getMemList()) {
    // amd::Memory object should never be nullptr
-    assert(*it && "Memory object for interop is nullptr");
-    pal::Memory* memory = dev().getGpuMemory(*it);
+    assert(it && "Memory object for interop is nullptr");
+    pal::Memory* memory = dev().getGpuMemory(it);

    // Check if we can use HW interop
    if (memory->interopType() == Memory::InteropHwEmulation) {
@@ -2543,7 +2540,7 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {

    // If resource is a shared copy of original resource, then
    // runtime needs to copy data back to original resource
-    (*it)->getInteropObj()->copySharedToOrig();
+    it->getInteropObj()->copySharedToOrig();
  }

  profilingEnd(vcmd);
@@ -163,7 +163,7 @@ class VirtualGPU : public device::VirtualDevice {
    uint cmdBufIdCurrent_;  //!< Current global command buffer ID
    uint cmbBufIdRetired_;  //!< The last retired command buffer ID
    uint cmdCnt_;           //!< Counter of commands
-    std::map<GpuMemoryReference*, uint> memReferences_;
+    std::unordered_map<GpuMemoryReference*, uint> memReferences_;
    Util::VirtualLinearAllocator vlAlloc_;
    std::vector<Pal::GpuMemoryRef> palMemRefs_;
    std::vector<Pal::IGpuMemory*> palMems_;
@@ -51,19 +51,19 @@ foreach(AMDGCN_LIB_TARGET ${AMDGCN_LIB_TARGETS})
  if (${AMDGCN_LIB_TARGET} MATCHES "^oclc_isa_version_[0-9]+_lib$")
    string(REGEX REPLACE "^oclc_isa_version_([0-9]+)_lib$" "\\1" gfxip ${AMDGCN_LIB_TARGET})
    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/libraries.amdgcn.inc
-      "case ${gfxip}: return std::make_pair(oclc_isa_version_${gfxip}_amdgcn, oclc_isa_version_${gfxip}_amdgcn_size); break;\n")
+      "case ${gfxip}: return {oclc_isa_version_${gfxip}_amdgcn, oclc_isa_version_${gfxip}_amdgcn_size}; break;\n")
  endif()
 endforeach()
 file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/libraries.amdgcn.inc
-  "default: return std::make_pair((const void*)0,(size_t)0);\n}\n}\n")
+  "default: return {(const void*)0,(size_t)0};\n}\n}\n")

 foreach(AMDGCN_LIB_TARGET ${AMDGCN_LIB_TARGETS})
  if (${AMDGCN_LIB_TARGET} MATCHES "oclc_(.*)_on_lib")
    string(REGEX REPLACE "oclc_(.*)_on_lib" "\\1" function ${AMDGCN_LIB_TARGET})
    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/libraries.amdgcn.inc
-      "static inline std::pair<const char*, size_t> get_oclc_${function}(bool on)\n{ return std::make_pair("
+      "static inline std::pair<const char*, size_t> get_oclc_${function}(bool on)\n{ return {"
                "(const char*)(on ? oclc_${function}_on_amdgcn : oclc_${function}_off_amdgcn),"
-                "on ? oclc_${function}_on_amdgcn_size : oclc_${function}_off_amdgcn_size);}\n")
+                "on ? oclc_${function}_on_amdgcn_size : oclc_${function}_off_amdgcn_size};}\n")
  endif()
 endforeach()

@@ -183,8 +183,7 @@ void* ProDevice::AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr)
                                          flags, &buf_size, &ptr, nullptr, nullptr)) {
        // Ask GPUPro driver to provide CPU access to allocation
        if (0 == Funcs().AmdgpuBoCpuMap(buf_handle, host_ptr)) {
-          allocs_.insert(std::pair<void*, std::pair<amdgpu_bo_handle, uint32_t>>(
-                         ptr, std::pair<amdgpu_bo_handle, uint32_t>(buf_handle, shared_handle)));
+          allocs_.insert({ptr, {buf_handle, shared_handle}});
        }
        else {
          hsa_amd_interop_unmap_buffer(ptr);
@@ -9,7 +9,7 @@
 #include "profuncs.hpp"
 #include "prodriver.hpp"
 #include "thread/monitor.hpp"
-#include <map>
+#include <unordered_map>

 /*! \addtogroup HSA
 *  @{
@@ -53,7 +53,7 @@ private:
  amdgpu_device_handle  dev_handle_;  //!< AMD gpu device handle
  amdgpu_gpu_info       gpu_info_;    //!< GPU info structure
  amdgpu_heap_info      heap_info_;   //!< Information about memory
-  mutable std::map<void*, std::pair<amdgpu_bo_handle, uint32_t>> allocs_; //!< Alloced memory mapping
+  mutable std::unordered_map<void*, std::pair<amdgpu_bo_handle, uint32_t>> allocs_; //!< Alloced memory mapping
  amd::Monitor*         alloc_ops_;   //!< Serializes memory allocations/destructions
 };

@@ -10,7 +10,7 @@

 namespace roc {

-typedef std::map<std::string, device::Kernel*> NameKernelMap;
+typedef std::unordered_map<std::string, device::Kernel*> NameKernelMap;

 class ClBinary : public device::ClBinary {
 public:
@@ -7,7 +7,6 @@
 #include <sstream>
 #include <fstream>
 #include <iostream>
-#include <iterator>

 #include "os/os.hpp"
 #include "rocdevice.hpp"
@@ -80,9 +79,8 @@ bool HSAILProgram::compileImpl(const std::string& sourceCode,
    std::string headerIncludeName(headerIncludeNames[i]);
    // replace / in path with current os's file separator
    if (amd::Os::fileSeparator() != '/') {
-      for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end();
-           it != end; ++it) {
-        if (*it == '/') *it = amd::Os::fileSeparator();
+      for (auto& it : headerIncludeName) {
+        if (it == '/') it = amd::Os::fileSeparator();
      }
    }
    size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
@@ -249,9 +247,8 @@ bool LightningProgram::compileImpl(const std::string& sourceCode,
    std::string headerIncludeName(headerIncludeNames[i]);
    // replace / in path with current os's file separator
    if (amd::Os::fileSeparator() != '/') {
-      for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end();
-           it != end; ++it) {
-        if (*it == '/') *it = amd::Os::fileSeparator();
+      for (auto& it : headerIncludeName) {
+        if (it == '/') it = amd::Os::fileSeparator();
      }
    }
    size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
@@ -309,10 +306,10 @@ bool LightningProgram::compileImpl(const std::string& sourceCode,
    case 100:
    case 110:
    case 120:
-      hdr = std::make_pair(opencl1_2_c_amdgcn, opencl1_2_c_amdgcn_size);
+      hdr = {opencl1_2_c_amdgcn, opencl1_2_c_amdgcn_size};
      break;
    case 200:
-      hdr = std::make_pair(opencl2_0_c_amdgcn, opencl2_0_c_amdgcn_size);
+      hdr = {opencl2_0_c_amdgcn, opencl2_0_c_amdgcn_size};
      break;
    default:
      buildLog_ += "Unsupported requested OpenCL C version (-cl-std).\n";
@@ -336,12 +336,11 @@ uint64_t PerfCounter::getInfo(uint64_t infoType) const {
                                                              &data);

      uint64_t result = 0;
-      std::vector<hsa_ven_amd_aqlprofile_info_data_t>::iterator it;
-      for (it = data.begin(); it != data.end(); ++it) {
-        if (it->pmc_data.event.block_name == event_.block_name &&
-            it->pmc_data.event.block_index == event_.block_index &&
-            it->pmc_data.event.counter_id == event_.counter_id) {
-            result += it->pmc_data.result;
+      for (const auto& it : data) {
+        if (it.pmc_data.event.block_name == event_.block_name &&
+            it.pmc_data.event.block_index == event_.block_index &&
+            it.pmc_data.event.counter_id == event_.counter_id) {
+            result += it.pmc_data.result;
        }
      }
      return result;
@@ -415,7 +415,6 @@ bool PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled,
      return false;
    }

-    std::vector<uint>::const_iterator ita;
    uint sb = 0;
    uint sbt = 0;

@@ -427,8 +426,8 @@ bool PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled,
      }
      const PrintfInfo& info = printfInfo[(*dbgBufferPtr)];
      sb += sizeof(uint32_t);
-      for (ita = info.arguments_.begin(); ita != info.arguments_.end(); ++ita) {
-        sb += *ita;
+      for (const auto& ita : info.arguments_) {
+        sb += ita;
      }

      size_t idx = 1;
@@ -24,7 +24,6 @@
 #include <fstream>
 #include <sstream>
 #include <iostream>
-#include <iterator>

 namespace roc {

@@ -535,8 +534,8 @@ aclType HSAILProgram::getCompilationStagesFromBinary(std::vector<aclType>& compl

 bool HSAILProgram::linkImpl(const std::vector<device::Program*>& inputPrograms,
                            amd::option::Options* options, bool createLibrary) {
-  std::vector<device::Program*>::const_iterator it = inputPrograms.begin();
-  std::vector<device::Program*>::const_iterator itEnd = inputPrograms.end();
+  auto it = inputPrograms.cbegin();
+  const auto itEnd = inputPrograms.cend();
  acl_error errorCode;

  // For each program we need to extract the LLVMIR and create
@@ -370,11 +370,10 @@ void UnmapMemoryCommand::releaseResources() {

 bool MigrateMemObjectsCommand::validateMemory() {
  if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) {
-    std::vector<amd::Memory*>::const_iterator itr;
-    for (itr = memObjects_.begin(); itr != memObjects_.end(); itr++) {
-      device::Memory* mem = (*itr)->getDeviceMemory(queue()->device());
+    for (const auto& it : memObjects_) {
+      device::Memory* mem = it->getDeviceMemory(queue()->device());
      if (NULL == mem) {
-        LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*itr)->getSize());
+        LogPrintfError("Can't allocate memory size - 0x%08X bytes!", it->getSize());
        return false;
      }
    }
@@ -434,11 +433,10 @@ cl_int NDRangeKernelCommand::validateMemory() {
 bool ExtObjectsCommand::validateMemory() {
  bool retVal = true;
  if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) {
-    for (std::vector<amd::Memory*>::const_iterator itr = memObjects_.begin();
-         itr != memObjects_.end(); itr++) {
-      device::Memory* mem = (*itr)->getDeviceMemory(queue()->device());
+    for (const auto& it : memObjects_) {
+      device::Memory* mem = it->getDeviceMemory(queue()->device());
      if (NULL == mem) {
-        LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*itr)->getSize());
+        LogPrintfError("Can't allocate memory size - 0x%08X bytes!", it->getSize());
        return false;
      }
      retVal = processGLResource(mem);
@@ -457,11 +455,10 @@ bool ReleaseExtObjectsCommand::processGLResource(device::Memory* mem) {

 bool MakeBuffersResidentCommand::validateMemory() {
  if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) {
-    for (std::vector<amd::Memory*>::const_iterator itr = memObjects_.begin();
-         itr != memObjects_.end(); itr++) {
-      device::Memory* mem = (*itr)->getDeviceMemory(queue()->device());
+    for (const auto& it : memObjects_) {
+      device::Memory* mem = it->getDeviceMemory(queue()->device());
      if (NULL == mem) {
-        LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*itr)->getSize());
+        LogPrintfError("Can't allocate memory size - 0x%08X bytes!", it->getSize());
        return false;
      }
    }
@@ -471,16 +468,14 @@ bool MakeBuffersResidentCommand::validateMemory() {
 }
 bool ThreadTraceMemObjectsCommand::validateMemory() {
  if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) {
-    for (std::vector<amd::Memory*>::const_iterator itr = memObjects_.begin();
-         itr != memObjects_.end(); itr++) {
-      device::Memory* mem = (*itr)->getDeviceMemory(queue()->device());
+    for (auto it = memObjects_.cbegin(); it != memObjects_.cend(); it++) {  // by-value iterator
+      device::Memory* mem = (*it)->getDeviceMemory(queue()->device());
      if (NULL == mem) {
-        std::vector<amd::Memory*>::const_iterator tmpItr;
-        for (tmpItr = memObjects_.begin(); tmpItr != itr; tmpItr++) {
-          device::Memory* tmpMem = (*tmpItr)->getDeviceMemory(queue()->device());
+        for (auto tmpIt = memObjects_.cbegin(); tmpIt != it; tmpIt++) {
+          device::Memory* tmpMem = (*tmpIt)->getDeviceMemory(queue()->device());
          delete tmpMem;
        }
-        LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*itr)->getSize());
+        LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*it)->getSize());
        return false;
      }
    }
@@ -720,10 +720,9 @@ class MigrateMemObjectsCommand : public Command {
                           const std::vector<amd::Memory*>& memObjects,
                           cl_mem_migration_flags flags)
      : Command(queue, type, eventWaitList), migrationFlags_(flags) {
-    std::vector<amd::Memory*>::const_iterator itr;
-    for (itr = memObjects.begin(); itr != memObjects.end(); itr++) {
-      (*itr)->retain();
-      memObjects_.push_back(*itr);
+    for (const auto& it : memObjects) {
+      it->retain();
+      memObjects_.push_back(it);
    }
  }

@@ -731,9 +730,8 @@ class MigrateMemObjectsCommand : public Command {

  //! Release all resources associated with this command
  void releaseResources() {
-    std::vector<amd::Memory*>::const_iterator itr;
-    for (itr = memObjects_.begin(); itr != memObjects_.end(); itr++) {
-      (*itr)->release();
+    for (const auto& it : memObjects_) {
+      it->release();
    }
    Command::releaseResources();
  }
@@ -837,18 +835,16 @@ class ExtObjectsCommand : public Command {
  ExtObjectsCommand(HostQueue& queue, const EventWaitList& eventWaitList, cl_uint num_objects,
                    const std::vector<amd::Memory*>& memoryObjects, cl_command_type type)
      : Command(queue, type, eventWaitList) {
-    for (std::vector<amd::Memory*>::const_iterator itr = memoryObjects.begin();
-         itr != memoryObjects.end(); itr++) {
-      (*itr)->retain();
-      memObjects_.push_back(*itr);
+    for (const auto& it : memoryObjects) {
+      it->retain();
+      memObjects_.push_back(it);
    }
  }

  //! Release all resources associated with this command
  void releaseResources() {
-    for (std::vector<amd::Memory*>::const_iterator itr = memObjects_.begin();
-         itr != memObjects_.end(); itr++) {
-      (*itr)->release();
+    for (const auto& it : memObjects_) {
+      it->release();
    }
    Command::releaseResources();
  }
@@ -954,9 +950,8 @@ class ThreadTraceMemObjectsCommand : public Command {
  //! Release all resources associated with this command
  void releaseResources() {
    threadTrace_.release();
-    for (std::vector<amd::Memory*>::const_iterator itr = memObjects_.begin();
-         itr != memObjects_.end(); itr++) {
-      (*itr)->release();
+    for (const auto& itr : memObjects_) {
+      itr->release();
    }
    Command::releaseResources();
  }
@@ -1067,19 +1062,17 @@ class MakeBuffersResidentCommand : public Command {
                             const std::vector<amd::Memory*>& memObjects,
                             cl_bus_address_amd* busAddr)
      : Command(queue, type, eventWaitList), busAddresses_(busAddr) {
-    std::vector<amd::Memory*>::const_iterator itr;
-    for (itr = memObjects.begin(); itr != memObjects.end(); itr++) {
-      (*itr)->retain();
-      memObjects_.push_back(*itr);
+    for (const auto& it : memObjects) {
+      it->retain();
+      memObjects_.push_back(it);
    }
  }

  virtual void submit(device::VirtualDevice& device) { device.submitMakeBuffersResident(*this); }

  void releaseResources() {
-    std::vector<amd::Memory*>::const_iterator itr;
-    for (itr = memObjects_.begin(); itr != memObjects_.end(); itr++) {
-      (*itr)->release();
+    for (const auto& it : memObjects_) {
+      it->release();
    }
    Command::releaseResources();
  }
@@ -96,15 +96,14 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {

    // Process the command's event wait list.
    const Command::EventWaitList& events = command->eventWaitList();
-    Command::EventWaitList::const_iterator it;
    bool dependencyFailed = false;

-    for (it = events.begin(); it != events.end(); ++it) {
+    for (const auto& it : events) {
      // Only wait if the command is enqueued into another queue.
-      if ((*it)->command().queue() != this) {
+      if (it->command().queue() != this) {
        virtualDevice->flush(head, true);
        tail = head = NULL;
-        dependencyFailed |= !(*it)->awaitCompletion();
+        dependencyFailed |= !it->awaitCompletion();
      }
    }

@@ -62,10 +62,9 @@ Context::~Context() {

  // Dissociate OCL context with any external device
  if (info_.flags_ & (GLDeviceKhr | D3D10DeviceKhr | D3D11DeviceKhr)) {
-    std::vector<Device*>::const_iterator it;
    // Loop through all devices
-    for (it = devices_.begin(); it != devices_.end(); it++) {
-      (*it)->unbindExternalDevice(info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY);
+    for (const auto& it : devices_) {
+      it->unbindExternalDevice(info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY);
    }
  }

@@ -218,10 +217,9 @@ int Context::create(const intptr_t* properties) {
  // Check if OCL context can be associated with any external device
  if (info_.flags_ & (D3D10DeviceKhr | D3D11DeviceKhr | GLDeviceKhr | D3D9DeviceKhr |
                      D3D9DeviceEXKhr | D3D9DeviceVAKhr)) {
-    std::vector<Device*>::const_iterator it;
    // Loop through all devices
-    for (it = devices_.begin(); it != devices_.end(); it++) {
-      if (!(*it)->bindExternalDevice(info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY)) {
+    for (const auto& it : devices_) {
+      if (!it->bindExternalDevice(info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY)) {
        result = CL_INVALID_VALUE;
      }
    }
@@ -331,10 +329,9 @@ void Context::svmFree(void* ptr) const {
 }

 bool Context::containsDevice(const Device* device) const {
-  std::vector<Device*>::const_iterator it;

-  for (it = devices_.begin(); it != devices_.end(); ++it) {
-    if (device == *it || (*it)->isAncestor(device)) {
+  for (const auto& it : devices_) {
+    if (device == it || it->isAncestor(device)) {
      return true;
    }
  }
@@ -342,8 +339,8 @@ bool Context::containsDevice(const Device* device) const {
 }

 DeviceQueue* Context::defDeviceQueue(const Device& dev) const {
-  std::map<const Device*, DeviceQueueInfo>::const_iterator it = deviceQueues_.find(&dev);
-  if (it != deviceQueues_.end()) {
+  const auto it = deviceQueues_.find(&dev);
+  if (it != deviceQueues_.cend()) {
    return it->second.defDeviceQueue_;
  } else {
    return NULL;
@@ -11,7 +11,7 @@
 #include "platform/agent.hpp"

 #include <vector>
-#include <map>
+#include <unordered_map>

 namespace amd {

@@ -197,8 +197,8 @@ class Context : public RuntimeObject {
  GLFunctions* glenv_;                   //!< OpenGL context
  Device* customHostAllocDevice_;        //!< Device responsible for host allocations
  std::vector<Device*> svmAllocDevice_;  //!< Devices can support SVM allocations
-  std::map<const Device*, DeviceQueueInfo> deviceQueues_;  //!< Device queues mapping
-  mutable Monitor ctxLock_;                                //!< Lock for the context access
+  std::unordered_map<const Device*, DeviceQueueInfo> deviceQueues_;  //!< Device queues mapping
+  mutable Monitor ctxLock_;                                          //!< Lock for the context access
 };

 /*! @}
@@ -18,6 +18,7 @@
 #include <vector>
 #include <list>
 #include <map>
+#include <unordered_map>

 namespace device {
 class Memory;
@@ -124,7 +125,7 @@ class Memory : public amd::RuntimeObject {
  DeviceMemory* deviceMemories_;

  //! The device alloced state
-  std::map<const Device*, AllocState> deviceAlloced_;
+  std::unordered_map<const Device*, AllocState> deviceAlloced_;

  //! Linked list of destructor callbacks.
  std::atomic<DestructorCallBackEntry*> destructorCallbacks_;
@@ -24,7 +24,7 @@ namespace amd {
 */
 class PerfCounter : public RuntimeObject {
 public:
-  typedef std::map<cl_perfcounter_property, ulong> Properties;
+  typedef std::unordered_map<cl_perfcounter_property, ulong> Properties;

  //! Constructor of the performance counter object
  PerfCounter(const Device& device,    //!< device object
@@ -21,13 +21,12 @@ namespace amd {

 Program::~Program() {
  // Destroy all device programs
-  deviceprograms_t::const_iterator it, itEnd;
-  for (it = devicePrograms_.begin(), itEnd = devicePrograms_.end(); it != itEnd; ++it) {
-    delete it->second;
+  for (const auto& it : devicePrograms_) {
+    delete it.second;
  }

-  for (devicebinary_t::const_iterator IT = binary_.begin(), IE = binary_.end(); IT != IE; ++IT) {
-    const binary_t& Bin = IT->second;
+  for (const auto& it : binary_) {
+    const binary_t& Bin = it.second;
    if (Bin.first) {
      delete[] Bin.first;
    }
@@ -43,8 +42,8 @@ const Symbol* Program::findSymbol(const char* kernelName) const {
    return NULL;
  }

-  symbols_t::const_iterator it = symbolTable_->find(kernelName);
-  return (it == symbolTable_->end()) ? NULL : &it->second;
+  const auto it = symbolTable_->find(kernelName);
+  return (it == symbolTable_->cend()) ? NULL : &it->second;
 }

 cl_int Program::addDeviceProgram(Device& device, const void* image, size_t length,
@@ -151,8 +150,8 @@ cl_int Program::addDeviceProgram(Device& device, const void* image, size_t lengt
 }

 device::Program* Program::getDeviceProgram(const Device& device) const {
-  deviceprograms_t::const_iterator it = devicePrograms_.find(&device.rootDevice());
-  if (it == devicePrograms_.end()) {
+  const auto it = devicePrograms_.find(&device.rootDevice());
+  if (it == devicePrograms_.cend()) {
    return NULL;
  }
  return it->second;
@@ -198,16 +197,15 @@ cl_int Program::compile(const std::vector<Device*>& devices, size_t numHeaders,
  }

  // Compile the program programs associated with the given devices.
-  std::vector<Device*>::const_iterator it;
-  for (it = devices.begin(); it != devices.end(); ++it) {
-    device::Program* devProgram = getDeviceProgram(**it);
+  for (const auto& it : devices) {
+    device::Program* devProgram = getDeviceProgram(*it);
    if (devProgram == NULL) {
-      const binary_t& bin = binary(**it);
-      retval = addDeviceProgram(**it, bin.first, bin.second, &parsedOptions);
+      const binary_t& bin = binary(*it);
+      retval = addDeviceProgram(*it, bin.first, bin.second, &parsedOptions);
      if (retval != CL_SUCCESS) {
        return retval;
      }
-      devProgram = getDeviceProgram(**it);
+      devProgram = getDeviceProgram(*it);
    }

    if (devProgram->type() == device::Program::TYPE_INTERMEDIATE || language_ == SPIRV) {
@@ -277,8 +275,7 @@ cl_int Program::link(const std::vector<Device*>& devices, size_t numInputs,
  }

  // Link the program programs associated with the given devices.
-  std::vector<Device*>::const_iterator it;
-  for (it = devices.begin(); it != devices.end(); ++it) {
+  for (const auto& it : devices) {
    // find the corresponding device program in each input program
    std::vector<device::Program*> inputDevPrograms(numInputs);
    bool found = false;
@@ -288,8 +285,8 @@ cl_int Program::link(const std::vector<Device*>& devices, size_t numInputs,
        parsedOptions.oVariables->BinaryIsSpirv = true;
      }
      deviceprograms_t inputDevProgs = inputProgram.devicePrograms();
-      deviceprograms_t::const_iterator findIt = inputDevProgs.find(*it);
-      if (findIt == inputDevProgs.end()) {
+      const auto findIt = inputDevProgs.find(it);
+      if (findIt == inputDevProgs.cend()) {
        if (found) break;
        continue;
      }
@@ -328,14 +325,14 @@ cl_int Program::link(const std::vector<Device*>& devices, size_t numInputs,
      return CL_INVALID_VALUE;
    }

-    device::Program* devProgram = getDeviceProgram(**it);
+    device::Program* devProgram = getDeviceProgram(*it);
    if (devProgram == NULL) {
-      const binary_t& bin = binary(**it);
-      retval = addDeviceProgram(**it, bin.first, bin.second, &parsedOptions);
+      const binary_t& bin = binary(*it);
+      retval = addDeviceProgram(*it, bin.first, bin.second, &parsedOptions);
      if (retval != CL_SUCCESS) {
        return retval;
      }
-      devProgram = getDeviceProgram(**it);
+      devProgram = getDeviceProgram(*it);
    }

    // We only build a Device-Program once
@@ -359,16 +356,14 @@ cl_int Program::link(const std::vector<Device*>& devices, size_t numInputs,
  }

  // Rebuild the symbol table
-  deviceprograms_t::iterator sit;
-  for (sit = devicePrograms_.begin(); sit != devicePrograms_.end(); ++sit) {
-    const Device& device = *sit->first;
-    const device::Program& program = *sit->second;
+  for (const auto& sit : devicePrograms_) {
+    const Device& device = *(sit.first);
+    const device::Program& program = *(sit.second);

    const device::Program::kernels_t& kernels = program.kernels();
-    device::Program::kernels_t::const_iterator kit;
-    for (kit = kernels.begin(); kit != kernels.end(); ++kit) {
-      const std::string& name = kit->first;
-      const device::Kernel* devKernel = kit->second;
+    for (const auto& it : kernels) {
+      const std::string& name = it.first;
+      const device::Kernel* devKernel = it.second;

      Symbol& symbol = (*symbolTable_)[name];
      if (!symbol.setDeviceKernel(device, devKernel)) {
@@ -379,9 +374,8 @@ cl_int Program::link(const std::vector<Device*>& devices, size_t numInputs,

  // Create a string with all kernel names from the program
  if (kernelNames_.length() == 0) {
-    amd::Program::symbols_t::const_iterator it;
-    for (it = symbols().begin(); it != symbols().end(); ++it) {
-      if (it != symbols().begin()) {
+    for (auto it = symbols().cbegin(); it != symbols().cend(); ++it) {
+      if (it != symbols().cbegin()) {
        kernelNames_.append(1, ';');
      }
      kernelNames_.append(it->first.c_str());
@@ -474,20 +468,19 @@ cl_int Program::build(const std::vector<Device*>& devices, const char* options,
  }

  // Build the program programs associated with the given devices.
-  std::vector<Device*>::const_iterator it;
-  for (it = devices.begin(); it != devices.end(); ++it) {
-    device::Program* devProgram = getDeviceProgram(**it);
+  for (const auto& it : devices) {
+    device::Program* devProgram = getDeviceProgram(*it);
    if (devProgram == NULL) {
-      const binary_t& bin = binary(**it);
+      const binary_t& bin = binary(*it);
      if (sourceCode_.empty() && (bin.first == NULL)) {
        retval = false;
        continue;
      }
-      retval = addDeviceProgram(**it, bin.first, bin.second, &parsedOptions);
+      retval = addDeviceProgram(*it, bin.first, bin.second, &parsedOptions);
      if (retval != CL_SUCCESS) {
        return retval;
      }
-      devProgram = getDeviceProgram(**it);
+      devProgram = getDeviceProgram(*it);
    }

    parsedOptions.oVariables->AssumeAlias = true;
@@ -518,16 +511,14 @@ cl_int Program::build(const std::vector<Device*>& devices, const char* options,
  }

  // Rebuild the symbol table
-  deviceprograms_t::iterator sit;
-  for (sit = devicePrograms_.begin(); sit != devicePrograms_.end(); ++sit) {
-    const Device& device = *sit->first;
-    const device::Program& program = *sit->second;
+  for (const auto& it : devicePrograms_) {
+    const Device& device = *(it.first);
+    const device::Program& program = *(it.second);

    const device::Program::kernels_t& kernels = program.kernels();
-    device::Program::kernels_t::const_iterator kit;
-    for (kit = kernels.begin(); kit != kernels.end(); ++kit) {
-      const std::string& name = kit->first;
-      const device::Kernel* devKernel = kit->second;
+    for (const auto& kit : kernels) {
+      const std::string& name = kit.first;
+      const device::Kernel* devKernel = kit.second;

      Symbol& symbol = (*symbolTable_)[name];
      if (!symbol.setDeviceKernel(device, devKernel)) {
@@ -538,9 +529,8 @@ cl_int Program::build(const std::vector<Device*>& devices, const char* options,

  // Create a string with all kernel names from the program
  if (kernelNames_.length() == 0) {
-    amd::Program::symbols_t::const_iterator it;
-    for (it = symbols().begin(); it != symbols().end(); ++it) {
-      if (it != symbols().begin()) {
+    for (auto it = symbols().cbegin(); it != symbols().cend(); ++it) {
+      if (it != symbols().cbegin()) {
        kernelNames_.append(1, ';');
      }
      kernelNames_.append(it->first.c_str());
@@ -555,12 +545,10 @@ cl_int Program::build(const std::vector<Device*>& devices, const char* options,
 }

 void Program::clear() {
-  deviceprograms_t::iterator sit;
-
  // Destroy old programs if we have any
-  for (sit = devicePrograms_.begin(); sit != devicePrograms_.end(); ++sit) {
+  for (const auto& it : devicePrograms_) {
    // Destroy device program
-    delete sit->second;
+    delete it.second;
  }

  devicePrograms_.clear();
@@ -631,13 +619,13 @@ bool Symbol::setDeviceKernel(const Device& device, const device::Kernel* func, b

 const device::Kernel* Symbol::getDeviceKernel(const Device& device, bool noAlias) const {
  const devicekernels_t* devKernels = (noAlias) ? &deviceKernels_ : &devKernelsNoOpt_;
-  devicekernels_t::const_iterator itEnd = devKernels->end();
-  devicekernels_t::const_iterator it = devKernels->find(&device);
+  const auto itEnd = devKernels->cend();
+  auto it = devKernels->find(&device);
  if (it != itEnd) {
    return it->second;
  }

-  for (it = devKernels->begin(); it != itEnd; ++it) {
+  for (it = devKernels->cbegin(); it != itEnd; ++it) {
    if (it->first->isAncestor(&device)) {
      return it->second;
    }
@@ -35,7 +35,7 @@ namespace amd {
 //! A kernel function symbol
 class Symbol : public HeapObject {
 public:
-  typedef std::map<const Device*, const device::Kernel*> devicekernels_t;
+  typedef std::unordered_map<const Device*, const device::Kernel*> devicekernels_t;

 private:
  devicekernels_t deviceKernels_;    //! All device kernels objects.
@@ -68,9 +68,9 @@ class Program : public RuntimeObject {
 public:
  typedef std::pair<uint8_t*, size_t> binary_t;
  typedef std::set<Device const*> devicelist_t;
-  typedef std::map<Device const*, binary_t> devicebinary_t;
-  typedef std::map<Device const*, device::Program*> deviceprograms_t;
-  typedef std::map<std::string, Symbol> symbols_t;
+  typedef std::unordered_map<Device const*, binary_t> devicebinary_t;
+  typedef std::unordered_map<Device const*, device::Program*> deviceprograms_t;
+  typedef std::unordered_map<std::string, Symbol> symbols_t;

  enum Language {
    Binary = 0,
@@ -14,7 +14,7 @@ namespace amd {
 //! Abstraction layer sampler class
 class Sampler : public RuntimeObject {
 public:
-  typedef std::map<Device const*, device::Sampler*> DeviceSamplers;
+  typedef std::unordered_map<Device const*, device::Sampler*> DeviceSamplers;

  //! \note the sampler states must match the compiler's defines.
  //! See amd_ocl_sys_predef.c
@@ -5,7 +5,7 @@
 #include "top.hpp"
 #include "utils/flags.hpp"

-#include <map>
+#include <unordered_map>
 #include <string>
 #include <cstdlib>
 #include <cstring>
@@ -75,7 +75,7 @@ void Flag::tearDown() {
 }

 bool Flag::init() {
-  typedef std::map<std::string, const char*> vars_type;
+  typedef std::unordered_map<std::string, const char*> vars_type;
  vars_type vars;

 #ifdef _WIN32
@@ -116,8 +116,8 @@ bool Flag::init() {
  for (size_t i = 0; i < numFlags_; ++i) {
    Flag& flag = flags_[i];

-    vars_type::iterator it = vars.find(flag.name_);
-    if (it != vars.end()) {
+    const auto it = vars.find(flag.name_);
+    if (it != vars.cend()) {
      flag.setValue(it->second);
    }
  }