diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp
index 423b40312b..490f226cee 100644
--- a/projects/clr/rocclr/device/device.cpp
+++ b/projects/clr/rocclr/device/device.cpp
@@ -739,6 +739,18 @@ bool Device::ValidateHsail() {
   return true;
 }
 
+// Returns the stack-size limit for an ISA processor name (substring match on the gfx family).
+// Unrecognized names fall back to kMaxStackSize9X, the smallest limit, instead of the largest.
+size_t GetMaxStackSize(const std::string& procName) {
+  const auto has = [&procName](const char* family) {
+    return procName.find(family) != std::string::npos;
+  };
+  if (has("gfx9") || has("gfx8")) return kMaxStackSize9X;
+  if (has("gfx11") || has("gfx10")) return kMaxStackSize11X;
+  if (has("gfx12")) return kMaxStackSize12X;
+  return kMaxStackSize9X;  // conservative default, same as the Device::maxStackSize_ initializer
+}
+
 bool Device::create(const Isa &isa) {
   assert(!vaCacheAccess_ && !vaCacheMap_);
   isa_ = &isa;
@@ -755,6 +767,7 @@ bool Device::create(const Isa &isa) {
   if (!amd::IS_HIP) {
     stack_size_ = 16 * Ki;
   }
+  maxStackSize_ = GetMaxStackSize(isa_->processorName());
   return true;
 }
 
@@ -930,7 +943,7 @@ bool Device::disableP2P(amd::Device* ptrDev) {
 }
 
 bool Device::UpdateStackSize(uint64_t stackSize) {
-  if (stackSize > kMaxStackSize) {
+  if (stackSize > maxStackSize_) {
     return false;
   }
   stack_size_ = amd::alignUp(stackSize, 16);
diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index bc08ea9775..eb4627b9ea 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -126,6 +126,16 @@ enum MemRangeAttribute : uint32_t {
 constexpr int CpuDeviceId = static_cast<int>(-1);
 constexpr int InvalidDeviceId = static_cast<int>(-2);
 
+// Max scratch (stack) size is device dependent; limits are rounded down to the 16-byte stack alignment.
+constexpr size_t kWave32 = 32;            // divisor for kMaxStackSize12X / kMaxStackSize11X
+constexpr size_t kWave64 = 64;            // divisor for kMaxStackSize9X
+constexpr size_t kScratchBits12X = 18;    // field width behind kMaxStackSize12X
+constexpr size_t kScratchBits9X = 15;     // field width behind kMaxStackSize11X / kMaxStackSize9X
+constexpr size_t kCompilerRequired = 64;  // subtracted from every limit (presumably compiler-reserved space; confirm)
+constexpr size_t kMaxStackSize12X = ((((size_t{1} << kScratchBits12X) - 1) * 256 / kWave32) - kCompilerRequired) & ~size_t{15};
+constexpr size_t kMaxStackSize11X = ((((size_t{1} << kScratchBits9X) - 1) * 256 / kWave32) - kCompilerRequired) & ~size_t{15};
+constexpr size_t kMaxStackSize9X = ((((size_t{1} << kScratchBits9X) - 1) * 256 / kWave64) - kCompilerRequired) & ~size_t{15};
+
 enum class ExternalSemaphoreHandleType : uint32_t {
   OpaqueFd = 1,        // Handle is an opaque file descriptor
   OpaqueWin32 = 2,     // Handle is an opaque shared NT handle
@@ -1652,11 +1662,9 @@ class Device : public RuntimeObject {
   static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo);
   static constexpr size_t kSGInfoSize = kMGSyncDataSize;
 
-  // Amount of space used by each wave is in units of 256 dwords.
-  // As per COMPUTE_TMPRING_SIZE.WAVE_SIZE 24:12
-  // The field size supports a range of 0->(2M-256) dwords per wave64.
-  // Per lane this works out to 131056 bytes or 128K - 16
-  static constexpr size_t kMaxStackSize = ((128 * Ki) - 16);
+  // Max scratch size depends on the ISA and is therefore tracked per device.
+  // The default is the GFX9 limit, the smallest among supported devices.
+  size_t maxStackSize_ = kMaxStackSize9X;
 
   typedef std::list<CommandQueue*> CommandQueues;
 
@@ -2131,6 +2139,9 @@ class Device : public RuntimeObject {
     return nullptr;
   }
 
+  //! Returns the maximum stack size allowed for this device (maxStackSize_)
+  size_t MaxStackSize() const { return maxStackSize_; }
+
 #if defined(__clang__)
 #if __has_feature(address_sanitizer)
   virtual device::UriLocator* createUriLocator() const = 0;
diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp
index d12cc72ccc..4a6dc96bbc 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.cpp
@@ -2752,6 +2752,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
       privateMemSize = std::max<uint32_t>(static_cast<uint32_t>(device().StackSize()),
                                 hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t)) ;
+      // Reject the launch if privateMemSize exceeds the device's max allowed scratch size.
+      size_t maxStackSize = device().MaxStackSize();
+      if (privateMemSize > maxStackSize) {
+        ClPrint(amd::LOG_INFO, amd::LOG_KERN,
+          "Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s",
+          privateMemSize, maxStackSize, hsaKernel.name().c_str());
+        LogError("Scratch size exceeds max allowed.");
+        return false;
+      }
     }
 
     // Set up the dispatch information
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 491b72e802..3ad0098bab 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -3585,9 +3585,18 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
 
     if ((devKernel->workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
-      dispatchPacket.private_segment_size = std::min<uint64_t>(
-          std::max<uint64_t>(dev().StackSize(), dispatchPacket.private_segment_size),
-          Device::kMaxStackSize);
+      dispatchPacket.private_segment_size = std::max<uint64_t>(dev().StackSize(),
+                                             dispatchPacket.private_segment_size);
+      // Reject the launch if the private segment size exceeds the device's max allowed scratch size.
+      size_t maxStackSize = dev().MaxStackSize();
+      if (dispatchPacket.private_segment_size > maxStackSize) {
+        ClPrint(amd::LOG_INFO, amd::LOG_KERN,
+          "Scratch size (%u) exceeds max allowed (%zu) for kernel : %s",
+          dispatchPacket.private_segment_size, maxStackSize,
+                gpuKernel.getDemangledName().c_str());
+        LogError("Scratch size exceeds max allowed.");
+        return false;
+      }
     }
 
     // Pass the header accordingly