/* *********************************************************************************************************************** * * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * **********************************************************************************************************************/ /** *********************************************************************************************************************** * @file palBuddyAllocatorImpl.h * @brief PAL utility BuddyAllocator class implementation. *********************************************************************************************************************** */ #pragma once #include "palBuddyAllocator.h" #include "palHashMapImpl.h" #include "palHashSetImpl.h" #include "palInlineFuncs.h" #include "palSysMemory.h" namespace Util { // ===================================================================================================================== template BuddyAllocator::BuddyAllocator( Allocator* pAllocator, gpusize baseAllocSize, gpusize minAllocSize) : m_pAllocator(pAllocator), m_baseAllocKval(SizeToKval(baseAllocSize)), m_minKval(SizeToKval(minAllocSize)), m_pFreeBlockSets(nullptr), m_pUsedBlockMap(nullptr), m_pNumFreeList(nullptr), m_numSuballocations(0), m_pFreeSetMutexes(nullptr), m_usedClaim(false) { // Allocator must be non-null PAL_ASSERT(m_pAllocator != nullptr); // Base allocation size must be POT PAL_ASSERT(KvalToSize(m_baseAllocKval) == baseAllocSize); // Minimum allocation size must be POT PAL_ASSERT(KvalToSize(m_minKval) == minAllocSize); } // ===================================================================================================================== template BuddyAllocator::~BuddyAllocator() { // lock this here to ensure no other thread was doing anything with the buddyAllocator when the destructor is called RWLockAuto freeLock(&m_freeLock); if (m_pFreeBlockSets != nullptr) { const uint32 numKvals = m_baseAllocKval - m_minKval; for (uint32 i = 0; i < numKvals; ++i) { // Call the destructor m_pFreeBlockSets[i].~HashSet(); } // Free the block list array PAL_SAFE_FREE(m_pFreeBlockSets, m_pAllocator); } if (m_pUsedBlockMap != nullptr) { PAL_SAFE_DELETE(m_pUsedBlockMap, m_pAllocator); } if (m_pNumFreeList != nullptr) { PAL_SAFE_DELETE_ARRAY(m_pNumFreeList, m_pAllocator); } if (m_pFreeSetMutexes != nullptr) { PAL_SAFE_DELETE_ARRAY(m_pFreeSetMutexes, m_pAllocator); } } // ===================================================================================================================== // Gets maximum allocation size supported by this buddy allocator. template gpusize BuddyAllocator::MaximumAllocationSize() const { // NOTE: Report one less than our base allocation k-value because there's no sense in suballocating a memory // request which is larger than half a chunk return KvalToSize(m_baseAllocKval - 1); } // ===================================================================================================================== // Initializes the buddy allocator. template Result BuddyAllocator::Init() { PAL_ASSERT(m_pFreeBlockSets == nullptr); PAL_ASSERT(m_pUsedBlockMap == nullptr); PAL_ASSERT(m_pNumFreeList == nullptr); PAL_ASSERT(m_pFreeSetMutexes == nullptr); // start out with success and take it away if something fails. Result result = Result::Success; const uint32 numKvals = m_baseAllocKval - m_minKval; // one hashSet per kval m_pFreeBlockSets = static_cast(PAL_MALLOC(sizeof(FreeSet) * numKvals, m_pAllocator, AllocInternal)); // Initialize the hashSets. if (m_pFreeBlockSets != nullptr) { for (uint32 i = 0; i < numKvals; ++i) { // max number of entries at a level is: 2^distFromTop const uint32 maxEntriesKval = 1 << (m_baseAllocKval - (i + m_minKval)); // 32 is a suitable max, however its the higher kvals won't even need 32 buckets. const uint32 bucketsNeeded = Min(maxEntriesKval / (PAL_CACHE_LINE_BYTES) + 1, 32u); PAL_PLACEMENT_NEW(&m_pFreeBlockSets[i]) FreeSet(bucketsNeeded, m_pAllocator); result = m_pFreeBlockSets[i].Init(); // if we failed the Init of the hashSet, delete the ones we did create, and free the array. This avoids // having to keep track of the hashSets we did initialize in the destructor by just destroying it here. if (result != Result::Success) { for (uint32 j = 0; j <= i; j++) { m_pFreeBlockSets[j].~HashSet(); } PAL_SAFE_FREE(m_pFreeBlockSets, m_pAllocator); break; } } } else { result = Result::ErrorOutOfMemory; } if (result == Result::Success) { m_pNumFreeList = static_cast(PAL_NEW_ARRAY(uint32, numKvals, m_pAllocator, AllocInternal)); if (m_pNumFreeList == nullptr) { result = Result::ErrorOutOfMemory; } } if (result == Result::Success) { m_pFreeSetMutexes = static_cast(PAL_NEW_ARRAY(Mutex, numKvals, m_pAllocator, AllocInternal)); if (m_pFreeSetMutexes == nullptr) { result = Result::ErrorOutOfMemory; } } const uint32 maxUsedEntries = 1 << (m_baseAllocKval - m_minKval); const uint32 usedBucketsNeeded = maxUsedEntries / (PAL_CACHE_LINE_BYTES * 8) + 1; if (result == Result::Success) { // one hashMap for getting the kval a used block is at m_pUsedBlockMap = static_cast(PAL_NEW(UsedMap, m_pAllocator, AllocInternal) (usedBucketsNeeded, m_pAllocator)); if (m_pUsedBlockMap != nullptr) { result = m_pUsedBlockMap->Init(); } else { result = Result::ErrorOutOfMemory; } } // if we successfully allocated all the memory we need, create the first two free blocks. if (result == Result::Success) { memset(m_pNumFreeList, 0, sizeof(uint32) * numKvals); // We need to create the first two largest-size blocks and add them to the last block list const uint32 blockKval = (m_baseAllocKval - 1); const gpusize blockSize = KvalToSize(blockKval); FreeSet* pTopFreeSet = &m_pFreeBlockSets[blockKval - m_minKval]; // mark both of these as free blocks result = pTopFreeSet->Insert(0); if (result == Result::Success) { // even though this will never be reached, to pass the asserts, this needs to be // as this kval result = m_pUsedBlockMap->Insert(0, blockKval + 1); } if (result == Result::Success) { result = pTopFreeSet->Insert(blockSize); } m_pNumFreeList[blockKval - m_minKval] = 2; m_highestFreeKval = blockKval; } PAL_ALERT(result != Result::Success); return result; } // ===================================================================================================================== // Suballocates a block from the base allocation that this buddy allocator manages. If no free space is found then an // appropriate error is returned. // In order for m_pNumFreeList bookkeeping to be correct, ClaimGpuMemory MUST be called directly before this call to // Allocate. The buddyAllocator will still work without this, but the results of ClaimGpuMemory will not be correct. // unless it is called before every call to Allocate. template Result BuddyAllocator::Allocate( gpusize size, gpusize alignment, gpusize* pOffset) { PAL_ASSERT(m_pFreeBlockSets != nullptr); PAL_ASSERT(m_pUsedBlockMap != nullptr); PAL_ASSERT(m_pNumFreeList != nullptr); PAL_ASSERT(m_pFreeSetMutexes != nullptr); PAL_ASSERT(pOffset != nullptr); PAL_ASSERT(size <= MaximumAllocationSize()); // Pad the requested allocation size to the nearest POT of the size and alignment const uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval); RWLockAuto freeLock(&m_freeLock); Result result = GetNextFreeBlock(kval, pOffset); // mark this kval as used here. if (result == Result::Success) { result = SetKvalUsed(*pOffset, kval); } if (result == Result::Success) { // Increment the number of suballocations this buddy allocator manages AtomicIncrement(&m_numSuballocations); } return result; } // ===================================================================================================================== // Gets the next free block by recursively dividing larger blocks until a suitible sized block is created. template Result BuddyAllocator::GetNextFreeBlock( uint32 kval, gpusize* pOffset) { Result result = Result::ErrorOutOfGpuMemory; if (kval < m_baseAllocKval) { // this lock can not get any more fine grained MutexAuto freeSetLock(&(m_pFreeSetMutexes[kval - m_minKval])); result = PopFromFreeSet(pOffset, kval); if (result == Result::ErrorOutOfGpuMemory) { // we didn't find a block at this kval, search the next level up result = GetNextFreeBlock(kval + 1, pOffset); if (result == Result::Success) { // insert our buddy to the free set gpusize buddyOffset = *pOffset + KvalToSize(kval); result = InsertToFreeSet(buddyOffset, kval); PAL_ASSERT(result == Result::Success); } } else { // only two valid options are ErrorOutOfGpuMemory and Success, other result means the hashing failed. PAL_ASSERT(result == Result::Success); } } PAL_ALERT_MSG(result != Result::Success, "This should only fail if ClaimGpuMemory() is not called before this call to Allocate()."); return result; } // ===================================================================================================================== // Frees the memory at the given offset, if it's buddy is also free, merges the two and recursively calls this again. // This doesn't need any internal locks because Free accquires an exclusive lock on the entire allocator (freeLock), and // the lock on the m_pNumFreeList. These locks could potentially be more fine grained, however freeing and allocating // don't typically happen at the same time, and Freeing is already much faster than allocating. template Result BuddyAllocator::FreeBlock( gpusize offset) { Result result = Result::ErrorUnknown; uint32 usedKval; bool offsetUsed = GetKvalUsed(offset, &usedKval); PAL_ASSERT(offsetUsed); PAL_ASSERT(usedKval >= m_minKval && usedKval < m_baseAllocKval); gpusize buddyOffset = offset ^ KvalToSize(usedKval); gpusize offsetUp = Min(offset, buddyOffset); // we don't want merge if we are on the top level. We also don't want to merge if a call to claim was made that // claimed the buddy we are about to free. if (IsOffsetFree(buddyOffset, usedKval) && (usedKval < m_baseAllocKval -1) && ((m_pNumFreeList[usedKval - m_minKval] > 0) || (m_usedClaim == false))) { // We can combine the two blocks and mark the one in the level above as free // And do this recursively result = RemoveOffsetFromFreeSet(buddyOffset, usedKval); if (result == Result::Success) { // even though the block is going to be freed, need to set the kval as used // so that on the recursive call it will be found and freed again. PAL_ASSERT_MSG((m_pNumFreeList[usedKval - m_minKval] != 0) || (m_usedClaim == false), "This should only fail if ClaimGpuMemory() is not called before this call to Allocate()."); m_pNumFreeList[usedKval - m_minKval] -= 1; result = SetKvalUsed(offsetUp, usedKval + 1); } // if this offset isn't the one that will be set as free in the next level up, we just need to remove it. if ((result == Result::Success) && (offset != offsetUp)) { result = RemoveOffsetFromUsedMap(offset); } if (result == Result::Success) { result = FreeBlock(offsetUp); } } else { // We mark this block as free in this level result = InsertToFreeSet(offset, usedKval); if (result == Result::Success) { m_pNumFreeList[usedKval - m_minKval] += 1; m_highestFreeKval = Util::Max(usedKval, m_highestFreeKval); if (offsetUp == offset) { // if on the same offset as level up, move where the used block is result = SetKvalUsed(offsetUp, usedKval + 1); } else { // if at the top of this offset, remove is from used map result = RemoveOffsetFromUsedMap(offset); } } } return result; } // ===================================================================================================================== // Frees a suballocated block making it available for future re-use. template void BuddyAllocator::Free( gpusize offset, gpusize size, gpusize alignment) { RWLockAuto freeLock(&m_freeLock); MutexAuto numFreeMutex(&m_numFreeMutex); PAL_ASSERT(m_pFreeBlockSets != nullptr); PAL_ASSERT(m_pUsedBlockMap != nullptr); PAL_ASSERT(m_pNumFreeList != nullptr); PAL_ASSERT(m_pFreeSetMutexes != nullptr); Result result = FreeBlock(offset); // Freeing should always succeed unless something went wrong with the allocation scheme PAL_ASSERT(result == Result::Success); // Decrement the number of suballocations this buddy allocator manages AtomicDecrement(&m_numSuballocations); } // ===================================================================================================================== // Claims the memory that will be used when Allocate is called. // Returns ErrorOutOfGpuMemory if this buddyAllocator has no free blocks, otherwise returns Success. template Result BuddyAllocator::ClaimGpuMemory( gpusize size, gpusize alignment) { // Set this to true as soon as the first call to claim is done to signal to Free that claim is being used. m_usedClaim = true; PAL_ASSERT(m_pNumFreeList != nullptr); // Pad the requested allocation size to the nearest POT of the size and alignment uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval); PAL_ASSERT(kval >= m_minKval && kval < m_baseAllocKval); Result result = Result::ErrorOutOfGpuMemory; // Do this check twice to avoid taking the lock at all if we have no chance of Claiming the memory. This will stop // this thread from locking on this, as well as other threads from waiting longer for no reason. if (kval <= m_highestFreeKval) { MutexAuto numFreeLock(&m_numFreeMutex); if (kval <= m_highestFreeKval) { PAL_ASSERT(m_pNumFreeList[m_highestFreeKval - m_minKval] != 0); result = Result::Success; // First we add one to each level for every buddy we'll insert while (m_pNumFreeList[kval - m_minKval] == 0) { m_pNumFreeList[kval - m_minKval] += 1; kval++; } PAL_ASSERT(kval <= m_highestFreeKval); PAL_ASSERT_MSG(m_pNumFreeList[kval - m_minKval] > 0, "This should only fail if ClaimGpuMemory() is not called before every call to Allocate()."); // Then we subtract one for the block we will use or split to the lower level m_pNumFreeList[kval - m_minKval] -= 1; PAL_ASSERT(m_highestFreeKval >= m_minKval); while (m_pNumFreeList[m_highestFreeKval - m_minKval] == 0) { m_highestFreeKval--; // in this case, there will be no more space left on the entire buddyAllocator if (m_highestFreeKval < m_minKval) { break; } } } } return result; } // ===================================================================================================================== // Used to search through pools before claiming memory to find the one that will fragment the least. pKval will have // be the highest level needed to be split up for this pool, so the pool with the lowest value will be best. Can NOT // guarantee the memory will still be availible by the time this thread calls ClaimGpuMemory. template Result BuddyAllocator::CheckIfOpenMemory( gpusize size, gpusize alignment, uint32* pKval) { PAL_ASSERT(m_pNumFreeList != nullptr); // Pad the requested allocation size to the nearest POT of the size and alignment const uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval); PAL_ASSERT(kval >= m_minKval && kval < m_baseAllocKval); Result result = Result::ErrorOutOfGpuMemory; if ((kval <= m_highestFreeKval)) { result = Result::Success; if (pKval != nullptr) { uint32 topKval = kval; for (; topKval < m_baseAllocKval; topKval++) { if (m_pNumFreeList[topKval - m_minKval] != 0) { *pKval = topKval; break; } } } } return result; } // Hashset helper functions. // ===================================================================================================================== template Result BuddyAllocator::InsertToFreeSet( gpusize offset, uint32 kval) { FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; PAL_ASSERT(pFreeSet->Contains(offset) == false); Result result = pFreeSet->Insert(offset); return result; } // ===================================================================================================================== template bool BuddyAllocator::GetKvalUsed( gpusize offset, uint32* pKval) { bool isUsed; MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); uint32* usedKval = m_pUsedBlockMap->FindKey(offset); if (usedKval == nullptr) { isUsed = false; } else { isUsed = true; if (pKval != nullptr) { *pKval = *usedKval; } } return isUsed; } // ===================================================================================================================== template Result BuddyAllocator::SetKvalUsed( gpusize offset, uint32 kval) { uint32* pKval; bool existed; MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); Result result = m_pUsedBlockMap->FindAllocate(offset, &existed, &pKval); if (result == Result::Success) { *pKval = kval; } PAL_ASSERT(result == Result::Success); return result; } // ===================================================================================================================== // If there are free blocks at this level, removes one, if not, returns Result::ErrorOutOfGpuMemory template Result BuddyAllocator::PopFromFreeSet( gpusize* pOffset, uint32 kval) { Result result = Result::ErrorUnknown; FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; PAL_ASSERT(pFreeSet != nullptr); auto freeSetIt = pFreeSet->Begin(); if (freeSetIt.Get() != nullptr) { *pOffset = freeSetIt.Get()->key; bool eraseRes = pFreeSet->Erase(*pOffset); if (eraseRes) { result = Result::Success; } else { // we got the offset from the iterator, no reason for it to fail. PAL_ASSERT_ALWAYS(); } } else { result = Result::ErrorOutOfGpuMemory; } return result; } // ===================================================================================================================== template bool BuddyAllocator::IsOffsetFree( gpusize offset, uint32 kval) { bool isIn = m_pFreeBlockSets[kval - m_minKval].Contains(offset); return isIn; } // ===================================================================================================================== template Result BuddyAllocator::RemoveOffsetFromFreeSet( gpusize offset, uint32 kval) { FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; bool eraseRes = pFreeSet->Erase(offset); return (eraseRes) ? Result::Success : Result::ErrorInvalidValue; } // ===================================================================================================================== template Result BuddyAllocator::RemoveOffsetFromUsedMap( gpusize offset) { Result result = Result::Success; MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); bool removeRes = m_pUsedBlockMap->Erase(offset); if (removeRes == false) { result = Result::ErrorInvalidValue; } return result; } } // Pal