author | Tim Rowley <timothy.o.rowley@intel.com> | 2016-04-04 14:33:26 -0600
---|---|---
committer | Tim Rowley <timothy.o.rowley@intel.com> | 2016-04-22 18:48:11 -0500
commit | 794be41f91fbaf0645e422525b464627e2fbeef2 (patch) |
tree | 3e94fc0081ab3754fcb75fa8cb86319fff9c86ca |
parent | e42f00ee39acf6e88200c36b5ea6a8da6b69dd0f (diff) |
swr: [rasterizer core] Fix global arena allocator bug
- Plus some minor code refactoring
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
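
Two things happen in `arena.h`. The functional fix looks to be the one-liner in the cached-block search of `CachingAllocatorT::AllocateAligned`: when a block was unlinked from the old cached list, the code updated `m_pLastCachedBlocks` instead of `m_pOldLastCachedBlocks`, leaving a stale tail pointer into the wrong free list. The surrounding refactor then removes the two competing size conventions: `blockSize` now always means the full slab size, the `ArenaBlock` header lives in the slab's first `ARENA_BLOCK_ALIGN` bytes, and the arena's write offset starts at `ARENA_BLOCK_ALIGN` instead of 0, so the scattered `+ ARENA_BLOCK_ALIGN` / `- ARENA_BLOCK_ALIGN` corrections disappear. Below is a minimal sketch of that layout; `TinyArena` and its members are hypothetical, and `std::aligned_alloc` (C++17) stands in for the `_aligned_malloc`/`_aligned_free` pair the driver uses.

```cpp
#include <cstddef>
#include <cstdlib>
#include <new>

static const size_t ARENA_BLOCK_ALIGN = 64;

struct ArenaBlock
{
    size_t      blockSize = 0;     // full slab size, header included
    ArenaBlock* pNext     = nullptr;
};
static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase ARENA_BLOCK_ALIGN");

class TinyArena
{
public:
    void* Alloc(size_t size)
    {
        if (m_pCurBlock && (m_offset + size) <= m_pCurBlock->blockSize)
        {
            // m_offset already skips the header, so no +ARENA_BLOCK_ALIGN fixup.
            void* pMem = reinterpret_cast<char*>(m_pCurBlock) + m_offset;
            m_offset += size;
            return pMem;
        }

        // One ARENA_BLOCK_ALIGN unit for the header, then round the slab up
        // so std::aligned_alloc's size/alignment contract holds.
        size_t slabSize = (size + 2 * ARENA_BLOCK_ALIGN - 1) & ~(ARENA_BLOCK_ALIGN - 1);
        void* pRaw = std::aligned_alloc(ARENA_BLOCK_ALIGN, slabSize);
        if (pRaw == nullptr)
        {
            return nullptr;
        }

        ArenaBlock* pNewBlock = new (pRaw) ArenaBlock();
        pNewBlock->blockSize  = slabSize;          // single source of truth
        pNewBlock->pNext      = m_pCurBlock;
        m_pCurBlock = pNewBlock;
        m_offset    = ARENA_BLOCK_ALIGN;           // first usable byte follows the header

        return Alloc(size);                        // guaranteed to fit now
    }

    ~TinyArena()
    {
        while (m_pCurBlock)
        {
            ArenaBlock* pNext = m_pCurBlock->pNext;
            std::free(m_pCurBlock);                // aligned_alloc memory is free()-able
            m_pCurBlock = pNext;
        }
    }

private:
    ArenaBlock* m_pCurBlock = nullptr;
    size_t      m_offset    = ARENA_BLOCK_ALIGN;
};
```

Keeping every recorded size header-inclusive means the bucket lookup, the `m_cachedSize`/`m_totalAllocated` accounting, and `Free` all agree on one number, which is what the removed `blockSize + ARENA_BLOCK_ALIGN` fixups previously reconciled by hand.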
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/arena.h | 88
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/backend.cpp | 5
2 files changed, 51 insertions(+), 42 deletions(-)
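
The caching allocator's defaults also change, from 4 buckets starting at bit 16 to 8 buckets starting at bit 12, so blocks from 8 KiB upward get their own cache slots instead of all sharing the first 128 KiB bucket. `GetBucketId` itself lies outside the hunks below; going only by the diff's context comment ("buckets, for block sizes < (1 << (start+1)), ..."), a plausible sketch of the mapping, not the SWR implementation, looks like this:

```cpp
#include <cstddef>
#include <cstdint>

// Assumed shape of GetBucketId, reconstructed from the diff's context
// comment; the real SWR helper may differ in details.
template <uint32_t NumBucketsT, uint32_t StartBucketBitT>
uint32_t GetBucketIdSketch(size_t blockSize)
{
    uint32_t bucket = 0;
    // Bucket i caches blocks smaller than (1 << (StartBucketBitT + i + 1));
    // anything larger falls through to the last bucket.
    while ((bucket + 1) < NumBucketsT &&
           blockSize >= (size_t(1) << (StartBucketBitT + bucket + 1)))
    {
        ++bucket;
    }
    return bucket;
}

// With the new defaults <8, 12>: sizes < 8 KiB map to bucket 0,
// < 16 KiB to bucket 1, and so on up to bucket 7 for everything larger.
```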
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 64184e16865..d0c0123b19b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -37,47 +37,54 @@
 #include <atomic>
 #include "core/utils.h"
 
+static const size_t ARENA_BLOCK_ALIGN = 64;
+
+struct ArenaBlock
+{
+    size_t      blockSize = 0;
+    ArenaBlock* pNext     = nullptr;
+};
+static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
+    "Increase BLOCK_ALIGN size");
+
 class DefaultAllocator
 {
 public:
-    void* AllocateAligned(size_t size, size_t align)
+    ArenaBlock* AllocateAligned(size_t size, size_t align)
     {
-        void* p = _aligned_malloc(size, align);
+        SWR_ASSERT(size >= sizeof(ArenaBlock));
+
+        ArenaBlock* p = new (_aligned_malloc(size, align)) ArenaBlock();
+        p->blockSize = size;
         return p;
     }
-    void Free(void* pMem)
+
+    void Free(ArenaBlock* pMem)
     {
-        _aligned_free(pMem);
+        if (pMem)
+        {
+            SWR_ASSERT(pMem->blockSize < size_t(0xdddddddd));
+            _aligned_free(pMem);
+        }
     }
 };
 
-static const size_t ARENA_BLOCK_ALIGN = 64;
-
-struct ArenaBlock
-{
-    size_t      blockSize = 0;
-    ArenaBlock* pNext     = nullptr;
-};
-static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
-    "Increase BLOCK_ALIGN size");
-
 // Caching Allocator for Arena
-template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16>
+template<uint32_t NumBucketsT = 8, uint32_t StartBucketBitT = 12>
 struct CachingAllocatorT : DefaultAllocator
 {
-    void* AllocateAligned(size_t size, size_t align)
+    ArenaBlock* AllocateAligned(size_t size, size_t align)
     {
         SWR_ASSERT(size >= sizeof(ArenaBlock));
         SWR_ASSERT(size <= uint32_t(-1));
 
-        size_t blockSize = size - ARENA_BLOCK_ALIGN;
-        uint32_t bucket = GetBucketId(blockSize);
+        uint32_t bucket = GetBucketId(size);
 
         {
             // search cached blocks
             std::lock_guard<std::mutex> l(m_mutex);
             ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket];
-            ArenaBlock* pBlock = SearchBlocks(pPrevBlock, blockSize, align);
+            ArenaBlock* pBlock = SearchBlocks(pPrevBlock, size, align);
 
             if (pBlock)
             {
@@ -89,15 +96,15 @@ struct CachingAllocatorT : DefaultAllocator
             }
             else
             {
-                pPrevBlock = &m_oldCachedBlocks[GetBucketId(blockSize)];
-                pBlock = SearchBlocks(pPrevBlock, blockSize, align);
+                pPrevBlock = &m_oldCachedBlocks[bucket];
+                pBlock = SearchBlocks(pPrevBlock, size, align);
 
                 if (pBlock)
                 {
                     m_oldCachedSize -= pBlock->blockSize;
                     if (pBlock == m_pOldLastCachedBlocks[bucket])
                     {
-                        m_pLastCachedBlocks[bucket] = pPrevBlock;
+                        m_pOldLastCachedBlocks[bucket] = pPrevBlock;
                     }
                 }
             }
@@ -126,15 +133,14 @@ struct CachingAllocatorT : DefaultAllocator
         return this->DefaultAllocator::AllocateAligned(size, align);
     }
 
-    void Free(void* pMem)
+    void Free(ArenaBlock* pMem)
     {
         if (pMem)
         {
-            ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
-            SWR_ASSERT(pNewBlock->blockSize >= 0);
+            SWR_ASSERT(pMem->blockSize >= 0);
 
             std::unique_lock<std::mutex> l(m_mutex);
-            InsertCachedBlock(GetBucketId(pNewBlock->blockSize), pNewBlock);
+            InsertCachedBlock(GetBucketId(pMem->blockSize), pMem);
         }
     }
 
@@ -154,7 +160,7 @@ struct CachingAllocatorT : DefaultAllocator
             {
                 ArenaBlock* pNext = pBlock->pNext;
                 m_oldCachedSize -= pBlock->blockSize;
-                m_totalAllocated -= (pBlock->blockSize + ARENA_BLOCK_ALIGN);
+                m_totalAllocated -= pBlock->blockSize;
                 this->DefaultAllocator::Free(pBlock);
                 pBlock = pNext;
             }
@@ -175,6 +181,11 @@ struct CachingAllocatorT : DefaultAllocator
             }
         }
 
+        if (doFree)
+        {
+            SWR_REL_ASSERT(m_oldCachedSize == 0, "Unknown whereabouts of 0x%llx bytes", (uint64_t)m_oldCachedSize);
+        }
+
         m_oldCachedSize += m_cachedSize;
         m_cachedSize = 0;
     }
@@ -304,7 +315,7 @@ private:
     // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
     static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT;
     static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT;
-    static const size_t MAX_UNUSED_SIZE = 20 * sizeof(MEGABYTE);
+    static const size_t MAX_UNUSED_SIZE = sizeof(MEGABYTE);
 
     ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS];
     ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS];
@@ -346,7 +357,7 @@ public:
 
             if ((offset + size) <= pCurBlock->blockSize)
             {
-                void* pMem = PtrAdd(pCurBlock, offset + ARENA_BLOCK_ALIGN);
+                void* pMem = PtrAdd(pCurBlock, offset);
                 m_offset = offset + size;
                 return pMem;
             }
@@ -355,24 +366,21 @@ public:
             // a new block
         }
 
-        static const size_t ArenaBlockSize = BlockSizeT - ARENA_BLOCK_ALIGN;
-        size_t blockSize = std::max(size, ArenaBlockSize);
+        static const size_t ArenaBlockSize = BlockSizeT;
+        size_t blockSize = std::max(size + ARENA_BLOCK_ALIGN, ArenaBlockSize);
 
         // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
         blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN);
 
-        void *pMem = m_allocator.AllocateAligned(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
-        SWR_ASSERT(pMem != nullptr);
-
-        ArenaBlock* pNewBlock = new (pMem) ArenaBlock();
+        ArenaBlock* pNewBlock = m_allocator.AllocateAligned(blockSize, ARENA_BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
+        SWR_ASSERT(pNewBlock != nullptr);
 
         if (pNewBlock != nullptr)
         {
-            m_offset = 0;
+            m_offset = ARENA_BLOCK_ALIGN;
             pNewBlock->pNext = m_pCurBlock;
             m_pCurBlock = pNewBlock;
-            m_pCurBlock->blockSize = blockSize;
         }
 
         return AllocAligned(size, align);
@@ -407,7 +415,7 @@ public:
 
     void Reset(bool removeAll = false)
     {
-        m_offset = 0;
+        m_offset = ARENA_BLOCK_ALIGN;
 
         if (m_pCurBlock)
         {
@@ -431,13 +439,13 @@ public:
 
     bool IsEmpty()
     {
-        return (m_pCurBlock == nullptr) || (m_offset == 0 && m_pCurBlock->pNext == nullptr);
+        return (m_pCurBlock == nullptr) || (m_offset == ARENA_BLOCK_ALIGN && m_pCurBlock->pNext == nullptr);
     }
 
 private:
     ArenaBlock* m_pCurBlock = nullptr;
-    size_t m_offset = 0;
+    size_t m_offset = ARENA_BLOCK_ALIGN;
 
     /// @note Mutex is only used by sync allocation functions.
     std::mutex m_mutex;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index d8ffdaa3ab4..a2212ba8aa4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -80,9 +80,10 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     SWR_ASSERT(pTaskData != nullptr);
 
     // Ensure spill fill memory has been allocated.
-    if (pSpillFillBuffer == nullptr)
+    size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
+    if (spillFillSize && pSpillFillBuffer == nullptr)
     {
-        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(pDC->pState->state.totalSpillFillSize, sizeof(float) * 8);
+        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
     }
 
     const API_STATE& state = GetApiState(pDC);
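
In `backend.cpp`, the spill/fill scratch buffer is now allocated only when the compute shader actually declares spill/fill storage, and its alignment comes from `KNOB_SIMD_BYTES` rather than the hard-coded `sizeof(float) * 8`; the two agree on an 8-wide SIMD build, but only the knob tracks the configured SIMD width. A hedged sketch of the guard, with `DrawCtx`, `EnsureSpillFill`, and the knob value as illustrative stand-ins rather than the SWR definitions:

```cpp
#include <cstddef>
#include <new>

static const size_t KNOB_SIMD_BYTES = 32;   // assumption: 8 floats per SIMD register

struct Arena
{
    // Toy stand-in for Arena::AllocAlignedSync (thread-safe arena alloc).
    void* AllocAlignedSync(size_t size, size_t align)
    {
        return ::operator new(size, std::align_val_t(align));   // C++17 aligned new
    }
};

struct DrawCtx
{
    size_t totalSpillFillSize = 0;
    Arena* pArena = nullptr;
};

void EnsureSpillFill(DrawCtx* pDC, void*& pSpillFillBuffer)
{
    size_t spillFillSize = pDC->totalSpillFillSize;
    // The old code allocated unconditionally; the new guard also checks
    // that there is anything to allocate at all.
    if (spillFillSize && pSpillFillBuffer == nullptr)
    {
        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
    }
}
```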