From db084f48ebb1d255fb73fe7e9728e7653fc39eaf Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Sat, 30 Apr 2016 14:07:20 -0600 Subject: swr: [rasterizer] Miscellaneous backend changes Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/common/simdintrin.h | 20 +++++++++++++++++ .../drivers/swr/rasterizer/core/backend.cpp | 26 +++++----------------- src/gallium/drivers/swr/rasterizer/core/backend.h | 7 ++++-- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 72fe15a3c7..5ec1f71934 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -139,6 +139,12 @@ __m256 _simdemu_permute_ps(__m256 a, __m256i b) return result; } +INLINE +__m256i _simdemu_permute_epi32(__m256i a, __m256i b) +{ + return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b)); +} + INLINE __m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount) { @@ -277,6 +283,7 @@ __m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount) #define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16 #define _simd_movemask_epi8 _simdemu_movemask_epi8 #define _simd_permute_ps _simdemu_permute_ps +#define _simd_permute_epi32 _simdemu_permute_epi32 #define _simd_srlv_epi32 _simdemu_srlv_epi32 #define _simd_sllv_epi32 _simdemu_sllv_epi32 @@ -449,11 +456,18 @@ int _simdemu_movemask_epi8(__m256i a) #define _simd_permute_ps _mm256_permutevar8x32_ps #define _simd_srlv_epi32 _mm256_srlv_epi32 #define _simd_sllv_epi32 _mm256_sllv_epi32 + +INLINE +simdscalari _simd_permute_epi32(simdscalari a, simdscalari index) +{ + return _simd_castps_si(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(a), index)); +} #endif #define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm)) #define _simd_shuffle_ps _mm256_shuffle_ps #define _simd_set1_epi32 _mm256_set1_epi32 +#define _simd_set_epi32 _mm256_set_epi32 #define _simd_set1_epi8 _mm256_set1_epi8 #define _simd_setzero_si _mm256_setzero_si256 #define _simd_cvttps_epi32 _mm256_cvttps_epi32 @@ -473,6 +487,12 @@ simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask) return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask)); } +INLINE +simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask) +{ + return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask))); +} + // convert bitmask to vector mask INLINE simdscalar vMask(int32_t mask) diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index d2547f3988..376fb3f68c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -886,7 +886,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { + simdscalar activeLanes; if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; + activeLanes = vMask(work.anyCoveredSamples & MASK); psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // set pixel center positions @@ -909,32 +911,22 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t RDTSC_STOP(BEBarycentric, 0, 0); } - simdscalar activeLanes; if(T::bForcedSampleCount) { // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si())); - activeLanes = _simd_and_ps(vMask(work.anyCoveredSamples & MASK), vSampleMask); + activeLanes = _simd_and_ps(activeLanes, vSampleMask); } // Early-Z? if(T::bCanEarlyZ && !T::bForcedSampleCount) { - activeLanes = _simd_setzero_ps(); uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); UPDATE_STAT(DepthPassCount, depthPassCount); } - // if we can't do early z, set the active mask to any samples covered in the current simd - else if(!T::bCanEarlyZ && !T::bForcedSampleCount) - { - activeLanes = vMask(work.anyCoveredSamples & MASK); - } // if we have no covered samples that passed depth at this point, go to next tile - if(!_simd_movemask_ps(activeLanes)) - { - goto Endtile; - } + if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; if(pPSState->usesSourceDepth) { @@ -957,10 +949,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t // update active lanes to remove any discarded or oMask'd pixels activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si()))); - if(!_simd_movemask_ps(activeLanes)) - { - goto Endtile; - } + if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; // late-Z if(!T::bCanEarlyZ && !T::bForcedSampleCount) @@ -970,10 +959,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t } // if we have no covered samples that passed depth at this point, skip OM and go to next tile - if(!_simd_movemask_ps(activeLanes)) - { - goto Endtile; - } + if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; // output merger // loop over all samples, broadcasting the results of the PS to all passing pixels diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 24ba69ec87..2c11041680 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -423,14 +423,15 @@ struct PixelRateZTestLoop clipDistanceMask(ClipDistanceMask), pDepthBase(depthBase), pStencilBase(stencilBase) {}; INLINE - uint32_t operator()(simdscalar& anyDepthSamplePassed, SWR_PS_CONTEXT& psContext, + uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0) { uint32_t statCount = 0; + simdscalar anyDepthSamplePassed = _simd_setzero_ps(); for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) { const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample]; - vCoverageMask[sample] = vMask(pCoverageMask[currentSimdIn8x8] & MASK); + vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK)); if(!_simd_movemask_ps(vCoverageMask[sample])) { @@ -494,6 +495,8 @@ struct PixelRateZTestLoop uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); statCount += _mm_popcnt_u32(statMask); } + + activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes); // return number of samples that passed depth and coverage return statCount; } -- cgit v1.2.3