llvmpipe: Get blending of normalized 8bit unsigned integers working.

author: José Fonseca <jfonseca@vmware.com> 2009-08-03 00:01:27 +0100
committer: José Fonseca <jfonseca@vmware.com> 2009-08-29 09:21:22 +0100
commit: ede73258a7604109b257bddb029b5f4dad5eb09d (patch)
tree: e0c4569483e42411014239b03fcf3d674b4827e4 /src/gallium/drivers/llvmpipe/lp_bld_arit.c
parent: a6622e6c544d3530a463d6a274a15bfae58f7ccc (diff)
1 files changed, 163 insertions, 6 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
index f45b7d82f1..ba272df296 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -184,7 +184,7 @@ lp_build_one(union lp_type type)
    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
    unsigned i;
 
-   assert(type.length < LP_MAX_VECTOR_LENGTH);
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
 
    elem_type = lp_build_elem_type(type);
 
@@ -224,7 +224,7 @@ lp_build_const_aos(union lp_type type,
    unsigned i;
 
    assert(type.length % 4 == 0);
-   assert(type.length < LP_MAX_VECTOR_LENGTH);
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
 
    elem_type = lp_build_elem_type(type);
 
@@ -421,9 +421,9 @@ lp_build_add(struct lp_build_context *bld,
       if(type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
-            intrinsic = type.sign ? "llvm.x86.sse2.adds.b" : "llvm.x86.sse2.addus.b";
+            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
          if(type.width == 16)
-            intrinsic = type.sign ? "llvm.x86.sse2.adds.w" : "llvm.x86.sse2.addus.w";
+            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
       }
 #endif
    
@@ -468,9 +468,9 @@ lp_build_sub(struct lp_build_context *bld,
       if(type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
-            intrinsic = type.sign ? "llvm.x86.sse2.subs.b" : "llvm.x86.sse2.subus.b";
+            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
          if(type.width == 16)
-            intrinsic = type.sign ? "llvm.x86.sse2.subs.w" : "llvm.x86.sse2.subus.w";
+            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
       }
 #endif
    
@@ -490,11 +490,124 @@ lp_build_sub(struct lp_build_context *bld,
 }
 
 
+/**
+ * Build the constant shuffle mask matching PUNPCKLxx (lo_hi = 0) or PUNPCKHxx (lo_hi = 1) for n-element vectors.
+ */
+static LLVMValueRef 
+lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i, j;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+   assert(lo_hi < 2);
+
+   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) { /* j starts at 0 (lo_hi = 0) or n/2 (lo_hi = 1); two mask entries per step */
+      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0); /* even slot: mask index j */
+      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0); /* odd slot: mask index n + j, presumably element j of the second shuffle operand -- confirm */
+   }
+
+   return LLVMConstVector(elems, n);
+}
+
+
+static LLVMValueRef 
+lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c) /* build an n-element constant vector whose elements are all the integer c of the given element type */
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH); /* elems[] has room for at most LP_MAX_VECTOR_LENGTH entries */
+
+   for(i = 0; i < n; ++i)
+      elems[i] = LLVMConstInt(type, c, 0); /* NOTE(review): last argument 0 is presumably SignExtend = false -- confirm against the LLVM C API */
+
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Normalized 8bit multiplication.
+ *
+ * - alpha plus one
+ *
+ *     makes the following approximation to the division (Sree)
+ *    
+ *       a*b/255 ~= (a*(b + 1)) >> 8
+ *    
+ *     which is the fastest method that satisfies the following OpenGL criteria
+ *    
+ *       0*0 = 0 and 255*255 = 255
+ *
+ * - geometric series
+ *
+ *     takes the geometric series approximation to the division
+ *
+ *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
+ *
+ *     in this case just the first two terms to fit in 16bit arithmetic
+ *
+ *       t/255 ~= (t + (t >> 8)) >> 8
+ *
+ *     note that just by itself it doesn't satisfy the OpenGL criteria, as
+ *     255*255 = 254, so the special case b = 255 must be accounted for or
+ *     roundoff must be used
+ *
+ * - geometric series plus rounding
+ *
+ *     when using a geometric series division, instead of truncating the
+ *     result, use roundoff in the approximation (Jim Blinn)
+ *
+ *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
+ *
+ *     achieving the exact results
+ *
+ * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 
+ *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
+ * @sa Michael Herf, The "double blend trick", May 2000, 
+ *     http://www.stereopsis.com/doubleblend.html
+ */
+static LLVMValueRef
+lp_build_mul_u8n(LLVMBuilderRef builder,
+                 LLVMValueRef a, LLVMValueRef b)
+{
+   static LLVMValueRef c01 = NULL; /* 8 x i16 constant 0x01; only used by the disabled "alpha plus one" variant */
+   static LLVMValueRef c08 = NULL; /* 8 x i16 constant 0x08; used as the per-element shift count */
+   static LLVMValueRef c80 = NULL; /* 8 x i16 constant 0x80; the rounding term */
+   LLVMValueRef ab;
+
+   if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01); /* constants are built on first call and cached in the statics above */
+   if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
+   if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
+   
+#if 0
+   
+   /* a*b/255 ~= (a*(b + 1)) >> 8 */
+   b = LLVMBuildAdd(builder, b, c01, "");
+   ab = LLVMBuildMul(builder, a, b, "");
+
+#else
+   
+   /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
+   ab = LLVMBuildMul(builder, a, b, ""); /* t = a*b; NOTE(review): assumes each 16-bit lane holds a value in 0..255 -- confirm against callers */
+   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), ""); /* t + (t >> 8) */
+   ab = LLVMBuildAdd(builder, ab, c80, ""); /* + 0x80 for rounding */
+
+#endif
+   
+   ab = LLVMBuildLShr(builder, ab, c08, ""); /* final >> 8, shared by both variants */
+
+   return ab;
+}
+
+
 LLVMValueRef
 lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
+   const union lp_type type = bld->type;
+
    if(a == bld->zero)
       return bld->zero;
    if(a == bld->one)
@@ -506,6 +619,50 @@ lp_build_mul(struct lp_build_context *bld,
    if(a == bld->undef || b == bld->undef)
       return bld->undef;
 
+   if(!type.floating && !type.fixed && type.norm) {
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+      if(type.width == 8 && type.length == 16) {
+         LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
+         LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
+         static LLVMValueRef ml = NULL;
+         static LLVMValueRef mh = NULL;
+         LLVMValueRef al, ah, bl, bh;
+         LLVMValueRef abl, abh;
+         LLVMValueRef ab;
+         
+         if(!ml) ml = lp_build_unpack_shuffle(16, 0);
+         if(!mh) mh = lp_build_unpack_shuffle(16, 1);
+
+         /*  PUNPCKLBW, PUNPCKHBW */
+         al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
+         bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
+         ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
+         bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");
+
+         /* NOP */
+         al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
+         bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
+         ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
+         bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");
+
+         /* PMULLW, PSRLW, PADDW */
+         abl = lp_build_mul_u8n(bld->builder, al, bl);
+         abh = lp_build_mul_u8n(bld->builder, ah, bh);
+
+         /* PACKUSWB */
+         ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128" , abl, abh);
+
+         /* NOP */
+         ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");
+         
+         return ab;
+      }
+#endif
+
+      /* FIXME */
+      assert(0);
+   }
+
    if(LLVMIsConstant(a) && LLVMIsConstant(b))
       return LLVMConstMul(a, b);
author	José Fonseca <jfonseca@vmware.com>	2009-08-03 00:01:27 +0100
committer	José Fonseca <jfonseca@vmware.com>	2009-08-29 09:21:22 +0100
commit	ede73258a7604109b257bddb029b5f4dad5eb09d (patch)
tree	e0c4569483e42411014239b03fcf3d674b4827e4 /src/gallium/drivers/llvmpipe/lp_bld_arit.c
parent	a6622e6c544d3530a463d6a274a15bfae58f7ccc (diff)