gallivm: change cubemaps / derivatives handling, take 55

Turns out the previous "fix" for handling per-pixel face selection and derivatives didn't work out that well - the derivatives were wrong by quite a bit, in theory transformation of the derivatives into cube space should work, but would be _a lot_ more work than the "simplified" transform used. So, for explicit derivatives, I'm just giving up and go back to not honoring them. For implicit derivatives (and the fake explicit ones) however we try something a little different, we just calculate rho as we would for a 3d texture, that is after scaling the coords by the inverse major axis. This gives the same results as calculating the derivs after projection of the coords to the same face as long as all pixels hit the same face (and only without rho_no_opt, otherwise it should be a bit worse). And when not all pixels are hitting the same face, the results aren't so hot but not catastrophically bad (I believe not off by more than a factor of 2 without no_rho_approx and not more than sqrt(2) with no_rho_approx). I think this is better than just picking the wrong face but who knows... Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
author: Roland Scheidegger <sroland@vmware.com> 2013-04-18 17:06:43 +0200
committer: Roland Scheidegger <sroland@vmware.com> 2013-04-18 17:06:43 +0200
commit: 50cbcf0c4680bc13e63fe339ef79ed68b2f33c70 (patch)
tree: 3aa291fea9ee10d3b3f0385ca3008fb3d7f8332a
parent: 0d07f05ee87b5446bc3b85fc7be861c4801bb79e (diff)
3 files changed, 119 insertions, 104 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 666e569210..33338f6435 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -245,16 +245,17 @@ lp_build_rho(struct lp_build_sample_context *bld,
       LLVMValueRef cubesize;
       LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
       /*
-       * If we have derivs too then we have per-pixel cube_rho - doesn't matter
-       * though until we do per-pixel lod.
        * Cube map code did already everything except size mul and per-quad extraction.
        */
+      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                      perquadf_bld->type, cube_rho, 0);
+      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
+         rho = lp_build_sqrt(perquadf_bld, rho);
+      }
       /* Could optimize this for single quad just skip the broadcast */
       cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
-                                            coord_bld->type, float_size, index0);
-      rho_vec = lp_build_mul(coord_bld, cubesize, cube_rho);
-      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                      perquadf_bld->type, rho_vec, 0);
+                                            perquadf_bld->type, float_size, index0);
+      rho = lp_build_mul(perquadf_bld, cubesize, rho);
    }
    else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
       LLVMValueRef ddmax[3], ddx[3], ddy[3];
@@ -1356,25 +1357,31 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef *face,
                      LLVMValueRef *face_s,
                      LLVMValueRef *face_t,
-                     LLVMValueRef *rho)
+                     LLVMValueRef *rho,
+                     boolean need_derivs)
 {
    struct lp_build_context *coord_bld = &bld->coord_bld;
    LLVMBuilderRef builder = bld->gallivm->builder;
    struct gallivm_state *gallivm = bld->gallivm;
    LLVMValueRef si, ti, ri;
-   boolean need_derivs = TRUE;
 
    if (1 || coord_bld->type.length > 4) {
       /*
        * Do per-pixel face selection. We cannot however (as we used to do)
        * simply calculate the derivs afterwards (which is very bogus for
-       * explicit derivs anyway) because the values would be "random" when
-       * not all pixels lie on the same face. Hence just transform the derivs
-       * (or rather only the dmax values), which works both for implicit and
-       * explicit derivatives and doesn't add much math (except need to
-       * calculate derivs for 3 instead of 2 coords and have a couple more selects
-       * but cuts some minor math elsewhere). The derivs don't need mirroring,
-       * just selection, since noone cares about the sign.
+       * explicit derivs btw) because the values would be "random" when
+       * not all pixels lie on the same face. So what we do here is just
+       * calculate the derivatives after scaling the coords by the absolute
+       * value of the inverse major axis, and essentially do rho calculation
+       * steps as if it were a 3d texture. This is perfect if all pixels hit
+       * the same face, but not so great at edges, I believe the max error
+       * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
+       * the 3d distance between 2 points on the cube instead of measuring up/down
+       * the edge). Still this is possibly a win over just selecting the same face
+       * for all pixels. Unfortunately, something like that doesn't work for
+       * explicit derivatives.
+       * TODO: handle explicit derivatives by transforming them alongside coords
+       * somehow.
        */
       struct lp_build_context *cint_bld = &bld->int_coord_bld;
       struct lp_type intctype = cint_bld->type;
@@ -1392,95 +1399,111 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
       LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
       LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
       LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
-      LLVMValueRef dmax[3], dmaxsnew, dmaxtnew;
 
       assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
       assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
       assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
 
       /*
-       * TODO do this only when needed.
+       * get absolute value (for x/y/z face selection) and sign bit
+       * (for mirroring minor coords and pos/neg face selection)
+       * of the original coords.
        */
-      if (need_derivs && !derivs) {
-         LLVMValueRef ddx_ddy[2], tmp[2];
-         /*
-          * This isn't quite the same as the "ordinary" path since
-          * we need to extract the ds/dt/dr values before further processing.
-          */
-         static const unsigned char swizzle11[] = { /* no-op swizzle */
+      as = lp_build_abs(&bld->coord_bld, s);
+      at = lp_build_abs(&bld->coord_bld, t);
+      ar = lp_build_abs(&bld->coord_bld, r);
+
+      /*
+       * major face determination: select x if x > y else select y
+       * select z if z >= max(x,y) else select previous result
+       * if some axis are the same we chose z over y, y over x - the
+       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
+       * wouldn't care could save a select or two if using different
+       * compares and doing at_g_as_ar last since tnewx and tnewz are the
+       * same).
+       */
+      as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
+      maxasat = lp_build_max(coord_bld, as, at);
+      ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
+
+      if (need_derivs) {
+         LLVMValueRef ddx_ddy[2], tmp[2], rho_vec;
+         static const unsigned char swizzle0[] = { /* no-op swizzle */
             0, LP_BLD_SWIZZLE_DONTCARE,
             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
-         static const unsigned char swizzle12[] = {
-            2, LP_BLD_SWIZZLE_DONTCARE,
+         static const unsigned char swizzle1[] = {
+            1, LP_BLD_SWIZZLE_DONTCARE,
             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
-         static const unsigned char swizzle21[] = { /* no-op swizzle */
-            0, LP_BLD_SWIZZLE_DONTCARE,
-            2, LP_BLD_SWIZZLE_DONTCARE
+         static const unsigned char swizzle01[] = { /* no-op swizzle */
+            0, 1,
+            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
-         static const unsigned char swizzle22[] = {
-            1, LP_BLD_SWIZZLE_DONTCARE,
-            3, LP_BLD_SWIZZLE_DONTCARE
+         static const unsigned char swizzle23[] = {
+            2, 3,
+            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+         };
+         static const unsigned char swizzle02[] = {
+            0, 2,
+            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
 
+         /*
+          * scale the s/t/r coords pre-select/mirror so we can calculate
+          * "reasonable" derivs.
+          */
+         ma = lp_build_select(coord_bld, as_ge_at, s, t);
+         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
+         ima = lp_build_cube_imapos(coord_bld, ma);
+         s = lp_build_mul(coord_bld, s, ima);
+         t = lp_build_mul(coord_bld, t, ima);
+         r = lp_build_mul(coord_bld, r, ima);
+
+         /*
+          * This isn't quite the same as the "ordinary" (3d deriv) path since we
+          * know the texture is square which simplifies things (we can omit the
+          * size mul which happens very early completely here and do it at the
+          * very end).
+          */
          ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
          ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
-         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
-         ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
 
-         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle21);
-         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle22);
-         dmax[0] = lp_build_max(coord_bld, tmp[0], tmp[1]);
-         dmax[1] = lp_build_swizzle_aos(coord_bld, dmax[0], swizzle12);
+         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
+            ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
+            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
+         }
+         else {
+            ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
+            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
+         }
 
-         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle11);
-         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle12);
-         dmax[2] = lp_build_max(coord_bld, tmp[0], tmp[1]);
-      }
-      else if (need_derivs) {
-         LLVMValueRef abs_ddx[3], abs_ddy[3];
-         abs_ddx[0] = lp_build_abs(coord_bld, derivs->ddx[0]);
-         abs_ddx[1] = lp_build_abs(coord_bld, derivs->ddx[1]);
-         abs_ddx[2] = lp_build_abs(coord_bld, derivs->ddx[2]);
-         abs_ddy[0] = lp_build_abs(coord_bld, derivs->ddy[0]);
-         abs_ddy[1] = lp_build_abs(coord_bld, derivs->ddy[1]);
-         abs_ddy[2] = lp_build_abs(coord_bld, derivs->ddy[2]);
-         dmax[0] = lp_build_max(coord_bld, abs_ddx[0], abs_ddy[0]);
-         dmax[1] = lp_build_max(coord_bld, abs_ddx[1], abs_ddy[1]);
-         dmax[2] = lp_build_max(coord_bld, abs_ddx[2], abs_ddy[2]);
+         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
+         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
+         tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
+
+         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
+            rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
+            rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
+         }
+         else {
+            rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
+            rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
+         }
+
+         tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
+         tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
+         *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
       }
 
       si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
       ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
       ri = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
-
-      /*
-       * get absolute value (for x/y/z face selection) and sign bit
-       * (for mirroring minor coords and pos/neg face selection)
-       * of the original coords.
-       */
-      as = lp_build_abs(&bld->coord_bld, s);
-      at = lp_build_abs(&bld->coord_bld, t);
-      ar = lp_build_abs(&bld->coord_bld, r);
       signs = LLVMBuildAnd(builder, si, signmask, "");
       signt = LLVMBuildAnd(builder, ti, signmask, "");
       signr = LLVMBuildAnd(builder, ri, signmask, "");
 
       /*
-       * major face determination: select x if x > y else select y
-       * select z if z >= max(x,y) else select previous result
-       * if some axis are the same we chose z over y, y over x - the
-       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
-       * wouldn't care could save a select or two if using different
-       * compares and doing at_g_as_ar last since tnewx and tnewz are the
-       * same).
-       */
-      as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
-      maxasat = lp_build_max(coord_bld, as, at);
-      ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
-
-      /*
        * compute all possible new s/t coords
        * snewx = signs * -r;
        * tnewx = -t;
@@ -1510,23 +1533,19 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
        */
 
       /* select/mirror */
+      if (!need_derivs) {
+         ma = lp_build_select(coord_bld, as_ge_at, s, t);
+      }
       *face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
       *face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
-      ma = lp_build_select(coord_bld, as_ge_at, s, t);
       *face = lp_build_select(cint_bld, as_ge_at, facex, facey);
-      if (need_derivs) {
-         dmaxsnew = lp_build_select(coord_bld, as_ge_at, dmax[2], dmax[0]);
-         dmaxtnew = lp_build_select(coord_bld, as_ge_at, dmax[1], dmax[2]);
-      }
 
+      if (!need_derivs) {
+         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
+      }
       *face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, *face_s);
       *face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, *face_t);
-      ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
       *face = lp_build_select(cint_bld, ar_ge_as_at, facez, *face);
-      if (need_derivs) {
-         dmaxsnew = lp_build_select(coord_bld, ar_ge_as_at, dmax[0], dmaxsnew);
-         dmaxtnew = lp_build_select(coord_bld, ar_ge_as_at, dmax[1], dmaxtnew);
-      }
 
       *face_s = LLVMBuildBitCast(builder, *face_s,
                                lp_build_vec_type(gallivm, coord_bld->type), "");
@@ -1542,26 +1561,15 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
       signma = LLVMBuildLShr(builder, mai, signshift, "");
       *face = LLVMBuildOr(builder, *face, signma, "face");
 
-      ima = lp_build_cube_imapos(coord_bld, ma);
-
       /* project coords */
-      *face_s = lp_build_mul(coord_bld, *face_s, ima);
+      if (!need_derivs) {
+         ima = lp_build_cube_imapos(coord_bld, ma);
+         *face_s = lp_build_mul(coord_bld, *face_s, ima);
+         *face_t = lp_build_mul(coord_bld, *face_t, ima);
+      }
+
       *face_s = lp_build_add(coord_bld, *face_s, posHalf);
-      *face_t = lp_build_mul(coord_bld, *face_t, ima);
       *face_t = lp_build_add(coord_bld, *face_t, posHalf);
-
-      /* project derivs */
-      if (need_derivs) {
-         /*
-          * we do some optimization here, since we know it's square
-          * we can do the max before projection (and before size mul,
-          * which the so-called "rho" is missing here).
-          * For explicit derivs this is fully per-pixel vector, for implicit
-          * derivs only the first value per quad contains useful values.
-          */
-         *rho = lp_build_max(coord_bld, dmaxsnew, dmaxtnew);
-         *rho = lp_build_mul(coord_bld, *rho, ima);
-      }
    }
 
    else {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 72af813d42..cde8ce9618 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -437,7 +437,8 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef *face,
                      LLVMValueRef *face_s,
                      LLVMValueRef *face_t,
-                     LLVMValueRef *rho);
+                     LLVMValueRef *rho,
+                     boolean need_derivs);
 
 
 void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index d2cc0f362a..ced2103fca 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1102,7 +1102,13 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
     */
    if (target == PIPE_TEXTURE_CUBE) {
       LLVMValueRef face, face_s, face_t;
-      lp_build_cube_lookup(bld, *s, *t, *r, derivs, &face, &face_s, &face_t, &cube_rho);
+      boolean need_derivs;
+      need_derivs = ((min_filter != mag_filter ||
+                      mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
+                      !bld->static_sampler_state->min_max_lod_equal &&
+                      !explicit_lod);
+      lp_build_cube_lookup(bld, *s, *t, *r, derivs, &face, &face_s, &face_t,
+                           &cube_rho, need_derivs);
       *s = face_s; /* vec */
       *t = face_t; /* vec */
       /* use 'r' to indicate cube face */
author	Roland Scheidegger <sroland@vmware.com>	2013-04-18 17:06:43 +0200
committer	Roland Scheidegger <sroland@vmware.com>	2013-04-18 17:06:43 +0200
commit	50cbcf0c4680bc13e63fe339ef79ed68b2f33c70 (patch)
tree	3aa291fea9ee10d3b3f0385ca3008fb3d7f8332a
parent	0d07f05ee87b5446bc3b85fc7be861c4801bb79e (diff)