From 47d0613eb70b2cb5d8837fe8e12325532a7918f5 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger
Date: Sat, 5 Oct 2013 03:26:47 +0200
Subject: gallivm: handle explicit derivatives for cubemaps

They need some special handling. Quite complicated.
Additionally, use the same code for implicit derivatives too if no_rho_approx
and no_quad_lod are set: while it should generally be ok to use per-quad lod
for implicit derivatives, there is at least one test which insists that for
cubemaps the shared lod value MUST come from a pixel inside the primitive
(because the derivatives become different if a different, larger major axis
is chosen).

v2: based on Brian's feedback, clean up the code a bit. Use the sign bit of
the major axis instead of the pre-selected s/t/r sign for coord mirroring
(which should be the same in the end, and saves 2 ands). Also fix two bugs
with select/mirror of derivatives: the minor axes need to use the major axis
sign as well (instead of the major derivative axis sign), and don't
mistakenly use absolute values of the major derivative and inverse major
values.

Reviewed-by: Jose Fonseca

---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c     | 253 +++++++++++++++++-----
 src/gallium/auxiliary/gallivm/lp_bld_sample.h     |   3 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |  35 ++-
 3 files changed, 235 insertions(+), 56 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index dc593aabac..39c3a2f9d9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -273,7 +273,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
       cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
       rho = lp_build_mul(rho_bld, cubesize, rho);
    }
-   else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
+   else if (derivs) {
       LLVMValueRef ddmax[3], ddx[3], ddy[3];
       for (i = 0; i < dims; i++) {
          LLVMValueRef floatdim;
@@ -1481,6 +1481,21 @@ lp_build_cube_face(struct lp_build_sample_context *bld,
 }
 
 
+/** Helper for doing 3-wise selection.
+ * Returns sel1 ? val2 : (sel0 ? val0 : val1).
+ */
+static LLVMValueRef
+lp_build_select3(struct lp_build_context *sel_bld,
+                 LLVMValueRef sel0,
+                 LLVMValueRef sel1,
+                 LLVMValueRef val0,
+                 LLVMValueRef val1,
+                 LLVMValueRef val2)
+{
+   LLVMValueRef tmp;
+   tmp = lp_build_select(sel_bld, sel0, val0, val1);
+   return lp_build_select(sel_bld, sel1, val2, tmp);
+}
 
 /**
  * Generate code to do cube face selection and compute per-face texcoords.
@@ -1488,8 +1503,9 @@ lp_build_cube_face(struct lp_build_sample_context *bld,
 void
 lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef *coords,
-                     const struct lp_derivatives *derivs, /* optional */
+                     const struct lp_derivatives *derivs_in, /* optional */
                      LLVMValueRef *rho,
+                     struct lp_derivatives *derivs_out, /* optional */
                      boolean need_derivs)
 {
    struct lp_build_context *coord_bld = &bld->coord_bld;
@@ -1512,19 +1528,16 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
     * the edge). Still this is possibly a win over just selecting the same face
     * for all pixels. Unfortunately, something like that doesn't work for
     * explicit derivatives.
-    * TODO: handle explicit derivatives by transforming them alongside coords
-    * somehow.
     */
    struct lp_build_context *cint_bld = &bld->int_coord_bld;
    struct lp_type intctype = cint_bld->type;
    LLVMTypeRef coord_vec_type = coord_bld->vec_type;
    LLVMTypeRef cint_vec_type = cint_bld->vec_type;
-   LLVMValueRef signs, signt, signr, signma;
    LLVMValueRef as, at, ar, face, face_s, face_t;
    LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
    LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
    LLVMValueRef tnegi, rnegi;
-   LLVMValueRef ma, mai, ima;
+   LLVMValueRef ma, mai, signma, signmabit, imahalfpos;
    LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
    LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
                                                   1 << (intctype.width - 1));
@@ -1563,7 +1576,166 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
    maxasat = lp_build_max(coord_bld, as, at);
    ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
 
-   if (need_derivs) {
+   if (need_derivs && (derivs_in ||
+       ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
+        (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX)))) {
+      /*
+       * XXX: This is really really complex.
+       * It is a bit overkill to use this for implicit derivatives as well,
+       * no way this is worth the cost in practice, but seems to be the
+       * only way for getting accurate and per-pixel lod values.
+       */
+      LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3];
+      LLVMValueRef madx, mady, madxdivma, madydivma;
+      LLVMValueRef sdxi, tdxi, rdxi, sdyi, tdyi, rdyi;
+      LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi;
+      LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz;
+      LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz;
+      LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy;
+      /*
+       * s = 1/2 * ( sc / ma + 1)
+       * t = 1/2 * ( tc / ma + 1)
+       *
+       * s' = 1/2 * (sc' * ma - sc * ma') / ma^2
+       * t' = 1/2 * (tc' * ma - tc * ma') / ma^2
+       *
+       * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma
+       * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma
+       * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma
+       * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma
+       */
+
+      /* select ma, calculate ima */
+      ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
+      mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
+      signmabit = LLVMBuildAnd(builder, mai, signmask, "");
+      ima = lp_build_div(coord_bld, coord_bld->one, ma);
+      imahalf = lp_build_mul(coord_bld, posHalf, ima);
+      imahalfpos = lp_build_abs(coord_bld, imahalf);
+
+      if (!derivs_in) {
+         ddx[0] = lp_build_ddx(coord_bld, s);
+         ddx[1] = lp_build_ddx(coord_bld, t);
+         ddx[2] = lp_build_ddx(coord_bld, r);
+         ddy[0] = lp_build_ddy(coord_bld, s);
+         ddy[1] = lp_build_ddy(coord_bld, t);
+         ddy[2] = lp_build_ddy(coord_bld, r);
+      }
+      else {
+         ddx[0] = derivs_in->ddx[0];
+         ddx[1] = derivs_in->ddx[1];
+         ddx[2] = derivs_in->ddx[2];
+         ddy[0] = derivs_in->ddy[0];
+         ddy[1] = derivs_in->ddy[1];
+         ddy[2] = derivs_in->ddy[2];
+      }
+
+      /* select major derivatives */
+      madx = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddx[0], ddx[1], ddx[2]);
+      mady = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddy[0], ddy[1], ddy[2]);
+
+      si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
+      ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
+      ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
+
+      sdxi = LLVMBuildBitCast(builder, ddx[0], cint_vec_type, "");
+      tdxi = LLVMBuildBitCast(builder, ddx[1], cint_vec_type, "");
+      rdxi = LLVMBuildBitCast(builder, ddx[2], cint_vec_type, "");
+
+      sdyi = LLVMBuildBitCast(builder, ddy[0], cint_vec_type, "");
+      tdyi = LLVMBuildBitCast(builder, ddy[1], cint_vec_type, "");
+      rdyi = LLVMBuildBitCast(builder, ddy[2], cint_vec_type, "");
+
+      /*
+       * compute all possible new s/t coords, which does the mirroring,
+       * and do the same for derivs minor axes.
+       * snewx = signma * -r;
+       * tnewx = -t;
+       * snewy = s;
+       * tnewy = signma * r;
+       * snewz = signma * s;
+       * tnewz = -t;
+       */
+      tnegi = LLVMBuildXor(builder, ti, signmask, "");
+      rnegi = LLVMBuildXor(builder, ri, signmask, "");
+      tdxnegi = LLVMBuildXor(builder, tdxi, signmask, "");
+      rdxnegi = LLVMBuildXor(builder, rdxi, signmask, "");
+      tdynegi = LLVMBuildXor(builder, tdyi, signmask, "");
+      rdynegi = LLVMBuildXor(builder, rdyi, signmask, "");
+
+      snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
+      tnewx = tnegi;
+      sdxnewx = LLVMBuildXor(builder, signmabit, rdxnegi, "");
+      tdxnewx = tdxnegi;
+      sdynewx = LLVMBuildXor(builder, signmabit, rdynegi, "");
+      tdynewx = tdynegi;
+
+      snewy = si;
+      tnewy = LLVMBuildXor(builder, signmabit, ri, "");
+      sdxnewy = sdxi;
+      tdxnewy = LLVMBuildXor(builder, signmabit, rdxi, "");
+      sdynewy = sdyi;
+      tdynewy = LLVMBuildXor(builder, signmabit, rdyi, "");
+
+      snewz = LLVMBuildXor(builder, signmabit, si, "");
+      tnewz = tnegi;
+      sdxnewz = LLVMBuildXor(builder, signmabit, sdxi, "");
+      tdxnewz = tdxnegi;
+      sdynewz = LLVMBuildXor(builder, signmabit, sdyi, "");
+      tdynewz = tdynegi;
+
+      /* select the mirrored values */
+      face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
+      face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
+      face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
+      face_sdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdxnewx, sdxnewy, sdxnewz);
+      face_tdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdxnewx, tdxnewy, tdxnewz);
+      face_sdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdynewx, sdynewy, sdynewz);
+      face_tdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdynewx, tdynewy, tdynewz);
+
+      face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
+      face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
+      face_sdx = LLVMBuildBitCast(builder, face_sdx, coord_vec_type, "");
+      face_tdx = LLVMBuildBitCast(builder, face_tdx, coord_vec_type, "");
+      face_sdy = LLVMBuildBitCast(builder, face_sdy, coord_vec_type, "");
+      face_tdy = LLVMBuildBitCast(builder, face_tdy, coord_vec_type, "");
+
+      /* deriv math, dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */
+      madxdivma = lp_build_mul(coord_bld, madx, ima);
+      tmp = lp_build_mul(coord_bld, madxdivma, face_s);
+      tmp = lp_build_sub(coord_bld, face_sdx, tmp);
+      derivs_out->ddx[0] = lp_build_mul(coord_bld, tmp, imahalf);
+
+      /* dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma */
+      tmp = lp_build_mul(coord_bld, madxdivma, face_t);
+      tmp = lp_build_sub(coord_bld, face_tdx, tmp);
+      derivs_out->ddx[1] = lp_build_mul(coord_bld, tmp, imahalf);
+
+      /* dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma */
+      madydivma = lp_build_mul(coord_bld, mady, ima);
+      tmp = lp_build_mul(coord_bld, madydivma, face_s);
+      tmp = lp_build_sub(coord_bld, face_sdy, tmp);
+      derivs_out->ddy[0] = lp_build_mul(coord_bld, tmp, imahalf);
+
+      /* dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma */
+      tmp = lp_build_mul(coord_bld, madydivma, face_t);
+      tmp = lp_build_sub(coord_bld, face_tdy, tmp);
+      derivs_out->ddy[1] = lp_build_mul(coord_bld, tmp, imahalf);
+
+      signma = LLVMBuildLShr(builder, mai, signshift, "");
+      coords[2] = LLVMBuildOr(builder, face, signma, "face");
+
+      /* project coords */
+      face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
+      face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
+
+      coords[0] = lp_build_add(coord_bld, face_s, posHalf);
+      coords[1] = lp_build_add(coord_bld, face_t, posHalf);
+
+      return;
+   }
+
+   else if (need_derivs) {
       LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
       static const unsigned char swizzle0[] = { /* no-op swizzle */
          0, LP_BLD_SWIZZLE_DONTCARE,
@@ -1590,12 +1762,11 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
        * scale the s/t/r coords pre-select/mirror so we can calculate
        * "reasonable" derivs.
        */
-      ma = lp_build_select(coord_bld, as_ge_at, s, t);
-      ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
-      ima = lp_build_cube_imapos(coord_bld, ma);
-      s = lp_build_mul(coord_bld, s, ima);
-      t = lp_build_mul(coord_bld, t, ima);
-      r = lp_build_mul(coord_bld, r, ima);
+      ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
+      imahalfpos = lp_build_cube_imapos(coord_bld, ma);
+      s = lp_build_mul(coord_bld, s, imahalfpos);
+      t = lp_build_mul(coord_bld, t, imahalfpos);
+      r = lp_build_mul(coord_bld, r, imahalfpos);
 
       /*
        * This isn't quite the same as the "ordinary" (3d deriv) path since we
@@ -1625,56 +1796,41 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
       *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
    }
 
+   if (!need_derivs) {
+      ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
+   }
+   mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
+   signmabit = LLVMBuildAnd(builder, mai, signmask, "");
+
    si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
    ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
    ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
-   signs = LLVMBuildAnd(builder, si, signmask, "");
-   signt = LLVMBuildAnd(builder, ti, signmask, "");
-   signr = LLVMBuildAnd(builder, ri, signmask, "");
 
    /*
-    * compute all possible new s/t coords
-    * snewx = signs * -r;
+    * compute all possible new s/t coords, which does the mirroring
+    * snewx = signma * -r;
     * tnewx = -t;
     * snewy = s;
-    * tnewy = signt * r;
-    * snewz = signr * s;
+    * tnewy = signma * r;
+    * snewz = signma * s;
     * tnewz = -t;
     */
    tnegi = LLVMBuildXor(builder, ti, signmask, "");
    rnegi = LLVMBuildXor(builder, ri, signmask, "");
 
-   snewx = LLVMBuildXor(builder, signs, rnegi, "");
+   snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
    tnewx = tnegi;
 
    snewy = si;
-   tnewy = LLVMBuildXor(builder, signt, ri, "");
+   tnewy = LLVMBuildXor(builder, signmabit, ri, "");
 
-   snewz = LLVMBuildXor(builder, signr, si, "");
+   snewz = LLVMBuildXor(builder, signmabit, si, "");
    tnewz = tnegi;
 
-   /* XXX on x86 unclear if we should cast the values back to float
-    * or not - on some cpus (nehalem) pblendvb has twice the throughput
-    * of blendvps though on others there just might be domain
-    * transition penalties when using it (this depends on what llvm
-    * will chose for the bit ops above so there appears no "right way",
-    * but given the boatload of selects let's just use the int type).
-    */
-
-   /* select/mirror */
-   if (!need_derivs) {
-      ma = lp_build_select(coord_bld, as_ge_at, s, t);
-   }
-   face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
-   face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
-   face = lp_build_select(cint_bld, as_ge_at, facex, facey);
-
-   if (!need_derivs) {
-      ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
-   }
-   face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, face_s);
-   face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, face_t);
-   face = lp_build_select(cint_bld, ar_ge_as_at, facez, face);
+   /* select the mirrored values */
+   face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
+   face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
+   face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
 
    face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
    face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
@@ -1684,15 +1840,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
    * as long as we ensure vblendvps gets used we can actually
    * skip the comparison and just use sign as a "mask" directly.
    */
-   mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
   signma = LLVMBuildLShr(builder, mai, signshift, "");
   coords[2] = LLVMBuildOr(builder, face, signma, "face");
 
   /* project coords */
   if (!need_derivs) {
-      ima = lp_build_cube_imapos(coord_bld, ma);
-      face_s = lp_build_mul(coord_bld, face_s, ima);
-      face_t = lp_build_mul(coord_bld, face_t, ima);
+      imahalfpos = lp_build_cube_imapos(coord_bld, ma);
+      face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
+      face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
   }
 
   coords[0] = lp_build_add(coord_bld, face_s, posHalf);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 803a99e3b0..70f03503f0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -457,8 +457,9 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
 void
 lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef *coords,
-                     const struct lp_derivatives *derivs, /* optional */
+                     const struct lp_derivatives *derivs_in, /* optional */
                      LLVMValueRef *rho,
+                     struct lp_derivatives *derivs_out, /* optional */
                      boolean need_derivs);
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 33378bcdcd..54dee25bfd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1387,6 +1387,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
    const unsigned target = bld->static_texture_state->target;
    LLVMValueRef first_level, cube_rho = NULL;
    LLVMValueRef lod_ipart = NULL;
+   struct lp_derivatives cube_derivs;
 
    /*
    printf("%s mip %d min %d mag %d\n", __FUNCTION__,
@@ -1403,7 +1404,8 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
                               mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
                              !bld->static_sampler_state->min_max_lod_equal &&
                              !explicit_lod);
-      lp_build_cube_lookup(bld, coords, derivs, &cube_rho, need_derivs);
+      lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
+      derivs = &cube_derivs;
    }
    else if (target == PIPE_TEXTURE_1D_ARRAY ||
             target == PIPE_TEXTURE_2D_ARRAY) {
@@ -2163,9 +2165,24 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
    * avoided like min and max lod being equal.
    */
   bld.num_mips = bld.num_lods = 1;
-   if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
-       (explicit_lod || lod_bias ||
-        (derivs && static_texture_state->target != PIPE_TEXTURE_CUBE))) {
+
+   if ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
+       (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+       (static_texture_state->target == PIPE_TEXTURE_CUBE) &&
+       (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
+      /*
+       * special case for using per-pixel lod even for implicit lod,
+       * which is generally never required (ok by APIs) except to please
+       * some (somewhat broken imho) tests (because per-pixel face selection
+       * can cause derivatives to be different for pixels outside the primitive
+       * due to the major axis division even if pre-project derivatives are
+       * looking normal).
+       */
+      bld.num_mips = type.length;
+      bld.num_lods = type.length;
+   }
+   else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
+            (explicit_lod || lod_bias || derivs)) {
      if ((is_fetch && target != PIPE_BUFFER) ||
          (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
         bld.num_mips = type.length;
@@ -2371,9 +2388,15 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
          bld4.texel_type.length = 4;
 
          bld4.num_mips = bld4.num_lods = 1;
+         if ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
+             (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+             (static_texture_state->target == PIPE_TEXTURE_CUBE) &&
+             (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
+            bld4.num_mips = type4.length;
+            bld4.num_lods = type4.length;
+         }
          if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
-             (explicit_lod || lod_bias ||
-              (derivs && static_texture_state->target != PIPE_TEXTURE_CUBE))) {
+             (explicit_lod || lod_bias || derivs)) {
            if ((is_fetch && target != PIPE_BUFFER) ||
                (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
               bld4.num_mips = type4.length;
--
cgit v1.2.3
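
The per-face coord and derivative math added by the patch boils down to the
quotient rule applied to s = 0.5 * sc/|ma| + 0.5. A rough scalar sketch of
what the vectorized path computes per channel (hypothetical helper, not part
of the patch; only the magnitude of the derivative feeds the rho/lod math):

   #include <math.h>

   static void
   project_coord_and_deriv(float sc, float dsc, float ma, float dma,
                           float *s, float *ds)
   {
      float ima = 1.0f / ma;                  /* signed, like 'ima' above   */
      *s  = 0.5f * sc * fabsf(ima) + 0.5f;    /* 'imahalfpos' in the patch  */
      /* dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */
      *ds = 0.5f * (dsc - sc * dma * ima) * ima;
   }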
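
The "use sign bit of major axis" change from v2 amounts to multiplying a
value by the sign of ma via an integer XOR of ma's sign bit, so e.g.
snewx = signma * -r needs two XORs and no extra ands or selects. A scalar
sketch of that trick (hypothetical helper, for illustration only):

   #include <stdint.h>
   #include <string.h>

   static float
   mul_by_sign_of(float x, float ma)
   {
      uint32_t xi, mai, signmask = 1u << 31;
      memcpy(&xi, &x, sizeof xi);
      memcpy(&mai, &ma, sizeof mai);
      xi ^= mai & signmask;         /* flip x's sign iff ma is negative */
      memcpy(&x, &xi, sizeof x);
      return x;
   }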